diff options
author | thegeorg <thegeorg@yandex-team.ru> | 2022-02-10 16:45:12 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:12 +0300 |
commit | 49116032d905455a7b1c994e4a696afc885c1e71 (patch) | |
tree | be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/libs/re2 | |
parent | 4e839db24a3bbc9f1c610c43d6faaaa99824dcca (diff) | |
download | ydb-49116032d905455a7b1c994e4a696afc885c1e71.tar.gz |
Restoring authorship annotation for <thegeorg@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/re2')
81 files changed, 15510 insertions, 15510 deletions
diff --git a/contrib/libs/re2/AUTHORS b/contrib/libs/re2/AUTHORS index 38866ae96a..0754006fec 100644 --- a/contrib/libs/re2/AUTHORS +++ b/contrib/libs/re2/AUTHORS @@ -1,13 +1,13 @@ -# This is the official list of RE2 authors for copyright purposes. -# This file is distinct from the CONTRIBUTORS files. -# See the latter for an explanation. - -# Names should be added to this file as -# Name or Organization <email address> -# The email address is not required for organizations. - -# Please keep the list sorted. - -Google Inc. -Samsung Electronics -Stefano Rivera <stefano.rivera@gmail.com> +# This is the official list of RE2 authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. + +# Names should be added to this file as +# Name or Organization <email address> +# The email address is not required for organizations. + +# Please keep the list sorted. + +Google Inc. +Samsung Electronics +Stefano Rivera <stefano.rivera@gmail.com> diff --git a/contrib/libs/re2/CONTRIBUTING.md b/contrib/libs/re2/CONTRIBUTING.md index f8d54cec2c..882b0e2f34 100644 --- a/contrib/libs/re2/CONTRIBUTING.md +++ b/contrib/libs/re2/CONTRIBUTING.md @@ -1,2 +1,2 @@ -RE2 uses Gerrit instead of GitHub pull requests. +RE2 uses Gerrit instead of GitHub pull requests. See the [Contribute](https://github.com/google/re2/wiki/Contribute) wiki page. diff --git a/contrib/libs/re2/CONTRIBUTORS b/contrib/libs/re2/CONTRIBUTORS index fdceed8aa2..1a1c84827d 100644 --- a/contrib/libs/re2/CONTRIBUTORS +++ b/contrib/libs/re2/CONTRIBUTORS @@ -1,41 +1,41 @@ -# This is the official list of people who can contribute -# (and typically have contributed) code to the RE2 repository. -# The AUTHORS file lists the copyright holders; this file -# lists people. For example, Google employees are listed here -# but not in AUTHORS, because Google holds the copyright. 
-# -# The submission process automatically checks to make sure -# that people submitting code are listed in this file (by email address). -# -# Names should be added to this file only after verifying that -# the individual or the individual's organization has agreed to -# the appropriate Contributor License Agreement, found here: -# -# http://code.google.com/legal/individual-cla-v1.0.html -# http://code.google.com/legal/corporate-cla-v1.0.html -# -# The agreement for individuals can be filled out on the web. -# -# When adding J Random Contributor's name to this file, -# either J's name or J's organization's name should be -# added to the AUTHORS file, depending on whether the -# individual or corporate CLA was used. - -# Names should be added to this file like so: -# Name <email address> - -# Please keep the list sorted. - -Dominic Battré <battre@chromium.org> -Doug Kwan <dougkwan@google.com> -Dmitriy Vyukov <dvyukov@google.com> -John Millikin <jmillikin@gmail.com> -Mike Nazarewicz <mpn@google.com> -Nico Weber <thakis@chromium.org> -Pawel Hajdan <phajdan.jr@gmail.com> -Rob Pike <r@google.com> -Russ Cox <rsc@swtch.com> -Sanjay Ghemawat <sanjay@google.com> -Stefano Rivera <stefano.rivera@gmail.com> -Srinivasan Venkatachary <vsri@google.com> -Viatcheslav Ostapenko <sl.ostapenko@samsung.com> +# This is the official list of people who can contribute +# (and typically have contributed) code to the RE2 repository. +# The AUTHORS file lists the copyright holders; this file +# lists people. For example, Google employees are listed here +# but not in AUTHORS, because Google holds the copyright. +# +# The submission process automatically checks to make sure +# that people submitting code are listed in this file (by email address). 
+# +# Names should be added to this file only after verifying that +# the individual or the individual's organization has agreed to +# the appropriate Contributor License Agreement, found here: +# +# http://code.google.com/legal/individual-cla-v1.0.html +# http://code.google.com/legal/corporate-cla-v1.0.html +# +# The agreement for individuals can be filled out on the web. +# +# When adding J Random Contributor's name to this file, +# either J's name or J's organization's name should be +# added to the AUTHORS file, depending on whether the +# individual or corporate CLA was used. + +# Names should be added to this file like so: +# Name <email address> + +# Please keep the list sorted. + +Dominic Battré <battre@chromium.org> +Doug Kwan <dougkwan@google.com> +Dmitriy Vyukov <dvyukov@google.com> +John Millikin <jmillikin@gmail.com> +Mike Nazarewicz <mpn@google.com> +Nico Weber <thakis@chromium.org> +Pawel Hajdan <phajdan.jr@gmail.com> +Rob Pike <r@google.com> +Russ Cox <rsc@swtch.com> +Sanjay Ghemawat <sanjay@google.com> +Stefano Rivera <stefano.rivera@gmail.com> +Srinivasan Venkatachary <vsri@google.com> +Viatcheslav Ostapenko <sl.ostapenko@samsung.com> diff --git a/contrib/libs/re2/LICENSE b/contrib/libs/re2/LICENSE index 36747eca8b..09e5ec1c74 100644 --- a/contrib/libs/re2/LICENSE +++ b/contrib/libs/re2/LICENSE @@ -1,27 +1,27 @@ -// Copyright (c) 2009 The RE2 Authors. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Copyright (c) 2009 The RE2 Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/contrib/libs/re2/README b/contrib/libs/re2/README index 2a9cc501da..caee6afb6b 100644 --- a/contrib/libs/re2/README +++ b/contrib/libs/re2/README @@ -1,42 +1,42 @@ -This is the source code repository for RE2, a regular expression library. - -For documentation about how to install and use RE2, -visit https://github.com/google/re2/. - -The short version is: - -make -make test -make install -make testinstall - -There is a fair amount of documentation (including code snippets) in -the re2.h header file. - -More information can be found on the wiki: -https://github.com/google/re2/wiki - -Issue tracker: -https://github.com/google/re2/issues - -Mailing list: -https://groups.google.com/group/re2-dev - -Unless otherwise noted, the RE2 source files are distributed -under the BSD-style license found in the LICENSE file. - -RE2's native language is C++. - -The Python wrapper is at https://github.com/google/re2/tree/abseil/python -and on PyPI (https://pypi.org/project/google-re2/). - -A C wrapper is at https://github.com/marcomaggi/cre2/. +This is the source code repository for RE2, a regular expression library. + +For documentation about how to install and use RE2, +visit https://github.com/google/re2/. + +The short version is: + +make +make test +make install +make testinstall + +There is a fair amount of documentation (including code snippets) in +the re2.h header file. 
+ +More information can be found on the wiki: +https://github.com/google/re2/wiki + +Issue tracker: +https://github.com/google/re2/issues + +Mailing list: +https://groups.google.com/group/re2-dev + +Unless otherwise noted, the RE2 source files are distributed +under the BSD-style license found in the LICENSE file. + +RE2's native language is C++. + +The Python wrapper is at https://github.com/google/re2/tree/abseil/python +and on PyPI (https://pypi.org/project/google-re2/). + +A C wrapper is at https://github.com/marcomaggi/cre2/. A D wrapper is at https://github.com/ShigekiKarita/re2d/ and on DUB (code.dlang.org). -An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm). -An Inferno wrapper is at https://github.com/powerman/inferno-re2/. -A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com). -An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org). -A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org). -An R wrapper is at https://github.com/girishji/re2/ and on CRAN (cran.r-project.org). -A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org). -A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com). +An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm). +An Inferno wrapper is at https://github.com/powerman/inferno-re2/. +A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com). +An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org). +A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org). +An R wrapper is at https://github.com/girishji/re2/ and on CRAN (cran.r-project.org). +A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org). +A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com). 
diff --git a/contrib/libs/re2/SECURITY.md b/contrib/libs/re2/SECURITY.md index 8b7e853ad7..39ba0e93f2 100644 --- a/contrib/libs/re2/SECURITY.md +++ b/contrib/libs/re2/SECURITY.md @@ -1,4 +1,4 @@ -To report a security issue, please use https://g.co/vulnz. We use -https://g.co/vulnz for our intake, and do coordination and disclosure here on -GitHub (including using GitHub Security Advisory). The Google Security Team will -respond within 5 working days of your report on https://g.co/vulnz. +To report a security issue, please use https://g.co/vulnz. We use +https://g.co/vulnz for our intake, and do coordination and disclosure here on +GitHub (including using GitHub Security Advisory). The Google Security Team will +respond within 5 working days of your report on https://g.co/vulnz. diff --git a/contrib/libs/re2/include/re2/re2.h b/contrib/libs/re2/include/re2/re2.h index 3305b909e5..31cfa08363 100644 --- a/contrib/libs/re2/include/re2/re2.h +++ b/contrib/libs/re2/include/re2/re2.h @@ -1 +1 @@ -#include "../../re2/re2.h" /* inclink generated by yamaker */ +#include "../../re2/re2.h" /* inclink generated by yamaker */ diff --git a/contrib/libs/re2/include/re2/stringpiece.h b/contrib/libs/re2/include/re2/stringpiece.h index 7e4d849d40..fce36b84eb 100644 --- a/contrib/libs/re2/include/re2/stringpiece.h +++ b/contrib/libs/re2/include/re2/stringpiece.h @@ -1 +1 @@ -#include "../../re2/stringpiece.h" /* inclink generated by yamaker */ +#include "../../re2/stringpiece.h" /* inclink generated by yamaker */ diff --git a/contrib/libs/re2/include/util/logging.h b/contrib/libs/re2/include/util/logging.h index a03a5f7595..6b83bd42dd 100644 --- a/contrib/libs/re2/include/util/logging.h +++ b/contrib/libs/re2/include/util/logging.h @@ -1 +1 @@ -#include "../../util/logging.h" /* inclink generated by yamaker */ +#include "../../util/logging.h" /* inclink generated by yamaker */ diff --git a/contrib/libs/re2/include/util/utf.h b/contrib/libs/re2/include/util/utf.h index 
7542658dc3..fa6fec714a 100644 --- a/contrib/libs/re2/include/util/utf.h +++ b/contrib/libs/re2/include/util/utf.h @@ -1 +1 @@ -#include "../../util/utf.h" /* inclink generated by yamaker */ +#include "../../util/utf.h" /* inclink generated by yamaker */ diff --git a/contrib/libs/re2/include/util/util.h b/contrib/libs/re2/include/util/util.h index da1cab83fd..86b8c06006 100644 --- a/contrib/libs/re2/include/util/util.h +++ b/contrib/libs/re2/include/util/util.h @@ -1 +1 @@ -#include "../../util/util.h" /* inclink generated by yamaker */ +#include "../../util/util.h" /* inclink generated by yamaker */ diff --git a/contrib/libs/re2/libre2.symbols b/contrib/libs/re2/libre2.symbols index 7b667473b8..93b71b4862 100644 --- a/contrib/libs/re2/libre2.symbols +++ b/contrib/libs/re2/libre2.symbols @@ -1,19 +1,19 @@ -{ - global: - # re2::RE2* - _ZN3re23RE2*; - _ZNK3re23RE2*; - # re2::StringPiece* - _ZN3re211StringPiece*; - _ZNK3re211StringPiece*; - # re2::operator<<* - _ZN3re2ls*; - # re2::FilteredRE2* - _ZN3re211FilteredRE2*; - _ZNK3re211FilteredRE2*; - # re2::re2_internal* - _ZN3re212re2_internal*; - _ZNK3re212re2_internal*; - local: - *; -}; +{ + global: + # re2::RE2* + _ZN3re23RE2*; + _ZNK3re23RE2*; + # re2::StringPiece* + _ZN3re211StringPiece*; + _ZNK3re211StringPiece*; + # re2::operator<<* + _ZN3re2ls*; + # re2::FilteredRE2* + _ZN3re211FilteredRE2*; + _ZNK3re211FilteredRE2*; + # re2::re2_internal* + _ZN3re212re2_internal*; + _ZNK3re212re2_internal*; + local: + *; +}; diff --git a/contrib/libs/re2/re2/bitmap256.h b/contrib/libs/re2/re2/bitmap256.h index 9328aea005..4899379e4d 100644 --- a/contrib/libs/re2/re2/bitmap256.h +++ b/contrib/libs/re2/re2/bitmap256.h @@ -19,11 +19,11 @@ namespace re2 { class Bitmap256 { public: Bitmap256() { - Clear(); - } - - // Clears all of the bits. - void Clear() { + Clear(); + } + + // Clears all of the bits. 
+ void Clear() { memset(words_, 0, sizeof words_); } @@ -32,7 +32,7 @@ class Bitmap256 { DCHECK_GE(c, 0); DCHECK_LE(c, 255); - return (words_[c / 64] & (uint64_t{1} << (c % 64))) != 0; + return (words_[c / 64] & (uint64_t{1} << (c % 64))) != 0; } // Sets the bit with index c. @@ -40,7 +40,7 @@ class Bitmap256 { DCHECK_GE(c, 0); DCHECK_LE(c, 255); - words_[c / 64] |= (uint64_t{1} << (c % 64)); + words_[c / 64] |= (uint64_t{1} << (c % 64)); } // Finds the next non-zero bit with index >= c. @@ -88,7 +88,7 @@ int Bitmap256::FindNextSetBit(int c) const { // Check the word that contains the bit. Mask out any lower bits. int i = c / 64; - uint64_t word = words_[i] & (~uint64_t{0} << (c % 64)); + uint64_t word = words_[i] & (~uint64_t{0} << (c % 64)); if (word != 0) return (i * 64) + FindLSBSet(word); diff --git a/contrib/libs/re2/re2/bitstate.cc b/contrib/libs/re2/re2/bitstate.cc index 6dfee10cc5..877e548234 100644 --- a/contrib/libs/re2/re2/bitstate.cc +++ b/contrib/libs/re2/re2/bitstate.cc @@ -5,10 +5,10 @@ // Tested by search_test.cc, exhaustive_test.cc, tester.cc // Prog::SearchBitState is a regular expression search with submatch -// tracking for small regular expressions and texts. Similarly to -// testing/backtrack.cc, it allocates a bitmap with (count of -// lists) * (length of text) bits to make sure it never explores the -// same (instruction list, character position) multiple times. This +// tracking for small regular expressions and texts. Similarly to +// testing/backtrack.cc, it allocates a bitmap with (count of +// lists) * (length of text) bits to make sure it never explores the +// same (instruction list, character position) multiple times. This // limits the search to run in time linear in the length of the text. 
// // Unlike testing/backtrack.cc, SearchBitState is not recursive @@ -20,11 +20,11 @@ #include <stddef.h> #include <stdint.h> #include <string.h> -#include <limits> -#include <utility> +#include <limits> +#include <utility> #include "util/logging.h" -#include "re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -32,7 +32,7 @@ namespace re2 { struct Job { int id; - int rle; // run length encoding + int rle; // run length encoding const char* p; }; @@ -48,8 +48,8 @@ class BitState { private: inline bool ShouldVisit(int id, const char* p); - void Push(int id, const char* p); - void GrowStack(); + void Push(int id, const char* p); + void GrowStack(); bool TrySearch(int id, const char* p); // Search parameters @@ -59,18 +59,18 @@ class BitState { bool anchored_; // whether search is anchored at text.begin() bool longest_; // whether search wants leftmost-longest match bool endmatch_; // whether match must end at text.end() - StringPiece* submatch_; // submatches to fill in + StringPiece* submatch_; // submatches to fill in int nsubmatch_; // # of submatches to fill in // Search state - static constexpr int kVisitedBits = 64; - PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited - PODArray<const char*> cap_; // capture registers - PODArray<Job> job_; // stack of text positions to explore - int njob_; // stack size - - BitState(const BitState&) = delete; - BitState& operator=(const BitState&) = delete; + static constexpr int kVisitedBits = 64; + PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited + PODArray<const char*> cap_; // capture registers + PODArray<Job> job_; // stack of text positions to explore + int njob_; // stack size + + BitState(const BitState&) = delete; + BitState& operator=(const BitState&) = delete; }; BitState::BitState(Prog* prog) @@ -80,115 +80,115 @@ BitState::BitState(Prog* prog) endmatch_(false), submatch_(NULL), nsubmatch_(0), - njob_(0) { + njob_(0) { } -// Given 
id, which *must* be a list head, we can look up its list ID. -// Then the question is: Should the search visit the (list ID, p) pair? +// Given id, which *must* be a list head, we can look up its list ID. +// Then the question is: Should the search visit the (list ID, p) pair? // If so, remember that it was visited so that the next time, // we don't repeat the visit. bool BitState::ShouldVisit(int id, const char* p) { - int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) + - static_cast<int>(p-text_.data()); - if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1)))) + int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) + + static_cast<int>(p-text_.data()); + if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1)))) return false; - visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1)); + visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1)); return true; } // Grow the stack. -void BitState::GrowStack() { - PODArray<Job> tmp(2*job_.size()); - memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]); - job_ = std::move(tmp); +void BitState::GrowStack() { + PODArray<Job> tmp(2*job_.size()); + memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]); + job_ = std::move(tmp); } -// Push (id, p) onto the stack, growing it if necessary. -void BitState::Push(int id, const char* p) { - if (njob_ >= job_.size()) { - GrowStack(); - if (njob_ >= job_.size()) { - LOG(DFATAL) << "GrowStack() failed: " - << "njob_ = " << njob_ << ", " - << "job_.size() = " << job_.size(); +// Push (id, p) onto the stack, growing it if necessary. +void BitState::Push(int id, const char* p) { + if (njob_ >= job_.size()) { + GrowStack(); + if (njob_ >= job_.size()) { + LOG(DFATAL) << "GrowStack() failed: " + << "njob_ = " << njob_ << ", " + << "job_.size() = " << job_.size(); return; - } + } } - // If id < 0, it's undoing a Capture, - // so we mustn't interfere with that. 
- if (id >= 0 && njob_ > 0) { - Job* top = &job_[njob_-1]; - if (id == top->id && - p == top->p + top->rle + 1 && - top->rle < std::numeric_limits<int>::max()) { - ++top->rle; - return; - } - } - - Job* top = &job_[njob_++]; - top->id = id; - top->rle = 0; - top->p = p; + // If id < 0, it's undoing a Capture, + // so we mustn't interfere with that. + if (id >= 0 && njob_ > 0) { + Job* top = &job_[njob_-1]; + if (id == top->id && + p == top->p + top->rle + 1 && + top->rle < std::numeric_limits<int>::max()) { + ++top->rle; + return; + } + } + + Job* top = &job_[njob_++]; + top->id = id; + top->rle = 0; + top->p = p; } // Try a search from instruction id0 in state p0. // Return whether it succeeded. bool BitState::TrySearch(int id0, const char* p0) { bool matched = false; - const char* end = text_.data() + text_.size(); + const char* end = text_.data() + text_.size(); njob_ = 0; - // Push() no longer checks ShouldVisit(), - // so we must perform the check ourselves. - if (ShouldVisit(id0, p0)) - Push(id0, p0); + // Push() no longer checks ShouldVisit(), + // so we must perform the check ourselves. + if (ShouldVisit(id0, p0)) + Push(id0, p0); while (njob_ > 0) { // Pop job off stack. --njob_; int id = job_[njob_].id; - int& rle = job_[njob_].rle; + int& rle = job_[njob_].rle; const char* p = job_[njob_].p; - if (id < 0) { - // Undo the Capture. - cap_[prog_->inst(-id)->cap()] = p; - continue; - } - - if (rle > 0) { - p += rle; - // Revivify job on stack. - --rle; - ++njob_; + if (id < 0) { + // Undo the Capture. + cap_[prog_->inst(-id)->cap()] = p; + continue; + } + + if (rle > 0) { + p += rle; + // Revivify job on stack. + --rle; + ++njob_; } - Loop: - // Visit id, p. + Loop: + // Visit id, p. 
Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { default: - LOG(DFATAL) << "Unexpected opcode: " << ip->opcode(); + LOG(DFATAL) << "Unexpected opcode: " << ip->opcode(); return false; case kInstFail: - break; + break; case kInstAltMatch: - if (ip->greedy(prog_)) { - // out1 is the Match instruction. - id = ip->out1(); - p = end; - goto Loop; + if (ip->greedy(prog_)) { + // out1 is the Match instruction. + id = ip->out1(); + p = end; + goto Loop; } - if (longest_) { - // ip must be non-greedy... - // out is the Match instruction. - id = ip->out(); - p = end; - goto Loop; - } - goto Next; + if (longest_) { + // ip must be non-greedy... + // out is the Match instruction. + id = ip->out(); + p = end; + goto Loop; + } + goto Next; case kInstByteRange: { int c = -1; @@ -197,50 +197,50 @@ bool BitState::TrySearch(int id0, const char* p0) { if (!ip->Matches(c)) goto Next; - if (ip->hint() != 0) - Push(id+ip->hint(), p); // try the next when we're done + if (ip->hint() != 0) + Push(id+ip->hint(), p); // try the next when we're done id = ip->out(); p++; goto CheckAndLoop; } case kInstCapture: - if (!ip->last()) - Push(id+1, p); // try the next when we're done + if (!ip->last()) + Push(id+1, p); // try the next when we're done - if (0 <= ip->cap() && ip->cap() < cap_.size()) { - // Capture p to register, but save old value first. - Push(-id, cap_[ip->cap()]); // undo when we're done - cap_[ip->cap()] = p; - } + if (0 <= ip->cap() && ip->cap() < cap_.size()) { + // Capture p to register, but save old value first. 
+ Push(-id, cap_[ip->cap()]); // undo when we're done + cap_[ip->cap()] = p; + } - id = ip->out(); - goto CheckAndLoop; + id = ip->out(); + goto CheckAndLoop; case kInstEmptyWidth: if (ip->empty() & ~Prog::EmptyFlags(context_, p)) goto Next; if (!ip->last()) - Push(id+1, p); // try the next when we're done + Push(id+1, p); // try the next when we're done id = ip->out(); goto CheckAndLoop; case kInstNop: if (!ip->last()) - Push(id+1, p); // try the next when we're done + Push(id+1, p); // try the next when we're done id = ip->out(); - CheckAndLoop: - // Sanity check: id is the head of its list, which must - // be the case if id-1 is the last of *its* list. :) - DCHECK(id == 0 || prog_->inst(id-1)->last()); - if (ShouldVisit(id, p)) - goto Loop; - break; - + CheckAndLoop: + // Sanity check: id is the head of its list, which must + // be the case if id-1 is the last of *its* list. :) + DCHECK(id == 0 || prog_->inst(id-1)->last()); + if (ShouldVisit(id, p)) + goto Loop; + break; + case kInstMatch: { - if (endmatch_ && p != end) + if (endmatch_ && p != end) goto Next; // We found a match. If the caller doesn't care @@ -254,7 +254,7 @@ bool BitState::TrySearch(int id0, const char* p0) { matched = true; cap_[1] = p; if (submatch_[0].data() == NULL || - (longest_ && p > submatch_[0].data() + submatch_[0].size())) { + (longest_ && p > submatch_[0].data() + submatch_[0].size())) { for (int i = 0; i < nsubmatch_; i++) submatch_[i] = StringPiece(cap_[2 * i], @@ -266,18 +266,18 @@ bool BitState::TrySearch(int id0, const char* p0) { return true; // If we used the entire text, no longer match is possible. - if (p == end) + if (p == end) return true; // Otherwise, continue on in hope of a longer match. - // Note the absence of the ShouldVisit() check here - // due to execution remaining in the same list. - Next: - if (!ip->last()) { - id++; - goto Loop; - } - break; + // Note the absence of the ShouldVisit() check here + // due to execution remaining in the same list. 
+ Next: + if (!ip->last()) { + id++; + goto Loop; + } + break; } } } @@ -291,7 +291,7 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // Search parameters. text_ = text; context_ = context; - if (context_.data() == NULL) + if (context_.data() == NULL) context_ = text; if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text)) return false; @@ -306,24 +306,24 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, submatch_[i] = StringPiece(); // Allocate scratch space. - int nvisited = prog_->list_count() * static_cast<int>(text.size()+1); - nvisited = (nvisited + kVisitedBits-1) / kVisitedBits; - visited_ = PODArray<uint64_t>(nvisited); - memset(visited_.data(), 0, nvisited*sizeof visited_[0]); + int nvisited = prog_->list_count() * static_cast<int>(text.size()+1); + nvisited = (nvisited + kVisitedBits-1) / kVisitedBits; + visited_ = PODArray<uint64_t>(nvisited); + memset(visited_.data(), 0, nvisited*sizeof visited_[0]); - int ncap = 2*nsubmatch; - if (ncap < 2) - ncap = 2; - cap_ = PODArray<const char*>(ncap); - memset(cap_.data(), 0, ncap*sizeof cap_[0]); + int ncap = 2*nsubmatch; + if (ncap < 2) + ncap = 2; + cap_ = PODArray<const char*>(ncap); + memset(cap_.data(), 0, ncap*sizeof cap_[0]); - // When sizeof(Job) == 16, we start with a nice round 1KiB. :) - job_ = PODArray<Job>(64); + // When sizeof(Job) == 16, we start with a nice round 1KiB. :) + job_ = PODArray<Job>(64); // Anchored search must start at text.begin(). if (anchored_) { - cap_[0] = text.data(); - return TrySearch(prog_->start(), text.data()); + cap_[0] = text.data(); + return TrySearch(prog_->start(), text.data()); } // Unanchored search, starting from each possible text position. 
@@ -332,22 +332,22 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // This looks like it's quadratic in the size of the text, // but we are not clearing visited_ between calls to TrySearch, // so no work is duplicated and it ends up still being linear. - const char* etext = text.data() + text.size(); - for (const char* p = text.data(); p <= etext; p++) { - // Try to use prefix accel (e.g. memchr) to skip ahead. - if (p < etext && prog_->can_prefix_accel()) { - p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p)); + const char* etext = text.data() + text.size(); + for (const char* p = text.data(); p <= etext; p++) { + // Try to use prefix accel (e.g. memchr) to skip ahead. + if (p < etext && prog_->can_prefix_accel()) { + p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p)); if (p == NULL) - p = etext; + p = etext; } cap_[0] = p; if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. return true; - // Avoid invoking undefined behavior (arithmetic on a null pointer) - // by simply not continuing the loop. - if (p == NULL) - break; + // Avoid invoking undefined behavior (arithmetic on a null pointer) + // by simply not continuing the loop. + if (p == NULL) + break; } return false; } diff --git a/contrib/libs/re2/re2/compile.cc b/contrib/libs/re2/re2/compile.cc index 2d836818a0..61d801a630 100644 --- a/contrib/libs/re2/re2/compile.cc +++ b/contrib/libs/re2/re2/compile.cc @@ -15,7 +15,7 @@ #include "util/logging.h" #include "util/utf.h" -#include "re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" @@ -30,60 +30,60 @@ namespace re2 { // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. // // Because the out and out1 fields in Inst are no longer pointers, -// we can't use pointers directly here either. Instead, head refers -// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1). 
-// head == 0 represents the NULL list. This is okay because instruction #0 +// we can't use pointers directly here either. Instead, head refers +// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1). +// head == 0 represents the NULL list. This is okay because instruction #0 // is always the fail instruction, which never appears on a list. struct PatchList { // Returns patch list containing just p. - static PatchList Mk(uint32_t p) { - return {p, p}; - } + static PatchList Mk(uint32_t p) { + return {p, p}; + } - // Patches all the entries on l to have value p. + // Patches all the entries on l to have value p. // Caller must not ever use patch list again. - static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) { - while (l.head != 0) { - Prog::Inst* ip = &inst0[l.head>>1]; - if (l.head&1) { - l.head = ip->out1(); - ip->out1_ = p; - } else { - l.head = ip->out(); - ip->set_out(p); - } + static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) { + while (l.head != 0) { + Prog::Inst* ip = &inst0[l.head>>1]; + if (l.head&1) { + l.head = ip->out1(); + ip->out1_ = p; + } else { + l.head = ip->out(); + ip->set_out(p); + } } } - // Appends two patch lists and returns result. - static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { - if (l1.head == 0) - return l2; - if (l2.head == 0) - return l1; - Prog::Inst* ip = &inst0[l1.tail>>1]; - if (l1.tail&1) - ip->out1_ = l2.head; - else - ip->set_out(l2.head); - return {l1.head, l2.tail}; + // Appends two patch lists and returns result. 
+ static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { + if (l1.head == 0) + return l2; + if (l2.head == 0) + return l1; + Prog::Inst* ip = &inst0[l1.tail>>1]; + if (l1.tail&1) + ip->out1_ = l2.head; + else + ip->set_out(l2.head); + return {l1.head, l2.tail}; } - uint32_t head; - uint32_t tail; // for constant-time append -}; + uint32_t head; + uint32_t tail; // for constant-time append +}; -static const PatchList kNullPatchList = {0, 0}; +static const PatchList kNullPatchList = {0, 0}; // Compiled program fragment. struct Frag { uint32_t begin; PatchList end; - bool nullable; + bool nullable; - Frag() : begin(0), end(kNullPatchList), nullable(false) {} - Frag(uint32_t begin, PatchList end, bool nullable) - : begin(begin), end(end), nullable(nullable) {} + Frag() : begin(0), end(kNullPatchList), nullable(false) {} + Frag(uint32_t begin, PatchList end, bool nullable) + : begin(begin), end(end), nullable(nullable) {} }; // Input encodings. @@ -105,7 +105,7 @@ class Compiler : public Regexp::Walker<Frag> { // Compiles alternation of all the re to a new Prog. // Each re has a match with an id equal to its index in the vector. - static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); + static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); // Interface for Regexp::Walker, which helps traverse the Regexp. // The walk is purely post-recursive: given the machines for the @@ -180,8 +180,8 @@ class Compiler : public Regexp::Walker<Frag> { int AddSuffixRecursive(int root, int id); // Finds the trie node for the given suffix. Returns a Frag in order to - // distinguish between pointing at the root node directly (end.head == 0) - // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively). + // distinguish between pointing at the root node directly (end.head == 0) + // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively). 
Frag FindByteRange(int root, int id); // Compares two ByteRanges and returns true iff they are equal. @@ -193,8 +193,8 @@ class Compiler : public Regexp::Walker<Frag> { // Single rune. Frag Literal(Rune r, bool foldcase); - void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); - Prog* Finish(Regexp* re); + void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); + Prog* Finish(Regexp* re); // Returns .* where dot = any byte Frag DotStar(); @@ -205,9 +205,9 @@ class Compiler : public Regexp::Walker<Frag> { Encoding encoding_; // Input encoding bool reversed_; // Should program run backward over text? - PODArray<Prog::Inst> inst_; - int ninst_; // Number of instructions used. - int max_ninst_; // Maximum number of instructions. + PODArray<Prog::Inst> inst_; + int ninst_; // Number of instructions used. + int max_ninst_; // Maximum number of instructions. int64_t max_mem_; // Total memory budget. @@ -225,12 +225,12 @@ Compiler::Compiler() { failed_ = false; encoding_ = kEncodingUTF8; reversed_ = false; - ninst_ = 0; - max_ninst_ = 1; // make AllocInst for fail instruction okay + ninst_ = 0; + max_ninst_ = 1; // make AllocInst for fail instruction okay max_mem_ = 0; int fail = AllocInst(1); inst_[fail].InitFail(); - max_ninst_ = 0; // Caller must change + max_ninst_ = 0; // Caller must change } Compiler::~Compiler() { @@ -238,25 +238,25 @@ Compiler::~Compiler() { } int Compiler::AllocInst(int n) { - if (failed_ || ninst_ + n > max_ninst_) { + if (failed_ || ninst_ + n > max_ninst_) { failed_ = true; return -1; } - if (ninst_ + n > inst_.size()) { - int cap = inst_.size(); - if (cap == 0) - cap = 8; - while (ninst_ + n > cap) - cap *= 2; - PODArray<Prog::Inst> inst(cap); - if (inst_.data() != NULL) - memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]); - memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]); - inst_ = std::move(inst); + if (ninst_ + n > inst_.size()) { + int cap = inst_.size(); + if (cap == 0) + 
cap = 8; + while (ninst_ + n > cap) + cap *= 2; + PODArray<Prog::Inst> inst(cap); + if (inst_.data() != NULL) + memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]); + memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]); + inst_ = std::move(inst); } - int id = ninst_; - ninst_ += n; + int id = ninst_; + ninst_ += n; return id; } @@ -266,7 +266,7 @@ int Compiler::AllocInst(int n) { // Returns an unmatchable fragment. Frag Compiler::NoMatch() { - return Frag(); + return Frag(); } // Is a an unmatchable fragment? @@ -282,21 +282,21 @@ Frag Compiler::Cat(Frag a, Frag b) { // Elide no-op. Prog::Inst* begin = &inst_[a.begin]; if (begin->opcode() == kInstNop && - a.end.head == (a.begin << 1) && + a.end.head == (a.begin << 1) && begin->out() == 0) { - // in case refs to a somewhere - PatchList::Patch(inst_.data(), a.end, b.begin); + // in case refs to a somewhere + PatchList::Patch(inst_.data(), a.end, b.begin); return b; } // To run backward over string, reverse all concatenations. if (reversed_) { - PatchList::Patch(inst_.data(), b.end, a.begin); - return Frag(b.begin, a.end, b.nullable && a.nullable); + PatchList::Patch(inst_.data(), b.end, a.begin); + return Frag(b.begin, a.end, b.nullable && a.nullable); } - PatchList::Patch(inst_.data(), a.end, b.begin); - return Frag(a.begin, b.end, a.nullable && b.nullable); + PatchList::Patch(inst_.data(), a.end, b.begin); + return Frag(a.begin, b.end, a.nullable && b.nullable); } // Given fragments for a and b, returns fragment for a|b. @@ -312,8 +312,8 @@ Frag Compiler::Alt(Frag a, Frag b) { return NoMatch(); inst_[id].InitAlt(a.begin, b.begin); - return Frag(id, PatchList::Append(inst_.data(), a.end, b.end), - a.nullable || b.nullable); + return Frag(id, PatchList::Append(inst_.data(), a.end, b.end), + a.nullable || b.nullable); } // When capturing submatches in like-Perl mode, a kOpAlt Inst @@ -323,44 +323,44 @@ Frag Compiler::Alt(Frag a, Frag b) { // then the operator is greedy. 
If out1_ is the repetition // (and out_ moves forward), then the operator is non-greedy. -// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) -Frag Compiler::Plus(Frag a, bool nongreedy) { +// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) +Frag Compiler::Plus(Frag a, bool nongreedy) { int id = AllocInst(1); if (id < 0) return NoMatch(); - PatchList pl; + PatchList pl; if (nongreedy) { - inst_[id].InitAlt(0, a.begin); - pl = PatchList::Mk(id << 1); + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); } else { - inst_[id].InitAlt(a.begin, 0); - pl = PatchList::Mk((id << 1) | 1); + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); } - PatchList::Patch(inst_.data(), a.end, id); - return Frag(a.begin, pl, a.nullable); + PatchList::Patch(inst_.data(), a.end, id); + return Frag(a.begin, pl, a.nullable); } -// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy) -Frag Compiler::Star(Frag a, bool nongreedy) { - // When the subexpression is nullable, one Alt isn't enough to guarantee - // correct priority ordering within the transitive closure. The simplest - // solution is to handle it as (a+)? instead, which adds the second Alt. - if (a.nullable) - return Quest(Plus(a, nongreedy), nongreedy); - - int id = AllocInst(1); - if (id < 0) - return NoMatch(); - PatchList pl; - if (nongreedy) { - inst_[id].InitAlt(0, a.begin); - pl = PatchList::Mk(id << 1); - } else { - inst_[id].InitAlt(a.begin, 0); - pl = PatchList::Mk((id << 1) | 1); - } - PatchList::Patch(inst_.data(), a.end, id); - return Frag(id, pl, true); +// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy) +Frag Compiler::Star(Frag a, bool nongreedy) { + // When the subexpression is nullable, one Alt isn't enough to guarantee + // correct priority ordering within the transitive closure. The simplest + // solution is to handle it as (a+)? instead, which adds the second Alt. 
+ if (a.nullable) + return Quest(Plus(a, nongreedy), nongreedy); + + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + PatchList pl; + if (nongreedy) { + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); + } else { + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); + } + PatchList::Patch(inst_.data(), a.end, id); + return Frag(id, pl, true); } // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) @@ -378,7 +378,7 @@ Frag Compiler::Quest(Frag a, bool nongreedy) { inst_[id].InitAlt(a.begin, 0); pl = PatchList::Mk((id << 1) | 1); } - return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true); + return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true); } // Returns a fragment for the byte range lo-hi. @@ -387,7 +387,7 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { if (id < 0) return NoMatch(); inst_[id].InitByteRange(lo, hi, foldcase, 0); - return Frag(id, PatchList::Mk(id << 1), false); + return Frag(id, PatchList::Mk(id << 1), false); } // Returns a no-op fragment. Sometimes unavoidable. @@ -396,7 +396,7 @@ Frag Compiler::Nop() { if (id < 0) return NoMatch(); inst_[id].InitNop(0); - return Frag(id, PatchList::Mk(id << 1), true); + return Frag(id, PatchList::Mk(id << 1), true); } // Returns a fragment that signals a match. @@ -405,7 +405,7 @@ Frag Compiler::Match(int32_t match_id) { if (id < 0) return NoMatch(); inst_[id].InitMatch(match_id); - return Frag(id, kNullPatchList, false); + return Frag(id, kNullPatchList, false); } // Returns a fragment matching a particular empty-width op (like ^ or $) @@ -414,7 +414,7 @@ Frag Compiler::EmptyWidth(EmptyOp empty) { if (id < 0) return NoMatch(); inst_[id].InitEmptyWidth(empty, 0); - return Frag(id, PatchList::Mk(id << 1), true); + return Frag(id, PatchList::Mk(id << 1), true); } // Given a fragment a, returns a fragment with capturing parens around a. 
@@ -426,9 +426,9 @@ Frag Compiler::Capture(Frag a, int n) { return NoMatch(); inst_[id].InitCapture(2*n, a.begin); inst_[id+1].InitCapture(2*n+1, 0); - PatchList::Patch(inst_.data(), a.end, id+1); + PatchList::Patch(inst_.data(), a.end, id+1); - return Frag(id, PatchList::Mk((id+1) << 1), a.nullable); + return Frag(id, PatchList::Mk((id+1) << 1), a.nullable); } // A Rune is a name for a Unicode code point. @@ -453,16 +453,16 @@ static int MaxRune(int len) { void Compiler::BeginRange() { rune_cache_.clear(); rune_range_.begin = 0; - rune_range_.end = kNullPatchList; + rune_range_.end = kNullPatchList; } int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) { Frag f = ByteRange(lo, hi, foldcase); if (next != 0) { - PatchList::Patch(inst_.data(), f.end, next); + PatchList::Patch(inst_.data(), f.end, next); } else { - rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end); + rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end); } return f.begin; } @@ -534,9 +534,9 @@ int Compiler::AddSuffixRecursive(int root, int id) { } int br; - if (f.end.head == 0) + if (f.end.head == 0) br = root; - else if (f.end.head&1) + else if (f.end.head&1) br = inst_[f.begin].out1(); else br = inst_[f.begin].out(); @@ -552,9 +552,9 @@ int Compiler::AddSuffixRecursive(int root, int id) { // Ensure that the parent points to the clone, not to the original. // Note that this could leave the head unreachable except via the cache. br = byterange; - if (f.end.head == 0) + if (f.end.head == 0) root = br; - else if (f.end.head&1) + else if (f.end.head&1) inst_[f.begin].out1_ = br; else inst_[f.begin].set_out(br); @@ -564,10 +564,10 @@ int Compiler::AddSuffixRecursive(int root, int id) { if (!IsCachedRuneByteSuffix(id)) { // The head should be the instruction most recently allocated, so free it // instead of leaving it unreachable. 
- DCHECK_EQ(id, ninst_-1); + DCHECK_EQ(id, ninst_-1); inst_[id].out_opcode_ = 0; inst_[id].out1_ = 0; - ninst_--; + ninst_--; } out = AddSuffixRecursive(inst_[br].out(), out); @@ -587,7 +587,7 @@ bool Compiler::ByteRangeEqual(int id1, int id2) { Frag Compiler::FindByteRange(int root, int id) { if (inst_[root].opcode() == kInstByteRange) { if (ByteRangeEqual(root, id)) - return Frag(root, kNullPatchList, false); + return Frag(root, kNullPatchList, false); else return NoMatch(); } @@ -595,7 +595,7 @@ Frag Compiler::FindByteRange(int root, int id) { while (inst_[root].opcode() == kInstAlt) { int out1 = inst_[root].out1(); if (ByteRangeEqual(out1, id)) - return Frag(root, PatchList::Mk((root << 1) | 1), false); + return Frag(root, PatchList::Mk((root << 1) | 1), false); // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't // what we're looking for, then we can stop immediately. Unfortunately, we @@ -607,7 +607,7 @@ Frag Compiler::FindByteRange(int root, int id) { if (inst_[out].opcode() == kInstAlt) root = out; else if (ByteRangeEqual(out, id)) - return Frag(root, PatchList::Mk(root << 1), false); + return Frag(root, PatchList::Mk(root << 1), false); else return NoMatch(); } @@ -648,43 +648,43 @@ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { static_cast<uint8_t>(hi), foldcase, 0)); } -void Compiler::Add_80_10ffff() { - // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough - // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by - // permitting overlong encodings in E0 and F0 sequences and code points - // over 10FFFF in F4 sequences, the size of the bytecode and the number - // of equivalence classes are reduced significantly. - int id; - if (reversed_) { - // Prefix factoring matters, but we don't have to handle it here - // because the rune range trie logic takes care of that already. 
- id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0); - id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); - AddSuffix(id); - - id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0); - id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); - id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); - AddSuffix(id); - - id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0); - id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); - id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); - id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); - AddSuffix(id); - } else { - // Suffix factoring matters - and we do have to handle it here. - int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0); - id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1); - AddSuffix(id); - - int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1); - id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2); - AddSuffix(id); - - int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2); - id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3); - AddSuffix(id); +void Compiler::Add_80_10ffff() { + // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough + // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by + // permitting overlong encodings in E0 and F0 sequences and code points + // over 10FFFF in F4 sequences, the size of the bytecode and the number + // of equivalence classes are reduced significantly. + int id; + if (reversed_) { + // Prefix factoring matters, but we don't have to handle it here + // because the rune range trie logic takes care of that already. 
+ id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + + id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + + id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + } else { + // Suffix factoring matters - and we do have to handle it here. + int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0); + id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1); + AddSuffix(id); + + int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1); + id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2); + AddSuffix(id); + + int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2); + id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3); + AddSuffix(id); } } @@ -692,8 +692,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { if (lo > hi) return; - // Pick off 80-10FFFF as a common special case. - if (lo == 0x80 && hi == 0x10ffff) { + // Pick off 80-10FFFF as a common special case. + if (lo == 0x80 && hi == 0x10ffff) { Add_80_10ffff(); return; } @@ -854,11 +854,11 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, case kRegexpHaveMatch: { Frag f = Match(re->match_id()); - if (anchor_ == RE2::ANCHOR_BOTH) { - // Append \z or else the subexpression will effectively be unanchored. - // Complemented by the UNANCHORED case in CompileSet(). - f = Cat(EmptyWidth(kEmptyEndText), f); - } + if (anchor_ == RE2::ANCHOR_BOTH) { + // Append \z or else the subexpression will effectively be unanchored. + // Complemented by the UNANCHORED case in CompileSet(). 
+ f = Cat(EmptyWidth(kEmptyEndText), f); + } return f; } @@ -998,11 +998,11 @@ static bool IsAnchorStart(Regexp** pre, int depth) { if (re->nsub() > 0) { sub = re->sub()[0]->Incref(); if (IsAnchorStart(&sub, depth+1)) { - PODArray<Regexp*> subcopy(re->nsub()); + PODArray<Regexp*> subcopy(re->nsub()); subcopy[0] = sub; // already have reference for (int i = 1; i < re->nsub(); i++) subcopy[i] = re->sub()[i]->Incref(); - *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); + *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); re->Decref(); return true; } @@ -1045,11 +1045,11 @@ static bool IsAnchorEnd(Regexp** pre, int depth) { if (re->nsub() > 0) { sub = re->sub()[re->nsub() - 1]->Incref(); if (IsAnchorEnd(&sub, depth+1)) { - PODArray<Regexp*> subcopy(re->nsub()); + PODArray<Regexp*> subcopy(re->nsub()); subcopy[re->nsub() - 1] = sub; // already have reference for (int i = 0; i < re->nsub() - 1; i++) subcopy[i] = re->sub()[i]->Incref(); - *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); + *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); re->Decref(); return true; } @@ -1079,15 +1079,15 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, encoding_ = kEncodingLatin1; max_mem_ = max_mem; if (max_mem <= 0) { - max_ninst_ = 100000; // more than enough + max_ninst_ = 100000; // more than enough } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) { // No room for anything. - max_ninst_ = 0; + max_ninst_ = 0; } else { int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); // Limit instruction count so that inst->id() fits nicely in an int. // SparseArray also assumes that the indices (inst->id()) are ints. - // The call to WalkExponential uses 2*max_ninst_ below, + // The call to WalkExponential uses 2*max_ninst_ below, // and other places in the code use 2 or 3 * prog->size(). // Limiting to 2^24 should avoid overflow in those places. 
// (The point of allowing more than 32 bits of memory is to @@ -1098,7 +1098,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, // Inst imposes its own limit (currently bigger than 2^24 but be safe). if (m > Prog::Inst::kMaxInst) m = Prog::Inst::kMaxInst; - max_ninst_ = static_cast<int>(m); + max_ninst_ = static_cast<int>(m); } anchor_ = anchor; } @@ -1110,7 +1110,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, // The reversed flag is also recorded in the returned program. Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { Compiler c; - c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */); + c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */); c.reversed_ = reversed; // Simplify to remove things like counted repetitions @@ -1125,7 +1125,7 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { bool is_anchor_end = IsAnchorEnd(&sre, 0); // Generate fragment for entire regexp. - Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); + Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); sre->Decref(); if (c.failed_) return NULL; @@ -1134,10 +1134,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { // Turn off c.reversed_ (if it is set) to force the remaining concatenations // to behave normally. c.reversed_ = false; - all = c.Cat(all, c.Match(0)); + all = c.Cat(all, c.Match(0)); - c.prog_->set_reversed(reversed); - if (c.prog_->reversed()) { + c.prog_->set_reversed(reversed); + if (c.prog_->reversed()) { c.prog_->set_anchor_start(is_anchor_end); c.prog_->set_anchor_end(is_anchor_start); } else { @@ -1145,49 +1145,49 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { c.prog_->set_anchor_end(is_anchor_end); } - c.prog_->set_start(all.begin); - if (!c.prog_->anchor_start()) { - // Also create unanchored version, which starts with a .*? loop. 
- all = c.Cat(c.DotStar(), all); + c.prog_->set_start(all.begin); + if (!c.prog_->anchor_start()) { + // Also create unanchored version, which starts with a .*? loop. + all = c.Cat(c.DotStar(), all); } - c.prog_->set_start_unanchored(all.begin); + c.prog_->set_start_unanchored(all.begin); // Hand ownership of prog_ to caller. - return c.Finish(re); + return c.Finish(re); } -Prog* Compiler::Finish(Regexp* re) { +Prog* Compiler::Finish(Regexp* re) { if (failed_) return NULL; if (prog_->start() == 0 && prog_->start_unanchored() == 0) { // No possible matches; keep Fail instruction only. - ninst_ = 1; + ninst_ = 1; } // Hand off the array to Prog. - prog_->inst_ = std::move(inst_); - prog_->size_ = ninst_; + prog_->inst_ = std::move(inst_); + prog_->size_ = ninst_; prog_->Optimize(); prog_->Flatten(); prog_->ComputeByteMap(); - if (!prog_->reversed()) { - std::string prefix; - bool prefix_foldcase; - if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) - prog_->ConfigurePrefixAccel(prefix, prefix_foldcase); - } - + if (!prog_->reversed()) { + std::string prefix; + bool prefix_foldcase; + if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) + prog_->ConfigurePrefixAccel(prefix, prefix_foldcase); + } + // Record remaining memory for DFA. if (max_mem_ <= 0) { prog_->set_dfa_mem(1<<20); } else { - int64_t m = max_mem_ - sizeof(Prog); - m -= prog_->size_*sizeof(Prog::Inst); // account for inst_ - if (prog_->CanBitState()) - m -= prog_->size_*sizeof(uint16_t); // account for list_heads_ + int64_t m = max_mem_ - sizeof(Prog); + m -= prog_->size_*sizeof(Prog::Inst); // account for inst_ + if (prog_->CanBitState()) + m -= prog_->size_*sizeof(uint16_t); // account for list_heads_ if (m < 0) m = 0; prog_->set_dfa_mem(m); @@ -1212,31 +1212,31 @@ Frag Compiler::DotStar() { } // Compiles RE set to Prog. 
-Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { +Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { Compiler c; - c.Setup(re->parse_flags(), max_mem, anchor); + c.Setup(re->parse_flags(), max_mem, anchor); - Regexp* sre = re->Simplify(); - if (sre == NULL) - return NULL; + Regexp* sre = re->Simplify(); + if (sre == NULL) + return NULL; - Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); - sre->Decref(); + Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); + sre->Decref(); if (c.failed_) return NULL; - c.prog_->set_anchor_start(true); - c.prog_->set_anchor_end(true); - + c.prog_->set_anchor_start(true); + c.prog_->set_anchor_end(true); + if (anchor == RE2::UNANCHORED) { - // Prepend .* or else the expression will effectively be anchored. - // Complemented by the ANCHOR_BOTH case in PostVisit(). + // Prepend .* or else the expression will effectively be anchored. + // Complemented by the ANCHOR_BOTH case in PostVisit(). 
all = c.Cat(c.DotStar(), all); } c.prog_->set_start(all.begin); c.prog_->set_start_unanchored(all.begin); - Prog* prog = c.Finish(re); + Prog* prog = c.Finish(re); if (prog == NULL) return NULL; @@ -1254,8 +1254,8 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { return prog; } -Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { - return Compiler::CompileSet(re, anchor, max_mem); +Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { + return Compiler::CompileSet(re, anchor, max_mem); } } // namespace re2 diff --git a/contrib/libs/re2/re2/dfa.cc b/contrib/libs/re2/re2/dfa.cc index 0c5ba373db..d47c7d50a7 100644 --- a/contrib/libs/re2/re2/dfa.cc +++ b/contrib/libs/re2/re2/dfa.cc @@ -27,11 +27,11 @@ #include <string.h> #include <algorithm> #include <atomic> -#include <deque> +#include <deque> #include <mutex> #include <new> #include <string> -#include <unordered_map> +#include <unordered_map> #include <unordered_set> #include <utility> #include <vector> @@ -40,10 +40,10 @@ #include "util/mix.h" #include "util/mutex.h" #include "util/strutil.h" -#include "re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/prog.h" -#include "re2/re2.h" -#include "re2/sparse_set.h" +#include "re2/re2.h" +#include "re2/sparse_set.h" #include "re2/stringpiece.h" // Silence "zero-sized array in struct/union" warning for DFA::State::next_. @@ -56,10 +56,10 @@ namespace re2 { // Controls whether the DFA should bail out early if the NFA would be faster. static bool dfa_should_bail_when_slow = true; -void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) { - dfa_should_bail_when_slow = b; -} - +void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) { + dfa_should_bail_when_slow = b; +} + // Changing this to true compiles in prints that trace execution of the DFA. // Generates a lot of output -- only useful for debugging. 
static const bool ExtraDebug = false; @@ -90,17 +90,17 @@ class DFA { // memory), it sets *failed and returns false. bool Search(const StringPiece& text, const StringPiece& context, bool anchored, bool want_earliest_match, bool run_forward, - bool* failed, const char** ep, SparseSet* matches); + bool* failed, const char** ep, SparseSet* matches); - // Builds out all states for the entire DFA. - // If cb is not empty, it receives one callback per state built. - // Returns the number of states built. - // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. - int BuildAllStates(const Prog::DFAStateCallback& cb); + // Builds out all states for the entire DFA. + // If cb is not empty, it receives one callback per state built. + // Returns the number of states built. + // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. + int BuildAllStates(const Prog::DFAStateCallback& cb); // Computes min and max for matching strings. Won't return strings // bigger than maxlen. - bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); + bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); // These data structures are logically private, but C++ makes it too // difficult to mark them as such. @@ -120,7 +120,7 @@ class DFA { // into this state, along with kFlagMatch if this // is a matching state. -// Work around the bug affecting flexible array members in GCC 6.x (for x >= 1). +// Work around the bug affecting flexible array members in GCC 6.x (for x >= 1). 
// (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932) #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1 std::atomic<State*> next_[0]; // Outgoing arrows from State, @@ -134,9 +134,9 @@ class DFA { enum { kByteEndText = 256, // imaginary byte at end of text - kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags - kFlagMatch = 0x0100, // State.flag_: this is a matching state - kFlagLastWord = 0x0200, // State.flag_: last byte was a word char + kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags + kFlagMatch = 0x0100, // State.flag_: this is a matching state + kFlagLastWord = 0x0200, // State.flag_: last byte was a word char kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left }; @@ -171,9 +171,9 @@ class DFA { typedef std::unordered_set<State*, StateHash, StateEqual> StateSet; private: - // Make it easier to swap in a scalable reader-writer mutex. - using CacheMutex = Mutex; - + // Make it easier to swap in a scalable reader-writer mutex. + using CacheMutex = Mutex; + enum { // Indices into start_ for unanchored searches. // Add kStartAnchored for anchored searches. @@ -196,7 +196,7 @@ class DFA { // Looks up and returns the State corresponding to a Workq. // L >= mutex_ - State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag); + State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag); // Looks up and returns a State matching the inst, ninst, and flag. // L >= mutex_ @@ -219,7 +219,7 @@ class DFA { // sets *ismatch to true. // L >= mutex_ void RunWorkqOnByte(Workq* q, Workq* nq, - int c, uint32_t flag, bool* ismatch); + int c, uint32_t flag, bool* ismatch); // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. // L >= mutex_ @@ -231,38 +231,38 @@ class DFA { void AddToQueue(Workq* q, int id, uint32_t flag); // For debugging, returns a text representation of State. 
- static std::string DumpState(State* state); + static std::string DumpState(State* state); // For debugging, returns a text representation of a Workq. - static std::string DumpWorkq(Workq* q); + static std::string DumpWorkq(Workq* q); // Search parameters struct SearchParams { SearchParams(const StringPiece& text, const StringPiece& context, RWLocker* cache_lock) - : text(text), - context(context), + : text(text), + context(context), anchored(false), - can_prefix_accel(false), + can_prefix_accel(false), want_earliest_match(false), run_forward(false), start(NULL), cache_lock(cache_lock), failed(false), ep(NULL), - matches(NULL) {} + matches(NULL) {} StringPiece text; StringPiece context; bool anchored; - bool can_prefix_accel; + bool can_prefix_accel; bool want_earliest_match; bool run_forward; State* start; - RWLocker* cache_lock; + RWLocker* cache_lock; bool failed; // "out" parameter: whether search gave up const char* ep; // "out" parameter: end pointer for match - SparseSet* matches; + SparseSet* matches; private: SearchParams(const SearchParams&) = delete; @@ -270,13 +270,13 @@ class DFA { }; // Before each search, the parameters to Search are analyzed by - // AnalyzeSearch to determine the state in which to start. + // AnalyzeSearch to determine the state in which to start. struct StartInfo { - StartInfo() : start(NULL) {} - std::atomic<State*> start; + StartInfo() : start(NULL) {} + std::atomic<State*> start; }; - // Fills in params->start and params->can_prefix_accel using + // Fills in params->start and params->can_prefix_accel using // the other search parameters. Returns true on success, // false on failure. // cache_mutex_.r <= L < mutex_ @@ -287,10 +287,10 @@ class DFA { // The generic search loop, inlined to create specialized versions. // cache_mutex_.r <= L < mutex_ // Might unlock and relock cache_mutex_ via params->cache_lock. 
- template <bool can_prefix_accel, - bool want_earliest_match, - bool run_forward> - inline bool InlinedSearchLoop(SearchParams* params); + template <bool can_prefix_accel, + bool want_earliest_match, + bool run_forward> + inline bool InlinedSearchLoop(SearchParams* params); // The specialized versions of InlinedSearchLoop. The three letters // at the ends of the name denote the true/false values used as the @@ -330,7 +330,7 @@ class DFA { // Scratch areas, protected by mutex_. Workq* q0_; // Two pre-allocated work queues. Workq* q1_; - PODArray<int> stack_; // Pre-allocated stack for AddToQueue + PODArray<int> stack_; // Pre-allocated stack for AddToQueue // State* cache. Many threads use and add to the cache simultaneously, // holding cache_mutex_ for reading and mutex_ (above) when adding. @@ -338,14 +338,14 @@ class DFA { // while holding cache_mutex_ for writing, to avoid interrupting other // readers. Any State* pointers are only valid while cache_mutex_ // is held. - CacheMutex cache_mutex_; + CacheMutex cache_mutex_; int64_t mem_budget_; // Total memory budget for all States. int64_t state_budget_; // Amount of memory remaining for new States. StateSet state_cache_; // All States computed so far. StartInfo start_[kMaxStart]; - - DFA(const DFA&) = delete; - DFA& operator=(const DFA&) = delete; + + DFA(const DFA&) = delete; + DFA& operator=(const DFA&) = delete; }; // Shorthand for casting to uint8_t*. @@ -359,10 +359,10 @@ static inline const uint8_t* BytePtr(const void* v) { // in the work queue when in leftmost-longest matching mode. #define Mark (-1) -// Separates the match IDs from the instructions in inst_. -// Used only for "many match" DFA states. -#define MatchSep (-2) - +// Separates the match IDs from the instructions in inst_. +// Used only for "many match" DFA states. +#define MatchSep (-2) + // Internally, the DFA uses a sparse array of // program instruction pointers as a work queue. 
// In leftmost longest mode, marks separate sections @@ -428,21 +428,21 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem) q1_(NULL), mem_budget_(max_mem) { if (ExtraDebug) - fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str()); + fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str()); int nmark = 0; if (kind_ == Prog::kLongestMatch) nmark = prog_->size(); // See DFA::AddToQueue() for why this is so. - int nstack = prog_->inst_count(kInstCapture) + - prog_->inst_count(kInstEmptyWidth) + - prog_->inst_count(kInstNop) + - nmark + 1; // + 1 for start inst + int nstack = prog_->inst_count(kInstCapture) + + prog_->inst_count(kInstEmptyWidth) + + prog_->inst_count(kInstNop) + + nmark + 1; // + 1 for start inst - // Account for space needed for DFA, q0, q1, stack. + // Account for space needed for DFA, q0, q1, stack. mem_budget_ -= sizeof(DFA); mem_budget_ -= (prog_->size() + nmark) * (sizeof(int)+sizeof(int)) * 2; // q0, q1 - mem_budget_ -= nstack * sizeof(int); // stack + mem_budget_ -= nstack * sizeof(int); // stack if (mem_budget_ < 0) { init_failed_ = true; return; @@ -466,7 +466,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem) q0_ = new Workq(prog_->size(), nmark); q1_ = new Workq(prog_->size(), nmark); - stack_ = PODArray<int>(nstack); + stack_ = PODArray<int>(nstack); } DFA::~DFA() { @@ -490,15 +490,15 @@ DFA::~DFA() { // Debugging printouts // For debugging, returns a string representation of the work queue. 
-std::string DFA::DumpWorkq(Workq* q) { - std::string s; +std::string DFA::DumpWorkq(Workq* q) { + std::string s; const char* sep = ""; - for (Workq::iterator it = q->begin(); it != q->end(); ++it) { + for (Workq::iterator it = q->begin(); it != q->end(); ++it) { if (q->is_mark(*it)) { - s += "|"; + s += "|"; sep = ""; } else { - s += StringPrintf("%s%d", sep, *it); + s += StringPrintf("%s%d", sep, *it); sep = ","; } } @@ -506,29 +506,29 @@ std::string DFA::DumpWorkq(Workq* q) { } // For debugging, returns a string representation of the state. -std::string DFA::DumpState(State* state) { +std::string DFA::DumpState(State* state) { if (state == NULL) return "_"; if (state == DeadState) return "X"; if (state == FullMatchState) return "*"; - std::string s; + std::string s; const char* sep = ""; - s += StringPrintf("(%p)", state); + s += StringPrintf("(%p)", state); for (int i = 0; i < state->ninst_; i++) { if (state->inst_[i] == Mark) { - s += "|"; + s += "|"; + sep = ""; + } else if (state->inst_[i] == MatchSep) { + s += "||"; sep = ""; - } else if (state->inst_[i] == MatchSep) { - s += "||"; - sep = ""; } else { - s += StringPrintf("%s%d", sep, state->inst_[i]); + s += StringPrintf("%s%d", sep, state->inst_[i]); sep = ","; } } - s += StringPrintf(" flag=%#x", state->flag_); + s += StringPrintf(" flag=%#x", state->flag_); return s; } @@ -590,16 +590,16 @@ std::string DFA::DumpState(State* state) { // Looks in the State cache for a State matching q, flag. // If one is found, returns it. If one is not found, allocates one, // inserts it in the cache, and returns it. -// If mq is not null, MatchSep and the match IDs in mq will be appended -// to the State. -DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { +// If mq is not null, MatchSep and the match IDs in mq will be appended +// to the State. +DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { //mutex_.AssertHeld(); // Construct array of instruction ids for the new state. 
// Only ByteRange, EmptyWidth, and Match instructions are useful to keep: // those are the only operators with any effect in // RunWorkqOnEmptyString or RunWorkqOnByte. - PODArray<int> inst(q->size()); + PODArray<int> inst(q->size()); int n = 0; uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions bool sawmatch = false; // whether queue contains guaranteed kInstMatch @@ -684,7 +684,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // unordered state sets separated by Marks. Sort each set // to canonicalize, to reduce the number of distinct sets stored. if (kind_ == Prog::kLongestMatch) { - int* ip = inst.data(); + int* ip = inst.data(); int* ep = ip + n; while (ip < ep) { int* markp = ip; @@ -697,30 +697,30 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { } } - // If we're in many match mode, canonicalize for similar reasons: - // we have an unordered set of states (i.e. we don't have Marks) - // and sorting will reduce the number of distinct sets stored. - if (kind_ == Prog::kManyMatch) { - int* ip = inst.data(); - int* ep = ip + n; - std::sort(ip, ep); - } - - // Append MatchSep and the match IDs in mq if necessary. - if (mq != NULL) { - inst[n++] = MatchSep; - for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) { - int id = *i; - Prog::Inst* ip = prog_->inst(id); - if (ip->opcode() == kInstMatch) - inst[n++] = ip->match_id(); - } - } - + // If we're in many match mode, canonicalize for similar reasons: + // we have an unordered set of states (i.e. we don't have Marks) + // and sorting will reduce the number of distinct sets stored. + if (kind_ == Prog::kManyMatch) { + int* ip = inst.data(); + int* ep = ip + n; + std::sort(ip, ep); + } + + // Append MatchSep and the match IDs in mq if necessary. 
+ if (mq != NULL) { + inst[n++] = MatchSep; + for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) { + int id = *i; + Prog::Inst* ip = prog_->inst(id); + if (ip->opcode() == kInstMatch) + inst[n++] = ip->match_id(); + } + } + // Save the needed empty-width flags in the top bits for use later. flag |= needflags << kFlagNeedShift; - State* state = CachedState(inst.data(), n, flag); + State* state = CachedState(inst.data(), n, flag); return state; } @@ -759,7 +759,7 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) { mem_budget_ -= mem + kStateCacheOverhead; // Allocate new state along with room for next_ and inst_. - char* space = std::allocator<char>().allocate(mem); + char* space = std::allocator<char>().allocate(mem); State* s = new (space) State; (void) new (s->next_) std::atomic<State*>[nnext]; // Work around a unfortunate bug in older versions of libstdc++. @@ -786,12 +786,12 @@ void DFA::ClearCache() { StateSet::iterator tmp = begin; ++begin; // Deallocate the blob of memory that we allocated in DFA::CachedState(). - // We recompute mem in order to benefit from sized delete where possible. - int ninst = (*tmp)->ninst_; - int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot - int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) + - ninst*sizeof(int); - std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem); + // We recompute mem in order to benefit from sized delete where possible. 
+ int ninst = (*tmp)->ninst_; + int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot + int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) + + ninst*sizeof(int); + std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem); } state_cache_.clear(); } @@ -800,22 +800,22 @@ void DFA::ClearCache() { void DFA::StateToWorkq(State* s, Workq* q) { q->clear(); for (int i = 0; i < s->ninst_; i++) { - if (s->inst_[i] == Mark) { + if (s->inst_[i] == Mark) { q->mark(); - } else if (s->inst_[i] == MatchSep) { - // Nothing after this is an instruction! - break; - } else { + } else if (s->inst_[i] == MatchSep) { + // Nothing after this is an instruction! + break; + } else { // Explore from the head of the list. AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask); - } + } } } // Adds ip to the work queue, following empty arrows according to flag. void DFA::AddToQueue(Workq* q, int id, uint32_t flag) { - // Use stack_ to hold our stack of instructions yet to process. + // Use stack_ to hold our stack of instructions yet to process. // It was preallocated as follows: // one entry per Capture; // one entry per EmptyWidth; and @@ -824,12 +824,12 @@ void DFA::AddToQueue(Workq* q, int id, uint32_t flag) { // perform. (Each instruction can be processed at most once.) // When using marks, we also added nmark == prog_->size(). // (Otherwise, nmark == 0.) - int* stk = stack_.data(); + int* stk = stack_.data(); int nstk = 0; stk[nstk++] = id; while (nstk > 0) { - DCHECK_LE(nstk, stack_.size()); + DCHECK_LE(nstk, stack_.size()); id = stk[--nstk]; Loop: @@ -928,7 +928,7 @@ void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) { // means to match c$. Sets the bool *ismatch to true if the end of the // regular expression program has been reached (the regexp has matched). 
void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, - int c, uint32_t flag, bool* ismatch) { + int c, uint32_t flag, bool* ismatch) { //mutex_.AssertHeld(); newq->clear(); @@ -954,29 +954,29 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, break; case kInstByteRange: // can follow if c is in range - if (!ip->Matches(c)) - break; - AddToQueue(newq, ip->out(), flag); - if (ip->hint() != 0) { - // We have a hint, but we must cancel out the - // increment that will occur after the break. - i += ip->hint() - 1; - } else { - // We have no hint, so we must find the end - // of the current list and then skip to it. - Prog::Inst* ip0 = ip; - while (!ip->last()) - ++ip; - i += ip - ip0; - } + if (!ip->Matches(c)) + break; + AddToQueue(newq, ip->out(), flag); + if (ip->hint() != 0) { + // We have a hint, but we must cancel out the + // increment that will occur after the break. + i += ip->hint() - 1; + } else { + // We have no hint, so we must find the end + // of the current list and then skip to it. + Prog::Inst* ip0 = ip; + while (!ip->last()) + ++ip; + i += ip - ip0; + } break; case kInstMatch: - if (prog_->anchor_end() && c != kByteEndText && - kind_ != Prog::kManyMatch) + if (prog_->anchor_end() && c != kByteEndText && + kind_ != Prog::kManyMatch) break; *ismatch = true; - if (kind_ == Prog::kFirstMatch) { + if (kind_ == Prog::kFirstMatch) { // Can stop processing work queue since we found a match. return; } @@ -985,8 +985,8 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, } if (ExtraDebug) - fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", - DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch); + fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", + DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch); } // Processes input byte c in state, returning new state. 
@@ -1068,9 +1068,9 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { swap(q0_, q1_); } bool ismatch = false; - RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch); - using std::swap; - swap(q0_, q1_); + RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch); + using std::swap; + swap(q0_, q1_); // Save afterflag along with ismatch and isword in new state. uint32_t flag = afterflag; @@ -1079,10 +1079,10 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { if (isword) flag |= kFlagLastWord; - if (ismatch && kind_ == Prog::kManyMatch) - ns = WorkqToCachedState(q0_, q1_, flag); - else - ns = WorkqToCachedState(q0_, NULL, flag); + if (ismatch && kind_ == Prog::kManyMatch) + ns = WorkqToCachedState(q0_, q1_, flag); + else + ns = WorkqToCachedState(q0_, NULL, flag); // Flush ns before linking to it. // Write barrier before updating state->next_ so that the @@ -1113,7 +1113,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { class DFA::RWLocker { public: - explicit RWLocker(CacheMutex* mu); + explicit RWLocker(CacheMutex* mu); ~RWLocker(); // If the lock is only held for reading right now, @@ -1123,19 +1123,19 @@ class DFA::RWLocker { void LockForWriting(); private: - CacheMutex* mu_; + CacheMutex* mu_; bool writing_; RWLocker(const RWLocker&) = delete; RWLocker& operator=(const RWLocker&) = delete; }; -DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) { +DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) { mu_->ReaderLock(); } -// This function is marked as NO_THREAD_SAFETY_ANALYSIS because -// the annotations don't support lock upgrade. +// This function is marked as NO_THREAD_SAFETY_ANALYSIS because +// the annotations don't support lock upgrade. void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { if (!writing_) { mu_->ReaderUnlock(); @@ -1167,14 +1167,14 @@ void DFA::ResetCache(RWLocker* cache_lock) { // Re-acquire the cache_mutex_ for writing (exclusive use). 
cache_lock->LockForWriting(); - hooks::GetDFAStateCacheResetHook()({ - state_budget_, - state_cache_.size(), - }); - + hooks::GetDFAStateCacheResetHook()({ + state_budget_, + state_cache_.size(), + }); + // Clear the cache, reset the memory budget. - for (int i = 0; i < kMaxStart; i++) - start_[i].start.store(NULL, std::memory_order_relaxed); + for (int i = 0; i < kMaxStart; i++) + start_[i].start.store(NULL, std::memory_order_relaxed); ClearCache(); mem_budget_ = state_budget_; } @@ -1289,7 +1289,7 @@ DFA::State* DFA::StateSaver::Restore() { // situation, the DFA can do better than executing the simple loop. // Instead, it can call memchr to search very quickly for the byte c. // Whether the start state has this property is determined during a -// pre-compilation pass and the "can_prefix_accel" argument is set. +// pre-compilation pass and the "can_prefix_accel" argument is set. // // Fourth, the desired behavior is to search for the leftmost-best match // (approximately, the same one that Perl would find), which is not @@ -1321,16 +1321,16 @@ DFA::State* DFA::StateSaver::Restore() { // The bools are equal to the same-named variables in params, but // making them function arguments lets the inliner specialize // this function to each combination (see two paragraphs above). 
-template <bool can_prefix_accel, - bool want_earliest_match, - bool run_forward> -inline bool DFA::InlinedSearchLoop(SearchParams* params) { +template <bool can_prefix_accel, + bool want_earliest_match, + bool run_forward> +inline bool DFA::InlinedSearchLoop(SearchParams* params) { State* start = params->start; - const uint8_t* bp = BytePtr(params->text.data()); // start of text - const uint8_t* p = bp; // text scanning point - const uint8_t* ep = BytePtr(params->text.data() + - params->text.size()); // end of text - const uint8_t* resetp = NULL; // p at last cache reset + const uint8_t* bp = BytePtr(params->text.data()); // start of text + const uint8_t* p = bp; // text scanning point + const uint8_t* ep = BytePtr(params->text.data() + + params->text.size()); // end of text + const uint8_t* resetp = NULL; // p at last cache reset if (!run_forward) { using std::swap; swap(p, ep); @@ -1339,24 +1339,24 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { const uint8_t* bytemap = prog_->bytemap(); const uint8_t* lastmatch = NULL; // most recent matching position in text bool matched = false; - + State* s = start; - if (ExtraDebug) - fprintf(stderr, "@stx: %s\n", DumpState(s).c_str()); + if (ExtraDebug) + fprintf(stderr, "@stx: %s\n", DumpState(s).c_str()); if (s->IsMatch()) { matched = true; lastmatch = p; - if (ExtraDebug) - fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str()); - if (params->matches != NULL && kind_ == Prog::kManyMatch) { - for (int i = s->ninst_ - 1; i >= 0; i--) { - int id = s->inst_[i]; - if (id == MatchSep) - break; - params->matches->insert(id); - } - } + if (ExtraDebug) + fprintf(stderr, "match @stx! 
[%s]\n", DumpState(s).c_str()); + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); + } + } if (want_earliest_match) { params->ep = reinterpret_cast<const char*>(lastmatch); return true; @@ -1365,16 +1365,16 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { while (p != ep) { if (ExtraDebug) - fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); - - if (can_prefix_accel && s == start) { - // In start state, only way out is to find the prefix, - // so we use prefix accel (e.g. memchr) to skip ahead. - // If not found, we can skip to the end of the string. - p = BytePtr(prog_->PrefixAccel(p, ep - p)); - if (p == NULL) { - p = ep; - break; + fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); + + if (can_prefix_accel && s == start) { + // In start state, only way out is to find the prefix, + // so we use prefix accel (e.g. memchr) to skip ahead. + // If not found, we can skip to the end of the string. + p = BytePtr(prog_->PrefixAccel(p, ep - p)); + if (p == NULL) { + p = ep; + break; } } @@ -1413,11 +1413,11 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the // same at about 2 MB/s. Unless we're processing an average // of 10 bytes per state computation, fail so that RE2 can - // fall back to the NFA. However, RE2::Set cannot fall back, - // so we just have to keep on keeping on in that case. + // fall back to the NFA. However, RE2::Set cannot fall back, + // so we just have to keep on keeping on in that case. 
if (dfa_should_bail_when_slow && resetp != NULL && - static_cast<size_t>(p - resetp) < 10*state_cache_.size() && - kind_ != Prog::kManyMatch) { + static_cast<size_t>(p - resetp) < 10*state_cache_.size() && + kind_ != Prog::kManyMatch) { params->failed = true; return false; } @@ -1454,7 +1454,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { params->ep = reinterpret_cast<const char*>(ep); return true; } - + s = ns; if (s->IsMatch()) { matched = true; @@ -1465,15 +1465,15 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { else lastmatch = p + 1; if (ExtraDebug) - fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str()); - if (params->matches != NULL && kind_ == Prog::kManyMatch) { - for (int i = s->ninst_ - 1; i >= 0; i--) { - int id = s->inst_[i]; - if (id == MatchSep) - break; - params->matches->insert(id); - } - } + fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str()); + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); + } + } if (want_earliest_match) { params->ep = reinterpret_cast<const char*>(lastmatch); return true; @@ -1483,9 +1483,9 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { // Process one more byte to see if it triggers a match. // (Remember, matches are delayed one byte.) 
- if (ExtraDebug) - fprintf(stderr, "@etx: %s\n", DumpState(s).c_str()); - + if (ExtraDebug) + fprintf(stderr, "@etx: %s\n", DumpState(s).c_str()); + int lastbyte; if (run_forward) { if (EndPtr(params->text) == EndPtr(params->context)) @@ -1517,60 +1517,60 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { } } } - if (ns <= SpecialStateMax) { - if (ns == DeadState) { - params->ep = reinterpret_cast<const char*>(lastmatch); - return matched; - } - // FullMatchState + if (ns <= SpecialStateMax) { + if (ns == DeadState) { + params->ep = reinterpret_cast<const char*>(lastmatch); + return matched; + } + // FullMatchState params->ep = reinterpret_cast<const char*>(ep); return true; } - - s = ns; - if (s->IsMatch()) { + + s = ns; + if (s->IsMatch()) { matched = true; lastmatch = p; - if (ExtraDebug) - fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str()); - if (params->matches != NULL && kind_ == Prog::kManyMatch) { - for (int i = s->ninst_ - 1; i >= 0; i--) { - int id = s->inst_[i]; - if (id == MatchSep) - break; - params->matches->insert(id); + if (ExtraDebug) + fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str()); + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); } } } - + params->ep = reinterpret_cast<const char*>(lastmatch); return matched; } // Inline specializations of the general loop. 
bool DFA::SearchFFF(SearchParams* params) { - return InlinedSearchLoop<false, false, false>(params); + return InlinedSearchLoop<false, false, false>(params); } bool DFA::SearchFFT(SearchParams* params) { - return InlinedSearchLoop<false, false, true>(params); + return InlinedSearchLoop<false, false, true>(params); } bool DFA::SearchFTF(SearchParams* params) { - return InlinedSearchLoop<false, true, false>(params); + return InlinedSearchLoop<false, true, false>(params); } bool DFA::SearchFTT(SearchParams* params) { - return InlinedSearchLoop<false, true, true>(params); + return InlinedSearchLoop<false, true, true>(params); } bool DFA::SearchTFF(SearchParams* params) { - return InlinedSearchLoop<true, false, false>(params); + return InlinedSearchLoop<true, false, false>(params); } bool DFA::SearchTFT(SearchParams* params) { - return InlinedSearchLoop<true, false, true>(params); + return InlinedSearchLoop<true, false, true>(params); } bool DFA::SearchTTF(SearchParams* params) { - return InlinedSearchLoop<true, true, false>(params); + return InlinedSearchLoop<true, true, false>(params); } bool DFA::SearchTTT(SearchParams* params) { - return InlinedSearchLoop<true, true, true>(params); + return InlinedSearchLoop<true, true, true>(params); } // For performance, calls the appropriate specialized version @@ -1589,7 +1589,7 @@ bool DFA::FastSearchLoop(SearchParams* params) { &DFA::SearchTTT, }; - int index = 4 * params->can_prefix_accel + + int index = 4 * params->can_prefix_accel + 2 * params->want_earliest_match + 1 * params->run_forward; return (this->*Searches[index])(params); @@ -1665,7 +1665,7 @@ bool DFA::AnalyzeSearch(SearchParams* params) { flags = 0; } } - if (params->anchored) + if (params->anchored) start |= kStartAnchored; StartInfo* info = &start_[start]; @@ -1681,22 +1681,22 @@ bool DFA::AnalyzeSearch(SearchParams* params) { } } - params->start = info->start.load(std::memory_order_acquire); - - // Even if we could prefix accel, we cannot do so when anchored 
and, - // less obviously, we cannot do so when we are going to need flags. - // This trick works only when there is a single byte that leads to a - // different state! - if (prog_->can_prefix_accel() && - !params->anchored && - params->start > SpecialStateMax && - params->start->flag_ >> kFlagNeedShift == 0) - params->can_prefix_accel = true; - + params->start = info->start.load(std::memory_order_acquire); + + // Even if we could prefix accel, we cannot do so when anchored and, + // less obviously, we cannot do so when we are going to need flags. + // This trick works only when there is a single byte that leads to a + // different state! + if (prog_->can_prefix_accel() && + !params->anchored && + params->start > SpecialStateMax && + params->start->flag_ >> kFlagNeedShift == 0) + params->can_prefix_accel = true; + if (ExtraDebug) - fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", + fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", params->anchored, params->run_forward, flags, - DumpState(params->start).c_str(), params->can_prefix_accel); + DumpState(params->start).c_str(), params->can_prefix_accel); return true; } @@ -1705,25 +1705,25 @@ bool DFA::AnalyzeSearch(SearchParams* params) { bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint32_t flags) { // Quick check. - State* start = info->start.load(std::memory_order_acquire); - if (start != NULL) + State* start = info->start.load(std::memory_order_acquire); + if (start != NULL) return true; MutexLock l(&mutex_); - start = info->start.load(std::memory_order_relaxed); - if (start != NULL) + start = info->start.load(std::memory_order_relaxed); + if (start != NULL) return true; q0_->clear(); AddToQueue(q0_, params->anchored ? 
prog_->start() : prog_->start_unanchored(), flags); - start = WorkqToCachedState(q0_, NULL, flags); - if (start == NULL) + start = WorkqToCachedState(q0_, NULL, flags); + if (start == NULL) return false; // Synchronize with "quick check" above. - info->start.store(start, std::memory_order_release); + info->start.store(start, std::memory_order_release); return true; } @@ -1735,7 +1735,7 @@ bool DFA::Search(const StringPiece& text, bool run_forward, bool* failed, const char** epp, - SparseSet* matches) { + SparseSet* matches) { *epp = NULL; if (!ok()) { *failed = true; @@ -1746,7 +1746,7 @@ bool DFA::Search(const StringPiece& text, if (ExtraDebug) { fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", - std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_); + std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_); } RWLocker l(&cache_mutex_); @@ -1764,9 +1764,9 @@ bool DFA::Search(const StringPiece& text, return false; if (params.start == FullMatchState) { if (run_forward == want_earliest_match) - *epp = text.data(); + *epp = text.data(); else - *epp = text.data() + text.size(); + *epp = text.data() + text.size(); return true; } if (ExtraDebug) @@ -1825,17 +1825,17 @@ void Prog::DeleteDFA(DFA* dfa) { // bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, Anchor anchor, MatchKind kind, StringPiece* match0, - bool* failed, SparseSet* matches) { + bool* failed, SparseSet* matches) { *failed = false; StringPiece context = const_context; - if (context.data() == NULL) + if (context.data() == NULL) context = text; - bool caret = anchor_start(); + bool caret = anchor_start(); bool dollar = anchor_end(); if (reversed_) { - using std::swap; - swap(caret, dollar); + using std::swap; + swap(caret, dollar); } if (caret && BeginPtr(context) != BeginPtr(text)) return false; @@ -1847,7 +1847,7 @@ bool Prog::SearchDFA(const 
StringPiece& text, const StringPiece& const_context, bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; bool endmatch = false; if (kind == kManyMatch) { - // This is split out in order to avoid clobbering kind. + // This is split out in order to avoid clobbering kind. } else if (kind == kFullMatch || anchor_end()) { endmatch = true; kind = kLongestMatch; @@ -1855,32 +1855,32 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, // If the caller doesn't care where the match is (just whether one exists), // then we can stop at the very first match we find, the so-called - // "earliest match". - bool want_earliest_match = false; - if (kind == kManyMatch) { - // This is split out in order to avoid clobbering kind. - if (matches == NULL) { - want_earliest_match = true; - } - } else if (match0 == NULL && !endmatch) { - want_earliest_match = true; + // "earliest match". + bool want_earliest_match = false; + if (kind == kManyMatch) { + // This is split out in order to avoid clobbering kind. + if (matches == NULL) { + want_earliest_match = true; + } + } else if (match0 == NULL && !endmatch) { + want_earliest_match = true; kind = kLongestMatch; } DFA* dfa = GetDFA(kind); const char* ep; bool matched = dfa->Search(text, context, anchored, - want_earliest_match, !reversed_, + want_earliest_match, !reversed_, failed, &ep, matches); - if (*failed) { - hooks::GetDFASearchFailureHook()({ - // Nothing yet... - }); + if (*failed) { + hooks::GetDFASearchFailureHook()({ + // Nothing yet... + }); return false; - } + } if (!matched) return false; - if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size())) + if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size())) return false; // If caller cares, record the boundary of the match. @@ -1888,17 +1888,17 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, // as the beginning. 
if (match0) { if (reversed_) - *match0 = - StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep)); + *match0 = + StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep)); else *match0 = - StringPiece(text.data(), static_cast<size_t>(ep - text.data())); + StringPiece(text.data(), static_cast<size_t>(ep - text.data())); } return true; } // Build out all states in DFA. Returns number of states. -int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { +int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { if (!ok()) return 0; @@ -1907,72 +1907,72 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { RWLocker l(&cache_mutex_); SearchParams params(StringPiece(), StringPiece(), &l); params.anchored = false; - if (!AnalyzeSearch(¶ms) || - params.start == NULL || - params.start == DeadState) + if (!AnalyzeSearch(¶ms) || + params.start == NULL || + params.start == DeadState) return 0; // Add start state to work queue. - // Note that any State* that we handle here must point into the cache, - // so we can simply depend on pointer-as-a-number hashing and equality. - std::unordered_map<State*, int> m; - std::deque<State*> q; - m.emplace(params.start, static_cast<int>(m.size())); + // Note that any State* that we handle here must point into the cache, + // so we can simply depend on pointer-as-a-number hashing and equality. + std::unordered_map<State*, int> m; + std::deque<State*> q; + m.emplace(params.start, static_cast<int>(m.size())); q.push_back(params.start); - // Compute the input bytes needed to cover all of the next pointers. - int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot - std::vector<int> input(nnext); - for (int c = 0; c < 256; c++) { - int b = prog_->bytemap()[c]; - while (c < 256-1 && prog_->bytemap()[c+1] == b) - c++; - input[b] = c; - } - input[prog_->bytemap_range()] = kByteEndText; - - // Scratch space for the output. 
- std::vector<int> output(nnext); - + // Compute the input bytes needed to cover all of the next pointers. + int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot + std::vector<int> input(nnext); + for (int c = 0; c < 256; c++) { + int b = prog_->bytemap()[c]; + while (c < 256-1 && prog_->bytemap()[c+1] == b) + c++; + input[b] = c; + } + input[prog_->bytemap_range()] = kByteEndText; + + // Scratch space for the output. + std::vector<int> output(nnext); + // Flood to expand every state. - bool oom = false; - while (!q.empty()) { - State* s = q.front(); - q.pop_front(); - for (int c : input) { + bool oom = false; + while (!q.empty()) { + State* s = q.front(); + q.pop_front(); + for (int c : input) { State* ns = RunStateOnByteUnlocked(s, c); - if (ns == NULL) { - oom = true; - break; - } - if (ns == DeadState) { - output[ByteMap(c)] = -1; - continue; - } - if (m.find(ns) == m.end()) { - m.emplace(ns, static_cast<int>(m.size())); + if (ns == NULL) { + oom = true; + break; + } + if (ns == DeadState) { + output[ByteMap(c)] = -1; + continue; + } + if (m.find(ns) == m.end()) { + m.emplace(ns, static_cast<int>(m.size())); q.push_back(ns); } - output[ByteMap(c)] = m[ns]; + output[ByteMap(c)] = m[ns]; } - if (cb) - cb(oom ? NULL : output.data(), - s == FullMatchState || s->IsMatch()); - if (oom) - break; + if (cb) + cb(oom ? NULL : output.data(), + s == FullMatchState || s->IsMatch()); + if (oom) + break; } - return static_cast<int>(m.size()); + return static_cast<int>(m.size()); } // Build out all states in DFA for kind. Returns number of states. -int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) { - return GetDFA(kind)->BuildAllStates(cb); +int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) { + return GetDFA(kind)->BuildAllStates(cb); } // Computes min and max for matching string. // Won't return strings bigger than maxlen. 
-bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { +bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { if (!ok()) return false; @@ -1989,7 +1989,7 @@ bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { // Also note that previously_visited_states[UnseenStatePtr] will, in the STL // tradition, implicitly insert a '0' value at first use. We take advantage // of that property below. - std::unordered_map<State*, int> previously_visited_states; + std::unordered_map<State*, int> previously_visited_states; // Pick out start state for anchored search at beginning of text. RWLocker l(&cache_mutex_); @@ -2094,7 +2094,7 @@ bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { } // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b - PrefixSuccessor(max); + PrefixSuccessor(max); // If there are no bytes left, we have no way to say "there is no maximum // string". We could make the interface more complicated and be able to @@ -2109,7 +2109,7 @@ bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { } // PossibleMatchRange for a Prog. -bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { +bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { // Have to use dfa_longest_ to get all strings for full matches. // For example, (a|aa) never matches aa in first-match mode. 
return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen); diff --git a/contrib/libs/re2/re2/filtered_re2.cc b/contrib/libs/re2/re2/filtered_re2.cc index 9f64cbf645..5df97456e2 100644 --- a/contrib/libs/re2/re2/filtered_re2.cc +++ b/contrib/libs/re2/re2/filtered_re2.cc @@ -6,7 +6,7 @@ #include <stddef.h> #include <string> -#include <utility> +#include <utility> #include "util/util.h" #include "util/logging.h" @@ -30,22 +30,22 @@ FilteredRE2::~FilteredRE2() { delete re2_vec_[i]; } -FilteredRE2::FilteredRE2(FilteredRE2&& other) - : re2_vec_(std::move(other.re2_vec_)), - compiled_(other.compiled_), - prefilter_tree_(std::move(other.prefilter_tree_)) { - other.re2_vec_.clear(); - other.re2_vec_.shrink_to_fit(); - other.compiled_ = false; - other.prefilter_tree_.reset(new PrefilterTree()); -} - -FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { - this->~FilteredRE2(); - (void) new (this) FilteredRE2(std::move(other)); - return *this; -} - +FilteredRE2::FilteredRE2(FilteredRE2&& other) + : re2_vec_(std::move(other.re2_vec_)), + compiled_(other.compiled_), + prefilter_tree_(std::move(other.prefilter_tree_)) { + other.re2_vec_.clear(); + other.re2_vec_.shrink_to_fit(); + other.compiled_ = false; + other.prefilter_tree_.reset(new PrefilterTree()); +} + +FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { + this->~FilteredRE2(); + (void) new (this) FilteredRE2(std::move(other)); + return *this; +} + RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, const RE2::Options& options, int* id) { RE2* re = new RE2(pattern, options); @@ -54,7 +54,7 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, if (!re->ok()) { if (options.log_errors()) { LOG(ERROR) << "Couldn't compile regular expression, skipping: " - << pattern << " due to error " << re->error(); + << pattern << " due to error " << re->error(); } delete re; } else { @@ -65,7 +65,7 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, return code; } -void 
FilteredRE2::Compile(std::vector<std::string>* atoms) { +void FilteredRE2::Compile(std::vector<std::string>* atoms) { if (compiled_) { LOG(ERROR) << "Compile called already."; return; @@ -134,4 +134,4 @@ void FilteredRE2::PrintPrefilter(int regexpid) { prefilter_tree_->PrintPrefilter(regexpid); } -} // namespace re2 +} // namespace re2 diff --git a/contrib/libs/re2/re2/filtered_re2.h b/contrib/libs/re2/re2/filtered_re2.h index 4f2e0e2900..dd618c70e8 100644 --- a/contrib/libs/re2/re2/filtered_re2.h +++ b/contrib/libs/re2/re2/filtered_re2.h @@ -10,18 +10,18 @@ // number of regexps that need to be actually searched. // // By design, it does not include a string matching engine. This is to -// allow the user of the class to use their favorite string matching +// allow the user of the class to use their favorite string matching // engine. The overall flow is: Add all the regexps using Add, then -// Compile the FilteredRE2. Compile returns strings that need to be -// matched. Note that the returned strings are lowercased and distinct. -// For applying regexps to a search text, the caller does the string -// matching using the returned strings. When doing the string match, -// note that the caller has to do that in a case-insensitive way or -// on a lowercased version of the search text. Then call FirstMatch -// or AllMatches with a vector of indices of strings that were found -// in the text to get the actual regexp matches. - -#include <memory> +// Compile the FilteredRE2. Compile returns strings that need to be +// matched. Note that the returned strings are lowercased and distinct. +// For applying regexps to a search text, the caller does the string +// matching using the returned strings. When doing the string match, +// note that the caller has to do that in a case-insensitive way or +// on a lowercased version of the search text. 
Then call FirstMatch +// or AllMatches with a vector of indices of strings that were found +// in the text to get the actual regexp matches. + +#include <memory> #include <string> #include <vector> @@ -37,27 +37,27 @@ class FilteredRE2 { explicit FilteredRE2(int min_atom_len); ~FilteredRE2(); - // Not copyable. - FilteredRE2(const FilteredRE2&) = delete; - FilteredRE2& operator=(const FilteredRE2&) = delete; - // Movable. - FilteredRE2(FilteredRE2&& other); - FilteredRE2& operator=(FilteredRE2&& other); - + // Not copyable. + FilteredRE2(const FilteredRE2&) = delete; + FilteredRE2& operator=(const FilteredRE2&) = delete; + // Movable. + FilteredRE2(FilteredRE2&& other); + FilteredRE2& operator=(FilteredRE2&& other); + // Uses RE2 constructor to create a RE2 object (re). Returns // re->error_code(). If error_code is other than NoError, then re is // deleted and not added to re2_vec_. RE2::ErrorCode Add(const StringPiece& pattern, const RE2::Options& options, - int* id); + int* id); // Prepares the regexps added by Add for filtering. Returns a set // of strings that the caller should check for in candidate texts. - // The returned strings are lowercased and distinct. When doing - // string matching, it should be performed in a case-insensitive - // way or the search text should be lowercased first. Call after + // The returned strings are lowercased and distinct. When doing + // string matching, it should be performed in a case-insensitive + // way or the search text should be lowercased first. Call after // all Add calls are done. - void Compile(std::vector<std::string>* strings_to_match); + void Compile(std::vector<std::string>* strings_to_match); // Returns the index of the first matching regexp. // Returns -1 on no match. Can be called prior to Compile. @@ -88,9 +88,9 @@ class FilteredRE2 { // The number of regexps added. int NumRegexps() const { return static_cast<int>(re2_vec_.size()); } - // Get the individual RE2 objects. 
- const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } - + // Get the individual RE2 objects. + const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } + private: // Print prefilter. void PrintPrefilter(int regexpid); @@ -106,7 +106,7 @@ class FilteredRE2 { bool compiled_; // An AND-OR tree of string atoms used for filtering regexps. - std::unique_ptr<PrefilterTree> prefilter_tree_; + std::unique_ptr<PrefilterTree> prefilter_tree_; }; } // namespace re2 diff --git a/contrib/libs/re2/re2/mimics_pcre.cc b/contrib/libs/re2/re2/mimics_pcre.cc index a75b943a10..b1d6a51228 100644 --- a/contrib/libs/re2/re2/mimics_pcre.cc +++ b/contrib/libs/re2/re2/mimics_pcre.cc @@ -39,20 +39,20 @@ class PCREWalker : public Regexp::Walker<bool> { public: PCREWalker() {} - virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args); - - virtual bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - LOG(DFATAL) << "PCREWalker::ShortVisit called"; -#endif + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + LOG(DFATAL) << "PCREWalker::ShortVisit called"; +#endif return a; } - - private: - PCREWalker(const PCREWalker&) = delete; - PCREWalker& operator=(const PCREWalker&) = delete; + + private: + PCREWalker(const PCREWalker&) = delete; + PCREWalker& operator=(const PCREWalker&) = delete; }; // Called after visiting each of re's children and accumulating @@ -121,16 +121,16 @@ bool Regexp::MimicsPCRE() { class EmptyStringWalker : public Regexp::Walker<bool> { public: - EmptyStringWalker() {} - - virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args); - - virtual bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + EmptyStringWalker() {} + + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; -#endif +#endif return a; } diff --git a/contrib/libs/re2/re2/nfa.cc b/contrib/libs/re2/re2/nfa.cc index f25bbcfac1..c7339f8ffd 100644 --- a/contrib/libs/re2/re2/nfa.cc +++ b/contrib/libs/re2/re2/nfa.cc @@ -27,18 +27,18 @@ #include <stdio.h> #include <string.h> #include <algorithm> -#include <deque> +#include <deque> #include <string> #include <utility> #include <vector> -#include "util/logging.h" -#include "util/strutil.h" -#include "re2/pod_array.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" -#include "re2/sparse_array.h" -#include "re2/sparse_set.h" +#include "re2/sparse_array.h" +#include "re2/sparse_set.h" namespace re2 { @@ -90,41 +90,41 @@ class NFA { // Follows all empty arrows from id0 and enqueues all the states reached. 
// Enqueues only the ByteRange instructions that match byte c. - // context is used (with p) for evaluating empty-width specials. + // context is used (with p) for evaluating empty-width specials. // p is the current input position, and t0 is the current thread. - void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, + void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, const char* p, Thread* t0); // Run runq on byte c, appending new states to nextq. // Updates matched_ and match_ as new, better matches are found. - // context is used (with p) for evaluating empty-width specials. - // p is the position of byte c in the input string for AddToThreadq; - // p-1 will be used when processing Match instructions. + // context is used (with p) for evaluating empty-width specials. + // p is the position of byte c in the input string for AddToThreadq; + // p-1 will be used when processing Match instructions. // Frees all the threads on runq. // If there is a shortcut to the end, returns that shortcut. - int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, - const char* p); + int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, + const char* p); // Returns text version of capture information, for debugging. - std::string FormatCapture(const char** capture); - - void CopyCapture(const char** dst, const char** src) { - memmove(dst, src, ncapture_*sizeof src[0]); - } - - Prog* prog_; // underlying program - int start_; // start instruction in program - int ncapture_; // number of submatches to track - bool longest_; // whether searching for longest match - bool endmatch_; // whether match must end at text.end() - const char* btext_; // beginning of text (for FormatSubmatch) - const char* etext_; // end of text (for endmatch_) - Threadq q0_, q1_; // pre-allocated for Search. 
- PODArray<AddState> stack_; // pre-allocated for AddToThreadq - std::deque<Thread> arena_; // thread arena - Thread* freelist_; // thread freelist - const char** match_; // best match so far - bool matched_; // any match so far? + std::string FormatCapture(const char** capture); + + void CopyCapture(const char** dst, const char** src) { + memmove(dst, src, ncapture_*sizeof src[0]); + } + + Prog* prog_; // underlying program + int start_; // start instruction in program + int ncapture_; // number of submatches to track + bool longest_; // whether searching for longest match + bool endmatch_; // whether match must end at text.end() + const char* btext_; // beginning of text (for FormatSubmatch) + const char* etext_; // end of text (for endmatch_) + Threadq q0_, q1_; // pre-allocated for Search. + PODArray<AddState> stack_; // pre-allocated for AddToThreadq + std::deque<Thread> arena_; // thread arena + Thread* freelist_; // thread freelist + const char** match_; // best match so far + bool matched_; // any match so far? NFA(const NFA&) = delete; NFA& operator=(const NFA&) = delete; @@ -141,34 +141,34 @@ NFA::NFA(Prog* prog) { q0_.resize(prog_->size()); q1_.resize(prog_->size()); // See NFA::AddToThreadq() for why this is so. 
- int nstack = 2*prog_->inst_count(kInstCapture) + - prog_->inst_count(kInstEmptyWidth) + - prog_->inst_count(kInstNop) + 1; // + 1 for start inst - stack_ = PODArray<AddState>(nstack); - freelist_ = NULL; + int nstack = 2*prog_->inst_count(kInstCapture) + + prog_->inst_count(kInstEmptyWidth) + + prog_->inst_count(kInstNop) + 1; // + 1 for start inst + stack_ = PODArray<AddState>(nstack); + freelist_ = NULL; match_ = NULL; matched_ = false; } NFA::~NFA() { delete[] match_; - for (const Thread& t : arena_) - delete[] t.capture; + for (const Thread& t : arena_) + delete[] t.capture; } NFA::Thread* NFA::AllocThread() { - Thread* t = freelist_; - if (t != NULL) { - freelist_ = t->next; + Thread* t = freelist_; + if (t != NULL) { + freelist_ = t->next; t->ref = 1; - // We don't need to touch t->capture because - // the caller will immediately overwrite it. + // We don't need to touch t->capture because + // the caller will immediately overwrite it. return t; } - arena_.emplace_back(); - t = &arena_.back(); + arena_.emplace_back(); + t = &arena_.back(); t->ref = 1; - t->capture = new const char*[ncapture_]; + t->capture = new const char*[ncapture_]; return t; } @@ -179,37 +179,37 @@ NFA::Thread* NFA::Incref(Thread* t) { } void NFA::Decref(Thread* t) { - DCHECK(t != NULL); + DCHECK(t != NULL); t->ref--; if (t->ref > 0) return; DCHECK_EQ(t->ref, 0); - t->next = freelist_; - freelist_ = t; + t->next = freelist_; + freelist_ = t; } // Follows all empty arrows from id0 and enqueues all the states reached. // Enqueues only the ByteRange instructions that match byte c. -// context is used (with p) for evaluating empty-width specials. +// context is used (with p) for evaluating empty-width specials. // p is the current input position, and t0 is the current thread. 
-void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, +void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, const char* p, Thread* t0) { if (id0 == 0) return; - // Use stack_ to hold our stack of instructions yet to process. + // Use stack_ to hold our stack of instructions yet to process. // It was preallocated as follows: // two entries per Capture; // one entry per EmptyWidth; and // one entry per Nop. // This reflects the maximum number of stack pushes that each can // perform. (Each instruction can be processed at most once.) - AddState* stk = stack_.data(); + AddState* stk = stack_.data(); int nstk = 0; - stk[nstk++] = {id0, NULL}; + stk[nstk++] = {id0, NULL}; while (nstk > 0) { - DCHECK_LE(nstk, stack_.size()); + DCHECK_LE(nstk, stack_.size()); AddState a = stk[--nstk]; Loop: @@ -233,7 +233,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, // or we might not. Even if not, it is necessary to have it, // so that we don't revisit id0 during the recursion. q->set_new(id, NULL); - Thread** tp = &q->get_existing(id); + Thread** tp = &q->get_existing(id); int j; Thread* t; Prog::Inst* ip = prog_->inst(id); @@ -251,25 +251,25 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, *tp = t; DCHECK(!ip->last()); - a = {id+1, NULL}; + a = {id+1, NULL}; goto Loop; case kInstNop: if (!ip->last()) - stk[nstk++] = {id+1, NULL}; + stk[nstk++] = {id+1, NULL}; // Continue on. - a = {ip->out(), NULL}; + a = {ip->out(), NULL}; goto Loop; case kInstCapture: if (!ip->last()) - stk[nstk++] = {id+1, NULL}; + stk[nstk++] = {id+1, NULL}; if ((j=ip->cap()) < ncapture_) { // Push a dummy whose only job is to restore t0 // once we finish exploring this possibility. - stk[nstk++] = {0, t0}; + stk[nstk++] = {0, t0}; // Record capture. 
t = AllocThread(); @@ -277,7 +277,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, t->capture[j] = p; t0 = t; } - a = {ip->out(), NULL}; + a = {ip->out(), NULL}; goto Loop; case kInstByteRange: @@ -290,32 +290,32 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, if (ExtraDebug) fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str()); - if (ip->hint() == 0) - break; - a = {id+ip->hint(), NULL}; - goto Loop; - - case kInstMatch: - // Save state; will pick up at next byte. - t = Incref(t0); - *tp = t; - if (ExtraDebug) - fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str()); - + if (ip->hint() == 0) + break; + a = {id+ip->hint(), NULL}; + goto Loop; + + case kInstMatch: + // Save state; will pick up at next byte. + t = Incref(t0); + *tp = t; + if (ExtraDebug) + fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str()); + Next: if (ip->last()) break; - a = {id+1, NULL}; + a = {id+1, NULL}; goto Loop; case kInstEmptyWidth: if (!ip->last()) - stk[nstk++] = {id+1, NULL}; + stk[nstk++] = {id+1, NULL}; // Continue on if we have all the right flag bits. - if (ip->empty() & ~Prog::EmptyFlags(context, p)) + if (ip->empty() & ~Prog::EmptyFlags(context, p)) break; - a = {ip->out(), NULL}; + a = {ip->out(), NULL}; goto Loop; } } @@ -323,17 +323,17 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, // Run runq on byte c, appending new states to nextq. // Updates matched_ and match_ as new, better matches are found. -// context is used (with p) for evaluating empty-width specials. -// p is the position of byte c in the input string for AddToThreadq; -// p-1 will be used when processing Match instructions. +// context is used (with p) for evaluating empty-width specials. +// p is the position of byte c in the input string for AddToThreadq; +// p-1 will be used when processing Match instructions. // Frees all the threads on runq. 
// If there is a shortcut to the end, returns that shortcut. -int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, - const char* p) { +int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, + const char* p) { nextq->clear(); for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { - Thread* t = i->value(); + Thread* t = i->value(); if (t == NULL) continue; @@ -355,7 +355,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, break; case kInstByteRange: - AddToThreadq(nextq, ip->out(), c, context, p, t); + AddToThreadq(nextq, ip->out(), c, context, p, t); break; case kInstAltMatch: @@ -367,10 +367,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, matched_ = true; Decref(t); - for (++i; i != runq->end(); ++i) { - if (i->value() != NULL) - Decref(i->value()); - } + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); if (ip->greedy(prog_)) return ip->out1(); @@ -378,50 +378,50 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, } break; - case kInstMatch: { - // Avoid invoking undefined behavior (arithmetic on a null pointer) - // by storing p instead of p-1. (What would the latter even mean?!) - // This complements the special case in NFA::Search(). - if (p == NULL) { - CopyCapture(match_, t->capture); - match_[1] = p; - matched_ = true; + case kInstMatch: { + // Avoid invoking undefined behavior (arithmetic on a null pointer) + // by storing p instead of p-1. (What would the latter even mean?!) + // This complements the special case in NFA::Search(). 
+ if (p == NULL) { + CopyCapture(match_, t->capture); + match_[1] = p; + matched_ = true; + break; + } + + if (endmatch_ && p-1 != etext_) break; - } - if (endmatch_ && p-1 != etext_) - break; - if (longest_) { // Leftmost-longest mode: save this match only if // it is either farther to the left or at the same // point but longer than an existing match. if (!matched_ || t->capture[0] < match_[0] || - (t->capture[0] == match_[0] && p-1 > match_[1])) { + (t->capture[0] == match_[0] && p-1 > match_[1])) { CopyCapture(match_, t->capture); - match_[1] = p-1; + match_[1] = p-1; matched_ = true; } } else { // Leftmost-biased mode: this match is by definition // better than what we've already found (see next line). CopyCapture(match_, t->capture); - match_[1] = p-1; + match_[1] = p-1; matched_ = true; // Cut off the threads that can only find matches // worse than the one we just found: don't run the // rest of the current Threadq. Decref(t); - for (++i; i != runq->end(); ++i) { - if (i->value() != NULL) - Decref(i->value()); - } + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); return 0; } break; - } + } } Decref(t); } @@ -429,18 +429,18 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, return 0; } -std::string NFA::FormatCapture(const char** capture) { - std::string s; +std::string NFA::FormatCapture(const char** capture) { + std::string s; for (int i = 0; i < ncapture_; i+=2) { if (capture[i] == NULL) - s += "(?,?)"; + s += "(?,?)"; else if (capture[i+1] == NULL) - s += StringPrintf("(%td,?)", - capture[i] - btext_); + s += StringPrintf("(%td,?)", + capture[i] - btext_); else - s += StringPrintf("(%td,%td)", - capture[i] - btext_, - capture[i+1] - btext_); + s += StringPrintf("(%td,%td)", + capture[i] - btext_, + capture[i+1] - btext_); } return s; } @@ -452,7 +452,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, return false; StringPiece context = 
const_context; - if (context.data() == NULL) + if (context.data() == NULL) context = text; // Sanity check: make sure that text lies within context. @@ -488,17 +488,17 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } match_ = new const char*[ncapture_]; - memset(match_, 0, ncapture_*sizeof match_[0]); + memset(match_, 0, ncapture_*sizeof match_[0]); matched_ = false; // For debugging prints. - btext_ = context.data(); - // For convenience. - etext_ = text.data() + text.size(); + btext_ = context.data(); + // For convenience. + etext_ = text.data() + text.size(); if (ExtraDebug) fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", - std::string(text).c_str(), std::string(context).c_str(), anchored, longest); + std::string(text).c_str(), std::string(context).c_str(), anchored, longest); // Set up search. Threadq* runq = &q0_; @@ -507,19 +507,19 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, nextq->clear(); // Loop over the text, stepping the machine. - for (const char* p = text.data();; p++) { + for (const char* p = text.data();; p++) { if (ExtraDebug) { int c = 0; - if (p == btext_) + if (p == btext_) c = '^'; - else if (p > etext_) + else if (p > etext_) c = '$'; - else if (p < etext_) + else if (p < etext_) c = p[0] & 0xFF; - fprintf(stderr, "%c:", c); + fprintf(stderr, "%c:", c); for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { - Thread* t = i->value(); + Thread* t = i->value(); if (t == NULL) continue; fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str()); @@ -528,14 +528,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } // This is a no-op the first time around the loop because runq is empty. - int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p); + int id = Step(runq, nextq, p < etext_ ? 
p[0] & 0xFF : -1, context, p); DCHECK_EQ(runq->size(), 0); using std::swap; swap(nextq, runq); nextq->clear(); if (id != 0) { // We're done: full match ahead. - p = etext_; + p = etext_; for (;;) { Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { @@ -563,29 +563,29 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, break; } - if (p > etext_) + if (p > etext_) break; // Start a new thread if there have not been any matches. // (No point in starting a new thread if there have been // matches, since it would be to the right of the match // we already found.) - if (!matched_ && (!anchored || p == text.data())) { - // Try to use prefix accel (e.g. memchr) to skip ahead. - // The search must be unanchored and there must be zero - // possible matches already. + if (!matched_ && (!anchored || p == text.data())) { + // Try to use prefix accel (e.g. memchr) to skip ahead. + // The search must be unanchored and there must be zero + // possible matches already. if (!anchored && runq->size() == 0 && - p < etext_ && prog_->can_prefix_accel()) { - p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p)); - if (p == NULL) - p = etext_; + p < etext_ && prog_->can_prefix_accel()) { + p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p)); + if (p == NULL) + p = etext_; } Thread* t = AllocThread(); CopyCapture(t->capture, match_); t->capture[0] = p; - AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p, - t); + AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p, + t); Decref(t); } @@ -596,23 +596,23 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, break; } - // Avoid invoking undefined behavior (arithmetic on a null pointer) - // by simply not continuing the loop. - // This complements the special case in NFA::Step(). 
- if (p == NULL) { - (void) Step(runq, nextq, -1, context, p); - DCHECK_EQ(runq->size(), 0); - using std::swap; - swap(nextq, runq); - nextq->clear(); - break; - } + // Avoid invoking undefined behavior (arithmetic on a null pointer) + // by simply not continuing the loop. + // This complements the special case in NFA::Step(). + if (p == NULL) { + (void) Step(runq, nextq, -1, context, p); + DCHECK_EQ(runq->size(), 0); + using std::swap; + swap(nextq, runq); + nextq->clear(); + break; + } } - for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { - if (i->value() != NULL) - Decref(i->value()); - } + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } if (matched_) { for (int i = 0; i < nsubmatch; i++) @@ -621,8 +621,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, static_cast<size_t>(match_[2 * i + 1] - match_[2 * i])); if (ExtraDebug) fprintf(stderr, "match (%td,%td)\n", - match_[0] - btext_, - match_[1] - btext_); + match_[0] - btext_, + match_[1] - btext_); return true; } return false; @@ -663,7 +663,7 @@ void Prog::Fanout(SparseArray<int>* fanout) { fanout->clear(); fanout->set_new(start(), 0); for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) { - int* count = &i->value(); + int* count = &i->value(); reachable.clear(); reachable.insert(i->index()); for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) { diff --git a/contrib/libs/re2/re2/onepass.cc b/contrib/libs/re2/re2/onepass.cc index 55bd6849e7..263974654d 100644 --- a/contrib/libs/re2/re2/onepass.cc +++ b/contrib/libs/re2/re2/onepass.cc @@ -61,9 +61,9 @@ #include "util/logging.h" #include "util/strutil.h" #include "util/utf.h" -#include "re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/prog.h" -#include "re2/sparse_set.h" +#include "re2/sparse_set.h" #include "re2/stringpiece.h" // Silence "zero-sized array in struct/union" warning for 
OneState::action. @@ -235,7 +235,7 @@ bool Prog::SearchOnePass(const StringPiece& text, matchcap[i] = NULL; StringPiece context = const_context; - if (context.data() == NULL) + if (context.data() == NULL) context = text; if (anchor_start() && BeginPtr(context) != BeginPtr(text)) return false; @@ -244,13 +244,13 @@ bool Prog::SearchOnePass(const StringPiece& text, if (anchor_end()) kind = kFullMatch; - uint8_t* nodes = onepass_nodes_.data(); + uint8_t* nodes = onepass_nodes_.data(); int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); // start() is always mapped to the zeroth OneState. OneState* state = IndexToNode(nodes, statesize, 0); uint8_t* bytemap = bytemap_; - const char* bp = text.data(); - const char* ep = text.data() + text.size(); + const char* bp = text.data(); + const char* ep = text.data() + text.size(); const char* p; bool matched = false; matchcap[0] = bp; @@ -383,7 +383,7 @@ struct InstCond { // Constructs and saves corresponding one-pass NFA on success. bool Prog::IsOnePass() { if (did_onepass_) - return onepass_nodes_.data() != NULL; + return onepass_nodes_.data() != NULL; did_onepass_ = true; if (start() == 0) // no match @@ -404,11 +404,11 @@ bool Prog::IsOnePass() { int stacksize = inst_count(kInstCapture) + inst_count(kInstEmptyWidth) + inst_count(kInstNop) + 1; // + 1 for start inst - PODArray<InstCond> stack(stacksize); + PODArray<InstCond> stack(stacksize); int size = this->size(); - PODArray<int> nodebyid(size); // indexed by ip - memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]); + PODArray<int> nodebyid(size); // indexed by ip + memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]); // Originally, nodes was a uint8_t[maxnodes*statesize], but that was // unnecessarily optimistic: why allocate a large amount of memory @@ -550,7 +550,7 @@ bool Prog::IsOnePass() { if (!AddQ(&workq, ip->out())) { if (ExtraDebug) LOG(ERROR) << StringPrintf( - "Not OnePass: multiple paths %d -> %d", *it, ip->out()); + "Not OnePass: 
multiple paths %d -> %d", *it, ip->out()); goto fail; } id = ip->out(); @@ -561,7 +561,7 @@ bool Prog::IsOnePass() { // (3) is violated if (ExtraDebug) LOG(ERROR) << StringPrintf( - "Not OnePass: multiple matches from %d", *it); + "Not OnePass: multiple matches from %d", *it); goto fail; } matched = true; @@ -590,30 +590,30 @@ bool Prog::IsOnePass() { if (nodebyid[i] != -1) idmap[nodebyid[i]] = i; - std::string dump; + std::string dump; for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { int id = *it; int nodeindex = nodebyid[id]; if (nodeindex == -1) continue; OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); - dump += StringPrintf("node %d id=%d: matchcond=%#x\n", - nodeindex, id, node->matchcond); + dump += StringPrintf("node %d id=%d: matchcond=%#x\n", + nodeindex, id, node->matchcond); for (int i = 0; i < bytemap_range_; i++) { if ((node->action[i] & kImpossible) == kImpossible) continue; - dump += StringPrintf(" %d cond %#x -> %d id=%d\n", - i, node->action[i] & 0xFFFF, - node->action[i] >> kIndexShift, - idmap[node->action[i] >> kIndexShift]); + dump += StringPrintf(" %d cond %#x -> %d id=%d\n", + i, node->action[i] & 0xFFFF, + node->action[i] >> kIndexShift, + idmap[node->action[i] >> kIndexShift]); } } LOG(ERROR) << "nodes:\n" << dump; } dfa_mem_ -= nalloc*statesize; - onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize); - memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize); + onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize); + memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize); return true; fail: diff --git a/contrib/libs/re2/re2/parse.cc b/contrib/libs/re2/re2/parse.cc index be002ce281..85f16f060b 100644 --- a/contrib/libs/re2/re2/parse.cc +++ b/contrib/libs/re2/re2/parse.cc @@ -23,13 +23,13 @@ #include <algorithm> #include <map> #include <string> -#include <vector> +#include <vector> #include "util/util.h" #include "util/logging.h" #include "util/strutil.h" #include "util/utf.h" -#include 
"re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/regexp.h" #include "re2/stringpiece.h" #include "re2/unicode_casefold.h" @@ -44,13 +44,13 @@ namespace re2 { -// Controls the maximum repeat count permitted by the parser. -static int maximum_repeat_count = 1000; +// Controls the maximum repeat count permitted by the parser. +static int maximum_repeat_count = 1000; + +void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) { + maximum_repeat_count = i; +} -void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) { - maximum_repeat_count = i; -} - // Regular expression parse state. // The list of parsed regexps so far is maintained as a vector of // Regexp pointers called the stack. Left parenthesis and vertical @@ -93,7 +93,7 @@ class Regexp::ParseState { bool PushSimpleOp(RegexpOp op); // Pushes a ^ onto the stack. - bool PushCaret(); + bool PushCaret(); // Pushes a \b (word == true) or \B (word == false) onto the stack. bool PushWordBoundary(bool word); @@ -423,7 +423,7 @@ bool Regexp::ParseState::PushLiteral(Rune r) { } // Pushes a ^ onto the stack. -bool Regexp::ParseState::PushCaret() { +bool Regexp::ParseState::PushCaret() { if (flags_ & OneLine) { return PushSimpleOp(kRegexpBeginText); } @@ -556,10 +556,10 @@ int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, } int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "RepetitionWalker::ShortVisit called"; -#endif +#endif return 0; } @@ -568,9 +568,9 @@ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { bool Regexp::ParseState::PushRepetition(int min, int max, const StringPiece& s, bool nongreedy) { - if ((max != -1 && max < min) || - min > maximum_repeat_count || - max > maximum_repeat_count) { + if ((max != -1 && max < min) || + min > maximum_repeat_count || + max > maximum_repeat_count) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; @@ -593,7 +593,7 @@ bool Regexp::ParseState::PushRepetition(int min, int max, stacktop_ = re; if (min >= 2 || max >= 2) { RepetitionWalker w; - if (w.Walk(stacktop_, maximum_repeat_count) == 0) { + if (w.Walk(stacktop_, maximum_repeat_count) == 0) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; @@ -613,7 +613,7 @@ bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { Regexp* re = new Regexp(kLeftParen, flags_); re->cap_ = ++ncap_; if (name.data() != NULL) - re->name_ = new std::string(name); + re->name_ = new std::string(name); return PushRegexp(re); } @@ -687,7 +687,7 @@ bool Regexp::ParseState::DoRightParen() { if ((r1 = stacktop_) == NULL || (r2 = r1->down_) == NULL || r2->op() != kLeftParen) { - status_->set_code(kRegexpUnexpectedParen); + status_->set_code(kRegexpUnexpectedParen); status_->set_error_arg(whole_regexp_); return false; } @@ -804,7 +804,7 @@ void Regexp::RemoveLeadingString(Regexp* re, int n) { // limit on the size of a concatenation, so we should never // see more than two here. Regexp* stk[4]; - size_t d = 0; + size_t d = 0; while (re->op() == kRegexpConcat) { if (d < arraysize(stk)) stk[d++] = re; @@ -835,8 +835,8 @@ void Regexp::RemoveLeadingString(Regexp* re, int n) { } // If re is now empty, concatenations might simplify too. 
- while (d > 0) { - re = stk[--d]; + while (d > 0) { + re = stk[--d]; Regexp** sub = re->sub(); if (sub[0]->op() == kRegexpEmptyMatch) { sub[0]->Decref(); @@ -870,180 +870,180 @@ void Regexp::RemoveLeadingString(Regexp* re, int n) { } } -// In the context of factoring alternations, a Splice is: a factored prefix or -// merged character class computed by one iteration of one round of factoring; -// the span of subexpressions of the alternation to be "spliced" (i.e. removed -// and replaced); and, for a factored prefix, the number of suffixes after any -// factoring that might have subsequently been performed on them. For a merged -// character class, there are no suffixes, of course, so the field is ignored. -struct Splice { - Splice(Regexp* prefix, Regexp** sub, int nsub) - : prefix(prefix), - sub(sub), - nsub(nsub), - nsuffix(-1) {} - - Regexp* prefix; - Regexp** sub; - int nsub; - int nsuffix; -}; - -// Named so because it is used to implement an explicit stack, a Frame is: the -// span of subexpressions of the alternation to be factored; the current round -// of factoring; any Splices computed; and, for a factored prefix, an iterator -// to the next Splice to be factored (i.e. in another Frame) because suffixes. -struct Frame { - Frame(Regexp** sub, int nsub) - : sub(sub), - nsub(nsub), - round(0) {} - - Regexp** sub; - int nsub; - int round; - std::vector<Splice> splices; - int spliceidx; -}; - -// Bundled into a class for friend access to Regexp without needing to declare -// (or define) Splice in regexp.h. 
-class FactorAlternationImpl { - public: - static void Round1(Regexp** sub, int nsub, - Regexp::ParseFlags flags, - std::vector<Splice>* splices); - static void Round2(Regexp** sub, int nsub, - Regexp::ParseFlags flags, - std::vector<Splice>* splices); - static void Round3(Regexp** sub, int nsub, - Regexp::ParseFlags flags, - std::vector<Splice>* splices); -}; - +// In the context of factoring alternations, a Splice is: a factored prefix or +// merged character class computed by one iteration of one round of factoring; +// the span of subexpressions of the alternation to be "spliced" (i.e. removed +// and replaced); and, for a factored prefix, the number of suffixes after any +// factoring that might have subsequently been performed on them. For a merged +// character class, there are no suffixes, of course, so the field is ignored. +struct Splice { + Splice(Regexp* prefix, Regexp** sub, int nsub) + : prefix(prefix), + sub(sub), + nsub(nsub), + nsuffix(-1) {} + + Regexp* prefix; + Regexp** sub; + int nsub; + int nsuffix; +}; + +// Named so because it is used to implement an explicit stack, a Frame is: the +// span of subexpressions of the alternation to be factored; the current round +// of factoring; any Splices computed; and, for a factored prefix, an iterator +// to the next Splice to be factored (i.e. in another Frame) because suffixes. +struct Frame { + Frame(Regexp** sub, int nsub) + : sub(sub), + nsub(nsub), + round(0) {} + + Regexp** sub; + int nsub; + int round; + std::vector<Splice> splices; + int spliceidx; +}; + +// Bundled into a class for friend access to Regexp without needing to declare +// (or define) Splice in regexp.h. 
+class FactorAlternationImpl { + public: + static void Round1(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices); + static void Round2(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices); + static void Round3(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices); +}; + // Factors common prefixes from alternation. // For example, // ABC|ABD|AEF|BCX|BCY // simplifies to // A(B(C|D)|EF)|BC(X|Y) -// and thence to +// and thence to // A(B[CD]|EF)|BC[XY] // // Rewrites sub to contain simplified list to alternate and returns // the new length of sub. Adjusts reference counts accordingly // (incoming sub[i] decremented, outgoing sub[i] incremented). -int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { - std::vector<Frame> stk; - stk.emplace_back(sub, nsub); - - for (;;) { - auto& sub = stk.back().sub; - auto& nsub = stk.back().nsub; - auto& round = stk.back().round; - auto& splices = stk.back().splices; - auto& spliceidx = stk.back().spliceidx; - - if (splices.empty()) { - // Advance to the next round of factoring. Note that this covers - // the initialised state: when splices is empty and round is 0. - round++; - } else if (spliceidx < static_cast<int>(splices.size())) { - // We have at least one more Splice to factor. Recurse logically. - stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub); - continue; - } else { - // We have no more Splices to factor. Apply them. - auto iter = splices.begin(); - int out = 0; - for (int i = 0; i < nsub; ) { - // Copy until we reach where the next Splice begins. - while (sub + i < iter->sub) - sub[out++] = sub[i++]; - switch (round) { - case 1: - case 2: { - // Assemble the Splice prefix and the suffixes. 
- Regexp* re[2]; - re[0] = iter->prefix; - re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags); - sub[out++] = Regexp::Concat(re, 2, flags); - i += iter->nsub; - break; - } - case 3: - // Just use the Splice prefix. - sub[out++] = iter->prefix; - i += iter->nsub; - break; - default: - LOG(DFATAL) << "unknown round: " << round; - break; - } - // If we are done, copy until the end of sub. - if (++iter == splices.end()) { - while (i < nsub) - sub[out++] = sub[i++]; - } - } - splices.clear(); - nsub = out; - // Advance to the next round of factoring. - round++; - } - - switch (round) { - case 1: - FactorAlternationImpl::Round1(sub, nsub, flags, &splices); - break; - case 2: - FactorAlternationImpl::Round2(sub, nsub, flags, &splices); - break; - case 3: - FactorAlternationImpl::Round3(sub, nsub, flags, &splices); - break; - case 4: - if (stk.size() == 1) { - // We are at the top of the stack. Just return. - return nsub; - } else { - // Pop the stack and set the number of suffixes. - // (Note that references will be invalidated!) - int nsuffix = nsub; - stk.pop_back(); - stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix; - ++stk.back().spliceidx; - continue; - } - default: - LOG(DFATAL) << "unknown round: " << round; - break; - } - - // Set spliceidx depending on whether we have Splices to factor. - if (splices.empty() || round == 3) { - spliceidx = static_cast<int>(splices.size()); - } else { - spliceidx = 0; - } - } -} - -void FactorAlternationImpl::Round1(Regexp** sub, int nsub, - Regexp::ParseFlags flags, - std::vector<Splice>* splices) { +int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { + std::vector<Frame> stk; + stk.emplace_back(sub, nsub); + + for (;;) { + auto& sub = stk.back().sub; + auto& nsub = stk.back().nsub; + auto& round = stk.back().round; + auto& splices = stk.back().splices; + auto& spliceidx = stk.back().spliceidx; + + if (splices.empty()) { + // Advance to the next round of factoring. 
Note that this covers + // the initialised state: when splices is empty and round is 0. + round++; + } else if (spliceidx < static_cast<int>(splices.size())) { + // We have at least one more Splice to factor. Recurse logically. + stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub); + continue; + } else { + // We have no more Splices to factor. Apply them. + auto iter = splices.begin(); + int out = 0; + for (int i = 0; i < nsub; ) { + // Copy until we reach where the next Splice begins. + while (sub + i < iter->sub) + sub[out++] = sub[i++]; + switch (round) { + case 1: + case 2: { + // Assemble the Splice prefix and the suffixes. + Regexp* re[2]; + re[0] = iter->prefix; + re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags); + sub[out++] = Regexp::Concat(re, 2, flags); + i += iter->nsub; + break; + } + case 3: + // Just use the Splice prefix. + sub[out++] = iter->prefix; + i += iter->nsub; + break; + default: + LOG(DFATAL) << "unknown round: " << round; + break; + } + // If we are done, copy until the end of sub. + if (++iter == splices.end()) { + while (i < nsub) + sub[out++] = sub[i++]; + } + } + splices.clear(); + nsub = out; + // Advance to the next round of factoring. + round++; + } + + switch (round) { + case 1: + FactorAlternationImpl::Round1(sub, nsub, flags, &splices); + break; + case 2: + FactorAlternationImpl::Round2(sub, nsub, flags, &splices); + break; + case 3: + FactorAlternationImpl::Round3(sub, nsub, flags, &splices); + break; + case 4: + if (stk.size() == 1) { + // We are at the top of the stack. Just return. + return nsub; + } else { + // Pop the stack and set the number of suffixes. + // (Note that references will be invalidated!) + int nsuffix = nsub; + stk.pop_back(); + stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix; + ++stk.back().spliceidx; + continue; + } + default: + LOG(DFATAL) << "unknown round: " << round; + break; + } + + // Set spliceidx depending on whether we have Splices to factor. 
+ if (splices.empty() || round == 3) { + spliceidx = static_cast<int>(splices.size()); + } else { + spliceidx = 0; + } + } +} + +void FactorAlternationImpl::Round1(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices) { // Round 1: Factor out common literal prefixes. - int start = 0; - Rune* rune = NULL; + int start = 0; + Rune* rune = NULL; int nrune = 0; Regexp::ParseFlags runeflags = Regexp::NoParseFlags; - for (int i = 0; i <= nsub; i++) { - // Invariant: sub[start:i] consists of regexps that all - // begin with rune[0:nrune]. + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // begin with rune[0:nrune]. Rune* rune_i = NULL; int nrune_i = 0; Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; - if (i < nsub) { - rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i); + if (i < nsub) { + rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i); if (runeflags_i == runeflags) { int same = 0; while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) @@ -1057,32 +1057,32 @@ void FactorAlternationImpl::Round1(Regexp** sub, int nsub, } // Found end of a run with common leading literal string: - // sub[start:i] all begin with rune[0:nrune], - // but sub[i] does not even begin with rune[0]. + // sub[start:i] all begin with rune[0:nrune], + // but sub[i] does not even begin with rune[0]. if (i == start) { // Nothing to do - first iteration. } else if (i == start+1) { // Just one: don't bother factoring. } else { - Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags); + Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags); for (int j = start; j < i; j++) - Regexp::RemoveLeadingString(sub[j], nrune); - splices->emplace_back(prefix, sub + start, i - start); + Regexp::RemoveLeadingString(sub[j], nrune); + splices->emplace_back(prefix, sub + start, i - start); } - // Prepare for next iteration (if there is one). 
- if (i < nsub) { + // Prepare for next iteration (if there is one). + if (i < nsub) { start = i; rune = rune_i; nrune = nrune_i; runeflags = runeflags_i; } } -} +} -void FactorAlternationImpl::Round2(Regexp** sub, int nsub, - Regexp::ParseFlags flags, - std::vector<Splice>* splices) { +void FactorAlternationImpl::Round2(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices) { // Round 2: Factor out common simple prefixes, // just the first piece of each concatenation. // This will be good enough a lot of the time. @@ -1091,15 +1091,15 @@ void FactorAlternationImpl::Round2(Regexp** sub, int nsub, // are not safe to factor because that collapses their // distinct paths through the automaton, which affects // correctness in some cases. - int start = 0; + int start = 0; Regexp* first = NULL; - for (int i = 0; i <= nsub; i++) { - // Invariant: sub[start:i] consists of regexps that all - // begin with first. + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // begin with first. Regexp* first_i = NULL; - if (i < nsub) { - first_i = Regexp::LeadingRegexp(sub[i]); - if (first != NULL && + if (i < nsub) { + first_i = Regexp::LeadingRegexp(sub[i]); + if (first != NULL && // first must be an empty-width op // OR a char class, any char or any byte // OR a fixed repeat of a literal, char class, any char or any byte. @@ -1117,60 +1117,60 @@ void FactorAlternationImpl::Round2(Regexp** sub, int nsub, (first->sub()[0]->op() == kRegexpLiteral || first->sub()[0]->op() == kRegexpCharClass || first->sub()[0]->op() == kRegexpAnyChar || - first->sub()[0]->op() == kRegexpAnyByte))) && - Regexp::Equal(first, first_i)) + first->sub()[0]->op() == kRegexpAnyByte))) && + Regexp::Equal(first, first_i)) continue; } // Found end of a run with common leading regexp: - // sub[start:i] all begin with first, - // but sub[i] does not. + // sub[start:i] all begin with first, + // but sub[i] does not. 
if (i == start) { // Nothing to do - first iteration. } else if (i == start+1) { // Just one: don't bother factoring. } else { - Regexp* prefix = first->Incref(); + Regexp* prefix = first->Incref(); for (int j = start; j < i; j++) - sub[j] = Regexp::RemoveLeadingRegexp(sub[j]); - splices->emplace_back(prefix, sub + start, i - start); + sub[j] = Regexp::RemoveLeadingRegexp(sub[j]); + splices->emplace_back(prefix, sub + start, i - start); } - // Prepare for next iteration (if there is one). - if (i < nsub) { + // Prepare for next iteration (if there is one). + if (i < nsub) { start = i; first = first_i; } } -} - -void FactorAlternationImpl::Round3(Regexp** sub, int nsub, - Regexp::ParseFlags flags, - std::vector<Splice>* splices) { - // Round 3: Merge runs of literals and/or character classes. - int start = 0; - Regexp* first = NULL; - for (int i = 0; i <= nsub; i++) { - // Invariant: sub[start:i] consists of regexps that all - // are either literals (i.e. runes) or character classes. - Regexp* first_i = NULL; - if (i < nsub) { - first_i = sub[i]; - if (first != NULL && - (first->op() == kRegexpLiteral || - first->op() == kRegexpCharClass) && - (first_i->op() == kRegexpLiteral || - first_i->op() == kRegexpCharClass)) - continue; - } - - // Found end of a run of Literal/CharClass: - // sub[start:i] all are either one or the other, - // but sub[i] is not. +} + +void FactorAlternationImpl::Round3(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices) { + // Round 3: Merge runs of literals and/or character classes. + int start = 0; + Regexp* first = NULL; + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // are either literals (i.e. runes) or character classes. 
+ Regexp* first_i = NULL; + if (i < nsub) { + first_i = sub[i]; + if (first != NULL && + (first->op() == kRegexpLiteral || + first->op() == kRegexpCharClass) && + (first_i->op() == kRegexpLiteral || + first_i->op() == kRegexpCharClass)) + continue; + } + + // Found end of a run of Literal/CharClass: + // sub[start:i] all are either one or the other, + // but sub[i] is not. if (i == start) { - // Nothing to do - first iteration. + // Nothing to do - first iteration. } else if (i == start+1) { - // Just one: don't bother factoring. + // Just one: don't bother factoring. } else { CharClassBuilder ccb; for (int j = start; j < i; j++) { @@ -1187,14 +1187,14 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub, } re->Decref(); } - Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); - splices->emplace_back(re, sub + start, i - start); + Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); + splices->emplace_back(re, sub + start, i - start); } - // Prepare for next iteration (if there is one). - if (i < nsub) { - start = i; - first = first_i; + // Prepare for next iteration (if there is one). + if (i < nsub) { + start = i; + first = first_i; } } } @@ -1221,7 +1221,7 @@ void Regexp::ParseState::DoCollapse(RegexpOp op) { return; // Construct op (alternation or concatenation), flattening op of op. - PODArray<Regexp*> subs(n); + PODArray<Regexp*> subs(n); next = NULL; int i = n; for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { @@ -1236,7 +1236,7 @@ void Regexp::ParseState::DoCollapse(RegexpOp op) { } } - Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true); + Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true); re->simple_ = re->ComputeSimple(); re->down_ = next; stacktop_ = re; @@ -1323,17 +1323,17 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { // Lexing routines. -// Parses a decimal integer, storing it in *np. +// Parses a decimal integer, storing it in *np. 
// Sets *s to span the remainder of the string. static bool ParseInteger(StringPiece* s, int* np) { - if (s->empty() || !isdigit((*s)[0] & 0xFF)) + if (s->empty() || !isdigit((*s)[0] & 0xFF)) return false; // Disallow leading zeros. if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) return false; int n = 0; int c; - while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) { + while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) { // Avoid overflow. if (n >= 100000000) return false; @@ -1355,16 +1355,16 @@ static bool ParseInteger(StringPiece* s, int* np) { // s must NOT be edited unless MaybeParseRepetition returns true. static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { StringPiece s = *sp; - if (s.empty() || s[0] != '{') + if (s.empty() || s[0] != '{') return false; s.remove_prefix(1); // '{' if (!ParseInteger(&s, lo)) return false; - if (s.empty()) + if (s.empty()) return false; if (s[0] == ',') { s.remove_prefix(1); // ',' - if (s.empty()) + if (s.empty()) return false; if (s[0] == '}') { // {2,} means at least 2 @@ -1378,7 +1378,7 @@ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { // {2} means exactly two *hi = *lo; } - if (s.empty() || s[0] != '}') + if (s.empty() || s[0] != '}') return false; s.remove_prefix(1); // '}' *sp = s; @@ -1393,7 +1393,7 @@ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { // fullrune() takes int, not size_t. However, it just looks // at the leading byte and treats any length >= 4 the same. - if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) { + if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) { int n = chartorune(r, sp->data()); // Some copies of chartorune have a bug that accepts // encodings of values in (10FFFF, 1FFFFF] as valid. 
@@ -1421,7 +1421,7 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { StringPiece t = s; Rune r; - while (!t.empty()) { + while (!t.empty()) { if (StringPieceToRune(&r, &t, status) < 0) return false; } @@ -1452,14 +1452,14 @@ static int UnHex(int c) { // Sets *rp to the named character. static bool ParseEscape(StringPiece* s, Rune* rp, RegexpStatus* status, int rune_max) { - const char* begin = s->data(); - if (s->empty() || (*s)[0] != '\\') { + const char* begin = s->data(); + if (s->empty() || (*s)[0] != '\\') { // Should not happen - caller always checks. status->set_code(kRegexpInternalError); status->set_error_arg(StringPiece()); return false; } - if (s->size() == 1) { + if (s->size() == 1) { status->set_code(kRegexpTrailingBackslash); status->set_error_arg(StringPiece()); return false; @@ -1490,16 +1490,16 @@ static bool ParseEscape(StringPiece* s, Rune* rp, case '6': case '7': // Single non-zero octal digit is a backreference; not supported. - if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7') + if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7') goto BadEscape; FALLTHROUGH_INTENDED; case '0': // consume up to three octal digits; already have one. 
code = c - '0'; - if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') { + if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') { code = code * 8 + c - '0'; s->remove_prefix(1); // digit - if (!s->empty()) { + if (!s->empty()) { c = (*s)[0]; if ('0' <= c && c <= '7') { code = code * 8 + c - '0'; @@ -1514,7 +1514,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // Hexadecimal escapes case 'x': - if (s->empty()) + if (s->empty()) goto BadEscape; if (StringPieceToRune(&c, s, status) < 0) return false; @@ -1534,7 +1534,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, code = code * 16 + UnHex(c); if (code > rune_max) goto BadEscape; - if (s->empty()) + if (s->empty()) goto BadEscape; if (StringPieceToRune(&c, s, status) < 0) return false; @@ -1545,7 +1545,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, return true; } // Easy case: two hex digits. - if (s->empty()) + if (s->empty()) goto BadEscape; if (StringPieceToRune(&c1, s, status) < 0) return false; @@ -1595,7 +1595,7 @@ BadEscape: // Unrecognized escape sequence. status->set_code(kRegexpBadEscape); status->set_error_arg( - StringPiece(begin, static_cast<size_t>(s->data() - begin))); + StringPiece(begin, static_cast<size_t>(s->data() - begin))); return false; } @@ -1715,7 +1715,7 @@ const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_fl return NULL; // Could use StringPieceToRune, but there aren't // any non-ASCII Perl group names. - StringPiece name(s->data(), 2); + StringPiece name(s->data(), 2); const UGroup *g = LookupPerlGroup(name); if (g == NULL) return NULL; @@ -1755,8 +1755,8 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, return kParseError; if (c != '{') { // Name is the bit of string we just skipped over for c. - const char* p = seq.data() + 2; - name = StringPiece(p, static_cast<size_t>(s->data() - p)); + const char* p = seq.data() + 2; + name = StringPiece(p, static_cast<size_t>(s->data() - p)); } else { // Name is in braces. 
Look for closing } size_t end = s->find('}', 0); @@ -1767,16 +1767,16 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, status->set_error_arg(seq); return kParseError; } - name = StringPiece(s->data(), end); // without '}' + name = StringPiece(s->data(), end); // without '}' s->remove_prefix(end + 1); // with '}' if (!IsValidUTF8(name, status)) return kParseError; } // Chop seq where s now begins. - seq = StringPiece(seq.data(), static_cast<size_t>(s->data() - seq.data())); + seq = StringPiece(seq.data(), static_cast<size_t>(s->data() - seq.data())); - if (!name.empty() && name[0] == '^') { + if (!name.empty() && name[0] == '^') { sign = -sign; name.remove_prefix(1); // '^' } @@ -1795,7 +1795,7 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, // Look up the group in the ICU Unicode data. Because ICU provides full // Unicode properties support, this could be more than a lookup by name. ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8( - std::string("\\p{") + std::string(name) + std::string("}")); + std::string("\\p{") + std::string(name) + std::string("}")); UErrorCode uerr = U_ZERO_ERROR; ::icu::UnicodeSet uset(ustr, uerr); if (U_FAILURE(uerr)) { @@ -1806,12 +1806,12 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, // Convert the UnicodeSet to a URange32 and UGroup that we can add. 
int nr = uset.getRangeCount(); - PODArray<URange32> r(nr); + PODArray<URange32> r(nr); for (int i = 0; i < nr; i++) { r[i].lo = uset.getRangeStart(i); r[i].hi = uset.getRangeEnd(i); } - UGroup g = {"", +1, 0, 0, r.data(), nr}; + UGroup g = {"", +1, 0, 0, r.data(), nr}; AddUGroup(cc, &g, sign, parse_flags); #endif @@ -1862,7 +1862,7 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, const StringPiece& whole_class, RegexpStatus* status) { - if (s->empty()) { + if (s->empty()) { status->set_code(kRegexpMissingBracket); status->set_error_arg(whole_class); return false; @@ -1870,7 +1870,7 @@ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, // Allow regular escape sequences even though // many need not be escaped in this context. - if ((*s)[0] == '\\') + if ((*s)[0] == '\\') return ParseEscape(s, rp, status, rune_max_); // Otherwise take the next rune. @@ -1912,7 +1912,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, Regexp** out_re, RegexpStatus* status) { StringPiece whole_class = *s; - if (s->empty() || (*s)[0] != '[') { + if (s->empty() || (*s)[0] != '[') { // Caller checked this. status->set_code(kRegexpInternalError); status->set_error_arg(StringPiece()); @@ -1922,7 +1922,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); re->ccb_ = new CharClassBuilder; s->remove_prefix(1); // '[' - if (!s->empty() && (*s)[0] == '^') { + if (!s->empty() && (*s)[0] == '^') { s->remove_prefix(1); // '^' negated = true; if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { @@ -1932,7 +1932,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, } } bool first = true; // ] is okay as first char in class - while (!s->empty() && ((*s)[0] != ']' || first)) { + while (!s->empty() && ((*s)[0] != ']' || first)) { // - is only okay unescaped as first or last in class. 
// Except that Perl allows - anywhere. if ((*s)[0] == '-' && !first && !(flags_&PerlX) && @@ -2000,7 +2000,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, // in the flags. re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); } - if (s->empty()) { + if (s->empty()) { status->set_code(kRegexpMissingBracket); status->set_error_arg(whole_class); re->Decref(); @@ -2017,7 +2017,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, // Returns whether name is a valid capture name. static bool IsValidCaptureName(const StringPiece& name) { - if (name.empty()) + if (name.empty()) return false; // Historically, we effectively used [0-9A-Za-z_]+ to validate; that @@ -2093,8 +2093,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { } // t is "P<name>...", t[end] == '>' - StringPiece capture(t.data()-2, end+3); // "(?P<name>" - StringPiece name(t.data()+2, end-2); // "name" + StringPiece capture(t.data()-2, end+3); // "(?P<name>" + StringPiece name(t.data()+2, end-2); // "name" if (!IsValidUTF8(name, status_)) return false; if (!IsValidCaptureName(name)) { @@ -2108,8 +2108,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { return false; } - s->remove_prefix( - static_cast<size_t>(capture.data() + capture.size() - s->data())); + s->remove_prefix( + static_cast<size_t>(capture.data() + capture.size() - s->data())); return true; } @@ -2118,7 +2118,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { int nflags = flags_; Rune c; for (bool done = false; !done; ) { - if (t.empty()) + if (t.empty()) goto BadPerlOp; if (StringPieceToRune(&c, &t, status_) < 0) return false; @@ -2193,7 +2193,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { BadPerlOp: status_->set_code(kRegexpBadPerlOp); status_->set_error_arg( - StringPiece(s->data(), static_cast<size_t>(t.data() - s->data()))); + StringPiece(s->data(), static_cast<size_t>(t.data() - s->data()))); return false; } @@ -2201,7 +2201,7 @@ BadPerlOp: // into UTF8 
encoding in string. // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is // deprecated and because it rejects code points 0x80-0x9F. -void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) { +void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) { char buf[UTFmax]; utf->clear(); @@ -2228,7 +2228,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, // Convert regexp to UTF-8 (easier on the rest of the parser). if (global_flags & Latin1) { - std::string* tmp = new std::string; + std::string* tmp = new std::string; ConvertLatin1ToUTF8(t, tmp); status->set_tmp(tmp); t = *tmp; @@ -2236,7 +2236,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (global_flags & Literal) { // Special parse loop for literal string. - while (!t.empty()) { + while (!t.empty()) { Rune r; if (StringPieceToRune(&r, &t, status) < 0) return NULL; @@ -2247,7 +2247,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, } StringPiece lastunary = StringPiece(); - while (!t.empty()) { + while (!t.empty()) { StringPiece isunary = StringPiece(); switch (t[0]) { default: { @@ -2290,7 +2290,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, break; case '^': // Beginning of line. - if (!ps.PushCaret()) + if (!ps.PushCaret()) return NULL; t.remove_prefix(1); // '^' break; @@ -2331,18 +2331,18 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, bool nongreedy = false; t.remove_prefix(1); // '*' or '+' or '?' if (ps.flags() & PerlX) { - if (!t.empty() && t[0] == '?') { + if (!t.empty() && t[0] == '?') { nongreedy = true; t.remove_prefix(1); // '?' } - if (!lastunary.empty()) { + if (!lastunary.empty()) { // In Perl it is not allowed to stack repetition operators: // a** is a syntax error, not a double-star. // (and a++ means something else entirely, which we don't support!) 
status->set_code(kRegexpRepeatOp); status->set_error_arg(StringPiece( - lastunary.data(), - static_cast<size_t>(t.data() - lastunary.data()))); + lastunary.data(), + static_cast<size_t>(t.data() - lastunary.data()))); return NULL; } } @@ -2366,16 +2366,16 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, } bool nongreedy = false; if (ps.flags() & PerlX) { - if (!t.empty() && t[0] == '?') { + if (!t.empty() && t[0] == '?') { nongreedy = true; t.remove_prefix(1); // '?' } - if (!lastunary.empty()) { + if (!lastunary.empty()) { // Not allowed to stack repetition operators. status->set_code(kRegexpRepeatOp); status->set_error_arg(StringPiece( - lastunary.data(), - static_cast<size_t>(t.data() - lastunary.data()))); + lastunary.data(), + static_cast<size_t>(t.data() - lastunary.data()))); return NULL; } } @@ -2424,7 +2424,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (t[1] == 'Q') { // \Q ... \E: the ... is always literals t.remove_prefix(2); // '\\', 'Q' - while (!t.empty()) { + while (!t.empty()) { if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { t.remove_prefix(2); // '\\', 'E' break; diff --git a/contrib/libs/re2/re2/perl_groups.cc b/contrib/libs/re2/re2/perl_groups.cc index 17c74a9287..4687444581 100644 --- a/contrib/libs/re2/re2/perl_groups.cc +++ b/contrib/libs/re2/re2/perl_groups.cc @@ -20,12 +20,12 @@ static const URange16 code3[] = { /* \w */ { 0x61, 0x7a }, }; const UGroup perl_groups[] = { - { "\\d", +1, code1, 1, 0, 0 }, - { "\\D", -1, code1, 1, 0, 0 }, - { "\\s", +1, code2, 3, 0, 0 }, - { "\\S", -1, code2, 3, 0, 0 }, - { "\\w", +1, code3, 4, 0, 0 }, - { "\\W", -1, code3, 4, 0, 0 }, + { "\\d", +1, code1, 1, 0, 0 }, + { "\\D", -1, code1, 1, 0, 0 }, + { "\\s", +1, code2, 3, 0, 0 }, + { "\\S", -1, code2, 3, 0, 0 }, + { "\\w", +1, code3, 4, 0, 0 }, + { "\\W", -1, code3, 4, 0, 0 }, }; const int num_perl_groups = 6; static const URange16 code4[] = { /* [:alnum:] */ @@ -85,34 +85,34 @@ static const URange16 
code17[] = { /* [:xdigit:] */ { 0x61, 0x66 }, }; const UGroup posix_groups[] = { - { "[:alnum:]", +1, code4, 3, 0, 0 }, - { "[:^alnum:]", -1, code4, 3, 0, 0 }, - { "[:alpha:]", +1, code5, 2, 0, 0 }, - { "[:^alpha:]", -1, code5, 2, 0, 0 }, - { "[:ascii:]", +1, code6, 1, 0, 0 }, - { "[:^ascii:]", -1, code6, 1, 0, 0 }, - { "[:blank:]", +1, code7, 2, 0, 0 }, - { "[:^blank:]", -1, code7, 2, 0, 0 }, - { "[:cntrl:]", +1, code8, 2, 0, 0 }, - { "[:^cntrl:]", -1, code8, 2, 0, 0 }, - { "[:digit:]", +1, code9, 1, 0, 0 }, - { "[:^digit:]", -1, code9, 1, 0, 0 }, - { "[:graph:]", +1, code10, 1, 0, 0 }, - { "[:^graph:]", -1, code10, 1, 0, 0 }, - { "[:lower:]", +1, code11, 1, 0, 0 }, - { "[:^lower:]", -1, code11, 1, 0, 0 }, - { "[:print:]", +1, code12, 1, 0, 0 }, - { "[:^print:]", -1, code12, 1, 0, 0 }, - { "[:punct:]", +1, code13, 4, 0, 0 }, - { "[:^punct:]", -1, code13, 4, 0, 0 }, - { "[:space:]", +1, code14, 2, 0, 0 }, - { "[:^space:]", -1, code14, 2, 0, 0 }, - { "[:upper:]", +1, code15, 1, 0, 0 }, - { "[:^upper:]", -1, code15, 1, 0, 0 }, - { "[:word:]", +1, code16, 4, 0, 0 }, - { "[:^word:]", -1, code16, 4, 0, 0 }, - { "[:xdigit:]", +1, code17, 3, 0, 0 }, - { "[:^xdigit:]", -1, code17, 3, 0, 0 }, + { "[:alnum:]", +1, code4, 3, 0, 0 }, + { "[:^alnum:]", -1, code4, 3, 0, 0 }, + { "[:alpha:]", +1, code5, 2, 0, 0 }, + { "[:^alpha:]", -1, code5, 2, 0, 0 }, + { "[:ascii:]", +1, code6, 1, 0, 0 }, + { "[:^ascii:]", -1, code6, 1, 0, 0 }, + { "[:blank:]", +1, code7, 2, 0, 0 }, + { "[:^blank:]", -1, code7, 2, 0, 0 }, + { "[:cntrl:]", +1, code8, 2, 0, 0 }, + { "[:^cntrl:]", -1, code8, 2, 0, 0 }, + { "[:digit:]", +1, code9, 1, 0, 0 }, + { "[:^digit:]", -1, code9, 1, 0, 0 }, + { "[:graph:]", +1, code10, 1, 0, 0 }, + { "[:^graph:]", -1, code10, 1, 0, 0 }, + { "[:lower:]", +1, code11, 1, 0, 0 }, + { "[:^lower:]", -1, code11, 1, 0, 0 }, + { "[:print:]", +1, code12, 1, 0, 0 }, + { "[:^print:]", -1, code12, 1, 0, 0 }, + { "[:punct:]", +1, code13, 4, 0, 0 }, + { "[:^punct:]", -1, code13, 4, 0, 0 
}, + { "[:space:]", +1, code14, 2, 0, 0 }, + { "[:^space:]", -1, code14, 2, 0, 0 }, + { "[:upper:]", +1, code15, 1, 0, 0 }, + { "[:^upper:]", -1, code15, 1, 0, 0 }, + { "[:word:]", +1, code16, 4, 0, 0 }, + { "[:^word:]", -1, code16, 4, 0, 0 }, + { "[:xdigit:]", +1, code17, 3, 0, 0 }, + { "[:^xdigit:]", -1, code17, 3, 0, 0 }, }; const int num_posix_groups = 28; diff --git a/contrib/libs/re2/re2/pod_array.h b/contrib/libs/re2/re2/pod_array.h index fdec6ffa03..f234e976f4 100644 --- a/contrib/libs/re2/re2/pod_array.h +++ b/contrib/libs/re2/re2/pod_array.h @@ -1,55 +1,55 @@ -// Copyright 2018 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef RE2_POD_ARRAY_H_ -#define RE2_POD_ARRAY_H_ - -#include <memory> -#include <type_traits> - -namespace re2 { - -template <typename T> -class PODArray { - public: - static_assert(std::is_trivial<T>::value && std::is_standard_layout<T>::value, - "T must be POD"); - - PODArray() - : ptr_() {} - explicit PODArray(int len) - : ptr_(std::allocator<T>().allocate(len), Deleter(len)) {} - - T* data() const { - return ptr_.get(); - } - - int size() const { - return ptr_.get_deleter().len_; - } - - T& operator[](int pos) const { - return ptr_[pos]; - } - - private: - struct Deleter { - Deleter() - : len_(0) {} - explicit Deleter(int len) - : len_(len) {} - - void operator()(T* ptr) const { - std::allocator<T>().deallocate(ptr, len_); - } - - int len_; - }; - - std::unique_ptr<T[], Deleter> ptr_; -}; - -} // namespace re2 - -#endif // RE2_POD_ARRAY_H_ +// Copyright 2018 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef RE2_POD_ARRAY_H_ +#define RE2_POD_ARRAY_H_ + +#include <memory> +#include <type_traits> + +namespace re2 { + +template <typename T> +class PODArray { + public: + static_assert(std::is_trivial<T>::value && std::is_standard_layout<T>::value, + "T must be POD"); + + PODArray() + : ptr_() {} + explicit PODArray(int len) + : ptr_(std::allocator<T>().allocate(len), Deleter(len)) {} + + T* data() const { + return ptr_.get(); + } + + int size() const { + return ptr_.get_deleter().len_; + } + + T& operator[](int pos) const { + return ptr_[pos]; + } + + private: + struct Deleter { + Deleter() + : len_(0) {} + explicit Deleter(int len) + : len_(len) {} + + void operator()(T* ptr) const { + std::allocator<T>().deallocate(ptr, len_); + } + + int len_; + }; + + std::unique_ptr<T[], Deleter> ptr_; +}; + +} // namespace re2 + +#endif // RE2_POD_ARRAY_H_ diff --git a/contrib/libs/re2/re2/prefilter.cc b/contrib/libs/re2/re2/prefilter.cc index 30b2570612..a47b3120fb 100644 --- a/contrib/libs/re2/re2/prefilter.cc +++ b/contrib/libs/re2/re2/prefilter.cc @@ -21,8 +21,8 @@ namespace re2 { static const bool ExtraDebug = false; -typedef std::set<std::string>::iterator SSIter; -typedef std::set<std::string>::const_iterator ConstSSIter; +typedef std::set<std::string>::iterator SSIter; +typedef std::set<std::string>::const_iterator ConstSSIter; // Initializes a Prefilter, allocating subs_ as necessary. Prefilter::Prefilter(Op op) { @@ -140,35 +140,35 @@ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { return AndOr(OR, a, b); } -static void SimplifyStringSet(std::set<std::string>* ss) { +static void SimplifyStringSet(std::set<std::string>* ss) { // Now make sure that the strings aren't redundant. For example, if // we know "ab" is a required string, then it doesn't help at all to // know that "abc" is also a required string, so delete "abc". 
This // is because, when we are performing a string search to filter - // regexps, matching "ab" will already allow this regexp to be a - // candidate for match, so further matching "abc" is redundant. - // Note that we must ignore "" because find() would find it at the - // start of everything and thus we would end up erasing everything. + // regexps, matching "ab" will already allow this regexp to be a + // candidate for match, so further matching "abc" is redundant. + // Note that we must ignore "" because find() would find it at the + // start of everything and thus we would end up erasing everything. for (SSIter i = ss->begin(); i != ss->end(); ++i) { - if (i->empty()) - continue; + if (i->empty()) + continue; SSIter j = i; ++j; while (j != ss->end()) { - if (j->find(*i) != std::string::npos) { - j = ss->erase(j); - continue; - } + if (j->find(*i) != std::string::npos) { + j = ss->erase(j); + continue; + } ++j; } } } -Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) { - Prefilter* or_prefilter = new Prefilter(NONE); +Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) { + Prefilter* or_prefilter = new Prefilter(NONE); SimplifyStringSet(ss); - for (SSIter i = ss->begin(); i != ss->end(); ++i) - or_prefilter = Or(or_prefilter, FromString(*i)); + for (SSIter i = ss->begin(); i != ss->end(); ++i) + or_prefilter = Or(or_prefilter, FromString(*i)); return or_prefilter; } @@ -191,7 +191,7 @@ static Rune ToLowerRuneLatin1(Rune r) { return r; } -Prefilter* Prefilter::FromString(const std::string& str) { +Prefilter* Prefilter::FromString(const std::string& str) { Prefilter* m = new Prefilter(Prefilter::ATOM); m->atom_ = str; return m; @@ -214,26 +214,26 @@ class Prefilter::Info { static Info* Quest(Info* a); static Info* EmptyString(); static Info* NoMatch(); - static Info* AnyCharOrAnyByte(); + static Info* AnyCharOrAnyByte(); static Info* CClass(CharClass* cc, bool latin1); static Info* Literal(Rune r); static Info* LiteralLatin1(Rune r); static Info* 
AnyMatch(); // Format Info as a string. - std::string ToString(); + std::string ToString(); // Caller takes ownership of the Prefilter. Prefilter* TakeMatch(); - std::set<std::string>& exact() { return exact_; } + std::set<std::string>& exact() { return exact_; } bool is_exact() const { return is_exact_; } class Walker; private: - std::set<std::string> exact_; + std::set<std::string> exact_; // When is_exact_ is true, the strings that match // are placed in exact_. When it is no longer an exact @@ -268,11 +268,11 @@ Prefilter* Prefilter::Info::TakeMatch() { } // Format a Info in string form. -std::string Prefilter::Info::ToString() { +std::string Prefilter::Info::ToString() { if (is_exact_) { int n = 0; - std::string s; - for (SSIter i = exact_.begin(); i != exact_.end(); ++i) { + std::string s; + for (SSIter i = exact_.begin(); i != exact_.end(); ++i) { if (n++ > 0) s += ","; s += *i; @@ -287,17 +287,17 @@ std::string Prefilter::Info::ToString() { } // Add the strings from src to dst. -static void CopyIn(const std::set<std::string>& src, - std::set<std::string>* dst) { +static void CopyIn(const std::set<std::string>& src, + std::set<std::string>* dst) { for (ConstSSIter i = src.begin(); i != src.end(); ++i) dst->insert(*i); } // Add the cross-product of a and b to dst. // (For each string i in a and j in b, add i+j.) 
-static void CrossProduct(const std::set<std::string>& a, - const std::set<std::string>& b, - std::set<std::string>* dst) { +static void CrossProduct(const std::set<std::string>& a, + const std::set<std::string>& b, + std::set<std::string>* dst) { for (ConstSSIter i = a.begin(); i != a.end(); ++i) for (ConstSSIter j = b.begin(); j != b.end(); ++j) dst->insert(*i + *j); @@ -388,15 +388,15 @@ Prefilter::Info* Prefilter::Info::Plus(Info *a) { return ab; } -static std::string RuneToString(Rune r) { +static std::string RuneToString(Rune r) { char buf[UTFmax]; int n = runetochar(buf, &r); - return std::string(buf, n); + return std::string(buf, n); } -static std::string RuneToStringLatin1(Rune r) { +static std::string RuneToStringLatin1(Rune r) { char c = r & 0xff; - return std::string(&c, 1); + return std::string(&c, 1); } // Constructs Info for literal rune. @@ -415,8 +415,8 @@ Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) { return info; } -// Constructs Info for dot (any character) or \C (any byte). -Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() { +// Constructs Info for dot (any character) or \C (any byte). +Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() { Prefilter::Info* info = new Prefilter::Info(); info->match_ = new Prefilter(ALL); return info; @@ -459,7 +459,7 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, // If the class is too large, it's okay to overestimate. if (cc->size() > 10) - return AnyCharOrAnyByte(); + return AnyCharOrAnyByte(); Prefilter::Info *a = new Prefilter::Info(); for (CCIter i = cc->begin(); i != cc->end(); ++i) @@ -620,9 +620,9 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( break; case kRegexpAnyChar: - case kRegexpAnyByte: + case kRegexpAnyByte: // Claim nothing, except that it's not empty. 
- info = AnyCharOrAnyByte(); + info = AnyCharOrAnyByte(); break; case kRegexpCharClass: @@ -648,10 +648,10 @@ Prefilter* Prefilter::FromRegexp(Regexp* re) { return NULL; Regexp* simple = re->Simplify(); - if (simple == NULL) - return NULL; + if (simple == NULL) + return NULL; - Prefilter::Info* info = BuildInfo(simple); + Prefilter::Info* info = BuildInfo(simple); simple->Decref(); if (info == NULL) return NULL; @@ -661,7 +661,7 @@ Prefilter* Prefilter::FromRegexp(Regexp* re) { return m; } -std::string Prefilter::DebugString() const { +std::string Prefilter::DebugString() const { switch (op_) { default: LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; @@ -673,7 +673,7 @@ std::string Prefilter::DebugString() const { case ALL: return ""; case AND: { - std::string s = ""; + std::string s = ""; for (size_t i = 0; i < subs_->size(); i++) { if (i > 0) s += " "; @@ -683,7 +683,7 @@ std::string Prefilter::DebugString() const { return s; } case OR: { - std::string s = "("; + std::string s = "("; for (size_t i = 0; i < subs_->size(); i++) { if (i > 0) s += "|"; diff --git a/contrib/libs/re2/re2/prefilter.h b/contrib/libs/re2/re2/prefilter.h index b11369ddac..4fedeb4a7c 100644 --- a/contrib/libs/re2/re2/prefilter.h +++ b/contrib/libs/re2/re2/prefilter.h @@ -37,7 +37,7 @@ class Prefilter { ~Prefilter(); Op op() { return op_; } - const std::string& atom() const { return atom_; } + const std::string& atom() const { return atom_; } void set_unique_id(int id) { unique_id_ = id; } int unique_id() const { return unique_id_; } @@ -57,7 +57,7 @@ class Prefilter { static Prefilter* FromRE2(const RE2* re2); // Returns a readable debug string of the prefilter. 
- std::string DebugString() const; + std::string DebugString() const; private: class Info; @@ -75,9 +75,9 @@ class Prefilter { static Prefilter* FromRegexp(Regexp* a); - static Prefilter* FromString(const std::string& str); + static Prefilter* FromString(const std::string& str); - static Prefilter* OrStrings(std::set<std::string>* ss); + static Prefilter* OrStrings(std::set<std::string>* ss); static Info* BuildInfo(Regexp* re); @@ -90,7 +90,7 @@ class Prefilter { std::vector<Prefilter*>* subs_; // Actual string to match in leaf node. - std::string atom_; + std::string atom_; // If different prefilters have the same string atom, or if they are // structurally the same (e.g., OR of same atom strings) they are diff --git a/contrib/libs/re2/re2/prefilter_tree.cc b/contrib/libs/re2/re2/prefilter_tree.cc index 6f24aa6aa3..fdf4e083c9 100644 --- a/contrib/libs/re2/re2/prefilter_tree.cc +++ b/contrib/libs/re2/re2/prefilter_tree.cc @@ -15,7 +15,7 @@ #include "util/util.h" #include "util/logging.h" -#include "util/strutil.h" +#include "util/strutil.h" #include "re2/prefilter.h" #include "re2/re2.h" @@ -54,22 +54,22 @@ void PrefilterTree::Add(Prefilter* prefilter) { prefilter_vec_.push_back(prefilter); } -void PrefilterTree::Compile(std::vector<std::string>* atom_vec) { +void PrefilterTree::Compile(std::vector<std::string>* atom_vec) { if (compiled_) { LOG(DFATAL) << "Compile called already."; return; } - // Some legacy users of PrefilterTree call Compile() before - // adding any regexps and expect Compile() to have no effect. + // Some legacy users of PrefilterTree call Compile() before + // adding any regexps and expect Compile() to have no effect. if (prefilter_vec_.empty()) return; compiled_ = true; - // TODO(junyer): Use std::unordered_set<Prefilter*> instead? - NodeMap nodes; - AssignUniqueIds(&nodes, atom_vec); + // TODO(junyer): Use std::unordered_set<Prefilter*> instead? 
+ NodeMap nodes; + AssignUniqueIds(&nodes, atom_vec); // Identify nodes that are too common among prefilters and are // triggering too many parents. Then get rid of them if possible. @@ -102,27 +102,27 @@ void PrefilterTree::Compile(std::vector<std::string>* atom_vec) { } if (ExtraDebug) - PrintDebugInfo(&nodes); + PrintDebugInfo(&nodes); } -Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { - std::string node_string = NodeString(node); - NodeMap::iterator iter = nodes->find(node_string); - if (iter == nodes->end()) +Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { + std::string node_string = NodeString(node); + NodeMap::iterator iter = nodes->find(node_string); + if (iter == nodes->end()) return NULL; return (*iter).second; } -std::string PrefilterTree::NodeString(Prefilter* node) const { +std::string PrefilterTree::NodeString(Prefilter* node) const { // Adding the operation disambiguates AND/OR/atom nodes. - std::string s = StringPrintf("%d", node->op()) + ":"; + std::string s = StringPrintf("%d", node->op()) + ":"; if (node->op() == Prefilter::ATOM) { s += node->atom(); } else { for (size_t i = 0; i < node->subs()->size(); i++) { if (i > 0) s += ','; - s += StringPrintf("%d", (*node->subs())[i]->unique_id()); + s += StringPrintf("%d", (*node->subs())[i]->unique_id()); } } return s; @@ -138,7 +138,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const { return false; case Prefilter::ALL: - case Prefilter::NONE: + case Prefilter::NONE: return false; case Prefilter::ATOM: @@ -165,8 +165,8 @@ bool PrefilterTree::KeepNode(Prefilter* node) const { } } -void PrefilterTree::AssignUniqueIds(NodeMap* nodes, - std::vector<std::string>* atom_vec) { +void PrefilterTree::AssignUniqueIds(NodeMap* nodes, + std::vector<std::string>* atom_vec) { atom_vec->clear(); // Build vector of all filter nodes, sorted topologically @@ -203,11 +203,11 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, if (node == NULL) continue; 
node->set_unique_id(-1); - Prefilter* canonical = CanonicalNode(nodes, node); + Prefilter* canonical = CanonicalNode(nodes, node); if (canonical == NULL) { // Any further nodes that have the same node string // will find this node as the canonical node. - nodes->emplace(NodeString(node), node); + nodes->emplace(NodeString(node), node); if (node->op() == Prefilter::ATOM) { atom_vec->push_back(node->atom()); atom_index_to_id_.push_back(unique_id); @@ -217,7 +217,7 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, node->set_unique_id(canonical->unique_id()); } } - entries_.resize(nodes->size()); + entries_.resize(nodes->size()); // Create parent StdIntMap for the entries. for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { @@ -225,7 +225,7 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, if (prefilter == NULL) continue; - if (CanonicalNode(nodes, prefilter) != prefilter) + if (CanonicalNode(nodes, prefilter) != prefilter) continue; Entry* entry = &entries_[prefilter->unique_id()]; @@ -238,7 +238,7 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, if (prefilter == NULL) continue; - if (CanonicalNode(nodes, prefilter) != prefilter) + if (CanonicalNode(nodes, prefilter) != prefilter) continue; Entry* entry = &entries_[prefilter->unique_id()]; @@ -258,7 +258,7 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, std::set<int> uniq_child; for (size_t j = 0; j < prefilter->subs()->size(); j++) { Prefilter* child = (*prefilter->subs())[j]; - Prefilter* canonical = CanonicalNode(nodes, child); + Prefilter* canonical = CanonicalNode(nodes, child); if (canonical == NULL) { LOG(DFATAL) << "Null canonical node"; return; @@ -285,7 +285,7 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, for (size_t i = 0; i < prefilter_vec_.size(); i++) { if (prefilter_vec_[i] == NULL) continue; - int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id(); + int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id(); DCHECK_LE(0, id); Entry* entry = 
&entries_[id]; entry->regexps.push_back(static_cast<int>(i)); @@ -298,27 +298,27 @@ void PrefilterTree::RegexpsGivenStrings( std::vector<int>* regexps) const { regexps->clear(); if (!compiled_) { - // Some legacy users of PrefilterTree call Compile() before - // adding any regexps and expect Compile() to have no effect. - // This kludge is a counterpart to that kludge. - if (prefilter_vec_.empty()) - return; - + // Some legacy users of PrefilterTree call Compile() before + // adding any regexps and expect Compile() to have no effect. + // This kludge is a counterpart to that kludge. + if (prefilter_vec_.empty()) + return; + LOG(ERROR) << "RegexpsGivenStrings called before Compile."; - for (size_t i = 0; i < prefilter_vec_.size(); i++) + for (size_t i = 0; i < prefilter_vec_.size(); i++) regexps->push_back(static_cast<int>(i)); } else { - IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); - std::vector<int> matched_atom_ids; - for (size_t j = 0; j < matched_atoms.size(); j++) - matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); - PropagateMatch(matched_atom_ids, &regexps_map); - for (IntMap::iterator it = regexps_map.begin(); - it != regexps_map.end(); - ++it) - regexps->push_back(it->index()); - - regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); + IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); + std::vector<int> matched_atom_ids; + for (size_t j = 0; j < matched_atoms.size(); j++) + matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); + PropagateMatch(matched_atom_ids, &regexps_map); + for (IntMap::iterator it = regexps_map.begin(); + it != regexps_map.end(); + ++it) + regexps->push_back(it->index()); + + regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); } std::sort(regexps->begin(), regexps->end()); } @@ -364,11 +364,11 @@ void PrefilterTree::PrintPrefilter(int regexpid) { LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]); } -void PrefilterTree::PrintDebugInfo(NodeMap* 
nodes) { +void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size(); LOG(ERROR) << "#Unique Nodes: " << entries_.size(); - for (size_t i = 0; i < entries_.size(); i++) { + for (size_t i = 0; i < entries_.size(); i++) { StdIntMap* parents = entries_[i].parents; const std::vector<int>& regexps = entries_[i].regexps; LOG(ERROR) << "EntryId: " << i @@ -377,14 +377,14 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { LOG(ERROR) << it->first; } LOG(ERROR) << "Map:"; - for (NodeMap::const_iterator iter = nodes->begin(); - iter != nodes->end(); ++iter) + for (NodeMap::const_iterator iter = nodes->begin(); + iter != nodes->end(); ++iter) LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() << " Str: " << (*iter).first; } -std::string PrefilterTree::DebugNodeString(Prefilter* node) const { - std::string node_string = ""; +std::string PrefilterTree::DebugNodeString(Prefilter* node) const { + std::string node_string = ""; if (node->op() == Prefilter::ATOM) { DCHECK(!node->atom().empty()); node_string += node->atom(); @@ -395,7 +395,7 @@ std::string PrefilterTree::DebugNodeString(Prefilter* node) const { for (size_t i = 0; i < node->subs()->size(); i++) { if (i > 0) node_string += ','; - node_string += StringPrintf("%d", (*node->subs())[i]->unique_id()); + node_string += StringPrintf("%d", (*node->subs())[i]->unique_id()); node_string += ":"; node_string += DebugNodeString((*node->subs())[i]); } diff --git a/contrib/libs/re2/re2/prefilter_tree.h b/contrib/libs/re2/re2/prefilter_tree.h index d61fdce948..5d73074d97 100644 --- a/contrib/libs/re2/re2/prefilter_tree.h +++ b/contrib/libs/re2/re2/prefilter_tree.h @@ -7,7 +7,7 @@ // The PrefilterTree class is used to form an AND-OR tree of strings // that would trigger each regexp. 
The 'prefilter' of each regexp is -// added to PrefilterTree, and then PrefilterTree is used to find all +// added to PrefilterTree, and then PrefilterTree is used to find all // the unique strings across the prefilters. During search, by using // matches from a string matching engine, PrefilterTree deduces the // set of regexps that are to be triggered. The 'string matching @@ -21,8 +21,8 @@ #include <vector> #include "util/util.h" -#include "re2/prefilter.h" -#include "re2/sparse_array.h" +#include "re2/prefilter.h" +#include "re2/sparse_array.h" namespace re2 { @@ -43,7 +43,7 @@ class PrefilterTree { // The caller should use the returned set of strings to do string matching. // Each time a string matches, the corresponding index then has to be // and passed to RegexpsGivenStrings below. - void Compile(std::vector<std::string>* atom_vec); + void Compile(std::vector<std::string>* atom_vec); // Given the indices of the atoms that matched, returns the indexes // of regexps that should be searched. The matched_atoms should @@ -57,10 +57,10 @@ class PrefilterTree { // nodes of the prefilter of the regexp. void PrintPrefilter(int regexpid); - private: - typedef SparseArray<int> IntMap; - typedef std::map<int, int> StdIntMap; - typedef std::map<std::string, Prefilter*> NodeMap; + private: + typedef SparseArray<int> IntMap; + typedef std::map<int, int> StdIntMap; + typedef std::map<std::string, Prefilter*> NodeMap; // Each unique node has a corresponding Entry that helps in // passing the matching trigger information along the tree. @@ -90,7 +90,7 @@ class PrefilterTree { // This function assigns unique ids to various parts of the // prefilter, by looking at if these nodes are already in the // PrefilterTree. - void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec); + void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec); // Given the matching atoms, find the regexps to be triggered. 
void PropagateMatch(const std::vector<int>& atom_ids, @@ -98,17 +98,17 @@ class PrefilterTree { // Returns the prefilter node that has the same NodeString as this // node. For the canonical node, returns node. - Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); + Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); // A string that uniquely identifies the node. Assumes that the // children of node has already been assigned unique ids. - std::string NodeString(Prefilter* node) const; + std::string NodeString(Prefilter* node) const; // Recursively constructs a readable prefilter string. - std::string DebugNodeString(Prefilter* node) const; + std::string DebugNodeString(Prefilter* node) const; // Used for debugging. - void PrintDebugInfo(NodeMap* nodes); + void PrintDebugInfo(NodeMap* nodes); // These are all the nodes formed by Compile. Essentially, there is // one node for each unique atom and each unique AND/OR node. diff --git a/contrib/libs/re2/re2/prog.cc b/contrib/libs/re2/re2/prog.cc index 754bc88df0..a700d35de3 100644 --- a/contrib/libs/re2/re2/prog.cc +++ b/contrib/libs/re2/re2/prog.cc @@ -7,12 +7,12 @@ #include "re2/prog.h" -#if defined(__AVX2__) -#include <immintrin.h> -#ifdef _MSC_VER -#include <intrin.h> -#endif -#endif +#if defined(__AVX2__) +#include <immintrin.h> +#ifdef _MSC_VER +#include <intrin.h> +#endif +#endif #include <stdint.h> #include <string.h> #include <algorithm> @@ -40,7 +40,7 @@ void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) { set_out_opcode(out, kInstByteRange); lo_ = lo & 0xFF; hi_ = hi & 0xFF; - hint_foldcase_ = foldcase&1; + hint_foldcase_ = foldcase&1; } void Prog::Inst::InitCapture(int cap, uint32_t out) { @@ -71,7 +71,7 @@ void Prog::Inst::InitFail() { set_opcode(kInstFail); } -std::string Prog::Inst::Dump() { +std::string Prog::Inst::Dump() { switch (opcode()) { default: return StringPrintf("opcode %d", static_cast<int>(opcode())); @@ -83,9 +83,9 @@ std::string Prog::Inst::Dump() { 
return StringPrintf("altmatch -> %d | %d", out(), out1_); case kInstByteRange: - return StringPrintf("byte%s [%02x-%02x] %d -> %d", - foldcase() ? "/i" : "", - lo_, hi_, hint(), out()); + return StringPrintf("byte%s [%02x-%02x] %d -> %d", + foldcase() ? "/i" : "", + lo_, hi_, hint(), out()); case kInstCapture: return StringPrintf("capture %d -> %d", cap_, out()); @@ -115,8 +115,8 @@ Prog::Prog() start_unanchored_(0), size_(0), bytemap_range_(0), - prefix_foldcase_(false), - prefix_size_(0), + prefix_foldcase_(false), + prefix_size_(0), list_count_(0), bit_state_text_max_size_(0), dfa_mem_(0), @@ -127,8 +127,8 @@ Prog::Prog() Prog::~Prog() { DeleteDFA(dfa_longest_); DeleteDFA(dfa_first_); - if (prefix_foldcase_) - delete[] prefix_dfa_; + if (prefix_foldcase_) + delete[] prefix_dfa_; } typedef SparseSet Workq; @@ -138,12 +138,12 @@ static inline void AddToQueue(Workq* q, int id) { q->insert(id); } -static std::string ProgToString(Prog* prog, Workq* q) { - std::string s; +static std::string ProgToString(Prog* prog, Workq* q) { + std::string s; for (Workq::iterator i = q->begin(); i != q->end(); ++i) { int id = *i; Prog::Inst* ip = prog->inst(id); - s += StringPrintf("%d. %s\n", id, ip->Dump().c_str()); + s += StringPrintf("%d. %s\n", id, ip->Dump().c_str()); AddToQueue(q, ip->out()); if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) AddToQueue(q, ip->out1()); @@ -151,19 +151,19 @@ static std::string ProgToString(Prog* prog, Workq* q) { return s; } -static std::string FlattenedProgToString(Prog* prog, int start) { - std::string s; +static std::string FlattenedProgToString(Prog* prog, int start) { + std::string s; for (int id = start; id < prog->size(); id++) { Prog::Inst* ip = prog->inst(id); if (ip->last()) - s += StringPrintf("%d. %s\n", id, ip->Dump().c_str()); + s += StringPrintf("%d. 
%s\n", id, ip->Dump().c_str()); else - s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str()); + s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str()); } return s; } -std::string Prog::Dump() { +std::string Prog::Dump() { if (did_flatten_) return FlattenedProgToString(this, start_); @@ -172,7 +172,7 @@ std::string Prog::Dump() { return ProgToString(this, &q); } -std::string Prog::DumpUnanchored() { +std::string Prog::DumpUnanchored() { if (did_flatten_) return FlattenedProgToString(this, start_unanchored_); @@ -181,43 +181,43 @@ std::string Prog::DumpUnanchored() { return ProgToString(this, &q); } -std::string Prog::DumpByteMap() { - std::string map; +std::string Prog::DumpByteMap() { + std::string map; for (int c = 0; c < 256; c++) { int b = bytemap_[c]; int lo = c; while (c < 256-1 && bytemap_[c+1] == b) c++; int hi = c; - map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b); + map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b); } return map; } -// Is ip a guaranteed match at end of text, perhaps after some capturing? -static bool IsMatch(Prog* prog, Prog::Inst* ip) { - for (;;) { - switch (ip->opcode()) { - default: - LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); - return false; - - case kInstAlt: - case kInstAltMatch: - case kInstByteRange: - case kInstFail: - case kInstEmptyWidth: - return false; - - case kInstCapture: - case kInstNop: - ip = prog->inst(ip->out()); - break; - - case kInstMatch: - return true; - } - } +// Is ip a guaranteed match at end of text, perhaps after some capturing? +static bool IsMatch(Prog* prog, Prog::Inst* ip) { + for (;;) { + switch (ip->opcode()) { + default: + LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); + return false; + + case kInstAlt: + case kInstAltMatch: + case kInstByteRange: + case kInstFail: + case kInstEmptyWidth: + return false; + + case kInstCapture: + case kInstNop: + ip = prog->inst(ip->out()); + break; + + case kInstMatch: + return true; + } + } } // Peep-hole optimizer. 
@@ -288,24 +288,24 @@ uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { int flags = 0; // ^ and \A - if (p == text.data()) + if (p == text.data()) flags |= kEmptyBeginText | kEmptyBeginLine; else if (p[-1] == '\n') flags |= kEmptyBeginLine; // $ and \z - if (p == text.data() + text.size()) + if (p == text.data() + text.size()) flags |= kEmptyEndText | kEmptyEndLine; - else if (p < text.data() + text.size() && p[0] == '\n') + else if (p < text.data() + text.size() && p[0] == '\n') flags |= kEmptyEndLine; // \b and \B - if (p == text.data() && p == text.data() + text.size()) { + if (p == text.data() && p == text.data() + text.size()) { // no word boundary here - } else if (p == text.data()) { + } else if (p == text.data()) { if (IsWordChar(p[0])) flags |= kEmptyWordBoundary; - } else if (p == text.data() + text.size()) { + } else if (p == text.data() + text.size()) { if (IsWordChar(p[-1])) flags |= kEmptyWordBoundary; } else { @@ -353,7 +353,7 @@ class ByteMapBuilder { int Recolor(int oldcolor); Bitmap256 splits_; - int colors_[256]; + int colors_[256]; int nextcolor_; std::vector<std::pair<int, int>> colormap_; std::vector<std::pair<int, int>> ranges_; @@ -467,11 +467,11 @@ void Prog::ComputeByteMap() { foldlo = 'a'; if (foldhi > 'z') foldhi = 'z'; - if (foldlo <= foldhi) { - foldlo += 'A' - 'a'; - foldhi += 'A' - 'a'; - builder.Mark(foldlo, foldhi); - } + if (foldlo <= foldhi) { + foldlo += 'A' - 'a'; + foldhi += 'A' - 'a'; + builder.Mark(foldlo, foldhi); + } } // If this Inst is not the last Inst in its list AND the next Inst is // also a ByteRange AND the Insts have the same out, defer the merge. @@ -538,7 +538,7 @@ void Prog::ComputeByteMap() { // dominator of the instructions reachable from some "successor root" (i.e. it // has an unreachable predecessor) and is considered a "dominator root". 
Since // only Alt instructions can be "dominator roots" (other instructions would be -// "leaves"), only Alt instructions are required to be marked as predecessors. +// "leaves"), only Alt instructions are required to be marked as predecessors. // // Dividing the Prog into "trees" comprises two passes: marking the "successor // roots" and the predecessors; and marking the "dominator roots". Sorting the @@ -593,9 +593,9 @@ void Prog::Flatten() { flatmap[i->value()] = static_cast<int>(flat.size()); EmitList(i->index(), &rootmap, &flat, &reachable, &stk); flat.back().set_last(); - // We have the bounds of the "list", so this is the - // most convenient point at which to compute hints. - ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size())); + // We have the bounds of the "list", so this is the + // most convenient point at which to compute hints. + ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size())); } list_count_ = static_cast<int>(flatmap.size()); @@ -632,18 +632,18 @@ void Prog::Flatten() { // Finally, replace the old instructions with the new instructions. size_ = static_cast<int>(flat.size()); - inst_ = PODArray<Inst>(size_); - memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]); - - // Populate the list heads for BitState. - // 512 instructions limits the memory footprint to 1KiB. - if (size_ <= 512) { - list_heads_ = PODArray<uint16_t>(size_); - // 0xFF makes it more obvious if we try to look up a non-head. - memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]); - for (int i = 0; i < list_count_; ++i) - list_heads_[flatmap[i]] = i; - } + inst_ = PODArray<Inst>(size_); + memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]); + + // Populate the list heads for BitState. + // 512 instructions limits the memory footprint to 1KiB. + if (size_ <= 512) { + list_heads_ = PODArray<uint16_t>(size_); + // 0xFF makes it more obvious if we try to look up a non-head. 
+ memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]); + for (int i = 0; i < list_count_; ++i) + list_heads_[flatmap[i]] = i; + } // BitState allocates a bitmap of size list_count_ * (text.size()+1) // for tracking pairs of possibilities that it has already explored. @@ -841,335 +841,335 @@ void Prog::EmitList(int root, SparseArray<int>* rootmap, } } -// For each ByteRange instruction in [begin, end), computes a hint to execution -// engines: the delta to the next instruction (in flat) worth exploring iff the -// current instruction matched. -// -// Implements a coloring algorithm related to ByteMapBuilder, but in this case, -// colors are instructions and recoloring ranges precisely identifies conflicts -// between instructions. Iterating backwards over [begin, end) is guaranteed to -// identify the nearest conflict (if any) with only linear complexity. -void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) { - Bitmap256 splits; - int colors[256]; - - bool dirty = false; - for (int id = end; id >= begin; --id) { - if (id == end || - (*flat)[id].opcode() != kInstByteRange) { - if (dirty) { - dirty = false; - splits.Clear(); - } - splits.Set(255); - colors[255] = id; - // At this point, the [0-255] range is colored with id. - // Thus, hints cannot point beyond id; and if id == end, - // hints that would have pointed to id will be 0 instead. - continue; - } - dirty = true; - - // We recolor the [lo-hi] range with id. Note that first ratchets backwards - // from end to the nearest conflict (if any) during recoloring. - int first = end; - auto Recolor = [&](int lo, int hi) { - // Like ByteMapBuilder, we split at lo-1 and at hi. 
- --lo; - - if (0 <= lo && !splits.Test(lo)) { - splits.Set(lo); - int next = splits.FindNextSetBit(lo+1); - colors[lo] = colors[next]; - } - if (!splits.Test(hi)) { - splits.Set(hi); - int next = splits.FindNextSetBit(hi+1); - colors[hi] = colors[next]; - } - - int c = lo+1; - while (c < 256) { - int next = splits.FindNextSetBit(c); - // Ratchet backwards... - first = std::min(first, colors[next]); - // Recolor with id - because it's the new nearest conflict! - colors[next] = id; - if (next == hi) - break; - c = next+1; - } - }; - - Inst* ip = &(*flat)[id]; - int lo = ip->lo(); - int hi = ip->hi(); - Recolor(lo, hi); - if (ip->foldcase() && lo <= 'z' && hi >= 'a') { - int foldlo = lo; - int foldhi = hi; - if (foldlo < 'a') - foldlo = 'a'; - if (foldhi > 'z') - foldhi = 'z'; - if (foldlo <= foldhi) { - foldlo += 'A' - 'a'; - foldhi += 'A' - 'a'; - Recolor(foldlo, foldhi); - } - } - - if (first != end) { - uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767)); - ip->hint_foldcase_ |= hint<<1; - } - } -} - -// The final state will always be this, which frees up a register for the hot -// loop and thus avoids the spilling that can occur when building with Clang. -static const size_t kShiftDFAFinal = 9; - -// This function takes the prefix as std::string (i.e. not const std::string& -// as normal) because it's going to clobber it, so a temporary is convenient. -static uint64_t* BuildShiftDFA(std::string prefix) { - // This constant is for convenience now and also for correctness later when - // we clobber the prefix, but still need to know how long it was initially. - const size_t size = prefix.size(); - - // Construct the NFA. - // The table is indexed by input byte; each element is a bitfield of states - // reachable by the input byte. Given a bitfield of the current states, the - // bitfield of states reachable from those is - for this specific purpose - - // always ((ncurr << 1) | 1). 
Intersecting the reachability bitfields gives - // the bitfield of the next states reached by stepping over the input byte. - // Credits for this technique: the Hyperscan paper by Geoff Langdale et al. - uint16_t nfa[256]{}; - for (size_t i = 0; i < size; ++i) { - uint8_t b = prefix[i]; - nfa[b] |= 1 << (i+1); - } - // This is the `\C*?` for unanchored search. - for (int b = 0; b < 256; ++b) - nfa[b] |= 1; - - // This maps from DFA state to NFA states; the reverse mapping is used when - // recording transitions and gets implemented with plain old linear search. - // The "Shift DFA" technique limits this to ten states when using uint64_t; - // to allow for the initial state, we use at most nine bytes of the prefix. - // That same limit is also why uint16_t is sufficient for the NFA bitfield. - uint16_t states[kShiftDFAFinal+1]{}; - states[0] = 1; - for (size_t dcurr = 0; dcurr < size; ++dcurr) { - uint8_t b = prefix[dcurr]; - uint16_t ncurr = states[dcurr]; - uint16_t nnext = nfa[b] & ((ncurr << 1) | 1); - size_t dnext = dcurr+1; - if (dnext == size) - dnext = kShiftDFAFinal; - states[dnext] = nnext; - } - - // Sort and unique the bytes of the prefix to avoid repeating work while we - // record transitions. This clobbers the prefix, but it's no longer needed. - std::sort(prefix.begin(), prefix.end()); - prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end()); - - // Construct the DFA. - // The table is indexed by input byte; each element is effectively a packed - // array of uint6_t; each array value will be multiplied by six in order to - // avoid having to do so later in the hot loop as well as masking/shifting. - // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen. - uint64_t* dfa = new uint64_t[256]{}; - // Record a transition from each state for each of the bytes of the prefix. - // Note that all other input bytes go back to the initial state by default. 
- for (size_t dcurr = 0; dcurr < size; ++dcurr) { - for (uint8_t b : prefix) { - uint16_t ncurr = states[dcurr]; - uint16_t nnext = nfa[b] & ((ncurr << 1) | 1); - size_t dnext = 0; - while (states[dnext] != nnext) - ++dnext; - dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6); - // Convert ASCII letters to uppercase and record the extra transitions. - // Note that ASCII letters are guaranteed to be lowercase at this point - // because that's how the parser normalises them. #FunFact: 'k' and 's' - // match U+212A and U+017F, respectively, so they won't occur here when - // using UTF-8 encoding because the parser will emit character classes. - if ('a' <= b && b <= 'z') { - b -= 'a' - 'A'; - dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6); - } - } - } - // This lets the final state "saturate", which will matter for performance: - // in the hot loop, we check for a match only at the end of each iteration, - // so we must keep signalling the match until we get around to checking it. - for (int b = 0; b < 256; ++b) - dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6); - - return dfa; -} - -void Prog::ConfigurePrefixAccel(const std::string& prefix, - bool prefix_foldcase) { - prefix_foldcase_ = prefix_foldcase; - prefix_size_ = prefix.size(); - if (prefix_foldcase_) { - // Use PrefixAccel_ShiftDFA(). - // ... and no more than nine bytes of the prefix. (See above for details.) - prefix_size_ = std::min(prefix_size_, kShiftDFAFinal); - prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_)); - } else if (prefix_size_ != 1) { - // Use PrefixAccel_FrontAndBack(). - prefix_front_ = prefix.front(); - prefix_back_ = prefix.back(); - } else { - // Use memchr(3). 
- prefix_front_ = prefix.front(); - } -} - -const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) { - if (size < prefix_size_) - return NULL; - - uint64_t curr = 0; - - // At the time of writing, rough benchmarks on a Broadwell machine showed - // that this unroll factor (i.e. eight) achieves a speedup factor of two. - if (size >= 8) { - const uint8_t* p = reinterpret_cast<const uint8_t*>(data); - const uint8_t* endp = p + (size&~7); - do { - uint8_t b0 = p[0]; - uint8_t b1 = p[1]; - uint8_t b2 = p[2]; - uint8_t b3 = p[3]; - uint8_t b4 = p[4]; - uint8_t b5 = p[5]; - uint8_t b6 = p[6]; - uint8_t b7 = p[7]; - - uint64_t next0 = prefix_dfa_[b0]; - uint64_t next1 = prefix_dfa_[b1]; - uint64_t next2 = prefix_dfa_[b2]; - uint64_t next3 = prefix_dfa_[b3]; - uint64_t next4 = prefix_dfa_[b4]; - uint64_t next5 = prefix_dfa_[b5]; - uint64_t next6 = prefix_dfa_[b6]; - uint64_t next7 = prefix_dfa_[b7]; - - uint64_t curr0 = next0 >> (curr & 63); - uint64_t curr1 = next1 >> (curr0 & 63); - uint64_t curr2 = next2 >> (curr1 & 63); - uint64_t curr3 = next3 >> (curr2 & 63); - uint64_t curr4 = next4 >> (curr3 & 63); - uint64_t curr5 = next5 >> (curr4 & 63); - uint64_t curr6 = next6 >> (curr5 & 63); - uint64_t curr7 = next7 >> (curr6 & 63); - - if ((curr7 & 63) == kShiftDFAFinal * 6) { - // At the time of writing, using the same masking subexpressions from - // the preceding lines caused Clang to clutter the hot loop computing - // them - even though they aren't actually needed for shifting! Hence - // these rewritten conditions, which achieve a speedup factor of two. 
- if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_; - if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_; - if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_; - if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_; - if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_; - if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_; - if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_; - if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_; - } - - curr = curr7; - p += 8; - } while (p != endp); - data = p; - size = size&7; - } - - const uint8_t* p = reinterpret_cast<const uint8_t*>(data); - const uint8_t* endp = p + size; - while (p != endp) { - uint8_t b = *p++; - uint64_t next = prefix_dfa_[b]; - curr = next >> (curr & 63); - if ((curr & 63) == kShiftDFAFinal * 6) - return p-prefix_size_; - } - return NULL; -} - -#if defined(__AVX2__) -// Finds the least significant non-zero bit in n. -static int FindLSBSet(uint32_t n) { - DCHECK_NE(n, 0); -#if defined(__GNUC__) - return __builtin_ctz(n); -#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) - unsigned long c; - _BitScanForward(&c, n); - return static_cast<int>(c); -#else - int c = 31; - for (int shift = 1 << 4; shift != 0; shift >>= 1) { - uint32_t word = n << shift; - if (word != 0) { - n = word; - c -= shift; - } - } - return c; -#endif -} -#endif - -const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { - DCHECK_GE(prefix_size_, 2); - if (size < prefix_size_) - return NULL; - // Don't bother searching the last prefix_size_-1 bytes for prefix_front_. - // This also means that probing for prefix_back_ doesn't go out of bounds. - size -= prefix_size_-1; - -#if defined(__AVX2__) - // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time. 
- if (size >= sizeof(__m256i)) { - const __m256i* fp = reinterpret_cast<const __m256i*>( - reinterpret_cast<const char*>(data)); - const __m256i* bp = reinterpret_cast<const __m256i*>( - reinterpret_cast<const char*>(data) + prefix_size_-1); - const __m256i* endfp = fp + size/sizeof(__m256i); - const __m256i f_set1 = _mm256_set1_epi8(prefix_front_); - const __m256i b_set1 = _mm256_set1_epi8(prefix_back_); - do { - const __m256i f_loadu = _mm256_loadu_si256(fp++); - const __m256i b_loadu = _mm256_loadu_si256(bp++); - const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu); - const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu); - const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq); - if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero. - const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq); - const int fb_movemask = _mm256_movemask_epi8(fb_and); - const int fb_ctz = FindLSBSet(fb_movemask); - return reinterpret_cast<const char*>(fp-1) + fb_ctz; - } - } while (fp != endfp); - data = fp; - size = size%sizeof(__m256i); - } -#endif - - const char* p0 = reinterpret_cast<const char*>(data); - for (const char* p = p0;; p++) { - DCHECK_GE(size, static_cast<size_t>(p-p0)); - p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0))); - if (p == NULL || p[prefix_size_-1] == prefix_back_) - return p; - } -} - +// For each ByteRange instruction in [begin, end), computes a hint to execution +// engines: the delta to the next instruction (in flat) worth exploring iff the +// current instruction matched. +// +// Implements a coloring algorithm related to ByteMapBuilder, but in this case, +// colors are instructions and recoloring ranges precisely identifies conflicts +// between instructions. Iterating backwards over [begin, end) is guaranteed to +// identify the nearest conflict (if any) with only linear complexity. 
+void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) { + Bitmap256 splits; + int colors[256]; + + bool dirty = false; + for (int id = end; id >= begin; --id) { + if (id == end || + (*flat)[id].opcode() != kInstByteRange) { + if (dirty) { + dirty = false; + splits.Clear(); + } + splits.Set(255); + colors[255] = id; + // At this point, the [0-255] range is colored with id. + // Thus, hints cannot point beyond id; and if id == end, + // hints that would have pointed to id will be 0 instead. + continue; + } + dirty = true; + + // We recolor the [lo-hi] range with id. Note that first ratchets backwards + // from end to the nearest conflict (if any) during recoloring. + int first = end; + auto Recolor = [&](int lo, int hi) { + // Like ByteMapBuilder, we split at lo-1 and at hi. + --lo; + + if (0 <= lo && !splits.Test(lo)) { + splits.Set(lo); + int next = splits.FindNextSetBit(lo+1); + colors[lo] = colors[next]; + } + if (!splits.Test(hi)) { + splits.Set(hi); + int next = splits.FindNextSetBit(hi+1); + colors[hi] = colors[next]; + } + + int c = lo+1; + while (c < 256) { + int next = splits.FindNextSetBit(c); + // Ratchet backwards... + first = std::min(first, colors[next]); + // Recolor with id - because it's the new nearest conflict! + colors[next] = id; + if (next == hi) + break; + c = next+1; + } + }; + + Inst* ip = &(*flat)[id]; + int lo = ip->lo(); + int hi = ip->hi(); + Recolor(lo, hi); + if (ip->foldcase() && lo <= 'z' && hi >= 'a') { + int foldlo = lo; + int foldhi = hi; + if (foldlo < 'a') + foldlo = 'a'; + if (foldhi > 'z') + foldhi = 'z'; + if (foldlo <= foldhi) { + foldlo += 'A' - 'a'; + foldhi += 'A' - 'a'; + Recolor(foldlo, foldhi); + } + } + + if (first != end) { + uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767)); + ip->hint_foldcase_ |= hint<<1; + } + } +} + +// The final state will always be this, which frees up a register for the hot +// loop and thus avoids the spilling that can occur when building with Clang. 
+static const size_t kShiftDFAFinal = 9; + +// This function takes the prefix as std::string (i.e. not const std::string& +// as normal) because it's going to clobber it, so a temporary is convenient. +static uint64_t* BuildShiftDFA(std::string prefix) { + // This constant is for convenience now and also for correctness later when + // we clobber the prefix, but still need to know how long it was initially. + const size_t size = prefix.size(); + + // Construct the NFA. + // The table is indexed by input byte; each element is a bitfield of states + // reachable by the input byte. Given a bitfield of the current states, the + // bitfield of states reachable from those is - for this specific purpose - + // always ((ncurr << 1) | 1). Intersecting the reachability bitfields gives + // the bitfield of the next states reached by stepping over the input byte. + // Credits for this technique: the Hyperscan paper by Geoff Langdale et al. + uint16_t nfa[256]{}; + for (size_t i = 0; i < size; ++i) { + uint8_t b = prefix[i]; + nfa[b] |= 1 << (i+1); + } + // This is the `\C*?` for unanchored search. + for (int b = 0; b < 256; ++b) + nfa[b] |= 1; + + // This maps from DFA state to NFA states; the reverse mapping is used when + // recording transitions and gets implemented with plain old linear search. + // The "Shift DFA" technique limits this to ten states when using uint64_t; + // to allow for the initial state, we use at most nine bytes of the prefix. + // That same limit is also why uint16_t is sufficient for the NFA bitfield. + uint16_t states[kShiftDFAFinal+1]{}; + states[0] = 1; + for (size_t dcurr = 0; dcurr < size; ++dcurr) { + uint8_t b = prefix[dcurr]; + uint16_t ncurr = states[dcurr]; + uint16_t nnext = nfa[b] & ((ncurr << 1) | 1); + size_t dnext = dcurr+1; + if (dnext == size) + dnext = kShiftDFAFinal; + states[dnext] = nnext; + } + + // Sort and unique the bytes of the prefix to avoid repeating work while we + // record transitions. 
This clobbers the prefix, but it's no longer needed. + std::sort(prefix.begin(), prefix.end()); + prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end()); + + // Construct the DFA. + // The table is indexed by input byte; each element is effectively a packed + // array of uint6_t; each array value will be multiplied by six in order to + // avoid having to do so later in the hot loop as well as masking/shifting. + // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen. + uint64_t* dfa = new uint64_t[256]{}; + // Record a transition from each state for each of the bytes of the prefix. + // Note that all other input bytes go back to the initial state by default. + for (size_t dcurr = 0; dcurr < size; ++dcurr) { + for (uint8_t b : prefix) { + uint16_t ncurr = states[dcurr]; + uint16_t nnext = nfa[b] & ((ncurr << 1) | 1); + size_t dnext = 0; + while (states[dnext] != nnext) + ++dnext; + dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6); + // Convert ASCII letters to uppercase and record the extra transitions. + // Note that ASCII letters are guaranteed to be lowercase at this point + // because that's how the parser normalises them. #FunFact: 'k' and 's' + // match U+212A and U+017F, respectively, so they won't occur here when + // using UTF-8 encoding because the parser will emit character classes. + if ('a' <= b && b <= 'z') { + b -= 'a' - 'A'; + dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6); + } + } + } + // This lets the final state "saturate", which will matter for performance: + // in the hot loop, we check for a match only at the end of each iteration, + // so we must keep signalling the match until we get around to checking it. 
+ for (int b = 0; b < 256; ++b) + dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6); + + return dfa; +} + +void Prog::ConfigurePrefixAccel(const std::string& prefix, + bool prefix_foldcase) { + prefix_foldcase_ = prefix_foldcase; + prefix_size_ = prefix.size(); + if (prefix_foldcase_) { + // Use PrefixAccel_ShiftDFA(). + // ... and no more than nine bytes of the prefix. (See above for details.) + prefix_size_ = std::min(prefix_size_, kShiftDFAFinal); + prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_)); + } else if (prefix_size_ != 1) { + // Use PrefixAccel_FrontAndBack(). + prefix_front_ = prefix.front(); + prefix_back_ = prefix.back(); + } else { + // Use memchr(3). + prefix_front_ = prefix.front(); + } +} + +const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) { + if (size < prefix_size_) + return NULL; + + uint64_t curr = 0; + + // At the time of writing, rough benchmarks on a Broadwell machine showed + // that this unroll factor (i.e. eight) achieves a speedup factor of two. 
+ if (size >= 8) { + const uint8_t* p = reinterpret_cast<const uint8_t*>(data); + const uint8_t* endp = p + (size&~7); + do { + uint8_t b0 = p[0]; + uint8_t b1 = p[1]; + uint8_t b2 = p[2]; + uint8_t b3 = p[3]; + uint8_t b4 = p[4]; + uint8_t b5 = p[5]; + uint8_t b6 = p[6]; + uint8_t b7 = p[7]; + + uint64_t next0 = prefix_dfa_[b0]; + uint64_t next1 = prefix_dfa_[b1]; + uint64_t next2 = prefix_dfa_[b2]; + uint64_t next3 = prefix_dfa_[b3]; + uint64_t next4 = prefix_dfa_[b4]; + uint64_t next5 = prefix_dfa_[b5]; + uint64_t next6 = prefix_dfa_[b6]; + uint64_t next7 = prefix_dfa_[b7]; + + uint64_t curr0 = next0 >> (curr & 63); + uint64_t curr1 = next1 >> (curr0 & 63); + uint64_t curr2 = next2 >> (curr1 & 63); + uint64_t curr3 = next3 >> (curr2 & 63); + uint64_t curr4 = next4 >> (curr3 & 63); + uint64_t curr5 = next5 >> (curr4 & 63); + uint64_t curr6 = next6 >> (curr5 & 63); + uint64_t curr7 = next7 >> (curr6 & 63); + + if ((curr7 & 63) == kShiftDFAFinal * 6) { + // At the time of writing, using the same masking subexpressions from + // the preceding lines caused Clang to clutter the hot loop computing + // them - even though they aren't actually needed for shifting! Hence + // these rewritten conditions, which achieve a speedup factor of two. 
+ if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_; + if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_; + if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_; + if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_; + if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_; + if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_; + if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_; + if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_; + } + + curr = curr7; + p += 8; + } while (p != endp); + data = p; + size = size&7; + } + + const uint8_t* p = reinterpret_cast<const uint8_t*>(data); + const uint8_t* endp = p + size; + while (p != endp) { + uint8_t b = *p++; + uint64_t next = prefix_dfa_[b]; + curr = next >> (curr & 63); + if ((curr & 63) == kShiftDFAFinal * 6) + return p-prefix_size_; + } + return NULL; +} + +#if defined(__AVX2__) +// Finds the least significant non-zero bit in n. +static int FindLSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return __builtin_ctz(n); +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + unsigned long c; + _BitScanForward(&c, n); + return static_cast<int>(c); +#else + int c = 31; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n << shift; + if (word != 0) { + n = word; + c -= shift; + } + } + return c; +#endif +} +#endif + +const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { + DCHECK_GE(prefix_size_, 2); + if (size < prefix_size_) + return NULL; + // Don't bother searching the last prefix_size_-1 bytes for prefix_front_. + // This also means that probing for prefix_back_ doesn't go out of bounds. + size -= prefix_size_-1; + +#if defined(__AVX2__) + // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time. 
+ if (size >= sizeof(__m256i)) { + const __m256i* fp = reinterpret_cast<const __m256i*>( + reinterpret_cast<const char*>(data)); + const __m256i* bp = reinterpret_cast<const __m256i*>( + reinterpret_cast<const char*>(data) + prefix_size_-1); + const __m256i* endfp = fp + size/sizeof(__m256i); + const __m256i f_set1 = _mm256_set1_epi8(prefix_front_); + const __m256i b_set1 = _mm256_set1_epi8(prefix_back_); + do { + const __m256i f_loadu = _mm256_loadu_si256(fp++); + const __m256i b_loadu = _mm256_loadu_si256(bp++); + const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu); + const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu); + const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq); + if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero. + const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq); + const int fb_movemask = _mm256_movemask_epi8(fb_and); + const int fb_ctz = FindLSBSet(fb_movemask); + return reinterpret_cast<const char*>(fp-1) + fb_ctz; + } + } while (fp != endfp); + data = fp; + size = size%sizeof(__m256i); + } +#endif + + const char* p0 = reinterpret_cast<const char*>(data); + for (const char* p = p0;; p++) { + DCHECK_GE(size, static_cast<size_t>(p-p0)); + p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0))); + if (p == NULL || p[prefix_size_-1] == prefix_back_) + return p; + } +} + } // namespace re2 diff --git a/contrib/libs/re2/re2/prog.h b/contrib/libs/re2/re2/prog.h index f563aaf384..4af012ab6f 100644 --- a/contrib/libs/re2/re2/prog.h +++ b/contrib/libs/re2/re2/prog.h @@ -10,18 +10,18 @@ // expression symbolically. 
#include <stdint.h> -#include <functional> +#include <functional> #include <mutex> #include <string> #include <vector> -#include <type_traits> +#include <type_traits> #include "util/util.h" #include "util/logging.h" -#include "re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/re2.h" -#include "re2/sparse_array.h" -#include "re2/sparse_set.h" +#include "re2/sparse_array.h" +#include "re2/sparse_set.h" namespace re2 { @@ -61,8 +61,8 @@ class Prog { // Single instruction in regexp program. class Inst { public: - // See the assertion below for why this is so. - Inst() = default; + // See the assertion below for why this is so. + Inst() = default; // Copyable. Inst(const Inst&) = default; @@ -78,7 +78,7 @@ class Prog { void InitFail(); // Getters - int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); } + int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); } InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } int last() { return (out_opcode_>>3)&1; } int out() { return out_opcode_>>4; } @@ -86,8 +86,8 @@ class Prog { int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } - int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; } - int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; } + int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; } + int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; } int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } @@ -101,13 +101,13 @@ class Prog { // Does this inst (an kInstByteRange) match c? 
inline bool Matches(int c) { DCHECK_EQ(opcode(), kInstByteRange); - if (foldcase() && 'A' <= c && c <= 'Z') + if (foldcase() && 'A' <= c && c <= 'Z') c += 'a' - 'A'; return lo_ <= c && c <= hi_; } // Returns string representation for debugging. - std::string Dump(); + std::string Dump(); // Maximum instruction id. // (Must fit in out_opcode_. PatchList/last steal another bit.) @@ -130,31 +130,31 @@ class Prog { out_opcode_ = (out<<4) | (last()<<3) | opcode; } - uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode - union { // additional instruction arguments: - uint32_t out1_; // opcode == kInstAlt - // alternate next instruction - - int32_t cap_; // opcode == kInstCapture - // Index of capture register (holds text - // position recorded by capturing parentheses). - // For \n (the submatch for the nth parentheses), - // the left parenthesis captures into register 2*n - // and the right one captures into register 2*n+1. - - int32_t match_id_; // opcode == kInstMatch - // Match ID to identify this match (for re2::Set). - - struct { // opcode == kInstByteRange - uint8_t lo_; // byte range is lo_-hi_ inclusive - uint8_t hi_; // - uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase - // hint to execution engines: the delta to the - // next instruction (in the current list) worth - // exploring iff this instruction matched; 0 - // means there are no remaining possibilities, - // which is most likely for character classes. - // foldcase: A-Z -> a-z before checking range. + uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode + union { // additional instruction arguments: + uint32_t out1_; // opcode == kInstAlt + // alternate next instruction + + int32_t cap_; // opcode == kInstCapture + // Index of capture register (holds text + // position recorded by capturing parentheses). 
+ // For \n (the submatch for the nth parentheses), + // the left parenthesis captures into register 2*n + // and the right one captures into register 2*n+1. + + int32_t match_id_; // opcode == kInstMatch + // Match ID to identify this match (for re2::Set). + + struct { // opcode == kInstByteRange + uint8_t lo_; // byte range is lo_-hi_ inclusive + uint8_t hi_; // + uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase + // hint to execution engines: the delta to the + // next instruction (in the current list) worth + // exploring iff this instruction matched; 0 + // means there are no remaining possibilities, + // which is most likely for character classes. + // foldcase: A-Z -> a-z before checking range. }; EmptyOp empty_; // opcode == kInstEmptyWidth @@ -166,11 +166,11 @@ class Prog { friend class Prog; }; - // Inst must be trivial so that we can freely clear it with memset(3). - // Arrays of Inst are initialised by copying the initial elements with - // memmove(3) and then clearing any remaining elements with memset(3). - static_assert(std::is_trivial<Inst>::value, "Inst must be trivial"); - + // Inst must be trivial so that we can freely clear it with memset(3). + // Arrays of Inst are initialised by copying the initial elements with + // memmove(3) and then clearing any remaining elements with memset(3). + static_assert(std::is_trivial<Inst>::value, "Inst must be trivial"); + // Whether to anchor the search. 
enum Anchor { kUnanchored, // match anywhere @@ -198,7 +198,7 @@ class Prog { Inst *inst(int id) { return &inst_[id]; } int start() { return start_; } - void set_start(int start) { start_ = start; } + void set_start(int start) { start_ = start; } int start_unanchored() { return start_unanchored_; } void set_start_unanchored(int start) { start_unanchored_ = start; } int size() { return size_; } @@ -206,9 +206,9 @@ class Prog { void set_reversed(bool reversed) { reversed_ = reversed; } int list_count() { return list_count_; } int inst_count(InstOp op) { return inst_count_[op]; } - uint16_t* list_heads() { return list_heads_.data(); } + uint16_t* list_heads() { return list_heads_.data(); } size_t bit_state_text_max_size() { return bit_state_text_max_size_; } - int64_t dfa_mem() { return dfa_mem_; } + int64_t dfa_mem() { return dfa_mem_; } void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } bool anchor_start() { return anchor_start_; } void set_anchor_start(bool b) { anchor_start_ = b; } @@ -216,36 +216,36 @@ class Prog { void set_anchor_end(bool b) { anchor_end_ = b; } int bytemap_range() { return bytemap_range_; } const uint8_t* bytemap() { return bytemap_; } - bool can_prefix_accel() { return prefix_size_ != 0; } - - // Accelerates to the first likely occurrence of the prefix. - // Returns a pointer to the first byte or NULL if not found. - const void* PrefixAccel(const void* data, size_t size) { - DCHECK(can_prefix_accel()); - if (prefix_foldcase_) { - return PrefixAccel_ShiftDFA(data, size); - } else if (prefix_size_ != 1) { - return PrefixAccel_FrontAndBack(data, size); - } else { - return memchr(data, prefix_front_, size); - } - } - - // Configures prefix accel using the analysis performed during compilation. - void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase); - - // An implementation of prefix accel that uses prefix_dfa_ to perform - // case-insensitive search. 
- const void* PrefixAccel_ShiftDFA(const void* data, size_t size); - - // An implementation of prefix accel that looks for prefix_front_ and - // prefix_back_ to return fewer false positives than memchr(3) alone. - const void* PrefixAccel_FrontAndBack(const void* data, size_t size); - + bool can_prefix_accel() { return prefix_size_ != 0; } + + // Accelerates to the first likely occurrence of the prefix. + // Returns a pointer to the first byte or NULL if not found. + const void* PrefixAccel(const void* data, size_t size) { + DCHECK(can_prefix_accel()); + if (prefix_foldcase_) { + return PrefixAccel_ShiftDFA(data, size); + } else if (prefix_size_ != 1) { + return PrefixAccel_FrontAndBack(data, size); + } else { + return memchr(data, prefix_front_, size); + } + } + + // Configures prefix accel using the analysis performed during compilation. + void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase); + + // An implementation of prefix accel that uses prefix_dfa_ to perform + // case-insensitive search. + const void* PrefixAccel_ShiftDFA(const void* data, size_t size); + + // An implementation of prefix accel that looks for prefix_front_ and + // prefix_back_ to return fewer false positives than memchr(3) alone. + const void* PrefixAccel_FrontAndBack(const void* data, size_t size); + // Returns string representation of program for debugging. - std::string Dump(); - std::string DumpUnanchored(); - std::string DumpByteMap(); + std::string Dump(); + std::string DumpUnanchored(); + std::string DumpByteMap(); // Returns the set of kEmpty flags that are in effect at // position p within context. @@ -292,24 +292,24 @@ class Prog { // SearchDFA fills matches with the match IDs of the final matching state. bool SearchDFA(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, StringPiece* match0, - bool* failed, SparseSet* matches); - - // The callback issued after building each DFA state with BuildEntireDFA(). 
- // If next is null, then the memory budget has been exhausted and building - // will halt. Otherwise, the state has been built and next points to an array - // of bytemap_range()+1 slots holding the next states as per the bytemap and - // kByteEndText. The number of the state is implied by the callback sequence: - // the first callback is for state 0, the second callback is for state 1, ... - // match indicates whether the state is a matching state. - using DFAStateCallback = std::function<void(const int* next, bool match)>; - - // Build the entire DFA for the given match kind. + bool* failed, SparseSet* matches); + + // The callback issued after building each DFA state with BuildEntireDFA(). + // If next is null, then the memory budget has been exhausted and building + // will halt. Otherwise, the state has been built and next points to an array + // of bytemap_range()+1 slots holding the next states as per the bytemap and + // kByteEndText. The number of the state is implied by the callback sequence: + // the first callback is for state 0, the second callback is for state 1, ... + // match indicates whether the state is a matching state. + using DFAStateCallback = std::function<void(const int* next, bool match)>; + + // Build the entire DFA for the given match kind. // Usually the DFA is built out incrementally, as needed, which - // avoids lots of unnecessary work. - // If cb is not empty, it receives one callback per state built. - // Returns the number of states built. - // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. - int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); + // avoids lots of unnecessary work. + // If cb is not empty, it receives one callback per state built. + // Returns the number of states built. + // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. + int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); // Compute bytemap. 
void ComputeByteMap(); @@ -326,8 +326,8 @@ class Prog { StringPiece* match, int nmatch); // Bit-state backtracking. Fast on small cases but uses memory - // proportional to the product of the list count and the text size. - bool CanBitState() { return list_heads_.data() != NULL; } + // proportional to the product of the list count and the text size. + bool CanBitState() { return list_heads_.data() != NULL; } bool SearchBitState(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, StringPiece* match, int nmatch); @@ -359,15 +359,15 @@ class Prog { // do not compile down to infinite repetitions. // // Returns true on success, false on error. - bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); + bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); // EXPERIMENTAL! SUBJECT TO CHANGE! // Outputs the program fanout into the given sparse array. void Fanout(SparseArray<int>* fanout); // Compiles a collection of regexps to Prog. Each regexp will have - // its own Match instruction recording the index in the output vector. - static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); + // its own Match instruction recording the index in the output vector. + static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); // Flattens the Prog from "tree" form to "list" form. This is an in-place // operation in the sense that the old instructions are lost. @@ -396,13 +396,13 @@ class Prog { std::vector<Inst>* flat, SparseSet* reachable, std::vector<int>* stk); - // Computes hints for ByteRange instructions in [begin, end). - void ComputeHints(std::vector<Inst>* flat, int begin, int end); - - // Controls whether the DFA should bail out early if the NFA would be faster. - // FOR TESTING ONLY. - static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b); - + // Computes hints for ByteRange instructions in [begin, end). 
+ void ComputeHints(std::vector<Inst>* flat, int begin, int end); + + // Controls whether the DFA should bail out early if the NFA would be faster. + // FOR TESTING ONLY. + static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b); + private: friend class Compiler; @@ -419,16 +419,16 @@ class Prog { int start_unanchored_; // unanchored entry point for program int size_; // number of instructions int bytemap_range_; // bytemap_[x] < bytemap_range_ - - bool prefix_foldcase_; // whether prefix is case-insensitive - size_t prefix_size_; // size of prefix (0 if no prefix) - union { - uint64_t* prefix_dfa_; // "Shift DFA" for prefix - struct { - int prefix_front_; // first byte of prefix - int prefix_back_; // last byte of prefix - }; - }; + + bool prefix_foldcase_; // whether prefix is case-insensitive + size_t prefix_size_; // size of prefix (0 if no prefix) + union { + uint64_t* prefix_dfa_; // "Shift DFA" for prefix + struct { + int prefix_front_; // first byte of prefix + int prefix_back_; // last byte of prefix + }; + }; int list_count_; // count of lists (see above) int inst_count_[kNumInst]; // count of instructions by opcode @@ -436,8 +436,8 @@ class Prog { // not populated if size_ is overly large size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size() - PODArray<Inst> inst_; // pointer to instruction array - PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes + PODArray<Inst> inst_; // pointer to instruction array + PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes int64_t dfa_mem_; // Maximum memory for DFAs. 
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch diff --git a/contrib/libs/re2/re2/re2.cc b/contrib/libs/re2/re2/re2.cc index 96680da33c..47fb385e4e 100644 --- a/contrib/libs/re2/re2/re2.cc +++ b/contrib/libs/re2/re2/re2.cc @@ -12,14 +12,14 @@ #include <assert.h> #include <ctype.h> #include <errno.h> -#ifdef _MSC_VER -#include <intrin.h> -#endif +#ifdef _MSC_VER +#include <intrin.h> +#endif #include <stdint.h> #include <stdlib.h> #include <string.h> #include <algorithm> -#include <atomic> +#include <atomic> #include <iterator> #include <mutex> #include <string> @@ -32,7 +32,7 @@ #include "util/utf.h" #include "re2/prog.h" #include "re2/regexp.h" -#include "re2/sparse_array.h" +#include "re2/sparse_array.h" namespace re2 { @@ -60,9 +60,9 @@ RE2::Options::Options(RE2::CannedOptions opt) // static empty objects for use as const references. // To avoid global constructors, allocated in RE2::Init(). -static const std::string* empty_string; -static const std::map<std::string, int>* empty_named_groups; -static const std::map<int, std::string>* empty_group_names; +static const std::string* empty_string; +static const std::map<std::string, int>* empty_named_groups; +static const std::map<int, std::string>* empty_group_names; // Converts from Regexp error code to RE2 error code. // Maybe some day they will diverge. 
In any event, this @@ -83,8 +83,8 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { return RE2::ErrorMissingBracket; case re2::kRegexpMissingParen: return RE2::ErrorMissingParen; - case re2::kRegexpUnexpectedParen: - return RE2::ErrorUnexpectedParen; + case re2::kRegexpUnexpectedParen: + return RE2::ErrorUnexpectedParen; case re2::kRegexpTrailingBackslash: return RE2::ErrorTrailingBackslash; case re2::kRegexpRepeatArgument: @@ -103,10 +103,10 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { return RE2::ErrorInternal; } -static std::string trunc(const StringPiece& pattern) { +static std::string trunc(const StringPiece& pattern) { if (pattern.size() < 100) - return std::string(pattern); - return std::string(pattern.substr(0, 100)) + "..."; + return std::string(pattern); + return std::string(pattern.substr(0, 100)) + "..."; } @@ -114,7 +114,7 @@ RE2::RE2(const char* pattern) { Init(pattern, DefaultOptions); } -RE2::RE2(const std::string& pattern) { +RE2::RE2(const std::string& pattern) { Init(pattern, DefaultOptions); } @@ -173,24 +173,24 @@ int RE2::Options::ParseFlags() const { void RE2::Init(const StringPiece& pattern, const Options& options) { static std::once_flag empty_once; std::call_once(empty_once, []() { - empty_string = new std::string; - empty_named_groups = new std::map<std::string, int>; - empty_group_names = new std::map<int, std::string>; + empty_string = new std::string; + empty_named_groups = new std::map<std::string, int>; + empty_group_names = new std::map<int, std::string>; }); - pattern_.assign(pattern.data(), pattern.size()); + pattern_.assign(pattern.data(), pattern.size()); options_.Copy(options); entire_regexp_ = NULL; - error_ = empty_string; - error_code_ = NoError; - error_arg_.clear(); - prefix_.clear(); - prefix_foldcase_ = false; + error_ = empty_string; + error_code_ = NoError; + error_arg_.clear(); + prefix_.clear(); + prefix_foldcase_ = false; suffix_regexp_ = NULL; prog_ = NULL; - 
num_captures_ = -1; - is_one_pass_ = false; - + num_captures_ = -1; + is_one_pass_ = false; + rprog_ = NULL; named_groups_ = NULL; group_names_ = NULL; @@ -205,9 +205,9 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " << status.Text(); } - error_ = new std::string(status.Text()); + error_ = new std::string(status.Text()); error_code_ = RegexpErrorToRE2(status.code()); - error_arg_ = std::string(status.error_arg()); + error_arg_ = std::string(status.error_arg()); return; } @@ -224,16 +224,16 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { if (prog_ == NULL) { if (options_.log_errors()) LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; - error_ = new std::string("pattern too large - compile failed"); + error_ = new std::string("pattern too large - compile failed"); error_code_ = RE2::ErrorPatternTooLarge; return; } - // We used to compute this lazily, but it's used during the - // typical control flow for a match call, so we now compute - // it eagerly, which avoids the overhead of std::once_flag. - num_captures_ = suffix_regexp_->NumCaptures(); - + // We used to compute this lazily, but it's used during the + // typical control flow for a match call, so we now compute + // it eagerly, which avoids the overhead of std::once_flag. + num_captures_ = suffix_regexp_->NumCaptures(); + // Could delay this until the first match call that // cares about submatch information, but the one-pass // machine's memory gets cut from the DFA memory budget, @@ -250,11 +250,11 @@ re2::Prog* RE2::ReverseProg() const { if (re->rprog_ == NULL) { if (re->options_.log_errors()) LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'"; - // We no longer touch error_ and error_code_ because failing to compile - // the reverse Prog is not a showstopper: falling back to NFA execution - // is fine. 
More importantly, an RE2 object is supposed to be logically - // immutable: whatever ok() would have returned after Init() completed, - // it should continue to return that no matter what ReverseProg() does. + // We no longer touch error_ and error_code_ because failing to compile + // the reverse Prog is not a showstopper: falling back to NFA execution + // is fine. More importantly, an RE2 object is supposed to be logically + // immutable: whatever ok() would have returned after Init() completed, + // it should continue to return that no matter what ReverseProg() does. } }, this); return rprog_; @@ -281,73 +281,73 @@ int RE2::ProgramSize() const { return prog_->size(); } -int RE2::ReverseProgramSize() const { +int RE2::ReverseProgramSize() const { if (prog_ == NULL) return -1; - Prog* prog = ReverseProg(); - if (prog == NULL) - return -1; - return prog->size(); -} - -// Finds the most significant non-zero bit in n. -static int FindMSBSet(uint32_t n) { - DCHECK_NE(n, 0); -#if defined(__GNUC__) - return 31 ^ __builtin_clz(n); -#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) - unsigned long c; - _BitScanReverse(&c, n); - return static_cast<int>(c); -#else - int c = 0; - for (int shift = 1 << 4; shift != 0; shift >>= 1) { - uint32_t word = n >> shift; - if (word != 0) { - n = word; - c += shift; + Prog* prog = ReverseProg(); + if (prog == NULL) + return -1; + return prog->size(); +} + +// Finds the most significant non-zero bit in n. 
+static int FindMSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return 31 ^ __builtin_clz(n); +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + unsigned long c; + _BitScanReverse(&c, n); + return static_cast<int>(c); +#else + int c = 0; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n >> shift; + if (word != 0) { + n = word; + c += shift; } } - return c; -#endif + return c; +#endif +} + +static int Fanout(Prog* prog, std::vector<int>* histogram) { + SparseArray<int> fanout(prog->size()); + prog->Fanout(&fanout); + int data[32] = {}; + int size = 0; + for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { + if (i->value() == 0) + continue; + uint32_t value = i->value(); + int bucket = FindMSBSet(value); + bucket += value & (value-1) ? 1 : 0; + ++data[bucket]; + size = std::max(size, bucket+1); + } + if (histogram != NULL) + histogram->assign(data, data+size); + return size-1; } -static int Fanout(Prog* prog, std::vector<int>* histogram) { - SparseArray<int> fanout(prog->size()); - prog->Fanout(&fanout); - int data[32] = {}; - int size = 0; - for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { - if (i->value() == 0) - continue; - uint32_t value = i->value(); - int bucket = FindMSBSet(value); - bucket += value & (value-1) ? 
1 : 0; - ++data[bucket]; - size = std::max(size, bucket+1); - } - if (histogram != NULL) - histogram->assign(data, data+size); - return size-1; +int RE2::ProgramFanout(std::vector<int>* histogram) const { + if (prog_ == NULL) + return -1; + return Fanout(prog_, histogram); +} + +int RE2::ReverseProgramFanout(std::vector<int>* histogram) const { + if (prog_ == NULL) + return -1; + Prog* prog = ReverseProg(); + if (prog == NULL) + return -1; + return Fanout(prog, histogram); } -int RE2::ProgramFanout(std::vector<int>* histogram) const { - if (prog_ == NULL) - return -1; - return Fanout(prog_, histogram); -} - -int RE2::ReverseProgramFanout(std::vector<int>* histogram) const { - if (prog_ == NULL) - return -1; - Prog* prog = ReverseProg(); - if (prog == NULL) - return -1; - return Fanout(prog, histogram); -} - // Returns named_groups_, computing it if needed. -const std::map<std::string, int>& RE2::NamedCapturingGroups() const { +const std::map<std::string, int>& RE2::NamedCapturingGroups() const { std::call_once(named_groups_once_, [](const RE2* re) { if (re->suffix_regexp_ != NULL) re->named_groups_ = re->suffix_regexp_->NamedCaptures(); @@ -358,7 +358,7 @@ const std::map<std::string, int>& RE2::NamedCapturingGroups() const { } // Returns group_names_, computing it if needed. 
-const std::map<int, std::string>& RE2::CapturingGroupNames() const { +const std::map<int, std::string>& RE2::CapturingGroupNames() const { std::call_once(group_names_once_, [](const RE2* re) { if (re->suffix_regexp_ != NULL) re->group_names_ = re->suffix_regexp_->CaptureNames(); @@ -402,42 +402,42 @@ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, } } -bool RE2::Replace(std::string* str, - const RE2& re, - const StringPiece& rewrite) { +bool RE2::Replace(std::string* str, + const RE2& re, + const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); - if (nvec > 1 + re.NumberOfCapturingGroups()) + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; + if (nvec > static_cast<int>(arraysize(vec))) return false; - if (nvec > static_cast<int>(arraysize(vec))) - return false; if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) return false; - std::string s; + std::string s; if (!re.Rewrite(&s, rewrite, vec, nvec)) return false; - assert(vec[0].data() >= str->data()); - assert(vec[0].data() + vec[0].size() <= str->data() + str->size()); + assert(vec[0].data() >= str->data()); + assert(vec[0].data() + vec[0].size() <= str->data() + str->size()); str->replace(vec[0].data() - str->data(), vec[0].size(), s); return true; } -int RE2::GlobalReplace(std::string* str, - const RE2& re, - const StringPiece& rewrite) { +int RE2::GlobalReplace(std::string* str, + const RE2& re, + const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); - if (nvec > 1 + re.NumberOfCapturingGroups()) + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; + if (nvec > static_cast<int>(arraysize(vec))) return false; - if (nvec > static_cast<int>(arraysize(vec))) - return false; const char* p = str->data(); const char* ep = p + str->size(); const char* lastend = NULL; - std::string out; + std::string out; int count = 0; #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION // Iterate just once when 
fuzzing. Otherwise, we easily get bogged down @@ -449,15 +449,15 @@ int RE2::GlobalReplace(std::string* str, if (!re.Match(*str, static_cast<size_t>(p - str->data()), str->size(), UNANCHORED, vec, nvec)) break; - if (p < vec[0].data()) - out.append(p, vec[0].data() - p); - if (vec[0].data() == lastend && vec[0].empty()) { + if (p < vec[0].data()) + out.append(p, vec[0].data() - p); + if (vec[0].data() == lastend && vec[0].empty()) { // Disallow empty match at end of last match: skip ahead. // - // fullrune() takes int, not ptrdiff_t. However, it just looks + // fullrune() takes int, not ptrdiff_t. However, it just looks // at the leading byte and treats any length >= 4 the same. if (re.options().encoding() == RE2::Options::EncodingUTF8 && - fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) { + fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) { // re is in UTF-8 mode and there is enough left of str // to allow us to advance by up to UTFmax bytes. Rune r; @@ -482,7 +482,7 @@ int RE2::GlobalReplace(std::string* str, continue; } re.Rewrite(&out, rewrite, vec, nvec); - p = vec[0].data() + vec[0].size(); + p = vec[0].data() + vec[0].size(); lastend = p; count++; } @@ -497,16 +497,16 @@ int RE2::GlobalReplace(std::string* str, return count; } -bool RE2::Extract(const StringPiece& text, - const RE2& re, - const StringPiece& rewrite, - std::string* out) { +bool RE2::Extract(const StringPiece& text, + const RE2& re, + const StringPiece& rewrite, + std::string* out) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); - if (nvec > 1 + re.NumberOfCapturingGroups()) + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; + if (nvec > static_cast<int>(arraysize(vec))) return false; - if (nvec > static_cast<int>(arraysize(vec))) - return false; if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) return false; @@ -514,8 +514,8 @@ bool RE2::Extract(const StringPiece& text, return re.Rewrite(out, rewrite, vec, nvec); } 
-std::string RE2::QuoteMeta(const StringPiece& unquoted) { - std::string result; +std::string RE2::QuoteMeta(const StringPiece& unquoted) { + std::string result; result.reserve(unquoted.size() << 1); // Escape any ascii character not in [A-Za-z_0-9]. @@ -552,8 +552,8 @@ std::string RE2::QuoteMeta(const StringPiece& unquoted) { return result; } -bool RE2::PossibleMatchRange(std::string* min, std::string* max, - int maxlen) const { +bool RE2::PossibleMatchRange(std::string* min, std::string* max, + int maxlen) const { if (prog_ == NULL) return false; @@ -562,28 +562,28 @@ bool RE2::PossibleMatchRange(std::string* min, std::string* max, n = maxlen; // Determine initial min max from prefix_ literal. - *min = prefix_.substr(0, n); - *max = prefix_.substr(0, n); + *min = prefix_.substr(0, n); + *max = prefix_.substr(0, n); if (prefix_foldcase_) { - // prefix is ASCII lowercase; change *min to uppercase. + // prefix is ASCII lowercase; change *min to uppercase. for (int i = 0; i < n; i++) { - char& c = (*min)[i]; - if ('a' <= c && c <= 'z') - c += 'A' - 'a'; + char& c = (*min)[i]; + if ('a' <= c && c <= 'z') + c += 'A' - 'a'; } } // Add to prefix min max using PossibleMatchRange on regexp. - std::string dmin, dmax; + std::string dmin, dmax; maxlen -= n; if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { - min->append(dmin); - max->append(dmax); - } else if (!max->empty()) { + min->append(dmin); + max->append(dmax); + } else if (!max->empty()) { // prog_->PossibleMatchRange has failed us, // but we still have useful information from prefix_. - // Round up *max to allow any possible suffix. - PrefixSuccessor(max); + // Round up *max to allow any possible suffix. + PrefixSuccessor(max); } else { // Nothing useful. *min = ""; @@ -597,7 +597,7 @@ bool RE2::PossibleMatchRange(std::string* min, std::string* max, // Avoid possible locale nonsense in standard strcasecmp. // The string a is known to be all lowercase. 
static int ascii_strcasecmp(const char* a, const char* b, size_t len) { - const char* ae = a + len; + const char* ae = a + len; for (; a < ae; a++, b++) { uint8_t x = *a; @@ -619,7 +619,7 @@ bool RE2::Match(const StringPiece& text, Anchor re_anchor, StringPiece* submatch, int nsubmatch) const { - if (!ok()) { + if (!ok()) { if (options_.log_errors()) LOG(ERROR) << "Invalid RE2: " << *error_; return false; @@ -654,8 +654,8 @@ bool RE2::Match(const StringPiece& text, // If the regexp is anchored explicitly, must not be in middle of text. if (prog_->anchor_start() && startpos != 0) return false; - if (prog_->anchor_end() && endpos != text.size()) - return false; + if (prog_->anchor_end() && endpos != text.size()) + return false; // If the regexp is anchored explicitly, update re_anchor // so that we can potentially fall into a faster case below. @@ -691,85 +691,85 @@ bool RE2::Match(const StringPiece& text, kind = Prog::kLongestMatch; bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; - bool can_bit_state = prog_->CanBitState(); + bool can_bit_state = prog_->CanBitState(); size_t bit_state_text_max_size = prog_->bit_state_text_max_size(); -#ifdef RE2_HAVE_THREAD_LOCAL - hooks::context = this; -#endif +#ifdef RE2_HAVE_THREAD_LOCAL + hooks::context = this; +#endif bool dfa_failed = false; - bool skipped_test = false; + bool skipped_test = false; switch (re_anchor) { default: - LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor; - return false; - + LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor; + return false; + case UNANCHORED: { - if (prog_->anchor_end()) { - // This is a very special case: we don't need the forward DFA because - // we already know where the match must end! Instead, the reverse DFA - // can say whether there is a match and (optionally) where it starts. - Prog* prog = ReverseProg(); - if (prog == NULL) { - // Fall back to NFA below. 
- skipped_test = true; - break; - } - if (!prog->SearchDFA(subtext, text, Prog::kAnchored, - Prog::kLongestMatch, matchp, &dfa_failed, NULL)) { - if (dfa_failed) { - if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " - << "program size " << prog->size() << ", " - << "list count " << prog->list_count() << ", " - << "bytemap range " << prog->bytemap_range(); - // Fall back to NFA below. - skipped_test = true; - break; - } - return false; - } - if (matchp == NULL) // Matched. Don't care where. - return true; - break; - } - + if (prog_->anchor_end()) { + // This is a very special case: we don't need the forward DFA because + // we already know where the match must end! Instead, the reverse DFA + // can say whether there is a match and (optionally) where it starts. + Prog* prog = ReverseProg(); + if (prog == NULL) { + // Fall back to NFA below. + skipped_test = true; + break; + } + if (!prog->SearchDFA(subtext, text, Prog::kAnchored, + Prog::kLongestMatch, matchp, &dfa_failed, NULL)) { + if (dfa_failed) { + if (options_.log_errors()) + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog->size() << ", " + << "list count " << prog->list_count() << ", " + << "bytemap range " << prog->bytemap_range(); + // Fall back to NFA below. + skipped_test = true; + break; + } + return false; + } + if (matchp == NULL) // Matched. Don't care where. 
+ return true; + break; + } + if (!prog_->SearchDFA(subtext, text, anchor, kind, matchp, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " - << "program size " << prog_->size() << ", " - << "list count " << prog_->list_count() << ", " - << "bytemap range " << prog_->bytemap_range(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); // Fall back to NFA below. skipped_test = true; break; } return false; } - if (matchp == NULL) // Matched. Don't care where. + if (matchp == NULL) // Matched. Don't care where. return true; - // SearchDFA set match.end() but didn't know where the - // match started. Run the regexp backward from match.end() + // SearchDFA set match.end() but didn't know where the + // match started. Run the regexp backward from match.end() // to find the longest possible match -- that's where it started. Prog* prog = ReverseProg(); - if (prog == NULL) { - // Fall back to NFA below. - skipped_test = true; - break; - } + if (prog == NULL) { + // Fall back to NFA below. + skipped_test = true; + break; + } if (!prog->SearchDFA(match, text, Prog::kAnchored, Prog::kLongestMatch, &match, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " - << "program size " << prog->size() << ", " - << "list count " << prog->list_count() << ", " - << "bytemap range " << prog->bytemap_range(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog->size() << ", " + << "list count " << prog->list_count() << ", " + << "bytemap range " << prog->bytemap_range(); // Fall back to NFA below. 
skipped_test = true; break; @@ -808,11 +808,11 @@ bool RE2::Match(const StringPiece& text, &match, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " - << "program size " << prog_->size() << ", " - << "list count " << prog_->list_count() << ", " - << "bytemap range " << prog_->bytemap_range(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); // Fall back to NFA below. skipped_test = true; break; @@ -876,7 +876,7 @@ bool RE2::Match(const StringPiece& text, // Internal matcher - like Match() but takes Args not StringPieces. bool RE2::DoMatch(const StringPiece& text, - Anchor re_anchor, + Anchor re_anchor, size_t* consumed, const Arg* const* args, int n) const { @@ -886,11 +886,11 @@ bool RE2::DoMatch(const StringPiece& text, return false; } - if (NumberOfCapturingGroups() < n) { - // RE has fewer capturing groups than number of Arg pointers passed in. - return false; - } - + if (NumberOfCapturingGroups() < n) { + // RE has fewer capturing groups than number of Arg pointers passed in. + return false; + } + // Count number of capture groups needed. int nvec; if (n == 0 && consumed == NULL) @@ -902,14 +902,14 @@ bool RE2::DoMatch(const StringPiece& text, StringPiece stkvec[kVecSize]; StringPiece* heapvec = NULL; - if (nvec <= static_cast<int>(arraysize(stkvec))) { + if (nvec <= static_cast<int>(arraysize(stkvec))) { vec = stkvec; } else { vec = new StringPiece[nvec]; heapvec = vec; } - if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { + if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { delete[] heapvec; return false; } @@ -939,8 +939,8 @@ bool RE2::DoMatch(const StringPiece& text, // Checks that the rewrite string is well-formed with respect to this // regular expression. 
-bool RE2::CheckRewriteString(const StringPiece& rewrite, - std::string* error) const { +bool RE2::CheckRewriteString(const StringPiece& rewrite, + std::string* error) const { int max_token = -1; for (const char *s = rewrite.data(), *end = s + rewrite.size(); s < end; s++) { @@ -968,125 +968,125 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite, } if (max_token > NumberOfCapturingGroups()) { - *error = StringPrintf( - "Rewrite schema requests %d matches, but the regexp only has %d " - "parenthesized subexpressions.", - max_token, NumberOfCapturingGroups()); + *error = StringPrintf( + "Rewrite schema requests %d matches, but the regexp only has %d " + "parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); return false; } return true; } -// Returns the maximum submatch needed for the rewrite to be done by Replace(). -// E.g. if rewrite == "foo \\2,\\1", returns 2. -int RE2::MaxSubmatch(const StringPiece& rewrite) { - int max = 0; - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - if (*s == '\\') { - s++; - int c = (s < end) ? *s : -1; - if (isdigit(c)) { - int n = (c - '0'); - if (n > max) - max = n; - } - } - } - return max; -} - -// Append the "rewrite" string, with backslash subsitutions from "vec", -// to string "out". -bool RE2::Rewrite(std::string* out, - const StringPiece& rewrite, - const StringPiece* vec, - int veclen) const { - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - if (*s != '\\') { - out->push_back(*s); - continue; - } - s++; - int c = (s < end) ? 
*s : -1; - if (isdigit(c)) { - int n = (c - '0'); - if (n >= veclen) { - if (options_.log_errors()) { - LOG(ERROR) << "invalid substitution \\" << n - << " from " << veclen << " groups"; - } - return false; - } - StringPiece snip = vec[n]; - if (!snip.empty()) - out->append(snip.data(), snip.size()); - } else if (c == '\\') { - out->push_back('\\'); - } else { - if (options_.log_errors()) - LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); - return false; - } - } - return true; -} - +// Returns the maximum submatch needed for the rewrite to be done by Replace(). +// E.g. if rewrite == "foo \\2,\\1", returns 2. +int RE2::MaxSubmatch(const StringPiece& rewrite) { + int max = 0; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + if (*s == '\\') { + s++; + int c = (s < end) ? *s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n > max) + max = n; + } + } + } + return max; +} + +// Append the "rewrite" string, with backslash subsitutions from "vec", +// to string "out". +bool RE2::Rewrite(std::string* out, + const StringPiece& rewrite, + const StringPiece* vec, + int veclen) const { + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + if (*s != '\\') { + out->push_back(*s); + continue; + } + s++; + int c = (s < end) ? 
*s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (options_.log_errors()) { + LOG(ERROR) << "invalid substitution \\" << n + << " from " << veclen << " groups"; + } + return false; + } + StringPiece snip = vec[n]; + if (!snip.empty()) + out->append(snip.data(), snip.size()); + } else if (c == '\\') { + out->push_back('\\'); + } else { + if (options_.log_errors()) + LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } + return true; +} + /***** Parsers for various types *****/ -namespace re2_internal { - -template <> -bool Parse(const char* str, size_t n, void* dest) { +namespace re2_internal { + +template <> +bool Parse(const char* str, size_t n, void* dest) { // We fail if somebody asked us to store into a non-NULL void* pointer return (dest == NULL); } -template <> -bool Parse(const char* str, size_t n, std::string* dest) { +template <> +bool Parse(const char* str, size_t n, std::string* dest) { if (dest == NULL) return true; - dest->assign(str, n); + dest->assign(str, n); return true; } #if defined(ARCADIA_ROOT) -template <> -bool Parse(const char* str, size_t n, TString* dest) { +template <> +bool Parse(const char* str, size_t n, TString* dest) { if (dest == NULL) return true; - dest->assign(str, n); + dest->assign(str, n); return true; } #endif -template <> -bool Parse(const char* str, size_t n, StringPiece* dest) { +template <> +bool Parse(const char* str, size_t n, StringPiece* dest) { if (dest == NULL) return true; - *dest = StringPiece(str, n); + *dest = StringPiece(str, n); return true; } -template <> -bool Parse(const char* str, size_t n, char* dest) { +template <> +bool Parse(const char* str, size_t n, char* dest) { if (n != 1) return false; if (dest == NULL) return true; - *dest = str[0]; + *dest = str[0]; return true; } -template <> -bool Parse(const char* str, size_t n, signed char* dest) { +template <> +bool Parse(const char* str, size_t n, signed char* dest) { if (n != 1) return false; if 
(dest == NULL) return true; - *dest = str[0]; + *dest = str[0]; return true; } -template <> -bool Parse(const char* str, size_t n, unsigned char* dest) { +template <> +bool Parse(const char* str, size_t n, unsigned char* dest) { if (n != 1) return false; if (dest == NULL) return true; - *dest = str[0]; + *dest = str[0]; return true; } @@ -1150,41 +1150,41 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, return buf; } -template <> -bool Parse(const char* str, size_t n, float* dest) { +template <> +bool Parse(const char* str, size_t n, float* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, true); + char* end; + errno = 0; + float r = strtof(str, &end); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *dest = r; + return true; +} + +template <> +bool Parse(const char* str, size_t n, double* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, true); + char* end; + errno = 0; + double r = strtod(str, &end); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *dest = r; + return true; +} + +template <> +bool Parse(const char* str, size_t n, long* dest, int radix) { if (n == 0) return false; - static const int kMaxLength = 200; - char buf[kMaxLength+1]; - str = TerminateNumber(buf, sizeof buf, str, &n, true); - char* end; - errno = 0; - float r = strtof(str, &end); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *dest = r; - return true; -} - -template <> -bool Parse(const char* str, size_t n, double* dest) { - if (n == 0) return false; - static const int kMaxLength = 200; - char buf[kMaxLength+1]; - str = TerminateNumber(buf, sizeof buf, str, &n, true); - 
char* end; - errno = 0; - double r = strtod(str, &end); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *dest = r; - return true; -} - -template <> -bool Parse(const char* str, size_t n, long* dest, int radix) { - if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); char* end; @@ -1193,12 +1193,12 @@ bool Parse(const char* str, size_t n, long* dest, int radix) { if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *dest = r; + *dest = r; return true; } -template <> -bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1214,52 +1214,52 @@ bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *dest = r; + *dest = r; return true; } -template <> -bool Parse(const char* str, size_t n, short* dest, int radix) { +template <> +bool Parse(const char* str, size_t n, short* dest, int radix) { long r; - if (!Parse(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range if (dest == NULL) return true; - *dest = (short)r; + *dest = (short)r; return true; } -template <> -bool Parse(const char* str, size_t n, unsigned short* dest, int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned short* dest, int radix) { unsigned long r; - if (!Parse(str, n, &r, radix)) return false; // Could not parse - if ((unsigned short)r != r) return false; // Out of range + if (!Parse(str, n, &r, 
radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range if (dest == NULL) return true; - *dest = (unsigned short)r; + *dest = (unsigned short)r; return true; } -template <> -bool Parse(const char* str, size_t n, int* dest, int radix) { +template <> +bool Parse(const char* str, size_t n, int* dest, int radix) { long r; - if (!Parse(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range if (dest == NULL) return true; - *dest = (int)r; + *dest = (int)r; return true; } -template <> -bool Parse(const char* str, size_t n, unsigned int* dest, int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned int* dest, int radix) { unsigned long r; - if (!Parse(str, n, &r, radix)) return false; // Could not parse - if ((unsigned int)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range if (dest == NULL) return true; - *dest = (unsigned int)r; + *dest = (unsigned int)r; return true; } -template <> -bool Parse(const char* str, size_t n, long long* dest, int radix) { +template <> +bool Parse(const char* str, size_t n, long long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1269,12 +1269,12 @@ bool Parse(const char* str, size_t n, long long* dest, int radix) { if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *dest = r; + *dest = r; return true; } -template <> -bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof 
buf, str, &n, false); @@ -1289,47 +1289,47 @@ bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *dest = r; + *dest = r; return true; } -} // namespace re2_internal - -namespace hooks { - -#ifdef RE2_HAVE_THREAD_LOCAL -thread_local const RE2* context = NULL; -#endif - -template <typename T> -union Hook { - void Store(T* cb) { cb_.store(cb, std::memory_order_release); } - T* Load() const { return cb_.load(std::memory_order_acquire); } - -#if !defined(__clang__) && defined(_MSC_VER) - // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent, - // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC. - static_assert(std::atomic<T*>::is_always_lock_free, - "std::atomic<T*> must be always lock-free"); - T* cb_for_constinit_; -#endif - - std::atomic<T*> cb_; -}; - -template <typename T> -static void DoNothing(const T&) {} - -#define DEFINE_HOOK(type, name) \ - static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \ - void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \ - type##Callback* Get##type##Hook() { return name##_hook.Load(); } - -DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset) -DEFINE_HOOK(DFASearchFailure, dfa_search_failure) - -#undef DEFINE_HOOK - -} // namespace hooks - +} // namespace re2_internal + +namespace hooks { + +#ifdef RE2_HAVE_THREAD_LOCAL +thread_local const RE2* context = NULL; +#endif + +template <typename T> +union Hook { + void Store(T* cb) { cb_.store(cb, std::memory_order_release); } + T* Load() const { return cb_.load(std::memory_order_acquire); } + +#if !defined(__clang__) && defined(_MSC_VER) + // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent, + // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC. 
+ static_assert(std::atomic<T*>::is_always_lock_free, + "std::atomic<T*> must be always lock-free"); + T* cb_for_constinit_; +#endif + + std::atomic<T*> cb_; +}; + +template <typename T> +static void DoNothing(const T&) {} + +#define DEFINE_HOOK(type, name) \ + static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \ + void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \ + type##Callback* Get##type##Hook() { return name##_hook.Load(); } + +DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset) +DEFINE_HOOK(DFASearchFailure, dfa_search_failure) + +#undef DEFINE_HOOK + +} // namespace hooks + } // namespace re2 diff --git a/contrib/libs/re2/re2/re2.h b/contrib/libs/re2/re2/re2.h index a51cd556c1..f8f8043daf 100644 --- a/contrib/libs/re2/re2/re2.h +++ b/contrib/libs/re2/re2/re2.h @@ -30,19 +30,19 @@ // "(?i)hello" -- (?i) turns on case-insensitive matching // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible // -// The double backslashes are needed when writing C++ string literals. -// However, they should NOT be used when writing C++11 raw string literals: -// -// R"(hello (\w+) world)" -- \w matches a "word" character -// R"(version (\d+))" -- \d matches a digit -// R"(hello\s+world)" -- \s matches any whitespace character -// R"(\b(\w+)\b)" -- \b matches non-empty string at word boundary -// R"((?i)hello)" -- (?i) turns on case-insensitive matching -// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible -// -// When using UTF-8 encoding, case-insensitive matching will perform -// simple case folding, not full case folding. -// +// The double backslashes are needed when writing C++ string literals. 
+// However, they should NOT be used when writing C++11 raw string literals: +// +// R"(hello (\w+) world)" -- \w matches a "word" character +// R"(version (\d+))" -- \d matches a digit +// R"(hello\s+world)" -- \s matches any whitespace character +// R"(\b(\w+)\b)" -- \b matches non-empty string at word boundary +// R"((?i)hello)" -- (?i) turns on case-insensitive matching +// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible +// +// When using UTF-8 encoding, case-insensitive matching will perform +// simple case folding, not full case folding. +// // ----------------------------------------------------------------------- // MATCHING INTERFACE: // @@ -66,29 +66,29 @@ // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); // // ----------------------------------------------------------------------- -// MATCHING WITH SUBSTRING EXTRACTION: +// MATCHING WITH SUBSTRING EXTRACTION: // -// You can supply extra pointer arguments to extract matched substrings. -// On match failure, none of the pointees will have been modified. -// On match success, the substrings will be converted (as necessary) and -// their values will be assigned to their pointees until all conversions -// have succeeded or one conversion has failed. -// On conversion failure, the pointees will be in an indeterminate state -// because the caller has no way of knowing which conversion failed. -// However, conversion cannot fail for types like string and StringPiece -// that do not inspect the substring contents. Hence, in the common case -// where all of the pointees are of such types, failure is always due to -// match failure and thus none of the pointees will have been modified. +// You can supply extra pointer arguments to extract matched substrings. +// On match failure, none of the pointees will have been modified. 
+// On match success, the substrings will be converted (as necessary) and +// their values will be assigned to their pointees until all conversions +// have succeeded or one conversion has failed. +// On conversion failure, the pointees will be in an indeterminate state +// because the caller has no way of knowing which conversion failed. +// However, conversion cannot fail for types like string and StringPiece +// that do not inspect the substring contents. Hence, in the common case +// where all of the pointees are of such types, failure is always due to +// match failure and thus none of the pointees will have been modified. // // Example: extracts "ruby" into "s" and 1234 into "i" // int i; -// std::string s; +// std::string s; // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); // // Example: fails because string cannot be stored in integer // CHECK(!RE2::FullMatch("ruby", "(.*)", &i)); // -// Example: fails because there aren't enough sub-patterns +// Example: fails because there aren't enough sub-patterns // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s)); // // Example: does not try to extract any extra sub-patterns @@ -144,10 +144,10 @@ // which represents a sub-range of a real string. // // Example: read lines of the form "var = value" from a string. 
-// std::string contents = ...; // Fill string somehow +// std::string contents = ...; // Fill string somehow // StringPiece input(contents); // Wrap a StringPiece around it // -// std::string var; +// std::string var; // int value; // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { // ...; @@ -208,16 +208,16 @@ #include <map> #include <mutex> #include <string> -#include <type_traits> -#include <vector> +#include <type_traits> +#include <vector> #if defined(ARCADIA_ROOT) #include <util/generic/string.h> #endif -#if defined(__APPLE__) -#include <TargetConditionals.h> -#endif - +#if defined(__APPLE__) +#include <TargetConditionals.h> +#endif + #include "re2/stringpiece.h" namespace re2 { @@ -251,7 +251,7 @@ class RE2 { ErrorBadCharRange, // bad character class range ErrorMissingBracket, // missing closing ] ErrorMissingParen, // missing closing ) - ErrorUnexpectedParen, // unexpected closing ) + ErrorUnexpectedParen, // unexpected closing ) ErrorTrailingBackslash, // trailing \ at end of regexp ErrorRepeatArgument, // repeat argument missing, e.g. "*" ErrorRepeatSize, // bad repetition argument @@ -274,12 +274,12 @@ class RE2 { Quiet // do not log about regexp parse errors }; - // Need to have the const char* and const std::string& forms for implicit + // Need to have the const char* and const std::string& forms for implicit // conversions when passing string literals to FullMatch and PartialMatch. // Otherwise the StringPiece form would be sufficient. #ifndef SWIG RE2(const char* pattern); - RE2(const std::string& pattern); + RE2(const std::string& pattern); #endif RE2(const StringPiece& pattern); RE2(const StringPiece& pattern, const Options& options); @@ -295,11 +295,11 @@ class RE2 { // The string specification for this RE2. E.g. 
// RE2 re("ab*c?d+"); // re.pattern(); // "ab*c?d+" - const std::string& pattern() const { return pattern_; } + const std::string& pattern() const { return pattern_; } // If RE2 could not be created properly, returns an error string. // Else returns the empty string. - const std::string& error() const { return *error_; } + const std::string& error() const { return *error_; } // If RE2 could not be created properly, returns an error code. // Else returns RE2::NoError (== 0). @@ -307,81 +307,81 @@ class RE2 { // If RE2 could not be created properly, returns the offending // portion of the regexp. - const std::string& error_arg() const { return error_arg_; } + const std::string& error_arg() const { return error_arg_; } // Returns the program size, a very approximate measure of a regexp's "cost". // Larger numbers are more expensive than smaller numbers. int ProgramSize() const; - int ReverseProgramSize() const; + int ReverseProgramSize() const; - // If histogram is not null, outputs the program fanout - // as a histogram bucketed by powers of 2. + // If histogram is not null, outputs the program fanout + // as a histogram bucketed by powers of 2. // Returns the number of the largest non-empty bucket. - int ProgramFanout(std::vector<int>* histogram) const; - int ReverseProgramFanout(std::vector<int>* histogram) const; + int ProgramFanout(std::vector<int>* histogram) const; + int ReverseProgramFanout(std::vector<int>* histogram) const; // Returns the underlying Regexp; not for general use. // Returns entire_regexp_ so that callers don't need // to know about prefix_ and prefix_foldcase_. re2::Regexp* Regexp() const { return entire_regexp_; } - /***** The array-based matching interface ******/ - - // The functions here have names ending in 'N' and are used to implement - // the functions whose names are the prefix before the 'N'. It is sometimes - // useful to invoke them directly, but the syntax is awkward, so the 'N'-less - // versions should be preferred. 
- static bool FullMatchN(const StringPiece& text, const RE2& re, - const Arg* const args[], int n); - static bool PartialMatchN(const StringPiece& text, const RE2& re, - const Arg* const args[], int n); - static bool ConsumeN(StringPiece* input, const RE2& re, - const Arg* const args[], int n); - static bool FindAndConsumeN(StringPiece* input, const RE2& re, - const Arg* const args[], int n); - -#ifndef SWIG - private: - template <typename F, typename SP> - static inline bool Apply(F f, SP sp, const RE2& re) { - return f(sp, re, NULL, 0); - } - - template <typename F, typename SP, typename... A> - static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) { - const Arg* const args[] = {&a...}; - const int n = sizeof...(a); - return f(sp, re, args, n); - } - - public: - // In order to allow FullMatch() et al. to be called with a varying number - // of arguments of varying types, we use two layers of variadic templates. - // The first layer constructs the temporary Arg objects. The second layer - // (above) constructs the array of pointers to the temporary Arg objects. - + /***** The array-based matching interface ******/ + + // The functions here have names ending in 'N' and are used to implement + // the functions whose names are the prefix before the 'N'. It is sometimes + // useful to invoke them directly, but the syntax is awkward, so the 'N'-less + // versions should be preferred. 
+ static bool FullMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n); + static bool PartialMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n); + static bool ConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n); + static bool FindAndConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n); + +#ifndef SWIG + private: + template <typename F, typename SP> + static inline bool Apply(F f, SP sp, const RE2& re) { + return f(sp, re, NULL, 0); + } + + template <typename F, typename SP, typename... A> + static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) { + const Arg* const args[] = {&a...}; + const int n = sizeof...(a); + return f(sp, re, args, n); + } + + public: + // In order to allow FullMatch() et al. to be called with a varying number + // of arguments of varying types, we use two layers of variadic templates. + // The first layer constructs the temporary Arg objects. The second layer + // (above) constructs the array of pointers to the temporary Arg objects. + /***** The useful part: the matching interface *****/ // Matches "text" against "re". If pointer arguments are // supplied, copies matched sub-patterns into them. // - // You can pass in a "const char*" or a "std::string" for "text". - // You can pass in a "const char*" or a "std::string" or a "RE2" for "re". + // You can pass in a "const char*" or a "std::string" for "text". + // You can pass in a "const char*" or a "std::string" or a "RE2" for "re". 
// // The provided pointer arguments can be pointers to any scalar numeric // type, or one of: - // std::string (matched piece is copied to string) + // std::string (matched piece is copied to string) // StringPiece (StringPiece is mutated to point to matched piece) // T (where "bool T::ParseFrom(const char*, size_t)" exists) // (void*)NULL (the corresponding matched sub-pattern is not copied) // // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "re" fully - from the beginning to the end of "text". - // b. The number of matched sub-patterns is >= number of supplied pointers. + // a. "text" matches "re" fully - from the beginning to the end of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. // c. The "i"th argument has a suitable type for holding the // string captured as the "i"th sub-pattern. If you pass in // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, the "i"th captured sub-pattern is + // number of sub-patterns, the "i"th captured sub-pattern is // ignored. // // CAVEAT: An optional sub-pattern that does not exist in the @@ -395,80 +395,80 @@ class RE2 { return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...); } - // Like FullMatch(), except that "re" is allowed to match a substring - // of "text". - // - // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "re" partially - for some substring of "text". - // b. The number of matched sub-patterns is >= number of supplied pointers. - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, the "i"th captured sub-pattern is - // ignored. + // Like FullMatch(), except that "re" is allowed to match a substring + // of "text". 
+ // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "re" partially - for some substring of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template <typename... A> static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) { return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...); } - // Like FullMatch() and PartialMatch(), except that "re" has to match - // a prefix of the text, and "input" is advanced past the matched - // text. Note: "input" is modified iff this routine returns true - // and "re" matched a non-empty substring of "input". - // - // Returns true iff all of the following conditions are satisfied: - // a. "input" matches "re" partially - for some prefix of "input". - // b. The number of matched sub-patterns is >= number of supplied pointers. - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, the "i"th captured sub-pattern is - // ignored. + // Like FullMatch() and PartialMatch(), except that "re" has to match + // a prefix of the text, and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true + // and "re" matched a non-empty substring of "input". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some prefix of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. 
The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template <typename... A> static bool Consume(StringPiece* input, const RE2& re, A&&... a) { return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...); } - // Like Consume(), but does not anchor the match at the beginning of - // the text. That is, "re" need not start its match at the beginning - // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds - // the next word in "s" and stores it in "word". - // - // Returns true iff all of the following conditions are satisfied: - // a. "input" matches "re" partially - for some substring of "input". - // b. The number of matched sub-patterns is >= number of supplied pointers. - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, the "i"th captured sub-pattern is - // ignored. + // Like Consume(), but does not anchor the match at the beginning of + // the text. That is, "re" need not start its match at the beginning + // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds + // the next word in "s" and stores it in "word". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some substring of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template <typename... 
A> static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...); } #endif - // Replace the first match of "re" in "str" with "rewrite". + // Replace the first match of "re" in "str" with "rewrite". // Within "rewrite", backslash-escaped digits (\1 to \9) can be // used to insert text matching corresponding parenthesized group // from the pattern. \0 in "rewrite" refers to the entire matching // text. E.g., // - // std::string s = "yabba dabba doo"; + // std::string s = "yabba dabba doo"; // CHECK(RE2::Replace(&s, "b+", "d")); // // will leave "s" containing "yada dabba doo" // // Returns true if the pattern matches and a replacement occurs, // false otherwise. - static bool Replace(std::string* str, - const RE2& re, + static bool Replace(std::string* str, + const RE2& re, const StringPiece& rewrite); #if defined(ARCADIA_ROOT) static bool Replace(TString *str, const RE2& pattern, const StringPiece& rewrite) { - std::string tmp(*str); + std::string tmp(*str); bool res = Replace(&tmp, pattern, rewrite); *str = tmp; return res; @@ -478,7 +478,7 @@ class RE2 { // Like Replace(), except replaces successive non-overlapping occurrences // of the pattern in the string with the rewrite. E.g. // - // std::string s = "yabba dabba doo"; + // std::string s = "yabba dabba doo"; // CHECK(RE2::GlobalReplace(&s, "b+", "d")); // // will leave "s" containing "yada dada doo" @@ -488,15 +488,15 @@ class RE2 { // replacing "ana" within "banana" makes only one replacement, not two. // // Returns the number of replacements made. 
- static int GlobalReplace(std::string* str, - const RE2& re, + static int GlobalReplace(std::string* str, + const RE2& re, const StringPiece& rewrite); #if defined(ARCADIA_ROOT) - static int GlobalReplace(TString* str, - const RE2& pattern, - const StringPiece& rewrite) { - std::string tmp(*str); + static int GlobalReplace(TString* str, + const RE2& pattern, + const StringPiece& rewrite) { + std::string tmp(*str); int res = GlobalReplace(&tmp, pattern, rewrite); *str = tmp; return res; @@ -511,15 +511,15 @@ class RE2 { // successfully; if no match occurs, the string is left unaffected. // // REQUIRES: "text" must not alias any part of "*out". - static bool Extract(const StringPiece& text, - const RE2& re, - const StringPiece& rewrite, - std::string* out); + static bool Extract(const StringPiece& text, + const RE2& re, + const StringPiece& rewrite, + std::string* out); #if defined(ARCADIA_ROOT) - static bool Extract(const StringPiece& text, + static bool Extract(const StringPiece& text, const RE2& pattern, - const StringPiece& rewrite, + const StringPiece& rewrite, TString *out) { std::string tmp; bool res = Extract(text, pattern, rewrite, &tmp); @@ -530,11 +530,11 @@ class RE2 { // Escapes all potentially meaningful regexp characters in // 'unquoted'. The returned string, used as a regular expression, - // will match exactly the original string. For example, + // will match exactly the original string. For example, // 1.5-2.0? // may become: // 1\.5\-2\.0\? - static std::string QuoteMeta(const StringPiece& unquoted); + static std::string QuoteMeta(const StringPiece& unquoted); // Computes range for any strings matching regexp. The min and max can in // some cases be arbitrarily precise, so the caller gets to specify the @@ -550,8 +550,8 @@ class RE2 { // do not compile down to infinite repetitions. // // Returns true on success, false on error. 
- bool PossibleMatchRange(std::string* min, std::string* max, - int maxlen) const; + bool PossibleMatchRange(std::string* min, std::string* max, + int maxlen) const; // Generic matching interface @@ -565,46 +565,46 @@ class RE2 { // Return the number of capturing subpatterns, or -1 if the // regexp wasn't valid on construction. The overall match ($0) // does not count: if the regexp is "(a)(b)", returns 2. - int NumberOfCapturingGroups() const { return num_captures_; } + int NumberOfCapturingGroups() const { return num_captures_; } // Return a map from names to capturing indices. // The map records the index of the leftmost group // with the given name. // Only valid until the re is deleted. - const std::map<std::string, int>& NamedCapturingGroups() const; + const std::map<std::string, int>& NamedCapturingGroups() const; // Return a map from capturing indices to names. // The map has no entries for unnamed groups. // Only valid until the re is deleted. - const std::map<int, std::string>& CapturingGroupNames() const; + const std::map<int, std::string>& CapturingGroupNames() const; // General matching routine. // Match against text starting at offset startpos // and stopping the search at offset endpos. // Returns true if match found, false if not. - // On a successful match, fills in submatch[] (up to nsubmatch entries) + // On a successful match, fills in submatch[] (up to nsubmatch entries) // with information about submatches. - // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with - // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar", - // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL. - // Caveat: submatch[] may be clobbered even on match failure. + // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with + // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar", + // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL. 
+ // Caveat: submatch[] may be clobbered even on match failure. // // Don't ask for more match information than you will use: - // runs much faster with nsubmatch == 1 than nsubmatch > 1, and - // runs even faster if nsubmatch == 0. - // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(), + // runs much faster with nsubmatch == 1 than nsubmatch > 1, and + // runs even faster if nsubmatch == 0. + // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(), // but will be handled correctly. // // Passing text == StringPiece(NULL, 0) will be handled like any other // empty string, but note that on return, it will not be possible to tell // whether submatch i matched the empty string or did not match: - // either way, submatch[i].data() == NULL. + // either way, submatch[i].data() == NULL. bool Match(const StringPiece& text, size_t startpos, size_t endpos, - Anchor re_anchor, - StringPiece* submatch, - int nsubmatch) const; + Anchor re_anchor, + StringPiece* submatch, + int nsubmatch) const; // Check that the given rewrite string is suitable for use with this // regular expression. It checks that: @@ -614,11 +614,11 @@ class RE2 { // '\' followed by anything other than a digit or '\'. // A true return value guarantees that Replace() and Extract() won't // fail because of a bad rewrite string. - bool CheckRewriteString(const StringPiece& rewrite, - std::string* error) const; + bool CheckRewriteString(const StringPiece& rewrite, + std::string* error) const; - bool CheckRewriteString(const StringPiece& rewrite, std::nullptr_t error) const { - return CheckRewriteString(rewrite, static_cast<std::string*>(error)); + bool CheckRewriteString(const StringPiece& rewrite, std::nullptr_t error) const { + return CheckRewriteString(rewrite, static_cast<std::string*>(error)); } #if defined(ARCADIA_ROOT) @@ -643,8 +643,8 @@ class RE2 { // Returns true on success. This method can fail because of a malformed // rewrite string. 
CheckRewriteString guarantees that the rewrite will // be sucessful. - bool Rewrite(std::string* out, - const StringPiece& rewrite, + bool Rewrite(std::string* out, + const StringPiece& rewrite, const StringPiece* vec, int veclen) const; @@ -666,9 +666,9 @@ class RE2 { // with (?i) unless in posix_syntax mode) // // The following options are only consulted when posix_syntax == true. - // When posix_syntax == false, these features are always enabled and - // cannot be turned off; to perform multi-line matching in that case, - // begin the regexp with (?m). + // When posix_syntax == false, these features are always enabled and + // cannot be turned off; to perform multi-line matching in that case, + // begin the regexp with (?m). // perl_classes (false) allow Perl's \d \s \w \D \S \W // word_boundary (false) allow Perl's \b \B (word boundary and not) // one_line (false) ^ and $ only match beginning and end of text @@ -684,7 +684,7 @@ class RE2 { // can have two DFAs (one first match, one longest match). // That makes 4 DFAs: // - // forward, first-match - used for UNANCHORED or ANCHOR_START searches + // forward, first-match - used for UNANCHORED or ANCHOR_START searches // if opt.longest_match() == false // forward, longest-match - used for all ANCHOR_BOTH searches, // and the other two kinds if @@ -789,46 +789,46 @@ class RE2 { }; // Returns the options set in the constructor. - const Options& options() const { return options_; } + const Options& options() const { return options_; } // Argument converters; see below. 
- template <typename T> - static Arg CRadix(T* ptr); - template <typename T> - static Arg Hex(T* ptr); - template <typename T> - static Arg Octal(T* ptr); + template <typename T> + static Arg CRadix(T* ptr); + template <typename T> + static Arg Hex(T* ptr); + template <typename T> + static Arg Octal(T* ptr); private: void Init(const StringPiece& pattern, const Options& options); bool DoMatch(const StringPiece& text, - Anchor re_anchor, + Anchor re_anchor, size_t* consumed, const Arg* const args[], int n) const; re2::Prog* ReverseProg() const; - std::string pattern_; // string regular expression - Options options_; // option flags - re2::Regexp* entire_regexp_; // parsed regular expression - const std::string* error_; // error indicator (or points to empty string) - ErrorCode error_code_; // error code - std::string error_arg_; // fragment of regexp showing error - std::string prefix_; // required prefix (before suffix_regexp_) - bool prefix_foldcase_; // prefix_ is ASCII case-insensitive - re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed - re2::Prog* prog_; // compiled program for regexp - int num_captures_; // number of capturing groups - bool is_one_pass_; // can use prog_->SearchOnePass? - - // Reverse Prog for DFA execution only - mutable re2::Prog* rprog_; + std::string pattern_; // string regular expression + Options options_; // option flags + re2::Regexp* entire_regexp_; // parsed regular expression + const std::string* error_; // error indicator (or points to empty string) + ErrorCode error_code_; // error code + std::string error_arg_; // fragment of regexp showing error + std::string prefix_; // required prefix (before suffix_regexp_) + bool prefix_foldcase_; // prefix_ is ASCII case-insensitive + re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed + re2::Prog* prog_; // compiled program for regexp + int num_captures_; // number of capturing groups + bool is_one_pass_; // can use prog_->SearchOnePass? 
+ + // Reverse Prog for DFA execution only + mutable re2::Prog* rprog_; // Map from capture names to indices - mutable const std::map<std::string, int>* named_groups_; + mutable const std::map<std::string, int>* named_groups_; // Map from capture indices to names - mutable const std::map<int, std::string>* group_names_; + mutable const std::map<int, std::string>* group_names_; mutable std::once_flag rprog_once_; mutable std::once_flag named_groups_once_; @@ -840,137 +840,137 @@ class RE2 { /***** Implementation details *****/ -namespace re2_internal { +namespace re2_internal { -// Types for which the 3-ary Parse() function template has specializations. -template <typename T> struct Parse3ary : public std::false_type {}; -template <> struct Parse3ary<void> : public std::true_type {}; -template <> struct Parse3ary<std::string> : public std::true_type {}; -template <> struct Parse3ary<StringPiece> : public std::true_type {}; +// Types for which the 3-ary Parse() function template has specializations. +template <typename T> struct Parse3ary : public std::false_type {}; +template <> struct Parse3ary<void> : public std::true_type {}; +template <> struct Parse3ary<std::string> : public std::true_type {}; +template <> struct Parse3ary<StringPiece> : public std::true_type {}; #if defined(ARCADIA_ROOT) -template <> struct Parse3ary<TString> : public std::true_type {}; +template <> struct Parse3ary<TString> : public std::true_type {}; #endif -template <> struct Parse3ary<char> : public std::true_type {}; -template <> struct Parse3ary<signed char> : public std::true_type {}; -template <> struct Parse3ary<unsigned char> : public std::true_type {}; -template <> struct Parse3ary<float> : public std::true_type {}; -template <> struct Parse3ary<double> : public std::true_type {}; - -template <typename T> -bool Parse(const char* str, size_t n, T* dest); - -// Types for which the 4-ary Parse() function template has specializations. 
-template <typename T> struct Parse4ary : public std::false_type {}; -template <> struct Parse4ary<long> : public std::true_type {}; -template <> struct Parse4ary<unsigned long> : public std::true_type {}; -template <> struct Parse4ary<short> : public std::true_type {}; -template <> struct Parse4ary<unsigned short> : public std::true_type {}; -template <> struct Parse4ary<int> : public std::true_type {}; -template <> struct Parse4ary<unsigned int> : public std::true_type {}; -template <> struct Parse4ary<long long> : public std::true_type {}; -template <> struct Parse4ary<unsigned long long> : public std::true_type {}; - -template <typename T> -bool Parse(const char* str, size_t n, T* dest, int radix); - -} // namespace re2_internal - +template <> struct Parse3ary<char> : public std::true_type {}; +template <> struct Parse3ary<signed char> : public std::true_type {}; +template <> struct Parse3ary<unsigned char> : public std::true_type {}; +template <> struct Parse3ary<float> : public std::true_type {}; +template <> struct Parse3ary<double> : public std::true_type {}; + +template <typename T> +bool Parse(const char* str, size_t n, T* dest); + +// Types for which the 4-ary Parse() function template has specializations. 
+template <typename T> struct Parse4ary : public std::false_type {}; +template <> struct Parse4ary<long> : public std::true_type {}; +template <> struct Parse4ary<unsigned long> : public std::true_type {}; +template <> struct Parse4ary<short> : public std::true_type {}; +template <> struct Parse4ary<unsigned short> : public std::true_type {}; +template <> struct Parse4ary<int> : public std::true_type {}; +template <> struct Parse4ary<unsigned int> : public std::true_type {}; +template <> struct Parse4ary<long long> : public std::true_type {}; +template <> struct Parse4ary<unsigned long long> : public std::true_type {}; + +template <typename T> +bool Parse(const char* str, size_t n, T* dest, int radix); + +} // namespace re2_internal + class RE2::Arg { - private: - template <typename T> - using CanParse3ary = typename std::enable_if< - re2_internal::Parse3ary<T>::value, - int>::type; - - template <typename T> - using CanParse4ary = typename std::enable_if< - re2_internal::Parse4ary<T>::value, - int>::type; - -#if !defined(_MSC_VER) - template <typename T> - using CanParseFrom = typename std::enable_if< - std::is_member_function_pointer< - decltype(static_cast<bool (T::*)(const char*, size_t)>( - &T::ParseFrom))>::value, - int>::type; -#endif - - public: - Arg() : Arg(nullptr) {} - Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {} - - template <typename T, CanParse3ary<T> = 0> - Arg(T* ptr) : arg_(ptr), parser_(DoParse3ary<T>) {} - - template <typename T, CanParse4ary<T> = 0> - Arg(T* ptr) : arg_(ptr), parser_(DoParse4ary<T>) {} - -#if !defined(_MSC_VER) - template <typename T, CanParseFrom<T> = 0> - Arg(T* ptr) : arg_(ptr), parser_(DoParseFrom<T>) {} -#endif - - typedef bool (*Parser)(const char* str, size_t n, void* dest); - - template <typename T> - Arg(T* ptr, Parser parser) : arg_(ptr), parser_(parser) {} - - bool Parse(const char* str, size_t n) const { - return (*parser_)(str, n, arg_); - } - private: - static bool DoNothing(const char* /*str*/, 
size_t /*n*/, void* /*dest*/) { - return true; - } - - template <typename T> - static bool DoParse3ary(const char* str, size_t n, void* dest) { - return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest)); - } - - template <typename T> - static bool DoParse4ary(const char* str, size_t n, void* dest) { - return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 10); - } - -#if !defined(_MSC_VER) - template <typename T> - static bool DoParseFrom(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - return reinterpret_cast<T*>(dest)->ParseFrom(str, n); - } -#endif - - void* arg_; - Parser parser_; + template <typename T> + using CanParse3ary = typename std::enable_if< + re2_internal::Parse3ary<T>::value, + int>::type; + + template <typename T> + using CanParse4ary = typename std::enable_if< + re2_internal::Parse4ary<T>::value, + int>::type; + +#if !defined(_MSC_VER) + template <typename T> + using CanParseFrom = typename std::enable_if< + std::is_member_function_pointer< + decltype(static_cast<bool (T::*)(const char*, size_t)>( + &T::ParseFrom))>::value, + int>::type; +#endif + + public: + Arg() : Arg(nullptr) {} + Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {} + + template <typename T, CanParse3ary<T> = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParse3ary<T>) {} + + template <typename T, CanParse4ary<T> = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParse4ary<T>) {} + +#if !defined(_MSC_VER) + template <typename T, CanParseFrom<T> = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParseFrom<T>) {} +#endif + + typedef bool (*Parser)(const char* str, size_t n, void* dest); + + template <typename T> + Arg(T* ptr, Parser parser) : arg_(ptr), parser_(parser) {} + + bool Parse(const char* str, size_t n) const { + return (*parser_)(str, n, arg_); + } + + private: + static bool DoNothing(const char* /*str*/, size_t /*n*/, void* /*dest*/) { + return true; + } + + template <typename T> + static bool DoParse3ary(const char* str, size_t n, void* dest) { 
+ return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest)); + } + + template <typename T> + static bool DoParse4ary(const char* str, size_t n, void* dest) { + return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 10); + } + +#if !defined(_MSC_VER) + template <typename T> + static bool DoParseFrom(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + return reinterpret_cast<T*>(dest)->ParseFrom(str, n); + } +#endif + + void* arg_; + Parser parser_; }; -template <typename T> -inline RE2::Arg RE2::CRadix(T* ptr) { - return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { - return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 0); - }); -} - -template <typename T> -inline RE2::Arg RE2::Hex(T* ptr) { - return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { - return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 16); - }); +template <typename T> +inline RE2::Arg RE2::CRadix(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 0); + }); } -template <typename T> -inline RE2::Arg RE2::Octal(T* ptr) { - return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { - return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 8); - }); -} +template <typename T> +inline RE2::Arg RE2::Hex(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 16); + }); +} + +template <typename T> +inline RE2::Arg RE2::Octal(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 8); + }); +} #ifndef SWIG // Silence warnings about missing initializers for members of LazyRE2. 
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif @@ -1019,56 +1019,56 @@ class LazyRE2 { void operator=(const LazyRE2&); // disallowed }; -#endif - -namespace hooks { - -// Most platforms support thread_local. Older versions of iOS don't support -// thread_local, but for the sake of brevity, we lump together all versions -// of Apple platforms that aren't macOS. If an iOS application really needs -// the context pointee someday, we can get more specific then... +#endif + +namespace hooks { + +// Most platforms support thread_local. Older versions of iOS don't support +// thread_local, but for the sake of brevity, we lump together all versions +// of Apple platforms that aren't macOS. If an iOS application really needs +// the context pointee someday, we can get more specific then... // // As per https://github.com/google/re2/issues/325, thread_local support in // MinGW seems to be buggy. (FWIW, Abseil folks also avoid it.) -#define RE2_HAVE_THREAD_LOCAL +#define RE2_HAVE_THREAD_LOCAL #if (defined(__APPLE__) && !(defined(TARGET_OS_OSX) && TARGET_OS_OSX)) || defined(__MINGW32__) -#undef RE2_HAVE_THREAD_LOCAL -#endif - -// A hook must not make any assumptions regarding the lifetime of the context -// pointee beyond the current invocation of the hook. Pointers and references -// obtained via the context pointee should be considered invalidated when the -// hook returns. Hence, any data about the context pointee (e.g. its pattern) -// would have to be copied in order for it to be kept for an indefinite time. -// -// A hook must not use RE2 for matching. Control flow reentering RE2::Match() -// could result in infinite mutual recursion. To discourage that possibility, -// RE2 will not maintain the context pointer correctly when used in that way. 
-#ifdef RE2_HAVE_THREAD_LOCAL -extern thread_local const RE2* context; -#endif - -struct DFAStateCacheReset { - int64_t state_budget; - size_t state_cache_size; -}; - -struct DFASearchFailure { - // Nothing yet... -}; - -#define DECLARE_HOOK(type) \ - using type##Callback = void(const type&); \ - void Set##type##Hook(type##Callback* cb); \ - type##Callback* Get##type##Hook(); - -DECLARE_HOOK(DFAStateCacheReset) -DECLARE_HOOK(DFASearchFailure) - -#undef DECLARE_HOOK - -} // namespace hooks - +#undef RE2_HAVE_THREAD_LOCAL +#endif + +// A hook must not make any assumptions regarding the lifetime of the context +// pointee beyond the current invocation of the hook. Pointers and references +// obtained via the context pointee should be considered invalidated when the +// hook returns. Hence, any data about the context pointee (e.g. its pattern) +// would have to be copied in order for it to be kept for an indefinite time. +// +// A hook must not use RE2 for matching. Control flow reentering RE2::Match() +// could result in infinite mutual recursion. To discourage that possibility, +// RE2 will not maintain the context pointer correctly when used in that way. +#ifdef RE2_HAVE_THREAD_LOCAL +extern thread_local const RE2* context; +#endif + +struct DFAStateCacheReset { + int64_t state_budget; + size_t state_cache_size; +}; + +struct DFASearchFailure { + // Nothing yet... 
+}; + +#define DECLARE_HOOK(type) \ + using type##Callback = void(const type&); \ + void Set##type##Hook(type##Callback* cb); \ + type##Callback* Get##type##Hook(); + +DECLARE_HOOK(DFAStateCacheReset) +DECLARE_HOOK(DFASearchFailure) + +#undef DECLARE_HOOK + +} // namespace hooks + } // namespace re2 using re2::RE2; diff --git a/contrib/libs/re2/re2/regexp.cc b/contrib/libs/re2/re2/regexp.cc index 44359cba9c..ca1318b43d 100644 --- a/contrib/libs/re2/re2/regexp.cc +++ b/contrib/libs/re2/re2/regexp.cc @@ -20,7 +20,7 @@ #include "util/logging.h" #include "util/mutex.h" #include "util/utf.h" -#include "re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/stringpiece.h" #include "re2/walker-inl.h" @@ -244,12 +244,12 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, return new Regexp(kRegexpEmptyMatch, flags); } - PODArray<Regexp*> subcopy; + PODArray<Regexp*> subcopy; if (op == kRegexpAlternate && can_factor) { // Going to edit sub; make a copy so we don't step on caller. - subcopy = PODArray<Regexp*>(nsub); - memmove(subcopy.data(), sub, nsub * sizeof sub[0]); - sub = subcopy.data(); + subcopy = PODArray<Regexp*>(nsub); + memmove(subcopy.data(), sub, nsub * sizeof sub[0]); + sub = subcopy.data(); nsub = FactorAlternation(sub, nsub, flags); if (nsub == 1) { Regexp* re = sub[0]; @@ -333,14 +333,14 @@ Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { } void Regexp::Swap(Regexp* that) { - // Regexp is not trivially copyable, so we cannot freely copy it with - // memmove(3), but swapping objects like so is safe for our purposes. + // Regexp is not trivially copyable, so we cannot freely copy it with + // memmove(3), but swapping objects like so is safe for our purposes. 
char tmp[sizeof *this]; - void* vthis = reinterpret_cast<void*>(this); - void* vthat = reinterpret_cast<void*>(that); - memmove(tmp, vthis, sizeof *this); - memmove(vthis, vthat, sizeof *this); - memmove(vthat, tmp, sizeof *this); + void* vthis = reinterpret_cast<void*>(this); + void* vthat = reinterpret_cast<void*>(that); + memmove(tmp, vthis, sizeof *this); + memmove(vthis, vthat, sizeof *this); + memmove(vthat, tmp, sizeof *this); } // Tests equality of all top-level structure but not subregexps. @@ -498,7 +498,7 @@ static const char *kErrorStrings[] = { "invalid character class range", "missing ]", "missing )", - "unexpected )", + "unexpected )", "trailing \\", "no argument for repetition operator", "invalid repetition size", @@ -508,16 +508,16 @@ static const char *kErrorStrings[] = { "invalid named capture group", }; -std::string RegexpStatus::CodeText(enum RegexpStatusCode code) { +std::string RegexpStatus::CodeText(enum RegexpStatusCode code) { if (code < 0 || code >= arraysize(kErrorStrings)) code = kRegexpInternalError; return kErrorStrings[code]; } -std::string RegexpStatus::Text() const { +std::string RegexpStatus::Text() const { if (error_arg_.empty()) return CodeText(code_); - std::string s; + std::string s; s.append(CodeText(code_)); s.append(": "); s.append(error_arg_.data(), error_arg_.size()); @@ -542,12 +542,12 @@ class NumCapturesWalker : public Regexp::Walker<Ignored> { ncapture_++; return ignored; } - + virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; -#endif +#endif return ignored; } @@ -570,17 +570,17 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> { NamedCapturesWalker() : map_(NULL) {} ~NamedCapturesWalker() { delete map_; } - std::map<std::string, int>* TakeMap() { - std::map<std::string, int>* m = map_; + std::map<std::string, int>* TakeMap() { + std::map<std::string, int>* m = map_; map_ = NULL; return m; } - virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) - map_ = new std::map<std::string, int>; + map_ = new std::map<std::string, int>; // Record first occurrence of each name. // (The rule is that if you have the same name @@ -591,21 +591,21 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> { } virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; -#endif +#endif return ignored; } private: - std::map<std::string, int>* map_; + std::map<std::string, int>* map_; NamedCapturesWalker(const NamedCapturesWalker&) = delete; NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; }; -std::map<std::string, int>* Regexp::NamedCaptures() { +std::map<std::string, int>* Regexp::NamedCaptures() { NamedCapturesWalker w; w.Walk(this, 0); return w.TakeMap(); @@ -617,17 +617,17 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> { CaptureNamesWalker() : map_(NULL) {} ~CaptureNamesWalker() { delete map_; } - std::map<int, std::string>* TakeMap() { - std::map<int, std::string>* m = map_; + std::map<int, std::string>* TakeMap() { + std::map<int, std::string>* m = map_; map_ = NULL; return m; } - virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) - map_ = new std::map<int, std::string>; + map_ = new std::map<int, std::string>; (*map_)[re->cap()] = *re->name(); } @@ -635,52 +635,52 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> { } virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; -#endif +#endif return ignored; } private: - std::map<int, std::string>* map_; + std::map<int, std::string>* map_; CaptureNamesWalker(const CaptureNamesWalker&) = delete; CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; }; -std::map<int, std::string>* Regexp::CaptureNames() { +std::map<int, std::string>* Regexp::CaptureNames() { CaptureNamesWalker w; w.Walk(this, 0); return w.TakeMap(); } -void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, - std::string* bytes) { - if (latin1) { - bytes->resize(nrunes); - for (int i = 0; i < nrunes; i++) - (*bytes)[i] = static_cast<char>(runes[i]); - } else { - bytes->resize(nrunes * UTFmax); // worst case - char* p = &(*bytes)[0]; - for (int i = 0; i < nrunes; i++) - p += runetochar(p, &runes[i]); - bytes->resize(p - &(*bytes)[0]); - bytes->shrink_to_fit(); - } -} - +void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, + std::string* bytes) { + if (latin1) { + bytes->resize(nrunes); + for (int i = 0; i < nrunes; i++) + (*bytes)[i] = static_cast<char>(runes[i]); + } else { + bytes->resize(nrunes * UTFmax); // worst case + char* p = &(*bytes)[0]; + for (int i = 0; i < nrunes; i++) + p += runetochar(p, &runes[i]); + bytes->resize(p - &(*bytes)[0]); + bytes->shrink_to_fit(); + } +} + // Determines whether regexp matches must be anchored // with a fixed string prefix. If so, returns the prefix and // the regexp that remains after the prefix. The prefix might // be ASCII case-insensitive. -bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, - Regexp** suffix) { - prefix->clear(); - *foldcase = false; - *suffix = NULL; - +bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, + Regexp** suffix) { + prefix->clear(); + *foldcase = false; + *suffix = NULL; + // No need for a walker: the regexp must be of the form // 1. some number of ^ anchors // 2. 
a literal char or string @@ -688,59 +688,59 @@ bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, if (op_ != kRegexpConcat) return false; int i = 0; - while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) + while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) i++; if (i == 0 || i >= nsub_) return false; - Regexp* re = sub()[i]; - if (re->op_ != kRegexpLiteral && - re->op_ != kRegexpLiteralString) - return false; + Regexp* re = sub()[i]; + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; i++; if (i < nsub_) { for (int j = i; j < nsub_; j++) - sub()[j]->Incref(); - *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); + sub()[j]->Incref(); + *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); } else { - *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); - } - - bool latin1 = (re->parse_flags() & Latin1) != 0; - Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; - int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; - ConvertRunesToBytes(latin1, runes, nrunes, prefix); - *foldcase = (re->parse_flags() & FoldCase) != 0; + *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); + } + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; + return true; +} + +// Determines whether regexp matches must be unanchored +// with a fixed string prefix. If so, returns the prefix. +// The prefix might be ASCII case-insensitive. +bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { + prefix->clear(); + *foldcase = false; + + // No need for a walker: the regexp must either begin with or be + // a literal char or string. We "see through" capturing groups, + // but make no effort to glue multiple prefix fragments together. 
+ Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this; + while (re->op_ == kRegexpCapture) { + re = re->sub()[0]; + if (re->op_ == kRegexpConcat && re->nsub_ > 0) + re = re->sub()[0]; + } + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; return true; } -// Determines whether regexp matches must be unanchored -// with a fixed string prefix. If so, returns the prefix. -// The prefix might be ASCII case-insensitive. -bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { - prefix->clear(); - *foldcase = false; - - // No need for a walker: the regexp must either begin with or be - // a literal char or string. We "see through" capturing groups, - // but make no effort to glue multiple prefix fragments together. - Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this; - while (re->op_ == kRegexpCapture) { - re = re->sub()[0]; - if (re->op_ == kRegexpConcat && re->nsub_ > 0) - re = re->sub()[0]; - } - if (re->op_ != kRegexpLiteral && - re->op_ != kRegexpLiteralString) - return false; - - bool latin1 = (re->parse_flags() & Latin1) != 0; - Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; - int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; - ConvertRunesToBytes(latin1, runes, nrunes, prefix); - *foldcase = (re->parse_flags() & FoldCase) != 0; - return true; -} - // Character class builder is a balanced binary tree (STL set) // containing non-overlapping, non-abutting RuneRanges. 
// The less-than operator used in the tree treats two @@ -918,7 +918,7 @@ void CharClassBuilder::Negate() { // The ranges are allocated in the same block as the header, // necessitating a special allocator and Delete method. -CharClass* CharClass::New(size_t maxranges) { +CharClass* CharClass::New(size_t maxranges) { CharClass* cc; uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; cc = reinterpret_cast<CharClass*>(data); @@ -935,7 +935,7 @@ void CharClass::Delete() { } CharClass* CharClass::Negate() { - CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1)); + CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1)); cc->folds_ascii_ = folds_ascii_; cc->nrunes_ = Runemax + 1 - nrunes_; int n = 0; @@ -972,7 +972,7 @@ bool CharClass::Contains(Rune r) const { } CharClass* CharClassBuilder::GetCharClass() { - CharClass* cc = CharClass::New(ranges_.size()); + CharClass* cc = CharClass::New(ranges_.size()); int n = 0; for (iterator it = begin(); it != end(); ++it) cc->ranges_[n++] = *it; diff --git a/contrib/libs/re2/re2/regexp.h b/contrib/libs/re2/re2/regexp.h index 1fb5ed4e44..b6446f9fe5 100644 --- a/contrib/libs/re2/re2/regexp.h +++ b/contrib/libs/re2/re2/regexp.h @@ -86,15 +86,15 @@ // form accessible to clients, so that client code can analyze the // parsed regular expressions. 
-#include <stddef.h> +#include <stddef.h> #include <stdint.h> #include <map> #include <set> #include <string> -#include "util/util.h" -#include "util/logging.h" -#include "util/utf.h" +#include "util/util.h" +#include "util/logging.h" +#include "util/utf.h" #include "re2/stringpiece.h" namespace re2 { @@ -178,7 +178,7 @@ enum RegexpStatusCode { kRegexpBadCharRange, // bad character class range kRegexpMissingBracket, // missing closing ] kRegexpMissingParen, // missing closing ) - kRegexpUnexpectedParen, // unexpected closing ) + kRegexpUnexpectedParen, // unexpected closing ) kRegexpTrailingBackslash, // at end of regexp kRegexpRepeatArgument, // repeat argument missing, e.g. "*" kRegexpRepeatSize, // bad repetition argument @@ -196,7 +196,7 @@ class RegexpStatus { void set_code(RegexpStatusCode code) { code_ = code; } void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } - void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; } + void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; } RegexpStatusCode code() const { return code_; } const StringPiece& error_arg() const { return error_arg_; } bool ok() const { return code() == kRegexpSuccess; } @@ -206,16 +206,16 @@ class RegexpStatus { // Returns text equivalent of code, e.g.: // "Bad character class" - static std::string CodeText(RegexpStatusCode code); + static std::string CodeText(RegexpStatusCode code); // Returns text describing error, e.g.: // "Bad character class: [z-a]" - std::string Text() const; + std::string Text() const; private: RegexpStatusCode code_; // Kind of error - StringPiece error_arg_; // Piece of regexp containing syntax error. - std::string* tmp_; // Temporary storage, possibly where error_arg_ is. + StringPiece error_arg_; // Piece of regexp containing syntax error. + std::string* tmp_; // Temporary storage, possibly where error_arg_ is. 
RegexpStatus(const RegexpStatus&) = delete; RegexpStatus& operator=(const RegexpStatus&) = delete; @@ -260,7 +260,7 @@ class CharClass { private: CharClass(); // not implemented ~CharClass(); // not implemented - static CharClass* New(size_t maxranges); + static CharClass* New(size_t maxranges); friend class CharClassBuilder; @@ -338,7 +338,7 @@ class Regexp { Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; } CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; } int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; } - const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } + const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; } int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; } int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; } @@ -370,7 +370,7 @@ class Regexp { // string representation of the simplified form. Returns true on success. // Returns false and sets *status (if status != NULL) on parse error. static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, - std::string* dst, RegexpStatus* status); + std::string* dst, RegexpStatus* status); // Returns the number of capturing groups in the regexp. int NumCaptures(); @@ -379,16 +379,16 @@ class Regexp { // Returns a map from names to capturing group indices, // or NULL if the regexp contains no named capture groups. // The caller is responsible for deleting the map. - std::map<std::string, int>* NamedCaptures(); + std::map<std::string, int>* NamedCaptures(); // Returns a map from capturing group indices to capturing group // names or NULL if the regexp contains no named capture groups. The // caller is responsible for deleting the map. - std::map<int, std::string>* CaptureNames(); + std::map<int, std::string>* CaptureNames(); // Returns a string representation of the current regexp, // using as few parentheses as possible. 
- std::string ToString(); + std::string ToString(); // Convenience functions. They consume the passed reference, // so in many cases you should use, e.g., Plus(re->Incref(), flags). @@ -410,7 +410,7 @@ class Regexp { // Debugging function. Returns string format for regexp // that makes structure clear. Does NOT use regexp syntax. - std::string Dump(); + std::string Dump(); // Helper traversal class, defined fully in walker-inl.h. template<typename T> class Walker; @@ -437,22 +437,22 @@ class Regexp { // begin with a non-empty fixed string (perhaps after ASCII // case-folding). If so, returns the prefix and the sub-regexp that // follows it. - // Callers should expect *prefix, *foldcase and *suffix to be "zeroed" - // regardless of the return value. - bool RequiredPrefix(std::string* prefix, bool* foldcase, - Regexp** suffix); - - // Whether every match of this regexp must be unanchored and - // begin with a non-empty fixed string (perhaps after ASCII - // case-folding). If so, returns the prefix. - // Callers should expect *prefix and *foldcase to be "zeroed" - // regardless of the return value. - bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase); - - // Controls the maximum repeat count permitted by the parser. - // FOR FUZZING ONLY. - static void FUZZING_ONLY_set_maximum_repeat_count(int i); - + // Callers should expect *prefix, *foldcase and *suffix to be "zeroed" + // regardless of the return value. + bool RequiredPrefix(std::string* prefix, bool* foldcase, + Regexp** suffix); + + // Whether every match of this regexp must be unanchored and + // begin with a non-empty fixed string (perhaps after ASCII + // case-folding). If so, returns the prefix. + // Callers should expect *prefix and *foldcase to be "zeroed" + // regardless of the return value. + bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase); + + // Controls the maximum repeat count permitted by the parser. + // FOR FUZZING ONLY. 
+ static void FUZZING_ONLY_set_maximum_repeat_count(int i); + private: // Constructor allocates vectors as appropriate for operator. explicit Regexp(RegexpOp op, ParseFlags parse_flags); @@ -507,7 +507,7 @@ class Regexp { // Simplifies an alternation of literal strings by factoring out // common prefixes. static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); - friend class FactorAlternationImpl; + friend class FactorAlternationImpl; // Is a == b? Only efficient on regexps that have not been through // Simplify yet - the expansion of a kRegexpRepeat will make this @@ -519,7 +519,7 @@ class Regexp { DCHECK(n >= 0 && static_cast<uint16_t>(n) == n); if (n > 1) submany_ = new Regexp*[n]; - nsub_ = static_cast<uint16_t>(n); + nsub_ = static_cast<uint16_t>(n); } // Add Rune to LiteralString @@ -577,7 +577,7 @@ class Regexp { }; struct { // Capture int cap_; - std::string* name_; + std::string* name_; }; struct { // LiteralString int nrunes_; diff --git a/contrib/libs/re2/re2/set.cc b/contrib/libs/re2/re2/set.cc index d847ad1f31..18705663a5 100644 --- a/contrib/libs/re2/re2/set.cc +++ b/contrib/libs/re2/re2/set.cc @@ -5,56 +5,56 @@ #include "re2/set.h" #include <stddef.h> -#include <algorithm> -#include <memory> -#include <utility> +#include <algorithm> +#include <memory> +#include <utility> #include "util/util.h" #include "util/logging.h" -#include "re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" -#include "re2/stringpiece.h" +#include "re2/stringpiece.h" namespace re2 { -RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) - : options_(options), - anchor_(anchor), - compiled_(false), - size_(0) { - options_.set_never_capture(true); // might unblock some optimisations +RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) + : options_(options), + anchor_(anchor), + compiled_(false), + size_(0) { + options_.set_never_capture(true); // might unblock some optimisations } 
RE2::Set::~Set() { - for (size_t i = 0; i < elem_.size(); i++) - elem_[i].second->Decref(); + for (size_t i = 0; i < elem_.size(); i++) + elem_[i].second->Decref(); } -RE2::Set::Set(Set&& other) - : options_(other.options_), - anchor_(other.anchor_), - elem_(std::move(other.elem_)), - compiled_(other.compiled_), - size_(other.size_), - prog_(std::move(other.prog_)) { - other.elem_.clear(); - other.elem_.shrink_to_fit(); - other.compiled_ = false; - other.size_ = 0; - other.prog_.reset(); -} - -RE2::Set& RE2::Set::operator=(Set&& other) { - this->~Set(); - (void) new (this) Set(std::move(other)); - return *this; -} - -int RE2::Set::Add(const StringPiece& pattern, std::string* error) { +RE2::Set::Set(Set&& other) + : options_(other.options_), + anchor_(other.anchor_), + elem_(std::move(other.elem_)), + compiled_(other.compiled_), + size_(other.size_), + prog_(std::move(other.prog_)) { + other.elem_.clear(); + other.elem_.shrink_to_fit(); + other.compiled_ = false; + other.size_ = 0; + other.prog_.reset(); +} + +RE2::Set& RE2::Set::operator=(Set&& other) { + this->~Set(); + (void) new (this) Set(std::move(other)); + return *this; +} + +int RE2::Set::Add(const StringPiece& pattern, std::string* error) { if (compiled_) { - LOG(DFATAL) << "RE2::Set::Add() called after compiling"; + LOG(DFATAL) << "RE2::Set::Add() called after compiling"; return -1; } @@ -71,105 +71,105 @@ int RE2::Set::Add(const StringPiece& pattern, std::string* error) { } // Concatenate with match index and push on vector. 
- int n = static_cast<int>(elem_.size()); + int n = static_cast<int>(elem_.size()); re2::Regexp* m = re2::Regexp::HaveMatch(n, pf); if (re->op() == kRegexpConcat) { int nsub = re->nsub(); - PODArray<re2::Regexp*> sub(nsub + 1); + PODArray<re2::Regexp*> sub(nsub + 1); for (int i = 0; i < nsub; i++) sub[i] = re->sub()[i]->Incref(); sub[nsub] = m; re->Decref(); - re = re2::Regexp::Concat(sub.data(), nsub + 1, pf); + re = re2::Regexp::Concat(sub.data(), nsub + 1, pf); } else { re2::Regexp* sub[2]; sub[0] = re; sub[1] = m; re = re2::Regexp::Concat(sub, 2, pf); } - elem_.emplace_back(std::string(pattern), re); + elem_.emplace_back(std::string(pattern), re); return n; } bool RE2::Set::Compile() { if (compiled_) { - LOG(DFATAL) << "RE2::Set::Compile() called more than once"; + LOG(DFATAL) << "RE2::Set::Compile() called more than once"; return false; } compiled_ = true; - size_ = static_cast<int>(elem_.size()); - - // Sort the elements by their patterns. This is good enough for now - // until we have a Regexp comparison function. (Maybe someday...) - std::sort(elem_.begin(), elem_.end(), - [](const Elem& a, const Elem& b) -> bool { - return a.first < b.first; - }); - - PODArray<re2::Regexp*> sub(size_); - for (int i = 0; i < size_; i++) - sub[i] = elem_[i].second; - elem_.clear(); - elem_.shrink_to_fit(); - + size_ = static_cast<int>(elem_.size()); + + // Sort the elements by their patterns. This is good enough for now + // until we have a Regexp comparison function. (Maybe someday...) 
+ std::sort(elem_.begin(), elem_.end(), + [](const Elem& a, const Elem& b) -> bool { + return a.first < b.first; + }); + + PODArray<re2::Regexp*> sub(size_); + for (int i = 0; i < size_; i++) + sub[i] = elem_[i].second; + elem_.clear(); + elem_.shrink_to_fit(); + Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( options_.ParseFlags()); - re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf); - - prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem())); + re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf); + + prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem())); re->Decref(); - return prog_ != nullptr; -} + return prog_ != nullptr; +} -bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const { - return Match(text, v, NULL); +bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const { + return Match(text, v, NULL); } -bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, - ErrorInfo* error_info) const { +bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, + ErrorInfo* error_info) const { if (!compiled_) { - LOG(DFATAL) << "RE2::Set::Match() called before compiling"; - if (error_info != NULL) - error_info->kind = kNotCompiled; + LOG(DFATAL) << "RE2::Set::Match() called before compiling"; + if (error_info != NULL) + error_info->kind = kNotCompiled; return false; } -#ifdef RE2_HAVE_THREAD_LOCAL - hooks::context = NULL; -#endif - bool dfa_failed = false; - std::unique_ptr<SparseSet> matches; - if (v != NULL) { - matches.reset(new SparseSet(size_)); +#ifdef RE2_HAVE_THREAD_LOCAL + hooks::context = NULL; +#endif + bool dfa_failed = false; + std::unique_ptr<SparseSet> matches; + if (v != NULL) { + matches.reset(new SparseSet(size_)); v->clear(); - } - bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch, - NULL, &dfa_failed, matches.get()); + } + bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch, + NULL, &dfa_failed, 
matches.get()); if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: " - << "program size " << prog_->size() << ", " - << "list count " << prog_->list_count() << ", " - << "bytemap range " << prog_->bytemap_range(); - if (error_info != NULL) - error_info->kind = kOutOfMemory; + LOG(ERROR) << "DFA out of memory: " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); + if (error_info != NULL) + error_info->kind = kOutOfMemory; return false; } - if (ret == false) { - if (error_info != NULL) - error_info->kind = kNoError; + if (ret == false) { + if (error_info != NULL) + error_info->kind = kNoError; return false; } - if (v != NULL) { - if (matches->empty()) { - LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; - if (error_info != NULL) - error_info->kind = kInconsistent; - return false; - } - v->assign(matches->begin(), matches->end()); - } - if (error_info != NULL) - error_info->kind = kNoError; + if (v != NULL) { + if (matches->empty()) { + LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; + if (error_info != NULL) + error_info->kind = kInconsistent; + return false; + } + v->assign(matches->begin(), matches->end()); + } + if (error_info != NULL) + error_info->kind = kNoError; return true; } diff --git a/contrib/libs/re2/re2/set.h b/contrib/libs/re2/re2/set.h index 540791cb34..8d64f30ccd 100644 --- a/contrib/libs/re2/re2/set.h +++ b/contrib/libs/re2/re2/set.h @@ -5,9 +5,9 @@ #ifndef RE2_SET_H_ #define RE2_SET_H_ -#include <memory> +#include <memory> #include <string> -#include <utility> +#include <utility> #include <vector> #include "re2/re2.h" @@ -23,61 +23,61 @@ namespace re2 { // be searched for simultaneously. class RE2::Set { public: - enum ErrorKind { - kNoError = 0, - kNotCompiled, // The set is not compiled. - kOutOfMemory, // The DFA ran out of memory. - kInconsistent, // The result is inconsistent. 
This should never happen. - }; - - struct ErrorInfo { - ErrorKind kind; - }; - + enum ErrorKind { + kNoError = 0, + kNotCompiled, // The set is not compiled. + kOutOfMemory, // The DFA ran out of memory. + kInconsistent, // The result is inconsistent. This should never happen. + }; + + struct ErrorInfo { + ErrorKind kind; + }; + Set(const RE2::Options& options, RE2::Anchor anchor); ~Set(); - // Not copyable. - Set(const Set&) = delete; - Set& operator=(const Set&) = delete; - // Movable. - Set(Set&& other); - Set& operator=(Set&& other); - - // Adds pattern to the set using the options passed to the constructor. - // Returns the index that will identify the regexp in the output of Match(), - // or -1 if the regexp cannot be parsed. + // Not copyable. + Set(const Set&) = delete; + Set& operator=(const Set&) = delete; + // Movable. + Set(Set&& other); + Set& operator=(Set&& other); + + // Adds pattern to the set using the options passed to the constructor. + // Returns the index that will identify the regexp in the output of Match(), + // or -1 if the regexp cannot be parsed. // Indices are assigned in sequential order starting from 0. - // Errors do not increment the index; if error is not NULL, *error will hold - // the error message from the parser. - int Add(const StringPiece& pattern, std::string* error); - - // Compiles the set in preparation for matching. - // Returns false if the compiler runs out of memory. - // Add() must not be called again after Compile(). - // Compile() must be called before Match(). + // Errors do not increment the index; if error is not NULL, *error will hold + // the error message from the parser. + int Add(const StringPiece& pattern, std::string* error); + + // Compiles the set in preparation for matching. + // Returns false if the compiler runs out of memory. + // Add() must not be called again after Compile(). + // Compile() must be called before Match(). 
bool Compile(); - // Returns true if text matches at least one of the regexps in the set. - // Fills v (if not NULL) with the indices of the matching regexps. + // Returns true if text matches at least one of the regexps in the set. + // Fills v (if not NULL) with the indices of the matching regexps. // Callers must not expect v to be sorted. bool Match(const StringPiece& text, std::vector<int>* v) const; - // As above, but populates error_info (if not NULL) when none of the regexps - // in the set matched. This can inform callers when DFA execution fails, for - // example, because they might wish to handle that case differently. - bool Match(const StringPiece& text, std::vector<int>* v, - ErrorInfo* error_info) const; - + // As above, but populates error_info (if not NULL) when none of the regexps + // in the set matched. This can inform callers when DFA execution fails, for + // example, because they might wish to handle that case differently. + bool Match(const StringPiece& text, std::vector<int>* v, + ErrorInfo* error_info) const; + private: - typedef std::pair<std::string, re2::Regexp*> Elem; - + typedef std::pair<std::string, re2::Regexp*> Elem; + RE2::Options options_; RE2::Anchor anchor_; - std::vector<Elem> elem_; + std::vector<Elem> elem_; bool compiled_; - int size_; - std::unique_ptr<re2::Prog> prog_; + int size_; + std::unique_ptr<re2::Prog> prog_; }; } // namespace re2 diff --git a/contrib/libs/re2/re2/simplify.cc b/contrib/libs/re2/re2/simplify.cc index 94b06d1c1a..663d5fcd45 100644 --- a/contrib/libs/re2/re2/simplify.cc +++ b/contrib/libs/re2/re2/simplify.cc @@ -11,7 +11,7 @@ #include "util/util.h" #include "util/logging.h" #include "util/utf.h" -#include "re2/pod_array.h" +#include "re2/pod_array.h" #include "re2/regexp.h" #include "re2/walker-inl.h" @@ -21,7 +21,7 @@ namespace re2 { // string representation of the simplified form. Returns true on success. // Returns false and sets *error (if error != NULL) on error. 
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, - std::string* dst, RegexpStatus* status) { + std::string* dst, RegexpStatus* status) { Regexp* re = Parse(src, flags, status); if (re == NULL) return false; @@ -178,20 +178,20 @@ Regexp* Regexp::Simplify() { CoalesceWalker cw; Regexp* cre = cw.Walk(this, NULL); if (cre == NULL) - return NULL; - if (cw.stopped_early()) { - cre->Decref(); - return NULL; - } + return NULL; + if (cw.stopped_early()) { + cre->Decref(); + return NULL; + } SimplifyWalker sw; Regexp* sre = sw.Walk(cre, NULL); cre->Decref(); - if (sre == NULL) - return NULL; - if (sw.stopped_early()) { - sre->Decref(); - return NULL; - } + if (sre == NULL) + return NULL; + if (sw.stopped_early()) { + sre->Decref(); + return NULL; + } return sre; } @@ -220,10 +220,10 @@ Regexp* CoalesceWalker::Copy(Regexp* re) { } Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; -#endif +#endif return re->Incref(); } @@ -446,10 +446,10 @@ Regexp* SimplifyWalker::Copy(Regexp* re) { } Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Should never be called: we use Walk(), not WalkExponential(). 
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; -#endif +#endif return re->Incref(); } @@ -599,11 +599,11 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, return Regexp::Plus(re->Incref(), f); // General case: x{4,} is xxxx+ - PODArray<Regexp*> nre_subs(min); + PODArray<Regexp*> nre_subs(min); for (int i = 0; i < min-1; i++) nre_subs[i] = re->Incref(); nre_subs[min-1] = Regexp::Plus(re->Incref(), f); - return Regexp::Concat(nre_subs.data(), min, f); + return Regexp::Concat(nre_subs.data(), min, f); } // Special case: (x){0} matches only empty string. @@ -621,10 +621,10 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, // Build leading prefix: xx. Capturing only on the last one. Regexp* nre = NULL; if (min > 0) { - PODArray<Regexp*> nre_subs(min); + PODArray<Regexp*> nre_subs(min); for (int i = 0; i < min; i++) nre_subs[i] = re->Incref(); - nre = Regexp::Concat(nre_subs.data(), min, f); + nre = Regexp::Concat(nre_subs.data(), min, f); } // Build and attach suffix: (x(x(x)?)?)? diff --git a/contrib/libs/re2/re2/sparse_array.h b/contrib/libs/re2/re2/sparse_array.h index 3577d6dc76..09ffe086b7 100644 --- a/contrib/libs/re2/re2/sparse_array.h +++ b/contrib/libs/re2/re2/sparse_array.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_SPARSE_ARRAY_H_ -#define RE2_SPARSE_ARRAY_H_ +#ifndef RE2_SPARSE_ARRAY_H_ +#define RE2_SPARSE_ARRAY_H_ // DESCRIPTION // @@ -55,55 +55,55 @@ // IMPLEMENTATION // -// SparseArray is an array dense_ and an array sparse_ of identical size. -// At any point, the number of elements in the sparse array is size_. +// SparseArray is an array dense_ and an array sparse_ of identical size. +// At any point, the number of elements in the sparse array is size_. 
// -// The array dense_ contains the size_ elements in the sparse array (with +// The array dense_ contains the size_ elements in the sparse array (with // their indices), // in the order that the elements were first inserted. This array is dense: // the size_ pairs are dense_[0] through dense_[size_-1]. // -// The array sparse_ maps from indices in [0,m) to indices in [0,size_). -// For indices present in the array, dense_[sparse_[i]].index_ == i. -// For indices not present in the array, sparse_ can contain any value at all, -// perhaps outside the range [0, size_) but perhaps not. +// The array sparse_ maps from indices in [0,m) to indices in [0,size_). +// For indices present in the array, dense_[sparse_[i]].index_ == i. +// For indices not present in the array, sparse_ can contain any value at all, +// perhaps outside the range [0, size_) but perhaps not. // -// The lax requirement on sparse_ values makes clearing the array very easy: -// set size_ to 0. Lookups are slightly more complicated. -// An index i has a value in the array if and only if: -// sparse_[i] is in [0, size_) AND -// dense_[sparse_[i]].index_ == i. +// The lax requirement on sparse_ values makes clearing the array very easy: +// set size_ to 0. Lookups are slightly more complicated. +// An index i has a value in the array if and only if: +// sparse_[i] is in [0, size_) AND +// dense_[sparse_[i]].index_ == i. // If both these properties hold, only then it is safe to refer to -// dense_[sparse_[i]].value_ +// dense_[sparse_[i]].value_ // as the value associated with index i. // -// To insert a new entry, set sparse_[i] to size_, +// To insert a new entry, set sparse_[i] to size_, // initialize dense_[size_], and then increment size_. // // To make the sparse array as efficient as possible for non-primitive types, // elements may or may not be destroyed when they are deleted from the sparse -// array through a call to resize(). 
They immediately become inaccessible, but -// they are only guaranteed to be destroyed when the SparseArray destructor is -// called. +// array through a call to resize(). They immediately become inaccessible, but +// they are only guaranteed to be destroyed when the SparseArray destructor is +// called. // // A moved-from SparseArray will be empty. -// Doing this simplifies the logic below. -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - +// Doing this simplifies the logic below. +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + #include <assert.h> #include <stdint.h> -#if __has_feature(memory_sanitizer) -#include <sanitizer/msan_interface.h> -#endif +#if __has_feature(memory_sanitizer) +#include <sanitizer/msan_interface.h> +#endif #include <algorithm> #include <memory> #include <utility> -#include "re2/pod_array.h" - +#include "re2/pod_array.h" + namespace re2 { template<typename Value> @@ -116,14 +116,14 @@ class SparseArray { // IndexValue pairs: exposed in SparseArray::iterator. class IndexValue; - typedef IndexValue* iterator; - typedef const IndexValue* const_iterator; + typedef IndexValue* iterator; + typedef const IndexValue* const_iterator; SparseArray(const SparseArray& src); - SparseArray(SparseArray&& src); + SparseArray(SparseArray&& src); SparseArray& operator=(const SparseArray& src); - SparseArray& operator=(SparseArray&& src); + SparseArray& operator=(SparseArray&& src); // Return the number of entries in the array. int size() const { @@ -137,30 +137,30 @@ class SparseArray { // Iterate over the array. iterator begin() { - return dense_.data(); + return dense_.data(); } iterator end() { - return dense_.data() + size_; + return dense_.data() + size_; } const_iterator begin() const { - return dense_.data(); + return dense_.data(); } const_iterator end() const { - return dense_.data() + size_; + return dense_.data() + size_; } // Change the maximum size of the array. // Invalidates all iterators. 
- void resize(int new_max_size); + void resize(int new_max_size); // Return the maximum size of the array. // Indices can be in the range [0, max_size). int max_size() const { - if (dense_.data() != NULL) - return dense_.size(); - else - return 0; + if (dense_.data() != NULL) + return dense_.size(); + else + return 0; } // Clear the array. @@ -183,55 +183,55 @@ class SparseArray { return SetInternal(true, i, v); } - // Set the value at new index i to v. - // Fast but unsafe: only use if has_index(i) is false. - iterator set_new(int i, const Value& v) { - return SetInternal(false, i, v); + // Set the value at new index i to v. + // Fast but unsafe: only use if has_index(i) is false. + iterator set_new(int i, const Value& v) { + return SetInternal(false, i, v); } - // Set the value at index i to v. + // Set the value at index i to v. // Fast but unsafe: only use if has_index(i) is true. iterator set_existing(int i, const Value& v) { return SetExistingInternal(i, v); } - // Get the value at index i. - // Fast but unsafe: only use if has_index(i) is true. - Value& get_existing(int i) { - assert(has_index(i)); - return dense_[sparse_[i]].value_; + // Get the value at index i. + // Fast but unsafe: only use if has_index(i) is true. 
+ Value& get_existing(int i) { + assert(has_index(i)); + return dense_[sparse_[i]].value_; } - const Value& get_existing(int i) const { - assert(has_index(i)); - return dense_[sparse_[i]].value_; + const Value& get_existing(int i) const { + assert(has_index(i)); + return dense_[sparse_[i]].value_; } private: - iterator SetInternal(bool allow_existing, int i, const Value& v) { + iterator SetInternal(bool allow_existing, int i, const Value& v) { DebugCheckInvariants(); - if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { + if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { assert(false && "illegal index"); // Semantically, end() would be better here, but we already know // the user did something stupid, so begin() insulates them from // dereferencing an invalid pointer. return begin(); } - if (!allow_existing) { + if (!allow_existing) { assert(!has_index(i)); create_index(i); } else { if (!has_index(i)) create_index(i); } - return SetExistingInternal(i, v); + return SetExistingInternal(i, v); } - iterator SetExistingInternal(int i, const Value& v) { + iterator SetExistingInternal(int i, const Value& v) { DebugCheckInvariants(); assert(has_index(i)); - dense_[sparse_[i]].value_ = v; + dense_[sparse_[i]].value_ = v; DebugCheckInvariants(); - return dense_.data() + sparse_[i]; + return dense_.data() + sparse_[i]; } // Add the index i to the array. @@ -246,20 +246,20 @@ class SparseArray { // and at the beginning and end of all public non-const member functions. void DebugCheckInvariants() const; - // Initializes memory for elements [min, max). - void MaybeInitializeMemory(int min, int max) { -#if __has_feature(memory_sanitizer) - __msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]); -#elif defined(RE2_ON_VALGRIND) - for (int i = min; i < max; i++) { - sparse_[i] = 0xababababU; - } -#endif - } - + // Initializes memory for elements [min, max). 
+ void MaybeInitializeMemory(int min, int max) { +#if __has_feature(memory_sanitizer) + __msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]); +#elif defined(RE2_ON_VALGRIND) + for (int i = min; i < max; i++) { + sparse_[i] = 0xababababU; + } +#endif + } + int size_ = 0; - PODArray<int> sparse_; - PODArray<IndexValue> dense_; + PODArray<int> sparse_; + PODArray<IndexValue> dense_; }; template<typename Value> @@ -268,38 +268,38 @@ SparseArray<Value>::SparseArray() = default; template<typename Value> SparseArray<Value>::SparseArray(const SparseArray& src) : size_(src.size_), - sparse_(src.max_size()), - dense_(src.max_size()) { - std::copy_n(src.sparse_.data(), src.max_size(), sparse_.data()); - std::copy_n(src.dense_.data(), src.max_size(), dense_.data()); + sparse_(src.max_size()), + dense_(src.max_size()) { + std::copy_n(src.sparse_.data(), src.max_size(), sparse_.data()); + std::copy_n(src.dense_.data(), src.max_size(), dense_.data()); } template<typename Value> -SparseArray<Value>::SparseArray(SparseArray&& src) +SparseArray<Value>::SparseArray(SparseArray&& src) : size_(src.size_), - sparse_(std::move(src.sparse_)), + sparse_(std::move(src.sparse_)), dense_(std::move(src.dense_)) { src.size_ = 0; } template<typename Value> SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) { - // Construct these first for exception safety. - PODArray<int> a(src.max_size()); - PODArray<IndexValue> b(src.max_size()); - + // Construct these first for exception safety. 
+ PODArray<int> a(src.max_size()); + PODArray<IndexValue> b(src.max_size()); + size_ = src.size_; - sparse_ = std::move(a); - dense_ = std::move(b); - std::copy_n(src.sparse_.data(), src.max_size(), sparse_.data()); - std::copy_n(src.dense_.data(), src.max_size(), dense_.data()); + sparse_ = std::move(a); + dense_ = std::move(b); + std::copy_n(src.sparse_.data(), src.max_size(), sparse_.data()); + std::copy_n(src.dense_.data(), src.max_size(), dense_.data()); return *this; } template<typename Value> -SparseArray<Value>& SparseArray<Value>::operator=(SparseArray&& src) { +SparseArray<Value>& SparseArray<Value>::operator=(SparseArray&& src) { size_ = src.size_; - sparse_ = std::move(src.sparse_); + sparse_ = std::move(src.sparse_); dense_ = std::move(src.dense_); src.size_ = 0; return *this; @@ -310,37 +310,37 @@ template<typename Value> class SparseArray<Value>::IndexValue { public: int index() const { return index_; } - Value& value() { return value_; } - const Value& value() const { return value_; } + Value& value() { return value_; } + const Value& value() const { return value_; } private: - friend class SparseArray; + friend class SparseArray; int index_; - Value value_; + Value value_; }; // Change the maximum size of the array. // Invalidates all iterators. template<typename Value> -void SparseArray<Value>::resize(int new_max_size) { +void SparseArray<Value>::resize(int new_max_size) { DebugCheckInvariants(); - if (new_max_size > max_size()) { - const int old_max_size = max_size(); - - // Construct these first for exception safety. - PODArray<int> a(new_max_size); - PODArray<IndexValue> b(new_max_size); - - std::copy_n(sparse_.data(), old_max_size, a.data()); - std::copy_n(dense_.data(), old_max_size, b.data()); - - sparse_ = std::move(a); - dense_ = std::move(b); - - MaybeInitializeMemory(old_max_size, new_max_size); + if (new_max_size > max_size()) { + const int old_max_size = max_size(); + + // Construct these first for exception safety. 
+ PODArray<int> a(new_max_size); + PODArray<IndexValue> b(new_max_size); + + std::copy_n(sparse_.data(), old_max_size, a.data()); + std::copy_n(dense_.data(), old_max_size, b.data()); + + sparse_ = std::move(a); + dense_ = std::move(b); + + MaybeInitializeMemory(old_max_size, new_max_size); } - if (size_ > new_max_size) - size_ = new_max_size; + if (size_ > new_max_size) + size_ = new_max_size; DebugCheckInvariants(); } @@ -348,27 +348,27 @@ void SparseArray<Value>::resize(int new_max_size) { template<typename Value> bool SparseArray<Value>::has_index(int i) const { assert(i >= 0); - assert(i < max_size()); - if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { + assert(i < max_size()); + if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { return false; } - // Unsigned comparison avoids checking sparse_[i] < 0. - return (uint32_t)sparse_[i] < (uint32_t)size_ && - dense_[sparse_[i]].index_ == i; + // Unsigned comparison avoids checking sparse_[i] < 0. + return (uint32_t)sparse_[i] < (uint32_t)size_ && + dense_[sparse_[i]].index_ == i; } template<typename Value> void SparseArray<Value>::create_index(int i) { assert(!has_index(i)); - assert(size_ < max_size()); - sparse_[i] = size_; + assert(size_ < max_size()); + sparse_[i] = size_; dense_[size_].index_ = i; size_++; } -template<typename Value> SparseArray<Value>::SparseArray(int max_size) : - sparse_(max_size), dense_(max_size) { - MaybeInitializeMemory(size_, max_size); +template<typename Value> SparseArray<Value>::SparseArray(int max_size) : + sparse_(max_size), dense_(max_size) { + MaybeInitializeMemory(size_, max_size); DebugCheckInvariants(); } @@ -378,7 +378,7 @@ template<typename Value> SparseArray<Value>::~SparseArray() { template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const { assert(0 <= size_); - assert(size_ <= max_size()); + assert(size_ <= max_size()); } // Comparison function for sorting. 
@@ -389,4 +389,4 @@ template<typename Value> bool SparseArray<Value>::less(const IndexValue& a, } // namespace re2 -#endif // RE2_SPARSE_ARRAY_H_ +#endif // RE2_SPARSE_ARRAY_H_ diff --git a/contrib/libs/re2/re2/sparse_set.h b/contrib/libs/re2/re2/sparse_set.h index 88b22de332..06ed88d81b 100644 --- a/contrib/libs/re2/re2/sparse_set.h +++ b/contrib/libs/re2/re2/sparse_set.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_SPARSE_SET_H_ -#define RE2_SPARSE_SET_H_ +#ifndef RE2_SPARSE_SET_H_ +#define RE2_SPARSE_SET_H_ // DESCRIPTION // @@ -47,22 +47,22 @@ // // See sparse_array.h for implementation details. -// Doing this simplifies the logic below. -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - +// Doing this simplifies the logic below. +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + #include <assert.h> #include <stdint.h> -#if __has_feature(memory_sanitizer) -#include <sanitizer/msan_interface.h> -#endif +#if __has_feature(memory_sanitizer) +#include <sanitizer/msan_interface.h> +#endif #include <algorithm> #include <memory> #include <utility> -#include "re2/pod_array.h" - +#include "re2/pod_array.h" + namespace re2 { template<typename Value> @@ -72,8 +72,8 @@ class SparseSetT { explicit SparseSetT(int max_size); ~SparseSetT(); - typedef int* iterator; - typedef const int* const_iterator; + typedef int* iterator; + typedef const int* const_iterator; // Return the number of entries in the set. int size() const { @@ -87,30 +87,30 @@ class SparseSetT { // Iterate over the set. iterator begin() { - return dense_.data(); + return dense_.data(); } iterator end() { - return dense_.data() + size_; + return dense_.data() + size_; } const_iterator begin() const { - return dense_.data(); + return dense_.data(); } const_iterator end() const { - return dense_.data() + size_; + return dense_.data() + size_; } // Change the maximum size of the set. 
// Invalidates all iterators. - void resize(int new_max_size); + void resize(int new_max_size); // Return the maximum size of the set. // Indices can be in the range [0, max_size). int max_size() const { - if (dense_.data() != NULL) - return dense_.size(); - else - return 0; + if (dense_.data() != NULL) + return dense_.size(); + else + return 0; } // Clear the set. @@ -142,7 +142,7 @@ class SparseSetT { private: iterator InsertInternal(bool allow_existing, int i) { DebugCheckInvariants(); - if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { + if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { assert(false && "illegal index"); // Semantically, end() would be better here, but we already know // the user did something stupid, so begin() insulates them from @@ -157,7 +157,7 @@ class SparseSetT { create_index(i); } DebugCheckInvariants(); - return dense_.data() + sparse_[i]; + return dense_.data() + sparse_[i]; } // Add the index i to the set. @@ -171,20 +171,20 @@ class SparseSetT { // and at the beginning and end of all public non-const member functions. void DebugCheckInvariants() const; - // Initializes memory for elements [min, max). - void MaybeInitializeMemory(int min, int max) { -#if __has_feature(memory_sanitizer) - __msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]); -#elif defined(RE2_ON_VALGRIND) - for (int i = min; i < max; i++) { - sparse_[i] = 0xababababU; - } -#endif - } - + // Initializes memory for elements [min, max). 
+ void MaybeInitializeMemory(int min, int max) { +#if __has_feature(memory_sanitizer) + __msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]); +#elif defined(RE2_ON_VALGRIND) + for (int i = min; i < max; i++) { + sparse_[i] = 0xababababU; + } +#endif + } + int size_ = 0; - PODArray<int> sparse_; - PODArray<int> dense_; + PODArray<int> sparse_; + PODArray<int> dense_; }; template<typename Value> @@ -193,25 +193,25 @@ SparseSetT<Value>::SparseSetT() = default; // Change the maximum size of the set. // Invalidates all iterators. template<typename Value> -void SparseSetT<Value>::resize(int new_max_size) { +void SparseSetT<Value>::resize(int new_max_size) { DebugCheckInvariants(); - if (new_max_size > max_size()) { - const int old_max_size = max_size(); - - // Construct these first for exception safety. - PODArray<int> a(new_max_size); - PODArray<int> b(new_max_size); - - std::copy_n(sparse_.data(), old_max_size, a.data()); - std::copy_n(dense_.data(), old_max_size, b.data()); - - sparse_ = std::move(a); - dense_ = std::move(b); - - MaybeInitializeMemory(old_max_size, new_max_size); + if (new_max_size > max_size()) { + const int old_max_size = max_size(); + + // Construct these first for exception safety. 
+ PODArray<int> a(new_max_size); + PODArray<int> b(new_max_size); + + std::copy_n(sparse_.data(), old_max_size, a.data()); + std::copy_n(dense_.data(), old_max_size, b.data()); + + sparse_ = std::move(a); + dense_ = std::move(b); + + MaybeInitializeMemory(old_max_size, new_max_size); } - if (size_ > new_max_size) - size_ = new_max_size; + if (size_ > new_max_size) + size_ = new_max_size; DebugCheckInvariants(); } @@ -219,27 +219,27 @@ void SparseSetT<Value>::resize(int new_max_size) { template<typename Value> bool SparseSetT<Value>::contains(int i) const { assert(i >= 0); - assert(i < max_size()); - if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { + assert(i < max_size()); + if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { return false; } - // Unsigned comparison avoids checking sparse_[i] < 0. - return (uint32_t)sparse_[i] < (uint32_t)size_ && - dense_[sparse_[i]] == i; + // Unsigned comparison avoids checking sparse_[i] < 0. + return (uint32_t)sparse_[i] < (uint32_t)size_ && + dense_[sparse_[i]] == i; } template<typename Value> void SparseSetT<Value>::create_index(int i) { assert(!contains(i)); - assert(size_ < max_size()); - sparse_[i] = size_; + assert(size_ < max_size()); + sparse_[i] = size_; dense_[size_] = i; size_++; } -template<typename Value> SparseSetT<Value>::SparseSetT(int max_size) : - sparse_(max_size), dense_(max_size) { - MaybeInitializeMemory(size_, max_size); +template<typename Value> SparseSetT<Value>::SparseSetT(int max_size) : + sparse_(max_size), dense_(max_size) { + MaybeInitializeMemory(size_, max_size); DebugCheckInvariants(); } @@ -249,7 +249,7 @@ template<typename Value> SparseSetT<Value>::~SparseSetT() { template<typename Value> void SparseSetT<Value>::DebugCheckInvariants() const { assert(0 <= size_); - assert(size_ <= max_size()); + assert(size_ <= max_size()); } // Comparison function for sorting. 
@@ -261,4 +261,4 @@ typedef SparseSetT<void> SparseSet; } // namespace re2 -#endif // RE2_SPARSE_SET_H_ +#endif // RE2_SPARSE_SET_H_ diff --git a/contrib/libs/re2/re2/stringpiece.h b/contrib/libs/re2/re2/stringpiece.h index 1f53a7f5e4..ef73683401 100644 --- a/contrib/libs/re2/re2/stringpiece.h +++ b/contrib/libs/re2/re2/stringpiece.h @@ -19,20 +19,20 @@ // // Arghh! I wish C++ literals were "string". -// Doing this simplifies the logic below. -#ifndef __has_include -#define __has_include(x) 0 -#endif - +// Doing this simplifies the logic below. +#ifndef __has_include +#define __has_include(x) 0 +#endif + #include <stddef.h> #include <string.h> #include <algorithm> #include <iosfwd> #include <iterator> #include <string> -#if __has_include(<string_view>) && __cplusplus >= 201703L -#include <string_view> -#endif +#if __has_include(<string_view>) && __cplusplus >= 201703L +#include <string_view> +#endif #if defined(ARCADIA_ROOT) #include <util/generic/string.h> #endif @@ -41,7 +41,7 @@ namespace re2 { class StringPiece { public: - typedef std::char_traits<char> traits_type; + typedef std::char_traits<char> traits_type; typedef char value_type; typedef char* pointer; typedef const char* const_pointer; @@ -60,10 +60,10 @@ class StringPiece { // expected. 
StringPiece() : data_(NULL), size_(0) {} -#if __has_include(<string_view>) && __cplusplus >= 201703L - StringPiece(const std::string_view& str) - : data_(str.data()), size_(str.size()) {} -#endif +#if __has_include(<string_view>) && __cplusplus >= 201703L + StringPiece(const std::string_view& str) + : data_(str.data()), size_(str.size()) {} +#endif StringPiece(const std::string& str) : data_(str.data()), size_(str.size()) {} StringPiece(const char* str) @@ -71,8 +71,8 @@ class StringPiece { StringPiece(const char* str, size_type len) : data_(str), size_(len) {} #if defined(ARCADIA_ROOT) - StringPiece(const TString& str) - : StringPiece(str.data(), str.size()) {} + StringPiece(const TString& str) + : StringPiece(str.data(), str.size()) {} #endif const_iterator begin() const { return data_; } @@ -110,13 +110,13 @@ class StringPiece { size_ = len; } - // Converts to `std::basic_string`. - template <typename A> - explicit operator std::basic_string<char, traits_type, A>() const { - if (!data_) return {}; - return std::basic_string<char, traits_type, A>(data_, size_); - } - + // Converts to `std::basic_string`. + template <typename A> + explicit operator std::basic_string<char, traits_type, A>() const { + if (!data_) return {}; + return std::basic_string<char, traits_type, A>(data_, size_); + } + std::string as_string() const { return std::string(data_, size_); } diff --git a/contrib/libs/re2/re2/testing/backtrack.cc b/contrib/libs/re2/re2/testing/backtrack.cc index cc6253ddf7..920a4534dc 100644 --- a/contrib/libs/re2/re2/testing/backtrack.cc +++ b/contrib/libs/re2/re2/testing/backtrack.cc @@ -1,275 +1,275 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -// Tested by search_test.cc, exhaustive_test.cc, tester.cc -// -// Prog::UnsafeSearchBacktrack is a backtracking regular expression search, -// except that it remembers where it has been, trading a lot of -// memory for a lot of time. It exists only for testing purposes. -// -// Let me repeat that. -// -// THIS CODE SHOULD NEVER BE USED IN PRODUCTION: -// - It uses a ton of memory. -// - It uses a ton of stack. -// - It uses CHECK and LOG(FATAL). -// - It implements unanchored search by repeated anchored search. -// -// On the other hand, it is very simple and a good reference -// implementation for the more complicated regexp packages. -// -// In BUILD, this file is linked into the ":testing" library, -// not the main library, in order to make it harder to pick up -// accidentally. - -#include <stddef.h> -#include <stdint.h> -#include <string.h> - -#include "util/util.h" -#include "util/logging.h" -#include "re2/pod_array.h" -#include "re2/prog.h" -#include "re2/regexp.h" - -namespace re2 { - -// Backtracker holds the state for a backtracking search. -// -// Excluding the search parameters, the main search state -// is just the "capture registers", which record, for the -// current execution, the string position at which each -// parenthesis was passed. cap_[0] and cap_[1] are the -// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc. -// -// To avoid infinite loops during backtracking on expressions -// like (a*)*, the visited_[] bitmap marks the (state, string-position) -// pairs that have already been explored and are thus not worth -// re-exploring if we get there via another path. Modern backtracking -// libraries engineer their program representation differently, to make -// such infinite loops possible to avoid without keeping a giant visited_ -// bitmap, but visited_ works fine for a reference implementation -// and it has the nice benefit of making the search run in linear time. 
-class Backtracker { - public: - explicit Backtracker(Prog* prog); - - bool Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch); - - private: - // Explores from instruction id at string position p looking for a match. - // Returns true if found (so that caller can stop trying other possibilities). - bool Visit(int id, const char* p); - - // Tries instruction id at string position p. - // Returns true if a match is found. - bool Try(int id, const char* p); - - // Search parameters - Prog* prog_; // program being run - StringPiece text_; // text being searched - StringPiece context_; // greater context of text being searched - bool anchored_; // whether search is anchored at text.begin() - bool longest_; // whether search wants leftmost-longest match - bool endmatch_; // whether search must end at text.end() - StringPiece *submatch_; // submatches to fill in - int nsubmatch_; // # of submatches to fill in - - // Search state - const char* cap_[64]; // capture registers - PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited - - Backtracker(const Backtracker&) = delete; - Backtracker& operator=(const Backtracker&) = delete; -}; - -Backtracker::Backtracker(Prog* prog) - : prog_(prog), - anchored_(false), - longest_(false), - endmatch_(false), - submatch_(NULL), - nsubmatch_(0) { -} - -// Runs a backtracking search. -bool Backtracker::Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch) { - text_ = text; - context_ = context; - if (context_.data() == NULL) - context_ = text; +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// Tested by search_test.cc, exhaustive_test.cc, tester.cc +// +// Prog::UnsafeSearchBacktrack is a backtracking regular expression search, +// except that it remembers where it has been, trading a lot of +// memory for a lot of time. It exists only for testing purposes. +// +// Let me repeat that. +// +// THIS CODE SHOULD NEVER BE USED IN PRODUCTION: +// - It uses a ton of memory. +// - It uses a ton of stack. +// - It uses CHECK and LOG(FATAL). +// - It implements unanchored search by repeated anchored search. +// +// On the other hand, it is very simple and a good reference +// implementation for the more complicated regexp packages. +// +// In BUILD, this file is linked into the ":testing" library, +// not the main library, in order to make it harder to pick up +// accidentally. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "util/util.h" +#include "util/logging.h" +#include "re2/pod_array.h" +#include "re2/prog.h" +#include "re2/regexp.h" + +namespace re2 { + +// Backtracker holds the state for a backtracking search. +// +// Excluding the search parameters, the main search state +// is just the "capture registers", which record, for the +// current execution, the string position at which each +// parenthesis was passed. cap_[0] and cap_[1] are the +// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc. +// +// To avoid infinite loops during backtracking on expressions +// like (a*)*, the visited_[] bitmap marks the (state, string-position) +// pairs that have already been explored and are thus not worth +// re-exploring if we get there via another path. Modern backtracking +// libraries engineer their program representation differently, to make +// such infinite loops possible to avoid without keeping a giant visited_ +// bitmap, but visited_ works fine for a reference implementation +// and it has the nice benefit of making the search run in linear time. 
+class Backtracker { + public: + explicit Backtracker(Prog* prog); + + bool Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch); + + private: + // Explores from instruction id at string position p looking for a match. + // Returns true if found (so that caller can stop trying other possibilities). + bool Visit(int id, const char* p); + + // Tries instruction id at string position p. + // Returns true if a match is found. + bool Try(int id, const char* p); + + // Search parameters + Prog* prog_; // program being run + StringPiece text_; // text being searched + StringPiece context_; // greater context of text being searched + bool anchored_; // whether search is anchored at text.begin() + bool longest_; // whether search wants leftmost-longest match + bool endmatch_; // whether search must end at text.end() + StringPiece *submatch_; // submatches to fill in + int nsubmatch_; // # of submatches to fill in + + // Search state + const char* cap_[64]; // capture registers + PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited + + Backtracker(const Backtracker&) = delete; + Backtracker& operator=(const Backtracker&) = delete; +}; + +Backtracker::Backtracker(Prog* prog) + : prog_(prog), + anchored_(false), + longest_(false), + endmatch_(false), + submatch_(NULL), + nsubmatch_(0) { +} + +// Runs a backtracking search. 
+bool Backtracker::Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch) { + text_ = text; + context_ = context; + if (context_.data() == NULL) + context_ = text; if (prog_->anchor_start() && BeginPtr(text) > BeginPtr(context_)) - return false; + return false; if (prog_->anchor_end() && EndPtr(text) < EndPtr(context_)) - return false; - anchored_ = anchored | prog_->anchor_start(); - longest_ = longest | prog_->anchor_end(); - endmatch_ = prog_->anchor_end(); - submatch_ = submatch; - nsubmatch_ = nsubmatch; - CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_))); - memset(cap_, 0, sizeof cap_); - - // We use submatch_[0] for our own bookkeeping, - // so it had better exist. - StringPiece sp0; - if (nsubmatch < 1) { - submatch_ = &sp0; - nsubmatch_ = 1; - } - submatch_[0] = StringPiece(); - - // Allocate new visited_ bitmap -- size is proportional - // to text, so have to reallocate on each call to Search. - int nvisited = prog_->size() * static_cast<int>(text.size()+1); - nvisited = (nvisited + 31) / 32; - visited_ = PODArray<uint32_t>(nvisited); - memset(visited_.data(), 0, nvisited*sizeof visited_[0]); - - // Anchored search must start at text.begin(). - if (anchored_) { - cap_[0] = text.data(); - return Visit(prog_->start(), text.data()); - } - - // Unanchored search, starting from each possible text position. - // Notice that we have to try the empty string at the end of - // the text, so the loop condition is p <= text.end(), not p < text.end(). - for (const char* p = text.data(); p <= text.data() + text.size(); p++) { - cap_[0] = p; - if (Visit(prog_->start(), p)) // Match must be leftmost; done. - return true; - // Avoid invoking undefined behavior (arithmetic on a null pointer) - // by simply not continuing the loop. - if (p == NULL) - break; - } - return false; -} - -// Explores from instruction id at string position p looking for a match. 
-// Return true if found (so that caller can stop trying other possibilities). -bool Backtracker::Visit(int id, const char* p) { - // Check bitmap. If we've already explored from here, - // either it didn't match or it did but we're hoping for a better match. - // Either way, don't go down that road again. - CHECK(p <= text_.data() + text_.size()); - int n = id * static_cast<int>(text_.size()+1) + - static_cast<int>(p-text_.data()); - CHECK_LT(n/32, visited_.size()); - if (visited_[n/32] & (1 << (n&31))) - return false; - visited_[n/32] |= 1 << (n&31); - - Prog::Inst* ip = prog_->inst(id); - if (Try(id, p)) { - if (longest_ && !ip->last()) - Visit(id+1, p); - return true; - } - if (!ip->last()) - return Visit(id+1, p); - return false; -} - -// Tries instruction id at string position p. -// Returns true if a match is found. -bool Backtracker::Try(int id, const char* p) { - // Pick out byte at current position. If at end of string, - // have to explore in hope of finishing a match. Use impossible byte -1. - int c = -1; - if (p < text_.data() + text_.size()) - c = *p & 0xFF; - - Prog::Inst* ip = prog_->inst(id); - switch (ip->opcode()) { - default: - LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode(); - return false; // not reached - - case kInstAltMatch: - // Ignored. - return false; - - case kInstByteRange: - if (ip->Matches(c)) - return Visit(ip->out(), p+1); - return false; - - case kInstCapture: - if (0 <= ip->cap() && - ip->cap() < static_cast<int>(arraysize(cap_))) { - // Capture p to register, but save old value. - const char* q = cap_[ip->cap()]; - cap_[ip->cap()] = p; - bool ret = Visit(ip->out(), p); - // Restore old value as we backtrack. - cap_[ip->cap()] = q; - return ret; - } - return Visit(ip->out(), p); - - case kInstEmptyWidth: - if (ip->empty() & ~Prog::EmptyFlags(context_, p)) - return false; - return Visit(ip->out(), p); - - case kInstNop: - return Visit(ip->out(), p); - - case kInstMatch: - // We found a match. 
If it's the best so far, record the - // parameters in the caller's submatch_ array. - if (endmatch_ && p != context_.data() + context_.size()) - return false; - cap_[1] = p; - if (submatch_[0].data() == NULL || - (longest_ && p > submatch_[0].data() + submatch_[0].size())) { - // First match so far - or better match. - for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = StringPiece( - cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i])); - } - return true; - - case kInstFail: - return false; - } -} - -// Runs a backtracking search. -bool Prog::UnsafeSearchBacktrack(const StringPiece& text, - const StringPiece& context, - Anchor anchor, - MatchKind kind, - StringPiece* match, - int nmatch) { - // If full match, we ask for an anchored longest match - // and then check that match[0] == text. - // So make sure match[0] exists. - StringPiece sp0; - if (kind == kFullMatch) { - anchor = kAnchored; - if (nmatch < 1) { - match = &sp0; - nmatch = 1; - } - } - - // Run the search. - Backtracker b(this); - bool anchored = anchor == kAnchored; - bool longest = kind != kFirstMatch; - if (!b.Search(text, context, anchored, longest, match, nmatch)) - return false; + return false; + anchored_ = anchored | prog_->anchor_start(); + longest_ = longest | prog_->anchor_end(); + endmatch_ = prog_->anchor_end(); + submatch_ = submatch; + nsubmatch_ = nsubmatch; + CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_))); + memset(cap_, 0, sizeof cap_); + + // We use submatch_[0] for our own bookkeeping, + // so it had better exist. + StringPiece sp0; + if (nsubmatch < 1) { + submatch_ = &sp0; + nsubmatch_ = 1; + } + submatch_[0] = StringPiece(); + + // Allocate new visited_ bitmap -- size is proportional + // to text, so have to reallocate on each call to Search. 
+ int nvisited = prog_->size() * static_cast<int>(text.size()+1); + nvisited = (nvisited + 31) / 32; + visited_ = PODArray<uint32_t>(nvisited); + memset(visited_.data(), 0, nvisited*sizeof visited_[0]); + + // Anchored search must start at text.begin(). + if (anchored_) { + cap_[0] = text.data(); + return Visit(prog_->start(), text.data()); + } + + // Unanchored search, starting from each possible text position. + // Notice that we have to try the empty string at the end of + // the text, so the loop condition is p <= text.end(), not p < text.end(). + for (const char* p = text.data(); p <= text.data() + text.size(); p++) { + cap_[0] = p; + if (Visit(prog_->start(), p)) // Match must be leftmost; done. + return true; + // Avoid invoking undefined behavior (arithmetic on a null pointer) + // by simply not continuing the loop. + if (p == NULL) + break; + } + return false; +} + +// Explores from instruction id at string position p looking for a match. +// Return true if found (so that caller can stop trying other possibilities). +bool Backtracker::Visit(int id, const char* p) { + // Check bitmap. If we've already explored from here, + // either it didn't match or it did but we're hoping for a better match. + // Either way, don't go down that road again. + CHECK(p <= text_.data() + text_.size()); + int n = id * static_cast<int>(text_.size()+1) + + static_cast<int>(p-text_.data()); + CHECK_LT(n/32, visited_.size()); + if (visited_[n/32] & (1 << (n&31))) + return false; + visited_[n/32] |= 1 << (n&31); + + Prog::Inst* ip = prog_->inst(id); + if (Try(id, p)) { + if (longest_ && !ip->last()) + Visit(id+1, p); + return true; + } + if (!ip->last()) + return Visit(id+1, p); + return false; +} + +// Tries instruction id at string position p. +// Returns true if a match is found. +bool Backtracker::Try(int id, const char* p) { + // Pick out byte at current position. If at end of string, + // have to explore in hope of finishing a match. Use impossible byte -1. 
+ int c = -1; + if (p < text_.data() + text_.size()) + c = *p & 0xFF; + + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + default: + LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode(); + return false; // not reached + + case kInstAltMatch: + // Ignored. + return false; + + case kInstByteRange: + if (ip->Matches(c)) + return Visit(ip->out(), p+1); + return false; + + case kInstCapture: + if (0 <= ip->cap() && + ip->cap() < static_cast<int>(arraysize(cap_))) { + // Capture p to register, but save old value. + const char* q = cap_[ip->cap()]; + cap_[ip->cap()] = p; + bool ret = Visit(ip->out(), p); + // Restore old value as we backtrack. + cap_[ip->cap()] = q; + return ret; + } + return Visit(ip->out(), p); + + case kInstEmptyWidth: + if (ip->empty() & ~Prog::EmptyFlags(context_, p)) + return false; + return Visit(ip->out(), p); + + case kInstNop: + return Visit(ip->out(), p); + + case kInstMatch: + // We found a match. If it's the best so far, record the + // parameters in the caller's submatch_ array. + if (endmatch_ && p != context_.data() + context_.size()) + return false; + cap_[1] = p; + if (submatch_[0].data() == NULL || + (longest_ && p > submatch_[0].data() + submatch_[0].size())) { + // First match so far - or better match. + for (int i = 0; i < nsubmatch_; i++) + submatch_[i] = StringPiece( + cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i])); + } + return true; + + case kInstFail: + return false; + } +} + +// Runs a backtracking search. +bool Prog::UnsafeSearchBacktrack(const StringPiece& text, + const StringPiece& context, + Anchor anchor, + MatchKind kind, + StringPiece* match, + int nmatch) { + // If full match, we ask for an anchored longest match + // and then check that match[0] == text. + // So make sure match[0] exists. + StringPiece sp0; + if (kind == kFullMatch) { + anchor = kAnchored; + if (nmatch < 1) { + match = &sp0; + nmatch = 1; + } + } + + // Run the search. 
+ Backtracker b(this); + bool anchored = anchor == kAnchored; + bool longest = kind != kFirstMatch; + if (!b.Search(text, context, anchored, longest, match, nmatch)) + return false; if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) - return false; - return true; -} - -} // namespace re2 + return false; + return true; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/charclass_test.cc b/contrib/libs/re2/re2/testing/charclass_test.cc index 0af75ba8d1..9a8b7ac6a0 100644 --- a/contrib/libs/re2/re2/testing/charclass_test.cc +++ b/contrib/libs/re2/re2/testing/charclass_test.cc @@ -1,226 +1,226 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Test character class manipulations. - -#include <stdio.h> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/utf.h" -#include "re2/regexp.h" - -namespace re2 { - -struct CCTest { - struct { - Rune lo; - Rune hi; - } add[10]; - int remove; - struct { - Rune lo; - Rune hi; - } final[10]; -}; - -static CCTest tests[] = { - { { { 10, 20 }, {-1} }, -1, - { { 10, 20 }, {-1} } }, - - { { { 10, 20 }, { 20, 30 }, {-1} }, -1, - { { 10, 30 }, {-1} } }, - - { { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1, - { { 10, 40 }, {-1} } }, - - { { { 0, 50 }, { 20, 30 }, {-1} }, -1, - { { 0, 50 }, {-1} } }, - - { { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1, - { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, - - { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1, - { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, - - { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1, - { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, - - { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1, - { { 5, 25 }, {-1} } }, - - { { { 13, 14 
}, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1, - { { 10, 23 }, {-1} } }, - - // These check boundary cases during negation. - { { { 0, Runemax }, {-1} }, -1, - { { 0, Runemax }, {-1} } }, - - { { { 0, 50 }, {-1} }, -1, - { { 0, 50 }, {-1} } }, - - { { { 50, Runemax }, {-1} }, -1, - { { 50, Runemax }, {-1} } }, - - // Check RemoveAbove. - { { { 50, Runemax }, {-1} }, 255, - { { 50, 255 }, {-1} } }, - - { { { 50, Runemax }, {-1} }, 65535, - { { 50, 65535 }, {-1} } }, - - { { { 50, Runemax }, {-1} }, Runemax, - { { 50, Runemax }, {-1} } }, - - { { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255, - { { 50, 60 }, { 250, 255 }, {-1} } }, - - { { { 50, 60 }, {-1} }, 255, - { { 50, 60 }, {-1} } }, - - { { { 350, 360 }, {-1} }, 255, - { {-1} } }, - - { { {-1} }, 255, - { {-1} } }, -}; - -template <typename CharClass> -static void Broke(const char *desc, const CCTest* t, CharClass* cc) { - if (t == NULL) { - printf("\t%s:", desc); - } else { - printf("\n"); - printf("CharClass added: [%s]", desc); - for (int k = 0; t->add[k].lo >= 0; k++) - printf(" %d-%d", t->add[k].lo, t->add[k].hi); - printf("\n"); - if (t->remove >= 0) - printf("Removed > %d\n", t->remove); - printf("\twant:"); - for (int k = 0; t->final[k].lo >= 0; k++) - printf(" %d-%d", t->final[k].lo, t->final[k].hi); - printf("\n"); - printf("\thave:"); - } - - for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it) - printf(" %d-%d", it->lo, it->hi); - printf("\n"); -} - -bool ShouldContain(CCTest *t, int x) { - for (int j = 0; t->final[j].lo >= 0; j++) - if (t->final[j].lo <= x && x <= t->final[j].hi) - return true; - return false; -} - -// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder. 
- -CharClass* Negate(CharClass *cc) { - return cc->Negate(); -} - -void Delete(CharClass* cc) { - cc->Delete(); -} - -CharClassBuilder* Negate(CharClassBuilder* cc) { - CharClassBuilder* ncc = cc->Copy(); - ncc->Negate(); - return ncc; -} - -void Delete(CharClassBuilder* cc) { - delete cc; -} - -template <typename CharClass> -bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { - typename CharClass::iterator it = cc->begin(); - int size = 0; - for (int j = 0; t->final[j].lo >= 0; j++, ++it) { - if (it == cc->end() || - it->lo != t->final[j].lo || - it->hi != t->final[j].hi) { - Broke(desc, t, cc); - return false; - } - size += it->hi - it->lo + 1; - } - if (it != cc->end()) { - Broke(desc, t, cc); - return false; - } - if (cc->size() != size) { - Broke(desc, t, cc); - printf("wrong size: want %d have %d\n", size, cc->size()); - return false; - } - - for (int j = 0; j < 101; j++) { - if (j == 100) - j = Runemax; - if (ShouldContain(t, j) != cc->Contains(j)) { - Broke(desc, t, cc); - printf("want contains(%d)=%d, got %d\n", - j, ShouldContain(t, j), cc->Contains(j)); - return false; - } - } - - CharClass* ncc = Negate(cc); - for (int j = 0; j < 101; j++) { - if (j == 100) - j = Runemax; - if (ShouldContain(t, j) == ncc->Contains(j)) { - Broke(desc, t, cc); - Broke("ncc", NULL, ncc); - printf("want ncc contains(%d)!=%d, got %d\n", - j, ShouldContain(t, j), ncc->Contains(j)); - Delete(ncc); - return false; - } - if (ncc->size() != Runemax+1 - cc->size()) { - Broke(desc, t, cc); - Broke("ncc", NULL, ncc); - printf("ncc size should be %d is %d\n", - Runemax+1 - cc->size(), ncc->size()); - Delete(ncc); - return false; - } - } - Delete(ncc); - return true; -} - -TEST(TestCharClassBuilder, Adds) { - int nfail = 0; - for (size_t i = 0; i < arraysize(tests); i++) { - CharClassBuilder ccb; - CCTest* t = &tests[i]; - for (int j = 0; t->add[j].lo >= 0; j++) - ccb.AddRange(t->add[j].lo, t->add[j].hi); - if (t->remove >= 0) - ccb.RemoveAbove(t->remove); - if 
(!CorrectCC(&ccb, t, "before copy (CharClassBuilder)")) - nfail++; - CharClass* cc = ccb.GetCharClass(); - if (!CorrectCC(cc, t, "before copy (CharClass)")) - nfail++; - cc->Delete(); - - CharClassBuilder *ccb1 = ccb.Copy(); - if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)")) - nfail++; - cc = ccb.GetCharClass(); - if (!CorrectCC(cc, t, "after copy (CharClass)")) - nfail++; - cc->Delete(); - delete ccb1; - } - EXPECT_EQ(nfail, 0); -} - -} // namespace re2 +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test character class manipulations. + +#include <stdio.h> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/utf.h" +#include "re2/regexp.h" + +namespace re2 { + +struct CCTest { + struct { + Rune lo; + Rune hi; + } add[10]; + int remove; + struct { + Rune lo; + Rune hi; + } final[10]; +}; + +static CCTest tests[] = { + { { { 10, 20 }, {-1} }, -1, + { { 10, 20 }, {-1} } }, + + { { { 10, 20 }, { 20, 30 }, {-1} }, -1, + { { 10, 30 }, {-1} } }, + + { { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1, + { { 10, 40 }, {-1} } }, + + { { { 0, 50 }, { 20, 30 }, {-1} }, -1, + { { 0, 50 }, {-1} } }, + + { { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1, + { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, + + { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1, + { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, + + { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1, + { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } }, + + { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1, + { { 5, 25 }, {-1} } }, + + { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1, + { { 10, 23 }, {-1} } }, + + // These check boundary cases during negation. 
+ { { { 0, Runemax }, {-1} }, -1, + { { 0, Runemax }, {-1} } }, + + { { { 0, 50 }, {-1} }, -1, + { { 0, 50 }, {-1} } }, + + { { { 50, Runemax }, {-1} }, -1, + { { 50, Runemax }, {-1} } }, + + // Check RemoveAbove. + { { { 50, Runemax }, {-1} }, 255, + { { 50, 255 }, {-1} } }, + + { { { 50, Runemax }, {-1} }, 65535, + { { 50, 65535 }, {-1} } }, + + { { { 50, Runemax }, {-1} }, Runemax, + { { 50, Runemax }, {-1} } }, + + { { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255, + { { 50, 60 }, { 250, 255 }, {-1} } }, + + { { { 50, 60 }, {-1} }, 255, + { { 50, 60 }, {-1} } }, + + { { { 350, 360 }, {-1} }, 255, + { {-1} } }, + + { { {-1} }, 255, + { {-1} } }, +}; + +template <typename CharClass> +static void Broke(const char *desc, const CCTest* t, CharClass* cc) { + if (t == NULL) { + printf("\t%s:", desc); + } else { + printf("\n"); + printf("CharClass added: [%s]", desc); + for (int k = 0; t->add[k].lo >= 0; k++) + printf(" %d-%d", t->add[k].lo, t->add[k].hi); + printf("\n"); + if (t->remove >= 0) + printf("Removed > %d\n", t->remove); + printf("\twant:"); + for (int k = 0; t->final[k].lo >= 0; k++) + printf(" %d-%d", t->final[k].lo, t->final[k].hi); + printf("\n"); + printf("\thave:"); + } + + for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it) + printf(" %d-%d", it->lo, it->hi); + printf("\n"); +} + +bool ShouldContain(CCTest *t, int x) { + for (int j = 0; t->final[j].lo >= 0; j++) + if (t->final[j].lo <= x && x <= t->final[j].hi) + return true; + return false; +} + +// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder. 
+ +CharClass* Negate(CharClass *cc) { + return cc->Negate(); +} + +void Delete(CharClass* cc) { + cc->Delete(); +} + +CharClassBuilder* Negate(CharClassBuilder* cc) { + CharClassBuilder* ncc = cc->Copy(); + ncc->Negate(); + return ncc; +} + +void Delete(CharClassBuilder* cc) { + delete cc; +} + +template <typename CharClass> +bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { + typename CharClass::iterator it = cc->begin(); + int size = 0; + for (int j = 0; t->final[j].lo >= 0; j++, ++it) { + if (it == cc->end() || + it->lo != t->final[j].lo || + it->hi != t->final[j].hi) { + Broke(desc, t, cc); + return false; + } + size += it->hi - it->lo + 1; + } + if (it != cc->end()) { + Broke(desc, t, cc); + return false; + } + if (cc->size() != size) { + Broke(desc, t, cc); + printf("wrong size: want %d have %d\n", size, cc->size()); + return false; + } + + for (int j = 0; j < 101; j++) { + if (j == 100) + j = Runemax; + if (ShouldContain(t, j) != cc->Contains(j)) { + Broke(desc, t, cc); + printf("want contains(%d)=%d, got %d\n", + j, ShouldContain(t, j), cc->Contains(j)); + return false; + } + } + + CharClass* ncc = Negate(cc); + for (int j = 0; j < 101; j++) { + if (j == 100) + j = Runemax; + if (ShouldContain(t, j) == ncc->Contains(j)) { + Broke(desc, t, cc); + Broke("ncc", NULL, ncc); + printf("want ncc contains(%d)!=%d, got %d\n", + j, ShouldContain(t, j), ncc->Contains(j)); + Delete(ncc); + return false; + } + if (ncc->size() != Runemax+1 - cc->size()) { + Broke(desc, t, cc); + Broke("ncc", NULL, ncc); + printf("ncc size should be %d is %d\n", + Runemax+1 - cc->size(), ncc->size()); + Delete(ncc); + return false; + } + } + Delete(ncc); + return true; +} + +TEST(TestCharClassBuilder, Adds) { + int nfail = 0; + for (size_t i = 0; i < arraysize(tests); i++) { + CharClassBuilder ccb; + CCTest* t = &tests[i]; + for (int j = 0; t->add[j].lo >= 0; j++) + ccb.AddRange(t->add[j].lo, t->add[j].hi); + if (t->remove >= 0) + ccb.RemoveAbove(t->remove); + if 
(!CorrectCC(&ccb, t, "before copy (CharClassBuilder)")) + nfail++; + CharClass* cc = ccb.GetCharClass(); + if (!CorrectCC(cc, t, "before copy (CharClass)")) + nfail++; + cc->Delete(); + + CharClassBuilder *ccb1 = ccb.Copy(); + if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)")) + nfail++; + cc = ccb.GetCharClass(); + if (!CorrectCC(cc, t, "after copy (CharClass)")) + nfail++; + cc->Delete(); + delete ccb1; + } + EXPECT_EQ(nfail, 0); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/compile_test.cc b/contrib/libs/re2/re2/testing/compile_test.cc index 107ad31b64..cf1c4cbf97 100644 --- a/contrib/libs/re2/re2/testing/compile_test.cc +++ b/contrib/libs/re2/re2/testing/compile_test.cc @@ -1,427 +1,427 @@ -// Copyright 2007 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Test prog.cc, compile.cc - -#include <string> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/regexp.h" -#include "re2/prog.h" - -namespace re2 { - -// Simple input/output tests checking that -// the regexp compiles to the expected code. -// These are just to sanity check the basic implementation. -// The real confidence tests happen by testing the NFA/DFA -// that run the compiled code. - -struct Test { - const char* regexp; - const char* code; -}; - -static Test tests[] = { - { "a", - "3. byte [61-61] 0 -> 4\n" - "4. match! 0\n" }, - { "ab", - "3. byte [61-61] 0 -> 4\n" - "4. byte [62-62] 0 -> 5\n" - "5. match! 0\n" }, - { "a|c", - "3+ byte [61-61] 0 -> 5\n" - "4. byte [63-63] 0 -> 5\n" - "5. match! 0\n" }, - { "a|b", - "3. byte [61-62] 0 -> 4\n" - "4. match! 0\n" }, - { "[ab]", - "3. byte [61-62] 0 -> 4\n" - "4. match! 0\n" }, - { "a+", - "3. byte [61-61] 0 -> 4\n" - "4+ nop -> 3\n" - "5. match! 0\n" }, - { "a+?", - "3. byte [61-61] 0 -> 4\n" - "4+ match! 0\n" - "5. nop -> 3\n" }, - { "a*", - "3+ byte [61-61] 1 -> 3\n" - "4. match! 
0\n" }, - { "a*?", - "3+ match! 0\n" - "4. byte [61-61] 0 -> 3\n" }, - { "a?", - "3+ byte [61-61] 1 -> 5\n" - "4. nop -> 5\n" - "5. match! 0\n" }, - { "a??", - "3+ nop -> 5\n" - "4. byte [61-61] 0 -> 5\n" - "5. match! 0\n" }, - { "a{4}", - "3. byte [61-61] 0 -> 4\n" - "4. byte [61-61] 0 -> 5\n" - "5. byte [61-61] 0 -> 6\n" - "6. byte [61-61] 0 -> 7\n" - "7. match! 0\n" }, - { "(a)", - "3. capture 2 -> 4\n" - "4. byte [61-61] 0 -> 5\n" - "5. capture 3 -> 6\n" - "6. match! 0\n" }, - { "(?:a)", - "3. byte [61-61] 0 -> 4\n" - "4. match! 0\n" }, - { "", - "3. match! 0\n" }, - { ".", - "3+ byte [00-09] 0 -> 5\n" - "4. byte [0b-ff] 0 -> 5\n" - "5. match! 0\n" }, - { "[^ab]", - "3+ byte [00-09] 0 -> 6\n" - "4+ byte [0b-60] 0 -> 6\n" - "5. byte [63-ff] 0 -> 6\n" - "6. match! 0\n" }, - { "[Aa]", - "3. byte/i [61-61] 0 -> 4\n" - "4. match! 0\n" }, - { "\\C+", - "3. byte [00-ff] 0 -> 4\n" - "4+ altmatch -> 5 | 6\n" - "5+ nop -> 3\n" - "6. match! 0\n" }, - { "\\C*", - "3+ altmatch -> 4 | 5\n" - "4+ byte [00-ff] 1 -> 3\n" - "5. match! 0\n" }, - { "\\C?", - "3+ byte [00-ff] 1 -> 5\n" - "4. nop -> 5\n" - "5. match! 0\n" }, - // Issue 20992936 - { "[[-`]", - "3. byte [5b-60] 0 -> 4\n" - "4. match! 0\n" }, - // Issue 310 - { "(?:|a)*", - "3+ nop -> 7\n" - "4. nop -> 9\n" - "5+ nop -> 7\n" - "6. nop -> 9\n" - "7+ nop -> 5\n" - "8. byte [61-61] 0 -> 5\n" - "9. match! 0\n" }, - { "(?:|a)+", - "3+ nop -> 5\n" - "4. byte [61-61] 0 -> 5\n" - "5+ nop -> 3\n" - "6. match! 
0\n" }, -}; - -TEST(TestRegexpCompileToProg, Simple) { - int failed = 0; - for (size_t i = 0; i < arraysize(tests); i++) { - const re2::Test& t = tests[i]; - Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL); - if (re == NULL) { - LOG(ERROR) << "Cannot parse: " << t.regexp; - failed++; - continue; - } - Prog* prog = re->CompileToProg(0); - if (prog == NULL) { - LOG(ERROR) << "Cannot compile: " << t.regexp; - re->Decref(); - failed++; - continue; - } - ASSERT_TRUE(re->CompileToProg(1) == NULL); - std::string s = prog->Dump(); - if (s != t.code) { - LOG(ERROR) << "Incorrect compiled code for: " << t.regexp; - LOG(ERROR) << "Want:\n" << t.code; - LOG(ERROR) << "Got:\n" << s; - failed++; - } - delete prog; - re->Decref(); - } - EXPECT_EQ(failed, 0); -} - -static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, - std::string* bytemap) { - Regexp* re = Regexp::Parse(pattern, flags, NULL); - EXPECT_TRUE(re != NULL); - - { - Prog* prog = re->CompileToProg(0); - EXPECT_TRUE(prog != NULL); - *bytemap = prog->DumpByteMap(); - delete prog; - } - - { - Prog* prog = re->CompileToReverseProg(0); - EXPECT_TRUE(prog != NULL); - EXPECT_EQ(*bytemap, prog->DumpByteMap()); - delete prog; - } - - re->Decref(); -} - -TEST(TestCompile, Latin1Ranges) { - // The distinct byte ranges involved in the Latin-1 dot ([^\n]). - - std::string bytemap; - - DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap); - EXPECT_EQ("[00-09] -> 0\n" - "[0a-0a] -> 1\n" - "[0b-ff] -> 0\n", - bytemap); -} - -TEST(TestCompile, OtherByteMapTests) { - std::string bytemap; - - // Test that "absent" ranges are mapped to the same byte class. - DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap); - EXPECT_EQ("[00-2f] -> 0\n" - "[30-39] -> 1\n" - "[3a-40] -> 0\n" - "[41-46] -> 1\n" - "[47-60] -> 0\n" - "[61-66] -> 1\n" - "[67-ff] -> 0\n", - bytemap); - - // Test the byte classes for \b. 
- DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap); - EXPECT_EQ("[00-2f] -> 0\n" - "[30-39] -> 1\n" - "[3a-40] -> 0\n" - "[41-5a] -> 1\n" - "[5b-5e] -> 0\n" - "[5f-5f] -> 1\n" - "[60-60] -> 0\n" - "[61-7a] -> 1\n" - "[7b-ff] -> 0\n", - bytemap); - - // Bug in the ASCII case-folding optimization created too many byte classes. - DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap); - EXPECT_EQ("[00-5e] -> 0\n" - "[5f-5f] -> 1\n" - "[60-ff] -> 0\n", - bytemap); -} - -TEST(TestCompile, UTF8Ranges) { - // The distinct byte ranges involved in the UTF-8 dot ([^\n]). - // Once, erroneously split between 0x3f and 0x40 because it is - // a 6-bit boundary. - - std::string bytemap; - - DumpByteMap(".", Regexp::PerlX, &bytemap); - EXPECT_EQ("[00-09] -> 0\n" - "[0a-0a] -> 1\n" - "[0b-7f] -> 0\n" - "[80-bf] -> 2\n" - "[c0-c1] -> 1\n" - "[c2-df] -> 3\n" - "[e0-ef] -> 4\n" - "[f0-f4] -> 5\n" - "[f5-ff] -> 1\n", - bytemap); -} - -TEST(TestCompile, InsufficientMemory) { - Regexp* re = Regexp::Parse( - "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$", - Regexp::LikePerl, NULL); - EXPECT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(850); - // If the memory budget has been exhausted, compilation should fail - // and return NULL instead of trying to do anything with NoMatch(). 
- EXPECT_TRUE(prog == NULL); - re->Decref(); -} - -static void Dump(StringPiece pattern, Regexp::ParseFlags flags, - std::string* forward, std::string* reverse) { - Regexp* re = Regexp::Parse(pattern, flags, NULL); - EXPECT_TRUE(re != NULL); - - if (forward != NULL) { - Prog* prog = re->CompileToProg(0); - EXPECT_TRUE(prog != NULL); - *forward = prog->Dump(); - delete prog; - } - - if (reverse != NULL) { - Prog* prog = re->CompileToReverseProg(0); - EXPECT_TRUE(prog != NULL); - *reverse = prog->Dump(); - delete prog; - } - - re->Decref(); -} - -TEST(TestCompile, Bug26705922) { - // Bug in the compiler caused inefficient bytecode to be generated for Unicode - // groups: common suffixes were cached, but common prefixes were not factored. - - std::string forward, reverse; - - Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse); - EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n" - "4. byte [90-90] 0 -> 5\n" - "5. byte [80-80] 0 -> 6\n" - "6+ byte [80-80] 0 -> 8\n" - "7. byte [90-90] 0 -> 8\n" - "8. match! 0\n", - forward); - EXPECT_EQ("3+ byte [80-80] 0 -> 5\n" - "4. byte [90-90] 0 -> 5\n" - "5. byte [80-80] 0 -> 6\n" - "6. byte [90-90] 0 -> 7\n" - "7. byte [f0-f0] 0 -> 8\n" - "8. match! 0\n", - reverse); - - Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse); - EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n" - "4. byte [f0-f0] 0 -> 8\n" - "5. byte [80-bf] 0 -> 6\n" - "6. byte [80-bf] 0 -> 7\n" - "7. match! 0\n" - "8. byte [90-90] 0 -> 5\n", - forward); - EXPECT_EQ("3. byte [80-bf] 0 -> 4\n" - "4. byte [80-bf] 0 -> 5\n" - "5+ byte [e8-ef] 0 -> 7\n" - "6. byte [90-90] 0 -> 8\n" - "7. match! 0\n" - "8. byte [f0-f0] 0 -> 7\n", - reverse); - - Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse); - EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n" - "4+ byte [e0-ef] 0 -> 8\n" - "5. byte [f0-f4] 0 -> 9\n" - "6. byte [80-bf] 0 -> 7\n" - "7. match! 0\n" - "8. byte [80-bf] 0 -> 6\n" - "9. byte [80-bf] 0 -> 8\n", - forward); - EXPECT_EQ("3. 
byte [80-bf] 0 -> 4\n" - "4+ byte [c2-df] 0 -> 6\n" - "5. byte [80-bf] 0 -> 7\n" - "6. match! 0\n" - "7+ byte [e0-ef] 0 -> 6\n" - "8. byte [80-bf] 0 -> 9\n" - "9. byte [f0-f4] 0 -> 6\n", - reverse); -} - -TEST(TestCompile, Bug35237384) { - // Bug in the compiler caused inefficient bytecode to be generated for - // nested nullable subexpressions. - - std::string forward; - - Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); - EXPECT_EQ("3+ byte [61-61] 1 -> 3\n" - "4. nop -> 5\n" - "5+ byte [61-61] 1 -> 5\n" - "6. nop -> 7\n" - "7+ byte [61-61] 1 -> 7\n" - "8. match! 0\n", - forward); - - Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); - EXPECT_EQ("3+ nop -> 28\n" - "4. nop -> 30\n" - "5+ byte [61-61] 1 -> 5\n" - "6. nop -> 32\n" - "7+ byte [61-61] 1 -> 7\n" - "8. nop -> 26\n" - "9+ byte [61-61] 1 -> 9\n" - "10. nop -> 20\n" - "11+ byte [62-62] 1 -> 11\n" - "12. nop -> 20\n" - "13+ byte [62-62] 1 -> 13\n" - "14. nop -> 26\n" - "15+ byte [62-62] 1 -> 15\n" - "16. nop -> 32\n" - "17+ nop -> 9\n" - "18. nop -> 11\n" - "19. match! 0\n" - "20+ nop -> 17\n" - "21. nop -> 19\n" - "22+ nop -> 7\n" - "23. nop -> 13\n" - "24+ nop -> 17\n" - "25. nop -> 19\n" - "26+ nop -> 22\n" - "27. nop -> 24\n" - "28+ nop -> 5\n" - "29. nop -> 15\n" - "30+ nop -> 22\n" - "31. nop -> 24\n" - "32+ nop -> 28\n" - "33. nop -> 30\n", - forward); - - Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); - EXPECT_EQ("3+ nop -> 36\n" - "4+ nop -> 31\n" - "5. nop -> 33\n" - "6+ byte [00-09] 0 -> 8\n" - "7. byte [0b-ff] 0 -> 8\n" - "8+ nop -> 6\n" - "9+ nop -> 29\n" - "10. nop -> 28\n" - "11+ byte [00-09] 0 -> 13\n" - "12. byte [0b-ff] 0 -> 13\n" - "13+ nop -> 11\n" - "14+ nop -> 26\n" - "15. nop -> 28\n" - "16+ byte [00-09] 0 -> 18\n" - "17. byte [0b-ff] 0 -> 18\n" - "18+ nop -> 16\n" - "19+ nop -> 36\n" - "20. nop -> 33\n" - "21+ byte [00-09] 0 -> 23\n" - "22. 
byte [0b-ff] 0 -> 23\n" - "23+ nop -> 21\n" - "24+ nop -> 31\n" - "25. nop -> 33\n" - "26+ nop -> 28\n" - "27. byte [53-53] 0 -> 11\n" - "28. match! 0\n" - "29+ nop -> 28\n" - "30. byte [53-53] 0 -> 6\n" - "31+ nop -> 33\n" - "32. byte [53-53] 0 -> 21\n" - "33+ nop -> 29\n" - "34+ nop -> 26\n" - "35. nop -> 28\n" - "36+ nop -> 33\n" - "37. byte [53-53] 0 -> 16\n", - forward); -} - -} // namespace re2 +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test prog.cc, compile.cc + +#include <string> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/regexp.h" +#include "re2/prog.h" + +namespace re2 { + +// Simple input/output tests checking that +// the regexp compiles to the expected code. +// These are just to sanity check the basic implementation. +// The real confidence tests happen by testing the NFA/DFA +// that run the compiled code. + +struct Test { + const char* regexp; + const char* code; +}; + +static Test tests[] = { + { "a", + "3. byte [61-61] 0 -> 4\n" + "4. match! 0\n" }, + { "ab", + "3. byte [61-61] 0 -> 4\n" + "4. byte [62-62] 0 -> 5\n" + "5. match! 0\n" }, + { "a|c", + "3+ byte [61-61] 0 -> 5\n" + "4. byte [63-63] 0 -> 5\n" + "5. match! 0\n" }, + { "a|b", + "3. byte [61-62] 0 -> 4\n" + "4. match! 0\n" }, + { "[ab]", + "3. byte [61-62] 0 -> 4\n" + "4. match! 0\n" }, + { "a+", + "3. byte [61-61] 0 -> 4\n" + "4+ nop -> 3\n" + "5. match! 0\n" }, + { "a+?", + "3. byte [61-61] 0 -> 4\n" + "4+ match! 0\n" + "5. nop -> 3\n" }, + { "a*", + "3+ byte [61-61] 1 -> 3\n" + "4. match! 0\n" }, + { "a*?", + "3+ match! 0\n" + "4. byte [61-61] 0 -> 3\n" }, + { "a?", + "3+ byte [61-61] 1 -> 5\n" + "4. nop -> 5\n" + "5. match! 0\n" }, + { "a??", + "3+ nop -> 5\n" + "4. byte [61-61] 0 -> 5\n" + "5. match! 0\n" }, + { "a{4}", + "3. byte [61-61] 0 -> 4\n" + "4. byte [61-61] 0 -> 5\n" + "5. byte [61-61] 0 -> 6\n" + "6. 
byte [61-61] 0 -> 7\n" + "7. match! 0\n" }, + { "(a)", + "3. capture 2 -> 4\n" + "4. byte [61-61] 0 -> 5\n" + "5. capture 3 -> 6\n" + "6. match! 0\n" }, + { "(?:a)", + "3. byte [61-61] 0 -> 4\n" + "4. match! 0\n" }, + { "", + "3. match! 0\n" }, + { ".", + "3+ byte [00-09] 0 -> 5\n" + "4. byte [0b-ff] 0 -> 5\n" + "5. match! 0\n" }, + { "[^ab]", + "3+ byte [00-09] 0 -> 6\n" + "4+ byte [0b-60] 0 -> 6\n" + "5. byte [63-ff] 0 -> 6\n" + "6. match! 0\n" }, + { "[Aa]", + "3. byte/i [61-61] 0 -> 4\n" + "4. match! 0\n" }, + { "\\C+", + "3. byte [00-ff] 0 -> 4\n" + "4+ altmatch -> 5 | 6\n" + "5+ nop -> 3\n" + "6. match! 0\n" }, + { "\\C*", + "3+ altmatch -> 4 | 5\n" + "4+ byte [00-ff] 1 -> 3\n" + "5. match! 0\n" }, + { "\\C?", + "3+ byte [00-ff] 1 -> 5\n" + "4. nop -> 5\n" + "5. match! 0\n" }, + // Issue 20992936 + { "[[-`]", + "3. byte [5b-60] 0 -> 4\n" + "4. match! 0\n" }, + // Issue 310 + { "(?:|a)*", + "3+ nop -> 7\n" + "4. nop -> 9\n" + "5+ nop -> 7\n" + "6. nop -> 9\n" + "7+ nop -> 5\n" + "8. byte [61-61] 0 -> 5\n" + "9. match! 0\n" }, + { "(?:|a)+", + "3+ nop -> 5\n" + "4. byte [61-61] 0 -> 5\n" + "5+ nop -> 3\n" + "6. match! 
0\n" }, +}; + +TEST(TestRegexpCompileToProg, Simple) { + int failed = 0; + for (size_t i = 0; i < arraysize(tests); i++) { + const re2::Test& t = tests[i]; + Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL); + if (re == NULL) { + LOG(ERROR) << "Cannot parse: " << t.regexp; + failed++; + continue; + } + Prog* prog = re->CompileToProg(0); + if (prog == NULL) { + LOG(ERROR) << "Cannot compile: " << t.regexp; + re->Decref(); + failed++; + continue; + } + ASSERT_TRUE(re->CompileToProg(1) == NULL); + std::string s = prog->Dump(); + if (s != t.code) { + LOG(ERROR) << "Incorrect compiled code for: " << t.regexp; + LOG(ERROR) << "Want:\n" << t.code; + LOG(ERROR) << "Got:\n" << s; + failed++; + } + delete prog; + re->Decref(); + } + EXPECT_EQ(failed, 0); +} + +static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, + std::string* bytemap) { + Regexp* re = Regexp::Parse(pattern, flags, NULL); + EXPECT_TRUE(re != NULL); + + { + Prog* prog = re->CompileToProg(0); + EXPECT_TRUE(prog != NULL); + *bytemap = prog->DumpByteMap(); + delete prog; + } + + { + Prog* prog = re->CompileToReverseProg(0); + EXPECT_TRUE(prog != NULL); + EXPECT_EQ(*bytemap, prog->DumpByteMap()); + delete prog; + } + + re->Decref(); +} + +TEST(TestCompile, Latin1Ranges) { + // The distinct byte ranges involved in the Latin-1 dot ([^\n]). + + std::string bytemap; + + DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-09] -> 0\n" + "[0a-0a] -> 1\n" + "[0b-ff] -> 0\n", + bytemap); +} + +TEST(TestCompile, OtherByteMapTests) { + std::string bytemap; + + // Test that "absent" ranges are mapped to the same byte class. + DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-2f] -> 0\n" + "[30-39] -> 1\n" + "[3a-40] -> 0\n" + "[41-46] -> 1\n" + "[47-60] -> 0\n" + "[61-66] -> 1\n" + "[67-ff] -> 0\n", + bytemap); + + // Test the byte classes for \b. 
+ DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-2f] -> 0\n" + "[30-39] -> 1\n" + "[3a-40] -> 0\n" + "[41-5a] -> 1\n" + "[5b-5e] -> 0\n" + "[5f-5f] -> 1\n" + "[60-60] -> 0\n" + "[61-7a] -> 1\n" + "[7b-ff] -> 0\n", + bytemap); + + // Bug in the ASCII case-folding optimization created too many byte classes. + DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap); + EXPECT_EQ("[00-5e] -> 0\n" + "[5f-5f] -> 1\n" + "[60-ff] -> 0\n", + bytemap); +} + +TEST(TestCompile, UTF8Ranges) { + // The distinct byte ranges involved in the UTF-8 dot ([^\n]). + // Once, erroneously split between 0x3f and 0x40 because it is + // a 6-bit boundary. + + std::string bytemap; + + DumpByteMap(".", Regexp::PerlX, &bytemap); + EXPECT_EQ("[00-09] -> 0\n" + "[0a-0a] -> 1\n" + "[0b-7f] -> 0\n" + "[80-bf] -> 2\n" + "[c0-c1] -> 1\n" + "[c2-df] -> 3\n" + "[e0-ef] -> 4\n" + "[f0-f4] -> 5\n" + "[f5-ff] -> 1\n", + bytemap); +} + +TEST(TestCompile, InsufficientMemory) { + Regexp* re = Regexp::Parse( + "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$", + Regexp::LikePerl, NULL); + EXPECT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(850); + // If the memory budget has been exhausted, compilation should fail + // and return NULL instead of trying to do anything with NoMatch(). 
+ EXPECT_TRUE(prog == NULL); + re->Decref(); +} + +static void Dump(StringPiece pattern, Regexp::ParseFlags flags, + std::string* forward, std::string* reverse) { + Regexp* re = Regexp::Parse(pattern, flags, NULL); + EXPECT_TRUE(re != NULL); + + if (forward != NULL) { + Prog* prog = re->CompileToProg(0); + EXPECT_TRUE(prog != NULL); + *forward = prog->Dump(); + delete prog; + } + + if (reverse != NULL) { + Prog* prog = re->CompileToReverseProg(0); + EXPECT_TRUE(prog != NULL); + *reverse = prog->Dump(); + delete prog; + } + + re->Decref(); +} + +TEST(TestCompile, Bug26705922) { + // Bug in the compiler caused inefficient bytecode to be generated for Unicode + // groups: common suffixes were cached, but common prefixes were not factored. + + std::string forward, reverse; + + Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n" + "4. byte [90-90] 0 -> 5\n" + "5. byte [80-80] 0 -> 6\n" + "6+ byte [80-80] 0 -> 8\n" + "7. byte [90-90] 0 -> 8\n" + "8. match! 0\n", + forward); + EXPECT_EQ("3+ byte [80-80] 0 -> 5\n" + "4. byte [90-90] 0 -> 5\n" + "5. byte [80-80] 0 -> 6\n" + "6. byte [90-90] 0 -> 7\n" + "7. byte [f0-f0] 0 -> 8\n" + "8. match! 0\n", + reverse); + + Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n" + "4. byte [f0-f0] 0 -> 8\n" + "5. byte [80-bf] 0 -> 6\n" + "6. byte [80-bf] 0 -> 7\n" + "7. match! 0\n" + "8. byte [90-90] 0 -> 5\n", + forward); + EXPECT_EQ("3. byte [80-bf] 0 -> 4\n" + "4. byte [80-bf] 0 -> 5\n" + "5+ byte [e8-ef] 0 -> 7\n" + "6. byte [90-90] 0 -> 8\n" + "7. match! 0\n" + "8. byte [f0-f0] 0 -> 7\n", + reverse); + + Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n" + "4+ byte [e0-ef] 0 -> 8\n" + "5. byte [f0-f4] 0 -> 9\n" + "6. byte [80-bf] 0 -> 7\n" + "7. match! 0\n" + "8. byte [80-bf] 0 -> 6\n" + "9. byte [80-bf] 0 -> 8\n", + forward); + EXPECT_EQ("3. 
byte [80-bf] 0 -> 4\n" + "4+ byte [c2-df] 0 -> 6\n" + "5. byte [80-bf] 0 -> 7\n" + "6. match! 0\n" + "7+ byte [e0-ef] 0 -> 6\n" + "8. byte [80-bf] 0 -> 9\n" + "9. byte [f0-f4] 0 -> 6\n", + reverse); +} + +TEST(TestCompile, Bug35237384) { + // Bug in the compiler caused inefficient bytecode to be generated for + // nested nullable subexpressions. + + std::string forward; + + Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); + EXPECT_EQ("3+ byte [61-61] 1 -> 3\n" + "4. nop -> 5\n" + "5+ byte [61-61] 1 -> 5\n" + "6. nop -> 7\n" + "7+ byte [61-61] 1 -> 7\n" + "8. match! 0\n", + forward); + + Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); + EXPECT_EQ("3+ nop -> 28\n" + "4. nop -> 30\n" + "5+ byte [61-61] 1 -> 5\n" + "6. nop -> 32\n" + "7+ byte [61-61] 1 -> 7\n" + "8. nop -> 26\n" + "9+ byte [61-61] 1 -> 9\n" + "10. nop -> 20\n" + "11+ byte [62-62] 1 -> 11\n" + "12. nop -> 20\n" + "13+ byte [62-62] 1 -> 13\n" + "14. nop -> 26\n" + "15+ byte [62-62] 1 -> 15\n" + "16. nop -> 32\n" + "17+ nop -> 9\n" + "18. nop -> 11\n" + "19. match! 0\n" + "20+ nop -> 17\n" + "21. nop -> 19\n" + "22+ nop -> 7\n" + "23. nop -> 13\n" + "24+ nop -> 17\n" + "25. nop -> 19\n" + "26+ nop -> 22\n" + "27. nop -> 24\n" + "28+ nop -> 5\n" + "29. nop -> 15\n" + "30+ nop -> 22\n" + "31. nop -> 24\n" + "32+ nop -> 28\n" + "33. nop -> 30\n", + forward); + + Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); + EXPECT_EQ("3+ nop -> 36\n" + "4+ nop -> 31\n" + "5. nop -> 33\n" + "6+ byte [00-09] 0 -> 8\n" + "7. byte [0b-ff] 0 -> 8\n" + "8+ nop -> 6\n" + "9+ nop -> 29\n" + "10. nop -> 28\n" + "11+ byte [00-09] 0 -> 13\n" + "12. byte [0b-ff] 0 -> 13\n" + "13+ nop -> 11\n" + "14+ nop -> 26\n" + "15. nop -> 28\n" + "16+ byte [00-09] 0 -> 18\n" + "17. byte [0b-ff] 0 -> 18\n" + "18+ nop -> 16\n" + "19+ nop -> 36\n" + "20. nop -> 33\n" + "21+ byte [00-09] 0 -> 23\n" + "22. 
byte [0b-ff] 0 -> 23\n" + "23+ nop -> 21\n" + "24+ nop -> 31\n" + "25. nop -> 33\n" + "26+ nop -> 28\n" + "27. byte [53-53] 0 -> 11\n" + "28. match! 0\n" + "29+ nop -> 28\n" + "30. byte [53-53] 0 -> 6\n" + "31+ nop -> 33\n" + "32. byte [53-53] 0 -> 21\n" + "33+ nop -> 29\n" + "34+ nop -> 26\n" + "35. nop -> 28\n" + "36+ nop -> 33\n" + "37. byte [53-53] 0 -> 16\n", + forward); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/dump.cc b/contrib/libs/re2/re2/testing/dump.cc index 96acb1ecc1..fb3fb7da1d 100644 --- a/contrib/libs/re2/re2/testing/dump.cc +++ b/contrib/libs/re2/re2/testing/dump.cc @@ -1,163 +1,163 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Dump the regexp into a string showing structure. -// Tested by parse_unittest.cc - -// This function traverses the regexp recursively, -// meaning that on inputs like Regexp::Simplify of -// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100}, -// it takes time and space exponential in the size of the -// original regular expression. It can also use stack space -// linear in the size of the regular expression for inputs -// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*. -// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE. -// As a result, Dump is provided only in the testing -// library (see BUILD). - -#include <string> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "util/strutil.h" -#include "util/utf.h" -#include "re2/stringpiece.h" -#include "re2/regexp.h" - -namespace re2 { - -static const char* kOpcodeNames[] = { - "bad", - "no", - "emp", - "lit", - "str", - "cat", - "alt", - "star", - "plus", - "que", - "rep", - "cap", - "dot", - "byte", - "bol", - "eol", - "wb", // kRegexpWordBoundary - "nwb", // kRegexpNoWordBoundary - "bot", - "eot", - "cc", - "match", -}; - -// Create string representation of regexp with explicit structure. 
-// Nothing pretty, just for testing. -static void DumpRegexpAppending(Regexp* re, std::string* s) { - if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) { - *s += StringPrintf("op%d", re->op()); - } else { - switch (re->op()) { - default: - break; - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - case kRegexpRepeat: - if (re->parse_flags() & Regexp::NonGreedy) - s->append("n"); - break; - } - s->append(kOpcodeNames[re->op()]); - if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) { - Rune r = re->rune(); - if ('a' <= r && r <= 'z') - s->append("fold"); - } - if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) { - for (int i = 0; i < re->nrunes(); i++) { - Rune r = re->runes()[i]; - if ('a' <= r && r <= 'z') { - s->append("fold"); - break; - } - } - } - } - s->append("{"); - switch (re->op()) { - default: - break; - case kRegexpEndText: - if (!(re->parse_flags() & Regexp::WasDollar)) { - s->append("\\z"); - } - break; - case kRegexpLiteral: { - Rune r = re->rune(); - char buf[UTFmax+1]; - buf[runetochar(buf, &r)] = 0; - s->append(buf); - break; - } - case kRegexpLiteralString: - for (int i = 0; i < re->nrunes(); i++) { - Rune r = re->runes()[i]; - char buf[UTFmax+1]; - buf[runetochar(buf, &r)] = 0; - s->append(buf); - } - break; - case kRegexpConcat: - case kRegexpAlternate: - for (int i = 0; i < re->nsub(); i++) - DumpRegexpAppending(re->sub()[i], s); - break; - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - DumpRegexpAppending(re->sub()[0], s); - break; - case kRegexpCapture: - if (re->cap() == 0) - LOG(DFATAL) << "kRegexpCapture cap() == 0"; - if (re->name()) { - s->append(*re->name()); - s->append(":"); - } - DumpRegexpAppending(re->sub()[0], s); - break; - case kRegexpRepeat: - s->append(StringPrintf("%d,%d ", re->min(), re->max())); - DumpRegexpAppending(re->sub()[0], s); - break; - case kRegexpCharClass: { - std::string sep; - for (CharClass::iterator it = re->cc()->begin(); - 
it != re->cc()->end(); ++it) { - RuneRange rr = *it; - s->append(sep); - if (rr.lo == rr.hi) - s->append(StringPrintf("%#x", rr.lo)); - else - s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi)); - sep = " "; - } - break; - } - } - s->append("}"); -} - -std::string Regexp::Dump() { - // Make sure that we are being called from a unit test. - // Should cause a link error if used outside of testing. - CHECK(!::testing::TempDir().empty()); - - std::string s; - DumpRegexpAppending(this, &s); - return s; -} - -} // namespace re2 +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Dump the regexp into a string showing structure. +// Tested by parse_unittest.cc + +// This function traverses the regexp recursively, +// meaning that on inputs like Regexp::Simplify of +// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100}, +// it takes time and space exponential in the size of the +// original regular expression. It can also use stack space +// linear in the size of the regular expression for inputs +// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*. +// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE. +// As a result, Dump is provided only in the testing +// library (see BUILD). + +#include <string> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "util/utf.h" +#include "re2/stringpiece.h" +#include "re2/regexp.h" + +namespace re2 { + +static const char* kOpcodeNames[] = { + "bad", + "no", + "emp", + "lit", + "str", + "cat", + "alt", + "star", + "plus", + "que", + "rep", + "cap", + "dot", + "byte", + "bol", + "eol", + "wb", // kRegexpWordBoundary + "nwb", // kRegexpNoWordBoundary + "bot", + "eot", + "cc", + "match", +}; + +// Create string representation of regexp with explicit structure. +// Nothing pretty, just for testing. 
+static void DumpRegexpAppending(Regexp* re, std::string* s) { + if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) { + *s += StringPrintf("op%d", re->op()); + } else { + switch (re->op()) { + default: + break; + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + if (re->parse_flags() & Regexp::NonGreedy) + s->append("n"); + break; + } + s->append(kOpcodeNames[re->op()]); + if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) { + Rune r = re->rune(); + if ('a' <= r && r <= 'z') + s->append("fold"); + } + if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) { + for (int i = 0; i < re->nrunes(); i++) { + Rune r = re->runes()[i]; + if ('a' <= r && r <= 'z') { + s->append("fold"); + break; + } + } + } + } + s->append("{"); + switch (re->op()) { + default: + break; + case kRegexpEndText: + if (!(re->parse_flags() & Regexp::WasDollar)) { + s->append("\\z"); + } + break; + case kRegexpLiteral: { + Rune r = re->rune(); + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + break; + } + case kRegexpLiteralString: + for (int i = 0; i < re->nrunes(); i++) { + Rune r = re->runes()[i]; + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } + break; + case kRegexpConcat: + case kRegexpAlternate: + for (int i = 0; i < re->nsub(); i++) + DumpRegexpAppending(re->sub()[i], s); + break; + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpCapture: + if (re->cap() == 0) + LOG(DFATAL) << "kRegexpCapture cap() == 0"; + if (re->name()) { + s->append(*re->name()); + s->append(":"); + } + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpRepeat: + s->append(StringPrintf("%d,%d ", re->min(), re->max())); + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpCharClass: { + std::string sep; + for (CharClass::iterator it = re->cc()->begin(); + it != re->cc()->end(); ++it) { + 
RuneRange rr = *it; + s->append(sep); + if (rr.lo == rr.hi) + s->append(StringPrintf("%#x", rr.lo)); + else + s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi)); + sep = " "; + } + break; + } + } + s->append("}"); +} + +std::string Regexp::Dump() { + // Make sure that we are being called from a unit test. + // Should cause a link error if used outside of testing. + CHECK(!::testing::TempDir().empty()); + + std::string s; + DumpRegexpAppending(this, &s); + return s; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/exhaustive_tester.cc b/contrib/libs/re2/re2/testing/exhaustive_tester.cc index 847bf7c170..b75b068299 100644 --- a/contrib/libs/re2/re2/testing/exhaustive_tester.cc +++ b/contrib/libs/re2/re2/testing/exhaustive_tester.cc @@ -1,191 +1,191 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Exhaustive testing of regular expression matching. - -// Each test picks an alphabet (e.g., "abc"), a maximum string length, -// a maximum regular expression length, and a maximum number of letters -// that can appear in the regular expression. Given these parameters, -// it tries every possible regular expression and string, verifying that -// the NFA, DFA, and a trivial backtracking implementation agree about -// the location of the match. - -#include <stdio.h> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/flags.h" -#include "util/logging.h" -#include "util/strutil.h" -#include "re2/testing/exhaustive_tester.h" -#include "re2/testing/tester.h" - -// For target `log' in the Makefile. 
-#ifndef LOGGING -#define LOGGING 0 -#endif - -DEFINE_FLAG(bool, show_regexps, false, "show regexps during testing"); - -DEFINE_FLAG(int, max_bad_regexp_inputs, 1, - "Stop testing a regular expression after finding this many " - "strings that break it."); - -namespace re2 { - -static char* escape(const StringPiece& sp) { - static char buf[512]; - char* p = buf; - *p++ = '\"'; - for (size_t i = 0; i < sp.size(); i++) { - if(p+5 >= buf+sizeof buf) - LOG(FATAL) << "ExhaustiveTester escape: too long"; - if(sp[i] == '\\' || sp[i] == '\"') { - *p++ = '\\'; - *p++ = sp[i]; - } else if(sp[i] == '\n') { - *p++ = '\\'; - *p++ = 'n'; - } else { - *p++ = sp[i]; - } - } - *p++ = '\"'; - *p = '\0'; - return buf; -} - -static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) { - if (!re.Match(input, 0, input.size(), anchor, m, n)) { - printf("-"); - return; - } - for (int i = 0; i < n; i++) { - if (i > 0) - printf(" "); - if (m[i].data() == NULL) - printf("-"); - else - printf("%td-%td", +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +// Each test picks an alphabet (e.g., "abc"), a maximum string length, +// a maximum regular expression length, and a maximum number of letters +// that can appear in the regular expression. Given these parameters, +// it tries every possible regular expression and string, verifying that +// the NFA, DFA, and a trivial backtracking implementation agree about +// the location of the match. + +#include <stdio.h> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/flags.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/testing/exhaustive_tester.h" +#include "re2/testing/tester.h" + +// For target `log' in the Makefile. 
+#ifndef LOGGING +#define LOGGING 0 +#endif + +DEFINE_FLAG(bool, show_regexps, false, "show regexps during testing"); + +DEFINE_FLAG(int, max_bad_regexp_inputs, 1, + "Stop testing a regular expression after finding this many " + "strings that break it."); + +namespace re2 { + +static char* escape(const StringPiece& sp) { + static char buf[512]; + char* p = buf; + *p++ = '\"'; + for (size_t i = 0; i < sp.size(); i++) { + if(p+5 >= buf+sizeof buf) + LOG(FATAL) << "ExhaustiveTester escape: too long"; + if(sp[i] == '\\' || sp[i] == '\"') { + *p++ = '\\'; + *p++ = sp[i]; + } else if(sp[i] == '\n') { + *p++ = '\\'; + *p++ = 'n'; + } else { + *p++ = sp[i]; + } + } + *p++ = '\"'; + *p = '\0'; + return buf; +} + +static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) { + if (!re.Match(input, 0, input.size(), anchor, m, n)) { + printf("-"); + return; + } + for (int i = 0; i < n; i++) { + if (i > 0) + printf(" "); + if (m[i].data() == NULL) + printf("-"); + else + printf("%td-%td", BeginPtr(m[i]) - BeginPtr(input), EndPtr(m[i]) - BeginPtr(input)); - } -} - -// Processes a single generated regexp. -// Compiles it using Regexp interface and PCRE, and then -// checks that NFA, DFA, and PCRE all return the same results. -void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) { - regexps_++; - std::string regexp = const_regexp; - if (!topwrapper_.empty()) { - regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str()); - } - - if (GetFlag(FLAGS_show_regexps)) { - printf("\r%s", regexp.c_str()); - fflush(stdout); - } - - if (LOGGING) { - // Write out test cases and answers for use in testing - // other implementations, such as Go's regexp package. 
- if (randomstrings_) - LOG(ERROR) << "Cannot log with random strings."; - if (regexps_ == 1) { // first - printf("strings\n"); - strgen_.Reset(); - while (strgen_.HasNext()) - printf("%s\n", escape(strgen_.Next())); - printf("regexps\n"); - } - printf("%s\n", escape(regexp)); - - RE2 re(regexp); - RE2::Options longest; - longest.set_longest_match(true); - RE2 relongest(regexp, longest); - int ngroup = re.NumberOfCapturingGroups()+1; - StringPiece* group = new StringPiece[ngroup]; - - strgen_.Reset(); - while (strgen_.HasNext()) { - StringPiece input = strgen_.Next(); - PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup); - printf(";"); - PrintResult(re, input, RE2::UNANCHORED, group, ngroup); - printf(";"); - PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup); - printf(";"); - PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup); - printf("\n"); - } - delete[] group; - return; - } - - Tester tester(regexp); - if (tester.error()) - return; - - strgen_.Reset(); - strgen_.GenerateNULL(); - if (randomstrings_) - strgen_.Random(stringseed_, stringcount_); - int bad_inputs = 0; - while (strgen_.HasNext()) { - tests_++; - if (!tester.TestInput(strgen_.Next())) { - failures_++; - if (++bad_inputs >= GetFlag(FLAGS_max_bad_regexp_inputs)) - break; - } - } -} - -// Runs an exhaustive test on the given parameters. 
-void ExhaustiveTest(int maxatoms, int maxops, - const std::vector<std::string>& alphabet, - const std::vector<std::string>& ops, - int maxstrlen, - const std::vector<std::string>& stralphabet, - const std::string& wrapper, - const std::string& topwrapper) { - if (RE2_DEBUG_MODE) { - if (maxatoms > 1) - maxatoms--; - if (maxops > 1) - maxops--; - if (maxstrlen > 1) - maxstrlen--; - } - ExhaustiveTester t(maxatoms, maxops, alphabet, ops, - maxstrlen, stralphabet, wrapper, - topwrapper); - t.Generate(); - if (!LOGGING) { - printf("%d regexps, %d tests, %d failures [%d/%d str]\n", - t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); - } - EXPECT_EQ(0, t.failures()); -} - -// Runs an exhaustive test using the given parameters and -// the basic egrep operators. -void EgrepTest(int maxatoms, int maxops, const std::string& alphabet, - int maxstrlen, const std::string& stralphabet, - const std::string& wrapper) { - const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" }; - - for (size_t i = 0; i < arraysize(tops); i++) { - ExhaustiveTest(maxatoms, maxops, - Split("", alphabet), - RegexpGenerator::EgrepOps(), - maxstrlen, - Split("", stralphabet), - wrapper, - tops[i]); - } -} - -} // namespace re2 + } +} + +// Processes a single generated regexp. +// Compiles it using Regexp interface and PCRE, and then +// checks that NFA, DFA, and PCRE all return the same results. +void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) { + regexps_++; + std::string regexp = const_regexp; + if (!topwrapper_.empty()) { + regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str()); + } + + if (GetFlag(FLAGS_show_regexps)) { + printf("\r%s", regexp.c_str()); + fflush(stdout); + } + + if (LOGGING) { + // Write out test cases and answers for use in testing + // other implementations, such as Go's regexp package. 
+ if (randomstrings_) + LOG(ERROR) << "Cannot log with random strings."; + if (regexps_ == 1) { // first + printf("strings\n"); + strgen_.Reset(); + while (strgen_.HasNext()) + printf("%s\n", escape(strgen_.Next())); + printf("regexps\n"); + } + printf("%s\n", escape(regexp)); + + RE2 re(regexp); + RE2::Options longest; + longest.set_longest_match(true); + RE2 relongest(regexp, longest); + int ngroup = re.NumberOfCapturingGroups()+1; + StringPiece* group = new StringPiece[ngroup]; + + strgen_.Reset(); + while (strgen_.HasNext()) { + StringPiece input = strgen_.Next(); + PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup); + printf(";"); + PrintResult(re, input, RE2::UNANCHORED, group, ngroup); + printf(";"); + PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup); + printf(";"); + PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup); + printf("\n"); + } + delete[] group; + return; + } + + Tester tester(regexp); + if (tester.error()) + return; + + strgen_.Reset(); + strgen_.GenerateNULL(); + if (randomstrings_) + strgen_.Random(stringseed_, stringcount_); + int bad_inputs = 0; + while (strgen_.HasNext()) { + tests_++; + if (!tester.TestInput(strgen_.Next())) { + failures_++; + if (++bad_inputs >= GetFlag(FLAGS_max_bad_regexp_inputs)) + break; + } + } +} + +// Runs an exhaustive test on the given parameters. 
+void ExhaustiveTest(int maxatoms, int maxops, + const std::vector<std::string>& alphabet, + const std::vector<std::string>& ops, + int maxstrlen, + const std::vector<std::string>& stralphabet, + const std::string& wrapper, + const std::string& topwrapper) { + if (RE2_DEBUG_MODE) { + if (maxatoms > 1) + maxatoms--; + if (maxops > 1) + maxops--; + if (maxstrlen > 1) + maxstrlen--; + } + ExhaustiveTester t(maxatoms, maxops, alphabet, ops, + maxstrlen, stralphabet, wrapper, + topwrapper); + t.Generate(); + if (!LOGGING) { + printf("%d regexps, %d tests, %d failures [%d/%d str]\n", + t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); + } + EXPECT_EQ(0, t.failures()); +} + +// Runs an exhaustive test using the given parameters and +// the basic egrep operators. +void EgrepTest(int maxatoms, int maxops, const std::string& alphabet, + int maxstrlen, const std::string& stralphabet, + const std::string& wrapper) { + const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" }; + + for (size_t i = 0; i < arraysize(tops); i++) { + ExhaustiveTest(maxatoms, maxops, + Split("", alphabet), + RegexpGenerator::EgrepOps(), + maxstrlen, + Split("", stralphabet), + wrapper, + tops[i]); + } +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/exhaustive_tester.h b/contrib/libs/re2/re2/testing/exhaustive_tester.h index fb26b04dcf..3a14282f01 100644 --- a/contrib/libs/re2/re2/testing/exhaustive_tester.h +++ b/contrib/libs/re2/re2/testing/exhaustive_tester.h @@ -1,105 +1,105 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H_ -#define RE2_TESTING_EXHAUSTIVE_TESTER_H_ - -#include <stdint.h> -#include <string> -#include <vector> - -#include "util/util.h" -#include "re2/testing/regexp_generator.h" -#include "re2/testing/string_generator.h" - -namespace re2 { - -// Doing this simplifies the logic below. -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - -#if !defined(NDEBUG) -// We are in a debug build. -const bool RE2_DEBUG_MODE = true; -#elif __has_feature(address_sanitizer) || __has_feature(memory_sanitizer) || __has_feature(thread_sanitizer) -// Not a debug build, but still under sanitizers. -const bool RE2_DEBUG_MODE = true; -#else -const bool RE2_DEBUG_MODE = false; -#endif - -// Exhaustive regular expression test: generate all regexps within parameters, -// then generate all strings of a given length over a given alphabet, -// then check that NFA, DFA, and PCRE agree about whether each regexp matches -// each possible string, and if so, where the match is. -// -// Can also be used in a "random" mode that generates a given number -// of random regexp and strings, allowing testing of larger expressions -// and inputs. -class ExhaustiveTester : public RegexpGenerator { - public: - ExhaustiveTester(int maxatoms, - int maxops, - const std::vector<std::string>& alphabet, - const std::vector<std::string>& ops, - int maxstrlen, - const std::vector<std::string>& stralphabet, - const std::string& wrapper, - const std::string& topwrapper) - : RegexpGenerator(maxatoms, maxops, alphabet, ops), - strgen_(maxstrlen, stralphabet), - wrapper_(wrapper), - topwrapper_(topwrapper), - regexps_(0), tests_(0), failures_(0), - randomstrings_(0), stringseed_(0), stringcount_(0) { } - - int regexps() { return regexps_; } - int tests() { return tests_; } - int failures() { return failures_; } - - // Needed for RegexpGenerator interface. - void HandleRegexp(const std::string& regexp); - - // Causes testing to generate random input strings. 
- void RandomStrings(int32_t seed, int32_t count) { - randomstrings_ = true; - stringseed_ = seed; - stringcount_ = count; - } - - private: - StringGenerator strgen_; - std::string wrapper_; // Regexp wrapper - either empty or has one %s. - std::string topwrapper_; // Regexp top-level wrapper. - int regexps_; // Number of HandleRegexp calls - int tests_; // Number of regexp tests. - int failures_; // Number of tests failed. - - bool randomstrings_; // Whether to use random strings - int32_t stringseed_; // If so, the seed. - int stringcount_; // If so, how many to generate. - - ExhaustiveTester(const ExhaustiveTester&) = delete; - ExhaustiveTester& operator=(const ExhaustiveTester&) = delete; -}; - -// Runs an exhaustive test on the given parameters. -void ExhaustiveTest(int maxatoms, int maxops, - const std::vector<std::string>& alphabet, - const std::vector<std::string>& ops, - int maxstrlen, - const std::vector<std::string>& stralphabet, - const std::string& wrapper, - const std::string& topwrapper); - -// Runs an exhaustive test using the given parameters and -// the basic egrep operators. -void EgrepTest(int maxatoms, int maxops, const std::string& alphabet, - int maxstrlen, const std::string& stralphabet, - const std::string& wrapper); - -} // namespace re2 - -#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H_ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H_ +#define RE2_TESTING_EXHAUSTIVE_TESTER_H_ + +#include <stdint.h> +#include <string> +#include <vector> + +#include "util/util.h" +#include "re2/testing/regexp_generator.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +// Doing this simplifies the logic below. +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + +#if !defined(NDEBUG) +// We are in a debug build. 
+const bool RE2_DEBUG_MODE = true; +#elif __has_feature(address_sanitizer) || __has_feature(memory_sanitizer) || __has_feature(thread_sanitizer) +// Not a debug build, but still under sanitizers. +const bool RE2_DEBUG_MODE = true; +#else +const bool RE2_DEBUG_MODE = false; +#endif + +// Exhaustive regular expression test: generate all regexps within parameters, +// then generate all strings of a given length over a given alphabet, +// then check that NFA, DFA, and PCRE agree about whether each regexp matches +// each possible string, and if so, where the match is. +// +// Can also be used in a "random" mode that generates a given number +// of random regexp and strings, allowing testing of larger expressions +// and inputs. +class ExhaustiveTester : public RegexpGenerator { + public: + ExhaustiveTester(int maxatoms, + int maxops, + const std::vector<std::string>& alphabet, + const std::vector<std::string>& ops, + int maxstrlen, + const std::vector<std::string>& stralphabet, + const std::string& wrapper, + const std::string& topwrapper) + : RegexpGenerator(maxatoms, maxops, alphabet, ops), + strgen_(maxstrlen, stralphabet), + wrapper_(wrapper), + topwrapper_(topwrapper), + regexps_(0), tests_(0), failures_(0), + randomstrings_(0), stringseed_(0), stringcount_(0) { } + + int regexps() { return regexps_; } + int tests() { return tests_; } + int failures() { return failures_; } + + // Needed for RegexpGenerator interface. + void HandleRegexp(const std::string& regexp); + + // Causes testing to generate random input strings. + void RandomStrings(int32_t seed, int32_t count) { + randomstrings_ = true; + stringseed_ = seed; + stringcount_ = count; + } + + private: + StringGenerator strgen_; + std::string wrapper_; // Regexp wrapper - either empty or has one %s. + std::string topwrapper_; // Regexp top-level wrapper. + int regexps_; // Number of HandleRegexp calls + int tests_; // Number of regexp tests. + int failures_; // Number of tests failed. 
+ + bool randomstrings_; // Whether to use random strings + int32_t stringseed_; // If so, the seed. + int stringcount_; // If so, how many to generate. + + ExhaustiveTester(const ExhaustiveTester&) = delete; + ExhaustiveTester& operator=(const ExhaustiveTester&) = delete; +}; + +// Runs an exhaustive test on the given parameters. +void ExhaustiveTest(int maxatoms, int maxops, + const std::vector<std::string>& alphabet, + const std::vector<std::string>& ops, + int maxstrlen, + const std::vector<std::string>& stralphabet, + const std::string& wrapper, + const std::string& topwrapper); + +// Runs an exhaustive test using the given parameters and +// the basic egrep operators. +void EgrepTest(int maxatoms, int maxops, const std::string& alphabet, + int maxstrlen, const std::string& stralphabet, + const std::string& wrapper); + +} // namespace re2 + +#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H_ diff --git a/contrib/libs/re2/re2/testing/filtered_re2_test.cc b/contrib/libs/re2/re2/testing/filtered_re2_test.cc index 684d4356c4..073a70a745 100644 --- a/contrib/libs/re2/re2/testing/filtered_re2_test.cc +++ b/contrib/libs/re2/re2/testing/filtered_re2_test.cc @@ -1,340 +1,340 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -#include <stddef.h> -#include <algorithm> -#include <memory> -#include <string> -#include <vector> -#include <utility> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/filtered_re2.h" -#include "re2/re2.h" - -namespace re2 { - -struct FilterTestVars { - FilterTestVars() {} - explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {} - - std::vector<std::string> atoms; - std::vector<int> atom_indices; - std::vector<int> matches; - RE2::Options opts; - FilteredRE2 f; -}; - -TEST(FilteredRE2Test, EmptyTest) { - FilterTestVars v; - - v.f.Compile(&v.atoms); - EXPECT_EQ(0, v.atoms.size()); - - // Compile has no effect at all when called before Add: it will not - // record that it has been called and it will not clear the vector. - // The second point does not matter here, but the first point means - // that an error will be logged during the call to AllMatches. - v.f.AllMatches("foo", v.atom_indices, &v.matches); - EXPECT_EQ(0, v.matches.size()); -} - -TEST(FilteredRE2Test, SmallOrTest) { - FilterTestVars v(4); // override the minimum atom length - int id; - v.f.Add("(foo|bar)", v.opts, &id); - - v.f.Compile(&v.atoms); - EXPECT_EQ(0, v.atoms.size()); - - v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); - EXPECT_EQ(1, v.matches.size()); - EXPECT_EQ(id, v.matches[0]); -} - -TEST(FilteredRE2Test, SmallLatinTest) { - FilterTestVars v; - int id; - - v.opts.set_encoding(RE2::Options::EncodingLatin1); - v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); - v.f.Compile(&v.atoms); - EXPECT_EQ(1, v.atoms.size()); - EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); - - v.atom_indices.push_back(0); - v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); - EXPECT_EQ(1, v.matches.size()); - EXPECT_EQ(id, v.matches[0]); -} - -struct AtomTest { - const char* testname; - // If any test needs more than this many regexps or atoms, increase - // the size of the corresponding array. 
- const char* regexps[20]; - const char* atoms[20]; -}; - -AtomTest atom_tests[] = { - { - // This test checks to make sure empty patterns are allowed. - "CheckEmptyPattern", - {""}, - {} - }, { - // This test checks that all atoms of length greater than min length - // are found, and no atoms that are of smaller length are found. - "AllAtomsGtMinLengthFound", { - "(abc123|def456|ghi789).*mnop[x-z]+", - "abc..yyy..zz", - "mnmnpp[a-z]+PPP" - }, { - "abc123", - "def456", - "ghi789", - "mnop", - "abc", - "yyy", - "mnmnpp", - "ppp" - } - }, { - // Test to make sure that any atoms that have another atom as a - // substring in an OR are removed; that is, only the shortest - // substring is kept. - "SubstrAtomRemovesSuperStrInOr", { - "(abc123|abc|ghi789|abc1234).*[x-z]+", - "abcd..yyy..yyyzzz", - "mnmnpp[a-z]+PPP" - }, { - "abc", - "ghi789", - "abcd", - "yyy", - "yyyzzz", - "mnmnpp", - "ppp" - } - }, { - // Test character class expansion. - "CharClassExpansion", { - "m[a-c][d-f]n.*[x-z]+", - "[x-y]bcde[ab]" - }, { - "madn", "maen", "mafn", - "mbdn", "mben", "mbfn", - "mcdn", "mcen", "mcfn", - "xbcdea", "xbcdeb", - "ybcdea", "ybcdeb" - } - }, { - // Test upper/lower of non-ASCII. 
- "UnicodeLower", { - "(?i)ΔδΠϖπΣςσ", - "ΛΜΝΟΠ", - "ψρστυ", - }, { - "δδπππσσσ", - "λμνοπ", - "ψρστυ", - }, - }, -}; - -void AddRegexpsAndCompile(const char* regexps[], - size_t n, - struct FilterTestVars* v) { - for (size_t i = 0; i < n; i++) { - int id; - v->f.Add(regexps[i], v->opts, &id); - } - v->f.Compile(&v->atoms); -} - -bool CheckExpectedAtoms(const char* atoms[], - size_t n, - const char* testname, - struct FilterTestVars* v) { - std::vector<std::string> expected; - for (size_t i = 0; i < n; i++) - expected.push_back(atoms[i]); - - bool pass = expected.size() == v->atoms.size(); - - std::sort(v->atoms.begin(), v->atoms.end()); - std::sort(expected.begin(), expected.end()); - for (size_t i = 0; pass && i < n; i++) - pass = pass && expected[i] == v->atoms[i]; - - if (!pass) { - LOG(ERROR) << "Failed " << testname; - LOG(ERROR) << "Expected #atoms = " << expected.size(); - for (size_t i = 0; i < expected.size(); i++) - LOG(ERROR) << expected[i]; - LOG(ERROR) << "Found #atoms = " << v->atoms.size(); - for (size_t i = 0; i < v->atoms.size(); i++) - LOG(ERROR) << v->atoms[i]; - } - - return pass; -} - -TEST(FilteredRE2Test, AtomTests) { - int nfail = 0; - for (size_t i = 0; i < arraysize(atom_tests); i++) { - FilterTestVars v; - AtomTest* t = &atom_tests[i]; - size_t nregexp, natom; - for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) - if (t->regexps[nregexp] == NULL) - break; - for (natom = 0; natom < arraysize(t->atoms); natom++) - if (t->atoms[natom] == NULL) - break; - AddRegexpsAndCompile(t->regexps, nregexp, &v); - if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) - nfail++; - } - EXPECT_EQ(0, nfail); -} - -void FindAtomIndices(const std::vector<std::string>& atoms, - const std::vector<std::string>& matched_atoms, - std::vector<int>* atom_indices) { - atom_indices->clear(); - for (size_t i = 0; i < matched_atoms.size(); i++) { - for (size_t j = 0; j < atoms.size(); j++) { - if (matched_atoms[i] == atoms[j]) { - 
atom_indices->push_back(static_cast<int>(j)); - break; - } - } - } -} - -TEST(FilteredRE2Test, MatchEmptyPattern) { - FilterTestVars v; - AtomTest* t = &atom_tests[0]; - // We are using the regexps used in one of the atom tests - // for this test. Adding the EXPECT here to make sure - // the index we use for the test is for the correct test. - EXPECT_EQ("CheckEmptyPattern", std::string(t->testname)); - size_t nregexp; - for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) - if (t->regexps[nregexp] == NULL) - break; - AddRegexpsAndCompile(t->regexps, nregexp, &v); - std::string text = "0123"; - std::vector<int> atom_ids; - std::vector<int> matching_regexps; - EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); -} - -TEST(FilteredRE2Test, MatchTests) { - FilterTestVars v; - AtomTest* t = &atom_tests[2]; - // We are using the regexps used in one of the atom tests - // for this test. - EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname)); - size_t nregexp; - for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) - if (t->regexps[nregexp] == NULL) - break; - AddRegexpsAndCompile(t->regexps, nregexp, &v); - - std::string text = "abc121212xyz"; - // atoms = abc - std::vector<int> atom_ids; - std::vector<std::string> atoms; - atoms.push_back("abc"); - FindAtomIndices(v.atoms, atoms, &atom_ids); - std::vector<int> matching_regexps; - v.f.AllMatches(text, atom_ids, &matching_regexps); - EXPECT_EQ(1, matching_regexps.size()); - - text = "abc12312yyyzzz"; - atoms.clear(); - atoms.push_back("abc"); - atoms.push_back("yyy"); - atoms.push_back("yyyzzz"); - FindAtomIndices(v.atoms, atoms, &atom_ids); - v.f.AllMatches(text, atom_ids, &matching_regexps); - EXPECT_EQ(1, matching_regexps.size()); - - text = "abcd12yyy32yyyzzz"; - atoms.clear(); - atoms.push_back("abc"); - atoms.push_back("abcd"); - atoms.push_back("yyy"); - atoms.push_back("yyyzzz"); - FindAtomIndices(v.atoms, atoms, &atom_ids); - LOG(INFO) << "S: " << atom_ids.size(); - for (size_t i = 0; i 
< atom_ids.size(); i++) - LOG(INFO) << "i: " << i << " : " << atom_ids[i]; - v.f.AllMatches(text, atom_ids, &matching_regexps); - EXPECT_EQ(2, matching_regexps.size()); -} - -TEST(FilteredRE2Test, EmptyStringInStringSetBug) { - // Bug due to find() finding "" at the start of everything in a string - // set and thus SimplifyStringSet() would end up erasing everything. - // In order to test this, we have to keep PrefilterTree from discarding - // the OR entirely, so we have to make the minimum atom length zero. - - FilterTestVars v(0); // override the minimum atom length - const char* regexps[] = {"-R.+(|ADD=;AA){12}}"}; - const char* atoms[] = {"", "-r", "add=;aa", "}"}; - AddRegexpsAndCompile(regexps, arraysize(regexps), &v); - EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms), - "EmptyStringInStringSetBug", &v)); -} - -TEST(FilteredRE2Test, MoveSemantics) { - FilterTestVars v1; - int id; - v1.f.Add("foo\\d+", v1.opts, &id); - EXPECT_EQ(0, id); - v1.f.Compile(&v1.atoms); - EXPECT_EQ(1, v1.atoms.size()); - EXPECT_EQ("foo", v1.atoms[0]); - v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); - EXPECT_EQ(1, v1.matches.size()); - EXPECT_EQ(0, v1.matches[0]); - v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); - EXPECT_EQ(0, v1.matches.size()); - - // The moved-to object should do what the moved-from object did. - FilterTestVars v2; - v2.f = std::move(v1.f); - v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches); - EXPECT_EQ(1, v2.matches.size()); - EXPECT_EQ(0, v2.matches[0]); - v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches); - EXPECT_EQ(0, v2.matches.size()); - - // The moved-from object should have been reset and be reusable. 
- v1.f.Add("bar\\d+", v1.opts, &id); - EXPECT_EQ(0, id); - v1.f.Compile(&v1.atoms); - EXPECT_EQ(1, v1.atoms.size()); - EXPECT_EQ("bar", v1.atoms[0]); - v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); - EXPECT_EQ(0, v1.matches.size()); - v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); - EXPECT_EQ(1, v1.matches.size()); - EXPECT_EQ(0, v1.matches[0]); - - // Verify that "overwriting" works and also doesn't leak memory. - // (The latter will need a leak detector such as LeakSanitizer.) - v1.f = std::move(v2.f); - v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); - EXPECT_EQ(1, v1.matches.size()); - EXPECT_EQ(0, v1.matches[0]); - v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); - EXPECT_EQ(0, v1.matches.size()); -} - -} // namespace re2 +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <stddef.h> +#include <algorithm> +#include <memory> +#include <string> +#include <vector> +#include <utility> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/filtered_re2.h" +#include "re2/re2.h" + +namespace re2 { + +struct FilterTestVars { + FilterTestVars() {} + explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {} + + std::vector<std::string> atoms; + std::vector<int> atom_indices; + std::vector<int> matches; + RE2::Options opts; + FilteredRE2 f; +}; + +TEST(FilteredRE2Test, EmptyTest) { + FilterTestVars v; + + v.f.Compile(&v.atoms); + EXPECT_EQ(0, v.atoms.size()); + + // Compile has no effect at all when called before Add: it will not + // record that it has been called and it will not clear the vector. + // The second point does not matter here, but the first point means + // that an error will be logged during the call to AllMatches. 
+ v.f.AllMatches("foo", v.atom_indices, &v.matches); + EXPECT_EQ(0, v.matches.size()); +} + +TEST(FilteredRE2Test, SmallOrTest) { + FilterTestVars v(4); // override the minimum atom length + int id; + v.f.Add("(foo|bar)", v.opts, &id); + + v.f.Compile(&v.atoms); + EXPECT_EQ(0, v.atoms.size()); + + v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +TEST(FilteredRE2Test, SmallLatinTest) { + FilterTestVars v; + int id; + + v.opts.set_encoding(RE2::Options::EncodingLatin1); + v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); + v.f.Compile(&v.atoms); + EXPECT_EQ(1, v.atoms.size()); + EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); + + v.atom_indices.push_back(0); + v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +struct AtomTest { + const char* testname; + // If any test needs more than this many regexps or atoms, increase + // the size of the corresponding array. + const char* regexps[20]; + const char* atoms[20]; +}; + +AtomTest atom_tests[] = { + { + // This test checks to make sure empty patterns are allowed. + "CheckEmptyPattern", + {""}, + {} + }, { + // This test checks that all atoms of length greater than min length + // are found, and no atoms that are of smaller length are found. + "AllAtomsGtMinLengthFound", { + "(abc123|def456|ghi789).*mnop[x-z]+", + "abc..yyy..zz", + "mnmnpp[a-z]+PPP" + }, { + "abc123", + "def456", + "ghi789", + "mnop", + "abc", + "yyy", + "mnmnpp", + "ppp" + } + }, { + // Test to make sure that any atoms that have another atom as a + // substring in an OR are removed; that is, only the shortest + // substring is kept. + "SubstrAtomRemovesSuperStrInOr", { + "(abc123|abc|ghi789|abc1234).*[x-z]+", + "abcd..yyy..yyyzzz", + "mnmnpp[a-z]+PPP" + }, { + "abc", + "ghi789", + "abcd", + "yyy", + "yyyzzz", + "mnmnpp", + "ppp" + } + }, { + // Test character class expansion. 
+ "CharClassExpansion", { + "m[a-c][d-f]n.*[x-z]+", + "[x-y]bcde[ab]" + }, { + "madn", "maen", "mafn", + "mbdn", "mben", "mbfn", + "mcdn", "mcen", "mcfn", + "xbcdea", "xbcdeb", + "ybcdea", "ybcdeb" + } + }, { + // Test upper/lower of non-ASCII. + "UnicodeLower", { + "(?i)ΔδΠϖπΣςσ", + "ΛΜΝΟΠ", + "ψρστυ", + }, { + "δδπππσσσ", + "λμνοπ", + "ψρστυ", + }, + }, +}; + +void AddRegexpsAndCompile(const char* regexps[], + size_t n, + struct FilterTestVars* v) { + for (size_t i = 0; i < n; i++) { + int id; + v->f.Add(regexps[i], v->opts, &id); + } + v->f.Compile(&v->atoms); +} + +bool CheckExpectedAtoms(const char* atoms[], + size_t n, + const char* testname, + struct FilterTestVars* v) { + std::vector<std::string> expected; + for (size_t i = 0; i < n; i++) + expected.push_back(atoms[i]); + + bool pass = expected.size() == v->atoms.size(); + + std::sort(v->atoms.begin(), v->atoms.end()); + std::sort(expected.begin(), expected.end()); + for (size_t i = 0; pass && i < n; i++) + pass = pass && expected[i] == v->atoms[i]; + + if (!pass) { + LOG(ERROR) << "Failed " << testname; + LOG(ERROR) << "Expected #atoms = " << expected.size(); + for (size_t i = 0; i < expected.size(); i++) + LOG(ERROR) << expected[i]; + LOG(ERROR) << "Found #atoms = " << v->atoms.size(); + for (size_t i = 0; i < v->atoms.size(); i++) + LOG(ERROR) << v->atoms[i]; + } + + return pass; +} + +TEST(FilteredRE2Test, AtomTests) { + int nfail = 0; + for (size_t i = 0; i < arraysize(atom_tests); i++) { + FilterTestVars v; + AtomTest* t = &atom_tests[i]; + size_t nregexp, natom; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + for (natom = 0; natom < arraysize(t->atoms); natom++) + if (t->atoms[natom] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) + nfail++; + } + EXPECT_EQ(0, nfail); +} + +void FindAtomIndices(const std::vector<std::string>& atoms, + const 
std::vector<std::string>& matched_atoms, + std::vector<int>* atom_indices) { + atom_indices->clear(); + for (size_t i = 0; i < matched_atoms.size(); i++) { + for (size_t j = 0; j < atoms.size(); j++) { + if (matched_atoms[i] == atoms[j]) { + atom_indices->push_back(static_cast<int>(j)); + break; + } + } + } +} + +TEST(FilteredRE2Test, MatchEmptyPattern) { + FilterTestVars v; + AtomTest* t = &atom_tests[0]; + // We are using the regexps used in one of the atom tests + // for this test. Adding the EXPECT here to make sure + // the index we use for the test is for the correct test. + EXPECT_EQ("CheckEmptyPattern", std::string(t->testname)); + size_t nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + std::string text = "0123"; + std::vector<int> atom_ids; + std::vector<int> matching_regexps; + EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); +} + +TEST(FilteredRE2Test, MatchTests) { + FilterTestVars v; + AtomTest* t = &atom_tests[2]; + // We are using the regexps used in one of the atom tests + // for this test. 
+ EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname)); + size_t nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + + std::string text = "abc121212xyz"; + // atoms = abc + std::vector<int> atom_ids; + std::vector<std::string> atoms; + atoms.push_back("abc"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + std::vector<int> matching_regexps; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abc12312yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abcd12yyy32yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("abcd"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + LOG(INFO) << "S: " << atom_ids.size(); + for (size_t i = 0; i < atom_ids.size(); i++) + LOG(INFO) << "i: " << i << " : " << atom_ids[i]; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(2, matching_regexps.size()); +} + +TEST(FilteredRE2Test, EmptyStringInStringSetBug) { + // Bug due to find() finding "" at the start of everything in a string + // set and thus SimplifyStringSet() would end up erasing everything. + // In order to test this, we have to keep PrefilterTree from discarding + // the OR entirely, so we have to make the minimum atom length zero. 
+ + FilterTestVars v(0); // override the minimum atom length + const char* regexps[] = {"-R.+(|ADD=;AA){12}}"}; + const char* atoms[] = {"", "-r", "add=;aa", "}"}; + AddRegexpsAndCompile(regexps, arraysize(regexps), &v); + EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms), + "EmptyStringInStringSetBug", &v)); +} + +TEST(FilteredRE2Test, MoveSemantics) { + FilterTestVars v1; + int id; + v1.f.Add("foo\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("foo", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + + // The moved-to object should do what the moved-from object did. + FilterTestVars v2; + v2.f = std::move(v1.f); + v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches); + EXPECT_EQ(1, v2.matches.size()); + EXPECT_EQ(0, v2.matches[0]); + v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches); + EXPECT_EQ(0, v2.matches.size()); + + // The moved-from object should have been reset and be reusable. + v1.f.Add("bar\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("bar", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) 
+ v1.f = std::move(v2.f); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/mimics_pcre_test.cc b/contrib/libs/re2/re2/testing/mimics_pcre_test.cc index ac2612e71a..cb21aef726 100644 --- a/contrib/libs/re2/re2/testing/mimics_pcre_test.cc +++ b/contrib/libs/re2/re2/testing/mimics_pcre_test.cc @@ -1,77 +1,77 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/prog.h" -#include "re2/regexp.h" - -namespace re2 { - -struct PCRETest { - const char* regexp; - bool should_match; -}; - -static PCRETest tests[] = { - // Most things should behave exactly. - { "abc", true }, - { "(a|b)c", true }, - { "(a*|b)c", true }, - { "(a|b*)c", true }, - { "a(b|c)d", true }, - { "a(()|())c", true }, - { "ab*c", true }, - { "ab+c", true }, - { "a(b*|c*)d", true }, - { "\\W", true }, - { "\\W{1,2}", true }, - { "\\d", true }, - - // Check that repeated empty strings do not. - { "(a*)*", false }, - { "x(a*)*y", false }, - { "(a*)+", false }, - { "(a+)*", true }, - { "(a+)+", true }, - { "(a+)+", true }, - - // \v is the only character class that shouldn't. - { "\\b", true }, - { "\\v", false }, - { "\\d", true }, - - // The handling of ^ in multi-line mode is different, as is - // the handling of $ in single-line mode. (Both involve - // boundary cases if the string ends with \n.) 
- { "\\A", true }, - { "\\z", true }, - { "(?m)^", false }, - { "(?m)$", true }, - { "(?-m)^", true }, - { "(?-m)$", false }, // In PCRE, == \Z - { "(?m)\\A", true }, - { "(?m)\\z", true }, - { "(?-m)\\A", true }, - { "(?-m)\\z", true }, -}; - -TEST(MimicsPCRE, SimpleTests) { - for (size_t i = 0; i < arraysize(tests); i++) { - const PCRETest& t = tests[i]; - for (size_t j = 0; j < 2; j++) { - Regexp::ParseFlags flags = Regexp::LikePerl; - if (j == 0) - flags = flags | Regexp::Latin1; - Regexp* re = Regexp::Parse(t.regexp, flags, NULL); - ASSERT_TRUE(re != NULL) << " " << t.regexp; - ASSERT_EQ(t.should_match, re->MimicsPCRE()) - << " " << t.regexp << " " - << (j == 0 ? "latin1" : "utf"); - re->Decref(); - } - } -} - -} // namespace re2 +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/prog.h" +#include "re2/regexp.h" + +namespace re2 { + +struct PCRETest { + const char* regexp; + bool should_match; +}; + +static PCRETest tests[] = { + // Most things should behave exactly. + { "abc", true }, + { "(a|b)c", true }, + { "(a*|b)c", true }, + { "(a|b*)c", true }, + { "a(b|c)d", true }, + { "a(()|())c", true }, + { "ab*c", true }, + { "ab+c", true }, + { "a(b*|c*)d", true }, + { "\\W", true }, + { "\\W{1,2}", true }, + { "\\d", true }, + + // Check that repeated empty strings do not. + { "(a*)*", false }, + { "x(a*)*y", false }, + { "(a*)+", false }, + { "(a+)*", true }, + { "(a+)+", true }, + { "(a+)+", true }, + + // \v is the only character class that shouldn't. + { "\\b", true }, + { "\\v", false }, + { "\\d", true }, + + // The handling of ^ in multi-line mode is different, as is + // the handling of $ in single-line mode. (Both involve + // boundary cases if the string ends with \n.) 
+ { "\\A", true }, + { "\\z", true }, + { "(?m)^", false }, + { "(?m)$", true }, + { "(?-m)^", true }, + { "(?-m)$", false }, // In PCRE, == \Z + { "(?m)\\A", true }, + { "(?m)\\z", true }, + { "(?-m)\\A", true }, + { "(?-m)\\z", true }, +}; + +TEST(MimicsPCRE, SimpleTests) { + for (size_t i = 0; i < arraysize(tests); i++) { + const PCRETest& t = tests[i]; + for (size_t j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + ASSERT_TRUE(re != NULL) << " " << t.regexp; + ASSERT_EQ(t.should_match, re->MimicsPCRE()) + << " " << t.regexp << " " + << (j == 0 ? "latin1" : "utf"); + re->Decref(); + } + } +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/null_walker.cc b/contrib/libs/re2/re2/testing/null_walker.cc index 32a2aa0d8a..adc46068e6 100644 --- a/contrib/libs/re2/re2/testing/null_walker.cc +++ b/contrib/libs/re2/re2/testing/null_walker.cc @@ -1,49 +1,49 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/regexp.h" -#include "re2/walker-inl.h" - -namespace re2 { - -// Null walker. For benchmarking the walker itself. - -class NullWalker : public Regexp::Walker<bool> { - public: - NullWalker() {} - - virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args); - - virtual bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk(), not WalkExponential(). -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - LOG(DFATAL) << "NullWalker::ShortVisit called"; -#endif - return a; - } - - private: - NullWalker(const NullWalker&) = delete; - NullWalker& operator=(const NullWalker&) = delete; -}; - -// Called after visiting re's children. 
child_args contains the return -// value from each of the children's PostVisits (i.e., whether each child -// can match an empty string). Returns whether this clause can match an -// empty string. -bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args) { - return false; -} - -// Returns whether re can match an empty string. -void Regexp::NullWalk() { - NullWalker w; - w.Walk(this, false); -} - -} // namespace re2 +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// Null walker. For benchmarking the walker itself. + +class NullWalker : public Regexp::Walker<bool> { + public: + NullWalker() {} + + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + LOG(DFATAL) << "NullWalker::ShortVisit called"; +#endif + return a; + } + + private: + NullWalker(const NullWalker&) = delete; + NullWalker& operator=(const NullWalker&) = delete; +}; + +// Called after visiting re's children. child_args contains the return +// value from each of the children's PostVisits (i.e., whether each child +// can match an empty string). Returns whether this clause can match an +// empty string. +bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args) { + return false; +} + +// Returns whether re can match an empty string. 
+void Regexp::NullWalk() { + NullWalker w; + w.Walk(this, false); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/parse_test.cc b/contrib/libs/re2/re2/testing/parse_test.cc index 6856e4e162..b1cbfdc5c8 100644 --- a/contrib/libs/re2/re2/testing/parse_test.cc +++ b/contrib/libs/re2/re2/testing/parse_test.cc @@ -1,509 +1,509 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Test parse.cc, dump.cc, and tostring.cc. - -#include <string> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/regexp.h" - -namespace re2 { - -// In the past, we used 1<<30 here and zeroed the bit later, but that -// has undefined behaviour, so now we use an internal-only flag because -// otherwise we would have to introduce a new flag value just for this. -static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar; - -struct Test { - const char* regexp; - const char* parse; - Regexp::ParseFlags flags; -}; - -static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | - Regexp::PerlX | - Regexp::PerlClasses | - Regexp::UnicodeGroups; - -static Test tests[] = { - // Base cases - { "a", "lit{a}" }, - { "a.", "cat{lit{a}dot{}}" }, - { "a.b", "cat{lit{a}dot{}lit{b}}" }, - { "ab", "str{ab}" }, - { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" }, - { "abc", "str{abc}" }, - { "a|^", "alt{lit{a}bol{}}" }, - { "a|b", "cc{0x61-0x62}" }, - { "(a)", "cap{lit{a}}" }, - { "(a)|b", "alt{cap{lit{a}}lit{b}}" }, - { "a*", "star{lit{a}}" }, - { "a+", "plus{lit{a}}" }, - { "a?", "que{lit{a}}" }, - { "a{2}", "rep{2,2 lit{a}}" }, - { "a{2,3}", "rep{2,3 lit{a}}" }, - { "a{2,}", "rep{2,-1 lit{a}}" }, - { "a*?", "nstar{lit{a}}" }, - { "a+?", "nplus{lit{a}}" }, - { "a??", "nque{lit{a}}" }, - { "a{2}?", "nrep{2,2 lit{a}}" }, - { "a{2,3}?", "nrep{2,3 lit{a}}" }, - { "a{2,}?", "nrep{2,-1 lit{a}}" }, - { "", "emp{}" }, - { "|", "alt{emp{}emp{}}" 
}, - { "|x|", "alt{emp{}lit{x}emp{}}" }, - { ".", "dot{}" }, - { "^", "bol{}" }, - { "$", "eol{}" }, - { "\\|", "lit{|}" }, - { "\\(", "lit{(}" }, - { "\\)", "lit{)}" }, - { "\\*", "lit{*}" }, - { "\\+", "lit{+}" }, - { "\\?", "lit{?}" }, - { "{", "lit{{}" }, - { "}", "lit{}}" }, - { "\\.", "lit{.}" }, - { "\\^", "lit{^}" }, - { "\\$", "lit{$}" }, - { "\\\\", "lit{\\}" }, - { "[ace]", "cc{0x61 0x63 0x65}" }, - { "[abc]", "cc{0x61-0x63}" }, - { "[a-z]", "cc{0x61-0x7a}" }, - { "[a]", "lit{a}" }, - { "\\-", "lit{-}" }, - { "-", "lit{-}" }, - { "\\_", "lit{_}" }, - - // Posix and Perl extensions - { "[[:lower:]]", "cc{0x61-0x7a}" }, - { "[a-z]", "cc{0x61-0x7a}" }, - { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, - { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, - { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, - { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, - { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, - { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, - { "\\d", "cc{0x30-0x39}" }, - { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" }, - { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" }, - { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" }, - { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" }, - { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" }, - { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" }, - { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, - { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" }, - { "\\C", "byte{}" }, - - // Unicode, negatives, and a double negative. - { "\\p{Braille}", "cc{0x2800-0x28ff}" }, - { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, - { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, - { "\\P{^Braille}", "cc{0x2800-0x28ff}" }, - - // More interesting regular expressions. 
- { "a{,2}", "str{a{,2}}" }, - { "\\.\\^\\$\\\\", "str{.^$\\}" }, - { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" }, - { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, - { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8 - { "a*{", "cat{star{lit{a}}lit{{}}" }, - - // Test precedences - { "(?:ab)*", "star{str{ab}}" }, - { "(ab)*", "star{cap{str{ab}}}" }, - { "ab|cd", "alt{str{ab}str{cd}}" }, - { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, - - // Test squashing of **, ++, ?? et cetera. - { "(?:(?:a)*)*", "star{lit{a}}" }, - { "(?:(?:a)+)+", "plus{lit{a}}" }, - { "(?:(?:a)?)?", "que{lit{a}}" }, - { "(?:(?:a)*)+", "star{lit{a}}" }, - { "(?:(?:a)*)?", "star{lit{a}}" }, - { "(?:(?:a)+)*", "star{lit{a}}" }, - { "(?:(?:a)+)?", "star{lit{a}}" }, - { "(?:(?:a)?)*", "star{lit{a}}" }, - { "(?:(?:a)?)+", "star{lit{a}}" }, - - // Test flattening. - { "(?:a)", "lit{a}" }, - { "(?:ab)(?:cd)", "str{abcd}" }, - { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, - { "a|c", "cc{0x61 0x63}" }, - { "a|[cd]", "cc{0x61 0x63-0x64}" }, - { "a|.", "dot{}" }, - { "[ab]|c", "cc{0x61-0x63}" }, - { "[ab]|[cd]", "cc{0x61-0x64}" }, - { "[ab]|.", "dot{}" }, - { ".|c", "dot{}" }, - { ".|[cd]", "dot{}" }, - { ".|.", "dot{}" }, - - // Test Perl quoted literals - { "\\Q+|*?{[\\E", "str{+|*?{[}" }, - { "\\Q+\\E+", "plus{lit{+}}" }, - { "\\Q\\\\E", "lit{\\}" }, - { "\\Q\\\\\\E", "str{\\\\}" }, - { "\\Qa\\E*", "star{lit{a}}" }, - { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" }, - { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" }, - - // Test Perl \A and \z - { "(?m)^", "bol{}" }, - { "(?m)$", "eol{}" }, - { "(?-m)^", "bot{}" }, - { "(?-m)$", "eot{}" }, - { "(?m)\\A", "bot{}" }, - { "(?m)\\z", "eot{\\z}" }, - { "(?-m)\\A", "bot{}" }, - { "(?-m)\\z", "eot{\\z}" }, - - // Test named captures - { "(?P<name>a)", "cap{name:lit{a}}" }, +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// Test parse.cc, dump.cc, and tostring.cc. + +#include <string> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/regexp.h" + +namespace re2 { + +// In the past, we used 1<<30 here and zeroed the bit later, but that +// has undefined behaviour, so now we use an internal-only flag because +// otherwise we would have to introduce a new flag value just for this. +static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar; + +struct Test { + const char* regexp; + const char* parse; + Regexp::ParseFlags flags; +}; + +static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | + Regexp::PerlX | + Regexp::PerlClasses | + Regexp::UnicodeGroups; + +static Test tests[] = { + // Base cases + { "a", "lit{a}" }, + { "a.", "cat{lit{a}dot{}}" }, + { "a.b", "cat{lit{a}dot{}lit{b}}" }, + { "ab", "str{ab}" }, + { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" }, + { "abc", "str{abc}" }, + { "a|^", "alt{lit{a}bol{}}" }, + { "a|b", "cc{0x61-0x62}" }, + { "(a)", "cap{lit{a}}" }, + { "(a)|b", "alt{cap{lit{a}}lit{b}}" }, + { "a*", "star{lit{a}}" }, + { "a+", "plus{lit{a}}" }, + { "a?", "que{lit{a}}" }, + { "a{2}", "rep{2,2 lit{a}}" }, + { "a{2,3}", "rep{2,3 lit{a}}" }, + { "a{2,}", "rep{2,-1 lit{a}}" }, + { "a*?", "nstar{lit{a}}" }, + { "a+?", "nplus{lit{a}}" }, + { "a??", "nque{lit{a}}" }, + { "a{2}?", "nrep{2,2 lit{a}}" }, + { "a{2,3}?", "nrep{2,3 lit{a}}" }, + { "a{2,}?", "nrep{2,-1 lit{a}}" }, + { "", "emp{}" }, + { "|", "alt{emp{}emp{}}" }, + { "|x|", "alt{emp{}lit{x}emp{}}" }, + { ".", "dot{}" }, + { "^", "bol{}" }, + { "$", "eol{}" }, + { "\\|", "lit{|}" }, + { "\\(", "lit{(}" }, + { "\\)", "lit{)}" }, + { "\\*", "lit{*}" }, + { "\\+", "lit{+}" }, + { "\\?", "lit{?}" }, + { "{", "lit{{}" }, + { "}", "lit{}}" }, + { "\\.", "lit{.}" }, + { "\\^", "lit{^}" }, + { "\\$", "lit{$}" }, + { "\\\\", "lit{\\}" }, + { "[ace]", "cc{0x61 0x63 0x65}" }, + { "[abc]", "cc{0x61-0x63}" }, + { "[a-z]", "cc{0x61-0x7a}" }, + { "[a]", "lit{a}" }, + { "\\-", 
"lit{-}" }, + { "-", "lit{-}" }, + { "\\_", "lit{_}" }, + + // Posix and Perl extensions + { "[[:lower:]]", "cc{0x61-0x7a}" }, + { "[a-z]", "cc{0x61-0x7a}" }, + { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, + { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, + { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "\\d", "cc{0x30-0x39}" }, + { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" }, + { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" }, + { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" }, + { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" }, + { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" }, + { "\\C", "byte{}" }, + + // Unicode, negatives, and a double negative. + { "\\p{Braille}", "cc{0x2800-0x28ff}" }, + { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, + { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, + { "\\P{^Braille}", "cc{0x2800-0x28ff}" }, + + // More interesting regular expressions. + { "a{,2}", "str{a{,2}}" }, + { "\\.\\^\\$\\\\", "str{.^$\\}" }, + { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" }, + { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, + { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8 + { "a*{", "cat{star{lit{a}}lit{{}}" }, + + // Test precedences + { "(?:ab)*", "star{str{ab}}" }, + { "(ab)*", "star{cap{str{ab}}}" }, + { "ab|cd", "alt{str{ab}str{cd}}" }, + { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, + + // Test squashing of **, ++, ?? et cetera. 
+ { "(?:(?:a)*)*", "star{lit{a}}" }, + { "(?:(?:a)+)+", "plus{lit{a}}" }, + { "(?:(?:a)?)?", "que{lit{a}}" }, + { "(?:(?:a)*)+", "star{lit{a}}" }, + { "(?:(?:a)*)?", "star{lit{a}}" }, + { "(?:(?:a)+)*", "star{lit{a}}" }, + { "(?:(?:a)+)?", "star{lit{a}}" }, + { "(?:(?:a)?)*", "star{lit{a}}" }, + { "(?:(?:a)?)+", "star{lit{a}}" }, + + // Test flattening. + { "(?:a)", "lit{a}" }, + { "(?:ab)(?:cd)", "str{abcd}" }, + { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, + { "a|c", "cc{0x61 0x63}" }, + { "a|[cd]", "cc{0x61 0x63-0x64}" }, + { "a|.", "dot{}" }, + { "[ab]|c", "cc{0x61-0x63}" }, + { "[ab]|[cd]", "cc{0x61-0x64}" }, + { "[ab]|.", "dot{}" }, + { ".|c", "dot{}" }, + { ".|[cd]", "dot{}" }, + { ".|.", "dot{}" }, + + // Test Perl quoted literals + { "\\Q+|*?{[\\E", "str{+|*?{[}" }, + { "\\Q+\\E+", "plus{lit{+}}" }, + { "\\Q\\\\E", "lit{\\}" }, + { "\\Q\\\\\\E", "str{\\\\}" }, + { "\\Qa\\E*", "star{lit{a}}" }, + { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" }, + { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" }, + + // Test Perl \A and \z + { "(?m)^", "bol{}" }, + { "(?m)$", "eol{}" }, + { "(?-m)^", "bot{}" }, + { "(?-m)$", "eot{}" }, + { "(?m)\\A", "bot{}" }, + { "(?m)\\z", "eot{\\z}" }, + { "(?-m)\\A", "bot{}" }, + { "(?-m)\\z", "eot{\\z}" }, + + // Test named captures + { "(?P<name>a)", "cap{name:lit{a}}" }, { "(?P<中文>a)", "cap{中文:lit{a}}" }, - - // Case-folded literals - { "[Aa]", "litfold{a}" }, - - // Strings - { "abcde", "str{abcde}" }, - { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" }, - - // Reported bug involving \n leaking in despite use of NeverNL. 
- { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, - { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, - { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, - { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags }, - { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, - { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, - { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags }, - { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, - { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, - { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags }, - { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, - { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, - { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, - { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, - { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, - { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, - { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, - { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | 
Regexp::FoldCase }, - { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, - { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, - { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", - Regexp::PerlClasses }, - { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", - Regexp::PerlClasses | Regexp::FoldCase }, - { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", - Regexp::PerlClasses | Regexp::NeverNL }, - { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", - Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, - { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", - Regexp::PerlClasses }, - { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", - Regexp::PerlClasses | Regexp::FoldCase }, - { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", - Regexp::PerlClasses | Regexp::NeverNL }, - { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", - Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, - - // Bug in Regexp::ToString() that emitted [^], which - // would (obviously) fail to parse when fed back in. 
- { "[\\s\\S]", "cc{0-0x10ffff}" }, -}; - -bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { - return Regexp::Equal(a, b); -} - -void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, - const std::string& title) { - Regexp** re = new Regexp*[ntests]; - for (int i = 0; i < ntests; i++) { - RegexpStatus status; - Regexp::ParseFlags f = flags; - if (tests[i].flags != 0) { - f = tests[i].flags & ~TestZeroFlags; - } - re[i] = Regexp::Parse(tests[i].regexp, f, &status); - ASSERT_TRUE(re[i] != NULL) - << " " << tests[i].regexp << " " << status.Text(); - std::string s = re[i]->Dump(); - EXPECT_EQ(std::string(tests[i].parse), s) - << "Regexp: " << tests[i].regexp - << "\nparse: " << std::string(tests[i].parse) - << " s: " << s << " flag=" << f; - } - - for (int i = 0; i < ntests; i++) { - for (int j = 0; j < ntests; j++) { - EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse), - RegexpEqualTestingOnly(re[i], re[j])) - << "Regexp: " << tests[i].regexp << " " << tests[j].regexp; - } - } - - for (int i = 0; i < ntests; i++) - re[i]->Decref(); - delete[] re; -} - -// Test that regexps parse to expected structures. -TEST(TestParse, SimpleRegexps) { - TestParse(tests, arraysize(tests), kTestFlags, "simple"); -} - -Test foldcase_tests[] = { - { "AbCdE", "strfold{abcde}" }, - { "[Aa]", "litfold{a}" }, - { "a", "litfold{a}" }, - - // 0x17F is an old English long s (looks like an f) and folds to s. - // 0x212A is the Kelvin symbol and folds to k. - { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...] - { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, - { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, -}; - -// Test that parsing with FoldCase works. -TEST(TestParse, FoldCase) { - TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase"); -} - -Test literal_tests[] = { - { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" }, -}; - -// Test that parsing with Literal works. 
-TEST(TestParse, Literal) { - TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal"); -} - -Test matchnl_tests[] = { - { ".", "dot{}" }, - { "\n", "lit{\n}" }, - { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, - { "[a\\n]", "cc{0xa 0x61}" }, -}; - -// Test that parsing with MatchNL works. -// (Also tested above during simple cases.) -TEST(TestParse, MatchNL) { - TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL"); -} - -Test nomatchnl_tests[] = { - { ".", "cc{0-0x9 0xb-0x10ffff}" }, - { "\n", "lit{\n}" }, - { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" }, - { "[a\\n]", "cc{0xa 0x61}" }, -}; - -// Test that parsing without MatchNL works. -TEST(TestParse, NoMatchNL) { - TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL"); -} - -Test prefix_tests[] = { - { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" }, - { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" }, - { "abc|abd|aef|bcx|bcy", - "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}" - "cat{str{bc}cc{0x78-0x79}}}" }, - { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" }, - { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" }, - { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" }, - { ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" }, - { "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" }, - { "x{2}|x{2}[0-9]", - "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, - { "x{2}y|x{2}[0-9]y", - "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" }, - { "n|r|rs", - "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" }, - { "n|rs|r", - "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" }, - { "r|rs|n", - "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" }, - { "rs|r|n", - "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" }, - { "a\\C*?c|a\\C*?b", - "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" }, - { "^/a/bc|^/a/de", - "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" }, - // In the past, factoring was limited to kFactorAlternationMaxDepth (8). 
- { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa", - "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" - "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" - "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" - "lit{a}}}}}}}}}}}}}}}}}}}" }, - { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones", - "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}" - "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}" - "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" }, -}; - -// Test that prefix factoring works. -TEST(TestParse, Prefix) { - TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix"); -} - -Test nested_tests[] = { - { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))", - "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" }, - { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})", - "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" }, - { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})", - "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" }, - { "((((((x{2}){2}){2}){5}){5}){5})", - "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" }, -}; - -// Test that nested repetition works. 
-TEST(TestParse, Nested) { - TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested"); -} - -// Invalid regular expressions -const char* badtests[] = { - "(", - ")", - "(a", - "(a|b|", - "(a|b", - "[a-z", - "([a-z)", - "x{1001}", - "\xff", // Invalid UTF-8 - "[\xff]", - "[\\\xff]", - "\\\xff", - "(?P<name>a", - "(?P<name>", - "(?P<name", - "(?P<x y>a)", - "(?P<>a)", - "[a-Z]", - "(?i)[a-Z]", - "a{100000}", - "a{100000,}", - "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", - "(((x{7}){11}){13})", - "\\Q\\E*", -}; - -// Valid in Perl, bad in POSIX -const char* only_perl[] = { - "[a-b-c]", - "\\Qabc\\E", - "\\Q*+?{[\\E", - "\\Q\\\\E", - "\\Q\\\\\\E", - "\\Q\\\\\\\\E", - "\\Q\\\\\\\\\\E", - "(?:a)", - "(?P<name>a)", -}; - -// Valid in POSIX, bad in Perl. -const char* only_posix[] = { - "a++", - "a**", - "a?*", - "a+*", - "a{1}*", -}; - -// Test that parser rejects bad regexps. -TEST(TestParse, InvalidRegexps) { - for (size_t i = 0; i < arraysize(badtests); i++) { - ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL) - << " " << badtests[i]; - ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL) - << " " << badtests[i]; - } - for (size_t i = 0; i < arraysize(only_posix); i++) { - ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL) - << " " << only_posix[i]; - Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL); - ASSERT_TRUE(re != NULL) << " " << only_posix[i]; - re->Decref(); - } - for (size_t i = 0; i < arraysize(only_perl); i++) { - ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL) - << " " << only_perl[i]; - Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL); - ASSERT_TRUE(re != NULL) << " " << only_perl[i]; - re->Decref(); - } -} - -// Test that ToString produces original regexp or equivalent one. 
-TEST(TestToString, EquivalentParse) { - for (size_t i = 0; i < arraysize(tests); i++) { - RegexpStatus status; - Regexp::ParseFlags f = kTestFlags; - if (tests[i].flags != 0) { - f = tests[i].flags & ~TestZeroFlags; - } - Regexp* re = Regexp::Parse(tests[i].regexp, f, &status); - ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text(); - std::string s = re->Dump(); - EXPECT_EQ(std::string(tests[i].parse), s) - << "Regexp: " << tests[i].regexp - << "\nparse: " << std::string(tests[i].parse) - << " s: " << s << " flag=" << f; - std::string t = re->ToString(); - if (t != tests[i].regexp) { - // If ToString didn't return the original regexp, - // it must have found one with fewer parens. - // Unfortunately we can't check the length here, because - // ToString produces "\\{" for a literal brace, - // but "{" is a shorter equivalent. - // ASSERT_LT(t.size(), strlen(tests[i].regexp)) - // << " t=" << t << " regexp=" << tests[i].regexp; - - // Test that if we parse the new regexp we get the same structure. - Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); - ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text(); - std::string ss = nre->Dump(); - std::string tt = nre->ToString(); - if (s != ss || t != tt) - LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t; - EXPECT_EQ(s, ss); - EXPECT_EQ(t, tt); - nre->Decref(); - } - re->Decref(); - } -} - -// Test that capture error args are correct. 
-TEST(NamedCaptures, ErrorArgs) { - RegexpStatus status; - Regexp* re; - - re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status); - EXPECT_TRUE(re == NULL); - EXPECT_EQ(status.code(), kRegexpBadNamedCapture); - EXPECT_EQ(status.error_arg(), "(?P<name"); - - re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status); - EXPECT_TRUE(re == NULL); - EXPECT_EQ(status.code(), kRegexpBadNamedCapture); - EXPECT_EQ(status.error_arg(), "(?P<space bar>"); -} - -} // namespace re2 + + // Case-folded literals + { "[Aa]", "litfold{a}" }, + + // Strings + { "abcde", "str{abcde}" }, + { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" }, + + // Reported bug involving \n leaking in despite use of NeverNL. + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ 
\t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::FoldCase }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::FoldCase }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, + + // Bug in Regexp::ToString() that emitted [^], which + // would (obviously) fail to parse when fed back in. 
+ { "[\\s\\S]", "cc{0-0x10ffff}" }, +}; + +bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { + return Regexp::Equal(a, b); +} + +void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, + const std::string& title) { + Regexp** re = new Regexp*[ntests]; + for (int i = 0; i < ntests; i++) { + RegexpStatus status; + Regexp::ParseFlags f = flags; + if (tests[i].flags != 0) { + f = tests[i].flags & ~TestZeroFlags; + } + re[i] = Regexp::Parse(tests[i].regexp, f, &status); + ASSERT_TRUE(re[i] != NULL) + << " " << tests[i].regexp << " " << status.Text(); + std::string s = re[i]->Dump(); + EXPECT_EQ(std::string(tests[i].parse), s) + << "Regexp: " << tests[i].regexp + << "\nparse: " << std::string(tests[i].parse) + << " s: " << s << " flag=" << f; + } + + for (int i = 0; i < ntests; i++) { + for (int j = 0; j < ntests; j++) { + EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse), + RegexpEqualTestingOnly(re[i], re[j])) + << "Regexp: " << tests[i].regexp << " " << tests[j].regexp; + } + } + + for (int i = 0; i < ntests; i++) + re[i]->Decref(); + delete[] re; +} + +// Test that regexps parse to expected structures. +TEST(TestParse, SimpleRegexps) { + TestParse(tests, arraysize(tests), kTestFlags, "simple"); +} + +Test foldcase_tests[] = { + { "AbCdE", "strfold{abcde}" }, + { "[Aa]", "litfold{a}" }, + { "a", "litfold{a}" }, + + // 0x17F is an old English long s (looks like an f) and folds to s. + // 0x212A is the Kelvin symbol and folds to k. + { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...] + { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, +}; + +// Test that parsing with FoldCase works. +TEST(TestParse, FoldCase) { + TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase"); +} + +Test literal_tests[] = { + { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" }, +}; + +// Test that parsing with Literal works. 
+TEST(TestParse, Literal) { + TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal"); +} + +Test matchnl_tests[] = { + { ".", "dot{}" }, + { "\n", "lit{\n}" }, + { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, + { "[a\\n]", "cc{0xa 0x61}" }, +}; + +// Test that parsing with MatchNL works. +// (Also tested above during simple cases.) +TEST(TestParse, MatchNL) { + TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL"); +} + +Test nomatchnl_tests[] = { + { ".", "cc{0-0x9 0xb-0x10ffff}" }, + { "\n", "lit{\n}" }, + { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" }, + { "[a\\n]", "cc{0xa 0x61}" }, +}; + +// Test that parsing without MatchNL works. +TEST(TestParse, NoMatchNL) { + TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL"); +} + +Test prefix_tests[] = { + { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" }, + { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" }, + { "abc|abd|aef|bcx|bcy", + "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}" + "cat{str{bc}cc{0x78-0x79}}}" }, + { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" }, + { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" }, + { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" }, + { ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" }, + { "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" }, + { "x{2}|x{2}[0-9]", + "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, + { "x{2}y|x{2}[0-9]y", + "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" }, + { "n|r|rs", + "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" }, + { "n|rs|r", + "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" }, + { "r|rs|n", + "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" }, + { "rs|r|n", + "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" }, + { "a\\C*?c|a\\C*?b", + "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" }, + { "^/a/bc|^/a/de", + "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" }, + // In the past, factoring was limited to kFactorAlternationMaxDepth (8). 
+ { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa", + "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" + "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" + "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" + "lit{a}}}}}}}}}}}}}}}}}}}" }, + { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones", + "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}" + "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}" + "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" }, +}; + +// Test that prefix factoring works. +TEST(TestParse, Prefix) { + TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix"); +} + +Test nested_tests[] = { + { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))", + "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" }, + { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})", + "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" }, + { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})", + "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" }, + { "((((((x{2}){2}){2}){5}){5}){5})", + "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" }, +}; + +// Test that nested repetition works. 
+TEST(TestParse, Nested) { + TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested"); +} + +// Invalid regular expressions +const char* badtests[] = { + "(", + ")", + "(a", + "(a|b|", + "(a|b", + "[a-z", + "([a-z)", + "x{1001}", + "\xff", // Invalid UTF-8 + "[\xff]", + "[\\\xff]", + "\\\xff", + "(?P<name>a", + "(?P<name>", + "(?P<name", + "(?P<x y>a)", + "(?P<>a)", + "[a-Z]", + "(?i)[a-Z]", + "a{100000}", + "a{100000,}", + "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", + "(((x{7}){11}){13})", + "\\Q\\E*", +}; + +// Valid in Perl, bad in POSIX +const char* only_perl[] = { + "[a-b-c]", + "\\Qabc\\E", + "\\Q*+?{[\\E", + "\\Q\\\\E", + "\\Q\\\\\\E", + "\\Q\\\\\\\\E", + "\\Q\\\\\\\\\\E", + "(?:a)", + "(?P<name>a)", +}; + +// Valid in POSIX, bad in Perl. +const char* only_posix[] = { + "a++", + "a**", + "a?*", + "a+*", + "a{1}*", +}; + +// Test that parser rejects bad regexps. +TEST(TestParse, InvalidRegexps) { + for (size_t i = 0; i < arraysize(badtests); i++) { + ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL) + << " " << badtests[i]; + ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL) + << " " << badtests[i]; + } + for (size_t i = 0; i < arraysize(only_posix); i++) { + ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL) + << " " << only_posix[i]; + Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL); + ASSERT_TRUE(re != NULL) << " " << only_posix[i]; + re->Decref(); + } + for (size_t i = 0; i < arraysize(only_perl); i++) { + ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL) + << " " << only_perl[i]; + Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL); + ASSERT_TRUE(re != NULL) << " " << only_perl[i]; + re->Decref(); + } +} + +// Test that ToString produces original regexp or equivalent one. 
+TEST(TestToString, EquivalentParse) { + for (size_t i = 0; i < arraysize(tests); i++) { + RegexpStatus status; + Regexp::ParseFlags f = kTestFlags; + if (tests[i].flags != 0) { + f = tests[i].flags & ~TestZeroFlags; + } + Regexp* re = Regexp::Parse(tests[i].regexp, f, &status); + ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text(); + std::string s = re->Dump(); + EXPECT_EQ(std::string(tests[i].parse), s) + << "Regexp: " << tests[i].regexp + << "\nparse: " << std::string(tests[i].parse) + << " s: " << s << " flag=" << f; + std::string t = re->ToString(); + if (t != tests[i].regexp) { + // If ToString didn't return the original regexp, + // it must have found one with fewer parens. + // Unfortunately we can't check the length here, because + // ToString produces "\\{" for a literal brace, + // but "{" is a shorter equivalent. + // ASSERT_LT(t.size(), strlen(tests[i].regexp)) + // << " t=" << t << " regexp=" << tests[i].regexp; + + // Test that if we parse the new regexp we get the same structure. + Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); + ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text(); + std::string ss = nre->Dump(); + std::string tt = nre->ToString(); + if (s != ss || t != tt) + LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t; + EXPECT_EQ(s, ss); + EXPECT_EQ(t, tt); + nre->Decref(); + } + re->Decref(); + } +} + +// Test that capture error args are correct. 
+TEST(NamedCaptures, ErrorArgs) { + RegexpStatus status; + Regexp* re; + + re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status); + EXPECT_TRUE(re == NULL); + EXPECT_EQ(status.code(), kRegexpBadNamedCapture); + EXPECT_EQ(status.error_arg(), "(?P<name"); + + re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status); + EXPECT_TRUE(re == NULL); + EXPECT_EQ(status.code(), kRegexpBadNamedCapture); + EXPECT_EQ(status.error_arg(), "(?P<space bar>"); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/possible_match_test.cc b/contrib/libs/re2/re2/testing/possible_match_test.cc index 6b06053dde..f337217b92 100644 --- a/contrib/libs/re2/re2/testing/possible_match_test.cc +++ b/contrib/libs/re2/re2/testing/possible_match_test.cc @@ -1,247 +1,247 @@ -// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include <string.h> -#include <string> -#include <vector> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "util/strutil.h" -#include "re2/prog.h" -#include "re2/re2.h" -#include "re2/regexp.h" -#include "re2/testing/exhaustive_tester.h" -#include "re2/testing/regexp_generator.h" -#include "re2/testing/string_generator.h" - -namespace re2 { - -// Test that C++ strings are compared as uint8s, not int8s. -// PossibleMatchRange doesn't depend on this, but callers probably will. 
-TEST(CplusplusStrings, EightBit) { - std::string s = "\x70"; - std::string t = "\xA0"; - EXPECT_LT(s, t); -} - -struct PrefixTest { - const char* regexp; - int maxlen; - const char* min; - const char* max; -}; - -static PrefixTest tests[] = { - { "", 10, "", "", }, - { "Abcdef", 10, "Abcdef", "Abcdef" }, - { "abc(def|ghi)", 10, "abcdef", "abcghi" }, - { "a+hello", 10, "aa", "ahello" }, - { "a*hello", 10, "a", "hello" }, - { "def|abc", 10, "abc", "def" }, - { "a(b)(c)[d]", 10, "abcd", "abcd" }, - { "ab(cab|cat)", 10, "abcab", "abcat" }, - { "ab(cab|ca)x", 10, "abcabx", "abcax" }, - { "(ab|x)(c|de)", 10, "abc", "xde" }, - { "(ab|x)?(c|z)?", 10, "", "z" }, - { "[^\\s\\S]", 10, "", "" }, - { "(abc)+", 5, "abc", "abcac" }, - { "(abc)+", 2, "ab", "ac" }, - { "(abc)+", 1, "a", "b" }, - { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, - { "a*", 10, "", "ab" }, - - { "(?i)Abcdef", 10, "ABCDEF", "abcdef" }, - { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" }, - { "(?i)a+hello", 10, "AA", "ahello" }, - { "(?i)a*hello", 10, "A", "hello" }, - { "(?i)def|abc", 10, "ABC", "def" }, - { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" }, - { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" }, - { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" }, - { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" }, - { "(?i)(ab|x)?(c|z)?", 10, "", "z" }, - { "(?i)[^\\s\\S]", 10, "", "" }, - { "(?i)(abc)+", 5, "ABC", "abcac" }, - { "(?i)(abc)+", 2, "AB", "ac" }, - { "(?i)(abc)+", 1, "A", "b" }, - { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, - { "(?i)a*", 10, "", "ab" }, - { "(?i)A*", 10, "", "ab" }, - - { "\\AAbcdef", 10, "Abcdef", "Abcdef" }, - { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" }, - { "\\Aa+hello", 10, "aa", "ahello" }, - { "\\Aa*hello", 10, "a", "hello" }, - { "\\Adef|abc", 10, "abc", "def" }, - { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" }, - { "\\Aab(cab|cat)", 10, "abcab", "abcat" }, - { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" }, - { "\\A(ab|x)(c|de)", 10, "abc", "xde" }, - { "\\A(ab|x)?(c|z)?", 10, "", "z" }, - { "\\A[^\\s\\S]", 10, "", "" 
}, - { "\\A(abc)+", 5, "abc", "abcac" }, - { "\\A(abc)+", 2, "ab", "ac" }, - { "\\A(abc)+", 1, "a", "b" }, - { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, - { "\\Aa*", 10, "", "ab" }, - - { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" }, - { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" }, - { "(?i)\\Aa+hello", 10, "AA", "ahello" }, - { "(?i)\\Aa*hello", 10, "A", "hello" }, - { "(?i)\\Adef|abc", 10, "ABC", "def" }, - { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" }, - { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" }, - { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" }, - { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" }, - { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" }, - { "(?i)\\A[^\\s\\S]", 10, "", "" }, - { "(?i)\\A(abc)+", 5, "ABC", "abcac" }, - { "(?i)\\A(abc)+", 2, "AB", "ac" }, - { "(?i)\\A(abc)+", 1, "A", "b" }, - { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, - { "(?i)\\Aa*", 10, "", "ab" }, - { "(?i)\\AA*", 10, "", "ab" }, -}; - -TEST(PossibleMatchRange, HandWritten) { - for (size_t i = 0; i < arraysize(tests); i++) { - for (size_t j = 0; j < 2; j++) { - const PrefixTest& t = tests[i]; - std::string min, max; - if (j == 0) { - LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); - Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); - ASSERT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(0); - ASSERT_TRUE(prog != NULL); - ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen)) - << " " << t.regexp; - delete prog; - re->Decref(); - } else { - ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen)); - } - EXPECT_EQ(t.min, min) << t.regexp; - EXPECT_EQ(t.max, max) << t.regexp; - } - } -} - -// Test cases where PossibleMatchRange should return false. -TEST(PossibleMatchRange, Failures) { - std::string min, max; - - // Fails because no room to write max. - EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0)); - - // Fails because there is no max -- any non-empty string matches - // or begins a match. 
Have to use Latin-1 input, because there - // are no valid UTF-8 strings beginning with byte 0xFF. - EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). - PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); - EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). - PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); - EXPECT_FALSE(RE2(".+hello", RE2::Latin1). - PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); - EXPECT_FALSE(RE2(".*hello", RE2::Latin1). - PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); - EXPECT_FALSE(RE2(".*", RE2::Latin1). - PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); - EXPECT_FALSE(RE2("\\C*"). - PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); - - // Fails because it's a malformed regexp. - EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); -} - -// Exhaustive test: generate all regexps within parameters, -// then generate all strings of a given length over a given alphabet, -// then check that the prefix information agrees with whether -// the regexp matches each of the strings. -class PossibleMatchTester : public RegexpGenerator { - public: - PossibleMatchTester(int maxatoms, - int maxops, - const std::vector<std::string>& alphabet, - const std::vector<std::string>& ops, - int maxstrlen, - const std::vector<std::string>& stralphabet) - : RegexpGenerator(maxatoms, maxops, alphabet, ops), - strgen_(maxstrlen, stralphabet), - regexps_(0), tests_(0) { } - - int regexps() { return regexps_; } - int tests() { return tests_; } - - // Needed for RegexpGenerator interface. 
- void HandleRegexp(const std::string& regexp); - - private: - StringGenerator strgen_; - - int regexps_; // Number of HandleRegexp calls - int tests_; // Number of regexp tests. - - PossibleMatchTester(const PossibleMatchTester&) = delete; - PossibleMatchTester& operator=(const PossibleMatchTester&) = delete; -}; - -// Processes a single generated regexp. -// Checks that all accepted strings agree with the prefix range. -void PossibleMatchTester::HandleRegexp(const std::string& regexp) { - regexps_++; - - VLOG(3) << CEscape(regexp); - - RE2 re(regexp, RE2::Latin1); - ASSERT_EQ(re.error(), ""); - - std::string min, max; - if(!re.PossibleMatchRange(&min, &max, 10)) { - // There's no good max for "\\C*". Can't use strcmp - // because sometimes it gets embedded in more - // complicated expressions. - if(strstr(regexp.c_str(), "\\C*")) - return; - LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); - } - - strgen_.Reset(); - while (strgen_.HasNext()) { - const StringPiece& s = strgen_.Next(); - tests_++; - if (!RE2::FullMatch(s, re)) - continue; - ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max; - ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min; - } -} - -TEST(PossibleMatchRange, Exhaustive) { - int natom = 3; - int noperator = 3; - int stringlen = 5; - if (RE2_DEBUG_MODE) { - natom = 2; - noperator = 3; - stringlen = 3; - } - PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"), - RegexpGenerator::EgrepOps(), - stringlen, Explode("ab4")); - t.Generate(); - LOG(INFO) << t.regexps() << " regexps, " - << t.tests() << " tests"; -} - -} // namespace re2 +// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include <string.h> +#include <string> +#include <vector> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/testing/exhaustive_tester.h" +#include "re2/testing/regexp_generator.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +// Test that C++ strings are compared as uint8s, not int8s. +// PossibleMatchRange doesn't depend on this, but callers probably will. +TEST(CplusplusStrings, EightBit) { + std::string s = "\x70"; + std::string t = "\xA0"; + EXPECT_LT(s, t); +} + +struct PrefixTest { + const char* regexp; + int maxlen; + const char* min; + const char* max; +}; + +static PrefixTest tests[] = { + { "", 10, "", "", }, + { "Abcdef", 10, "Abcdef", "Abcdef" }, + { "abc(def|ghi)", 10, "abcdef", "abcghi" }, + { "a+hello", 10, "aa", "ahello" }, + { "a*hello", 10, "a", "hello" }, + { "def|abc", 10, "abc", "def" }, + { "a(b)(c)[d]", 10, "abcd", "abcd" }, + { "ab(cab|cat)", 10, "abcab", "abcat" }, + { "ab(cab|ca)x", 10, "abcabx", "abcax" }, + { "(ab|x)(c|de)", 10, "abc", "xde" }, + { "(ab|x)?(c|z)?", 10, "", "z" }, + { "[^\\s\\S]", 10, "", "" }, + { "(abc)+", 5, "abc", "abcac" }, + { "(abc)+", 2, "ab", "ac" }, + { "(abc)+", 1, "a", "b" }, + { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, + { "a*", 10, "", "ab" }, + + { "(?i)Abcdef", 10, "ABCDEF", "abcdef" }, + { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" }, + { "(?i)a+hello", 10, "AA", "ahello" }, + { "(?i)a*hello", 10, "A", "hello" }, + { "(?i)def|abc", 10, "ABC", "def" }, + { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" }, + { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" }, + { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" }, + { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" }, + { "(?i)(ab|x)?(c|z)?", 10, "", "z" }, + { "(?i)[^\\s\\S]", 10, "", "" }, + { "(?i)(abc)+", 5, "ABC", "abcac" }, + { "(?i)(abc)+", 2, "AB", "ac" }, + { "(?i)(abc)+", 1, "A", "b" }, + { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, 
+ { "(?i)a*", 10, "", "ab" }, + { "(?i)A*", 10, "", "ab" }, + + { "\\AAbcdef", 10, "Abcdef", "Abcdef" }, + { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" }, + { "\\Aa+hello", 10, "aa", "ahello" }, + { "\\Aa*hello", 10, "a", "hello" }, + { "\\Adef|abc", 10, "abc", "def" }, + { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" }, + { "\\Aab(cab|cat)", 10, "abcab", "abcat" }, + { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" }, + { "\\A(ab|x)(c|de)", 10, "abc", "xde" }, + { "\\A(ab|x)?(c|z)?", 10, "", "z" }, + { "\\A[^\\s\\S]", 10, "", "" }, + { "\\A(abc)+", 5, "abc", "abcac" }, + { "\\A(abc)+", 2, "ab", "ac" }, + { "\\A(abc)+", 1, "a", "b" }, + { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, + { "\\Aa*", 10, "", "ab" }, + + { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" }, + { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" }, + { "(?i)\\Aa+hello", 10, "AA", "ahello" }, + { "(?i)\\Aa*hello", 10, "A", "hello" }, + { "(?i)\\Adef|abc", 10, "ABC", "def" }, + { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" }, + { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" }, + { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" }, + { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" }, + { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" }, + { "(?i)\\A[^\\s\\S]", 10, "", "" }, + { "(?i)\\A(abc)+", 5, "ABC", "abcac" }, + { "(?i)\\A(abc)+", 2, "AB", "ac" }, + { "(?i)\\A(abc)+", 1, "A", "b" }, + { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, + { "(?i)\\Aa*", 10, "", "ab" }, + { "(?i)\\AA*", 10, "", "ab" }, +}; + +TEST(PossibleMatchRange, HandWritten) { + for (size_t i = 0; i < arraysize(tests); i++) { + for (size_t j = 0; j < 2; j++) { + const PrefixTest& t = tests[i]; + std::string min, max; + if (j == 0) { + LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); + Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + ASSERT_TRUE(prog != NULL); + ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen)) + << " " << t.regexp; + delete prog; + re->Decref(); + } else { + 
ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen)); + } + EXPECT_EQ(t.min, min) << t.regexp; + EXPECT_EQ(t.max, max) << t.regexp; + } + } +} + +// Test cases where PossibleMatchRange should return false. +TEST(PossibleMatchRange, Failures) { + std::string min, max; + + // Fails because no room to write max. + EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0)); + + // Fails because there is no max -- any non-empty string matches + // or begins a match. Have to use Latin-1 input, because there + // are no valid UTF-8 strings beginning with byte 0xFF. + EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".+hello", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".*hello", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".*", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2("\\C*"). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + + // Fails because it's a malformed regexp. + EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); +} + +// Exhaustive test: generate all regexps within parameters, +// then generate all strings of a given length over a given alphabet, +// then check that the prefix information agrees with whether +// the regexp matches each of the strings. 
+class PossibleMatchTester : public RegexpGenerator { + public: + PossibleMatchTester(int maxatoms, + int maxops, + const std::vector<std::string>& alphabet, + const std::vector<std::string>& ops, + int maxstrlen, + const std::vector<std::string>& stralphabet) + : RegexpGenerator(maxatoms, maxops, alphabet, ops), + strgen_(maxstrlen, stralphabet), + regexps_(0), tests_(0) { } + + int regexps() { return regexps_; } + int tests() { return tests_; } + + // Needed for RegexpGenerator interface. + void HandleRegexp(const std::string& regexp); + + private: + StringGenerator strgen_; + + int regexps_; // Number of HandleRegexp calls + int tests_; // Number of regexp tests. + + PossibleMatchTester(const PossibleMatchTester&) = delete; + PossibleMatchTester& operator=(const PossibleMatchTester&) = delete; +}; + +// Processes a single generated regexp. +// Checks that all accepted strings agree with the prefix range. +void PossibleMatchTester::HandleRegexp(const std::string& regexp) { + regexps_++; + + VLOG(3) << CEscape(regexp); + + RE2 re(regexp, RE2::Latin1); + ASSERT_EQ(re.error(), ""); + + std::string min, max; + if(!re.PossibleMatchRange(&min, &max, 10)) { + // There's no good max for "\\C*". Can't use strcmp + // because sometimes it gets embedded in more + // complicated expressions. 
+ if(strstr(regexp.c_str(), "\\C*")) + return; + LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); + } + + strgen_.Reset(); + while (strgen_.HasNext()) { + const StringPiece& s = strgen_.Next(); + tests_++; + if (!RE2::FullMatch(s, re)) + continue; + ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max; + ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min; + } +} + +TEST(PossibleMatchRange, Exhaustive) { + int natom = 3; + int noperator = 3; + int stringlen = 5; + if (RE2_DEBUG_MODE) { + natom = 2; + noperator = 3; + stringlen = 3; + } + PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"), + RegexpGenerator::EgrepOps(), + stringlen, Explode("ab4")); + t.Generate(); + LOG(INFO) << t.regexps() << " regexps, " + << t.tests() << " tests"; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/re2_arg_test.cc b/contrib/libs/re2/re2/testing/re2_arg_test.cc index e576491540..8df90ab8f2 100644 --- a/contrib/libs/re2/re2/testing/re2_arg_test.cc +++ b/contrib/libs/re2/re2/testing/re2_arg_test.cc @@ -1,160 +1,160 @@ -// Copyright 2005 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This tests to make sure numbers are parsed from strings -// correctly. -// Todo: Expand the test to validate strings parsed to the other types -// supported by RE2::Arg class - -#include <stdint.h> -#include <string.h> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/re2.h" - -namespace re2 { - -struct SuccessTable { - const char * value_string; - int64_t value; - bool success[6]; -}; - -// Test boundary cases for different integral sizes. -// Specifically I want to make sure that values outside the boundries -// of an integral type will fail and that negative numbers will fail -// for unsigned types. 
The following table contains the boundaries for -// the various integral types and has entries for whether or not each -// type can contain the given value. -const SuccessTable kSuccessTable[] = { -// string integer value i16 u16 i32 u32 i64 u64 -// 0 to 2^7-1 -{ "0", 0, { true, true, true, true, true, true }}, -{ "127", 127, { true, true, true, true, true, true }}, - -// -1 to -2^7 -{ "-1", -1, { true, false, true, false, true, false }}, -{ "-128", -128, { true, false, true, false, true, false }}, - -// 2^7 to 2^8-1 -{ "128", 128, { true, true, true, true, true, true }}, -{ "255", 255, { true, true, true, true, true, true }}, - -// 2^8 to 2^15-1 -{ "256", 256, { true, true, true, true, true, true }}, -{ "32767", 32767, { true, true, true, true, true, true }}, - -// -2^7-1 to -2^15 -{ "-129", -129, { true, false, true, false, true, false }}, -{ "-32768", -32768, { true, false, true, false, true, false }}, - -// 2^15 to 2^16-1 -{ "32768", 32768, { false, true, true, true, true, true }}, -{ "65535", 65535, { false, true, true, true, true, true }}, - -// 2^16 to 2^31-1 -{ "65536", 65536, { false, false, true, true, true, true }}, -{ "2147483647", 2147483647, { false, false, true, true, true, true }}, - -// -2^15-1 to -2^31 -{ "-32769", -32769, { false, false, true, false, true, false }}, -{ "-2147483648", static_cast<int64_t>(0xFFFFFFFF80000000LL), - { false, false, true, false, true, false }}, - -// 2^31 to 2^32-1 -{ "2147483648", 2147483648U, { false, false, false, true, true, true }}, -{ "4294967295", 4294967295U, { false, false, false, true, true, true }}, - -// 2^32 to 2^63-1 -{ "4294967296", 4294967296LL, { false, false, false, false, true, true }}, -{ "9223372036854775807", - 9223372036854775807LL, { false, false, false, false, true, true }}, - -// -2^31-1 to -2^63 -{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }}, -{ "-9223372036854775808", static_cast<int64_t>(0x8000000000000000LL), - { false, false, false, false, true, false }}, - 
-// 2^63 to 2^64-1 -{ "9223372036854775808", static_cast<int64_t>(9223372036854775808ULL), - { false, false, false, false, false, true }}, -{ "18446744073709551615", static_cast<int64_t>(18446744073709551615ULL), - { false, false, false, false, false, true }}, - -// >= 2^64 -{ "18446744073709551616", 0, { false, false, false, false, false, false }}, -}; - -const int kNumStrings = arraysize(kSuccessTable); - -// It's ugly to use a macro, but we apparently can't use the EXPECT_EQ -// macro outside of a TEST block and this seems to be the only way to -// avoid code duplication. I can also pull off a couple nice tricks -// using concatenation for the type I'm checking against. -#define PARSE_FOR_TYPE(type, column) { \ - type r; \ - for (int i = 0; i < kNumStrings; ++i) { \ - RE2::Arg arg(&r); \ - const char* const p = kSuccessTable[i].value_string; \ - bool retval = arg.Parse(p, strlen(p)); \ - bool success = kSuccessTable[i].success[column]; \ - EXPECT_EQ(retval, success) \ - << "Parsing '" << p << "' for type " #type " should return " \ - << success; \ - if (success) { \ - EXPECT_EQ(r, (type)kSuccessTable[i].value); \ - } \ - } \ -} - -TEST(RE2ArgTest, Int16Test) { - PARSE_FOR_TYPE(int16_t, 0); -} - -TEST(RE2ArgTest, Uint16Test) { - PARSE_FOR_TYPE(uint16_t, 1); -} - -TEST(RE2ArgTest, Int32Test) { - PARSE_FOR_TYPE(int32_t, 2); -} - -TEST(RE2ArgTest, Uint32Test) { - PARSE_FOR_TYPE(uint32_t, 3); -} - -TEST(RE2ArgTest, Int64Test) { - PARSE_FOR_TYPE(int64_t, 4); -} - -TEST(RE2ArgTest, Uint64Test) { - PARSE_FOR_TYPE(uint64_t, 5); -} - -TEST(RE2ArgTest, ParseFromTest) { -#if !defined(_MSC_VER) - struct { - bool ParseFrom(const char* str, size_t n) { - LOG(INFO) << "str = " << str << ", n = " << n; - return true; - } - } obj1; - RE2::Arg arg1(&obj1); - EXPECT_TRUE(arg1.Parse("one", 3)); - - struct { - bool ParseFrom(const char* str, size_t n) { - LOG(INFO) << "str = " << str << ", n = " << n; - return false; - } - // Ensure that RE2::Arg works even with overloaded 
ParseFrom(). - void ParseFrom(const char* str) {} - } obj2; - RE2::Arg arg2(&obj2); - EXPECT_FALSE(arg2.Parse("two", 3)); -#endif -} - -} // namespace re2 +// Copyright 2005 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This tests to make sure numbers are parsed from strings +// correctly. +// Todo: Expand the test to validate strings parsed to the other types +// supported by RE2::Arg class + +#include <stdint.h> +#include <string.h> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/re2.h" + +namespace re2 { + +struct SuccessTable { + const char * value_string; + int64_t value; + bool success[6]; +}; + +// Test boundary cases for different integral sizes. +// Specifically I want to make sure that values outside the boundries +// of an integral type will fail and that negative numbers will fail +// for unsigned types. The following table contains the boundaries for +// the various integral types and has entries for whether or not each +// type can contain the given value. 
+const SuccessTable kSuccessTable[] = { +// string integer value i16 u16 i32 u32 i64 u64 +// 0 to 2^7-1 +{ "0", 0, { true, true, true, true, true, true }}, +{ "127", 127, { true, true, true, true, true, true }}, + +// -1 to -2^7 +{ "-1", -1, { true, false, true, false, true, false }}, +{ "-128", -128, { true, false, true, false, true, false }}, + +// 2^7 to 2^8-1 +{ "128", 128, { true, true, true, true, true, true }}, +{ "255", 255, { true, true, true, true, true, true }}, + +// 2^8 to 2^15-1 +{ "256", 256, { true, true, true, true, true, true }}, +{ "32767", 32767, { true, true, true, true, true, true }}, + +// -2^7-1 to -2^15 +{ "-129", -129, { true, false, true, false, true, false }}, +{ "-32768", -32768, { true, false, true, false, true, false }}, + +// 2^15 to 2^16-1 +{ "32768", 32768, { false, true, true, true, true, true }}, +{ "65535", 65535, { false, true, true, true, true, true }}, + +// 2^16 to 2^31-1 +{ "65536", 65536, { false, false, true, true, true, true }}, +{ "2147483647", 2147483647, { false, false, true, true, true, true }}, + +// -2^15-1 to -2^31 +{ "-32769", -32769, { false, false, true, false, true, false }}, +{ "-2147483648", static_cast<int64_t>(0xFFFFFFFF80000000LL), + { false, false, true, false, true, false }}, + +// 2^31 to 2^32-1 +{ "2147483648", 2147483648U, { false, false, false, true, true, true }}, +{ "4294967295", 4294967295U, { false, false, false, true, true, true }}, + +// 2^32 to 2^63-1 +{ "4294967296", 4294967296LL, { false, false, false, false, true, true }}, +{ "9223372036854775807", + 9223372036854775807LL, { false, false, false, false, true, true }}, + +// -2^31-1 to -2^63 +{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }}, +{ "-9223372036854775808", static_cast<int64_t>(0x8000000000000000LL), + { false, false, false, false, true, false }}, + +// 2^63 to 2^64-1 +{ "9223372036854775808", static_cast<int64_t>(9223372036854775808ULL), + { false, false, false, false, false, true }}, +{ 
"18446744073709551615", static_cast<int64_t>(18446744073709551615ULL), + { false, false, false, false, false, true }}, + +// >= 2^64 +{ "18446744073709551616", 0, { false, false, false, false, false, false }}, +}; + +const int kNumStrings = arraysize(kSuccessTable); + +// It's ugly to use a macro, but we apparently can't use the EXPECT_EQ +// macro outside of a TEST block and this seems to be the only way to +// avoid code duplication. I can also pull off a couple nice tricks +// using concatenation for the type I'm checking against. +#define PARSE_FOR_TYPE(type, column) { \ + type r; \ + for (int i = 0; i < kNumStrings; ++i) { \ + RE2::Arg arg(&r); \ + const char* const p = kSuccessTable[i].value_string; \ + bool retval = arg.Parse(p, strlen(p)); \ + bool success = kSuccessTable[i].success[column]; \ + EXPECT_EQ(retval, success) \ + << "Parsing '" << p << "' for type " #type " should return " \ + << success; \ + if (success) { \ + EXPECT_EQ(r, (type)kSuccessTable[i].value); \ + } \ + } \ +} + +TEST(RE2ArgTest, Int16Test) { + PARSE_FOR_TYPE(int16_t, 0); +} + +TEST(RE2ArgTest, Uint16Test) { + PARSE_FOR_TYPE(uint16_t, 1); +} + +TEST(RE2ArgTest, Int32Test) { + PARSE_FOR_TYPE(int32_t, 2); +} + +TEST(RE2ArgTest, Uint32Test) { + PARSE_FOR_TYPE(uint32_t, 3); +} + +TEST(RE2ArgTest, Int64Test) { + PARSE_FOR_TYPE(int64_t, 4); +} + +TEST(RE2ArgTest, Uint64Test) { + PARSE_FOR_TYPE(uint64_t, 5); +} + +TEST(RE2ArgTest, ParseFromTest) { +#if !defined(_MSC_VER) + struct { + bool ParseFrom(const char* str, size_t n) { + LOG(INFO) << "str = " << str << ", n = " << n; + return true; + } + } obj1; + RE2::Arg arg1(&obj1); + EXPECT_TRUE(arg1.Parse("one", 3)); + + struct { + bool ParseFrom(const char* str, size_t n) { + LOG(INFO) << "str = " << str << ", n = " << n; + return false; + } + // Ensure that RE2::Arg works even with overloaded ParseFrom(). 
+ void ParseFrom(const char* str) {} + } obj2; + RE2::Arg arg2(&obj2); + EXPECT_FALSE(arg2.Parse("two", 3)); +#endif +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/re2_test.cc b/contrib/libs/re2/re2/testing/re2_test.cc index ae3b3c3863..9ffe1467d8 100644 --- a/contrib/libs/re2/re2/testing/re2_test.cc +++ b/contrib/libs/re2/re2/testing/re2_test.cc @@ -1,1659 +1,1659 @@ -// -*- coding: utf-8 -*- -// Copyright 2002-2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// TODO: Test extractions for PartialMatch/Consume - -#include <errno.h> -#include <stddef.h> -#include <stdint.h> -#include <string.h> -#include <map> -#include <string> -#include <utility> -#include <vector> -#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) -#include <sys/mman.h> -#include <unistd.h> /* for sysconf */ -#endif - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "util/strutil.h" -#include "re2/re2.h" -#include "re2/regexp.h" - -namespace re2 { - -TEST(RE2, HexTests) { -#define ASSERT_HEX(type, value) \ - do { \ - type v; \ - ASSERT_TRUE( \ - RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \ - ASSERT_EQ(v, 0x##value); \ - ASSERT_TRUE(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \ - RE2::CRadix(&v))); \ - ASSERT_EQ(v, 0x##value); \ - } while (0) - - ASSERT_HEX(short, 2bad); - ASSERT_HEX(unsigned short, 2badU); - ASSERT_HEX(int, dead); - ASSERT_HEX(unsigned int, deadU); - ASSERT_HEX(long, 7eadbeefL); - ASSERT_HEX(unsigned long, deadbeefUL); - ASSERT_HEX(long long, 12345678deadbeefLL); - ASSERT_HEX(unsigned long long, cafebabedeadbeefULL); - -#undef ASSERT_HEX -} - -TEST(RE2, OctalTests) { -#define ASSERT_OCTAL(type, value) \ - do { \ - type v; \ - ASSERT_TRUE(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \ - ASSERT_EQ(v, 0##value); \ - ASSERT_TRUE(RE2::FullMatch("0" #value, 
"([0-9a-fA-FxX]+)[uUlL]*", \ - RE2::CRadix(&v))); \ - ASSERT_EQ(v, 0##value); \ - } while (0) - - ASSERT_OCTAL(short, 77777); - ASSERT_OCTAL(unsigned short, 177777U); - ASSERT_OCTAL(int, 17777777777); - ASSERT_OCTAL(unsigned int, 37777777777U); - ASSERT_OCTAL(long, 17777777777L); - ASSERT_OCTAL(unsigned long, 37777777777UL); - ASSERT_OCTAL(long long, 777777777777777777777LL); - ASSERT_OCTAL(unsigned long long, 1777777777777777777777ULL); - -#undef ASSERT_OCTAL -} - -TEST(RE2, DecimalTests) { -#define ASSERT_DECIMAL(type, value) \ - do { \ - type v; \ - ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \ - ASSERT_EQ(v, value); \ - ASSERT_TRUE( \ - RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ - ASSERT_EQ(v, value); \ - } while (0) - - ASSERT_DECIMAL(short, -1); - ASSERT_DECIMAL(unsigned short, 9999); - ASSERT_DECIMAL(int, -1000); - ASSERT_DECIMAL(unsigned int, 12345U); - ASSERT_DECIMAL(long, -10000000L); - ASSERT_DECIMAL(unsigned long, 3083324652U); - ASSERT_DECIMAL(long long, -100000000000000LL); - ASSERT_DECIMAL(unsigned long long, 1234567890987654321ULL); - -#undef ASSERT_DECIMAL -} - -TEST(RE2, Replace) { - struct ReplaceTest { - const char *regexp; - const char *rewrite; - const char *original; - const char *single; - const char *global; - int greplace_count; - }; - static const ReplaceTest tests[] = { - { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", - "\\2\\1ay", - "the quick brown fox jumps over the lazy dogs.", - "ethay quick brown fox jumps over the lazy dogs.", - "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", - 9 }, - { "\\w+", - "\\0-NOSPAM", - "abcd.efghi@google.com", - "abcd-NOSPAM.efghi@google.com", - "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", - 4 }, - { "^", - "(START)", - "foo", - "(START)foo", - "(START)foo", - 1 }, - { "^", - "(START)", - "", - "(START)", - "(START)", - 1 }, - { "$", - "(END)", - "", - "(END)", - "(END)", - 1 }, - { "b", - "bb", - "ababababab", - "abbabababab", - 
"abbabbabbabbabb", - 5 }, - { "b", - "bb", - "bbbbbb", - "bbbbbbb", - "bbbbbbbbbbbb", - 6 }, - { "b+", - "bb", - "bbbbbb", - "bb", - "bb", - 1 }, - { "b*", - "bb", - "bbbbbb", - "bb", - "bb", - 1 }, - { "b*", - "bb", - "aaaaa", - "bbaaaaa", - "bbabbabbabbabbabb", - 6 }, - // Check newline handling - { "a.*a", - "(\\0)", - "aba\naba", - "(aba)\naba", - "(aba)\n(aba)", - 2 }, - { "", NULL, NULL, NULL, NULL, 0 } - }; - - for (const ReplaceTest* t = tests; t->original != NULL; t++) { - std::string one(t->original); - ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite)); - ASSERT_EQ(one, t->single); - std::string all(t->original); - ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count) - << "Got: " << all; - ASSERT_EQ(all, t->global); - } -} - -static void TestCheckRewriteString(const char* regexp, const char* rewrite, - bool expect_ok) { - std::string error; - RE2 exp(regexp); - bool actual_ok = exp.CheckRewriteString(rewrite, &error); - EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error; -} - -TEST(CheckRewriteString, all) { - TestCheckRewriteString("abc", "foo", true); - TestCheckRewriteString("abc", "foo\\", false); - TestCheckRewriteString("abc", "foo\\0bar", true); - - TestCheckRewriteString("a(b)c", "foo", true); - TestCheckRewriteString("a(b)c", "foo\\0bar", true); - TestCheckRewriteString("a(b)c", "foo\\1bar", true); - TestCheckRewriteString("a(b)c", "foo\\2bar", false); - TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true); - - TestCheckRewriteString("a(b)(c)", "foo\\12", true); - TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true); - TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false); -} - -TEST(RE2, Extract) { - std::string s; - - ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s)); - ASSERT_EQ(s, "kremvax!boris"); - - ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s)); - ASSERT_EQ(s, "'foo'"); - // check that false match doesn't overwrite - ASSERT_FALSE(RE2::Extract("baz", 
"bar", "'\\0'", &s)); - ASSERT_EQ(s, "'foo'"); -} - -TEST(RE2, MaxSubmatchTooLarge) { - std::string s; - ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); - s = "foo"; - ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); - s = "foo"; - ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); -} - -TEST(RE2, Consume) { - RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace - std::string word; - - std::string s(" aaa b!@#$@#$cccc"); - StringPiece input(s); - - ASSERT_TRUE(RE2::Consume(&input, r, &word)); - ASSERT_EQ(word, "aaa") << " input: " << input; - ASSERT_TRUE(RE2::Consume(&input, r, &word)); - ASSERT_EQ(word, "b") << " input: " << input; - ASSERT_FALSE(RE2::Consume(&input, r, &word)) << " input: " << input; -} - -TEST(RE2, ConsumeN) { - const std::string s(" one two three 4"); - StringPiece input(s); - - RE2::Arg argv[2]; - const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; - - // 0 arg - EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0)); // Skips "one". - - // 1 arg - std::string word; - argv[0] = &word; - EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1)); - EXPECT_EQ("two", word); - - // Multi-args - int n; - argv[1] = &n; - EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2)); - EXPECT_EQ("three", word); - EXPECT_EQ(4, n); -} - -TEST(RE2, FindAndConsume) { - RE2 r("(\\w+)"); // matches a word - std::string word; - - std::string s(" aaa b!@#$@#$cccc"); - StringPiece input(s); - - ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); - ASSERT_EQ(word, "aaa"); - ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); - ASSERT_EQ(word, "b"); - ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); - ASSERT_EQ(word, "cccc"); - ASSERT_FALSE(RE2::FindAndConsume(&input, r, &word)); - - // Check that FindAndConsume works without any submatches. - // Earlier version used uninitialized data for - // length to consume. 
- input = "aaa"; - ASSERT_TRUE(RE2::FindAndConsume(&input, "aaa")); - ASSERT_EQ(input, ""); -} - -TEST(RE2, FindAndConsumeN) { - const std::string s(" one two three 4"); - StringPiece input(s); - - RE2::Arg argv[2]; - const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; - - // 0 arg - EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0)); // Skips "one". - - // 1 arg - std::string word; - argv[0] = &word; - EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1)); - EXPECT_EQ("two", word); - - // Multi-args - int n; - argv[1] = &n; - EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2)); - EXPECT_EQ("three", word); - EXPECT_EQ(4, n); -} - -TEST(RE2, MatchNumberPeculiarity) { - RE2 r("(foo)|(bar)|(baz)"); - std::string word1; - std::string word2; - std::string word3; - - ASSERT_TRUE(RE2::PartialMatch("foo", r, &word1, &word2, &word3)); - ASSERT_EQ(word1, "foo"); - ASSERT_EQ(word2, ""); - ASSERT_EQ(word3, ""); - ASSERT_TRUE(RE2::PartialMatch("bar", r, &word1, &word2, &word3)); - ASSERT_EQ(word1, ""); - ASSERT_EQ(word2, "bar"); - ASSERT_EQ(word3, ""); - ASSERT_TRUE(RE2::PartialMatch("baz", r, &word1, &word2, &word3)); - ASSERT_EQ(word1, ""); - ASSERT_EQ(word2, ""); - ASSERT_EQ(word3, "baz"); - ASSERT_FALSE(RE2::PartialMatch("f", r, &word1, &word2, &word3)); - - std::string a; - ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &a)); - ASSERT_EQ(a, ""); -} - -TEST(RE2, Match) { - RE2 re("((\\w+):([0-9]+))"); // extracts host and port - StringPiece group[4]; - - // No match. - StringPiece s = "zyzzyva"; - ASSERT_FALSE( - re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); - - // Matches and extracts. 
- s = "a chrisr:9000 here"; - ASSERT_TRUE( - re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); - ASSERT_EQ(group[0], "chrisr:9000"); - ASSERT_EQ(group[1], "chrisr:9000"); - ASSERT_EQ(group[2], "chrisr"); - ASSERT_EQ(group[3], "9000"); - - std::string all, host; - int port; - ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port)); - ASSERT_EQ(all, "chrisr:9000"); - ASSERT_EQ(host, "chrisr"); - ASSERT_EQ(port, 9000); -} - -static void TestRecursion(int size, const char* pattern) { - // Fill up a string repeating the pattern given - std::string domain; - domain.resize(size); - size_t patlen = strlen(pattern); - for (int i = 0; i < size; i++) { - domain[i] = pattern[i % patlen]; - } - // Just make sure it doesn't crash due to too much recursion. - RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet); - RE2::FullMatch(domain, re); -} - -// A meta-quoted string, interpreted as a pattern, should always match -// the original unquoted string. -static void TestQuoteMeta(const std::string& unquoted, - const RE2::Options& options = RE2::DefaultOptions) { - std::string quoted = RE2::QuoteMeta(unquoted); - RE2 re(quoted, options); - EXPECT_TRUE(RE2::FullMatch(unquoted, re)) - << "Unquoted='" << unquoted << "', quoted='" << quoted << "'."; -} - -// A meta-quoted string, interpreted as a pattern, should always match -// the original unquoted string. -static void NegativeTestQuoteMeta( - const std::string& unquoted, const std::string& should_not_match, - const RE2::Options& options = RE2::DefaultOptions) { - std::string quoted = RE2::QuoteMeta(unquoted); - RE2 re(quoted, options); - EXPECT_FALSE(RE2::FullMatch(should_not_match, re)) - << "Unquoted='" << unquoted << "', quoted='" << quoted << "'."; -} - -// Tests that quoted meta characters match their original strings, -// and that a few things that shouldn't match indeed do not. 
-TEST(QuoteMeta, Simple) { - TestQuoteMeta("foo"); - TestQuoteMeta("foo.bar"); - TestQuoteMeta("foo\\.bar"); - TestQuoteMeta("[1-9]"); - TestQuoteMeta("1.5-2.0?"); - TestQuoteMeta("\\d"); - TestQuoteMeta("Who doesn't like ice cream?"); - TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); - TestQuoteMeta("((?!)xxx).*yyy"); - TestQuoteMeta("(["); -} -TEST(QuoteMeta, SimpleNegative) { - NegativeTestQuoteMeta("foo", "bar"); - NegativeTestQuoteMeta("...", "bar"); - NegativeTestQuoteMeta("\\.", "."); - NegativeTestQuoteMeta("\\.", ".."); - NegativeTestQuoteMeta("(a)", "a"); - NegativeTestQuoteMeta("(a|b)", "a"); - NegativeTestQuoteMeta("(a|b)", "(a)"); - NegativeTestQuoteMeta("(a|b)", "a|b"); - NegativeTestQuoteMeta("[0-9]", "0"); - NegativeTestQuoteMeta("[0-9]", "0-9"); - NegativeTestQuoteMeta("[0-9]", "[9]"); - NegativeTestQuoteMeta("((?!)xxx)", "xxx"); -} - -TEST(QuoteMeta, Latin1) { - TestQuoteMeta("3\xb2 = 9", RE2::Latin1); -} - -TEST(QuoteMeta, UTF8) { - TestQuoteMeta("Plácido Domingo"); - TestQuoteMeta("xyz"); // No fancy utf8. - TestQuoteMeta("\xc2\xb0"); // 2-byte utf8 -- a degree symbol. - TestQuoteMeta("27\xc2\xb0 degrees"); // As a middle character. - TestQuoteMeta("\xe2\x80\xb3"); // 3-byte utf8 -- a double prime. - TestQuoteMeta("\xf0\x9d\x85\x9f"); // 4-byte utf8 -- a music note. - TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, this should - // still work. - NegativeTestQuoteMeta("27\xc2\xb0", - "27\\\xc2\\\xb0"); // 2-byte utf8 -- a degree symbol. -} - -TEST(QuoteMeta, HasNull) { - std::string has_null; - - // string with one null character - has_null += '\0'; - TestQuoteMeta(has_null); - NegativeTestQuoteMeta(has_null, ""); - - // Don't want null-followed-by-'1' to be interpreted as '\01'. 
- has_null += '1'; - TestQuoteMeta(has_null); - NegativeTestQuoteMeta(has_null, "\1"); -} - -TEST(ProgramSize, BigProgram) { - RE2 re_simple("simple regexp"); - RE2 re_medium("medium.*regexp"); - RE2 re_complex("complex.{1,128}regexp"); - - ASSERT_GT(re_simple.ProgramSize(), 0); - ASSERT_GT(re_medium.ProgramSize(), re_simple.ProgramSize()); - ASSERT_GT(re_complex.ProgramSize(), re_medium.ProgramSize()); - - ASSERT_GT(re_simple.ReverseProgramSize(), 0); - ASSERT_GT(re_medium.ReverseProgramSize(), re_simple.ReverseProgramSize()); - ASSERT_GT(re_complex.ReverseProgramSize(), re_medium.ReverseProgramSize()); -} - -TEST(ProgramFanout, BigProgram) { - RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)"); - RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)"); - RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)"); - RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)"); - - std::vector<int> histogram; - - // 3 is the largest non-empty bucket and has 2 element. - ASSERT_EQ(3, re1.ProgramFanout(&histogram)); - ASSERT_EQ(2, histogram[3]); - - // 6 is the largest non-empty bucket and has 11 elements. - ASSERT_EQ(6, re10.ProgramFanout(&histogram)); - ASSERT_EQ(11, histogram[6]); - - // 9 is the largest non-empty bucket and has 101 elements. - ASSERT_EQ(9, re100.ProgramFanout(&histogram)); - ASSERT_EQ(101, histogram[9]); - - // 13 is the largest non-empty bucket and has 1001 elements. - ASSERT_EQ(13, re1000.ProgramFanout(&histogram)); - ASSERT_EQ(1001, histogram[13]); - - // 2 is the largest non-empty bucket and has 2 element. - ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram)); - ASSERT_EQ(2, histogram[2]); - - // 5 is the largest non-empty bucket and has 11 elements. - ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram)); - ASSERT_EQ(11, histogram[5]); - - // 9 is the largest non-empty bucket and has 101 elements. - ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram)); - ASSERT_EQ(101, histogram[9]); - - // 12 is the largest non-empty bucket and has 1001 elements. 
- ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram)); - ASSERT_EQ(1001, histogram[12]); -} - -// Issue 956519: handling empty character sets was -// causing NULL dereference. This tests a few empty character sets. -// (The way to get an empty character set is to negate a full one.) -TEST(EmptyCharset, Fuzz) { - static const char *empties[] = { - "[^\\S\\s]", - "[^\\S[:space:]]", - "[^\\D\\d]", - "[^\\D[:digit:]]" - }; - for (size_t i = 0; i < arraysize(empties); i++) - ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0)); -} - -// Bitstate assumes that kInstFail instructions in -// alternations or capture groups have been "compiled away". -TEST(EmptyCharset, BitstateAssumptions) { - // Captures trigger use of Bitstate. - static const char *nop_empties[] = { - "((((()))))" "[^\\S\\s]?", - "((((()))))" "([^\\S\\s])?", - "((((()))))" "([^\\S\\s]|[^\\S\\s])?", - "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)" - }; - StringPiece group[6]; - for (size_t i = 0; i < arraysize(nop_empties); i++) - ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6)); -} - -// Test that named groups work correctly. 
-TEST(Capture, NamedGroups) { - { - RE2 re("(hello world)"); - ASSERT_EQ(re.NumberOfCapturingGroups(), 1); - const std::map<std::string, int>& m = re.NamedCapturingGroups(); - ASSERT_EQ(m.size(), 0); - } - - { - RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))"); - ASSERT_EQ(re.NumberOfCapturingGroups(), 6); - const std::map<std::string, int>& m = re.NamedCapturingGroups(); - ASSERT_EQ(m.size(), 4); - ASSERT_EQ(m.find("A")->second, 1); - ASSERT_EQ(m.find("B")->second, 2); - ASSERT_EQ(m.find("C")->second, 3); - ASSERT_EQ(m.find("D")->second, 6); // $4 and $5 are anonymous - } -} - -TEST(RE2, CapturedGroupTest) { - RE2 re("directions from (?P<S>.*) to (?P<D>.*)"); - int num_groups = re.NumberOfCapturingGroups(); - EXPECT_EQ(2, num_groups); - std::string args[4]; - RE2::Arg arg0(&args[0]); - RE2::Arg arg1(&args[1]); - RE2::Arg arg2(&args[2]); - RE2::Arg arg3(&args[3]); - - const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3}; - EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose", - re, matches, num_groups)); - const std::map<std::string, int>& named_groups = re.NamedCapturingGroups(); - EXPECT_TRUE(named_groups.find("S") != named_groups.end()); - EXPECT_TRUE(named_groups.find("D") != named_groups.end()); - - // The named group index is 1-based. - int source_group_index = named_groups.find("S")->second; - int destination_group_index = named_groups.find("D")->second; - EXPECT_EQ(1, source_group_index); - EXPECT_EQ(2, destination_group_index); - - // The args is zero-based. 
- EXPECT_EQ("mountain view", args[source_group_index - 1]); - EXPECT_EQ("san jose", args[destination_group_index - 1]); -} - -TEST(RE2, FullMatchWithNoArgs) { - ASSERT_TRUE(RE2::FullMatch("h", "h")); - ASSERT_TRUE(RE2::FullMatch("hello", "hello")); - ASSERT_TRUE(RE2::FullMatch("hello", "h.*o")); - ASSERT_FALSE(RE2::FullMatch("othello", "h.*o")); // Must be anchored at front - ASSERT_FALSE(RE2::FullMatch("hello!", "h.*o")); // Must be anchored at end -} - -TEST(RE2, PartialMatch) { - ASSERT_TRUE(RE2::PartialMatch("x", "x")); - ASSERT_TRUE(RE2::PartialMatch("hello", "h.*o")); - ASSERT_TRUE(RE2::PartialMatch("othello", "h.*o")); - ASSERT_TRUE(RE2::PartialMatch("hello!", "h.*o")); - ASSERT_TRUE(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))")); -} - -TEST(RE2, PartialMatchN) { - RE2::Arg argv[2]; - const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; - - // 0 arg - EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0)); - EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0)); - - // 1 arg - int i; - argv[0] = &i; - EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1)); - EXPECT_EQ(1001, i); - EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1)); - - // Multi-arg - std::string s; - argv[1] = &s; - EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2)); - EXPECT_EQ(42, i); - EXPECT_EQ("life", s); - EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2)); -} - -TEST(RE2, FullMatchZeroArg) { - // Zero-arg - ASSERT_TRUE(RE2::FullMatch("1001", "\\d+")); -} - -TEST(RE2, FullMatchOneArg) { - int i; - - // Single-arg - ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)", &i)); - ASSERT_EQ(i, 1001); - ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &i)); - ASSERT_EQ(i, -123); - ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &i)); - ASSERT_FALSE( - RE2::FullMatch("1234567890123456789012345678901234567890", "(\\d+)", &i)); -} - -TEST(RE2, FullMatchIntegerArg) { - int i; - - // Digits surrounding integer-arg - 
ASSERT_TRUE(RE2::FullMatch("1234", "1(\\d*)4", &i)); - ASSERT_EQ(i, 23); - ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)\\d+", &i)); - ASSERT_EQ(i, 1); - ASSERT_TRUE(RE2::FullMatch("-1234", "(-\\d)\\d+", &i)); - ASSERT_EQ(i, -1); - ASSERT_TRUE(RE2::PartialMatch("1234", "(\\d)", &i)); - ASSERT_EQ(i, 1); - ASSERT_TRUE(RE2::PartialMatch("-1234", "(-\\d)", &i)); - ASSERT_EQ(i, -1); -} - -TEST(RE2, FullMatchStringArg) { - std::string s; - // String-arg - ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &s)); - ASSERT_EQ(s, std::string("ell")); -} - -TEST(RE2, FullMatchStringPieceArg) { - int i; - // StringPiece-arg - StringPiece sp; - ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i)); - ASSERT_EQ(sp.size(), 4); - ASSERT_TRUE(memcmp(sp.data(), "ruby", 4) == 0); - ASSERT_EQ(i, 1234); -} - -TEST(RE2, FullMatchMultiArg) { - int i; - std::string s; - // Multi-arg - ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); - ASSERT_EQ(s, std::string("ruby")); - ASSERT_EQ(i, 1234); -} - -TEST(RE2, FullMatchN) { - RE2::Arg argv[2]; - const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; - - // 0 arg - EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0)); - EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0)); - - // 1 arg - int i; - argv[0] = &i; - EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1)); - EXPECT_EQ(1001, i); - EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1)); - - // Multi-arg - std::string s; - argv[1] = &s; - EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2)); - EXPECT_EQ(42, i); - EXPECT_EQ("life", s); - EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2)); -} - -TEST(RE2, FullMatchIgnoredArg) { - int i; - std::string s; - - // Old-school NULL should be ignored. - ASSERT_TRUE( - RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i)); - ASSERT_EQ(s, std::string("ruby")); - ASSERT_EQ(i, 1234); - - // C++11 nullptr should also be ignored. 
- ASSERT_TRUE(RE2::FullMatch("rubz:1235", "(\\w+)(:)(\\d+)", &s, nullptr, &i)); - ASSERT_EQ(s, std::string("rubz")); - ASSERT_EQ(i, 1235); -} - -TEST(RE2, FullMatchTypedNullArg) { - std::string s; - - // Ignore non-void* NULL arg - ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL)); - ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL)); - ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL)); - ASSERT_TRUE(RE2::FullMatch("1234", "(.*)", (int*)NULL)); - ASSERT_TRUE(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL)); - ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL)); - ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL)); - - // Fail on non-void* NULL arg if the match doesn't parse for the given type. - ASSERT_FALSE(RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL)); - ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (int*)NULL)); - ASSERT_FALSE(RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL)); - ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (double*)NULL)); - ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (float*)NULL)); -} - -// Check that numeric parsing code does not read past the end of -// the number being parsed. -// This implementation requires mmap(2) et al. and thus cannot -// be used unless they are available. 
-TEST(RE2, NULTerminated) { -#if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0 - char *v; - int x; - long pagesize = sysconf(_SC_PAGE_SIZE); - -#ifndef MAP_ANONYMOUS -#define MAP_ANONYMOUS MAP_ANON -#endif - v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)); - ASSERT_TRUE(v != reinterpret_cast<char*>(-1)); - LOG(INFO) << "Memory at " << (void*)v; - ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno; - v[pagesize - 1] = '1'; - - x = 0; - ASSERT_TRUE(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x)); - ASSERT_EQ(x, 1); -#endif -} - -TEST(RE2, FullMatchTypeTests) { - // Type tests - std::string zeros(1000, '0'); - { - char c; - ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c)); - ASSERT_EQ(c, 'H'); - } - { - unsigned char c; - ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c)); - ASSERT_EQ(c, static_cast<unsigned char>('H')); - } - { - int16_t v; - ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); - ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); - ASSERT_TRUE(RE2::FullMatch("32767", "(-?\\d+)", &v)); ASSERT_EQ(v, 32767); - ASSERT_TRUE(RE2::FullMatch("-32768", "(-?\\d+)", &v)); ASSERT_EQ(v, -32768); - ASSERT_FALSE(RE2::FullMatch("-32769", "(-?\\d+)", &v)); - ASSERT_FALSE(RE2::FullMatch("32768", "(-?\\d+)", &v)); - } - { - uint16_t v; - ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v)); ASSERT_EQ(v, 100); - ASSERT_TRUE(RE2::FullMatch("32767", "(\\d+)", &v)); ASSERT_EQ(v, 32767); - ASSERT_TRUE(RE2::FullMatch("65535", "(\\d+)", &v)); ASSERT_EQ(v, 65535); - ASSERT_FALSE(RE2::FullMatch("65536", "(\\d+)", &v)); - } - { - int32_t v; - static const int32_t max = INT32_C(0x7fffffff); - static const int32_t min = -max - 1; - ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); - ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); - ASSERT_TRUE(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); 
ASSERT_EQ(v, max); - ASSERT_TRUE(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); ASSERT_EQ(v, min); - ASSERT_FALSE(RE2::FullMatch("-2147483649", "(-?\\d+)", &v)); - ASSERT_FALSE(RE2::FullMatch("2147483648", "(-?\\d+)", &v)); - - ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v)); - ASSERT_EQ(v, max); - ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v)); - ASSERT_EQ(v, min); - - ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v)); - ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v))); - ASSERT_EQ(v, max); - ASSERT_FALSE(RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v))); - } - { - uint32_t v; - static const uint32_t max = UINT32_C(0xffffffff); - ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v)); ASSERT_EQ(v, 100); - ASSERT_TRUE(RE2::FullMatch("4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max); - ASSERT_FALSE(RE2::FullMatch("4294967296", "(\\d+)", &v)); - ASSERT_FALSE(RE2::FullMatch("-1", "(\\d+)", &v)); - - ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max); - } - { - int64_t v; - static const int64_t max = INT64_C(0x7fffffffffffffff); - static const int64_t min = -max - 1; - std::string str; - - ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); - ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); - - str = std::to_string(max); - ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, max); - - str = std::to_string(min); - ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, min); - - str = std::to_string(max); - ASSERT_NE(str.back(), '9'); - str.back()++; - ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); - - str = std::to_string(min); - ASSERT_NE(str.back(), '9'); - str.back()++; - ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); - } - { - uint64_t v; - int64_t v2; - static const uint64_t max = UINT64_C(0xffffffffffffffff); - std::string str; - - ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); 
ASSERT_EQ(v, 100); - ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v2)); ASSERT_EQ(v2, -100); - - str = std::to_string(max); - ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, max); - - ASSERT_NE(str.back(), '9'); - str.back()++; - ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); - } -} - -TEST(RE2, FloatingPointFullMatchTypes) { - std::string zeros(1000, '0'); - { - float v; - ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v)); ASSERT_EQ(v, 100); - ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v)); ASSERT_EQ(v, -100); - ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v)); ASSERT_EQ(v, float(1e23)); - ASSERT_TRUE(RE2::FullMatch(" 100", "(.*)", &v)); ASSERT_EQ(v, 100); - - ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); - ASSERT_EQ(v, float(1e23)); - - // 6700000000081920.1 is an edge case. - // 6700000000081920 is exactly halfway between - // two float32s, so the .1 should make it round up. - // However, the .1 is outside the precision possible with - // a float64: the nearest float64 is 6700000000081920. - // So if the code uses strtod and then converts to float32, - // round-to-even will make it round down instead of up. - // To pass the test, the parser must call strtof directly. - // This test case is carefully chosen to use only a 17-digit - // number, since C does not guarantee to get the correctly - // rounded answer for strtod and strtof unless the input is - // short. - // - // This is known to fail on Cygwin and MinGW due to a broken - // implementation of strtof(3). And apparently MSVC too. Sigh. 
-#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) - ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v)); - ASSERT_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f); - ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", "(.*)", &v)); - ASSERT_EQ(v, 6700000000081920.1f) - << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f); -#endif - } - { - double v; - ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v)); ASSERT_EQ(v, 100); - ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v)); ASSERT_EQ(v, -100); - ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v)); ASSERT_EQ(v, 1e23); - ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); - ASSERT_EQ(v, double(1e23)); - - ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v)); - ASSERT_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1); - ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", "(.*)", &v)); - ASSERT_EQ(v, 1.0000000596046448) - << StringPrintf("%.17g != %.17g", v, 1.0000000596046448); - } -} - -TEST(RE2, FullMatchAnchored) { - int i; - // Check that matching is fully anchored - ASSERT_FALSE(RE2::FullMatch("x1001", "(\\d+)", &i)); - ASSERT_FALSE(RE2::FullMatch("1001x", "(\\d+)", &i)); - ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &i)); ASSERT_EQ(i, 1001); - ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &i)); ASSERT_EQ(i, 1001); -} - -TEST(RE2, FullMatchBraces) { - // Braces - ASSERT_TRUE(RE2::FullMatch("0abcd", "[0-9a-f+.-]{5,}")); - ASSERT_TRUE(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}")); - ASSERT_FALSE(RE2::FullMatch("0abc", "[0-9a-f+.-]{5,}")); -} - -TEST(RE2, Complicated) { - // Complicated RE2 - ASSERT_TRUE(RE2::FullMatch("foo", "foo|bar|[A-Z]")); - ASSERT_TRUE(RE2::FullMatch("bar", "foo|bar|[A-Z]")); - ASSERT_TRUE(RE2::FullMatch("X", "foo|bar|[A-Z]")); - ASSERT_FALSE(RE2::FullMatch("XY", "foo|bar|[A-Z]")); -} - -TEST(RE2, FullMatchEnd) { - // Check full-match handling (needs '$' tacked on internally) - ASSERT_TRUE(RE2::FullMatch("fo", "fo|foo")); - ASSERT_TRUE(RE2::FullMatch("foo", "fo|foo")); - 
ASSERT_TRUE(RE2::FullMatch("fo", "fo|foo$")); - ASSERT_TRUE(RE2::FullMatch("foo", "fo|foo$")); - ASSERT_TRUE(RE2::FullMatch("foo", "foo$")); - ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$")); - ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar")); - - // Uncomment the following if we change the handling of '$' to - // prevent it from matching a trailing newline - if (false) { - // Check that we don't get bitten by pcre's special handling of a - // '\n' at the end of the string matching '$' - ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$")); - } -} - -TEST(RE2, FullMatchArgCount) { - // Number of args - int a[16]; - ASSERT_TRUE(RE2::FullMatch("", "")); - - memset(a, 0, sizeof(0)); - ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0])); - ASSERT_EQ(a[0], 1); - - memset(a, 0, sizeof(0)); - ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1])); - ASSERT_EQ(a[0], 1); - ASSERT_EQ(a[1], 2); - - memset(a, 0, sizeof(0)); - ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2])); - ASSERT_EQ(a[0], 1); - ASSERT_EQ(a[1], 2); - ASSERT_EQ(a[2], 3); - - memset(a, 0, sizeof(0)); - ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1], - &a[2], &a[3])); - ASSERT_EQ(a[0], 1); - ASSERT_EQ(a[1], 2); - ASSERT_EQ(a[2], 3); - ASSERT_EQ(a[3], 4); - - memset(a, 0, sizeof(0)); - ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1], - &a[2], &a[3], &a[4])); - ASSERT_EQ(a[0], 1); - ASSERT_EQ(a[1], 2); - ASSERT_EQ(a[2], 3); - ASSERT_EQ(a[3], 4); - ASSERT_EQ(a[4], 5); - - memset(a, 0, sizeof(0)); - ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], - &a[1], &a[2], &a[3], &a[4], &a[5])); - ASSERT_EQ(a[0], 1); - ASSERT_EQ(a[1], 2); - ASSERT_EQ(a[2], 3); - ASSERT_EQ(a[3], 4); - ASSERT_EQ(a[4], 5); - ASSERT_EQ(a[5], 6); - - memset(a, 0, sizeof(0)); - ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", - &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6])); - ASSERT_EQ(a[0], 1); - 
ASSERT_EQ(a[1], 2); - ASSERT_EQ(a[2], 3); - ASSERT_EQ(a[3], 4); - ASSERT_EQ(a[4], 5); - ASSERT_EQ(a[5], 6); - ASSERT_EQ(a[6], 7); - - memset(a, 0, sizeof(0)); - ASSERT_TRUE(RE2::FullMatch("1234567890123456", - "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" - "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", - &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], - &a[7], &a[8], &a[9], &a[10], &a[11], &a[12], - &a[13], &a[14], &a[15])); - ASSERT_EQ(a[0], 1); - ASSERT_EQ(a[1], 2); - ASSERT_EQ(a[2], 3); - ASSERT_EQ(a[3], 4); - ASSERT_EQ(a[4], 5); - ASSERT_EQ(a[5], 6); - ASSERT_EQ(a[6], 7); - ASSERT_EQ(a[7], 8); - ASSERT_EQ(a[8], 9); - ASSERT_EQ(a[9], 0); - ASSERT_EQ(a[10], 1); - ASSERT_EQ(a[11], 2); - ASSERT_EQ(a[12], 3); - ASSERT_EQ(a[13], 4); - ASSERT_EQ(a[14], 5); - ASSERT_EQ(a[15], 6); -} - -TEST(RE2, Accessors) { - // Check the pattern() accessor - { - const std::string kPattern = "http://([^/]+)/.*"; - const RE2 re(kPattern); - ASSERT_EQ(kPattern, re.pattern()); - } - - // Check RE2 error field. - { - RE2 re("foo"); - ASSERT_TRUE(re.error().empty()); // Must have no error - ASSERT_TRUE(re.ok()); - ASSERT_EQ(re.error_code(), RE2::NoError); - } -} - -TEST(RE2, UTF8) { - // Check UTF-8 handling - // Three Japanese characters (nihongo) - const char utf8_string[] = { - (char)0xe6, (char)0x97, (char)0xa5, // 65e5 - (char)0xe6, (char)0x9c, (char)0xac, // 627c - (char)0xe8, (char)0xaa, (char)0x9e, // 8a9e - 0 - }; - const char utf8_pattern[] = { - '.', - (char)0xe6, (char)0x9c, (char)0xac, // 627c - '.', - 0 - }; - - // Both should match in either mode, bytes or UTF-8 - RE2 re_test1(".........", RE2::Latin1); - ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test1)); - RE2 re_test2("..."); - ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test2)); - - // Check that '.' matches one byte or UTF-8 character - // according to the mode. 
- std::string s; - RE2 re_test3("(.)", RE2::Latin1); - ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test3, &s)); - ASSERT_EQ(s, std::string("\xe6")); - RE2 re_test4("(.)"); - ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test4, &s)); - ASSERT_EQ(s, std::string("\xe6\x97\xa5")); - - // Check that string matches itself in either mode - RE2 re_test5(utf8_string, RE2::Latin1); - ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test5)); - RE2 re_test6(utf8_string); - ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test6)); - - // Check that pattern matches string only in UTF8 mode - RE2 re_test7(utf8_pattern, RE2::Latin1); - ASSERT_FALSE(RE2::FullMatch(utf8_string, re_test7)); - RE2 re_test8(utf8_pattern); - ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test8)); -} - -TEST(RE2, UngreedyUTF8) { - // Check that ungreedy, UTF8 regular expressions don't match when they - // oughtn't -- see bug 82246. - { - // This code always worked. - const char* pattern = "\\w+X"; - const std::string target = "a aX"; - RE2 match_sentence(pattern, RE2::Latin1); - RE2 match_sentence_re(pattern); - - ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); - ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); - } - { - const char* pattern = "(?U)\\w+X"; - const std::string target = "a aX"; - RE2 match_sentence(pattern, RE2::Latin1); - ASSERT_EQ(match_sentence.error(), ""); - RE2 match_sentence_re(pattern); - - ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); - ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); - } -} - -TEST(RE2, Rejects) { - { - RE2 re("a\\1", RE2::Quiet); - ASSERT_FALSE(re.ok()); } - { - RE2 re("a[x", RE2::Quiet); - ASSERT_FALSE(re.ok()); - } - { - RE2 re("a[z-a]", RE2::Quiet); - ASSERT_FALSE(re.ok()); - } - { - RE2 re("a[[:foobar:]]", RE2::Quiet); - ASSERT_FALSE(re.ok()); - } - { - RE2 re("a(b", RE2::Quiet); - ASSERT_FALSE(re.ok()); - } - { - RE2 re("a\\", RE2::Quiet); - ASSERT_FALSE(re.ok()); - } -} - -TEST(RE2, NoCrash) { - // Test that using a bad regexp doesn't crash. 
- { - RE2 re("a\\", RE2::Quiet); - ASSERT_FALSE(re.ok()); - ASSERT_FALSE(RE2::PartialMatch("a\\b", re)); - } - - // Test that using an enormous regexp doesn't crash - { - RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet); - ASSERT_FALSE(re.ok()); - ASSERT_FALSE(RE2::PartialMatch("aaa", re)); - } - - // Test that a crazy regexp still compiles and runs. - { - RE2 re(".{512}x", RE2::Quiet); - ASSERT_TRUE(re.ok()); - std::string s; - s.append(515, 'c'); - s.append("x"); - ASSERT_TRUE(RE2::PartialMatch(s, re)); - } -} - -TEST(RE2, Recursion) { - // Test that recursion is stopped. - // This test is PCRE-legacy -- there's no recursion in RE2. - int bytes = 15 * 1024; // enough to crash PCRE - TestRecursion(bytes, "."); - TestRecursion(bytes, "a"); - TestRecursion(bytes, "a."); - TestRecursion(bytes, "ab."); - TestRecursion(bytes, "abc."); -} - -TEST(RE2, BigCountedRepetition) { - // Test that counted repetition works, given tons of memory. - RE2::Options opt; - opt.set_max_mem(256<<20); - - RE2 re(".{512}x", opt); - ASSERT_TRUE(re.ok()); - std::string s; - s.append(515, 'c'); - s.append("x"); - ASSERT_TRUE(RE2::PartialMatch(s, re)); -} - -TEST(RE2, DeepRecursion) { - // Test for deep stack recursion. This would fail with a - // segmentation violation due to stack overflow before pcre was - // patched. - // Again, a PCRE legacy test. RE2 doesn't recurse. - std::string comment("x*"); - std::string a(131072, 'a'); - comment += a; - comment += "*x"; - RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)"); - ASSERT_TRUE(RE2::FullMatch(comment, re)); -} - -// Suggested by Josh Hyman. Failed when SearchOnePass was -// not implementing case-folding. 
-TEST(CaseInsensitive, MatchAndConsume) { - std::string text = "A fish named *Wanda*"; - StringPiece sp(text); - StringPiece result; - EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result)); - EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result)); -} - -// RE2 should permit implicit conversions from string, StringPiece, const char*, -// and C string literals. -TEST(RE2, ImplicitConversions) { - std::string re_string("."); - StringPiece re_stringpiece("."); - const char* re_cstring = "."; - EXPECT_TRUE(RE2::PartialMatch("e", re_string)); - EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece)); - EXPECT_TRUE(RE2::PartialMatch("e", re_cstring)); - EXPECT_TRUE(RE2::PartialMatch("e", ".")); -} - -// Bugs introduced by 8622304 -TEST(RE2, CL8622304) { - // reported by ingow - std::string dir; - EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok - EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails - - // reported by jacobsa - std::string key, val; - EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true", - "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?", - &key, - &val)); - EXPECT_EQ(key, "bar"); - EXPECT_EQ(val, "1,0x2F,030,4,5"); -} - -// Check that RE2 returns correct regexp pieces on error. -// In particular, make sure it returns whole runes -// and that it always reports invalid UTF-8. -// Also check that Perl error flag piece is big enough. 
-static struct ErrorTest { - const char *regexp; - RE2::ErrorCode error_code; - const char *error_arg; -} error_tests[] = { - { "ab\\αcd", RE2::ErrorBadEscape, "\\α" }, - { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" }, - { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" }, - { "ij\\x1", RE2::ErrorBadEscape, "\\x1" }, - { "kl\\x", RE2::ErrorBadEscape, "\\x" }, - { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" }, - { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" }, - // used to return (?s but the error is X - { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" }, - { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" }, - { "bb[abc", RE2::ErrorMissingBracket, "[abc" }, - { "abc(def", RE2::ErrorMissingParen, "abc(def" }, - { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" }, - - // no argument string returned for invalid UTF-8 - { "mn\\x1\377", RE2::ErrorBadUTF8, "" }, - { "op\377qr", RE2::ErrorBadUTF8, "" }, - { "st\\x{00000\377", RE2::ErrorBadUTF8, "" }, - { "zz\\p{\377}", RE2::ErrorBadUTF8, "" }, - { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" }, - { "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" }, -}; -TEST(RE2, ErrorCodeAndArg) { - for (size_t i = 0; i < arraysize(error_tests); i++) { - RE2 re(error_tests[i].regexp, RE2::Quiet); - EXPECT_FALSE(re.ok()); - EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error(); - EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error(); - } -} - -// Check that "never match \n" mode never matches \n. 
-static struct NeverTest { - const char* regexp; - const char* text; - const char* match; -} never_tests[] = { - { "(.*)", "abc\ndef\nghi\n", "abc" }, - { "(?s)(abc.*def)", "abc\ndef\n", NULL }, - { "(abc(.|\n)*def)", "abc\ndef\n", NULL }, - { "(abc[^x]*def)", "abc\ndef\n", NULL }, - { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" }, -}; -TEST(RE2, NeverNewline) { - RE2::Options opt; - opt.set_never_nl(true); - for (size_t i = 0; i < arraysize(never_tests); i++) { - const NeverTest& t = never_tests[i]; - RE2 re(t.regexp, opt); - if (t.match == NULL) { - EXPECT_FALSE(re.PartialMatch(t.text, re)); - } else { - StringPiece m; - EXPECT_TRUE(re.PartialMatch(t.text, re, &m)); - EXPECT_EQ(m, t.match); - } - } -} - -// Check that dot_nl option works. -TEST(RE2, DotNL) { - RE2::Options opt; - opt.set_dot_nl(true); - EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt))); - EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt))); - opt.set_never_nl(true); - EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt))); -} - -// Check that there are no capturing groups in "never capture" mode. -TEST(RE2, NeverCapture) { - RE2::Options opt; - opt.set_never_capture(true); - RE2 re("(r)(e)", opt); - EXPECT_EQ(0, re.NumberOfCapturingGroups()); -} - -// Bitstate bug was looking at submatch[0] even if nsubmatch == 0. -// Triggered by a failed DFA search falling back to Bitstate when -// using Match with a NULL submatch set. Bitstate tried to read -// the submatch[0] entry even if nsubmatch was 0. -TEST(RE2, BitstateCaptureBug) { - RE2::Options opt; - opt.set_max_mem(20000); - RE2 re("(_________$)", opt); - StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x"; - EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); -} - -// C++ version of bug 609710. 
-TEST(RE2, UnicodeClasses) { - const std::string str = "ABCDEFGHI譚永鋒"; - std::string a, b, c; - - EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}")); - EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}")); - EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}")); - EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}")); - EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}")); - EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}")); - - EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}")); - EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}")); - EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}")); - EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}")); - EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}")); - EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}")); - - EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}")); - EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}")); - EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}")); - EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}")); - EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}")); - EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}")); - - EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}")); - EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}")); - EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}")); - EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}")); - EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}")); - EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}")); - - EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c)); - EXPECT_EQ("A", a); - EXPECT_EQ("B", b); - EXPECT_EQ("C", c); - - EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c)); - EXPECT_EQ("A", a); - EXPECT_EQ("B", b); - EXPECT_EQ("C", c); - - EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}")); - - EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c)); - EXPECT_EQ("A", a); - EXPECT_EQ("B", b); - EXPECT_EQ("C", c); - - EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]")); - - EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c)); - EXPECT_EQ("譚", a); - EXPECT_EQ("永", b); - EXPECT_EQ("鋒", c); -} - -TEST(RE2, LazyRE2) { - // Test with and without options. 
- static LazyRE2 a = {"a"}; - static LazyRE2 b = {"b", RE2::Latin1}; - - EXPECT_EQ("a", a->pattern()); - EXPECT_EQ(RE2::Options::EncodingUTF8, a->options().encoding()); - - EXPECT_EQ("b", b->pattern()); - EXPECT_EQ(RE2::Options::EncodingLatin1, b->options().encoding()); -} - -// Bug reported by saito. 2009/02/17 -TEST(RE2, NullVsEmptyString) { - RE2 re(".*"); - EXPECT_TRUE(re.ok()); - - StringPiece null; - EXPECT_TRUE(RE2::FullMatch(null, re)); - - StringPiece empty(""); - EXPECT_TRUE(RE2::FullMatch(empty, re)); -} - -// Similar to the previous test, check that the null string and the empty -// string both match, but also that the null string can only provide null -// submatches whereas the empty string can also provide empty submatches. -TEST(RE2, NullVsEmptyStringSubmatches) { - RE2 re("()|(foo)"); - EXPECT_TRUE(re.ok()); - - // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent. - StringPiece matches[4]; - - for (size_t i = 0; i < arraysize(matches); i++) - matches[i] = "bar"; - - StringPiece null; - EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED, - matches, arraysize(matches))); - for (size_t i = 0; i < arraysize(matches); i++) { - EXPECT_TRUE(matches[i].data() == NULL); // always null - EXPECT_TRUE(matches[i].empty()); - } - - for (size_t i = 0; i < arraysize(matches); i++) - matches[i] = "bar"; - - StringPiece empty(""); - EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED, - matches, arraysize(matches))); - EXPECT_TRUE(matches[0].data() != NULL); // empty, not null - EXPECT_TRUE(matches[0].empty()); - EXPECT_TRUE(matches[1].data() != NULL); // empty, not null - EXPECT_TRUE(matches[1].empty()); - EXPECT_TRUE(matches[2].data() == NULL); - EXPECT_TRUE(matches[2].empty()); - EXPECT_TRUE(matches[3].data() == NULL); - EXPECT_TRUE(matches[3].empty()); -} - -// Issue 1816809 -TEST(RE2, Bug1816809) { - RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))"); - StringPiece piece("llx-3;llx4"); - std::string x; - 
EXPECT_TRUE(RE2::Consume(&piece, re, &x)); -} - -// Issue 3061120 -TEST(RE2, Bug3061120) { - RE2 re("(?i)\\W"); - EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked - EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin - EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s -} - -TEST(RE2, CapturingGroupNames) { - // Opening parentheses annotated with group IDs: - // 12 3 45 6 7 - RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))"); - EXPECT_TRUE(re.ok()); - const std::map<int, std::string>& have = re.CapturingGroupNames(); - std::map<int, std::string> want; - want[3] = "G2"; - want[6] = "G2"; - want[7] = "G1"; - EXPECT_EQ(want, have); -} - -TEST(RE2, RegexpToStringLossOfAnchor) { - EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at"); - EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at"); - EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$"); - EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)"); -} - -// Issue 10131674 -TEST(RE2, Bug10131674) { - // Some of these escapes describe values that do not fit in a byte. - RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1); - EXPECT_FALSE(re.ok()); - EXPECT_FALSE(RE2::FullMatch("hello world", re)); -} - -TEST(RE2, Bug18391750) { - // Stray write past end of match_ in nfa.cc, caught by fuzzing + address sanitizer. 
- const char t[] = { - (char)0x28, (char)0x28, (char)0xfc, (char)0xfc, (char)0x08, (char)0x08, - (char)0x26, (char)0x26, (char)0x28, (char)0xc2, (char)0x9b, (char)0xc5, - (char)0xc5, (char)0xd4, (char)0x8f, (char)0x8f, (char)0x69, (char)0x69, - (char)0xe7, (char)0x29, (char)0x7b, (char)0x37, (char)0x31, (char)0x31, - (char)0x7d, (char)0xae, (char)0x7c, (char)0x7c, (char)0xf3, (char)0x29, - (char)0xae, (char)0xae, (char)0x2e, (char)0x2a, (char)0x29, (char)0x00, - }; - RE2::Options opt; - opt.set_encoding(RE2::Options::EncodingLatin1); - opt.set_longest_match(true); - opt.set_dot_nl(true); - opt.set_case_sensitive(false); - RE2 re(t, opt); - ASSERT_TRUE(re.ok()); - RE2::PartialMatch(t, re); -} - -TEST(RE2, Bug18458852) { - // Bug in parser accepting invalid (too large) rune, - // causing compiler to fail in DCHECK in UTF-8 - // character class code. - const char b[] = { - (char)0x28, (char)0x05, (char)0x05, (char)0x41, (char)0x41, (char)0x28, - (char)0x24, (char)0x5b, (char)0x5e, (char)0xf5, (char)0x87, (char)0x87, - (char)0x90, (char)0x29, (char)0x5d, (char)0x29, (char)0x29, (char)0x00, - }; - RE2 re(b); - ASSERT_FALSE(re.ok()); -} - -TEST(RE2, Bug18523943) { - // Bug in BitState: case kFailInst failed the match entirely. - - RE2::Options opt; - const char a[] = { - (char)0x29, (char)0x29, (char)0x24, (char)0x00, - }; - const char b[] = { - (char)0x28, (char)0x0a, (char)0x2a, (char)0x2a, (char)0x29, (char)0x00, - }; - opt.set_log_errors(false); - opt.set_encoding(RE2::Options::EncodingLatin1); - opt.set_posix_syntax(true); - opt.set_longest_match(true); - opt.set_literal(false); - opt.set_never_nl(true); - - RE2 re((const char*)b, opt); - ASSERT_TRUE(re.ok()); - std::string s1; - ASSERT_TRUE(RE2::PartialMatch((const char*)a, re, &s1)); -} - -TEST(RE2, Bug21371806) { - // Bug in parser accepting Unicode groups in Latin-1 mode, - // causing compiler to fail in DCHECK in prog.cc. 
- - RE2::Options opt; - opt.set_encoding(RE2::Options::EncodingLatin1); - - RE2 re("g\\p{Zl}]", opt); - ASSERT_TRUE(re.ok()); -} - -TEST(RE2, Bug26356109) { - // Bug in parser caused by factoring of common prefixes in alternations. - - // In the past, this was factored to "a\\C*?[bc]". Thus, the automaton would - // consume "ab" and then stop (when unanchored) whereas it should consume all - // of "abc" as per first-match semantics. - RE2 re("a\\C*?c|a\\C*?b"); - ASSERT_TRUE(re.ok()); - - std::string s = "abc"; - StringPiece m; - - ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); - ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'"; - - ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::ANCHOR_BOTH, &m, 1)); - ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'"; -} - -TEST(RE2, Issue104) { - // RE2::GlobalReplace always advanced by one byte when the empty string was - // matched, which would clobber any rune that is longer than one byte. - - std::string s = "bc"; - ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d")); - ASSERT_EQ("dbdcd", s); - - s = "ąć"; - ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ")); - ASSERT_EQ("ĈąĈćĈ", s); - - s = "人类"; - ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小")); - ASSERT_EQ("小人小类小", s); -} - -TEST(RE2, Issue310) { - // (?:|a)* matched more text than (?:|a)+ did. - - std::string s = "aaa"; - StringPiece m; - - RE2 star("(?:|a)*"); - ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); - ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; - - RE2 plus("(?:|a)+"); - ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); - ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; -} - -} // namespace re2 +// -*- coding: utf-8 -*- +// Copyright 2002-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+// TODO: Test extractions for PartialMatch/Consume
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+#include <sys/mman.h>
+#include <unistd.h> /* for sysconf */
+#endif
+
+#include "library/cpp/testing/gtest/gtest.h"
+#include "util/logging.h"
+#include "util/strutil.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Parses the stringified token both with RE2::Hex and (prefixed by "0x")
+// with RE2::CRadix, and compares each result with the C++ literal 0x<value>.
+TEST(RE2, HexTests) {
+#define ASSERT_HEX(type, value) \
+  do { \
+    type v; \
+    ASSERT_TRUE( \
+        RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \
+    ASSERT_EQ(v, 0x##value); \
+    ASSERT_TRUE(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
+                               RE2::CRadix(&v))); \
+    ASSERT_EQ(v, 0x##value); \
+  } while (0)
+
+  ASSERT_HEX(short, 2bad);
+  ASSERT_HEX(unsigned short, 2badU);
+  ASSERT_HEX(int, dead);
+  ASSERT_HEX(unsigned int, deadU);
+  ASSERT_HEX(long, 7eadbeefL);
+  ASSERT_HEX(unsigned long, deadbeefUL);
+  ASSERT_HEX(long long, 12345678deadbeefLL);
+  ASSERT_HEX(unsigned long long, cafebabedeadbeefULL);
+
+#undef ASSERT_HEX
+}
+
+// Same scheme as HexTests, using RE2::Octal and a "0"-prefixed input for
+// RE2::CRadix; the expected value is the C++ literal 0<value>.
+TEST(RE2, OctalTests) {
+#define ASSERT_OCTAL(type, value) \
+  do { \
+    type v; \
+    ASSERT_TRUE(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \
+    ASSERT_EQ(v, 0##value); \
+    ASSERT_TRUE(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
+                               RE2::CRadix(&v))); \
+    ASSERT_EQ(v, 0##value); \
+  } while (0)
+
+  ASSERT_OCTAL(short, 77777);
+  ASSERT_OCTAL(unsigned short, 177777U);
+  ASSERT_OCTAL(int, 17777777777);
+  ASSERT_OCTAL(unsigned int, 37777777777U);
+  ASSERT_OCTAL(long, 17777777777L);
+  ASSERT_OCTAL(unsigned long, 37777777777UL);
+  ASSERT_OCTAL(long long, 777777777777777777777LL);
+  ASSERT_OCTAL(unsigned long long, 1777777777777777777777ULL);
+
+#undef ASSERT_OCTAL
+}
+
+// Parses the stringified token once through a plain typed pointer and once
+// through RE2::CRadix, and compares each result with the literal itself.
+TEST(RE2, DecimalTests) {
+#define ASSERT_DECIMAL(type, value) \
+  do { \
+    type v; \
+    ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \
+    ASSERT_EQ(v, value); \
+    ASSERT_TRUE( \
+        RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \
+    ASSERT_EQ(v, value); \
+  } while (0)
+
+  ASSERT_DECIMAL(short, -1);
+  ASSERT_DECIMAL(unsigned short, 9999);
+  ASSERT_DECIMAL(int, -1000);
+  ASSERT_DECIMAL(unsigned int, 12345U);
+  ASSERT_DECIMAL(long, -10000000L);
+  ASSERT_DECIMAL(unsigned long, 3083324652U);
+  ASSERT_DECIMAL(long long, -100000000000000LL);
+  ASSERT_DECIMAL(unsigned long long, 1234567890987654321ULL);
+
+#undef ASSERT_DECIMAL
+}
+
+// Table-driven check of RE2::Replace (expected result in `single`) and
+// RE2::GlobalReplace (expected result in `global`, expected return value in
+// `greplace_count`). The table ends at the row whose `original` is NULL.
+TEST(RE2, Replace) {
+  struct ReplaceTest {
+    const char *regexp;
+    const char *rewrite;
+    const char *original;
+    const char *single;
+    const char *global;
+    int greplace_count;
+  };
+  static const ReplaceTest tests[] = {
+    { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
+      "\\2\\1ay",
+      "the quick brown fox jumps over the lazy dogs.",
+      "ethay quick brown fox jumps over the lazy dogs.",
+      "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
+      9 },
+    { "\\w+",
+      "\\0-NOSPAM",
+      "abcd.efghi@google.com",
+      "abcd-NOSPAM.efghi@google.com",
+      "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
+      4 },
+    { "^",
+      "(START)",
+      "foo",
+      "(START)foo",
+      "(START)foo",
+      1 },
+    { "^",
+      "(START)",
+      "",
+      "(START)",
+      "(START)",
+      1 },
+    { "$",
+      "(END)",
+      "",
+      "(END)",
+      "(END)",
+      1 },
+    { "b",
+      "bb",
+      "ababababab",
+      "abbabababab",
+      "abbabbabbabbabb",
+      5 },
+    { "b",
+      "bb",
+      "bbbbbb",
+      "bbbbbbb",
+      "bbbbbbbbbbbb",
+      6 },
+    { "b+",
+      "bb",
+      "bbbbbb",
+      "bb",
+      "bb",
+      1 },
+    { "b*",
+      "bb",
+      "bbbbbb",
+      "bb",
+      "bb",
+      1 },
+    { "b*",
+      "bb",
+      "aaaaa",
+      "bbaaaaa",
+      "bbabbabbabbabbabb",
+      6 },
+    // Check newline handling
+    { "a.*a",
+      "(\\0)",
+      "aba\naba",
+      "(aba)\naba",
+      "(aba)\n(aba)",
+      2 },
+    { "", NULL, NULL, NULL, NULL, 0 }
+  };
+
+  for (const ReplaceTest* t = tests; t->original != NULL; t++) {
+    std::string one(t->original);
+    ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite));
+    ASSERT_EQ(one, t->single);
+    std::string all(t->original);
+    ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
+      << "Got: " << all;
+    ASSERT_EQ(all, t->global);
+  }
+}
+
+// Compiles `regexp` and expects CheckRewriteString(rewrite) to return
+// `expect_ok`; the error string is included in the failure message.
+static void TestCheckRewriteString(const char* regexp, const char* rewrite,
+                                   bool expect_ok) {
+  std::string error;
+  RE2 exp(regexp);
+  bool actual_ok = exp.CheckRewriteString(rewrite, &error);
+  EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
+}
+
+// Rewrite strings checked against patterns with zero, one and two groups.
+TEST(CheckRewriteString, all) {
+  TestCheckRewriteString("abc", "foo", true);
+  TestCheckRewriteString("abc", "foo\\", false);
+  TestCheckRewriteString("abc", "foo\\0bar", true);
+
+  TestCheckRewriteString("a(b)c", "foo", true);
+  TestCheckRewriteString("a(b)c", "foo\\0bar", true);
+  TestCheckRewriteString("a(b)c", "foo\\1bar", true);
+  TestCheckRewriteString("a(b)c", "foo\\2bar", false);
+  TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true);
+
+  TestCheckRewriteString("a(b)(c)", "foo\\12", true);
+  TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true);
+  TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false);
+}
+
+// RE2::Extract writes the rewritten match into `s` and leaves `s` untouched
+// when the pattern does not match.
+TEST(RE2, Extract) {
+  std::string s;
+
+  ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s));
+  ASSERT_EQ(s, "kremvax!boris");
+
+  ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s));
+  ASSERT_EQ(s, "'foo'");
+  // check that false match doesn't overwrite
+  ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s));
+  ASSERT_EQ(s, "'foo'");
+}
+
+// The rewrite refers to group 2 of a one-group pattern, so Extract, Replace
+// and GlobalReplace are all expected to fail.
+TEST(RE2, MaxSubmatchTooLarge) {
+  std::string s;
+  ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s));
+  s = "foo";
+  ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2"));
+  s = "foo";
+  ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2"));
+}
+
+// Repeated RE2::Consume calls on the same StringPiece yield successive words
+// until the remaining input no longer matches at its start.
+TEST(RE2, Consume) {
+  RE2 r("\\s*(\\w+)");  // matches a word, possibly preceded by whitespace
+  std::string word;
+
+  std::string s(" aaa b!@#$@#$cccc");
+  StringPiece input(s);
+
+  ASSERT_TRUE(RE2::Consume(&input, r, &word));
+  ASSERT_EQ(word, "aaa") << " input: " << input;
+  ASSERT_TRUE(RE2::Consume(&input, r, &word));
+  ASSERT_EQ(word, "b") << " input: " << input;
+  ASSERT_FALSE(RE2::Consume(&input, r, &word)) << " input: " << input;
+}
+
+// RE2::ConsumeN with zero, one and two output arguments.
+TEST(RE2, ConsumeN) {
+  const std::string s(" one two three 4");
+  StringPiece input(s);
+
+  RE2::Arg argv[2];
+  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+  // 0 arg
+  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0));  // Skips "one".
+
+  // 1 arg
+  std::string word;
+  argv[0] = &word;
+  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1));
+  EXPECT_EQ("two", word);
+
+  // Multi-args
+  int n;
+  argv[1] = &n;
+  EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2));
+  EXPECT_EQ("three", word);
+  EXPECT_EQ(4, n);
+}
+
+// Repeated RE2::FindAndConsume calls return each word in turn, skipping the
+// non-word characters between them.
+TEST(RE2, FindAndConsume) {
+  RE2 r("(\\w+)");  // matches a word
+  std::string word;
+
+  std::string s(" aaa b!@#$@#$cccc");
+  StringPiece input(s);
+
+  ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word));
+  ASSERT_EQ(word, "aaa");
+  ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word));
+  ASSERT_EQ(word, "b");
+  ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word));
+  ASSERT_EQ(word, "cccc");
+  ASSERT_FALSE(RE2::FindAndConsume(&input, r, &word));
+
+  // Check that FindAndConsume works without any submatches.
+  // Earlier version used uninitialized data for
+  // length to consume.
+  input = "aaa";
+  ASSERT_TRUE(RE2::FindAndConsume(&input, "aaa"));
+  ASSERT_EQ(input, "");
+}
+
+// RE2::FindAndConsumeN with zero, one and two output arguments.
+TEST(RE2, FindAndConsumeN) {
+  const std::string s(" one two three 4");
+  StringPiece input(s);
+
+  RE2::Arg argv[2];
+  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+  // 0 arg
+  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0));  // Skips "one".
+
+  // 1 arg
+  std::string word;
+  argv[0] = &word;
+  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1));
+  EXPECT_EQ("two", word);
+
+  // Multi-args
+  int n;
+  argv[1] = &n;
+  EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2));
+  EXPECT_EQ("three", word);
+  EXPECT_EQ(4, n);
+}
+
+// Capture groups in alternatives that did not take part in the match are
+// expected to come back as empty strings.
+TEST(RE2, MatchNumberPeculiarity) {
+  RE2 r("(foo)|(bar)|(baz)");
+  std::string word1;
+  std::string word2;
+  std::string word3;
+
+  ASSERT_TRUE(RE2::PartialMatch("foo", r, &word1, &word2, &word3));
+  ASSERT_EQ(word1, "foo");
+  ASSERT_EQ(word2, "");
+  ASSERT_EQ(word3, "");
+  ASSERT_TRUE(RE2::PartialMatch("bar", r, &word1, &word2, &word3));
+  ASSERT_EQ(word1, "");
+  ASSERT_EQ(word2, "bar");
+  ASSERT_EQ(word3, "");
+  ASSERT_TRUE(RE2::PartialMatch("baz", r, &word1, &word2, &word3));
+  ASSERT_EQ(word1, "");
+  ASSERT_EQ(word2, "");
+  ASSERT_EQ(word3, "baz");
+  ASSERT_FALSE(RE2::PartialMatch("f", r, &word1, &word2, &word3));
+
+  std::string a;
+  ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &a));
+  ASSERT_EQ(a, "");
+}
+
+// Exercises RE2::Match with an explicit submatch array, then the same
+// pattern through PartialMatch with typed output arguments.
+TEST(RE2, Match) {
+  RE2 re("((\\w+):([0-9]+))");  // extracts host and port
+  StringPiece group[4];
+
+  // No match.
+  StringPiece s = "zyzzyva";
+  ASSERT_FALSE(
+      re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group)));
+
+  // Matches and extracts.
+  s = "a chrisr:9000 here";
+  ASSERT_TRUE(
+      re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group)));
+  ASSERT_EQ(group[0], "chrisr:9000");
+  ASSERT_EQ(group[1], "chrisr:9000");
+  ASSERT_EQ(group[2], "chrisr");
+  ASSERT_EQ(group[3], "9000");
+
+  std::string all, host;
+  int port;
+  ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port));
+  ASSERT_EQ(all, "chrisr:9000");
+  ASSERT_EQ(host, "chrisr");
+  ASSERT_EQ(port, 9000);
+}
+
+// Builds a `size`-byte subject by cycling through `pattern` and full-matches
+// it against a domain-name-like regexp; the match result is not checked.
+static void TestRecursion(int size, const char* pattern) {
+  // Fill up a string repeating the pattern given
+  std::string domain;
+  domain.resize(size);
+  size_t patlen = strlen(pattern);
+  for (int i = 0; i < size; i++) {
+    domain[i] = pattern[i % patlen];
+  }
+  // Just make sure it doesn't crash due to too much recursion.
+  RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
+  RE2::FullMatch(domain, re);
+}
+
+// A meta-quoted string, interpreted as a pattern, should always match
+// the original unquoted string.
+static void TestQuoteMeta(const std::string& unquoted,
+                          const RE2::Options& options = RE2::DefaultOptions) {
+  std::string quoted = RE2::QuoteMeta(unquoted);
+  RE2 re(quoted, options);
+  EXPECT_TRUE(RE2::FullMatch(unquoted, re))
+      << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
+}
+
+// A meta-quoted string, interpreted as a pattern, should not match
+// `should_not_match`, a string that differs from the unquoted original.
+static void NegativeTestQuoteMeta(
+    const std::string& unquoted, const std::string& should_not_match,
+    const RE2::Options& options = RE2::DefaultOptions) {
+  std::string quoted = RE2::QuoteMeta(unquoted);
+  RE2 re(quoted, options);
+  EXPECT_FALSE(RE2::FullMatch(should_not_match, re))
+      << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
+}
+
+// Tests that quoted meta characters match their original strings,
+// and that a few things that shouldn't match indeed do not.
+TEST(QuoteMeta, Simple) {
+  TestQuoteMeta("foo");
+  TestQuoteMeta("foo.bar");
+  TestQuoteMeta("foo\\.bar");
+  TestQuoteMeta("[1-9]");
+  TestQuoteMeta("1.5-2.0?");
+  TestQuoteMeta("\\d");
+  TestQuoteMeta("Who doesn't like ice cream?");
+  TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
+  TestQuoteMeta("((?!)xxx).*yyy");
+  TestQuoteMeta("([");
+}
+// Each quoted first argument must fail to full-match the second argument.
+TEST(QuoteMeta, SimpleNegative) {
+  NegativeTestQuoteMeta("foo", "bar");
+  NegativeTestQuoteMeta("...", "bar");
+  NegativeTestQuoteMeta("\\.", ".");
+  NegativeTestQuoteMeta("\\.", "..");
+  NegativeTestQuoteMeta("(a)", "a");
+  NegativeTestQuoteMeta("(a|b)", "a");
+  NegativeTestQuoteMeta("(a|b)", "(a)");
+  NegativeTestQuoteMeta("(a|b)", "a|b");
+  NegativeTestQuoteMeta("[0-9]", "0");
+  NegativeTestQuoteMeta("[0-9]", "0-9");
+  NegativeTestQuoteMeta("[0-9]", "[9]");
+  NegativeTestQuoteMeta("((?!)xxx)", "xxx");
+}
+
+// Quoting a string containing the non-ASCII byte 0xb2, compiled with the
+// RE2::Latin1 options.
+TEST(QuoteMeta, Latin1) {
+  TestQuoteMeta("3\xb2 = 9", RE2::Latin1);
+}
+
+// Quoting strings that contain 2-, 3- and 4-byte UTF-8 sequences.
+TEST(QuoteMeta, UTF8) {
+  TestQuoteMeta("Plácido Domingo");
+  TestQuoteMeta("xyz");  // No fancy utf8.
+  TestQuoteMeta("\xc2\xb0");  // 2-byte utf8 -- a degree symbol.
+  TestQuoteMeta("27\xc2\xb0 degrees");  // As a middle character.
+  TestQuoteMeta("\xe2\x80\xb3");  // 3-byte utf8 -- a double prime.
+  TestQuoteMeta("\xf0\x9d\x85\x9f");  // 4-byte utf8 -- a music note.
+  TestQuoteMeta("27\xc2\xb0");  // Interpreted as Latin-1, this should
+                                // still work.
+  NegativeTestQuoteMeta("27\xc2\xb0",
+                        "27\\\xc2\\\xb0");  // 2-byte utf8 -- a degree symbol.
+}
+
+// Quoting strings that contain an embedded NUL byte.
+TEST(QuoteMeta, HasNull) {
+  std::string has_null;
+
+  // string with one null character
+  has_null += '\0';
+  TestQuoteMeta(has_null);
+  NegativeTestQuoteMeta(has_null, "");
+
+  // Don't want null-followed-by-'1' to be interpreted as '\01'.
+  has_null += '1';
+  TestQuoteMeta(has_null);
+  NegativeTestQuoteMeta(has_null, "\1");
+}
+
+// ProgramSize() and ReverseProgramSize() must be positive and must grow
+// from the simple to the medium to the complex pattern.
+TEST(ProgramSize, BigProgram) {
+  RE2 re_simple("simple regexp");
+  RE2 re_medium("medium.*regexp");
+  RE2 re_complex("complex.{1,128}regexp");
+
+  ASSERT_GT(re_simple.ProgramSize(), 0);
+  ASSERT_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
+  ASSERT_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
+
+  ASSERT_GT(re_simple.ReverseProgramSize(), 0);
+  ASSERT_GT(re_medium.ReverseProgramSize(), re_simple.ReverseProgramSize());
+  ASSERT_GT(re_complex.ReverseProgramSize(), re_medium.ReverseProgramSize());
+}
+
+// Pins the return value of ProgramFanout()/ReverseProgramFanout() and the
+// histogram count at that index for four patterns of increasing repetition.
+TEST(ProgramFanout, BigProgram) {
+  RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)");
+  RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)");
+  RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)");
+  RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");
+
+  std::vector<int> histogram;
+
+  // 3 is the largest non-empty bucket and has 2 elements.
+  ASSERT_EQ(3, re1.ProgramFanout(&histogram));
+  ASSERT_EQ(2, histogram[3]);
+
+  // 6 is the largest non-empty bucket and has 11 elements.
+  ASSERT_EQ(6, re10.ProgramFanout(&histogram));
+  ASSERT_EQ(11, histogram[6]);
+
+  // 9 is the largest non-empty bucket and has 101 elements.
+  ASSERT_EQ(9, re100.ProgramFanout(&histogram));
+  ASSERT_EQ(101, histogram[9]);
+
+  // 13 is the largest non-empty bucket and has 1001 elements.
+  ASSERT_EQ(13, re1000.ProgramFanout(&histogram));
+  ASSERT_EQ(1001, histogram[13]);
+
+  // 2 is the largest non-empty bucket and has 2 elements.
+  ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram));
+  ASSERT_EQ(2, histogram[2]);
+
+  // 5 is the largest non-empty bucket and has 11 elements.
+  ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram));
+  ASSERT_EQ(11, histogram[5]);
+
+  // 9 is the largest non-empty bucket and has 101 elements.
+  ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram));
+  ASSERT_EQ(101, histogram[9]);
+
+  // 12 is the largest non-empty bucket and has 1001 elements.
+  ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram));
+  ASSERT_EQ(1001, histogram[12]);
+}
+
+// Issue 956519: handling empty character sets was
+// causing NULL dereference. This tests a few empty character sets.
+// (The way to get an empty character set is to negate a full one.)
+TEST(EmptyCharset, Fuzz) {
+  static const char *empties[] = {
+    "[^\\S\\s]",
+    "[^\\S[:space:]]",
+    "[^\\D\\d]",
+    "[^\\D[:digit:]]"
+  };
+  for (size_t i = 0; i < arraysize(empties); i++)
+    ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
+}
+
+// Bitstate assumes that kInstFail instructions in
+// alternations or capture groups have been "compiled away".
+TEST(EmptyCharset, BitstateAssumptions) {
+  // Captures trigger use of Bitstate.
+  static const char *nop_empties[] = {
+    "((((()))))" "[^\\S\\s]?",
+    "((((()))))" "([^\\S\\s])?",
+    "((((()))))" "([^\\S\\s]|[^\\S\\s])?",
+    "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)"
+  };
+  StringPiece group[6];
+  for (size_t i = 0; i < arraysize(nop_empties); i++)
+    ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6));
+}
+
+// Test that named groups work correctly.
+TEST(Capture, NamedGroups) {
+  {
+    RE2 re("(hello world)");
+    ASSERT_EQ(re.NumberOfCapturingGroups(), 1);
+    const std::map<std::string, int>& m = re.NamedCapturingGroups();
+    ASSERT_EQ(m.size(), 0);
+  }
+
+  {
+    RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
+    ASSERT_EQ(re.NumberOfCapturingGroups(), 6);
+    const std::map<std::string, int>& m = re.NamedCapturingGroups();
+    ASSERT_EQ(m.size(), 4);
+    // Verify every name is present before dereferencing find(); a missing
+    // key would otherwise dereference end().
+    ASSERT_TRUE(m.find("A") != m.end());
+    ASSERT_TRUE(m.find("B") != m.end());
+    ASSERT_TRUE(m.find("C") != m.end());
+    ASSERT_TRUE(m.find("D") != m.end());
+    ASSERT_EQ(m.find("A")->second, 1);
+    ASSERT_EQ(m.find("B")->second, 2);
+    ASSERT_EQ(m.find("C")->second, 3);
+    ASSERT_EQ(m.find("D")->second, 6);  // $4 and $5 are anonymous
+  }
+}
+
+// Matches with FullMatchN, then uses the name-to-index map to pick the
+// captured strings out of args[].
+TEST(RE2, CapturedGroupTest) {
+  RE2 re("directions from (?P<S>.*) to (?P<D>.*)");
+  int num_groups = re.NumberOfCapturingGroups();
+  // Fatal on mismatch: num_groups is passed to FullMatchN as the number of
+  // entries of matches[] to use, and matches[] has only four.
+  ASSERT_EQ(2, num_groups);
+  std::string args[4];
+  RE2::Arg arg0(&args[0]);
+  RE2::Arg arg1(&args[1]);
+  RE2::Arg arg2(&args[2]);
+  RE2::Arg arg3(&args[3]);
+
+  const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3};
+  EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose",
+                              re, matches, num_groups));
+  const std::map<std::string, int>& named_groups = re.NamedCapturingGroups();
+  // Fatal on failure: the iterators are dereferenced immediately below.
+  ASSERT_TRUE(named_groups.find("S") != named_groups.end());
+  ASSERT_TRUE(named_groups.find("D") != named_groups.end());
+
+  // The named group index is 1-based.
+  int source_group_index = named_groups.find("S")->second;
+  int destination_group_index = named_groups.find("D")->second;
+  // Fatal on failure: the indices are used to subscript args[] below.
+  ASSERT_EQ(1, source_group_index);
+  ASSERT_EQ(2, destination_group_index);
+
+  // The args is zero-based.
+  EXPECT_EQ("mountain view", args[source_group_index - 1]);
+  EXPECT_EQ("san jose", args[destination_group_index - 1]);
+}
+
+// FullMatch without output arguments is anchored at both ends.
+TEST(RE2, FullMatchWithNoArgs) {
+  ASSERT_TRUE(RE2::FullMatch("h", "h"));
+  ASSERT_TRUE(RE2::FullMatch("hello", "hello"));
+  ASSERT_TRUE(RE2::FullMatch("hello", "h.*o"));
+  ASSERT_FALSE(RE2::FullMatch("othello", "h.*o"));  // Must be anchored at front
+  ASSERT_FALSE(RE2::FullMatch("hello!", "h.*o"));  // Must be anchored at end
+}
+
+// PartialMatch succeeds when the pattern matches anywhere in the text.
+TEST(RE2, PartialMatch) {
+  ASSERT_TRUE(RE2::PartialMatch("x", "x"));
+  ASSERT_TRUE(RE2::PartialMatch("hello", "h.*o"));
+  ASSERT_TRUE(RE2::PartialMatch("othello", "h.*o"));
+  ASSERT_TRUE(RE2::PartialMatch("hello!", "h.*o"));
+  ASSERT_TRUE(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))"));
+}
+
+// PartialMatchN with zero, one and two output arguments.
+TEST(RE2, PartialMatchN) {
+  RE2::Arg argv[2];
+  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+  // 0 arg
+  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0));
+  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0));
+
+  // 1 arg
+  int i;
+  argv[0] = &i;
+  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1));
+  EXPECT_EQ(1001, i);
+  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1));
+
+  // Multi-arg
+  std::string s;
+  argv[1] = &s;
+  EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2));
+  EXPECT_EQ(42, i);
+  EXPECT_EQ("life", s);
+  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2));
+}
+
+TEST(RE2, FullMatchZeroArg) {
+  // Zero-arg
+  ASSERT_TRUE(RE2::FullMatch("1001", "\\d+"));
+}
+
+// A single int output: successful parses, an empty capture, and a number
+// too large for int.
+TEST(RE2, FullMatchOneArg) {
+  int i;
+
+  // Single-arg
+  ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)", &i));
+  ASSERT_EQ(i, 1001);
+  ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &i));
+  ASSERT_EQ(i, -123);
+  ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &i));
+  ASSERT_FALSE(
+      RE2::FullMatch("1234567890123456789012345678901234567890", "(\\d+)", &i));
+}
+
+TEST(RE2, FullMatchIntegerArg) {
+  int i;
+
+  // Digits surrounding integer-arg
+  ASSERT_TRUE(RE2::FullMatch("1234", "1(\\d*)4", &i));
+  ASSERT_EQ(i, 23);
+  ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)\\d+", &i));
+  ASSERT_EQ(i, 1);
+  ASSERT_TRUE(RE2::FullMatch("-1234", "(-\\d)\\d+", &i));
+  ASSERT_EQ(i, -1);
+  ASSERT_TRUE(RE2::PartialMatch("1234", "(\\d)", &i));
+  ASSERT_EQ(i, 1);
+  ASSERT_TRUE(RE2::PartialMatch("-1234", "(-\\d)", &i));
+  ASSERT_EQ(i, -1);
+}
+
+TEST(RE2, FullMatchStringArg) {
+  std::string s;
+  // String-arg
+  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &s));
+  ASSERT_EQ(s, std::string("ell"));
+}
+
+TEST(RE2, FullMatchStringPieceArg) {
+  int i;
+  // StringPiece-arg
+  StringPiece sp;
+  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i));
+  ASSERT_EQ(sp.size(), 4);
+  ASSERT_TRUE(memcmp(sp.data(), "ruby", 4) == 0);
+  ASSERT_EQ(i, 1234);
+}
+
+TEST(RE2, FullMatchMultiArg) {
+  int i;
+  std::string s;
+  // Multi-arg
+  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
+  ASSERT_EQ(s, std::string("ruby"));
+  ASSERT_EQ(i, 1234);
+}
+
+// FullMatchN with zero, one and two output arguments.
+TEST(RE2, FullMatchN) {
+  RE2::Arg argv[2];
+  const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
+
+  // 0 arg
+  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0));
+  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0));
+
+  // 1 arg
+  int i;
+  argv[0] = &i;
+  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1));
+  EXPECT_EQ(1001, i);
+  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1));
+
+  // Multi-arg
+  std::string s;
+  argv[1] = &s;
+  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2));
+  EXPECT_EQ(42, i);
+  EXPECT_EQ("life", s);
+  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2));
+}
+
+TEST(RE2, FullMatchIgnoredArg) {
+  int i;
+  std::string s;
+
+  // Old-school NULL should be ignored.
+  ASSERT_TRUE(
+      RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i));
+  ASSERT_EQ(s, std::string("ruby"));
+  ASSERT_EQ(i, 1234);
+
+  // C++11 nullptr should also be ignored.
+  ASSERT_TRUE(RE2::FullMatch("rubz:1235", "(\\w+)(:)(\\d+)", &s, nullptr, &i));
+  ASSERT_EQ(s, std::string("rubz"));
+  ASSERT_EQ(i, 1235);
+}
+
+TEST(RE2, FullMatchTypedNullArg) {
+  std::string s;
+
+  // Ignore non-void* NULL arg
+  ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("1234", "(.*)", (int*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL));
+  ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL));
+
+  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
+  ASSERT_FALSE(RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL));
+  ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (int*)NULL));
+  ASSERT_FALSE(RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL));
+  ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (double*)NULL));
+  ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (float*)NULL));
+}
+
+// Check that numeric parsing code does not read past the end of
+// the number being parsed.
+// This implementation requires mmap(2) et al. and thus cannot
+// be used unless they are available.
+TEST(RE2, NULTerminated) { +#if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0 + char *v; + int x; + long pagesize = sysconf(_SC_PAGE_SIZE); + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)); + ASSERT_TRUE(v != reinterpret_cast<char*>(-1)); + LOG(INFO) << "Memory at " << (void*)v; + ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno; + v[pagesize - 1] = '1'; + + x = 0; + ASSERT_TRUE(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x)); + ASSERT_EQ(x, 1); +#endif +} + +TEST(RE2, FullMatchTypeTests) { + // Type tests + std::string zeros(1000, '0'); + { + char c; + ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c)); + ASSERT_EQ(c, 'H'); + } + { + unsigned char c; + ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c)); + ASSERT_EQ(c, static_cast<unsigned char>('H')); + } + { + int16_t v; + ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); + ASSERT_TRUE(RE2::FullMatch("32767", "(-?\\d+)", &v)); ASSERT_EQ(v, 32767); + ASSERT_TRUE(RE2::FullMatch("-32768", "(-?\\d+)", &v)); ASSERT_EQ(v, -32768); + ASSERT_FALSE(RE2::FullMatch("-32769", "(-?\\d+)", &v)); + ASSERT_FALSE(RE2::FullMatch("32768", "(-?\\d+)", &v)); + } + { + uint16_t v; + ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("32767", "(\\d+)", &v)); ASSERT_EQ(v, 32767); + ASSERT_TRUE(RE2::FullMatch("65535", "(\\d+)", &v)); ASSERT_EQ(v, 65535); + ASSERT_FALSE(RE2::FullMatch("65536", "(\\d+)", &v)); + } + { + int32_t v; + static const int32_t max = INT32_C(0x7fffffff); + static const int32_t min = -max - 1; + ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); + ASSERT_TRUE(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); 
ASSERT_EQ(v, max); + ASSERT_TRUE(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); ASSERT_EQ(v, min); + ASSERT_FALSE(RE2::FullMatch("-2147483649", "(-?\\d+)", &v)); + ASSERT_FALSE(RE2::FullMatch("2147483648", "(-?\\d+)", &v)); + + ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v)); + ASSERT_EQ(v, max); + ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v)); + ASSERT_EQ(v, min); + + ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v)); + ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v))); + ASSERT_EQ(v, max); + ASSERT_FALSE(RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v))); + } + { + uint32_t v; + static const uint32_t max = UINT32_C(0xffffffff); + ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max); + ASSERT_FALSE(RE2::FullMatch("4294967296", "(\\d+)", &v)); + ASSERT_FALSE(RE2::FullMatch("-1", "(\\d+)", &v)); + + ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max); + } + { + int64_t v; + static const int64_t max = INT64_C(0x7fffffffffffffff); + static const int64_t min = -max - 1; + std::string str; + + ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); + + str = std::to_string(max); + ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, max); + + str = std::to_string(min); + ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, min); + + str = std::to_string(max); + ASSERT_NE(str.back(), '9'); + str.back()++; + ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); + + str = std::to_string(min); + ASSERT_NE(str.back(), '9'); + str.back()++; + ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); + } + { + uint64_t v; + int64_t v2; + static const uint64_t max = UINT64_C(0xffffffffffffffff); + std::string str; + + ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); 
ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v2)); ASSERT_EQ(v2, -100); + + str = std::to_string(max); + ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, max); + + ASSERT_NE(str.back(), '9'); + str.back()++; + ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); + } +} + +TEST(RE2, FloatingPointFullMatchTypes) { + std::string zeros(1000, '0'); + { + float v; + ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v)); ASSERT_EQ(v, -100); + ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v)); ASSERT_EQ(v, float(1e23)); + ASSERT_TRUE(RE2::FullMatch(" 100", "(.*)", &v)); ASSERT_EQ(v, 100); + + ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); + ASSERT_EQ(v, float(1e23)); + + // 6700000000081920.1 is an edge case. + // 6700000000081920 is exactly halfway between + // two float32s, so the .1 should make it round up. + // However, the .1 is outside the precision possible with + // a float64: the nearest float64 is 6700000000081920. + // So if the code uses strtod and then converts to float32, + // round-to-even will make it round down instead of up. + // To pass the test, the parser must call strtof directly. + // This test case is carefully chosen to use only a 17-digit + // number, since C does not guarantee to get the correctly + // rounded answer for strtod and strtof unless the input is + // short. + // + // This is known to fail on Cygwin and MinGW due to a broken + // implementation of strtof(3). And apparently MSVC too. Sigh. 
+#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) + ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v)); + ASSERT_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f); + ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", "(.*)", &v)); + ASSERT_EQ(v, 6700000000081920.1f) + << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f); +#endif + } + { + double v; + ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v)); ASSERT_EQ(v, -100); + ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v)); ASSERT_EQ(v, 1e23); + ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); + ASSERT_EQ(v, double(1e23)); + + ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v)); + ASSERT_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1); + ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", "(.*)", &v)); + ASSERT_EQ(v, 1.0000000596046448) + << StringPrintf("%.17g != %.17g", v, 1.0000000596046448); + } +} + +TEST(RE2, FullMatchAnchored) { + int i; + // Check that matching is fully anchored + ASSERT_FALSE(RE2::FullMatch("x1001", "(\\d+)", &i)); + ASSERT_FALSE(RE2::FullMatch("1001x", "(\\d+)", &i)); + ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &i)); ASSERT_EQ(i, 1001); + ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &i)); ASSERT_EQ(i, 1001); +} + +TEST(RE2, FullMatchBraces) { + // Braces + ASSERT_TRUE(RE2::FullMatch("0abcd", "[0-9a-f+.-]{5,}")); + ASSERT_TRUE(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}")); + ASSERT_FALSE(RE2::FullMatch("0abc", "[0-9a-f+.-]{5,}")); +} + +TEST(RE2, Complicated) { + // Complicated RE2 + ASSERT_TRUE(RE2::FullMatch("foo", "foo|bar|[A-Z]")); + ASSERT_TRUE(RE2::FullMatch("bar", "foo|bar|[A-Z]")); + ASSERT_TRUE(RE2::FullMatch("X", "foo|bar|[A-Z]")); + ASSERT_FALSE(RE2::FullMatch("XY", "foo|bar|[A-Z]")); +} + +TEST(RE2, FullMatchEnd) { + // Check full-match handling (needs '$' tacked on internally) + ASSERT_TRUE(RE2::FullMatch("fo", "fo|foo")); + ASSERT_TRUE(RE2::FullMatch("foo", "fo|foo")); + 
ASSERT_TRUE(RE2::FullMatch("fo", "fo|foo$")); + ASSERT_TRUE(RE2::FullMatch("foo", "fo|foo$")); + ASSERT_TRUE(RE2::FullMatch("foo", "foo$")); + ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$")); + ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar")); + + // Uncomment the following if we change the handling of '$' to + // prevent it from matching a trailing newline + if (false) { + // Check that we don't get bitten by pcre's special handling of a + // '\n' at the end of the string matching '$' + ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$")); + } +} + +TEST(RE2, FullMatchArgCount) { + // Number of args + int a[16]; + ASSERT_TRUE(RE2::FullMatch("", "")); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0])); + ASSERT_EQ(a[0], 1); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1], + &a[2], &a[3])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1], + &a[2], &a[3], &a[4])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], + &a[1], &a[2], &a[3], &a[4], &a[5])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6])); + ASSERT_EQ(a[0], 1); + 
ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + ASSERT_EQ(a[6], 7); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234567890123456", + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7], &a[8], &a[9], &a[10], &a[11], &a[12], + &a[13], &a[14], &a[15])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + ASSERT_EQ(a[6], 7); + ASSERT_EQ(a[7], 8); + ASSERT_EQ(a[8], 9); + ASSERT_EQ(a[9], 0); + ASSERT_EQ(a[10], 1); + ASSERT_EQ(a[11], 2); + ASSERT_EQ(a[12], 3); + ASSERT_EQ(a[13], 4); + ASSERT_EQ(a[14], 5); + ASSERT_EQ(a[15], 6); +} + +TEST(RE2, Accessors) { + // Check the pattern() accessor + { + const std::string kPattern = "http://([^/]+)/.*"; + const RE2 re(kPattern); + ASSERT_EQ(kPattern, re.pattern()); + } + + // Check RE2 error field. + { + RE2 re("foo"); + ASSERT_TRUE(re.error().empty()); // Must have no error + ASSERT_TRUE(re.ok()); + ASSERT_EQ(re.error_code(), RE2::NoError); + } +} + +TEST(RE2, UTF8) { + // Check UTF-8 handling + // Three Japanese characters (nihongo) + const char utf8_string[] = { + (char)0xe6, (char)0x97, (char)0xa5, // 65e5 + (char)0xe6, (char)0x9c, (char)0xac, // 627c + (char)0xe8, (char)0xaa, (char)0x9e, // 8a9e + 0 + }; + const char utf8_pattern[] = { + '.', + (char)0xe6, (char)0x9c, (char)0xac, // 627c + '.', + 0 + }; + + // Both should match in either mode, bytes or UTF-8 + RE2 re_test1(".........", RE2::Latin1); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test1)); + RE2 re_test2("..."); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test2)); + + // Check that '.' matches one byte or UTF-8 character + // according to the mode. 
+ std::string s; + RE2 re_test3("(.)", RE2::Latin1); + ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test3, &s)); + ASSERT_EQ(s, std::string("\xe6")); + RE2 re_test4("(.)"); + ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test4, &s)); + ASSERT_EQ(s, std::string("\xe6\x97\xa5")); + + // Check that string matches itself in either mode + RE2 re_test5(utf8_string, RE2::Latin1); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test5)); + RE2 re_test6(utf8_string); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test6)); + + // Check that pattern matches string only in UTF8 mode + RE2 re_test7(utf8_pattern, RE2::Latin1); + ASSERT_FALSE(RE2::FullMatch(utf8_string, re_test7)); + RE2 re_test8(utf8_pattern); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test8)); +} + +TEST(RE2, UngreedyUTF8) { + // Check that ungreedy, UTF8 regular expressions don't match when they + // oughtn't -- see bug 82246. + { + // This code always worked. + const char* pattern = "\\w+X"; + const std::string target = "a aX"; + RE2 match_sentence(pattern, RE2::Latin1); + RE2 match_sentence_re(pattern); + + ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); + ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); + } + { + const char* pattern = "(?U)\\w+X"; + const std::string target = "a aX"; + RE2 match_sentence(pattern, RE2::Latin1); + ASSERT_EQ(match_sentence.error(), ""); + RE2 match_sentence_re(pattern); + + ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); + ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); + } +} + +TEST(RE2, Rejects) { + { + RE2 re("a\\1", RE2::Quiet); + ASSERT_FALSE(re.ok()); } + { + RE2 re("a[x", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a[z-a]", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a[[:foobar:]]", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a(b", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a\\", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } +} + +TEST(RE2, NoCrash) { + // Test that using a bad regexp doesn't crash. 
+ { + RE2 re("a\\", RE2::Quiet); + ASSERT_FALSE(re.ok()); + ASSERT_FALSE(RE2::PartialMatch("a\\b", re)); + } + + // Test that using an enormous regexp doesn't crash + { + RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet); + ASSERT_FALSE(re.ok()); + ASSERT_FALSE(RE2::PartialMatch("aaa", re)); + } + + // Test that a crazy regexp still compiles and runs. + { + RE2 re(".{512}x", RE2::Quiet); + ASSERT_TRUE(re.ok()); + std::string s; + s.append(515, 'c'); + s.append("x"); + ASSERT_TRUE(RE2::PartialMatch(s, re)); + } +} + +TEST(RE2, Recursion) { + // Test that recursion is stopped. + // This test is PCRE-legacy -- there's no recursion in RE2. + int bytes = 15 * 1024; // enough to crash PCRE + TestRecursion(bytes, "."); + TestRecursion(bytes, "a"); + TestRecursion(bytes, "a."); + TestRecursion(bytes, "ab."); + TestRecursion(bytes, "abc."); +} + +TEST(RE2, BigCountedRepetition) { + // Test that counted repetition works, given tons of memory. + RE2::Options opt; + opt.set_max_mem(256<<20); + + RE2 re(".{512}x", opt); + ASSERT_TRUE(re.ok()); + std::string s; + s.append(515, 'c'); + s.append("x"); + ASSERT_TRUE(RE2::PartialMatch(s, re)); +} + +TEST(RE2, DeepRecursion) { + // Test for deep stack recursion. This would fail with a + // segmentation violation due to stack overflow before pcre was + // patched. + // Again, a PCRE legacy test. RE2 doesn't recurse. + std::string comment("x*"); + std::string a(131072, 'a'); + comment += a; + comment += "*x"; + RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)"); + ASSERT_TRUE(RE2::FullMatch(comment, re)); +} + +// Suggested by Josh Hyman. Failed when SearchOnePass was +// not implementing case-folding. 
+TEST(CaseInsensitive, MatchAndConsume) { + std::string text = "A fish named *Wanda*"; + StringPiece sp(text); + StringPiece result; + EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result)); + EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result)); +} + +// RE2 should permit implicit conversions from string, StringPiece, const char*, +// and C string literals. +TEST(RE2, ImplicitConversions) { + std::string re_string("."); + StringPiece re_stringpiece("."); + const char* re_cstring = "."; + EXPECT_TRUE(RE2::PartialMatch("e", re_string)); + EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece)); + EXPECT_TRUE(RE2::PartialMatch("e", re_cstring)); + EXPECT_TRUE(RE2::PartialMatch("e", ".")); +} + +// Bugs introduced by 8622304 +TEST(RE2, CL8622304) { + // reported by ingow + std::string dir; + EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok + EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails + + // reported by jacobsa + std::string key, val; + EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true", + "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?", + &key, + &val)); + EXPECT_EQ(key, "bar"); + EXPECT_EQ(val, "1,0x2F,030,4,5"); +} + +// Check that RE2 returns correct regexp pieces on error. +// In particular, make sure it returns whole runes +// and that it always reports invalid UTF-8. +// Also check that Perl error flag piece is big enough. 
+static struct ErrorTest { + const char *regexp; + RE2::ErrorCode error_code; + const char *error_arg; +} error_tests[] = { + { "ab\\αcd", RE2::ErrorBadEscape, "\\α" }, + { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" }, + { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" }, + { "ij\\x1", RE2::ErrorBadEscape, "\\x1" }, + { "kl\\x", RE2::ErrorBadEscape, "\\x" }, + { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" }, + { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" }, + // used to return (?s but the error is X + { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" }, + { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" }, + { "bb[abc", RE2::ErrorMissingBracket, "[abc" }, + { "abc(def", RE2::ErrorMissingParen, "abc(def" }, + { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" }, + + // no argument string returned for invalid UTF-8 + { "mn\\x1\377", RE2::ErrorBadUTF8, "" }, + { "op\377qr", RE2::ErrorBadUTF8, "" }, + { "st\\x{00000\377", RE2::ErrorBadUTF8, "" }, + { "zz\\p{\377}", RE2::ErrorBadUTF8, "" }, + { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" }, + { "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" }, +}; +TEST(RE2, ErrorCodeAndArg) { + for (size_t i = 0; i < arraysize(error_tests); i++) { + RE2 re(error_tests[i].regexp, RE2::Quiet); + EXPECT_FALSE(re.ok()); + EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error(); + EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error(); + } +} + +// Check that "never match \n" mode never matches \n. 
+static struct NeverTest { + const char* regexp; + const char* text; + const char* match; +} never_tests[] = { + { "(.*)", "abc\ndef\nghi\n", "abc" }, + { "(?s)(abc.*def)", "abc\ndef\n", NULL }, + { "(abc(.|\n)*def)", "abc\ndef\n", NULL }, + { "(abc[^x]*def)", "abc\ndef\n", NULL }, + { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" }, +}; +TEST(RE2, NeverNewline) { + RE2::Options opt; + opt.set_never_nl(true); + for (size_t i = 0; i < arraysize(never_tests); i++) { + const NeverTest& t = never_tests[i]; + RE2 re(t.regexp, opt); + if (t.match == NULL) { + EXPECT_FALSE(re.PartialMatch(t.text, re)); + } else { + StringPiece m; + EXPECT_TRUE(re.PartialMatch(t.text, re, &m)); + EXPECT_EQ(m, t.match); + } + } +} + +// Check that dot_nl option works. +TEST(RE2, DotNL) { + RE2::Options opt; + opt.set_dot_nl(true); + EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt))); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt))); + opt.set_never_nl(true); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt))); +} + +// Check that there are no capturing groups in "never capture" mode. +TEST(RE2, NeverCapture) { + RE2::Options opt; + opt.set_never_capture(true); + RE2 re("(r)(e)", opt); + EXPECT_EQ(0, re.NumberOfCapturingGroups()); +} + +// Bitstate bug was looking at submatch[0] even if nsubmatch == 0. +// Triggered by a failed DFA search falling back to Bitstate when +// using Match with a NULL submatch set. Bitstate tried to read +// the submatch[0] entry even if nsubmatch was 0. +TEST(RE2, BitstateCaptureBug) { + RE2::Options opt; + opt.set_max_mem(20000); + RE2 re("(_________$)", opt); + StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x"; + EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); +} + +// C++ version of bug 609710. 
+TEST(RE2, UnicodeClasses) { + const std::string str = "ABCDEFGHI譚永鋒"; + std::string a, b, c; + + EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}")); + EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}")); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}")); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]")); + + EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("譚", a); + EXPECT_EQ("永", b); + EXPECT_EQ("鋒", c); +} + +TEST(RE2, LazyRE2) { + // Test with and without options. 
+ static LazyRE2 a = {"a"}; + static LazyRE2 b = {"b", RE2::Latin1}; + + EXPECT_EQ("a", a->pattern()); + EXPECT_EQ(RE2::Options::EncodingUTF8, a->options().encoding()); + + EXPECT_EQ("b", b->pattern()); + EXPECT_EQ(RE2::Options::EncodingLatin1, b->options().encoding()); +} + +// Bug reported by saito. 2009/02/17 +TEST(RE2, NullVsEmptyString) { + RE2 re(".*"); + EXPECT_TRUE(re.ok()); + + StringPiece null; + EXPECT_TRUE(RE2::FullMatch(null, re)); + + StringPiece empty(""); + EXPECT_TRUE(RE2::FullMatch(empty, re)); +} + +// Similar to the previous test, check that the null string and the empty +// string both match, but also that the null string can only provide null +// submatches whereas the empty string can also provide empty submatches. +TEST(RE2, NullVsEmptyStringSubmatches) { + RE2 re("()|(foo)"); + EXPECT_TRUE(re.ok()); + + // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent. + StringPiece matches[4]; + + for (size_t i = 0; i < arraysize(matches); i++) + matches[i] = "bar"; + + StringPiece null; + EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED, + matches, arraysize(matches))); + for (size_t i = 0; i < arraysize(matches); i++) { + EXPECT_TRUE(matches[i].data() == NULL); // always null + EXPECT_TRUE(matches[i].empty()); + } + + for (size_t i = 0; i < arraysize(matches); i++) + matches[i] = "bar"; + + StringPiece empty(""); + EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED, + matches, arraysize(matches))); + EXPECT_TRUE(matches[0].data() != NULL); // empty, not null + EXPECT_TRUE(matches[0].empty()); + EXPECT_TRUE(matches[1].data() != NULL); // empty, not null + EXPECT_TRUE(matches[1].empty()); + EXPECT_TRUE(matches[2].data() == NULL); + EXPECT_TRUE(matches[2].empty()); + EXPECT_TRUE(matches[3].data() == NULL); + EXPECT_TRUE(matches[3].empty()); +} + +// Issue 1816809 +TEST(RE2, Bug1816809) { + RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))"); + StringPiece piece("llx-3;llx4"); + std::string x; + 
EXPECT_TRUE(RE2::Consume(&piece, re, &x)); +} + +// Issue 3061120 +TEST(RE2, Bug3061120) { + RE2 re("(?i)\\W"); + EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked + EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin + EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s +} + +TEST(RE2, CapturingGroupNames) { + // Opening parentheses annotated with group IDs: + // 12 3 45 6 7 + RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))"); + EXPECT_TRUE(re.ok()); + const std::map<int, std::string>& have = re.CapturingGroupNames(); + std::map<int, std::string> want; + want[3] = "G2"; + want[6] = "G2"; + want[7] = "G1"; + EXPECT_EQ(want, have); +} + +TEST(RE2, RegexpToStringLossOfAnchor) { + EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at"); + EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at"); + EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$"); + EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)"); +} + +// Issue 10131674 +TEST(RE2, Bug10131674) { + // Some of these escapes describe values that do not fit in a byte. + RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(RE2::FullMatch("hello world", re)); +} + +TEST(RE2, Bug18391750) { + // Stray write past end of match_ in nfa.cc, caught by fuzzing + address sanitizer. 
+ const char t[] = { + (char)0x28, (char)0x28, (char)0xfc, (char)0xfc, (char)0x08, (char)0x08, + (char)0x26, (char)0x26, (char)0x28, (char)0xc2, (char)0x9b, (char)0xc5, + (char)0xc5, (char)0xd4, (char)0x8f, (char)0x8f, (char)0x69, (char)0x69, + (char)0xe7, (char)0x29, (char)0x7b, (char)0x37, (char)0x31, (char)0x31, + (char)0x7d, (char)0xae, (char)0x7c, (char)0x7c, (char)0xf3, (char)0x29, + (char)0xae, (char)0xae, (char)0x2e, (char)0x2a, (char)0x29, (char)0x00, + }; + RE2::Options opt; + opt.set_encoding(RE2::Options::EncodingLatin1); + opt.set_longest_match(true); + opt.set_dot_nl(true); + opt.set_case_sensitive(false); + RE2 re(t, opt); + ASSERT_TRUE(re.ok()); + RE2::PartialMatch(t, re); +} + +TEST(RE2, Bug18458852) { + // Bug in parser accepting invalid (too large) rune, + // causing compiler to fail in DCHECK in UTF-8 + // character class code. + const char b[] = { + (char)0x28, (char)0x05, (char)0x05, (char)0x41, (char)0x41, (char)0x28, + (char)0x24, (char)0x5b, (char)0x5e, (char)0xf5, (char)0x87, (char)0x87, + (char)0x90, (char)0x29, (char)0x5d, (char)0x29, (char)0x29, (char)0x00, + }; + RE2 re(b); + ASSERT_FALSE(re.ok()); +} + +TEST(RE2, Bug18523943) { + // Bug in BitState: case kFailInst failed the match entirely. + + RE2::Options opt; + const char a[] = { + (char)0x29, (char)0x29, (char)0x24, (char)0x00, + }; + const char b[] = { + (char)0x28, (char)0x0a, (char)0x2a, (char)0x2a, (char)0x29, (char)0x00, + }; + opt.set_log_errors(false); + opt.set_encoding(RE2::Options::EncodingLatin1); + opt.set_posix_syntax(true); + opt.set_longest_match(true); + opt.set_literal(false); + opt.set_never_nl(true); + + RE2 re((const char*)b, opt); + ASSERT_TRUE(re.ok()); + std::string s1; + ASSERT_TRUE(RE2::PartialMatch((const char*)a, re, &s1)); +} + +TEST(RE2, Bug21371806) { + // Bug in parser accepting Unicode groups in Latin-1 mode, + // causing compiler to fail in DCHECK in prog.cc. 
+ + RE2::Options opt; + opt.set_encoding(RE2::Options::EncodingLatin1); + + RE2 re("g\\p{Zl}]", opt); + ASSERT_TRUE(re.ok()); +} + +TEST(RE2, Bug26356109) { + // Bug in parser caused by factoring of common prefixes in alternations. + + // In the past, this was factored to "a\\C*?[bc]". Thus, the automaton would + // consume "ab" and then stop (when unanchored) whereas it should consume all + // of "abc" as per first-match semantics. + RE2 re("a\\C*?c|a\\C*?b"); + ASSERT_TRUE(re.ok()); + + std::string s = "abc"; + StringPiece m; + + ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'"; + + ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::ANCHOR_BOTH, &m, 1)); + ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'"; +} + +TEST(RE2, Issue104) { + // RE2::GlobalReplace always advanced by one byte when the empty string was + // matched, which would clobber any rune that is longer than one byte. + + std::string s = "bc"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d")); + ASSERT_EQ("dbdcd", s); + + s = "ąć"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ")); + ASSERT_EQ("ĈąĈćĈ", s); + + s = "人类"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小")); + ASSERT_EQ("小人小类小", s); +} + +TEST(RE2, Issue310) { + // (?:|a)* matched more text than (?:|a)+ did. 
+ + std::string s = "aaa"; + StringPiece m; + + RE2 star("(?:|a)*"); + ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; + + RE2 plus("(?:|a)+"); + ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/regexp_generator.cc b/contrib/libs/re2/re2/testing/regexp_generator.cc index 9065835748..3b6c9ba3db 100644 --- a/contrib/libs/re2/re2/testing/regexp_generator.cc +++ b/contrib/libs/re2/re2/testing/regexp_generator.cc @@ -1,276 +1,276 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Regular expression generator: generates all possible -// regular expressions within parameters (see regexp_generator.h for details). - -// The regexp generator first generates a sequence of commands in a simple -// postfix language. Each command in the language is a string, -// like "a" or "%s*" or "%s|%s". -// -// To evaluate a command, enough arguments are popped from the value stack to -// plug into the %s slots. Then the result is pushed onto the stack. -// For example, the command sequence -// a b %s%s c -// results in the stack -// ab c -// -// GeneratePostfix generates all possible command sequences. -// Then RunPostfix turns each sequence into a regular expression -// and passes the regexp to HandleRegexp. - -#include <stddef.h> -#include <stdint.h> -#include <stdio.h> -#include <string.h> -#include <memory> -#include <stack> -#include <string> -#include <vector> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "util/strutil.h" -#include "util/utf.h" -#include "re2/testing/regexp_generator.h" - -namespace re2 { - -// Returns a vector of the egrep regexp operators. 
-const std::vector<std::string>& RegexpGenerator::EgrepOps() { - static const char *ops[] = { - "%s%s", - "%s|%s", - "%s*", - "%s+", - "%s?", - "%s\\C*", - }; - static std::vector<std::string> v(ops, ops + arraysize(ops)); - return v; -} - -RegexpGenerator::RegexpGenerator(int maxatoms, int maxops, - const std::vector<std::string>& atoms, - const std::vector<std::string>& ops) - : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) { - // Degenerate case. - if (atoms_.empty()) - maxatoms_ = 0; - if (ops_.empty()) - maxops_ = 0; -} - -// Generates all possible regular expressions (within the parameters), -// calling HandleRegexp for each one. -void RegexpGenerator::Generate() { - std::vector<std::string> postfix; - GeneratePostfix(&postfix, 0, 0, 0); -} - -// Generates random regular expressions, calling HandleRegexp for each one. -void RegexpGenerator::GenerateRandom(int32_t seed, int n) { - rng_.seed(seed); - - for (int i = 0; i < n; i++) { - std::vector<std::string> postfix; - GenerateRandomPostfix(&postfix, 0, 0, 0); - } -} - -// Counts and returns the number of occurrences of "%s" in s. -static int CountArgs(const std::string& s) { - const char *p = s.c_str(); - int n = 0; - while ((p = strstr(p, "%s")) != NULL) { - p += 2; - n++; - } - return n; -} - -// Generates all possible postfix command sequences. -// Each sequence is handed off to RunPostfix to generate a regular expression. -// The arguments are: -// post: the current postfix sequence -// nstk: the number of elements that would be on the stack after executing -// the sequence -// ops: the number of operators used in the sequence -// atoms: the number of atoms used in the sequence -// For example, if post were ["a", "b", "%s%s", "c"], -// then nstk = 2, ops = 1, atoms = 3. -// -// The initial call should be GeneratePostfix([empty vector], 0, 0, 0). 
-// -void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post, - int nstk, int ops, int atoms) { - if (nstk == 1) - RunPostfix(*post); - - // Early out: if used too many operators or can't - // get back down to a single expression on the stack - // using binary operators, give up. - if (ops + nstk - 1 > maxops_) - return; - - // Add atoms if there is room. - if (atoms < maxatoms_) { - for (size_t i = 0; i < atoms_.size(); i++) { - post->push_back(atoms_[i]); - GeneratePostfix(post, nstk + 1, ops, atoms + 1); - post->pop_back(); - } - } - - // Add operators if there are enough arguments. - if (ops < maxops_) { - for (size_t i = 0; i < ops_.size(); i++) { - const std::string& fmt = ops_[i]; - int nargs = CountArgs(fmt); - if (nargs <= nstk) { - post->push_back(fmt); - GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms); - post->pop_back(); - } - } - } -} - -// Generates a random postfix command sequence. -// Stops and returns true once a single sequence has been generated. -bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post, - int nstk, int ops, int atoms) { - std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms); - std::uniform_int_distribution<int> random_bit(0, 1); - std::uniform_int_distribution<int> random_ops_index( - 0, static_cast<int>(ops_.size()) - 1); - std::uniform_int_distribution<int> random_atoms_index( - 0, static_cast<int>(atoms_.size()) - 1); - - for (;;) { - // Stop if we get to a single element, but only sometimes. - if (nstk == 1 && random_stop(rng_) == 0) { - RunPostfix(*post); - return true; - } - - // Early out: if used too many operators or can't - // get back down to a single expression on the stack - // using binary operators, give up. - if (ops + nstk - 1 > maxops_) - return false; - - // Add operators if there are enough arguments. 
- if (ops < maxops_ && random_bit(rng_) == 0) { - const std::string& fmt = ops_[random_ops_index(rng_)]; - int nargs = CountArgs(fmt); - if (nargs <= nstk) { - post->push_back(fmt); - bool ret = GenerateRandomPostfix(post, nstk - nargs + 1, - ops + 1, atoms); - post->pop_back(); - if (ret) - return true; - } - } - - // Add atoms if there is room. - if (atoms < maxatoms_ && random_bit(rng_) == 0) { - post->push_back(atoms_[random_atoms_index(rng_)]); - bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); - post->pop_back(); - if (ret) - return true; - } - } -} - -// Interprets the postfix command sequence to create a regular expression -// passed to HandleRegexp. The results of operators like %s|%s are wrapped -// in (?: ) to avoid needing to maintain a precedence table. -void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) { - std::stack<std::string> regexps; - for (size_t i = 0; i < post.size(); i++) { - switch (CountArgs(post[i])) { - default: - LOG(FATAL) << "Bad operator: " << post[i]; - case 0: - regexps.push(post[i]); - break; - case 1: { - std::string a = regexps.top(); - regexps.pop(); - regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")"); - break; - } - case 2: { - std::string b = regexps.top(); - regexps.pop(); - std::string a = regexps.top(); - regexps.pop(); - regexps.push("(?:" + - StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) + - ")"); - break; - } - } - } - - if (regexps.size() != 1) { - // Internal error - should never happen. 
- printf("Bad regexp program:\n"); - for (size_t i = 0; i < post.size(); i++) { - printf(" %s\n", CEscape(post[i]).c_str()); - } - printf("Stack after running program:\n"); - while (!regexps.empty()) { - printf(" %s\n", CEscape(regexps.top()).c_str()); - regexps.pop(); - } - LOG(FATAL) << "Bad regexp program."; - } - - HandleRegexp(regexps.top()); - HandleRegexp("^(?:" + regexps.top() + ")$"); - HandleRegexp("^(?:" + regexps.top() + ")"); - HandleRegexp("(?:" + regexps.top() + ")$"); -} - -// Split s into an vector of strings, one for each UTF-8 character. -std::vector<std::string> Explode(const StringPiece& s) { - std::vector<std::string> v; - - for (const char *q = s.data(); q < s.data() + s.size(); ) { - const char* p = q; - Rune r; - q += chartorune(&r, q); - v.push_back(std::string(p, q - p)); - } - - return v; -} - -// Split string everywhere a substring is found, returning -// vector of pieces. -std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) { - std::vector<std::string> v; - - if (sep.empty()) - return Explode(s); - - const char *p = s.data(); - for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) { - if (StringPiece(q, sep.size()) == sep) { - v.push_back(std::string(p, q - p)); - p = q + sep.size(); - q = p - 1; // -1 for ++ in loop - continue; - } - } - if (p < s.data() + s.size()) - v.push_back(std::string(p, s.data() + s.size() - p)); - return v; -} - -} // namespace re2 +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression generator: generates all possible +// regular expressions within parameters (see regexp_generator.h for details). + +// The regexp generator first generates a sequence of commands in a simple +// postfix language. Each command in the language is a string, +// like "a" or "%s*" or "%s|%s". 
+// +// To evaluate a command, enough arguments are popped from the value stack to +// plug into the %s slots. Then the result is pushed onto the stack. +// For example, the command sequence +// a b %s%s c +// results in the stack +// ab c +// +// GeneratePostfix generates all possible command sequences. +// Then RunPostfix turns each sequence into a regular expression +// and passes the regexp to HandleRegexp. + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <memory> +#include <stack> +#include <string> +#include <vector> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "util/utf.h" +#include "re2/testing/regexp_generator.h" + +namespace re2 { + +// Returns a vector of the egrep regexp operators. +const std::vector<std::string>& RegexpGenerator::EgrepOps() { + static const char *ops[] = { + "%s%s", + "%s|%s", + "%s*", + "%s+", + "%s?", + "%s\\C*", + }; + static std::vector<std::string> v(ops, ops + arraysize(ops)); + return v; +} + +RegexpGenerator::RegexpGenerator(int maxatoms, int maxops, + const std::vector<std::string>& atoms, + const std::vector<std::string>& ops) + : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) { + // Degenerate case. + if (atoms_.empty()) + maxatoms_ = 0; + if (ops_.empty()) + maxops_ = 0; +} + +// Generates all possible regular expressions (within the parameters), +// calling HandleRegexp for each one. +void RegexpGenerator::Generate() { + std::vector<std::string> postfix; + GeneratePostfix(&postfix, 0, 0, 0); +} + +// Generates random regular expressions, calling HandleRegexp for each one. +void RegexpGenerator::GenerateRandom(int32_t seed, int n) { + rng_.seed(seed); + + for (int i = 0; i < n; i++) { + std::vector<std::string> postfix; + GenerateRandomPostfix(&postfix, 0, 0, 0); + } +} + +// Counts and returns the number of occurrences of "%s" in s. 
+static int CountArgs(const std::string& s) { + const char *p = s.c_str(); + int n = 0; + while ((p = strstr(p, "%s")) != NULL) { + p += 2; + n++; + } + return n; +} + +// Generates all possible postfix command sequences. +// Each sequence is handed off to RunPostfix to generate a regular expression. +// The arguments are: +// post: the current postfix sequence +// nstk: the number of elements that would be on the stack after executing +// the sequence +// ops: the number of operators used in the sequence +// atoms: the number of atoms used in the sequence +// For example, if post were ["a", "b", "%s%s", "c"], +// then nstk = 2, ops = 1, atoms = 3. +// +// The initial call should be GeneratePostfix([empty vector], 0, 0, 0). +// +void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post, + int nstk, int ops, int atoms) { + if (nstk == 1) + RunPostfix(*post); + + // Early out: if used too many operators or can't + // get back down to a single expression on the stack + // using binary operators, give up. + if (ops + nstk - 1 > maxops_) + return; + + // Add atoms if there is room. + if (atoms < maxatoms_) { + for (size_t i = 0; i < atoms_.size(); i++) { + post->push_back(atoms_[i]); + GeneratePostfix(post, nstk + 1, ops, atoms + 1); + post->pop_back(); + } + } + + // Add operators if there are enough arguments. + if (ops < maxops_) { + for (size_t i = 0; i < ops_.size(); i++) { + const std::string& fmt = ops_[i]; + int nargs = CountArgs(fmt); + if (nargs <= nstk) { + post->push_back(fmt); + GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms); + post->pop_back(); + } + } + } +} + +// Generates a random postfix command sequence. +// Stops and returns true once a single sequence has been generated. 
+bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post, + int nstk, int ops, int atoms) { + std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms); + std::uniform_int_distribution<int> random_bit(0, 1); + std::uniform_int_distribution<int> random_ops_index( + 0, static_cast<int>(ops_.size()) - 1); + std::uniform_int_distribution<int> random_atoms_index( + 0, static_cast<int>(atoms_.size()) - 1); + + for (;;) { + // Stop if we get to a single element, but only sometimes. + if (nstk == 1 && random_stop(rng_) == 0) { + RunPostfix(*post); + return true; + } + + // Early out: if used too many operators or can't + // get back down to a single expression on the stack + // using binary operators, give up. + if (ops + nstk - 1 > maxops_) + return false; + + // Add operators if there are enough arguments. + if (ops < maxops_ && random_bit(rng_) == 0) { + const std::string& fmt = ops_[random_ops_index(rng_)]; + int nargs = CountArgs(fmt); + if (nargs <= nstk) { + post->push_back(fmt); + bool ret = GenerateRandomPostfix(post, nstk - nargs + 1, + ops + 1, atoms); + post->pop_back(); + if (ret) + return true; + } + } + + // Add atoms if there is room. + if (atoms < maxatoms_ && random_bit(rng_) == 0) { + post->push_back(atoms_[random_atoms_index(rng_)]); + bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); + post->pop_back(); + if (ret) + return true; + } + } +} + +// Interprets the postfix command sequence to create a regular expression +// passed to HandleRegexp. The results of operators like %s|%s are wrapped +// in (?: ) to avoid needing to maintain a precedence table. 
+void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) { + std::stack<std::string> regexps; + for (size_t i = 0; i < post.size(); i++) { + switch (CountArgs(post[i])) { + default: + LOG(FATAL) << "Bad operator: " << post[i]; + case 0: + regexps.push(post[i]); + break; + case 1: { + std::string a = regexps.top(); + regexps.pop(); + regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")"); + break; + } + case 2: { + std::string b = regexps.top(); + regexps.pop(); + std::string a = regexps.top(); + regexps.pop(); + regexps.push("(?:" + + StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) + + ")"); + break; + } + } + } + + if (regexps.size() != 1) { + // Internal error - should never happen. + printf("Bad regexp program:\n"); + for (size_t i = 0; i < post.size(); i++) { + printf(" %s\n", CEscape(post[i]).c_str()); + } + printf("Stack after running program:\n"); + while (!regexps.empty()) { + printf(" %s\n", CEscape(regexps.top()).c_str()); + regexps.pop(); + } + LOG(FATAL) << "Bad regexp program."; + } + + HandleRegexp(regexps.top()); + HandleRegexp("^(?:" + regexps.top() + ")$"); + HandleRegexp("^(?:" + regexps.top() + ")"); + HandleRegexp("(?:" + regexps.top() + ")$"); +} + +// Split s into an vector of strings, one for each UTF-8 character. +std::vector<std::string> Explode(const StringPiece& s) { + std::vector<std::string> v; + + for (const char *q = s.data(); q < s.data() + s.size(); ) { + const char* p = q; + Rune r; + q += chartorune(&r, q); + v.push_back(std::string(p, q - p)); + } + + return v; +} + +// Split string everywhere a substring is found, returning +// vector of pieces. 
+std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) { + std::vector<std::string> v; + + if (sep.empty()) + return Explode(s); + + const char *p = s.data(); + for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) { + if (StringPiece(q, sep.size()) == sep) { + v.push_back(std::string(p, q - p)); + p = q + sep.size(); + q = p - 1; // -1 for ++ in loop + continue; + } + } + if (p < s.data() + s.size()) + v.push_back(std::string(p, s.data() + s.size() - p)); + return v; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/regexp_generator.h b/contrib/libs/re2/re2/testing/regexp_generator.h index b07a4ddb1d..7d72aff889 100644 --- a/contrib/libs/re2/re2/testing/regexp_generator.h +++ b/contrib/libs/re2/re2/testing/regexp_generator.h @@ -1,77 +1,77 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef RE2_TESTING_REGEXP_GENERATOR_H_ -#define RE2_TESTING_REGEXP_GENERATOR_H_ - -// Regular expression generator: generates all possible -// regular expressions within given parameters (see below for details). - -#include <stdint.h> -#include <random> -#include <string> -#include <vector> - -#include "util/util.h" -#include "re2/stringpiece.h" - -namespace re2 { - -// Regular expression generator. -// -// Given a set of atom expressions like "a", "b", or "." -// and operators like "%s*", generates all possible regular expressions -// using at most maxbases base expressions and maxops operators. -// For each such expression re, calls HandleRegexp(re). -// -// Callers are expected to subclass RegexpGenerator and provide HandleRegexp. 
-// -class RegexpGenerator { - public: - RegexpGenerator(int maxatoms, int maxops, - const std::vector<std::string>& atoms, - const std::vector<std::string>& ops); - virtual ~RegexpGenerator() {} - - // Generates all the regular expressions, calling HandleRegexp(re) for each. - void Generate(); - - // Generates n random regular expressions, calling HandleRegexp(re) for each. - void GenerateRandom(int32_t seed, int n); - - // Handles a regular expression. Must be provided by subclass. - virtual void HandleRegexp(const std::string& regexp) = 0; - - // The egrep regexp operators: * + ? | and concatenation. - static const std::vector<std::string>& EgrepOps(); - - private: - void RunPostfix(const std::vector<std::string>& post); - void GeneratePostfix(std::vector<std::string>* post, - int nstk, int ops, int lits); - bool GenerateRandomPostfix(std::vector<std::string>* post, - int nstk, int ops, int lits); - - int maxatoms_; // Maximum number of atoms allowed in expr. - int maxops_; // Maximum number of ops allowed in expr. - std::vector<std::string> atoms_; // Possible atoms. - std::vector<std::string> ops_; // Possible ops. - std::minstd_rand0 rng_; // Random number generator. - - RegexpGenerator(const RegexpGenerator&) = delete; - RegexpGenerator& operator=(const RegexpGenerator&) = delete; -}; - -// Helpers for preparing arguments to RegexpGenerator constructor. - -// Returns one string for each character in s. -std::vector<std::string> Explode(const StringPiece& s); - -// Splits string everywhere sep is found, returning -// vector of pieces. -std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s); - -} // namespace re2 - -#endif // RE2_TESTING_REGEXP_GENERATOR_H_ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef RE2_TESTING_REGEXP_GENERATOR_H_ +#define RE2_TESTING_REGEXP_GENERATOR_H_ + +// Regular expression generator: generates all possible +// regular expressions within given parameters (see below for details). + +#include <stdint.h> +#include <random> +#include <string> +#include <vector> + +#include "util/util.h" +#include "re2/stringpiece.h" + +namespace re2 { + +// Regular expression generator. +// +// Given a set of atom expressions like "a", "b", or "." +// and operators like "%s*", generates all possible regular expressions +// using at most maxbases base expressions and maxops operators. +// For each such expression re, calls HandleRegexp(re). +// +// Callers are expected to subclass RegexpGenerator and provide HandleRegexp. +// +class RegexpGenerator { + public: + RegexpGenerator(int maxatoms, int maxops, + const std::vector<std::string>& atoms, + const std::vector<std::string>& ops); + virtual ~RegexpGenerator() {} + + // Generates all the regular expressions, calling HandleRegexp(re) for each. + void Generate(); + + // Generates n random regular expressions, calling HandleRegexp(re) for each. + void GenerateRandom(int32_t seed, int n); + + // Handles a regular expression. Must be provided by subclass. + virtual void HandleRegexp(const std::string& regexp) = 0; + + // The egrep regexp operators: * + ? | and concatenation. + static const std::vector<std::string>& EgrepOps(); + + private: + void RunPostfix(const std::vector<std::string>& post); + void GeneratePostfix(std::vector<std::string>* post, + int nstk, int ops, int lits); + bool GenerateRandomPostfix(std::vector<std::string>* post, + int nstk, int ops, int lits); + + int maxatoms_; // Maximum number of atoms allowed in expr. + int maxops_; // Maximum number of ops allowed in expr. + std::vector<std::string> atoms_; // Possible atoms. + std::vector<std::string> ops_; // Possible ops. + std::minstd_rand0 rng_; // Random number generator. 
+ + RegexpGenerator(const RegexpGenerator&) = delete; + RegexpGenerator& operator=(const RegexpGenerator&) = delete; +}; + +// Helpers for preparing arguments to RegexpGenerator constructor. + +// Returns one string for each character in s. +std::vector<std::string> Explode(const StringPiece& s); + +// Splits string everywhere sep is found, returning +// vector of pieces. +std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s); + +} // namespace re2 + +#endif // RE2_TESTING_REGEXP_GENERATOR_H_ diff --git a/contrib/libs/re2/re2/testing/regexp_test.cc b/contrib/libs/re2/re2/testing/regexp_test.cc index 78f9c74093..11fdfed24b 100644 --- a/contrib/libs/re2/re2/testing/regexp_test.cc +++ b/contrib/libs/re2/re2/testing/regexp_test.cc @@ -1,86 +1,86 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Test parse.cc, dump.cc, and tostring.cc. - -#include <stddef.h> -#include <map> -#include <string> -#include <vector> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/regexp.h" - -namespace re2 { - -// Test that overflowed ref counts work. -TEST(Regexp, BigRef) { - Regexp* re; - re = Regexp::Parse("x", Regexp::NoParseFlags, NULL); - for (int i = 0; i < 100000; i++) - re->Incref(); - for (int i = 0; i < 100000; i++) - re->Decref(); - ASSERT_EQ(re->Ref(), 1); - re->Decref(); -} - -// Test that very large Concats work. -// Depends on overflowed ref counts working. 
-TEST(Regexp, BigConcat) { - Regexp* x; - x = Regexp::Parse("x", Regexp::NoParseFlags, NULL); - std::vector<Regexp*> v(90000, x); // ToString bails out at 100000 - for (size_t i = 0; i < v.size(); i++) - x->Incref(); - ASSERT_EQ(x->Ref(), 1 + static_cast<int>(v.size())) << x->Ref(); - Regexp* re = Regexp::Concat(v.data(), static_cast<int>(v.size()), - Regexp::NoParseFlags); - ASSERT_EQ(re->ToString(), std::string(v.size(), 'x')); - re->Decref(); - ASSERT_EQ(x->Ref(), 1) << x->Ref(); - x->Decref(); -} - -TEST(Regexp, NamedCaptures) { - Regexp* x; - RegexpStatus status; - x = Regexp::Parse( - "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(4, x->NumCaptures()); - const std::map<std::string, int>* have = x->NamedCaptures(); - EXPECT_TRUE(have != NULL); - EXPECT_EQ(2, have->size()); // there are only two named groups in - // the regexp: 'g1' and 'g2'. - std::map<std::string, int> want; - want["g1"] = 1; - want["g2"] = 3; - EXPECT_EQ(want, *have); - x->Decref(); - delete have; -} - -TEST(Regexp, CaptureNames) { - Regexp* x; - RegexpStatus status; - x = Regexp::Parse( - "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(4, x->NumCaptures()); - const std::map<int, std::string>* have = x->CaptureNames(); - EXPECT_TRUE(have != NULL); - EXPECT_EQ(3, have->size()); - std::map<int, std::string> want; - want[1] = "g1"; - want[3] = "g2"; - want[4] = "g1"; - - EXPECT_EQ(want, *have); - x->Decref(); - delete have; -} - -} // namespace re2 +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test parse.cc, dump.cc, and tostring.cc. 
+ +#include <stddef.h> +#include <map> +#include <string> +#include <vector> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/regexp.h" + +namespace re2 { + +// Test that overflowed ref counts work. +TEST(Regexp, BigRef) { + Regexp* re; + re = Regexp::Parse("x", Regexp::NoParseFlags, NULL); + for (int i = 0; i < 100000; i++) + re->Incref(); + for (int i = 0; i < 100000; i++) + re->Decref(); + ASSERT_EQ(re->Ref(), 1); + re->Decref(); +} + +// Test that very large Concats work. +// Depends on overflowed ref counts working. +TEST(Regexp, BigConcat) { + Regexp* x; + x = Regexp::Parse("x", Regexp::NoParseFlags, NULL); + std::vector<Regexp*> v(90000, x); // ToString bails out at 100000 + for (size_t i = 0; i < v.size(); i++) + x->Incref(); + ASSERT_EQ(x->Ref(), 1 + static_cast<int>(v.size())) << x->Ref(); + Regexp* re = Regexp::Concat(v.data(), static_cast<int>(v.size()), + Regexp::NoParseFlags); + ASSERT_EQ(re->ToString(), std::string(v.size(), 'x')); + re->Decref(); + ASSERT_EQ(x->Ref(), 1) << x->Ref(); + x->Decref(); +} + +TEST(Regexp, NamedCaptures) { + Regexp* x; + RegexpStatus status; + x = Regexp::Parse( + "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(4, x->NumCaptures()); + const std::map<std::string, int>* have = x->NamedCaptures(); + EXPECT_TRUE(have != NULL); + EXPECT_EQ(2, have->size()); // there are only two named groups in + // the regexp: 'g1' and 'g2'. 
+ std::map<std::string, int> want; + want["g1"] = 1; + want["g2"] = 3; + EXPECT_EQ(want, *have); + x->Decref(); + delete have; +} + +TEST(Regexp, CaptureNames) { + Regexp* x; + RegexpStatus status; + x = Regexp::Parse( + "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(4, x->NumCaptures()); + const std::map<int, std::string>* have = x->CaptureNames(); + EXPECT_TRUE(have != NULL); + EXPECT_EQ(3, have->size()); + std::map<int, std::string> want; + want[1] = "g1"; + want[3] = "g2"; + want[4] = "g1"; + + EXPECT_EQ(want, *have); + x->Decref(); + delete have; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/required_prefix_test.cc b/contrib/libs/re2/re2/testing/required_prefix_test.cc index 4f82f4c5a0..7fc0f0d973 100644 --- a/contrib/libs/re2/re2/testing/required_prefix_test.cc +++ b/contrib/libs/re2/re2/testing/required_prefix_test.cc @@ -1,199 +1,199 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include <string> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/prog.h" -#include "re2/regexp.h" - -namespace re2 { - -struct PrefixTest { - const char* regexp; - bool return_value; - const char* prefix; - bool foldcase; - const char* suffix; -}; - -static PrefixTest tests[] = { - // Empty cases. - { "", false }, - { "(?m)^", false }, - { "(?-m)^", false }, - - // If the regexp has no ^, there's no required prefix. - { "abc", false }, - - // If the regexp immediately goes into - // something not a literal match, there's no required prefix. - { "^a*", false }, - { "^(abc)", false }, - - // Otherwise, it should work. 
- { "^abc$", true, "abc", false, "(?-m:$)" }, - { "^abc", true, "abc", false, "" }, - { "^(?i)abc", true, "abc", true, "" }, - { "^abcd*", true, "abc", false, "d*" }, - { "^[Aa][Bb]cd*", true, "ab", true, "cd*" }, - { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" }, - { "^☺abc", true, "☺abc", false, "" }, -}; - -TEST(RequiredPrefix, SimpleTests) { - for (size_t i = 0; i < arraysize(tests); i++) { - const PrefixTest& t = tests[i]; - for (size_t j = 0; j < 2; j++) { - Regexp::ParseFlags flags = Regexp::LikePerl; - if (j == 0) - flags = flags | Regexp::Latin1; - Regexp* re = Regexp::Parse(t.regexp, flags, NULL); - ASSERT_TRUE(re != NULL) << " " << t.regexp; - - std::string p; - bool f; - Regexp* s; - ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") - << " " << re->Dump(); - if (t.return_value) { - ASSERT_EQ(p, std::string(t.prefix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); - ASSERT_EQ(f, t.foldcase) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); - ASSERT_EQ(s->ToString(), std::string(t.suffix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); - s->Decref(); - } - re->Decref(); - } - } -} - -static PrefixTest for_accel_tests[] = { - // Empty cases. - { "", false }, - { "(?m)^", false }, - { "(?-m)^", false }, - - // If the regexp has a ^, there's no required prefix. - { "^abc", false }, - - // If the regexp immediately goes into - // something not a literal match, there's no required prefix. - { "a*", false }, - - // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through" - // capturing groups, but doesn't try to glue prefix fragments together. - { "(a?)def", false }, - { "(ab?)def", true, "a", false }, - { "(abc?)def", true, "ab", false }, - { "(()a)def", false }, - { "((a)b)def", true, "a", false }, - { "((ab)c)def", true, "ab", false }, - - // Otherwise, it should work. 
- { "abc$", true, "abc", false }, - { "abc", true, "abc", false }, - { "(?i)abc", true, "abc", true }, - { "abcd*", true, "abc", false }, - { "[Aa][Bb]cd*", true, "ab", true }, - { "ab[Cc]d*", true, "ab", false }, - { "☺abc", true, "☺abc", false }, -}; - -TEST(RequiredPrefixForAccel, SimpleTests) { - for (size_t i = 0; i < arraysize(for_accel_tests); i++) { - const PrefixTest& t = for_accel_tests[i]; - for (size_t j = 0; j < 2; j++) { - Regexp::ParseFlags flags = Regexp::LikePerl; - if (j == 0) - flags = flags | Regexp::Latin1; - Regexp* re = Regexp::Parse(t.regexp, flags, NULL); - ASSERT_TRUE(re != NULL) << " " << t.regexp; - - std::string p; - bool f; - ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") - << " " << re->Dump(); - if (t.return_value) { - ASSERT_EQ(p, std::string(t.prefix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); - ASSERT_EQ(f, t.foldcase) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); - } - re->Decref(); - } - } -} - -TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) { - Regexp* re; - std::string p; - bool f; - - // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'. - re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL); - ASSERT_TRUE(re != NULL); - ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)); - ASSERT_EQ(p, "klm"); - ASSERT_EQ(f, true); - re->Decref(); - - re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL); - ASSERT_TRUE(re != NULL); - ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)); - ASSERT_EQ(p, "stu"); - ASSERT_EQ(f, true); - re->Decref(); - - // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'. - // This is because they match U+212A and U+017F, respectively, and - // so the parser ends up emitting character classes, not literals. 
- re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL); - ASSERT_TRUE(re != NULL); - ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)); - re->Decref(); - - re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL); - ASSERT_TRUE(re != NULL); - ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)); - re->Decref(); -} - -static const char* prefix_accel_tests[] = { - "aababc\\d+", - "(?i)AABABC\\d+", -}; - -TEST(PrefixAccel, SimpleTests) { - for (size_t i = 0; i < arraysize(prefix_accel_tests); i++) { - const char* pattern = prefix_accel_tests[i]; - Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL); - ASSERT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(0); - ASSERT_TRUE(prog != NULL); - ASSERT_TRUE(prog->can_prefix_accel()); - for (int j = 0; j < 100; j++) { - std::string text(j, 'a'); - const char* p = reinterpret_cast<const char*>( - prog->PrefixAccel(text.data(), text.size())); - EXPECT_TRUE(p == NULL); - text.append("aababc"); - for (int k = 0; k < 100; k++) { - text.append(k, 'a'); - p = reinterpret_cast<const char*>( - prog->PrefixAccel(text.data(), text.size())); - EXPECT_EQ(j, p - text.data()); - } - } - delete prog; - re->Decref(); - } -} - -} // namespace re2 +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <string> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/prog.h" +#include "re2/regexp.h" + +namespace re2 { + +struct PrefixTest { + const char* regexp; + bool return_value; + const char* prefix; + bool foldcase; + const char* suffix; +}; + +static PrefixTest tests[] = { + // Empty cases. + { "", false }, + { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has no ^, there's no required prefix. + { "abc", false }, + + // If the regexp immediately goes into + // something not a literal match, there's no required prefix. 
+ { "^a*", false }, + { "^(abc)", false }, + + // Otherwise, it should work. + { "^abc$", true, "abc", false, "(?-m:$)" }, + { "^abc", true, "abc", false, "" }, + { "^(?i)abc", true, "abc", true, "" }, + { "^abcd*", true, "abc", false, "d*" }, + { "^[Aa][Bb]cd*", true, "ab", true, "cd*" }, + { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" }, + { "^☺abc", true, "☺abc", false, "" }, +}; + +TEST(RequiredPrefix, SimpleTests) { + for (size_t i = 0; i < arraysize(tests); i++) { + const PrefixTest& t = tests[i]; + for (size_t j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + ASSERT_TRUE(re != NULL) << " " << t.regexp; + + std::string p; + bool f; + Regexp* s; + ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") + << " " << re->Dump(); + if (t.return_value) { + ASSERT_EQ(p, std::string(t.prefix)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + ASSERT_EQ(f, t.foldcase) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + ASSERT_EQ(s->ToString(), std::string(t.suffix)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + s->Decref(); + } + re->Decref(); + } + } +} + +static PrefixTest for_accel_tests[] = { + // Empty cases. + { "", false }, + { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has a ^, there's no required prefix. + { "^abc", false }, + + // If the regexp immediately goes into + // something not a literal match, there's no required prefix. + { "a*", false }, + + // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through" + // capturing groups, but doesn't try to glue prefix fragments together. + { "(a?)def", false }, + { "(ab?)def", true, "a", false }, + { "(abc?)def", true, "ab", false }, + { "(()a)def", false }, + { "((a)b)def", true, "a", false }, + { "((ab)c)def", true, "ab", false }, + + // Otherwise, it should work. 
+ { "abc$", true, "abc", false }, + { "abc", true, "abc", false }, + { "(?i)abc", true, "abc", true }, + { "abcd*", true, "abc", false }, + { "[Aa][Bb]cd*", true, "ab", true }, + { "ab[Cc]d*", true, "ab", false }, + { "☺abc", true, "☺abc", false }, +}; + +TEST(RequiredPrefixForAccel, SimpleTests) { + for (size_t i = 0; i < arraysize(for_accel_tests); i++) { + const PrefixTest& t = for_accel_tests[i]; + for (size_t j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + ASSERT_TRUE(re != NULL) << " " << t.regexp; + + std::string p; + bool f; + ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") + << " " << re->Dump(); + if (t.return_value) { + ASSERT_EQ(p, std::string(t.prefix)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + ASSERT_EQ(f, t.foldcase) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + } + re->Decref(); + } + } +} + +TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) { + Regexp* re; + std::string p; + bool f; + + // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'. + re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)); + ASSERT_EQ(p, "klm"); + ASSERT_EQ(f, true); + re->Decref(); + + re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)); + ASSERT_EQ(p, "stu"); + ASSERT_EQ(f, true); + re->Decref(); + + // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'. + // This is because they match U+212A and U+017F, respectively, and + // so the parser ends up emitting character classes, not literals. 
+ re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)); + re->Decref(); + + re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)); + re->Decref(); +} + +static const char* prefix_accel_tests[] = { + "aababc\\d+", + "(?i)AABABC\\d+", +}; + +TEST(PrefixAccel, SimpleTests) { + for (size_t i = 0; i < arraysize(prefix_accel_tests); i++) { + const char* pattern = prefix_accel_tests[i]; + Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + ASSERT_TRUE(prog != NULL); + ASSERT_TRUE(prog->can_prefix_accel()); + for (int j = 0; j < 100; j++) { + std::string text(j, 'a'); + const char* p = reinterpret_cast<const char*>( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_TRUE(p == NULL); + text.append("aababc"); + for (int k = 0; k < 100; k++) { + text.append(k, 'a'); + p = reinterpret_cast<const char*>( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_EQ(j, p - text.data()); + } + } + delete prog; + re->Decref(); + } +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/search_test.cc b/contrib/libs/re2/re2/testing/search_test.cc index f80aadd249..2539295618 100644 --- a/contrib/libs/re2/re2/testing/search_test.cc +++ b/contrib/libs/re2/re2/testing/search_test.cc @@ -1,334 +1,334 @@ -// Copyright 2006-2007 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "library/cpp/testing/gtest/gtest.h" -#include "re2/prog.h" -#include "re2/regexp.h" -#include "re2/testing/tester.h" -#include "re2/testing/exhaustive_tester.h" - -// For target `log' in the Makefile. 
-#ifndef LOGGING -#define LOGGING 0 -#endif - -namespace re2 { - -struct RegexpTest { - const char* regexp; - const char* text; -}; - -RegexpTest simple_tests[] = { - { "a", "a" }, - { "a", "zyzzyva" }, - { "a+", "aa" }, - { "(a+|b)+", "ab" }, - { "ab|cd", "xabcdx" }, - { "h.*od?", "hello\ngoodbye\n" }, - { "h.*o", "hello\ngoodbye\n" }, - { "h.*o", "goodbye\nhello\n" }, - { "h.*o", "hello world" }, - { "h.*o", "othello, world" }, - { "[^\\s\\S]", "aaaaaaa" }, - { "a", "aaaaaaa" }, - { "a*", "aaaaaaa" }, - { "a*", "" }, - { "ab|cd", "xabcdx" }, - { "a", "cab" }, - { "a*b", "cab" }, - { "((((((((((((((((((((x))))))))))))))))))))", "x" }, - { "[abcd]", "xxxabcdxxx" }, - { "[^x]", "xxxabcdxxx" }, - { "[abcd]+", "xxxabcdxxx" }, - { "[^x]+", "xxxabcdxxx" }, - { "(fo|foo)", "fo" }, - { "(foo|fo)", "foo" }, - - { "aa", "aA" }, - { "a", "Aa" }, - { "a", "A" }, - { "ABC", "abc" }, - { "abc", "XABCY" }, - { "ABC", "xabcy" }, - - // Make sure ^ and $ work. - // The pathological cases didn't work - // in the original grep code. - { "foo|bar|[A-Z]", "foo" }, - { "^(foo|bar|[A-Z])", "foo" }, - { "(foo|bar|[A-Z])$", "foo\n" }, - { "(foo|bar|[A-Z])$", "foo" }, - { "^(foo|bar|[A-Z])$", "foo\n" }, - { "^(foo|bar|[A-Z])$", "foo" }, - { "^(foo|bar|[A-Z])$", "bar" }, - { "^(foo|bar|[A-Z])$", "X" }, - { "^(foo|bar|[A-Z])$", "XY" }, - { "^(fo|foo)$", "fo" }, - { "^(fo|foo)$", "foo" }, - { "^^(fo|foo)$", "fo" }, - { "^^(fo|foo)$", "foo" }, - { "^$", "" }, - { "^$", "x" }, - { "^^$", "" }, - { "^$$", "" }, - { "^^$", "x" }, - { "^$$", "x" }, - { "^^$$", "" }, - { "^^$$", "x" }, - { "^^^^^^^^$$$$$$$$", "" }, - { "^", "x" }, - { "$", "x" }, - - // Word boundaries. 
- { "\\bfoo\\b", "nofoo foo that" }, - { "a\\b", "faoa x" }, - { "\\bbar", "bar x" }, - { "\\bbar", "foo\nbar x" }, - { "bar\\b", "foobar" }, - { "bar\\b", "foobar\nxxx" }, - { "(foo|bar|[A-Z])\\b", "foo" }, - { "(foo|bar|[A-Z])\\b", "foo\n" }, - { "\\b", "" }, - { "\\b", "x" }, - { "\\b(foo|bar|[A-Z])", "foo" }, - { "\\b(foo|bar|[A-Z])\\b", "X" }, - { "\\b(foo|bar|[A-Z])\\b", "XY" }, - { "\\b(foo|bar|[A-Z])\\b", "bar" }, - { "\\b(foo|bar|[A-Z])\\b", "foo" }, - { "\\b(foo|bar|[A-Z])\\b", "foo\n" }, - { "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" }, - { "\\b(fo|foo)\\b", "fo" }, - { "\\b(fo|foo)\\b", "foo" }, - { "\\b\\b", "" }, - { "\\b\\b", "x" }, - { "\\b$", "" }, - { "\\b$", "x" }, - { "\\b$", "y x" }, - { "\\b.$", "x" }, - { "^\\b(fo|foo)\\b", "fo" }, - { "^\\b(fo|foo)\\b", "foo" }, - { "^\\b", "" }, - { "^\\b", "x" }, - { "^\\b\\b", "" }, - { "^\\b\\b", "x" }, - { "^\\b$", "" }, - { "^\\b$", "x" }, - { "^\\b.$", "x" }, - { "^\\b.\\b$", "x" }, - { "^^^^^^^^\\b$$$$$$$", "" }, - { "^^^^^^^^\\b.$$$$$$", "x" }, - { "^^^^^^^^\\b$$$$$$$", "x" }, - - // Non-word boundaries. 
- { "\\Bfoo\\B", "n foo xfoox that" }, - { "a\\B", "faoa x" }, - { "\\Bbar", "bar x" }, - { "\\Bbar", "foo\nbar x" }, - { "bar\\B", "foobar" }, - { "bar\\B", "foobar\nxxx" }, - { "(foo|bar|[A-Z])\\B", "foox" }, - { "(foo|bar|[A-Z])\\B", "foo\n" }, - { "\\B", "" }, - { "\\B", "x" }, - { "\\B(foo|bar|[A-Z])", "foo" }, - { "\\B(foo|bar|[A-Z])\\B", "xXy" }, - { "\\B(foo|bar|[A-Z])\\B", "XY" }, - { "\\B(foo|bar|[A-Z])\\B", "XYZ" }, - { "\\B(foo|bar|[A-Z])\\B", "abara" }, - { "\\B(foo|bar|[A-Z])\\B", "xfoo_" }, - { "\\B(foo|bar|[A-Z])\\B", "xfoo\n" }, - { "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" }, - { "\\B(fo|foo)\\B", "xfoo" }, - { "\\B(foo|fo)\\B", "xfooo" }, - { "\\B\\B", "" }, - { "\\B\\B", "x" }, - { "\\B$", "" }, - { "\\B$", "x" }, - { "\\B$", "y x" }, - { "\\B.$", "x" }, - { "^\\B(fo|foo)\\B", "fo" }, - { "^\\B(fo|foo)\\B", "foo" }, - { "^\\B", "" }, - { "^\\B", "x" }, - { "^\\B\\B", "" }, - { "^\\B\\B", "x" }, - { "^\\B$", "" }, - { "^\\B$", "x" }, - { "^\\B.$", "x" }, - { "^\\B.\\B$", "x" }, - { "^^^^^^^^\\B$$$$$$$", "" }, - { "^^^^^^^^\\B.$$$$$$", "x" }, - { "^^^^^^^^\\B$$$$$$$", "x" }, - - // PCRE uses only ASCII for \b computation. - // All non-ASCII are *not* word characters. - { "\\bx\\b", "x" }, - { "\\bx\\b", "x>" }, - { "\\bx\\b", "<x" }, - { "\\bx\\b", "<x>" }, - { "\\bx\\b", "ax" }, - { "\\bx\\b", "xb" }, - { "\\bx\\b", "axb" }, - { "\\bx\\b", "«x" }, - { "\\bx\\b", "x»" }, - { "\\bx\\b", "«x»" }, - { "\\bx\\b", "axb" }, - { "\\bx\\b", "áxβ" }, - { "\\Bx\\B", "axb" }, - { "\\Bx\\B", "áxβ" }, - - // Weird boundary cases. 
- { "^$^$", "" }, - { "^$^", "" }, - { "$^$", "" }, - - { "^$^$", "x" }, - { "^$^", "x" }, - { "$^$", "x" }, - - { "^$^$", "x\ny" }, - { "^$^", "x\ny" }, - { "$^$", "x\ny" }, - - { "^$^$", "x\n\ny" }, - { "^$^", "x\n\ny" }, - { "$^$", "x\n\ny" }, - - { "^(foo\\$)$", "foo$bar" }, - { "(foo\\$)", "foo$bar" }, - { "^...$", "abc" }, - - // UTF-8 - { "^\xe6\x9c\xac$", "\xe6\x9c\xac" }, - { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, - { "^...$", ".\xe6\x9c\xac." }, - - { "^\\C\\C\\C$", "\xe6\x9c\xac" }, - { "^\\C$", "\xe6\x9c\xac" }, - { "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, - - // Latin1 - { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, - { "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, - { "^...$", ".\xe6\x9c\xac." }, - { "^.....$", ".\xe6\x9c\xac." }, - - // Perl v Posix - { "\\B(fo|foo)\\B", "xfooo" }, - { "(fo|foo)", "foo" }, - - // Octal escapes. - { "\\141", "a" }, - { "\\060", "0" }, - { "\\0600", "00" }, - { "\\608", "08" }, - { "\\01", "\01" }, - { "\\018", "\01" "8" }, - - // Hexadecimal escapes - { "\\x{61}", "a" }, - { "\\x61", "a" }, - { "\\x{00000061}", "a" }, - - // Unicode scripts. - { "\\p{Greek}+", "aαβb" }, - { "\\P{Greek}+", "aαβb" }, - { "\\p{^Greek}+", "aαβb" }, - { "\\P{^Greek}+", "aαβb" }, - - // Unicode properties. Nd is decimal number. N is any number. - { "[^0-9]+", "abc123" }, - { "\\p{Nd}+", "abc123²³¼½¾₀₉" }, - { "\\p{^Nd}+", "abc123²³¼½¾₀₉" }, - { "\\P{Nd}+", "abc123²³¼½¾₀₉" }, - { "\\P{^Nd}+", "abc123²³¼½¾₀₉" }, - { "\\pN+", "abc123²³¼½¾₀₉" }, - { "\\p{N}+", "abc123²³¼½¾₀₉" }, - { "\\p{^N}+", "abc123²³¼½¾₀₉" }, - - { "\\p{Any}+", "abc123" }, - - // Character classes & case folding. - { "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B - { "(?i)[A-Z]+", "aAzZ" }, - { "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z - - // splits the ranges in an interesting way. 
- - // would like to use, but PCRE mishandles in full-match, non-greedy mode - // { "(?i)[\\\\]+", "Aa" }, - - { "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, - - // Character classes & case folding. - { "[@-A]+", "@AaB" }, - { "[A-Z]+", "aAzZ" }, - { "[^\\\\]+", "Aa\\" }, - { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, - - // Anchoring. (^abc in aabcdef was a former bug) - // The tester checks for a match in the text and - // subpieces of the text with a byte removed on either side. - { "^abc", "abcdef" }, - { "^abc", "aabcdef" }, - { "^[ay]*[bx]+c", "abcdef" }, - { "^[ay]*[bx]+c", "aabcdef" }, - { "def$", "abcdef" }, - { "def$", "abcdeff" }, - { "d[ex][fy]$", "abcdef" }, - { "d[ex][fy]$", "abcdeff" }, - { "[dz][ex][fy]$", "abcdef" }, - { "[dz][ex][fy]$", "abcdeff" }, - { "(?m)^abc", "abcdef" }, - { "(?m)^abc", "aabcdef" }, - { "(?m)^[ay]*[bx]+c", "abcdef" }, - { "(?m)^[ay]*[bx]+c", "aabcdef" }, - { "(?m)def$", "abcdef" }, - { "(?m)def$", "abcdeff" }, - { "(?m)d[ex][fy]$", "abcdef" }, - { "(?m)d[ex][fy]$", "abcdeff" }, - { "(?m)[dz][ex][fy]$", "abcdef" }, - { "(?m)[dz][ex][fy]$", "abcdeff" }, - { "^", "a" }, - { "^^", "a" }, - - // Context. - // The tester checks for a match in the text and - // subpieces of the text with a byte removed on either side. - { "a", "a" }, - { "ab*", "a" }, - { "a\\C*", "a" }, - { "a\\C+", "a" }, - { "a\\C?", "a" }, - { "a\\C*?", "a" }, - { "a\\C+?", "a" }, - { "a\\C??", "a" }, - - // Former bugs. - { "a\\C*|ba\\C", "baba" }, - { "\\w*I\\w*", "Inc." }, - { "(?:|a)*", "aaa" }, - { "(?:|a)+", "aaa" }, -}; - -TEST(Regexp, SearchTests) { - int failures = 0; - for (size_t i = 0; i < arraysize(simple_tests); i++) { - const RegexpTest& t = simple_tests[i]; - if (!TestRegexpOnText(t.regexp, t.text)) - failures++; - - if (LOGGING) { - // Build a dummy ExhaustiveTest call that will trigger just - // this one test, so that we log the test case. 
- std::vector<std::string> atom, alpha, ops; - atom.push_back(t.regexp); - alpha.push_back(t.text); - ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", ""); - } - } - EXPECT_EQ(failures, 0); -} - -} // namespace re2 +// Copyright 2006-2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "library/cpp/testing/gtest/gtest.h" +#include "re2/prog.h" +#include "re2/regexp.h" +#include "re2/testing/tester.h" +#include "re2/testing/exhaustive_tester.h" + +// For target `log' in the Makefile. +#ifndef LOGGING +#define LOGGING 0 +#endif + +namespace re2 { + +struct RegexpTest { + const char* regexp; + const char* text; +}; + +RegexpTest simple_tests[] = { + { "a", "a" }, + { "a", "zyzzyva" }, + { "a+", "aa" }, + { "(a+|b)+", "ab" }, + { "ab|cd", "xabcdx" }, + { "h.*od?", "hello\ngoodbye\n" }, + { "h.*o", "hello\ngoodbye\n" }, + { "h.*o", "goodbye\nhello\n" }, + { "h.*o", "hello world" }, + { "h.*o", "othello, world" }, + { "[^\\s\\S]", "aaaaaaa" }, + { "a", "aaaaaaa" }, + { "a*", "aaaaaaa" }, + { "a*", "" }, + { "ab|cd", "xabcdx" }, + { "a", "cab" }, + { "a*b", "cab" }, + { "((((((((((((((((((((x))))))))))))))))))))", "x" }, + { "[abcd]", "xxxabcdxxx" }, + { "[^x]", "xxxabcdxxx" }, + { "[abcd]+", "xxxabcdxxx" }, + { "[^x]+", "xxxabcdxxx" }, + { "(fo|foo)", "fo" }, + { "(foo|fo)", "foo" }, + + { "aa", "aA" }, + { "a", "Aa" }, + { "a", "A" }, + { "ABC", "abc" }, + { "abc", "XABCY" }, + { "ABC", "xabcy" }, + + // Make sure ^ and $ work. + // The pathological cases didn't work + // in the original grep code. 
+ { "foo|bar|[A-Z]", "foo" }, + { "^(foo|bar|[A-Z])", "foo" }, + { "(foo|bar|[A-Z])$", "foo\n" }, + { "(foo|bar|[A-Z])$", "foo" }, + { "^(foo|bar|[A-Z])$", "foo\n" }, + { "^(foo|bar|[A-Z])$", "foo" }, + { "^(foo|bar|[A-Z])$", "bar" }, + { "^(foo|bar|[A-Z])$", "X" }, + { "^(foo|bar|[A-Z])$", "XY" }, + { "^(fo|foo)$", "fo" }, + { "^(fo|foo)$", "foo" }, + { "^^(fo|foo)$", "fo" }, + { "^^(fo|foo)$", "foo" }, + { "^$", "" }, + { "^$", "x" }, + { "^^$", "" }, + { "^$$", "" }, + { "^^$", "x" }, + { "^$$", "x" }, + { "^^$$", "" }, + { "^^$$", "x" }, + { "^^^^^^^^$$$$$$$$", "" }, + { "^", "x" }, + { "$", "x" }, + + // Word boundaries. + { "\\bfoo\\b", "nofoo foo that" }, + { "a\\b", "faoa x" }, + { "\\bbar", "bar x" }, + { "\\bbar", "foo\nbar x" }, + { "bar\\b", "foobar" }, + { "bar\\b", "foobar\nxxx" }, + { "(foo|bar|[A-Z])\\b", "foo" }, + { "(foo|bar|[A-Z])\\b", "foo\n" }, + { "\\b", "" }, + { "\\b", "x" }, + { "\\b(foo|bar|[A-Z])", "foo" }, + { "\\b(foo|bar|[A-Z])\\b", "X" }, + { "\\b(foo|bar|[A-Z])\\b", "XY" }, + { "\\b(foo|bar|[A-Z])\\b", "bar" }, + { "\\b(foo|bar|[A-Z])\\b", "foo" }, + { "\\b(foo|bar|[A-Z])\\b", "foo\n" }, + { "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" }, + { "\\b(fo|foo)\\b", "fo" }, + { "\\b(fo|foo)\\b", "foo" }, + { "\\b\\b", "" }, + { "\\b\\b", "x" }, + { "\\b$", "" }, + { "\\b$", "x" }, + { "\\b$", "y x" }, + { "\\b.$", "x" }, + { "^\\b(fo|foo)\\b", "fo" }, + { "^\\b(fo|foo)\\b", "foo" }, + { "^\\b", "" }, + { "^\\b", "x" }, + { "^\\b\\b", "" }, + { "^\\b\\b", "x" }, + { "^\\b$", "" }, + { "^\\b$", "x" }, + { "^\\b.$", "x" }, + { "^\\b.\\b$", "x" }, + { "^^^^^^^^\\b$$$$$$$", "" }, + { "^^^^^^^^\\b.$$$$$$", "x" }, + { "^^^^^^^^\\b$$$$$$$", "x" }, + + // Non-word boundaries. 
+ { "\\Bfoo\\B", "n foo xfoox that" }, + { "a\\B", "faoa x" }, + { "\\Bbar", "bar x" }, + { "\\Bbar", "foo\nbar x" }, + { "bar\\B", "foobar" }, + { "bar\\B", "foobar\nxxx" }, + { "(foo|bar|[A-Z])\\B", "foox" }, + { "(foo|bar|[A-Z])\\B", "foo\n" }, + { "\\B", "" }, + { "\\B", "x" }, + { "\\B(foo|bar|[A-Z])", "foo" }, + { "\\B(foo|bar|[A-Z])\\B", "xXy" }, + { "\\B(foo|bar|[A-Z])\\B", "XY" }, + { "\\B(foo|bar|[A-Z])\\B", "XYZ" }, + { "\\B(foo|bar|[A-Z])\\B", "abara" }, + { "\\B(foo|bar|[A-Z])\\B", "xfoo_" }, + { "\\B(foo|bar|[A-Z])\\B", "xfoo\n" }, + { "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" }, + { "\\B(fo|foo)\\B", "xfoo" }, + { "\\B(foo|fo)\\B", "xfooo" }, + { "\\B\\B", "" }, + { "\\B\\B", "x" }, + { "\\B$", "" }, + { "\\B$", "x" }, + { "\\B$", "y x" }, + { "\\B.$", "x" }, + { "^\\B(fo|foo)\\B", "fo" }, + { "^\\B(fo|foo)\\B", "foo" }, + { "^\\B", "" }, + { "^\\B", "x" }, + { "^\\B\\B", "" }, + { "^\\B\\B", "x" }, + { "^\\B$", "" }, + { "^\\B$", "x" }, + { "^\\B.$", "x" }, + { "^\\B.\\B$", "x" }, + { "^^^^^^^^\\B$$$$$$$", "" }, + { "^^^^^^^^\\B.$$$$$$", "x" }, + { "^^^^^^^^\\B$$$$$$$", "x" }, + + // PCRE uses only ASCII for \b computation. + // All non-ASCII are *not* word characters. + { "\\bx\\b", "x" }, + { "\\bx\\b", "x>" }, + { "\\bx\\b", "<x" }, + { "\\bx\\b", "<x>" }, + { "\\bx\\b", "ax" }, + { "\\bx\\b", "xb" }, + { "\\bx\\b", "axb" }, + { "\\bx\\b", "«x" }, + { "\\bx\\b", "x»" }, + { "\\bx\\b", "«x»" }, + { "\\bx\\b", "axb" }, + { "\\bx\\b", "áxβ" }, + { "\\Bx\\B", "axb" }, + { "\\Bx\\B", "áxβ" }, + + // Weird boundary cases. 
+ { "^$^$", "" }, + { "^$^", "" }, + { "$^$", "" }, + + { "^$^$", "x" }, + { "^$^", "x" }, + { "$^$", "x" }, + + { "^$^$", "x\ny" }, + { "^$^", "x\ny" }, + { "$^$", "x\ny" }, + + { "^$^$", "x\n\ny" }, + { "^$^", "x\n\ny" }, + { "$^$", "x\n\ny" }, + + { "^(foo\\$)$", "foo$bar" }, + { "(foo\\$)", "foo$bar" }, + { "^...$", "abc" }, + + // UTF-8 + { "^\xe6\x9c\xac$", "\xe6\x9c\xac" }, + { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^...$", ".\xe6\x9c\xac." }, + + { "^\\C\\C\\C$", "\xe6\x9c\xac" }, + { "^\\C$", "\xe6\x9c\xac" }, + { "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + + // Latin1 + { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^...$", ".\xe6\x9c\xac." }, + { "^.....$", ".\xe6\x9c\xac." }, + + // Perl v Posix + { "\\B(fo|foo)\\B", "xfooo" }, + { "(fo|foo)", "foo" }, + + // Octal escapes. + { "\\141", "a" }, + { "\\060", "0" }, + { "\\0600", "00" }, + { "\\608", "08" }, + { "\\01", "\01" }, + { "\\018", "\01" "8" }, + + // Hexadecimal escapes + { "\\x{61}", "a" }, + { "\\x61", "a" }, + { "\\x{00000061}", "a" }, + + // Unicode scripts. + { "\\p{Greek}+", "aαβb" }, + { "\\P{Greek}+", "aαβb" }, + { "\\p{^Greek}+", "aαβb" }, + { "\\P{^Greek}+", "aαβb" }, + + // Unicode properties. Nd is decimal number. N is any number. + { "[^0-9]+", "abc123" }, + { "\\p{Nd}+", "abc123²³¼½¾₀₉" }, + { "\\p{^Nd}+", "abc123²³¼½¾₀₉" }, + { "\\P{Nd}+", "abc123²³¼½¾₀₉" }, + { "\\P{^Nd}+", "abc123²³¼½¾₀₉" }, + { "\\pN+", "abc123²³¼½¾₀₉" }, + { "\\p{N}+", "abc123²³¼½¾₀₉" }, + { "\\p{^N}+", "abc123²³¼½¾₀₉" }, + + { "\\p{Any}+", "abc123" }, + + // Character classes & case folding. + { "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B + { "(?i)[A-Z]+", "aAzZ" }, + { "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z - + // splits the ranges in an interesting way. 
+ + // would like to use, but PCRE mishandles in full-match, non-greedy mode + // { "(?i)[\\\\]+", "Aa" }, + + { "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, + + // Character classes & case folding. + { "[@-A]+", "@AaB" }, + { "[A-Z]+", "aAzZ" }, + { "[^\\\\]+", "Aa\\" }, + { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, + + // Anchoring. (^abc in aabcdef was a former bug) + // The tester checks for a match in the text and + // subpieces of the text with a byte removed on either side. + { "^abc", "abcdef" }, + { "^abc", "aabcdef" }, + { "^[ay]*[bx]+c", "abcdef" }, + { "^[ay]*[bx]+c", "aabcdef" }, + { "def$", "abcdef" }, + { "def$", "abcdeff" }, + { "d[ex][fy]$", "abcdef" }, + { "d[ex][fy]$", "abcdeff" }, + { "[dz][ex][fy]$", "abcdef" }, + { "[dz][ex][fy]$", "abcdeff" }, + { "(?m)^abc", "abcdef" }, + { "(?m)^abc", "aabcdef" }, + { "(?m)^[ay]*[bx]+c", "abcdef" }, + { "(?m)^[ay]*[bx]+c", "aabcdef" }, + { "(?m)def$", "abcdef" }, + { "(?m)def$", "abcdeff" }, + { "(?m)d[ex][fy]$", "abcdef" }, + { "(?m)d[ex][fy]$", "abcdeff" }, + { "(?m)[dz][ex][fy]$", "abcdef" }, + { "(?m)[dz][ex][fy]$", "abcdeff" }, + { "^", "a" }, + { "^^", "a" }, + + // Context. + // The tester checks for a match in the text and + // subpieces of the text with a byte removed on either side. + { "a", "a" }, + { "ab*", "a" }, + { "a\\C*", "a" }, + { "a\\C+", "a" }, + { "a\\C?", "a" }, + { "a\\C*?", "a" }, + { "a\\C+?", "a" }, + { "a\\C??", "a" }, + + // Former bugs. + { "a\\C*|ba\\C", "baba" }, + { "\\w*I\\w*", "Inc." }, + { "(?:|a)*", "aaa" }, + { "(?:|a)+", "aaa" }, +}; + +TEST(Regexp, SearchTests) { + int failures = 0; + for (size_t i = 0; i < arraysize(simple_tests); i++) { + const RegexpTest& t = simple_tests[i]; + if (!TestRegexpOnText(t.regexp, t.text)) + failures++; + + if (LOGGING) { + // Build a dummy ExhaustiveTest call that will trigger just + // this one test, so that we log the test case. 
+ std::vector<std::string> atom, alpha, ops; + atom.push_back(t.regexp); + alpha.push_back(t.text); + ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", ""); + } + } + EXPECT_EQ(failures, 0); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/set_test.cc b/contrib/libs/re2/re2/testing/set_test.cc index 140c7476d4..14ff3e79c0 100644 --- a/contrib/libs/re2/re2/testing/set_test.cc +++ b/contrib/libs/re2/re2/testing/set_test.cc @@ -1,230 +1,230 @@ -// Copyright 2010 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include <stddef.h> -#include <string> -#include <vector> -#include <utility> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/re2.h" -#include "re2/set.h" - -namespace re2 { - -TEST(Set, Unanchored) { - RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); - - ASSERT_EQ(s.Add("foo", NULL), 0); - ASSERT_EQ(s.Add("(", NULL), -1); - ASSERT_EQ(s.Add("bar", NULL), 1); - ASSERT_EQ(s.Compile(), true); - - ASSERT_EQ(s.Match("foobar", NULL), true); - ASSERT_EQ(s.Match("fooba", NULL), true); - ASSERT_EQ(s.Match("oobar", NULL), true); - - std::vector<int> v; - ASSERT_EQ(s.Match("foobar", &v), true); - ASSERT_EQ(v.size(), 2); - ASSERT_EQ(v[0], 0); - ASSERT_EQ(v[1], 1); - - ASSERT_EQ(s.Match("fooba", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 0); - - ASSERT_EQ(s.Match("oobar", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 1); -} - -TEST(Set, UnanchoredFactored) { - RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); - - ASSERT_EQ(s.Add("foo", NULL), 0); - ASSERT_EQ(s.Add("(", NULL), -1); - ASSERT_EQ(s.Add("foobar", NULL), 1); - ASSERT_EQ(s.Compile(), true); - - ASSERT_EQ(s.Match("foobar", NULL), true); - ASSERT_EQ(s.Match("obarfoobaroo", NULL), true); - ASSERT_EQ(s.Match("fooba", NULL), true); - ASSERT_EQ(s.Match("oobar", NULL), false); - - std::vector<int> v; - ASSERT_EQ(s.Match("foobar", &v), true); - 
ASSERT_EQ(v.size(), 2); - ASSERT_EQ(v[0], 0); - ASSERT_EQ(v[1], 1); - - ASSERT_EQ(s.Match("obarfoobaroo", &v), true); - ASSERT_EQ(v.size(), 2); - ASSERT_EQ(v[0], 0); - ASSERT_EQ(v[1], 1); - - ASSERT_EQ(s.Match("fooba", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 0); - - ASSERT_EQ(s.Match("oobar", &v), false); - ASSERT_EQ(v.size(), 0); -} - -TEST(Set, UnanchoredDollar) { - RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); - - ASSERT_EQ(s.Add("foo$", NULL), 0); - ASSERT_EQ(s.Compile(), true); - - ASSERT_EQ(s.Match("foo", NULL), true); - ASSERT_EQ(s.Match("foobar", NULL), false); - - std::vector<int> v; - ASSERT_EQ(s.Match("foo", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 0); - - ASSERT_EQ(s.Match("foobar", &v), false); - ASSERT_EQ(v.size(), 0); -} - -TEST(Set, UnanchoredWordBoundary) { - RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); - - ASSERT_EQ(s.Add("foo\\b", NULL), 0); - ASSERT_EQ(s.Compile(), true); - - ASSERT_EQ(s.Match("foo", NULL), true); - ASSERT_EQ(s.Match("foobar", NULL), false); - ASSERT_EQ(s.Match("foo bar", NULL), true); - - std::vector<int> v; - ASSERT_EQ(s.Match("foo", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 0); - - ASSERT_EQ(s.Match("foobar", &v), false); - ASSERT_EQ(v.size(), 0); - - ASSERT_EQ(s.Match("foo bar", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 0); -} - -TEST(Set, Anchored) { - RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); - - ASSERT_EQ(s.Add("foo", NULL), 0); - ASSERT_EQ(s.Add("(", NULL), -1); - ASSERT_EQ(s.Add("bar", NULL), 1); - ASSERT_EQ(s.Compile(), true); - - ASSERT_EQ(s.Match("foobar", NULL), false); - ASSERT_EQ(s.Match("fooba", NULL), false); - ASSERT_EQ(s.Match("oobar", NULL), false); - ASSERT_EQ(s.Match("foo", NULL), true); - ASSERT_EQ(s.Match("bar", NULL), true); - - std::vector<int> v; - ASSERT_EQ(s.Match("foobar", &v), false); - ASSERT_EQ(v.size(), 0); - - ASSERT_EQ(s.Match("fooba", &v), false); - ASSERT_EQ(v.size(), 0); - - ASSERT_EQ(s.Match("oobar", &v), false); 
- ASSERT_EQ(v.size(), 0); - - ASSERT_EQ(s.Match("foo", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 0); - - ASSERT_EQ(s.Match("bar", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 1); -} - -TEST(Set, EmptyUnanchored) { - RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); - - ASSERT_EQ(s.Compile(), true); - - ASSERT_EQ(s.Match("", NULL), false); - ASSERT_EQ(s.Match("foobar", NULL), false); - - std::vector<int> v; - ASSERT_EQ(s.Match("", &v), false); - ASSERT_EQ(v.size(), 0); - - ASSERT_EQ(s.Match("foobar", &v), false); - ASSERT_EQ(v.size(), 0); -} - -TEST(Set, EmptyAnchored) { - RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); - - ASSERT_EQ(s.Compile(), true); - - ASSERT_EQ(s.Match("", NULL), false); - ASSERT_EQ(s.Match("foobar", NULL), false); - - std::vector<int> v; - ASSERT_EQ(s.Match("", &v), false); - ASSERT_EQ(v.size(), 0); - - ASSERT_EQ(s.Match("foobar", &v), false); - ASSERT_EQ(v.size(), 0); -} - -TEST(Set, Prefix) { - RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); - - ASSERT_EQ(s.Add("/prefix/\\d*", NULL), 0); - ASSERT_EQ(s.Compile(), true); - - ASSERT_EQ(s.Match("/prefix", NULL), false); - ASSERT_EQ(s.Match("/prefix/", NULL), true); - ASSERT_EQ(s.Match("/prefix/42", NULL), true); - - std::vector<int> v; - ASSERT_EQ(s.Match("/prefix", &v), false); - ASSERT_EQ(v.size(), 0); - - ASSERT_EQ(s.Match("/prefix/", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 0); - - ASSERT_EQ(s.Match("/prefix/42", &v), true); - ASSERT_EQ(v.size(), 1); - ASSERT_EQ(v[0], 0); -} - -TEST(Set, MoveSemantics) { - RE2::Set s1(RE2::DefaultOptions, RE2::UNANCHORED); - ASSERT_EQ(s1.Add("foo\\d+", NULL), 0); - ASSERT_EQ(s1.Compile(), true); - ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); - ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); - - // The moved-to object should do what the moved-from object did. 
- RE2::Set s2 = std::move(s1); - ASSERT_EQ(s2.Match("abc foo1 xyz", NULL), true); - ASSERT_EQ(s2.Match("abc bar2 xyz", NULL), false); - - // The moved-from object should have been reset and be reusable. - ASSERT_EQ(s1.Add("bar\\d+", NULL), 0); - ASSERT_EQ(s1.Compile(), true); - ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), false); - ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), true); - - // Verify that "overwriting" works and also doesn't leak memory. - // (The latter will need a leak detector such as LeakSanitizer.) - s1 = std::move(s2); - ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); - ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); -} - -} // namespace re2 +// Copyright 2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <stddef.h> +#include <string> +#include <vector> +#include <utility> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/re2.h" +#include "re2/set.h" + +namespace re2 { + +TEST(Set, Unanchored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Add("foo", NULL), 0); + ASSERT_EQ(s.Add("(", NULL), -1); + ASSERT_EQ(s.Add("bar", NULL), 1); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foobar", NULL), true); + ASSERT_EQ(s.Match("fooba", NULL), true); + ASSERT_EQ(s.Match("oobar", NULL), true); + + std::vector<int> v; + ASSERT_EQ(s.Match("foobar", &v), true); + ASSERT_EQ(v.size(), 2); + ASSERT_EQ(v[0], 0); + ASSERT_EQ(v[1], 1); + + ASSERT_EQ(s.Match("fooba", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("oobar", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 1); +} + +TEST(Set, UnanchoredFactored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Add("foo", NULL), 0); + ASSERT_EQ(s.Add("(", NULL), -1); + ASSERT_EQ(s.Add("foobar", NULL), 1); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foobar", NULL), true); + 
ASSERT_EQ(s.Match("obarfoobaroo", NULL), true); + ASSERT_EQ(s.Match("fooba", NULL), true); + ASSERT_EQ(s.Match("oobar", NULL), false); + + std::vector<int> v; + ASSERT_EQ(s.Match("foobar", &v), true); + ASSERT_EQ(v.size(), 2); + ASSERT_EQ(v[0], 0); + ASSERT_EQ(v[1], 1); + + ASSERT_EQ(s.Match("obarfoobaroo", &v), true); + ASSERT_EQ(v.size(), 2); + ASSERT_EQ(v[0], 0); + ASSERT_EQ(v[1], 1); + + ASSERT_EQ(s.Match("fooba", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("oobar", &v), false); + ASSERT_EQ(v.size(), 0); +} + +TEST(Set, UnanchoredDollar) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Add("foo$", NULL), 0); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foo", NULL), true); + ASSERT_EQ(s.Match("foobar", NULL), false); + + std::vector<int> v; + ASSERT_EQ(s.Match("foo", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); +} + +TEST(Set, UnanchoredWordBoundary) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Add("foo\\b", NULL), 0); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foo", NULL), true); + ASSERT_EQ(s.Match("foobar", NULL), false); + ASSERT_EQ(s.Match("foo bar", NULL), true); + + std::vector<int> v; + ASSERT_EQ(s.Match("foo", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("foo bar", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); +} + +TEST(Set, Anchored) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + ASSERT_EQ(s.Add("foo", NULL), 0); + ASSERT_EQ(s.Add("(", NULL), -1); + ASSERT_EQ(s.Add("bar", NULL), 1); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("foobar", NULL), false); + ASSERT_EQ(s.Match("fooba", NULL), false); + ASSERT_EQ(s.Match("oobar", NULL), false); + ASSERT_EQ(s.Match("foo", NULL), true); + ASSERT_EQ(s.Match("bar", NULL), true); + + 
std::vector<int> v; + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("fooba", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("oobar", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("foo", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("bar", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 1); +} + +TEST(Set, EmptyUnanchored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("", NULL), false); + ASSERT_EQ(s.Match("foobar", NULL), false); + + std::vector<int> v; + ASSERT_EQ(s.Match("", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); +} + +TEST(Set, EmptyAnchored) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("", NULL), false); + ASSERT_EQ(s.Match("foobar", NULL), false); + + std::vector<int> v; + ASSERT_EQ(s.Match("", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("foobar", &v), false); + ASSERT_EQ(v.size(), 0); +} + +TEST(Set, Prefix) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + ASSERT_EQ(s.Add("/prefix/\\d*", NULL), 0); + ASSERT_EQ(s.Compile(), true); + + ASSERT_EQ(s.Match("/prefix", NULL), false); + ASSERT_EQ(s.Match("/prefix/", NULL), true); + ASSERT_EQ(s.Match("/prefix/42", NULL), true); + + std::vector<int> v; + ASSERT_EQ(s.Match("/prefix", &v), false); + ASSERT_EQ(v.size(), 0); + + ASSERT_EQ(s.Match("/prefix/", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); + + ASSERT_EQ(s.Match("/prefix/42", &v), true); + ASSERT_EQ(v.size(), 1); + ASSERT_EQ(v[0], 0); +} + +TEST(Set, MoveSemantics) { + RE2::Set s1(RE2::DefaultOptions, RE2::UNANCHORED); + ASSERT_EQ(s1.Add("foo\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); + + // The 
moved-to object should do what the moved-from object did. + RE2::Set s2 = std::move(s1); + ASSERT_EQ(s2.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s2.Match("abc bar2 xyz", NULL), false); + + // The moved-from object should have been reset and be reusable. + ASSERT_EQ(s1.Add("bar\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), false); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), true); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) + s1 = std::move(s2); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/simplify_test.cc b/contrib/libs/re2/re2/testing/simplify_test.cc index 4510778fe5..75028930b1 100644 --- a/contrib/libs/re2/re2/testing/simplify_test.cc +++ b/contrib/libs/re2/re2/testing/simplify_test.cc @@ -1,273 +1,273 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Test simplify.cc. - -#include <string.h> -#include <string> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/regexp.h" - -namespace re2 { - -struct Test { - const char* regexp; - const char* simplified; -}; - -static Test tests[] = { - // Already-simple constructs - { "a", "a" }, - { "ab", "ab" }, - { "a|b", "[a-b]" }, - { "ab|cd", "ab|cd" }, - { "(ab)*", "(ab)*" }, - { "(ab)+", "(ab)+" }, - { "(ab)?", "(ab)?" }, - { ".", "." 
}, - { "^", "^" }, - { "$", "$" }, - { "[ac]", "[ac]" }, - { "[^ac]", "[^ac]" }, - - // Posix character classes - { "[[:alnum:]]", "[0-9A-Za-z]" }, - { "[[:alpha:]]", "[A-Za-z]" }, - { "[[:blank:]]", "[\\t ]" }, - { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" }, - { "[[:digit:]]", "[0-9]" }, - { "[[:graph:]]", "[!-~]" }, - { "[[:lower:]]", "[a-z]" }, - { "[[:print:]]", "[ -~]" }, - { "[[:punct:]]", "[!-/:-@\\[-`{-~]" }, - { "[[:space:]]" , "[\\t-\\r ]" }, - { "[[:upper:]]", "[A-Z]" }, - { "[[:xdigit:]]", "[0-9A-Fa-f]" }, - - // Perl character classes - { "\\d", "[0-9]" }, - { "\\s", "[\\t-\\n\\f-\\r ]" }, - { "\\w", "[0-9A-Z_a-z]" }, - { "\\D", "[^0-9]" }, - { "\\S", "[^\\t-\\n\\f-\\r ]" }, - { "\\W", "[^0-9A-Z_a-z]" }, - { "[\\d]", "[0-9]" }, - { "[\\s]", "[\\t-\\n\\f-\\r ]" }, - { "[\\w]", "[0-9A-Z_a-z]" }, - { "[\\D]", "[^0-9]" }, - { "[\\S]", "[^\\t-\\n\\f-\\r ]" }, - { "[\\W]", "[^0-9A-Z_a-z]" }, - - // Posix repetitions - { "a{1}", "a" }, - { "a{2}", "aa" }, - { "a{5}", "aaaaa" }, - { "a{0,1}", "a?" }, - // The next three are illegible because Simplify inserts (?:) - // parens instead of () parens to avoid creating extra - // captured subexpressions. The comments show a version fewer parens. - { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)? - { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)? - { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)? - { "a{0,2}", "(?:aa?)?" }, // (aa?)? - { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)? - { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)? - { "a{0,}", "a*" }, - { "a{1,}", "a+" }, - { "a{2,}", "aa+" }, - { "a{5,}", "aaaaa+" }, - - // Test that operators simplify their arguments. - // (Simplify used to not simplify arguments to a {} repeat.) - { "(?:a{1,}){1,}", "a+" }, - { "(a{1,}b{1,})", "(a+b+)" }, - { "a{1,}|b{1,}", "a+|b+" }, - { "(?:a{1,})*", "(?:a+)*" }, - { "(?:a{1,})+", "a+" }, - { "(?:a{1,})?", "(?:a+)?" 
}, - { "a{0}", "" }, - - // Character class simplification - { "[ab]", "[a-b]" }, - { "[a-za-za-z]", "[a-z]" }, - { "[A-Za-zA-Za-z]", "[A-Za-z]" }, - { "[ABCDEFGH]", "[A-H]" }, - { "[AB-CD-EF-GH]", "[A-H]" }, - { "[W-ZP-XE-R]", "[E-Z]" }, - { "[a-ee-gg-m]", "[a-m]" }, - { "[a-ea-ha-m]", "[a-m]" }, - { "[a-ma-ha-e]", "[a-m]" }, - { "[a-zA-Z0-9 -~]", "[ -~]" }, - - // Empty character classes - { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" }, - - // Full character classes - { "[[:cntrl:][:^cntrl:]]", "." }, - - // Unicode case folding. - { "(?i)A", "[Aa]" }, - { "(?i)a", "[Aa]" }, - { "(?i)K", "[Kk\\x{212a}]" }, - { "(?i)k", "[Kk\\x{212a}]" }, - { "(?i)\\x{212a}", "[Kk\\x{212a}]" }, - { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" }, - { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" }, - { "(?i)[\\x00-\\x{10ffff}]", "." }, - - // Empty string as a regular expression. - // Empty string must be preserved inside parens in order - // to make submatches work right, so these are less - // interesting than they used to be. ToString inserts - // explicit (?:) in place of non-parenthesized empty strings, - // to make them easier to spot for other parsers. - { "(a|b|)", "([a-b]|(?:))" }, - { "(|)", "((?:)|(?:))" }, - { "a()", "a()" }, - { "(()|())", "(()|())" }, - { "(a|)", "(a|(?:))" }, - { "ab()cd()", "ab()cd()" }, - { "()", "()" }, - { "()*", "()*" }, - { "()+", "()+" }, - { "()?" , "()?" }, - { "(){0}", "" }, - { "(){1}", "()" }, - { "(){1,}", "()+" }, - { "(){0,2}", "(?:()()?)?" }, - - // Test that coalescing occurs and that the resulting repeats are simplified. - // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal: - { "a*a*", "a*" }, - { "a*a+", "a+" }, - { "a*a?", "a*" }, - { "a*a{2}", "aa+" }, - { "a*a{2,}", "aa+" }, - { "a*a{2,3}", "aa+" }, - { "a+a*", "a+" }, - { "a+a+", "aa+" }, - { "a+a?", "a+" }, - { "a+a{2}", "aaa+" }, - { "a+a{2,}", "aaa+" }, - { "a+a{2,3}", "aaa+" }, - { "a?a*", "a*" }, - { "a?a+", "a+" }, - { "a?a?", "(?:aa?)?" 
}, - { "a?a{2}", "aaa?" }, - { "a?a{2,}", "aa+" }, - { "a?a{2,3}", "aa(?:aa?)?" }, - { "a{2}a*", "aa+" }, - { "a{2}a+", "aaa+" }, - { "a{2}a?", "aaa?" }, - { "a{2}a{2}", "aaaa" }, - { "a{2}a{2,}", "aaaa+" }, - { "a{2}a{2,3}", "aaaaa?" }, - { "a{2,}a*", "aa+" }, - { "a{2,}a+", "aaa+" }, - { "a{2,}a?", "aa+" }, - { "a{2,}a{2}", "aaaa+" }, - { "a{2,}a{2,}", "aaaa+" }, - { "a{2,}a{2,3}", "aaaa+" }, - { "a{2,3}a*", "aa+" }, - { "a{2,3}a+", "aaa+" }, - { "a{2,3}a?", "aa(?:aa?)?" }, - { "a{2,3}a{2}", "aaaaa?" }, - { "a{2,3}a{2,}", "aaaa+" }, - { "a{2,3}a{2,3}", "aaaa(?:aa?)?" }, - // With a char class, any char and any byte: - { "\\d*\\d*", "[0-9]*" }, - { ".*.*", ".*" }, - { "\\C*\\C*", "\\C*" }, - // FoldCase works, but must be consistent: - { "(?i)A*a*", "[Aa]*" }, - { "(?i)a+A+", "[Aa][Aa]+" }, - { "(?i)A*(?-i)a*", "[Aa]*a*" }, - { "(?i)a+(?-i)A+", "[Aa]+A+" }, - // NonGreedy works, but must be consistent: - { "a*?a*?", "a*?" }, - { "a+?a+?", "aa+?" }, - { "a*?a*", "a*?a*" }, - { "a+a+?", "a+a+?" 
}, - // The second element is the literal, char class, any char or any byte: - { "a*a", "a+" }, - { "\\d*\\d", "[0-9]+" }, - { ".*.", ".+" }, - { "\\C*\\C", "\\C+" }, - // FoldCase works, but must be consistent: - { "(?i)A*a", "[Aa]+" }, - { "(?i)a+A", "[Aa][Aa]+" }, - { "(?i)A*(?-i)a", "[Aa]*a" }, - { "(?i)a+(?-i)A", "[Aa]+A" }, - // The second element is a literal string that begins with the literal: - { "a*aa", "aa+" }, - { "a*aab", "aa+b" }, - // FoldCase works, but must be consistent: - { "(?i)a*aa", "[Aa][Aa]+" }, - { "(?i)a*aab", "[Aa][Aa]+[Bb]" }, - { "(?i)a*(?-i)aa", "[Aa]*aa" }, - { "(?i)a*(?-i)aab", "[Aa]*aab" }, - // Negative tests with mismatching ops: - { "a*b*", "a*b*" }, - { "\\d*\\D*", "[0-9]*[^0-9]*" }, - { "a+b", "a+b" }, - { "\\d+\\D", "[0-9]+[^0-9]" }, - { "a?bb", "a?bb" }, - // Negative tests with capturing groups: - { "(a*)a*", "(a*)a*" }, - { "a+(a)", "a+(a)" }, - { "(a?)(aa)", "(a?)(aa)" }, - // Just for fun: - { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" }, - - // During coalescing, the child of the repeat changes, so we build a new - // repeat. The new repeat must have the min and max of the old repeat. - // Failure to copy them results in min=0 and max=0 -> empty match. - { "(?:a*aab){2}", "aa+baa+b" }, - - // During coalescing, the child of the capture changes, so we build a new - // capture. The new capture must have the cap of the old capture. - // Failure to copy it results in cap=0 -> ToString() logs a fatal error. - { "(a*aab)", "(aa+b)" }, - - // Test squashing of **, ++, ?? et cetera. - { "(?:(?:a){0,}){0,}", "a*" }, - { "(?:(?:a){1,}){1,}", "a+" }, - { "(?:(?:a){0,1}){0,1}", "a?" 
}, - { "(?:(?:a){0,}){1,}", "a*" }, - { "(?:(?:a){0,}){0,1}", "a*" }, - { "(?:(?:a){1,}){0,}", "a*" }, - { "(?:(?:a){1,}){0,1}", "a*" }, - { "(?:(?:a){0,1}){0,}", "a*" }, - { "(?:(?:a){0,1}){1,}", "a*" }, -}; - -TEST(TestSimplify, SimpleRegexps) { - for (size_t i = 0; i < arraysize(tests); i++) { - RegexpStatus status; - VLOG(1) << "Testing " << tests[i].regexp; - Regexp* re = Regexp::Parse(tests[i].regexp, - Regexp::MatchNL | (Regexp::LikePerl & - ~Regexp::OneLine), - &status); - ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text(); - Regexp* sre = re->Simplify(); - ASSERT_TRUE(sre != NULL); - - // Check that already-simple regexps don't allocate new ones. - if (strcmp(tests[i].regexp, tests[i].simplified) == 0) { - ASSERT_TRUE(re == sre) << " " << tests[i].regexp - << " " << re->ToString() << " " << sre->ToString(); - } - - EXPECT_EQ(tests[i].simplified, sre->ToString()) - << " " << tests[i].regexp << " " << sre->Dump(); - - re->Decref(); - sre->Decref(); - } -} - -} // namespace re2 +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test simplify.cc. + +#include <string.h> +#include <string> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/regexp.h" + +namespace re2 { + +struct Test { + const char* regexp; + const char* simplified; +}; + +static Test tests[] = { + // Already-simple constructs + { "a", "a" }, + { "ab", "ab" }, + { "a|b", "[a-b]" }, + { "ab|cd", "ab|cd" }, + { "(ab)*", "(ab)*" }, + { "(ab)+", "(ab)+" }, + { "(ab)?", "(ab)?" }, + { ".", "." 
}, + { "^", "^" }, + { "$", "$" }, + { "[ac]", "[ac]" }, + { "[^ac]", "[^ac]" }, + + // Posix character classes + { "[[:alnum:]]", "[0-9A-Za-z]" }, + { "[[:alpha:]]", "[A-Za-z]" }, + { "[[:blank:]]", "[\\t ]" }, + { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" }, + { "[[:digit:]]", "[0-9]" }, + { "[[:graph:]]", "[!-~]" }, + { "[[:lower:]]", "[a-z]" }, + { "[[:print:]]", "[ -~]" }, + { "[[:punct:]]", "[!-/:-@\\[-`{-~]" }, + { "[[:space:]]" , "[\\t-\\r ]" }, + { "[[:upper:]]", "[A-Z]" }, + { "[[:xdigit:]]", "[0-9A-Fa-f]" }, + + // Perl character classes + { "\\d", "[0-9]" }, + { "\\s", "[\\t-\\n\\f-\\r ]" }, + { "\\w", "[0-9A-Z_a-z]" }, + { "\\D", "[^0-9]" }, + { "\\S", "[^\\t-\\n\\f-\\r ]" }, + { "\\W", "[^0-9A-Z_a-z]" }, + { "[\\d]", "[0-9]" }, + { "[\\s]", "[\\t-\\n\\f-\\r ]" }, + { "[\\w]", "[0-9A-Z_a-z]" }, + { "[\\D]", "[^0-9]" }, + { "[\\S]", "[^\\t-\\n\\f-\\r ]" }, + { "[\\W]", "[^0-9A-Z_a-z]" }, + + // Posix repetitions + { "a{1}", "a" }, + { "a{2}", "aa" }, + { "a{5}", "aaaaa" }, + { "a{0,1}", "a?" }, + // The next three are illegible because Simplify inserts (?:) + // parens instead of () parens to avoid creating extra + // captured subexpressions. The comments show a version fewer parens. + { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)? + { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)? + { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)? + { "a{0,2}", "(?:aa?)?" }, // (aa?)? + { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)? + { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)? + { "a{0,}", "a*" }, + { "a{1,}", "a+" }, + { "a{2,}", "aa+" }, + { "a{5,}", "aaaaa+" }, + + // Test that operators simplify their arguments. + // (Simplify used to not simplify arguments to a {} repeat.) + { "(?:a{1,}){1,}", "a+" }, + { "(a{1,}b{1,})", "(a+b+)" }, + { "a{1,}|b{1,}", "a+|b+" }, + { "(?:a{1,})*", "(?:a+)*" }, + { "(?:a{1,})+", "a+" }, + { "(?:a{1,})?", "(?:a+)?" 
}, + { "a{0}", "" }, + + // Character class simplification + { "[ab]", "[a-b]" }, + { "[a-za-za-z]", "[a-z]" }, + { "[A-Za-zA-Za-z]", "[A-Za-z]" }, + { "[ABCDEFGH]", "[A-H]" }, + { "[AB-CD-EF-GH]", "[A-H]" }, + { "[W-ZP-XE-R]", "[E-Z]" }, + { "[a-ee-gg-m]", "[a-m]" }, + { "[a-ea-ha-m]", "[a-m]" }, + { "[a-ma-ha-e]", "[a-m]" }, + { "[a-zA-Z0-9 -~]", "[ -~]" }, + + // Empty character classes + { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" }, + + // Full character classes + { "[[:cntrl:][:^cntrl:]]", "." }, + + // Unicode case folding. + { "(?i)A", "[Aa]" }, + { "(?i)a", "[Aa]" }, + { "(?i)K", "[Kk\\x{212a}]" }, + { "(?i)k", "[Kk\\x{212a}]" }, + { "(?i)\\x{212a}", "[Kk\\x{212a}]" }, + { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" }, + { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" }, + { "(?i)[\\x00-\\x{10ffff}]", "." }, + + // Empty string as a regular expression. + // Empty string must be preserved inside parens in order + // to make submatches work right, so these are less + // interesting than they used to be. ToString inserts + // explicit (?:) in place of non-parenthesized empty strings, + // to make them easier to spot for other parsers. + { "(a|b|)", "([a-b]|(?:))" }, + { "(|)", "((?:)|(?:))" }, + { "a()", "a()" }, + { "(()|())", "(()|())" }, + { "(a|)", "(a|(?:))" }, + { "ab()cd()", "ab()cd()" }, + { "()", "()" }, + { "()*", "()*" }, + { "()+", "()+" }, + { "()?" , "()?" }, + { "(){0}", "" }, + { "(){1}", "()" }, + { "(){1,}", "()+" }, + { "(){0,2}", "(?:()()?)?" }, + + // Test that coalescing occurs and that the resulting repeats are simplified. + // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal: + { "a*a*", "a*" }, + { "a*a+", "a+" }, + { "a*a?", "a*" }, + { "a*a{2}", "aa+" }, + { "a*a{2,}", "aa+" }, + { "a*a{2,3}", "aa+" }, + { "a+a*", "a+" }, + { "a+a+", "aa+" }, + { "a+a?", "a+" }, + { "a+a{2}", "aaa+" }, + { "a+a{2,}", "aaa+" }, + { "a+a{2,3}", "aaa+" }, + { "a?a*", "a*" }, + { "a?a+", "a+" }, + { "a?a?", "(?:aa?)?" 
}, + { "a?a{2}", "aaa?" }, + { "a?a{2,}", "aa+" }, + { "a?a{2,3}", "aa(?:aa?)?" }, + { "a{2}a*", "aa+" }, + { "a{2}a+", "aaa+" }, + { "a{2}a?", "aaa?" }, + { "a{2}a{2}", "aaaa" }, + { "a{2}a{2,}", "aaaa+" }, + { "a{2}a{2,3}", "aaaaa?" }, + { "a{2,}a*", "aa+" }, + { "a{2,}a+", "aaa+" }, + { "a{2,}a?", "aa+" }, + { "a{2,}a{2}", "aaaa+" }, + { "a{2,}a{2,}", "aaaa+" }, + { "a{2,}a{2,3}", "aaaa+" }, + { "a{2,3}a*", "aa+" }, + { "a{2,3}a+", "aaa+" }, + { "a{2,3}a?", "aa(?:aa?)?" }, + { "a{2,3}a{2}", "aaaaa?" }, + { "a{2,3}a{2,}", "aaaa+" }, + { "a{2,3}a{2,3}", "aaaa(?:aa?)?" }, + // With a char class, any char and any byte: + { "\\d*\\d*", "[0-9]*" }, + { ".*.*", ".*" }, + { "\\C*\\C*", "\\C*" }, + // FoldCase works, but must be consistent: + { "(?i)A*a*", "[Aa]*" }, + { "(?i)a+A+", "[Aa][Aa]+" }, + { "(?i)A*(?-i)a*", "[Aa]*a*" }, + { "(?i)a+(?-i)A+", "[Aa]+A+" }, + // NonGreedy works, but must be consistent: + { "a*?a*?", "a*?" }, + { "a+?a+?", "aa+?" }, + { "a*?a*", "a*?a*" }, + { "a+a+?", "a+a+?" 
}, + // The second element is the literal, char class, any char or any byte: + { "a*a", "a+" }, + { "\\d*\\d", "[0-9]+" }, + { ".*.", ".+" }, + { "\\C*\\C", "\\C+" }, + // FoldCase works, but must be consistent: + { "(?i)A*a", "[Aa]+" }, + { "(?i)a+A", "[Aa][Aa]+" }, + { "(?i)A*(?-i)a", "[Aa]*a" }, + { "(?i)a+(?-i)A", "[Aa]+A" }, + // The second element is a literal string that begins with the literal: + { "a*aa", "aa+" }, + { "a*aab", "aa+b" }, + // FoldCase works, but must be consistent: + { "(?i)a*aa", "[Aa][Aa]+" }, + { "(?i)a*aab", "[Aa][Aa]+[Bb]" }, + { "(?i)a*(?-i)aa", "[Aa]*aa" }, + { "(?i)a*(?-i)aab", "[Aa]*aab" }, + // Negative tests with mismatching ops: + { "a*b*", "a*b*" }, + { "\\d*\\D*", "[0-9]*[^0-9]*" }, + { "a+b", "a+b" }, + { "\\d+\\D", "[0-9]+[^0-9]" }, + { "a?bb", "a?bb" }, + // Negative tests with capturing groups: + { "(a*)a*", "(a*)a*" }, + { "a+(a)", "a+(a)" }, + { "(a?)(aa)", "(a?)(aa)" }, + // Just for fun: + { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" }, + + // During coalescing, the child of the repeat changes, so we build a new + // repeat. The new repeat must have the min and max of the old repeat. + // Failure to copy them results in min=0 and max=0 -> empty match. + { "(?:a*aab){2}", "aa+baa+b" }, + + // During coalescing, the child of the capture changes, so we build a new + // capture. The new capture must have the cap of the old capture. + // Failure to copy it results in cap=0 -> ToString() logs a fatal error. + { "(a*aab)", "(aa+b)" }, + + // Test squashing of **, ++, ?? et cetera. + { "(?:(?:a){0,}){0,}", "a*" }, + { "(?:(?:a){1,}){1,}", "a+" }, + { "(?:(?:a){0,1}){0,1}", "a?" 
}, + { "(?:(?:a){0,}){1,}", "a*" }, + { "(?:(?:a){0,}){0,1}", "a*" }, + { "(?:(?:a){1,}){0,}", "a*" }, + { "(?:(?:a){1,}){0,1}", "a*" }, + { "(?:(?:a){0,1}){0,}", "a*" }, + { "(?:(?:a){0,1}){1,}", "a*" }, +}; + +TEST(TestSimplify, SimpleRegexps) { + for (size_t i = 0; i < arraysize(tests); i++) { + RegexpStatus status; + VLOG(1) << "Testing " << tests[i].regexp; + Regexp* re = Regexp::Parse(tests[i].regexp, + Regexp::MatchNL | (Regexp::LikePerl & + ~Regexp::OneLine), + &status); + ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text(); + Regexp* sre = re->Simplify(); + ASSERT_TRUE(sre != NULL); + + // Check that already-simple regexps don't allocate new ones. + if (strcmp(tests[i].regexp, tests[i].simplified) == 0) { + ASSERT_TRUE(re == sre) << " " << tests[i].regexp + << " " << re->ToString() << " " << sre->ToString(); + } + + EXPECT_EQ(tests[i].simplified, sre->ToString()) + << " " << tests[i].regexp << " " << sre->Dump(); + + re->Decref(); + sre->Decref(); + } +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/string_generator.cc b/contrib/libs/re2/re2/testing/string_generator.cc index f42df29c82..96dbbf5d82 100644 --- a/contrib/libs/re2/re2/testing/string_generator.cc +++ b/contrib/libs/re2/re2/testing/string_generator.cc @@ -1,141 +1,141 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// String generator: generates all possible strings of up to -// maxlen letters using the set of letters in alpha. -// Fetch strings using a Java-like Next()/HasNext() interface. 
- -#include <stddef.h> -#include <stdint.h> -#include <string> -#include <vector> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/logging.h" -#include "re2/testing/string_generator.h" - -namespace re2 { - -StringGenerator::StringGenerator(int maxlen, - const std::vector<std::string>& alphabet) - : maxlen_(maxlen), alphabet_(alphabet), - generate_null_(false), - random_(false), nrandom_(0) { - - // Degenerate case: no letters, no non-empty strings. - if (alphabet_.empty()) - maxlen_ = 0; - - // Next() will return empty string (digits_ is empty). - hasnext_ = true; -} - -// Resets the string generator state to the beginning. -void StringGenerator::Reset() { - digits_.clear(); - hasnext_ = true; - random_ = false; - nrandom_ = 0; - generate_null_ = false; -} - -// Increments the big number in digits_, returning true if successful. -// Returns false if all the numbers have been used. -bool StringGenerator::IncrementDigits() { - // First try to increment the current number. - for (int i = static_cast<int>(digits_.size()) - 1; i >= 0; i--) { - if (++digits_[i] < static_cast<int>(alphabet_.size())) - return true; - digits_[i] = 0; - } - - // If that failed, make a longer number. - if (static_cast<int>(digits_.size()) < maxlen_) { - digits_.push_back(0); - return true; - } - - return false; -} - -// Generates random digits_, return true if successful. -// Returns false if the random sequence is over. -bool StringGenerator::RandomDigits() { - if (--nrandom_ <= 0) - return false; - - std::uniform_int_distribution<int> random_len(0, maxlen_); - std::uniform_int_distribution<int> random_alphabet_index( - 0, static_cast<int>(alphabet_.size()) - 1); - - // Pick length. - int len = random_len(rng_); - digits_.resize(len); - for (int i = 0; i < len; i++) - digits_[i] = random_alphabet_index(rng_); - return true; -} - -// Returns the next string in the iteration, which is the one -// currently described by digits_. 
Calls IncrementDigits -// after computing the string, so that it knows the answer -// for subsequent HasNext() calls. -const StringPiece& StringGenerator::Next() { - CHECK(hasnext_); - if (generate_null_) { - generate_null_ = false; - sp_ = StringPiece(); - return sp_; - } - s_.clear(); - for (size_t i = 0; i < digits_.size(); i++) { - s_ += alphabet_[digits_[i]]; - } - hasnext_ = random_ ? RandomDigits() : IncrementDigits(); - sp_ = s_; - return sp_; -} - -// Sets generator up to return n random strings. -void StringGenerator::Random(int32_t seed, int n) { - rng_.seed(seed); - - random_ = true; - nrandom_ = n; - hasnext_ = nrandom_ > 0; -} - -void StringGenerator::GenerateNULL() { - generate_null_ = true; - hasnext_ = true; -} - -std::string DeBruijnString(int n) { - CHECK_GE(n, 1); - CHECK_LE(n, 29); - const size_t size = size_t{1} << static_cast<size_t>(n); - const size_t mask = size - 1; - std::vector<bool> did(size, false); - std::string s; - s.reserve(static_cast<size_t>(n) + size); - for (size_t i = 0; i < static_cast<size_t>(n - 1); i++) - s += '0'; - size_t bits = 0; - for (size_t i = 0; i < size; i++) { - bits <<= 1; - bits &= mask; - if (!did[bits | 1]) { - bits |= 1; - s += '1'; - } else { - s += '0'; - } - CHECK(!did[bits]); - did[bits] = true; - } - CHECK_EQ(s.size(), static_cast<size_t>(n - 1) + size); - return s; -} - -} // namespace re2 +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// String generator: generates all possible strings of up to +// maxlen letters using the set of letters in alpha. +// Fetch strings using a Java-like Next()/HasNext() interface. 
+ +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <vector> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/logging.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +StringGenerator::StringGenerator(int maxlen, + const std::vector<std::string>& alphabet) + : maxlen_(maxlen), alphabet_(alphabet), + generate_null_(false), + random_(false), nrandom_(0) { + + // Degenerate case: no letters, no non-empty strings. + if (alphabet_.empty()) + maxlen_ = 0; + + // Next() will return empty string (digits_ is empty). + hasnext_ = true; +} + +// Resets the string generator state to the beginning. +void StringGenerator::Reset() { + digits_.clear(); + hasnext_ = true; + random_ = false; + nrandom_ = 0; + generate_null_ = false; +} + +// Increments the big number in digits_, returning true if successful. +// Returns false if all the numbers have been used. +bool StringGenerator::IncrementDigits() { + // First try to increment the current number. + for (int i = static_cast<int>(digits_.size()) - 1; i >= 0; i--) { + if (++digits_[i] < static_cast<int>(alphabet_.size())) + return true; + digits_[i] = 0; + } + + // If that failed, make a longer number. + if (static_cast<int>(digits_.size()) < maxlen_) { + digits_.push_back(0); + return true; + } + + return false; +} + +// Generates random digits_, return true if successful. +// Returns false if the random sequence is over. +bool StringGenerator::RandomDigits() { + if (--nrandom_ <= 0) + return false; + + std::uniform_int_distribution<int> random_len(0, maxlen_); + std::uniform_int_distribution<int> random_alphabet_index( + 0, static_cast<int>(alphabet_.size()) - 1); + + // Pick length. + int len = random_len(rng_); + digits_.resize(len); + for (int i = 0; i < len; i++) + digits_[i] = random_alphabet_index(rng_); + return true; +} + +// Returns the next string in the iteration, which is the one +// currently described by digits_. 
Calls IncrementDigits +// after computing the string, so that it knows the answer +// for subsequent HasNext() calls. +const StringPiece& StringGenerator::Next() { + CHECK(hasnext_); + if (generate_null_) { + generate_null_ = false; + sp_ = StringPiece(); + return sp_; + } + s_.clear(); + for (size_t i = 0; i < digits_.size(); i++) { + s_ += alphabet_[digits_[i]]; + } + hasnext_ = random_ ? RandomDigits() : IncrementDigits(); + sp_ = s_; + return sp_; +} + +// Sets generator up to return n random strings. +void StringGenerator::Random(int32_t seed, int n) { + rng_.seed(seed); + + random_ = true; + nrandom_ = n; + hasnext_ = nrandom_ > 0; +} + +void StringGenerator::GenerateNULL() { + generate_null_ = true; + hasnext_ = true; +} + +std::string DeBruijnString(int n) { + CHECK_GE(n, 1); + CHECK_LE(n, 29); + const size_t size = size_t{1} << static_cast<size_t>(n); + const size_t mask = size - 1; + std::vector<bool> did(size, false); + std::string s; + s.reserve(static_cast<size_t>(n) + size); + for (size_t i = 0; i < static_cast<size_t>(n - 1); i++) + s += '0'; + size_t bits = 0; + for (size_t i = 0; i < size; i++) { + bits <<= 1; + bits &= mask; + if (!did[bits | 1]) { + bits |= 1; + s += '1'; + } else { + s += '0'; + } + CHECK(!did[bits]); + did[bits] = true; + } + CHECK_EQ(s.size(), static_cast<size_t>(n - 1) + size); + return s; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/string_generator.h b/contrib/libs/re2/re2/testing/string_generator.h index ff8179bfb7..73fbb51451 100644 --- a/contrib/libs/re2/re2/testing/string_generator.h +++ b/contrib/libs/re2/re2/testing/string_generator.h @@ -1,76 +1,76 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -#ifndef RE2_TESTING_STRING_GENERATOR_H_ -#define RE2_TESTING_STRING_GENERATOR_H_ - -// String generator: generates all possible strings of up to -// maxlen letters using the set of letters in alpha. -// Fetch strings using a Java-like Next()/HasNext() interface. - -#include <stdint.h> -#include <random> -#include <string> -#include <vector> - -#include "util/util.h" -#include "re2/stringpiece.h" - -namespace re2 { - -class StringGenerator { - public: - StringGenerator(int maxlen, const std::vector<std::string>& alphabet); - ~StringGenerator() {} - - const StringPiece& Next(); - bool HasNext() { return hasnext_; } - - // Resets generator to start sequence over. - void Reset(); - - // Causes generator to emit random strings for next n calls to Next(). - void Random(int32_t seed, int n); - - // Causes generator to emit a NULL as the next call. - void GenerateNULL(); - - private: - bool IncrementDigits(); - bool RandomDigits(); - - // Global state. - int maxlen_; // Maximum length string to generate. - std::vector<std::string> alphabet_; // Alphabet, one string per letter. - - // Iteration state. - StringPiece sp_; // Last StringPiece returned by Next(). - std::string s_; // String data in last StringPiece returned by Next(). - bool hasnext_; // Whether Next() can be called again. - std::vector<int> digits_; // Alphabet indices for next string. - bool generate_null_; // Whether to generate a NULL StringPiece next. - bool random_; // Whether generated strings are random. - int nrandom_; // Number of random strings left to generate. - std::minstd_rand0 rng_; // Random number generator. - - StringGenerator(const StringGenerator&) = delete; - StringGenerator& operator=(const StringGenerator&) = delete; -}; - -// Generates and returns a string over binary alphabet {0,1} that contains -// all possible binary sequences of length n as subsequences. 
The obvious -// brute force method would generate a string of length n * 2^n, but this -// generates a string of length n-1 + 2^n called a De Bruijn cycle. -// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. -// -// Such a string is useful for testing a DFA. If you have a DFA -// where distinct last n bytes implies distinct states, then running on a -// DeBruijn string causes the DFA to need to create a new state at every -// position in the input, never reusing any states until it gets to the -// end of the string. This is the worst possible case for DFA execution. -std::string DeBruijnString(int n); - -} // namespace re2 - -#endif // RE2_TESTING_STRING_GENERATOR_H_ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_TESTING_STRING_GENERATOR_H_ +#define RE2_TESTING_STRING_GENERATOR_H_ + +// String generator: generates all possible strings of up to +// maxlen letters using the set of letters in alpha. +// Fetch strings using a Java-like Next()/HasNext() interface. + +#include <stdint.h> +#include <random> +#include <string> +#include <vector> + +#include "util/util.h" +#include "re2/stringpiece.h" + +namespace re2 { + +class StringGenerator { + public: + StringGenerator(int maxlen, const std::vector<std::string>& alphabet); + ~StringGenerator() {} + + const StringPiece& Next(); + bool HasNext() { return hasnext_; } + + // Resets generator to start sequence over. + void Reset(); + + // Causes generator to emit random strings for next n calls to Next(). + void Random(int32_t seed, int n); + + // Causes generator to emit a NULL as the next call. + void GenerateNULL(); + + private: + bool IncrementDigits(); + bool RandomDigits(); + + // Global state. + int maxlen_; // Maximum length string to generate. + std::vector<std::string> alphabet_; // Alphabet, one string per letter. + + // Iteration state. 
+ StringPiece sp_; // Last StringPiece returned by Next(). + std::string s_; // String data in last StringPiece returned by Next(). + bool hasnext_; // Whether Next() can be called again. + std::vector<int> digits_; // Alphabet indices for next string. + bool generate_null_; // Whether to generate a NULL StringPiece next. + bool random_; // Whether generated strings are random. + int nrandom_; // Number of random strings left to generate. + std::minstd_rand0 rng_; // Random number generator. + + StringGenerator(const StringGenerator&) = delete; + StringGenerator& operator=(const StringGenerator&) = delete; +}; + +// Generates and returns a string over binary alphabet {0,1} that contains +// all possible binary sequences of length n as subsequences. The obvious +// brute force method would generate a string of length n * 2^n, but this +// generates a string of length n-1 + 2^n called a De Bruijn cycle. +// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. +// +// Such a string is useful for testing a DFA. If you have a DFA +// where distinct last n bytes implies distinct states, then running on a +// DeBruijn string causes the DFA to need to create a new state at every +// position in the input, never reusing any states until it gets to the +// end of the string. This is the worst possible case for DFA execution. +std::string DeBruijnString(int n); + +} // namespace re2 + +#endif // RE2_TESTING_STRING_GENERATOR_H_ diff --git a/contrib/libs/re2/re2/testing/string_generator_test.cc b/contrib/libs/re2/re2/testing/string_generator_test.cc index 89a3ebaf82..80521568b3 100644 --- a/contrib/libs/re2/re2/testing/string_generator_test.cc +++ b/contrib/libs/re2/re2/testing/string_generator_test.cc @@ -1,110 +1,110 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Test StringGenerator. 
- -#include <stdint.h> -#include <string> - -#include "library/cpp/testing/gtest/gtest.h" -#include "util/utf.h" -#include "re2/testing/string_generator.h" -#include "re2/testing/regexp_generator.h" - -namespace re2 { - -// Returns i to the e. -static int64_t IntegerPower(int i, int e) { - int64_t p = 1; - while (e-- > 0) - p *= i; - return p; -} - -// Checks that for given settings of the string generator: -// * it generates strings that are non-decreasing in length. -// * strings of the same length are sorted in alphabet order. -// * it doesn't generate the same string twice. -// * it generates the right number of strings. -// -// If all of these hold, the StringGenerator is behaving. -// Assumes that the alphabet is sorted, so that the generated -// strings can just be compared lexicographically. -static void RunTest(int len, const std::string& alphabet, bool donull) { - StringGenerator g(len, Explode(alphabet)); - - int n = 0; - int last_l = -1; - std::string last_s; - - if (donull) { - g.GenerateNULL(); - EXPECT_TRUE(g.HasNext()); - StringPiece sp = g.Next(); - EXPECT_EQ(sp.data(), static_cast<const char*>(NULL)); - EXPECT_EQ(sp.size(), 0); - } - - while (g.HasNext()) { - std::string s = std::string(g.Next()); - n++; - - // Check that all characters in s appear in alphabet. - for (const char *p = s.c_str(); *p != '\0'; ) { - Rune r; - p += chartorune(&r, p); - EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL); - } - - // Check that string is properly ordered w.r.t. previous string. - int l = utflen(s.c_str()); - EXPECT_LE(l, len); - if (last_l < l) { - last_l = l; - } else { - EXPECT_EQ(last_l, l); - EXPECT_LT(last_s, s); - } - last_s = s; - } - - // Check total string count. - int64_t m = 0; - int alpha = utflen(alphabet.c_str()); - if (alpha == 0) // Degenerate case. 
- len = 0; - for (int i = 0; i <= len; i++) - m += IntegerPower(alpha, i); - EXPECT_EQ(n, m); -} - -TEST(StringGenerator, NoLength) { - RunTest(0, "abc", false); -} - -TEST(StringGenerator, NoLengthNoAlphabet) { - RunTest(0, "", false); -} - -TEST(StringGenerator, NoAlphabet) { - RunTest(5, "", false); -} - -TEST(StringGenerator, Simple) { - RunTest(3, "abc", false); -} - -TEST(StringGenerator, UTF8) { - RunTest(4, "abc\xE2\x98\xBA", false); -} - -TEST(StringGenerator, GenNULL) { - RunTest(0, "abc", true); - RunTest(0, "", true); - RunTest(5, "", true); - RunTest(3, "abc", true); - RunTest(4, "abc\xE2\x98\xBA", true); -} - -} // namespace re2 +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test StringGenerator. + +#include <stdint.h> +#include <string> + +#include "library/cpp/testing/gtest/gtest.h" +#include "util/utf.h" +#include "re2/testing/string_generator.h" +#include "re2/testing/regexp_generator.h" + +namespace re2 { + +// Returns i to the e. +static int64_t IntegerPower(int i, int e) { + int64_t p = 1; + while (e-- > 0) + p *= i; + return p; +} + +// Checks that for given settings of the string generator: +// * it generates strings that are non-decreasing in length. +// * strings of the same length are sorted in alphabet order. +// * it doesn't generate the same string twice. +// * it generates the right number of strings. +// +// If all of these hold, the StringGenerator is behaving. +// Assumes that the alphabet is sorted, so that the generated +// strings can just be compared lexicographically. 
+static void RunTest(int len, const std::string& alphabet, bool donull) { + StringGenerator g(len, Explode(alphabet)); + + int n = 0; + int last_l = -1; + std::string last_s; + + if (donull) { + g.GenerateNULL(); + EXPECT_TRUE(g.HasNext()); + StringPiece sp = g.Next(); + EXPECT_EQ(sp.data(), static_cast<const char*>(NULL)); + EXPECT_EQ(sp.size(), 0); + } + + while (g.HasNext()) { + std::string s = std::string(g.Next()); + n++; + + // Check that all characters in s appear in alphabet. + for (const char *p = s.c_str(); *p != '\0'; ) { + Rune r; + p += chartorune(&r, p); + EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL); + } + + // Check that string is properly ordered w.r.t. previous string. + int l = utflen(s.c_str()); + EXPECT_LE(l, len); + if (last_l < l) { + last_l = l; + } else { + EXPECT_EQ(last_l, l); + EXPECT_LT(last_s, s); + } + last_s = s; + } + + // Check total string count. + int64_t m = 0; + int alpha = utflen(alphabet.c_str()); + if (alpha == 0) // Degenerate case. + len = 0; + for (int i = 0; i <= len; i++) + m += IntegerPower(alpha, i); + EXPECT_EQ(n, m); +} + +TEST(StringGenerator, NoLength) { + RunTest(0, "abc", false); +} + +TEST(StringGenerator, NoLengthNoAlphabet) { + RunTest(0, "", false); +} + +TEST(StringGenerator, NoAlphabet) { + RunTest(5, "", false); +} + +TEST(StringGenerator, Simple) { + RunTest(3, "abc", false); +} + +TEST(StringGenerator, UTF8) { + RunTest(4, "abc\xE2\x98\xBA", false); +} + +TEST(StringGenerator, GenNULL) { + RunTest(0, "abc", true); + RunTest(0, "", true); + RunTest(5, "", true); + RunTest(3, "abc", true); + RunTest(4, "abc\xE2\x98\xBA", true); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/tester.cc b/contrib/libs/re2/re2/testing/tester.cc index 8c74ed80d7..b0c22f25b2 100644 --- a/contrib/libs/re2/re2/testing/tester.cc +++ b/contrib/libs/re2/re2/testing/tester.cc @@ -1,685 +1,685 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Regular expression engine tester -- test all the implementations against each other. - -#include <stddef.h> -#include <stdint.h> -#include <string.h> -#include <string> - -#include "util/util.h" -#include "util/flags.h" -#include "util/logging.h" -#include "util/strutil.h" -#include "re2/testing/tester.h" -#include "re2/prog.h" -#include "re2/re2.h" -#include "re2/regexp.h" - -DEFINE_FLAG(bool, dump_prog, false, "dump regexp program"); -DEFINE_FLAG(bool, log_okay, false, "log successful runs"); -DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program"); - -DEFINE_FLAG(int, max_regexp_failures, 100, - "maximum number of regexp test failures (-1 = unlimited)"); - -DEFINE_FLAG(std::string, regexp_engines, "", - "pattern to select regexp engines to test"); - -namespace re2 { - -enum { - kMaxSubmatch = 1+16, // $0...$16 -}; - -const char* engine_names[kEngineMax] = { - "Backtrack", - "NFA", - "DFA", - "DFA1", - "OnePass", - "BitState", - "RE2", - "RE2a", - "RE2b", - "PCRE", -}; - -// Returns the name of the engine. -static const char* EngineName(Engine e) { - CHECK_GE(e, 0); - CHECK_LT(e, arraysize(engine_names)); - CHECK(engine_names[e] != NULL); - return engine_names[e]; -} - -// Returns bit mask of engines to use. 
-static uint32_t Engines() { - static bool did_parse = false; - static uint32_t cached_engines = 0; - - if (did_parse) - return cached_engines; - - if (GetFlag(FLAGS_regexp_engines).empty()) { - cached_engines = ~0; - } else { - for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) - if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos) - cached_engines |= 1<<i; - } - - if (cached_engines == 0) - LOG(INFO) << "Warning: no engines enabled."; - if (!UsingPCRE) - cached_engines &= ~(1<<kEnginePCRE); - for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) { - if (cached_engines & (1<<i)) - LOG(INFO) << EngineName(i) << " enabled"; - } - - did_parse = true; - return cached_engines; -} - -// The result of running a match. -struct TestInstance::Result { - Result() - : skipped(false), - matched(false), - untrusted(false), - have_submatch(false), - have_submatch0(false) { - ClearSubmatch(); - } - - void ClearSubmatch() { - for (int i = 0; i < kMaxSubmatch; i++) - submatch[i] = StringPiece(); - } - - bool skipped; // test skipped: wasn't applicable - bool matched; // found a match - bool untrusted; // don't really trust the answer - bool have_submatch; // computed all submatch info - bool have_submatch0; // computed just submatch[0] - StringPiece submatch[kMaxSubmatch]; -}; - -typedef TestInstance::Result Result; - -// Formats a single capture range s in text in the form (a,b) -// where a and b are the starting and ending offsets of s in text. -static std::string FormatCapture(const StringPiece& text, - const StringPiece& s) { - if (s.data() == NULL) - return "(?,?)"; - return StringPrintf("(%td,%td)", +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression engine tester -- test all the implementations against each other. 
+ +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <string> + +#include "util/util.h" +#include "util/flags.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/testing/tester.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" + +DEFINE_FLAG(bool, dump_prog, false, "dump regexp program"); +DEFINE_FLAG(bool, log_okay, false, "log successful runs"); +DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program"); + +DEFINE_FLAG(int, max_regexp_failures, 100, + "maximum number of regexp test failures (-1 = unlimited)"); + +DEFINE_FLAG(std::string, regexp_engines, "", + "pattern to select regexp engines to test"); + +namespace re2 { + +enum { + kMaxSubmatch = 1+16, // $0...$16 +}; + +const char* engine_names[kEngineMax] = { + "Backtrack", + "NFA", + "DFA", + "DFA1", + "OnePass", + "BitState", + "RE2", + "RE2a", + "RE2b", + "PCRE", +}; + +// Returns the name of the engine. +static const char* EngineName(Engine e) { + CHECK_GE(e, 0); + CHECK_LT(e, arraysize(engine_names)); + CHECK(engine_names[e] != NULL); + return engine_names[e]; +} + +// Returns bit mask of engines to use. +static uint32_t Engines() { + static bool did_parse = false; + static uint32_t cached_engines = 0; + + if (did_parse) + return cached_engines; + + if (GetFlag(FLAGS_regexp_engines).empty()) { + cached_engines = ~0; + } else { + for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) + if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos) + cached_engines |= 1<<i; + } + + if (cached_engines == 0) + LOG(INFO) << "Warning: no engines enabled."; + if (!UsingPCRE) + cached_engines &= ~(1<<kEnginePCRE); + for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) { + if (cached_engines & (1<<i)) + LOG(INFO) << EngineName(i) << " enabled"; + } + + did_parse = true; + return cached_engines; +} + +// The result of running a match. 
+struct TestInstance::Result { + Result() + : skipped(false), + matched(false), + untrusted(false), + have_submatch(false), + have_submatch0(false) { + ClearSubmatch(); + } + + void ClearSubmatch() { + for (int i = 0; i < kMaxSubmatch; i++) + submatch[i] = StringPiece(); + } + + bool skipped; // test skipped: wasn't applicable + bool matched; // found a match + bool untrusted; // don't really trust the answer + bool have_submatch; // computed all submatch info + bool have_submatch0; // computed just submatch[0] + StringPiece submatch[kMaxSubmatch]; +}; + +typedef TestInstance::Result Result; + +// Formats a single capture range s in text in the form (a,b) +// where a and b are the starting and ending offsets of s in text. +static std::string FormatCapture(const StringPiece& text, + const StringPiece& s) { + if (s.data() == NULL) + return "(?,?)"; + return StringPrintf("(%td,%td)", BeginPtr(s) - BeginPtr(text), EndPtr(s) - BeginPtr(text)); -} - -// Returns whether text contains non-ASCII (>= 0x80) bytes. -static bool NonASCII(const StringPiece& text) { - for (size_t i = 0; i < text.size(); i++) - if ((uint8_t)text[i] >= 0x80) - return true; - return false; -} - -// Returns string representation of match kind. -static std::string FormatKind(Prog::MatchKind kind) { - switch (kind) { - case Prog::kFullMatch: - return "full match"; - case Prog::kLongestMatch: - return "longest match"; - case Prog::kFirstMatch: - return "first match"; - case Prog::kManyMatch: - return "many match"; - } - return "???"; -} - -// Returns string representation of anchor kind. 
-static std::string FormatAnchor(Prog::Anchor anchor) { - switch (anchor) { - case Prog::kAnchored: - return "anchored"; - case Prog::kUnanchored: - return "unanchored"; - } - return "???"; -} - -struct ParseMode { - Regexp::ParseFlags parse_flags; - std::string desc; -}; - -static const Regexp::ParseFlags single_line = - Regexp::LikePerl; -static const Regexp::ParseFlags multi_line = - static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine); - -static ParseMode parse_modes[] = { - { single_line, "single-line" }, - { single_line|Regexp::Latin1, "single-line, latin1" }, - { multi_line, "multiline" }, - { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, - { multi_line|Regexp::Latin1, "multiline, latin1" }, -}; - -static std::string FormatMode(Regexp::ParseFlags flags) { - for (size_t i = 0; i < arraysize(parse_modes); i++) - if (parse_modes[i].parse_flags == flags) - return parse_modes[i].desc; - return StringPrintf("%#x", static_cast<uint32_t>(flags)); -} - -// Constructs and saves all the matching engines that -// will be required for the given tests. -TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, - Regexp::ParseFlags flags) - : regexp_str_(regexp_str), - kind_(kind), - flags_(flags), - error_(false), - regexp_(NULL), - num_captures_(0), - prog_(NULL), - rprog_(NULL), - re_(NULL), - re2_(NULL) { - - VLOG(1) << CEscape(regexp_str); - - // Compile regexp to prog. - // Always required - needed for backtracking (reference implementation). 
- RegexpStatus status; - regexp_ = Regexp::Parse(regexp_str, flags, &status); - if (regexp_ == NULL) { - LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) - << " mode: " << FormatMode(flags); - error_ = true; - return; - } - num_captures_ = regexp_->NumCaptures(); - prog_ = regexp_->CompileToProg(0); - if (prog_ == NULL) { - LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_); - error_ = true; - return; - } - if (GetFlag(FLAGS_dump_prog)) { - LOG(INFO) << "Prog for " - << " regexp " - << CEscape(regexp_str_) - << " (" << FormatKind(kind_) - << ", " << FormatMode(flags_) - << ")\n" - << prog_->Dump(); - } - - // Compile regexp to reversed prog. Only needed for DFA engines. - if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) { - rprog_ = regexp_->CompileToReverseProg(0); - if (rprog_ == NULL) { - LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_); - error_ = true; - return; - } - if (GetFlag(FLAGS_dump_rprog)) - LOG(INFO) << rprog_->Dump(); - } - - // Create re string that will be used for RE and RE2. - std::string re = std::string(regexp_str); - // Accomodate flags. - // Regexp::Latin1 will be accomodated below. - if (!(flags & Regexp::OneLine)) - re = "(?m)" + re; - if (flags & Regexp::NonGreedy) - re = "(?U)" + re; - if (flags & Regexp::DotNL) - re = "(?s)" + re; - - // Compile regexp to RE2. - if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) { - RE2::Options options; - if (flags & Regexp::Latin1) - options.set_encoding(RE2::Options::EncodingLatin1); - if (kind_ == Prog::kLongestMatch) - options.set_longest_match(true); - re2_ = new RE2(re, options); - if (!re2_->error().empty()) { - LOG(INFO) << "Cannot RE2: " << CEscape(re); - error_ = true; - return; - } - } - - // Compile regexp to RE. - // PCRE as exposed by the RE interface isn't always usable. - // 1. It disagrees about handling of empty-string reptitions - // like matching (a*)* against "b". 
PCRE treats the (a*) as - // occurring once, while we treat it as occurring not at all. - // 2. It treats $ as this weird thing meaning end of string - // or before the \n at the end of the string. - // 3. It doesn't implement POSIX leftmost-longest matching. - // 4. It lets \s match vertical tab. - // MimicsPCRE() detects 1 and 2. - if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && - kind_ != Prog::kLongestMatch) { - PCRE_Options o; - o.set_option(PCRE::UTF8); - if (flags & Regexp::Latin1) - o.set_option(PCRE::None); - // PCRE has interface bug keeping us from finding $0, so - // add one more layer of parens. - re_ = new PCRE("("+re+")", o); - if (!re_->error().empty()) { - LOG(INFO) << "Cannot PCRE: " << CEscape(re); - error_ = true; - return; - } - } -} - -TestInstance::~TestInstance() { - if (regexp_) - regexp_->Decref(); - delete prog_; - delete rprog_; - delete re_; - delete re2_; -} - -// Runs a single search using the named engine type. -// This interface hides all the irregularities of the various -// engine interfaces from the rest of this file. 
-void TestInstance::RunSearch(Engine type, - const StringPiece& orig_text, - const StringPiece& orig_context, - Prog::Anchor anchor, - Result* result) { - if (regexp_ == NULL) { - result->skipped = true; - return; - } - int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0 - if (nsubmatch > kMaxSubmatch) - nsubmatch = kMaxSubmatch; - - StringPiece text = orig_text; - StringPiece context = orig_context; - - switch (type) { - default: - LOG(FATAL) << "Bad RunSearch type: " << (int)type; - - case kEngineBacktrack: - if (prog_ == NULL) { - result->skipped = true; - break; - } - result->matched = - prog_->UnsafeSearchBacktrack(text, context, anchor, kind_, - result->submatch, nsubmatch); - result->have_submatch = true; - break; - - case kEngineNFA: - if (prog_ == NULL) { - result->skipped = true; - break; - } - result->matched = - prog_->SearchNFA(text, context, anchor, kind_, - result->submatch, nsubmatch); - result->have_submatch = true; - break; - - case kEngineDFA: - if (prog_ == NULL) { - result->skipped = true; - break; - } - result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL, - &result->skipped, NULL); - break; - - case kEngineDFA1: - if (prog_ == NULL || rprog_ == NULL) { - result->skipped = true; - break; - } - result->matched = - prog_->SearchDFA(text, context, anchor, kind_, result->submatch, - &result->skipped, NULL); - // If anchored, no need for second run, - // but do it anyway to find more bugs. 
- if (result->matched) { - if (!rprog_->SearchDFA(result->submatch[0], context, - Prog::kAnchored, Prog::kLongestMatch, - result->submatch, - &result->skipped, NULL)) { - LOG(ERROR) << "Reverse DFA inconsistency: " - << CEscape(regexp_str_) - << " on " << CEscape(text); - result->matched = false; - } - } - result->have_submatch0 = true; - break; - - case kEngineOnePass: - if (prog_ == NULL || - !prog_->IsOnePass() || - anchor == Prog::kUnanchored || - nsubmatch > Prog::kMaxOnePassCapture) { - result->skipped = true; - break; - } - result->matched = prog_->SearchOnePass(text, context, anchor, kind_, - result->submatch, nsubmatch); - result->have_submatch = true; - break; - - case kEngineBitState: - if (prog_ == NULL || - !prog_->CanBitState()) { - result->skipped = true; - break; - } - result->matched = prog_->SearchBitState(text, context, anchor, kind_, - result->submatch, nsubmatch); - result->have_submatch = true; - break; - - case kEngineRE2: - case kEngineRE2a: - case kEngineRE2b: { +} + +// Returns whether text contains non-ASCII (>= 0x80) bytes. +static bool NonASCII(const StringPiece& text) { + for (size_t i = 0; i < text.size(); i++) + if ((uint8_t)text[i] >= 0x80) + return true; + return false; +} + +// Returns string representation of match kind. +static std::string FormatKind(Prog::MatchKind kind) { + switch (kind) { + case Prog::kFullMatch: + return "full match"; + case Prog::kLongestMatch: + return "longest match"; + case Prog::kFirstMatch: + return "first match"; + case Prog::kManyMatch: + return "many match"; + } + return "???"; +} + +// Returns string representation of anchor kind. 
+static std::string FormatAnchor(Prog::Anchor anchor) { + switch (anchor) { + case Prog::kAnchored: + return "anchored"; + case Prog::kUnanchored: + return "unanchored"; + } + return "???"; +} + +struct ParseMode { + Regexp::ParseFlags parse_flags; + std::string desc; +}; + +static const Regexp::ParseFlags single_line = + Regexp::LikePerl; +static const Regexp::ParseFlags multi_line = + static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine); + +static ParseMode parse_modes[] = { + { single_line, "single-line" }, + { single_line|Regexp::Latin1, "single-line, latin1" }, + { multi_line, "multiline" }, + { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, + { multi_line|Regexp::Latin1, "multiline, latin1" }, +}; + +static std::string FormatMode(Regexp::ParseFlags flags) { + for (size_t i = 0; i < arraysize(parse_modes); i++) + if (parse_modes[i].parse_flags == flags) + return parse_modes[i].desc; + return StringPrintf("%#x", static_cast<uint32_t>(flags)); +} + +// Constructs and saves all the matching engines that +// will be required for the given tests. +TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, + Regexp::ParseFlags flags) + : regexp_str_(regexp_str), + kind_(kind), + flags_(flags), + error_(false), + regexp_(NULL), + num_captures_(0), + prog_(NULL), + rprog_(NULL), + re_(NULL), + re2_(NULL) { + + VLOG(1) << CEscape(regexp_str); + + // Compile regexp to prog. + // Always required - needed for backtracking (reference implementation). 
+ RegexpStatus status; + regexp_ = Regexp::Parse(regexp_str, flags, &status); + if (regexp_ == NULL) { + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) + << " mode: " << FormatMode(flags); + error_ = true; + return; + } + num_captures_ = regexp_->NumCaptures(); + prog_ = regexp_->CompileToProg(0); + if (prog_ == NULL) { + LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_); + error_ = true; + return; + } + if (GetFlag(FLAGS_dump_prog)) { + LOG(INFO) << "Prog for " + << " regexp " + << CEscape(regexp_str_) + << " (" << FormatKind(kind_) + << ", " << FormatMode(flags_) + << ")\n" + << prog_->Dump(); + } + + // Compile regexp to reversed prog. Only needed for DFA engines. + if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) { + rprog_ = regexp_->CompileToReverseProg(0); + if (rprog_ == NULL) { + LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_); + error_ = true; + return; + } + if (GetFlag(FLAGS_dump_rprog)) + LOG(INFO) << rprog_->Dump(); + } + + // Create re string that will be used for RE and RE2. + std::string re = std::string(regexp_str); + // Accomodate flags. + // Regexp::Latin1 will be accomodated below. + if (!(flags & Regexp::OneLine)) + re = "(?m)" + re; + if (flags & Regexp::NonGreedy) + re = "(?U)" + re; + if (flags & Regexp::DotNL) + re = "(?s)" + re; + + // Compile regexp to RE2. + if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) { + RE2::Options options; + if (flags & Regexp::Latin1) + options.set_encoding(RE2::Options::EncodingLatin1); + if (kind_ == Prog::kLongestMatch) + options.set_longest_match(true); + re2_ = new RE2(re, options); + if (!re2_->error().empty()) { + LOG(INFO) << "Cannot RE2: " << CEscape(re); + error_ = true; + return; + } + } + + // Compile regexp to RE. + // PCRE as exposed by the RE interface isn't always usable. + // 1. It disagrees about handling of empty-string reptitions + // like matching (a*)* against "b". 
PCRE treats the (a*) as + // occurring once, while we treat it as occurring not at all. + // 2. It treats $ as this weird thing meaning end of string + // or before the \n at the end of the string. + // 3. It doesn't implement POSIX leftmost-longest matching. + // 4. It lets \s match vertical tab. + // MimicsPCRE() detects 1 and 2. + if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && + kind_ != Prog::kLongestMatch) { + PCRE_Options o; + o.set_option(PCRE::UTF8); + if (flags & Regexp::Latin1) + o.set_option(PCRE::None); + // PCRE has interface bug keeping us from finding $0, so + // add one more layer of parens. + re_ = new PCRE("("+re+")", o); + if (!re_->error().empty()) { + LOG(INFO) << "Cannot PCRE: " << CEscape(re); + error_ = true; + return; + } + } +} + +TestInstance::~TestInstance() { + if (regexp_) + regexp_->Decref(); + delete prog_; + delete rprog_; + delete re_; + delete re2_; +} + +// Runs a single search using the named engine type. +// This interface hides all the irregularities of the various +// engine interfaces from the rest of this file. 
+void TestInstance::RunSearch(Engine type, + const StringPiece& orig_text, + const StringPiece& orig_context, + Prog::Anchor anchor, + Result* result) { + if (regexp_ == NULL) { + result->skipped = true; + return; + } + int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0 + if (nsubmatch > kMaxSubmatch) + nsubmatch = kMaxSubmatch; + + StringPiece text = orig_text; + StringPiece context = orig_context; + + switch (type) { + default: + LOG(FATAL) << "Bad RunSearch type: " << (int)type; + + case kEngineBacktrack: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->UnsafeSearchBacktrack(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineNFA: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->SearchNFA(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineDFA: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL, + &result->skipped, NULL); + break; + + case kEngineDFA1: + if (prog_ == NULL || rprog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->SearchDFA(text, context, anchor, kind_, result->submatch, + &result->skipped, NULL); + // If anchored, no need for second run, + // but do it anyway to find more bugs. 
+ if (result->matched) { + if (!rprog_->SearchDFA(result->submatch[0], context, + Prog::kAnchored, Prog::kLongestMatch, + result->submatch, + &result->skipped, NULL)) { + LOG(ERROR) << "Reverse DFA inconsistency: " + << CEscape(regexp_str_) + << " on " << CEscape(text); + result->matched = false; + } + } + result->have_submatch0 = true; + break; + + case kEngineOnePass: + if (prog_ == NULL || + !prog_->IsOnePass() || + anchor == Prog::kUnanchored || + nsubmatch > Prog::kMaxOnePassCapture) { + result->skipped = true; + break; + } + result->matched = prog_->SearchOnePass(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineBitState: + if (prog_ == NULL || + !prog_->CanBitState()) { + result->skipped = true; + break; + } + result->matched = prog_->SearchBitState(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineRE2: + case kEngineRE2a: + case kEngineRE2b: { if (!re2_ || EndPtr(text) != EndPtr(context)) { - result->skipped = true; - break; - } - - RE2::Anchor re_anchor; - if (anchor == Prog::kAnchored) - re_anchor = RE2::ANCHOR_START; - else - re_anchor = RE2::UNANCHORED; - if (kind_ == Prog::kFullMatch) - re_anchor = RE2::ANCHOR_BOTH; - - result->matched = re2_->Match( - context, + result->skipped = true; + break; + } + + RE2::Anchor re_anchor; + if (anchor == Prog::kAnchored) + re_anchor = RE2::ANCHOR_START; + else + re_anchor = RE2::UNANCHORED; + if (kind_ == Prog::kFullMatch) + re_anchor = RE2::ANCHOR_BOTH; + + result->matched = re2_->Match( + context, static_cast<size_t>(BeginPtr(text) - BeginPtr(context)), static_cast<size_t>(EndPtr(text) - BeginPtr(context)), - re_anchor, - result->submatch, - nsubmatch); - result->have_submatch = nsubmatch > 0; - break; - } - - case kEnginePCRE: { + re_anchor, + result->submatch, + nsubmatch); + result->have_submatch = nsubmatch > 0; + break; + } + + case kEnginePCRE: { if (!re_ || 
BeginPtr(text) != BeginPtr(context) || EndPtr(text) != EndPtr(context)) { - result->skipped = true; - break; - } - - // In Perl/PCRE, \v matches any character considered vertical - // whitespace, not just vertical tab. Regexp::MimicsPCRE() is - // unable to handle all cases of this, unfortunately, so just - // catch them here. :( - if (regexp_str_.find("\\v") != StringPiece::npos && - (text.find('\n') != StringPiece::npos || - text.find('\f') != StringPiece::npos || - text.find('\r') != StringPiece::npos)) { - result->skipped = true; - break; - } - - // PCRE 8.34 or so started allowing vertical tab to match \s, - // following a change made in Perl 5.18. RE2 does not. - if ((regexp_str_.find("\\s") != StringPiece::npos || - regexp_str_.find("\\S") != StringPiece::npos) && - text.find('\v') != StringPiece::npos) { - result->skipped = true; - break; - } - - const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; - PCRE::Arg *a = new PCRE::Arg[nsubmatch]; - for (int i = 0; i < nsubmatch; i++) { - a[i] = PCRE::Arg(&result->submatch[i]); - argptr[i] = &a[i]; - } - size_t consumed; - PCRE::Anchor pcre_anchor; - if (anchor == Prog::kAnchored) - pcre_anchor = PCRE::ANCHOR_START; - else - pcre_anchor = PCRE::UNANCHORED; - if (kind_ == Prog::kFullMatch) - pcre_anchor = PCRE::ANCHOR_BOTH; - re_->ClearHitLimit(); - result->matched = - re_->DoMatch(text, - pcre_anchor, - &consumed, - argptr, nsubmatch); - if (re_->HitLimit()) { - result->untrusted = true; - delete[] argptr; - delete[] a; - break; - } - result->have_submatch = true; - delete[] argptr; - delete[] a; - break; - } - } - - if (!result->matched) - result->ClearSubmatch(); -} - -// Checks whether r is okay given that correct is the right answer. -// Specifically, r's answers have to match (but it doesn't have to -// claim to have all the answers). 
-static bool ResultOkay(const Result& r, const Result& correct) { - if (r.skipped) - return true; - if (r.matched != correct.matched) - return false; - if (r.have_submatch || r.have_submatch0) { - for (int i = 0; i < kMaxSubmatch; i++) { - if (correct.submatch[i].data() != r.submatch[i].data() || - correct.submatch[i].size() != r.submatch[i].size()) - return false; - if (!r.have_submatch) - break; - } - } - return true; -} - -// Runs a single test. -bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, - Prog::Anchor anchor) { - // Backtracking is the gold standard. - Result correct; - RunSearch(kEngineBacktrack, text, context, anchor, &correct); - if (correct.skipped) { - if (regexp_ == NULL) - return true; - LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) - << " " << FormatMode(flags_); - return false; - } - VLOG(1) << "Try: regexp " << CEscape(regexp_str_) - << " text " << CEscape(text) - << " (" << FormatKind(kind_) - << ", " << FormatAnchor(anchor) - << ", " << FormatMode(flags_) - << ")"; - - // Compare the others. - bool all_okay = true; - for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) { - if (!(Engines() & (1<<i))) - continue; - - Result r; - RunSearch(i, text, context, anchor, &r); - if (ResultOkay(r, correct)) { - if (GetFlag(FLAGS_log_okay)) - LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); - continue; - } - - // We disagree with PCRE on the meaning of some Unicode matches. - // In particular, we treat non-ASCII UTF-8 as non-word characters. - // We also treat "empty" character sets like [^\w\W] as being - // impossible to match, while PCRE apparently excludes some code - // points (e.g., 0x0080) from both \w and \W. - if (i == kEnginePCRE && NonASCII(text)) - continue; - - if (!r.untrusted) - all_okay = false; - - LogMatch(r.untrusted ? 
"(Untrusted) Mismatch: " : "Mismatch: ", i, text, - context, anchor); - if (r.matched != correct.matched) { - if (r.matched) { - LOG(INFO) << " Should not match (but does)."; - } else { - LOG(INFO) << " Should match (but does not)."; - continue; - } - } - for (int i = 0; i < 1+num_captures_; i++) { - if (r.submatch[i].data() != correct.submatch[i].data() || - r.submatch[i].size() != correct.submatch[i].size()) { - LOG(INFO) << - StringPrintf(" $%d: should be %s is %s", - i, - FormatCapture(text, correct.submatch[i]).c_str(), - FormatCapture(text, r.submatch[i]).c_str()); - } else { - LOG(INFO) << - StringPrintf(" $%d: %s ok", i, - FormatCapture(text, r.submatch[i]).c_str()); - } - } - } - - if (!all_okay) { - // This will be initialised once (after flags have been initialised) - // and that is desirable because we want to enforce a global limit. - static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures); - if (max_regexp_failures > 0 && --max_regexp_failures == 0) - LOG(QFATAL) << "Too many regexp failures."; - } - - return all_okay; -} - -void TestInstance::LogMatch(const char* prefix, Engine e, - const StringPiece& text, const StringPiece& context, - Prog::Anchor anchor) { - LOG(INFO) << prefix - << EngineName(e) - << " regexp " - << CEscape(regexp_str_) - << " " - << CEscape(regexp_->ToString()) - << " text " - << CEscape(text) - << " (" + result->skipped = true; + break; + } + + // In Perl/PCRE, \v matches any character considered vertical + // whitespace, not just vertical tab. Regexp::MimicsPCRE() is + // unable to handle all cases of this, unfortunately, so just + // catch them here. :( + if (regexp_str_.find("\\v") != StringPiece::npos && + (text.find('\n') != StringPiece::npos || + text.find('\f') != StringPiece::npos || + text.find('\r') != StringPiece::npos)) { + result->skipped = true; + break; + } + + // PCRE 8.34 or so started allowing vertical tab to match \s, + // following a change made in Perl 5.18. RE2 does not. 
+ if ((regexp_str_.find("\\s") != StringPiece::npos || + regexp_str_.find("\\S") != StringPiece::npos) && + text.find('\v') != StringPiece::npos) { + result->skipped = true; + break; + } + + const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; + PCRE::Arg *a = new PCRE::Arg[nsubmatch]; + for (int i = 0; i < nsubmatch; i++) { + a[i] = PCRE::Arg(&result->submatch[i]); + argptr[i] = &a[i]; + } + size_t consumed; + PCRE::Anchor pcre_anchor; + if (anchor == Prog::kAnchored) + pcre_anchor = PCRE::ANCHOR_START; + else + pcre_anchor = PCRE::UNANCHORED; + if (kind_ == Prog::kFullMatch) + pcre_anchor = PCRE::ANCHOR_BOTH; + re_->ClearHitLimit(); + result->matched = + re_->DoMatch(text, + pcre_anchor, + &consumed, + argptr, nsubmatch); + if (re_->HitLimit()) { + result->untrusted = true; + delete[] argptr; + delete[] a; + break; + } + result->have_submatch = true; + delete[] argptr; + delete[] a; + break; + } + } + + if (!result->matched) + result->ClearSubmatch(); +} + +// Checks whether r is okay given that correct is the right answer. +// Specifically, r's answers have to match (but it doesn't have to +// claim to have all the answers). +static bool ResultOkay(const Result& r, const Result& correct) { + if (r.skipped) + return true; + if (r.matched != correct.matched) + return false; + if (r.have_submatch || r.have_submatch0) { + for (int i = 0; i < kMaxSubmatch; i++) { + if (correct.submatch[i].data() != r.submatch[i].data() || + correct.submatch[i].size() != r.submatch[i].size()) + return false; + if (!r.have_submatch) + break; + } + } + return true; +} + +// Runs a single test. +bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + // Backtracking is the gold standard. + Result correct; + RunSearch(kEngineBacktrack, text, context, anchor, &correct); + if (correct.skipped) { + if (regexp_ == NULL) + return true; + LOG(ERROR) << "Skipped backtracking! 
" << CEscape(regexp_str_) + << " " << FormatMode(flags_); + return false; + } + VLOG(1) << "Try: regexp " << CEscape(regexp_str_) + << " text " << CEscape(text) + << " (" << FormatKind(kind_) + << ", " << FormatAnchor(anchor) + << ", " << FormatMode(flags_) + << ")"; + + // Compare the others. + bool all_okay = true; + for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) { + if (!(Engines() & (1<<i))) + continue; + + Result r; + RunSearch(i, text, context, anchor, &r); + if (ResultOkay(r, correct)) { + if (GetFlag(FLAGS_log_okay)) + LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); + continue; + } + + // We disagree with PCRE on the meaning of some Unicode matches. + // In particular, we treat non-ASCII UTF-8 as non-word characters. + // We also treat "empty" character sets like [^\w\W] as being + // impossible to match, while PCRE apparently excludes some code + // points (e.g., 0x0080) from both \w and \W. + if (i == kEnginePCRE && NonASCII(text)) + continue; + + if (!r.untrusted) + all_okay = false; + + LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text, + context, anchor); + if (r.matched != correct.matched) { + if (r.matched) { + LOG(INFO) << " Should not match (but does)."; + } else { + LOG(INFO) << " Should match (but does not)."; + continue; + } + } + for (int i = 0; i < 1+num_captures_; i++) { + if (r.submatch[i].data() != correct.submatch[i].data() || + r.submatch[i].size() != correct.submatch[i].size()) { + LOG(INFO) << + StringPrintf(" $%d: should be %s is %s", + i, + FormatCapture(text, correct.submatch[i]).c_str(), + FormatCapture(text, r.submatch[i]).c_str()); + } else { + LOG(INFO) << + StringPrintf(" $%d: %s ok", i, + FormatCapture(text, r.submatch[i]).c_str()); + } + } + } + + if (!all_okay) { + // This will be initialised once (after flags have been initialised) + // and that is desirable because we want to enforce a global limit. 
+ static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures); + if (max_regexp_failures > 0 && --max_regexp_failures == 0) + LOG(QFATAL) << "Too many regexp failures."; + } + + return all_okay; +} + +void TestInstance::LogMatch(const char* prefix, Engine e, + const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + LOG(INFO) << prefix + << EngineName(e) + << " regexp " + << CEscape(regexp_str_) + << " " + << CEscape(regexp_->ToString()) + << " text " + << CEscape(text) + << " (" << BeginPtr(text) - BeginPtr(context) - << "," + << "," << EndPtr(text) - BeginPtr(context) - << ") of context " - << CEscape(context) - << " (" << FormatKind(kind_) - << ", " << FormatAnchor(anchor) - << ", " << FormatMode(flags_) - << ")"; -} - -static Prog::MatchKind kinds[] = { - Prog::kFirstMatch, - Prog::kLongestMatch, - Prog::kFullMatch, -}; - -// Test all possible match kinds and parse modes. -Tester::Tester(const StringPiece& regexp) { - error_ = false; - for (size_t i = 0; i < arraysize(kinds); i++) { - for (size_t j = 0; j < arraysize(parse_modes); j++) { - TestInstance* t = new TestInstance(regexp, kinds[i], - parse_modes[j].parse_flags); - error_ |= t->error(); - v_.push_back(t); - } - } -} - -Tester::~Tester() { - for (size_t i = 0; i < v_.size(); i++) - delete v_[i]; -} - -bool Tester::TestCase(const StringPiece& text, const StringPiece& context, - Prog::Anchor anchor) { - bool okay = true; - for (size_t i = 0; i < v_.size(); i++) - okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); - return okay; -} - -static Prog::Anchor anchors[] = { - Prog::kAnchored, - Prog::kUnanchored -}; - -bool Tester::TestInput(const StringPiece& text) { - bool okay = TestInputInContext(text, text); - if (!text.empty()) { - StringPiece sp; - sp = text; - sp.remove_prefix(1); - okay &= TestInputInContext(sp, text); - sp = text; - sp.remove_suffix(1); - okay &= TestInputInContext(sp, text); - } - return okay; -} - -bool Tester::TestInputInContext(const 
StringPiece& text, - const StringPiece& context) { - bool okay = true; - for (size_t i = 0; i < arraysize(anchors); i++) - okay &= TestCase(text, context, anchors[i]); - return okay; -} - -bool TestRegexpOnText(const StringPiece& regexp, - const StringPiece& text) { - Tester t(regexp); - return t.TestInput(text); -} - -} // namespace re2 + << ") of context " + << CEscape(context) + << " (" << FormatKind(kind_) + << ", " << FormatAnchor(anchor) + << ", " << FormatMode(flags_) + << ")"; +} + +static Prog::MatchKind kinds[] = { + Prog::kFirstMatch, + Prog::kLongestMatch, + Prog::kFullMatch, +}; + +// Test all possible match kinds and parse modes. +Tester::Tester(const StringPiece& regexp) { + error_ = false; + for (size_t i = 0; i < arraysize(kinds); i++) { + for (size_t j = 0; j < arraysize(parse_modes); j++) { + TestInstance* t = new TestInstance(regexp, kinds[i], + parse_modes[j].parse_flags); + error_ |= t->error(); + v_.push_back(t); + } + } +} + +Tester::~Tester() { + for (size_t i = 0; i < v_.size(); i++) + delete v_[i]; +} + +bool Tester::TestCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + bool okay = true; + for (size_t i = 0; i < v_.size(); i++) + okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); + return okay; +} + +static Prog::Anchor anchors[] = { + Prog::kAnchored, + Prog::kUnanchored +}; + +bool Tester::TestInput(const StringPiece& text) { + bool okay = TestInputInContext(text, text); + if (!text.empty()) { + StringPiece sp; + sp = text; + sp.remove_prefix(1); + okay &= TestInputInContext(sp, text); + sp = text; + sp.remove_suffix(1); + okay &= TestInputInContext(sp, text); + } + return okay; +} + +bool Tester::TestInputInContext(const StringPiece& text, + const StringPiece& context) { + bool okay = true; + for (size_t i = 0; i < arraysize(anchors); i++) + okay &= TestCase(text, context, anchors[i]); + return okay; +} + +bool TestRegexpOnText(const StringPiece& regexp, + const StringPiece& text) { + 
Tester t(regexp); + return t.TestInput(text); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/testing/tester.h b/contrib/libs/re2/re2/testing/tester.h index 1ddab2c5ce..47d0c4304f 100644 --- a/contrib/libs/re2/re2/testing/tester.h +++ b/contrib/libs/re2/re2/testing/tester.h @@ -1,123 +1,123 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef RE2_TESTING_TESTER_H_ -#define RE2_TESTING_TESTER_H_ - -// Comparative tester for regular expression matching. -// Checks all implementations against each other. - -#include <vector> - -#include "re2/stringpiece.h" -#include "re2/prog.h" -#include "re2/regexp.h" -#include "re2/re2.h" -#include "util/pcre.h" - -namespace re2 { - -// All the supported regexp engines. -enum Engine { - kEngineBacktrack = 0, // Prog::UnsafeSearchBacktrack - kEngineNFA, // Prog::SearchNFA - kEngineDFA, // Prog::SearchDFA, only ask whether it matched - kEngineDFA1, // Prog::SearchDFA, ask for match[0] - kEngineOnePass, // Prog::SearchOnePass, if applicable - kEngineBitState, // Prog::SearchBitState - kEngineRE2, // RE2, all submatches - kEngineRE2a, // RE2, only ask for match[0] - kEngineRE2b, // RE2, only ask whether it matched - kEnginePCRE, // PCRE (util/pcre.h) - - kEngineMax, -}; - -// Make normal math on the enum preserve the type. -// By default, C++ doesn't define ++ on enum, and e+1 has type int. -static inline void operator++(Engine& e, int unused) { - e = static_cast<Engine>(e+1); -} - -static inline Engine operator+(Engine e, int i) { - return static_cast<Engine>(static_cast<int>(e)+i); -} - -// A TestInstance caches per-regexp state for a given -// regular expression in a given configuration -// (UTF-8 vs Latin1, longest vs first match, etc.). 
-class TestInstance { - public: - struct Result; - - TestInstance(const StringPiece& regexp, Prog::MatchKind kind, - Regexp::ParseFlags flags); - ~TestInstance(); - Regexp::ParseFlags flags() { return flags_; } - bool error() { return error_; } - - // Runs a single test case: search in text, which is in context, - // using the given anchoring. - bool RunCase(const StringPiece& text, const StringPiece& context, - Prog::Anchor anchor); - - private: - // Runs a single search using the named engine type. - void RunSearch(Engine type, - const StringPiece& text, const StringPiece& context, - Prog::Anchor anchor, - Result *result); - - void LogMatch(const char* prefix, Engine e, const StringPiece& text, - const StringPiece& context, Prog::Anchor anchor); - - const StringPiece regexp_str_; // regexp being tested - Prog::MatchKind kind_; // kind of match - Regexp::ParseFlags flags_; // flags for parsing regexp_str_ - bool error_; // error during constructor? - - Regexp* regexp_; // parsed regexp - int num_captures_; // regexp_->NumCaptures() cached - Prog* prog_; // compiled program - Prog* rprog_; // compiled reverse program - PCRE* re_; // PCRE implementation - RE2* re2_; // RE2 implementation - - TestInstance(const TestInstance&) = delete; - TestInstance& operator=(const TestInstance&) = delete; -}; - -// A group of TestInstances for all possible configurations. -class Tester { - public: - explicit Tester(const StringPiece& regexp); - ~Tester(); - - bool error() { return error_; } - - // Runs a single test case: search in text, which is in context, - // using the given anchoring. - bool TestCase(const StringPiece& text, const StringPiece& context, - Prog::Anchor anchor); - - // Run TestCase(text, text, anchor) for all anchoring modes. - bool TestInput(const StringPiece& text); - - // Run TestCase(text, context, anchor) for all anchoring modes. 
- bool TestInputInContext(const StringPiece& text, const StringPiece& context); - - private: - bool error_; - std::vector<TestInstance*> v_; - - Tester(const Tester&) = delete; - Tester& operator=(const Tester&) = delete; -}; - -// Run all possible tests using regexp and text. -bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text); - -} // namespace re2 - -#endif // RE2_TESTING_TESTER_H_ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_TESTING_TESTER_H_ +#define RE2_TESTING_TESTER_H_ + +// Comparative tester for regular expression matching. +// Checks all implementations against each other. + +#include <vector> + +#include "re2/stringpiece.h" +#include "re2/prog.h" +#include "re2/regexp.h" +#include "re2/re2.h" +#include "util/pcre.h" + +namespace re2 { + +// All the supported regexp engines. +enum Engine { + kEngineBacktrack = 0, // Prog::UnsafeSearchBacktrack + kEngineNFA, // Prog::SearchNFA + kEngineDFA, // Prog::SearchDFA, only ask whether it matched + kEngineDFA1, // Prog::SearchDFA, ask for match[0] + kEngineOnePass, // Prog::SearchOnePass, if applicable + kEngineBitState, // Prog::SearchBitState + kEngineRE2, // RE2, all submatches + kEngineRE2a, // RE2, only ask for match[0] + kEngineRE2b, // RE2, only ask whether it matched + kEnginePCRE, // PCRE (util/pcre.h) + + kEngineMax, +}; + +// Make normal math on the enum preserve the type. +// By default, C++ doesn't define ++ on enum, and e+1 has type int. +static inline void operator++(Engine& e, int unused) { + e = static_cast<Engine>(e+1); +} + +static inline Engine operator+(Engine e, int i) { + return static_cast<Engine>(static_cast<int>(e)+i); +} + +// A TestInstance caches per-regexp state for a given +// regular expression in a given configuration +// (UTF-8 vs Latin1, longest vs first match, etc.). 
+class TestInstance { + public: + struct Result; + + TestInstance(const StringPiece& regexp, Prog::MatchKind kind, + Regexp::ParseFlags flags); + ~TestInstance(); + Regexp::ParseFlags flags() { return flags_; } + bool error() { return error_; } + + // Runs a single test case: search in text, which is in context, + // using the given anchoring. + bool RunCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor); + + private: + // Runs a single search using the named engine type. + void RunSearch(Engine type, + const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor, + Result *result); + + void LogMatch(const char* prefix, Engine e, const StringPiece& text, + const StringPiece& context, Prog::Anchor anchor); + + const StringPiece regexp_str_; // regexp being tested + Prog::MatchKind kind_; // kind of match + Regexp::ParseFlags flags_; // flags for parsing regexp_str_ + bool error_; // error during constructor? + + Regexp* regexp_; // parsed regexp + int num_captures_; // regexp_->NumCaptures() cached + Prog* prog_; // compiled program + Prog* rprog_; // compiled reverse program + PCRE* re_; // PCRE implementation + RE2* re2_; // RE2 implementation + + TestInstance(const TestInstance&) = delete; + TestInstance& operator=(const TestInstance&) = delete; +}; + +// A group of TestInstances for all possible configurations. +class Tester { + public: + explicit Tester(const StringPiece& regexp); + ~Tester(); + + bool error() { return error_; } + + // Runs a single test case: search in text, which is in context, + // using the given anchoring. + bool TestCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor); + + // Run TestCase(text, text, anchor) for all anchoring modes. + bool TestInput(const StringPiece& text); + + // Run TestCase(text, context, anchor) for all anchoring modes. 
+ bool TestInputInContext(const StringPiece& text, const StringPiece& context); + + private: + bool error_; + std::vector<TestInstance*> v_; + + Tester(const Tester&) = delete; + Tester& operator=(const Tester&) = delete; +}; + +// Run all possible tests using regexp and text. +bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text); + +} // namespace re2 + +#endif // RE2_TESTING_TESTER_H_ diff --git a/contrib/libs/re2/re2/testing/ya.make b/contrib/libs/re2/re2/testing/ya.make index 5436c70dbb..df9023fee5 100644 --- a/contrib/libs/re2/re2/testing/ya.make +++ b/contrib/libs/re2/re2/testing/ya.make @@ -1,50 +1,50 @@ -# Generated by devtools/yamaker. - -GTEST() - -OWNER(g:cpp-contrib) - -LICENSE(BSD-3-Clause) - +# Generated by devtools/yamaker. + +GTEST() + +OWNER(g:cpp-contrib) + +LICENSE(BSD-3-Clause) + LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -PEERDIR( - contrib/libs/re2 -) - +PEERDIR( + contrib/libs/re2 +) + ADDINCL( contrib/libs/re2 ) - -NO_COMPILER_WARNINGS() - -NO_UTIL() - -SRCDIR(contrib/libs/re2) - -SRCS( - re2/testing/backtrack.cc - re2/testing/charclass_test.cc - re2/testing/compile_test.cc - re2/testing/dump.cc - re2/testing/exhaustive_tester.cc - re2/testing/filtered_re2_test.cc - re2/testing/mimics_pcre_test.cc - re2/testing/null_walker.cc - re2/testing/parse_test.cc - re2/testing/possible_match_test.cc - re2/testing/re2_arg_test.cc - re2/testing/re2_test.cc - re2/testing/regexp_generator.cc - re2/testing/regexp_test.cc - re2/testing/required_prefix_test.cc - re2/testing/search_test.cc - re2/testing/set_test.cc - re2/testing/simplify_test.cc - re2/testing/string_generator.cc - re2/testing/string_generator_test.cc - re2/testing/tester.cc - util/pcre.cc -) - -END() + +NO_COMPILER_WARNINGS() + +NO_UTIL() + +SRCDIR(contrib/libs/re2) + +SRCS( + re2/testing/backtrack.cc + re2/testing/charclass_test.cc + re2/testing/compile_test.cc + re2/testing/dump.cc + re2/testing/exhaustive_tester.cc + re2/testing/filtered_re2_test.cc + 
re2/testing/mimics_pcre_test.cc + re2/testing/null_walker.cc + re2/testing/parse_test.cc + re2/testing/possible_match_test.cc + re2/testing/re2_arg_test.cc + re2/testing/re2_test.cc + re2/testing/regexp_generator.cc + re2/testing/regexp_test.cc + re2/testing/required_prefix_test.cc + re2/testing/search_test.cc + re2/testing/set_test.cc + re2/testing/simplify_test.cc + re2/testing/string_generator.cc + re2/testing/string_generator_test.cc + re2/testing/tester.cc + util/pcre.cc +) + +END() diff --git a/contrib/libs/re2/re2/tostring.cc b/contrib/libs/re2/re2/tostring.cc index 255aa94820..9c1c038ca6 100644 --- a/contrib/libs/re2/re2/tostring.cc +++ b/contrib/libs/re2/re2/tostring.cc @@ -28,7 +28,7 @@ enum { }; // Helper function. See description below. -static void AppendCCRange(std::string* t, Rune lo, Rune hi); +static void AppendCCRange(std::string* t, Rune lo, Rune hi); // Walker to generate string in s_. // The arg pointers are actually integers giving the @@ -36,7 +36,7 @@ static void AppendCCRange(std::string* t, Rune lo, Rune hi); // The child_args are always NULL. class ToStringWalker : public Regexp::Walker<int> { public: - explicit ToStringWalker(std::string* t) : t_(t) {} + explicit ToStringWalker(std::string* t) : t_(t) {} virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, @@ -46,14 +46,14 @@ class ToStringWalker : public Regexp::Walker<int> { } private: - std::string* t_; // The string the walker appends to. + std::string* t_; // The string the walker appends to. 
ToStringWalker(const ToStringWalker&) = delete; ToStringWalker& operator=(const ToStringWalker&) = delete; }; -std::string Regexp::ToString() { - std::string t; +std::string Regexp::ToString() { + std::string t; ToStringWalker w(&t); w.WalkExponential(this, PrecToplevel, 100000); if (w.stopped_early()) @@ -126,7 +126,7 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { return nprec; } -static void AppendLiteral(std::string *t, Rune r, bool foldcase) { +static void AppendLiteral(std::string *t, Rune r, bool foldcase) { if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { t->append(1, '\\'); t->append(1, static_cast<char>(r)); @@ -269,9 +269,9 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, } t_->append("["); // Heuristic: show class as negated if it contains the - // non-character 0xFFFE and yet somehow isn't full. + // non-character 0xFFFE and yet somehow isn't full. CharClass* cc = re->cc(); - if (cc->Contains(0xFFFE) && !cc->full()) { + if (cc->Contains(0xFFFE) && !cc->full()) { cc = cc->Negate(); t_->append("^"); } @@ -291,7 +291,7 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, // There's no syntax accepted by the parser to generate // this node (it is generated by RE2::Set) so make something // up that is readable but won't compile. - t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id())); + t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id())); break; } @@ -303,7 +303,7 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, } // Appends a rune for use in a character class to the string t. 
-static void AppendCCChar(std::string* t, Rune r) { +static void AppendCCChar(std::string* t, Rune r) { if (0x20 <= r && r <= 0x7E) { if (strchr("[]^-\\", r)) t->append("\\"); @@ -332,13 +332,13 @@ static void AppendCCChar(std::string* t, Rune r) { } if (r < 0x100) { - *t += StringPrintf("\\x%02x", static_cast<int>(r)); + *t += StringPrintf("\\x%02x", static_cast<int>(r)); return; } - *t += StringPrintf("\\x{%x}", static_cast<int>(r)); + *t += StringPrintf("\\x{%x}", static_cast<int>(r)); } -static void AppendCCRange(std::string* t, Rune lo, Rune hi) { +static void AppendCCRange(std::string* t, Rune lo, Rune hi) { if (lo > hi) return; AppendCCChar(t, lo); diff --git a/contrib/libs/re2/re2/unicode_casefold.cc b/contrib/libs/re2/re2/unicode_casefold.cc index f6899c6cf6..d9de2821d5 100644 --- a/contrib/libs/re2/re2/unicode_casefold.cc +++ b/contrib/libs/re2/re2/unicode_casefold.cc @@ -113,7 +113,7 @@ const CaseFold unicode_casefold[] = { { 614, 614, 42308 }, { 616, 616, -209 }, { 617, 617, -211 }, - { 618, 618, 42308 }, + { 618, 618, 42308 }, { 619, 619, 10743 }, { 620, 620, 42305 }, { 623, 623, -211 }, @@ -122,7 +122,7 @@ const CaseFold unicode_casefold[] = { { 629, 629, -214 }, { 637, 637, 10727 }, { 640, 640, -218 }, - { 642, 642, 42307 }, + { 642, 642, 42307 }, { 643, 643, -218 }, { 647, 647, 42282 }, { 648, 648, -218 }, @@ -186,21 +186,21 @@ const CaseFold unicode_casefold[] = { { 1021, 1023, -130 }, { 1024, 1039, 80 }, { 1040, 1071, 32 }, - { 1072, 1073, -32 }, - { 1074, 1074, 6222 }, - { 1075, 1075, -32 }, - { 1076, 1076, 6221 }, - { 1077, 1085, -32 }, - { 1086, 1086, 6212 }, - { 1087, 1088, -32 }, - { 1089, 1090, 6210 }, - { 1091, 1097, -32 }, - { 1098, 1098, 6204 }, - { 1099, 1103, -32 }, + { 1072, 1073, -32 }, + { 1074, 1074, 6222 }, + { 1075, 1075, -32 }, + { 1076, 1076, 6221 }, + { 1077, 1085, -32 }, + { 1086, 1086, 6212 }, + { 1087, 1088, -32 }, + { 1089, 1090, 6210 }, + { 1091, 1097, -32 }, + { 1098, 1098, 6204 }, + { 1099, 1103, -32 }, { 1104, 1119, -80 
}, - { 1120, 1122, EvenOdd }, - { 1123, 1123, 6180 }, - { 1124, 1153, EvenOdd }, + { 1120, 1122, EvenOdd }, + { 1123, 1123, 6180 }, + { 1124, 1153, EvenOdd }, { 1162, 1215, EvenOdd }, { 1216, 1216, 15 }, { 1217, 1230, OddEven }, @@ -211,25 +211,25 @@ const CaseFold unicode_casefold[] = { { 4256, 4293, 7264 }, { 4295, 4295, 7264 }, { 4301, 4301, 7264 }, - { 4304, 4346, 3008 }, - { 4349, 4351, 3008 }, + { 4304, 4346, 3008 }, + { 4349, 4351, 3008 }, { 5024, 5103, 38864 }, { 5104, 5109, 8 }, { 5112, 5117, -8 }, - { 7296, 7296, -6254 }, - { 7297, 7297, -6253 }, - { 7298, 7298, -6244 }, - { 7299, 7299, -6242 }, - { 7300, 7300, EvenOdd }, - { 7301, 7301, -6243 }, - { 7302, 7302, -6236 }, - { 7303, 7303, -6181 }, - { 7304, 7304, 35266 }, - { 7312, 7354, -3008 }, - { 7357, 7359, -3008 }, + { 7296, 7296, -6254 }, + { 7297, 7297, -6253 }, + { 7298, 7298, -6244 }, + { 7299, 7299, -6242 }, + { 7300, 7300, EvenOdd }, + { 7301, 7301, -6243 }, + { 7302, 7302, -6236 }, + { 7303, 7303, -6181 }, + { 7304, 7304, 35266 }, + { 7312, 7354, -3008 }, + { 7357, 7359, -3008 }, { 7545, 7545, 35332 }, { 7549, 7549, 3814 }, - { 7566, 7566, 35384 }, + { 7566, 7566, 35384 }, { 7680, 7776, EvenOdd }, { 7777, 7777, 58 }, { 7778, 7829, EvenOdd }, @@ -321,9 +321,9 @@ const CaseFold unicode_casefold[] = { { 11520, 11557, -7264 }, { 11559, 11559, -7264 }, { 11565, 11565, -7264 }, - { 42560, 42570, EvenOdd }, - { 42571, 42571, -35267 }, - { 42572, 42605, EvenOdd }, + { 42560, 42570, EvenOdd }, + { 42571, 42571, -35267 }, + { 42572, 42605, EvenOdd }, { 42624, 42651, EvenOdd }, { 42786, 42799, EvenOdd }, { 42802, 42863, EvenOdd }, @@ -333,33 +333,33 @@ const CaseFold unicode_casefold[] = { { 42891, 42892, OddEven }, { 42893, 42893, -42280 }, { 42896, 42899, EvenOdd }, - { 42900, 42900, 48 }, + { 42900, 42900, 48 }, { 42902, 42921, EvenOdd }, { 42922, 42922, -42308 }, { 42923, 42923, -42319 }, { 42924, 42924, -42315 }, { 42925, 42925, -42305 }, - { 42926, 42926, -42308 }, + { 42926, 42926, -42308 }, { 
42928, 42928, -42258 }, { 42929, 42929, -42282 }, { 42930, 42930, -42261 }, { 42931, 42931, 928 }, { 42932, 42947, EvenOdd }, - { 42948, 42948, -48 }, - { 42949, 42949, -42307 }, - { 42950, 42950, -35384 }, - { 42951, 42954, OddEven }, + { 42948, 42948, -48 }, + { 42949, 42949, -42307 }, + { 42950, 42950, -35384 }, + { 42951, 42954, OddEven }, { 42960, 42961, EvenOdd }, { 42966, 42969, EvenOdd }, - { 42997, 42998, OddEven }, + { 42997, 42998, OddEven }, { 43859, 43859, -928 }, { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 65345, 65370, -32 }, { 66560, 66599, 40 }, { 66600, 66639, -40 }, - { 66736, 66771, 40 }, - { 66776, 66811, -40 }, + { 66736, 66771, 40 }, + { 66776, 66811, -40 }, { 66928, 66938, 39 }, { 66940, 66954, 39 }, { 66956, 66962, 39 }, @@ -372,10 +372,10 @@ const CaseFold unicode_casefold[] = { { 68800, 68850, -64 }, { 71840, 71871, 32 }, { 71872, 71903, -32 }, - { 93760, 93791, 32 }, - { 93792, 93823, -32 }, - { 125184, 125217, 34 }, - { 125218, 125251, -34 }, + { 93760, 93791, 32 }, + { 93792, 93823, -32 }, + { 125184, 125217, 34 }, + { 125218, 125251, -34 }, }; const int num_unicode_casefold = 367; @@ -482,16 +482,16 @@ const CaseFold unicode_tolower[] = { { 4295, 4295, 7264 }, { 4301, 4301, 7264 }, { 5112, 5117, -8 }, - { 7296, 7296, -6222 }, - { 7297, 7297, -6221 }, - { 7298, 7298, -6212 }, - { 7299, 7300, -6210 }, - { 7301, 7301, -6211 }, - { 7302, 7302, -6204 }, - { 7303, 7303, -6180 }, - { 7304, 7304, 35267 }, - { 7312, 7354, -3008 }, - { 7357, 7359, -3008 }, + { 7296, 7296, -6222 }, + { 7297, 7297, -6221 }, + { 7298, 7298, -6212 }, + { 7299, 7300, -6210 }, + { 7301, 7301, -6211 }, + { 7302, 7302, -6204 }, + { 7303, 7303, -6180 }, + { 7304, 7304, 35267 }, + { 7312, 7354, -3008 }, + { 7357, 7359, -3008 }, { 7680, 7828, EvenOddSkip }, { 7835, 7835, -58 }, { 7838, 7838, -7615 }, @@ -561,31 +561,31 @@ const CaseFold unicode_tolower[] = { { 42923, 42923, -42319 }, { 42924, 42924, -42315 }, { 42925, 42925, -42305 }, - { 42926, 42926, -42308 }, + { 
42926, 42926, -42308 }, { 42928, 42928, -42258 }, { 42929, 42929, -42282 }, { 42930, 42930, -42261 }, { 42931, 42931, 928 }, { 42932, 42946, EvenOddSkip }, - { 42948, 42948, -48 }, - { 42949, 42949, -42307 }, - { 42950, 42950, -35384 }, - { 42951, 42953, OddEvenSkip }, + { 42948, 42948, -48 }, + { 42949, 42949, -42307 }, + { 42950, 42950, -35384 }, + { 42951, 42953, OddEvenSkip }, { 42960, 42960, EvenOdd }, { 42966, 42968, EvenOddSkip }, - { 42997, 42997, OddEven }, + { 42997, 42997, OddEven }, { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, - { 66736, 66771, 40 }, + { 66736, 66771, 40 }, { 66928, 66938, 39 }, { 66940, 66954, 39 }, { 66956, 66962, 39 }, { 66964, 66965, 39 }, { 68736, 68786, 64 }, { 71840, 71871, 32 }, - { 93760, 93791, 32 }, - { 125184, 125217, 34 }, + { 93760, 93791, 32 }, + { 125184, 125217, 34 }, }; const int num_unicode_tolower = 205; diff --git a/contrib/libs/re2/re2/unicode_groups.cc b/contrib/libs/re2/re2/unicode_groups.cc index 21aeca513f..2a8d7dae1f 100644 --- a/contrib/libs/re2/re2/unicode_groups.cc +++ b/contrib/libs/re2/re2/unicode_groups.cc @@ -7,109 +7,109 @@ namespace re2 { -static const URange16 C_range16[] = { - { 0, 31 }, - { 127, 159 }, - { 173, 173 }, - { 1536, 1541 }, - { 1564, 1564 }, - { 1757, 1757 }, - { 1807, 1807 }, +static const URange16 C_range16[] = { + { 0, 31 }, + { 127, 159 }, + { 173, 173 }, + { 1536, 1541 }, + { 1564, 1564 }, + { 1757, 1757 }, + { 1807, 1807 }, { 2192, 2193 }, - { 2274, 2274 }, - { 6158, 6158 }, - { 8203, 8207 }, - { 8234, 8238 }, - { 8288, 8292 }, - { 8294, 8303 }, - { 55296, 63743 }, - { 65279, 65279 }, - { 65529, 65531 }, -}; -static const URange32 C_range32[] = { - { 69821, 69821 }, - { 69837, 69837 }, - { 78896, 78904 }, - { 113824, 113827 }, - { 119155, 119162 }, - { 917505, 917505 }, - { 917536, 917631 }, - { 983040, 1048573 }, - { 1048576, 1114109 }, -}; -static const URange16 Cc_range16[] = { - { 0, 31 }, - { 127, 159 }, -}; -static const URange16 Cf_range16[] = { - { 
173, 173 }, - { 1536, 1541 }, - { 1564, 1564 }, - { 1757, 1757 }, - { 1807, 1807 }, + { 2274, 2274 }, + { 6158, 6158 }, + { 8203, 8207 }, + { 8234, 8238 }, + { 8288, 8292 }, + { 8294, 8303 }, + { 55296, 63743 }, + { 65279, 65279 }, + { 65529, 65531 }, +}; +static const URange32 C_range32[] = { + { 69821, 69821 }, + { 69837, 69837 }, + { 78896, 78904 }, + { 113824, 113827 }, + { 119155, 119162 }, + { 917505, 917505 }, + { 917536, 917631 }, + { 983040, 1048573 }, + { 1048576, 1114109 }, +}; +static const URange16 Cc_range16[] = { + { 0, 31 }, + { 127, 159 }, +}; +static const URange16 Cf_range16[] = { + { 173, 173 }, + { 1536, 1541 }, + { 1564, 1564 }, + { 1757, 1757 }, + { 1807, 1807 }, { 2192, 2193 }, - { 2274, 2274 }, - { 6158, 6158 }, - { 8203, 8207 }, - { 8234, 8238 }, - { 8288, 8292 }, - { 8294, 8303 }, - { 65279, 65279 }, - { 65529, 65531 }, -}; -static const URange32 Cf_range32[] = { - { 69821, 69821 }, - { 69837, 69837 }, - { 78896, 78904 }, - { 113824, 113827 }, - { 119155, 119162 }, - { 917505, 917505 }, - { 917536, 917631 }, -}; -static const URange16 Co_range16[] = { - { 57344, 63743 }, -}; -static const URange32 Co_range32[] = { - { 983040, 1048573 }, - { 1048576, 1114109 }, -}; -static const URange16 Cs_range16[] = { - { 55296, 57343 }, -}; -static const URange16 L_range16[] = { - { 65, 90 }, - { 97, 122 }, + { 2274, 2274 }, + { 6158, 6158 }, + { 8203, 8207 }, + { 8234, 8238 }, + { 8288, 8292 }, + { 8294, 8303 }, + { 65279, 65279 }, + { 65529, 65531 }, +}; +static const URange32 Cf_range32[] = { + { 69821, 69821 }, + { 69837, 69837 }, + { 78896, 78904 }, + { 113824, 113827 }, + { 119155, 119162 }, + { 917505, 917505 }, + { 917536, 917631 }, +}; +static const URange16 Co_range16[] = { + { 57344, 63743 }, +}; +static const URange32 Co_range32[] = { + { 983040, 1048573 }, + { 1048576, 1114109 }, +}; +static const URange16 Cs_range16[] = { + { 55296, 57343 }, +}; +static const URange16 L_range16[] = { + { 65, 90 }, + { 97, 122 }, { 170, 170 }, - { 181, 181 
}, + { 181, 181 }, { 186, 186 }, - { 192, 214 }, - { 216, 246 }, - { 248, 705 }, - { 710, 721 }, - { 736, 740 }, - { 748, 748 }, - { 750, 750 }, - { 880, 884 }, - { 886, 887 }, - { 890, 893 }, - { 895, 895 }, - { 902, 902 }, - { 904, 906 }, - { 908, 908 }, - { 910, 929 }, - { 931, 1013 }, - { 1015, 1153 }, - { 1162, 1327 }, - { 1329, 1366 }, - { 1369, 1369 }, - { 1376, 1416 }, + { 192, 214 }, + { 216, 246 }, + { 248, 705 }, + { 710, 721 }, + { 736, 740 }, + { 748, 748 }, + { 750, 750 }, + { 880, 884 }, + { 886, 887 }, + { 890, 893 }, + { 895, 895 }, + { 902, 902 }, + { 904, 906 }, + { 908, 908 }, + { 910, 929 }, + { 931, 1013 }, + { 1015, 1153 }, + { 1162, 1327 }, + { 1329, 1366 }, + { 1369, 1369 }, + { 1376, 1416 }, { 1488, 1514 }, - { 1519, 1522 }, - { 1568, 1610 }, + { 1519, 1522 }, + { 1568, 1610 }, { 1646, 1647 }, { 1649, 1747 }, { 1749, 1749 }, - { 1765, 1766 }, + { 1765, 1766 }, { 1774, 1775 }, { 1786, 1788 }, { 1791, 1791 }, @@ -118,14 +118,14 @@ static const URange16 L_range16[] = { { 1869, 1957 }, { 1969, 1969 }, { 1994, 2026 }, - { 2036, 2037 }, - { 2042, 2042 }, + { 2036, 2037 }, + { 2042, 2042 }, { 2048, 2069 }, - { 2074, 2074 }, - { 2084, 2084 }, - { 2088, 2088 }, + { 2074, 2074 }, + { 2084, 2084 }, + { 2088, 2088 }, { 2112, 2136 }, - { 2144, 2154 }, + { 2144, 2154 }, { 2160, 2183 }, { 2185, 2190 }, { 2208, 2249 }, @@ -133,7 +133,7 @@ static const URange16 L_range16[] = { { 2365, 2365 }, { 2384, 2384 }, { 2392, 2401 }, - { 2417, 2432 }, + { 2417, 2432 }, { 2437, 2444 }, { 2447, 2448 }, { 2451, 2472 }, @@ -145,7 +145,7 @@ static const URange16 L_range16[] = { { 2524, 2525 }, { 2527, 2529 }, { 2544, 2545 }, - { 2556, 2556 }, + { 2556, 2556 }, { 2565, 2570 }, { 2575, 2576 }, { 2579, 2600 }, @@ -195,7 +195,7 @@ static const URange16 L_range16[] = { { 3160, 3162 }, { 3165, 3165 }, { 3168, 3169 }, - { 3200, 3200 }, + { 3200, 3200 }, { 3205, 3212 }, { 3214, 3216 }, { 3218, 3240 }, @@ -205,12 +205,12 @@ static const URange16 L_range16[] = { { 3293, 3294 }, { 
3296, 3297 }, { 3313, 3314 }, - { 3332, 3340 }, + { 3332, 3340 }, { 3342, 3344 }, { 3346, 3386 }, { 3389, 3389 }, { 3406, 3406 }, - { 3412, 3414 }, + { 3412, 3414 }, { 3423, 3425 }, { 3450, 3455 }, { 3461, 3478 }, @@ -220,17 +220,17 @@ static const URange16 L_range16[] = { { 3520, 3526 }, { 3585, 3632 }, { 3634, 3635 }, - { 3648, 3654 }, + { 3648, 3654 }, { 3713, 3714 }, { 3716, 3716 }, - { 3718, 3722 }, - { 3724, 3747 }, + { 3718, 3722 }, + { 3724, 3747 }, { 3749, 3749 }, - { 3751, 3760 }, + { 3751, 3760 }, { 3762, 3763 }, { 3773, 3773 }, { 3776, 3780 }, - { 3782, 3782 }, + { 3782, 3782 }, { 3804, 3807 }, { 3840, 3840 }, { 3904, 3911 }, @@ -245,11 +245,11 @@ static const URange16 L_range16[] = { { 4206, 4208 }, { 4213, 4225 }, { 4238, 4238 }, - { 4256, 4293 }, - { 4295, 4295 }, - { 4301, 4301 }, + { 4256, 4293 }, + { 4295, 4295 }, + { 4301, 4301 }, { 4304, 4346 }, - { 4348, 4680 }, + { 4348, 4680 }, { 4682, 4685 }, { 4688, 4694 }, { 4696, 4696 }, @@ -266,8 +266,8 @@ static const URange16 L_range16[] = { { 4882, 4885 }, { 4888, 4954 }, { 4992, 5007 }, - { 5024, 5109 }, - { 5112, 5117 }, + { 5024, 5109 }, + { 5112, 5117 }, { 5121, 5740 }, { 5743, 5759 }, { 5761, 5786 }, @@ -279,11 +279,11 @@ static const URange16 L_range16[] = { { 5984, 5996 }, { 5998, 6000 }, { 6016, 6067 }, - { 6103, 6103 }, + { 6103, 6103 }, { 6108, 6108 }, - { 6176, 6264 }, - { 6272, 6276 }, - { 6279, 6312 }, + { 6176, 6264 }, + { 6272, 6276 }, + { 6279, 6312 }, { 6314, 6314 }, { 6320, 6389 }, { 6400, 6430 }, @@ -293,7 +293,7 @@ static const URange16 L_range16[] = { { 6576, 6601 }, { 6656, 6678 }, { 6688, 6740 }, - { 6823, 6823 }, + { 6823, 6823 }, { 6917, 6963 }, { 6981, 6988 }, { 7043, 7072 }, @@ -301,59 +301,59 @@ static const URange16 L_range16[] = { { 7098, 7141 }, { 7168, 7203 }, { 7245, 7247 }, - { 7258, 7293 }, - { 7296, 7304 }, - { 7312, 7354 }, - { 7357, 7359 }, + { 7258, 7293 }, + { 7296, 7304 }, + { 7312, 7354 }, + { 7357, 7359 }, { 7401, 7404 }, - { 7406, 7411 }, + { 7406, 7411 }, { 
7413, 7414 }, - { 7418, 7418 }, - { 7424, 7615 }, - { 7680, 7957 }, - { 7960, 7965 }, - { 7968, 8005 }, - { 8008, 8013 }, - { 8016, 8023 }, - { 8025, 8025 }, - { 8027, 8027 }, - { 8029, 8029 }, - { 8031, 8061 }, - { 8064, 8116 }, - { 8118, 8124 }, - { 8126, 8126 }, - { 8130, 8132 }, - { 8134, 8140 }, - { 8144, 8147 }, - { 8150, 8155 }, - { 8160, 8172 }, - { 8178, 8180 }, - { 8182, 8188 }, - { 8305, 8305 }, - { 8319, 8319 }, - { 8336, 8348 }, - { 8450, 8450 }, - { 8455, 8455 }, - { 8458, 8467 }, - { 8469, 8469 }, - { 8473, 8477 }, - { 8484, 8484 }, - { 8486, 8486 }, - { 8488, 8488 }, - { 8490, 8493 }, - { 8495, 8505 }, - { 8508, 8511 }, - { 8517, 8521 }, - { 8526, 8526 }, - { 8579, 8580 }, + { 7418, 7418 }, + { 7424, 7615 }, + { 7680, 7957 }, + { 7960, 7965 }, + { 7968, 8005 }, + { 8008, 8013 }, + { 8016, 8023 }, + { 8025, 8025 }, + { 8027, 8027 }, + { 8029, 8029 }, + { 8031, 8061 }, + { 8064, 8116 }, + { 8118, 8124 }, + { 8126, 8126 }, + { 8130, 8132 }, + { 8134, 8140 }, + { 8144, 8147 }, + { 8150, 8155 }, + { 8160, 8172 }, + { 8178, 8180 }, + { 8182, 8188 }, + { 8305, 8305 }, + { 8319, 8319 }, + { 8336, 8348 }, + { 8450, 8450 }, + { 8455, 8455 }, + { 8458, 8467 }, + { 8469, 8469 }, + { 8473, 8477 }, + { 8484, 8484 }, + { 8486, 8486 }, + { 8488, 8488 }, + { 8490, 8493 }, + { 8495, 8505 }, + { 8508, 8511 }, + { 8517, 8521 }, + { 8526, 8526 }, + { 8579, 8580 }, { 11264, 11492 }, - { 11499, 11502 }, - { 11506, 11507 }, - { 11520, 11557 }, - { 11559, 11559 }, - { 11565, 11565 }, + { 11499, 11502 }, + { 11506, 11507 }, + { 11520, 11557 }, + { 11559, 11559 }, + { 11565, 11565 }, { 11568, 11623 }, - { 11631, 11631 }, + { 11631, 11631 }, { 11648, 11670 }, { 11680, 11686 }, { 11688, 11694 }, @@ -363,29 +363,29 @@ static const URange16 L_range16[] = { { 11720, 11726 }, { 11728, 11734 }, { 11736, 11742 }, - { 11823, 11823 }, - { 12293, 12294 }, - { 12337, 12341 }, - { 12347, 12348 }, + { 11823, 11823 }, + { 12293, 12294 }, + { 12337, 12341 }, + { 12347, 12348 }, { 12353, 
12438 }, - { 12445, 12447 }, + { 12445, 12447 }, { 12449, 12538 }, - { 12540, 12543 }, - { 12549, 12591 }, + { 12540, 12543 }, + { 12549, 12591 }, { 12593, 12686 }, - { 12704, 12735 }, + { 12704, 12735 }, { 12784, 12799 }, - { 13312, 19903 }, + { 13312, 19903 }, { 19968, 42124 }, - { 42192, 42237 }, - { 42240, 42508 }, + { 42192, 42237 }, + { 42240, 42508 }, { 42512, 42527 }, { 42538, 42539 }, - { 42560, 42606 }, - { 42623, 42653 }, + { 42560, 42606 }, + { 42623, 42653 }, { 42656, 42725 }, - { 42775, 42783 }, - { 42786, 42888 }, + { 42775, 42783 }, + { 42786, 42888 }, { 42891, 42954 }, { 42960, 42961 }, { 42963, 42963 }, @@ -398,19 +398,19 @@ static const URange16 L_range16[] = { { 43138, 43187 }, { 43250, 43255 }, { 43259, 43259 }, - { 43261, 43262 }, + { 43261, 43262 }, { 43274, 43301 }, { 43312, 43334 }, { 43360, 43388 }, { 43396, 43442 }, - { 43471, 43471 }, + { 43471, 43471 }, { 43488, 43492 }, - { 43494, 43503 }, + { 43494, 43503 }, { 43514, 43518 }, { 43520, 43560 }, { 43584, 43586 }, { 43588, 43595 }, - { 43616, 43638 }, + { 43616, 43638 }, { 43642, 43642 }, { 43646, 43695 }, { 43697, 43697 }, @@ -418,24 +418,24 @@ static const URange16 L_range16[] = { { 43705, 43709 }, { 43712, 43712 }, { 43714, 43714 }, - { 43739, 43741 }, + { 43739, 43741 }, { 43744, 43754 }, - { 43762, 43764 }, + { 43762, 43764 }, { 43777, 43782 }, { 43785, 43790 }, { 43793, 43798 }, { 43808, 43814 }, { 43816, 43822 }, - { 43824, 43866 }, - { 43868, 43881 }, - { 43888, 44002 }, + { 43824, 43866 }, + { 43868, 43881 }, + { 43888, 44002 }, { 44032, 55203 }, { 55216, 55238 }, { 55243, 55291 }, { 63744, 64109 }, { 64112, 64217 }, - { 64256, 64262 }, - { 64275, 64279 }, + { 64256, 64262 }, + { 64275, 64279 }, { 64285, 64285 }, { 64287, 64296 }, { 64298, 64310 }, @@ -450,15 +450,15 @@ static const URange16 L_range16[] = { { 65008, 65019 }, { 65136, 65140 }, { 65142, 65276 }, - { 65313, 65338 }, - { 65345, 65370 }, - { 65382, 65470 }, + { 65313, 65338 }, + { 65345, 65370 }, + { 65382, 65470 }, 
{ 65474, 65479 }, { 65482, 65487 }, { 65490, 65495 }, { 65498, 65500 }, }; -static const URange32 L_range32[] = { +static const URange32 L_range32[] = { { 65536, 65547 }, { 65549, 65574 }, { 65576, 65594 }, @@ -469,15 +469,15 @@ static const URange32 L_range32[] = { { 66176, 66204 }, { 66208, 66256 }, { 66304, 66335 }, - { 66349, 66368 }, + { 66349, 66368 }, { 66370, 66377 }, { 66384, 66421 }, { 66432, 66461 }, { 66464, 66499 }, { 66504, 66511 }, - { 66560, 66717 }, - { 66736, 66771 }, - { 66776, 66811 }, + { 66560, 66717 }, + { 66736, 66771 }, + { 66776, 66811 }, { 66816, 66855 }, { 66864, 66915 }, { 66928, 66938 }, @@ -511,7 +511,7 @@ static const URange32 L_range32[] = { { 68096, 68096 }, { 68112, 68115 }, { 68117, 68119 }, - { 68121, 68149 }, + { 68121, 68149 }, { 68192, 68220 }, { 68224, 68252 }, { 68288, 68295 }, @@ -521,25 +521,25 @@ static const URange32 L_range32[] = { { 68448, 68466 }, { 68480, 68497 }, { 68608, 68680 }, - { 68736, 68786 }, - { 68800, 68850 }, - { 68864, 68899 }, - { 69248, 69289 }, - { 69296, 69297 }, - { 69376, 69404 }, - { 69415, 69415 }, - { 69424, 69445 }, + { 68736, 68786 }, + { 68800, 68850 }, + { 68864, 68899 }, + { 69248, 69289 }, + { 69296, 69297 }, + { 69376, 69404 }, + { 69415, 69415 }, + { 69424, 69445 }, { 69488, 69505 }, - { 69552, 69572 }, - { 69600, 69622 }, + { 69552, 69572 }, + { 69600, 69622 }, { 69635, 69687 }, { 69745, 69746 }, { 69749, 69749 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, - { 69956, 69956 }, - { 69959, 69959 }, + { 69956, 69956 }, + { 69959, 69959 }, { 69968, 70002 }, { 70006, 70006 }, { 70019, 70066 }, @@ -563,9 +563,9 @@ static const URange32 L_range32[] = { { 70461, 70461 }, { 70480, 70480 }, { 70493, 70497 }, - { 70656, 70708 }, - { 70727, 70730 }, - { 70751, 70753 }, + { 70656, 70708 }, + { 70727, 70730 }, + { 70751, 70753 }, { 70784, 70831 }, { 70852, 70853 }, { 70855, 70855 }, @@ -574,43 +574,43 @@ static const URange32 L_range32[] = { { 71168, 71215 }, { 71236, 71236 }, { 71296, 
71338 }, - { 71352, 71352 }, - { 71424, 71450 }, + { 71352, 71352 }, + { 71424, 71450 }, { 71488, 71494 }, - { 71680, 71723 }, - { 71840, 71903 }, - { 71935, 71942 }, - { 71945, 71945 }, - { 71948, 71955 }, - { 71957, 71958 }, - { 71960, 71983 }, - { 71999, 71999 }, - { 72001, 72001 }, - { 72096, 72103 }, - { 72106, 72144 }, - { 72161, 72161 }, - { 72163, 72163 }, - { 72192, 72192 }, - { 72203, 72242 }, - { 72250, 72250 }, - { 72272, 72272 }, - { 72284, 72329 }, - { 72349, 72349 }, + { 71680, 71723 }, + { 71840, 71903 }, + { 71935, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71983 }, + { 71999, 71999 }, + { 72001, 72001 }, + { 72096, 72103 }, + { 72106, 72144 }, + { 72161, 72161 }, + { 72163, 72163 }, + { 72192, 72192 }, + { 72203, 72242 }, + { 72250, 72250 }, + { 72272, 72272 }, + { 72284, 72329 }, + { 72349, 72349 }, { 72368, 72440 }, - { 72704, 72712 }, - { 72714, 72750 }, - { 72768, 72768 }, - { 72818, 72847 }, - { 72960, 72966 }, - { 72968, 72969 }, - { 72971, 73008 }, - { 73030, 73030 }, - { 73056, 73061 }, - { 73063, 73064 }, - { 73066, 73097 }, - { 73112, 73112 }, - { 73440, 73458 }, - { 73648, 73648 }, + { 72704, 72712 }, + { 72714, 72750 }, + { 72768, 72768 }, + { 72818, 72847 }, + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73008 }, + { 73030, 73030 }, + { 73056, 73061 }, + { 73063, 73064 }, + { 73066, 73097 }, + { 73112, 73112 }, + { 73440, 73458 }, + { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, { 77712, 77808 }, @@ -621,72 +621,72 @@ static const URange32 L_range32[] = { { 92784, 92862 }, { 92880, 92909 }, { 92928, 92975 }, - { 92992, 92995 }, + { 92992, 92995 }, { 93027, 93047 }, { 93053, 93071 }, - { 93760, 93823 }, - { 93952, 94026 }, + { 93760, 93823 }, + { 93952, 94026 }, { 94032, 94032 }, - { 94099, 94111 }, - { 94176, 94177 }, - { 94179, 94179 }, - { 94208, 100343 }, - { 100352, 101589 }, - { 101632, 101640 }, + { 94099, 94111 }, + { 94176, 94177 }, + { 94179, 94179 }, + { 94208, 100343 }, + { 
100352, 101589 }, + { 101632, 101640 }, { 110576, 110579 }, { 110581, 110587 }, { 110589, 110590 }, { 110592, 110882 }, - { 110928, 110930 }, - { 110948, 110951 }, - { 110960, 111355 }, + { 110928, 110930 }, + { 110948, 110951 }, + { 110960, 111355 }, { 113664, 113770 }, { 113776, 113788 }, { 113792, 113800 }, { 113808, 113817 }, - { 119808, 119892 }, - { 119894, 119964 }, - { 119966, 119967 }, - { 119970, 119970 }, - { 119973, 119974 }, - { 119977, 119980 }, - { 119982, 119993 }, - { 119995, 119995 }, - { 119997, 120003 }, - { 120005, 120069 }, - { 120071, 120074 }, - { 120077, 120084 }, - { 120086, 120092 }, - { 120094, 120121 }, - { 120123, 120126 }, - { 120128, 120132 }, - { 120134, 120134 }, - { 120138, 120144 }, - { 120146, 120485 }, - { 120488, 120512 }, - { 120514, 120538 }, - { 120540, 120570 }, - { 120572, 120596 }, - { 120598, 120628 }, - { 120630, 120654 }, - { 120656, 120686 }, - { 120688, 120712 }, - { 120714, 120744 }, - { 120746, 120770 }, - { 120772, 120779 }, + { 119808, 119892 }, + { 119894, 119964 }, + { 119966, 119967 }, + { 119970, 119970 }, + { 119973, 119974 }, + { 119977, 119980 }, + { 119982, 119993 }, + { 119995, 119995 }, + { 119997, 120003 }, + { 120005, 120069 }, + { 120071, 120074 }, + { 120077, 120084 }, + { 120086, 120092 }, + { 120094, 120121 }, + { 120123, 120126 }, + { 120128, 120132 }, + { 120134, 120134 }, + { 120138, 120144 }, + { 120146, 120485 }, + { 120488, 120512 }, + { 120514, 120538 }, + { 120540, 120570 }, + { 120572, 120596 }, + { 120598, 120628 }, + { 120630, 120654 }, + { 120656, 120686 }, + { 120688, 120712 }, + { 120714, 120744 }, + { 120746, 120770 }, + { 120772, 120779 }, { 122624, 122654 }, - { 123136, 123180 }, - { 123191, 123197 }, - { 123214, 123214 }, + { 123136, 123180 }, + { 123191, 123197 }, + { 123214, 123214 }, { 123536, 123565 }, - { 123584, 123627 }, + { 123584, 123627 }, { 124896, 124902 }, { 124904, 124907 }, { 124909, 124910 }, { 124912, 124926 }, { 124928, 125124 }, - { 125184, 125251 }, - { 
125259, 125259 }, + { 125184, 125251 }, + { 125259, 125259 }, { 126464, 126467 }, { 126469, 126495 }, { 126497, 126498 }, @@ -724,9 +724,9 @@ static const URange32 L_range32[] = { { 173824, 177976 }, { 177984, 178205 }, { 178208, 183969 }, - { 183984, 191456 }, + { 183984, 191456 }, { 194560, 195101 }, - { 196608, 201546 }, + { 196608, 201546 }, }; static const URange16 Ll_range16[] = { { 97, 122 }, @@ -998,11 +998,11 @@ static const URange16 Ll_range16[] = { { 1323, 1323 }, { 1325, 1325 }, { 1327, 1327 }, - { 1376, 1416 }, - { 4304, 4346 }, - { 4349, 4351 }, + { 1376, 1416 }, + { 4304, 4346 }, + { 4349, 4351 }, { 5112, 5117 }, - { 7296, 7304 }, + { 7296, 7304 }, { 7424, 7467 }, { 7531, 7543 }, { 7545, 7578 }, @@ -1322,26 +1322,26 @@ static const URange16 Ll_range16[] = { { 42917, 42917 }, { 42919, 42919 }, { 42921, 42921 }, - { 42927, 42927 }, + { 42927, 42927 }, { 42933, 42933 }, { 42935, 42935 }, - { 42937, 42937 }, - { 42939, 42939 }, - { 42941, 42941 }, - { 42943, 42943 }, + { 42937, 42937 }, + { 42939, 42939 }, + { 42941, 42941 }, + { 42943, 42943 }, { 42945, 42945 }, - { 42947, 42947 }, - { 42952, 42952 }, - { 42954, 42954 }, + { 42947, 42947 }, + { 42952, 42952 }, + { 42954, 42954 }, { 42961, 42961 }, { 42963, 42963 }, { 42965, 42965 }, { 42967, 42967 }, { 42969, 42969 }, - { 42998, 42998 }, + { 42998, 42998 }, { 43002, 43002 }, { 43824, 43866 }, - { 43872, 43880 }, + { 43872, 43880 }, { 43888, 43967 }, { 64256, 64262 }, { 64275, 64279 }, @@ -1349,14 +1349,14 @@ static const URange16 Ll_range16[] = { }; static const URange32 Ll_range32[] = { { 66600, 66639 }, - { 66776, 66811 }, + { 66776, 66811 }, { 66967, 66977 }, { 66979, 66993 }, { 66995, 67001 }, { 67003, 67004 }, { 68800, 68850 }, { 71872, 71903 }, - { 93792, 93823 }, + { 93792, 93823 }, { 119834, 119859 }, { 119886, 119892 }, { 119894, 119911 }, @@ -1387,7 +1387,7 @@ static const URange32 Ll_range32[] = { { 120779, 120779 }, { 122624, 122633 }, { 122635, 122654 }, - { 125218, 125251 }, + { 125218, 
125251 }, }; static const URange16 Lm_range16[] = { { 688, 705 }, @@ -1444,7 +1444,7 @@ static const URange16 Lm_range16[] = { { 43741, 43741 }, { 43763, 43764 }, { 43868, 43871 }, - { 43881, 43881 }, + { 43881, 43881 }, { 65392, 65392 }, { 65438, 65439 }, }; @@ -1454,518 +1454,518 @@ static const URange32 Lm_range32[] = { { 67506, 67514 }, { 92992, 92995 }, { 94099, 94111 }, - { 94176, 94177 }, - { 94179, 94179 }, + { 94176, 94177 }, + { 94179, 94179 }, { 110576, 110579 }, { 110581, 110587 }, { 110589, 110590 }, - { 123191, 123197 }, - { 125259, 125259 }, -}; -static const URange16 Lo_range16[] = { - { 170, 170 }, - { 186, 186 }, - { 443, 443 }, - { 448, 451 }, - { 660, 660 }, - { 1488, 1514 }, - { 1519, 1522 }, - { 1568, 1599 }, - { 1601, 1610 }, - { 1646, 1647 }, - { 1649, 1747 }, - { 1749, 1749 }, - { 1774, 1775 }, - { 1786, 1788 }, - { 1791, 1791 }, - { 1808, 1808 }, - { 1810, 1839 }, - { 1869, 1957 }, - { 1969, 1969 }, - { 1994, 2026 }, - { 2048, 2069 }, - { 2112, 2136 }, - { 2144, 2154 }, + { 123191, 123197 }, + { 125259, 125259 }, +}; +static const URange16 Lo_range16[] = { + { 170, 170 }, + { 186, 186 }, + { 443, 443 }, + { 448, 451 }, + { 660, 660 }, + { 1488, 1514 }, + { 1519, 1522 }, + { 1568, 1599 }, + { 1601, 1610 }, + { 1646, 1647 }, + { 1649, 1747 }, + { 1749, 1749 }, + { 1774, 1775 }, + { 1786, 1788 }, + { 1791, 1791 }, + { 1808, 1808 }, + { 1810, 1839 }, + { 1869, 1957 }, + { 1969, 1969 }, + { 1994, 2026 }, + { 2048, 2069 }, + { 2112, 2136 }, + { 2144, 2154 }, { 2160, 2183 }, { 2185, 2190 }, { 2208, 2248 }, - { 2308, 2361 }, - { 2365, 2365 }, - { 2384, 2384 }, - { 2392, 2401 }, - { 2418, 2432 }, - { 2437, 2444 }, - { 2447, 2448 }, - { 2451, 2472 }, - { 2474, 2480 }, - { 2482, 2482 }, - { 2486, 2489 }, - { 2493, 2493 }, - { 2510, 2510 }, - { 2524, 2525 }, - { 2527, 2529 }, - { 2544, 2545 }, - { 2556, 2556 }, - { 2565, 2570 }, - { 2575, 2576 }, - { 2579, 2600 }, - { 2602, 2608 }, - { 2610, 2611 }, - { 2613, 2614 }, - { 2616, 2617 }, - { 2649, 2652 
}, - { 2654, 2654 }, - { 2674, 2676 }, - { 2693, 2701 }, - { 2703, 2705 }, - { 2707, 2728 }, - { 2730, 2736 }, - { 2738, 2739 }, - { 2741, 2745 }, - { 2749, 2749 }, - { 2768, 2768 }, - { 2784, 2785 }, - { 2809, 2809 }, - { 2821, 2828 }, - { 2831, 2832 }, - { 2835, 2856 }, - { 2858, 2864 }, - { 2866, 2867 }, - { 2869, 2873 }, - { 2877, 2877 }, - { 2908, 2909 }, - { 2911, 2913 }, - { 2929, 2929 }, - { 2947, 2947 }, - { 2949, 2954 }, - { 2958, 2960 }, - { 2962, 2965 }, - { 2969, 2970 }, - { 2972, 2972 }, - { 2974, 2975 }, - { 2979, 2980 }, - { 2984, 2986 }, - { 2990, 3001 }, - { 3024, 3024 }, - { 3077, 3084 }, - { 3086, 3088 }, - { 3090, 3112 }, - { 3114, 3129 }, - { 3133, 3133 }, - { 3160, 3162 }, + { 2308, 2361 }, + { 2365, 2365 }, + { 2384, 2384 }, + { 2392, 2401 }, + { 2418, 2432 }, + { 2437, 2444 }, + { 2447, 2448 }, + { 2451, 2472 }, + { 2474, 2480 }, + { 2482, 2482 }, + { 2486, 2489 }, + { 2493, 2493 }, + { 2510, 2510 }, + { 2524, 2525 }, + { 2527, 2529 }, + { 2544, 2545 }, + { 2556, 2556 }, + { 2565, 2570 }, + { 2575, 2576 }, + { 2579, 2600 }, + { 2602, 2608 }, + { 2610, 2611 }, + { 2613, 2614 }, + { 2616, 2617 }, + { 2649, 2652 }, + { 2654, 2654 }, + { 2674, 2676 }, + { 2693, 2701 }, + { 2703, 2705 }, + { 2707, 2728 }, + { 2730, 2736 }, + { 2738, 2739 }, + { 2741, 2745 }, + { 2749, 2749 }, + { 2768, 2768 }, + { 2784, 2785 }, + { 2809, 2809 }, + { 2821, 2828 }, + { 2831, 2832 }, + { 2835, 2856 }, + { 2858, 2864 }, + { 2866, 2867 }, + { 2869, 2873 }, + { 2877, 2877 }, + { 2908, 2909 }, + { 2911, 2913 }, + { 2929, 2929 }, + { 2947, 2947 }, + { 2949, 2954 }, + { 2958, 2960 }, + { 2962, 2965 }, + { 2969, 2970 }, + { 2972, 2972 }, + { 2974, 2975 }, + { 2979, 2980 }, + { 2984, 2986 }, + { 2990, 3001 }, + { 3024, 3024 }, + { 3077, 3084 }, + { 3086, 3088 }, + { 3090, 3112 }, + { 3114, 3129 }, + { 3133, 3133 }, + { 3160, 3162 }, { 3165, 3165 }, - { 3168, 3169 }, - { 3200, 3200 }, - { 3205, 3212 }, - { 3214, 3216 }, - { 3218, 3240 }, - { 3242, 3251 }, - { 3253, 3257 }, 
- { 3261, 3261 }, + { 3168, 3169 }, + { 3200, 3200 }, + { 3205, 3212 }, + { 3214, 3216 }, + { 3218, 3240 }, + { 3242, 3251 }, + { 3253, 3257 }, + { 3261, 3261 }, { 3293, 3294 }, - { 3296, 3297 }, - { 3313, 3314 }, - { 3332, 3340 }, - { 3342, 3344 }, - { 3346, 3386 }, - { 3389, 3389 }, - { 3406, 3406 }, - { 3412, 3414 }, - { 3423, 3425 }, - { 3450, 3455 }, - { 3461, 3478 }, - { 3482, 3505 }, - { 3507, 3515 }, - { 3517, 3517 }, - { 3520, 3526 }, - { 3585, 3632 }, - { 3634, 3635 }, - { 3648, 3653 }, - { 3713, 3714 }, - { 3716, 3716 }, - { 3718, 3722 }, - { 3724, 3747 }, - { 3749, 3749 }, - { 3751, 3760 }, - { 3762, 3763 }, - { 3773, 3773 }, - { 3776, 3780 }, - { 3804, 3807 }, - { 3840, 3840 }, - { 3904, 3911 }, - { 3913, 3948 }, - { 3976, 3980 }, - { 4096, 4138 }, - { 4159, 4159 }, - { 4176, 4181 }, - { 4186, 4189 }, - { 4193, 4193 }, - { 4197, 4198 }, - { 4206, 4208 }, - { 4213, 4225 }, - { 4238, 4238 }, - { 4352, 4680 }, - { 4682, 4685 }, - { 4688, 4694 }, - { 4696, 4696 }, - { 4698, 4701 }, - { 4704, 4744 }, - { 4746, 4749 }, - { 4752, 4784 }, - { 4786, 4789 }, - { 4792, 4798 }, - { 4800, 4800 }, - { 4802, 4805 }, - { 4808, 4822 }, - { 4824, 4880 }, - { 4882, 4885 }, - { 4888, 4954 }, - { 4992, 5007 }, - { 5121, 5740 }, - { 5743, 5759 }, - { 5761, 5786 }, - { 5792, 5866 }, - { 5873, 5880 }, + { 3296, 3297 }, + { 3313, 3314 }, + { 3332, 3340 }, + { 3342, 3344 }, + { 3346, 3386 }, + { 3389, 3389 }, + { 3406, 3406 }, + { 3412, 3414 }, + { 3423, 3425 }, + { 3450, 3455 }, + { 3461, 3478 }, + { 3482, 3505 }, + { 3507, 3515 }, + { 3517, 3517 }, + { 3520, 3526 }, + { 3585, 3632 }, + { 3634, 3635 }, + { 3648, 3653 }, + { 3713, 3714 }, + { 3716, 3716 }, + { 3718, 3722 }, + { 3724, 3747 }, + { 3749, 3749 }, + { 3751, 3760 }, + { 3762, 3763 }, + { 3773, 3773 }, + { 3776, 3780 }, + { 3804, 3807 }, + { 3840, 3840 }, + { 3904, 3911 }, + { 3913, 3948 }, + { 3976, 3980 }, + { 4096, 4138 }, + { 4159, 4159 }, + { 4176, 4181 }, + { 4186, 4189 }, + { 4193, 4193 }, + { 4197, 4198 }, + { 
4206, 4208 }, + { 4213, 4225 }, + { 4238, 4238 }, + { 4352, 4680 }, + { 4682, 4685 }, + { 4688, 4694 }, + { 4696, 4696 }, + { 4698, 4701 }, + { 4704, 4744 }, + { 4746, 4749 }, + { 4752, 4784 }, + { 4786, 4789 }, + { 4792, 4798 }, + { 4800, 4800 }, + { 4802, 4805 }, + { 4808, 4822 }, + { 4824, 4880 }, + { 4882, 4885 }, + { 4888, 4954 }, + { 4992, 5007 }, + { 5121, 5740 }, + { 5743, 5759 }, + { 5761, 5786 }, + { 5792, 5866 }, + { 5873, 5880 }, { 5888, 5905 }, { 5919, 5937 }, - { 5952, 5969 }, - { 5984, 5996 }, - { 5998, 6000 }, - { 6016, 6067 }, - { 6108, 6108 }, - { 6176, 6210 }, - { 6212, 6264 }, - { 6272, 6276 }, - { 6279, 6312 }, - { 6314, 6314 }, - { 6320, 6389 }, - { 6400, 6430 }, - { 6480, 6509 }, - { 6512, 6516 }, - { 6528, 6571 }, - { 6576, 6601 }, - { 6656, 6678 }, - { 6688, 6740 }, - { 6917, 6963 }, + { 5952, 5969 }, + { 5984, 5996 }, + { 5998, 6000 }, + { 6016, 6067 }, + { 6108, 6108 }, + { 6176, 6210 }, + { 6212, 6264 }, + { 6272, 6276 }, + { 6279, 6312 }, + { 6314, 6314 }, + { 6320, 6389 }, + { 6400, 6430 }, + { 6480, 6509 }, + { 6512, 6516 }, + { 6528, 6571 }, + { 6576, 6601 }, + { 6656, 6678 }, + { 6688, 6740 }, + { 6917, 6963 }, { 6981, 6988 }, - { 7043, 7072 }, - { 7086, 7087 }, - { 7098, 7141 }, - { 7168, 7203 }, - { 7245, 7247 }, - { 7258, 7287 }, - { 7401, 7404 }, - { 7406, 7411 }, - { 7413, 7414 }, - { 7418, 7418 }, - { 8501, 8504 }, - { 11568, 11623 }, - { 11648, 11670 }, - { 11680, 11686 }, - { 11688, 11694 }, - { 11696, 11702 }, - { 11704, 11710 }, - { 11712, 11718 }, - { 11720, 11726 }, - { 11728, 11734 }, - { 11736, 11742 }, - { 12294, 12294 }, - { 12348, 12348 }, - { 12353, 12438 }, - { 12447, 12447 }, - { 12449, 12538 }, - { 12543, 12543 }, - { 12549, 12591 }, - { 12593, 12686 }, - { 12704, 12735 }, - { 12784, 12799 }, - { 13312, 19903 }, + { 7043, 7072 }, + { 7086, 7087 }, + { 7098, 7141 }, + { 7168, 7203 }, + { 7245, 7247 }, + { 7258, 7287 }, + { 7401, 7404 }, + { 7406, 7411 }, + { 7413, 7414 }, + { 7418, 7418 }, + { 8501, 8504 }, + { 
11568, 11623 }, + { 11648, 11670 }, + { 11680, 11686 }, + { 11688, 11694 }, + { 11696, 11702 }, + { 11704, 11710 }, + { 11712, 11718 }, + { 11720, 11726 }, + { 11728, 11734 }, + { 11736, 11742 }, + { 12294, 12294 }, + { 12348, 12348 }, + { 12353, 12438 }, + { 12447, 12447 }, + { 12449, 12538 }, + { 12543, 12543 }, + { 12549, 12591 }, + { 12593, 12686 }, + { 12704, 12735 }, + { 12784, 12799 }, + { 13312, 19903 }, { 19968, 40980 }, - { 40982, 42124 }, - { 42192, 42231 }, - { 42240, 42507 }, - { 42512, 42527 }, - { 42538, 42539 }, - { 42606, 42606 }, - { 42656, 42725 }, - { 42895, 42895 }, - { 42999, 42999 }, - { 43003, 43009 }, - { 43011, 43013 }, - { 43015, 43018 }, - { 43020, 43042 }, - { 43072, 43123 }, - { 43138, 43187 }, - { 43250, 43255 }, - { 43259, 43259 }, - { 43261, 43262 }, - { 43274, 43301 }, - { 43312, 43334 }, - { 43360, 43388 }, - { 43396, 43442 }, - { 43488, 43492 }, - { 43495, 43503 }, - { 43514, 43518 }, - { 43520, 43560 }, - { 43584, 43586 }, - { 43588, 43595 }, - { 43616, 43631 }, - { 43633, 43638 }, - { 43642, 43642 }, - { 43646, 43695 }, - { 43697, 43697 }, - { 43701, 43702 }, - { 43705, 43709 }, - { 43712, 43712 }, - { 43714, 43714 }, - { 43739, 43740 }, - { 43744, 43754 }, - { 43762, 43762 }, - { 43777, 43782 }, - { 43785, 43790 }, - { 43793, 43798 }, - { 43808, 43814 }, - { 43816, 43822 }, - { 43968, 44002 }, - { 44032, 55203 }, - { 55216, 55238 }, - { 55243, 55291 }, - { 63744, 64109 }, - { 64112, 64217 }, - { 64285, 64285 }, - { 64287, 64296 }, - { 64298, 64310 }, - { 64312, 64316 }, - { 64318, 64318 }, - { 64320, 64321 }, - { 64323, 64324 }, - { 64326, 64433 }, - { 64467, 64829 }, - { 64848, 64911 }, - { 64914, 64967 }, - { 65008, 65019 }, - { 65136, 65140 }, - { 65142, 65276 }, - { 65382, 65391 }, - { 65393, 65437 }, - { 65440, 65470 }, - { 65474, 65479 }, - { 65482, 65487 }, - { 65490, 65495 }, - { 65498, 65500 }, -}; -static const URange32 Lo_range32[] = { - { 65536, 65547 }, - { 65549, 65574 }, - { 65576, 65594 }, - { 65596, 65597 }, - 
{ 65599, 65613 }, - { 65616, 65629 }, - { 65664, 65786 }, - { 66176, 66204 }, - { 66208, 66256 }, - { 66304, 66335 }, - { 66349, 66368 }, - { 66370, 66377 }, - { 66384, 66421 }, - { 66432, 66461 }, - { 66464, 66499 }, - { 66504, 66511 }, - { 66640, 66717 }, - { 66816, 66855 }, - { 66864, 66915 }, - { 67072, 67382 }, - { 67392, 67413 }, - { 67424, 67431 }, - { 67584, 67589 }, - { 67592, 67592 }, - { 67594, 67637 }, - { 67639, 67640 }, - { 67644, 67644 }, - { 67647, 67669 }, - { 67680, 67702 }, - { 67712, 67742 }, - { 67808, 67826 }, - { 67828, 67829 }, - { 67840, 67861 }, - { 67872, 67897 }, - { 67968, 68023 }, - { 68030, 68031 }, - { 68096, 68096 }, - { 68112, 68115 }, - { 68117, 68119 }, - { 68121, 68149 }, - { 68192, 68220 }, - { 68224, 68252 }, - { 68288, 68295 }, - { 68297, 68324 }, - { 68352, 68405 }, - { 68416, 68437 }, - { 68448, 68466 }, - { 68480, 68497 }, - { 68608, 68680 }, - { 68864, 68899 }, - { 69248, 69289 }, - { 69296, 69297 }, - { 69376, 69404 }, - { 69415, 69415 }, - { 69424, 69445 }, + { 40982, 42124 }, + { 42192, 42231 }, + { 42240, 42507 }, + { 42512, 42527 }, + { 42538, 42539 }, + { 42606, 42606 }, + { 42656, 42725 }, + { 42895, 42895 }, + { 42999, 42999 }, + { 43003, 43009 }, + { 43011, 43013 }, + { 43015, 43018 }, + { 43020, 43042 }, + { 43072, 43123 }, + { 43138, 43187 }, + { 43250, 43255 }, + { 43259, 43259 }, + { 43261, 43262 }, + { 43274, 43301 }, + { 43312, 43334 }, + { 43360, 43388 }, + { 43396, 43442 }, + { 43488, 43492 }, + { 43495, 43503 }, + { 43514, 43518 }, + { 43520, 43560 }, + { 43584, 43586 }, + { 43588, 43595 }, + { 43616, 43631 }, + { 43633, 43638 }, + { 43642, 43642 }, + { 43646, 43695 }, + { 43697, 43697 }, + { 43701, 43702 }, + { 43705, 43709 }, + { 43712, 43712 }, + { 43714, 43714 }, + { 43739, 43740 }, + { 43744, 43754 }, + { 43762, 43762 }, + { 43777, 43782 }, + { 43785, 43790 }, + { 43793, 43798 }, + { 43808, 43814 }, + { 43816, 43822 }, + { 43968, 44002 }, + { 44032, 55203 }, + { 55216, 55238 }, + { 55243, 55291 }, + 
{ 63744, 64109 }, + { 64112, 64217 }, + { 64285, 64285 }, + { 64287, 64296 }, + { 64298, 64310 }, + { 64312, 64316 }, + { 64318, 64318 }, + { 64320, 64321 }, + { 64323, 64324 }, + { 64326, 64433 }, + { 64467, 64829 }, + { 64848, 64911 }, + { 64914, 64967 }, + { 65008, 65019 }, + { 65136, 65140 }, + { 65142, 65276 }, + { 65382, 65391 }, + { 65393, 65437 }, + { 65440, 65470 }, + { 65474, 65479 }, + { 65482, 65487 }, + { 65490, 65495 }, + { 65498, 65500 }, +}; +static const URange32 Lo_range32[] = { + { 65536, 65547 }, + { 65549, 65574 }, + { 65576, 65594 }, + { 65596, 65597 }, + { 65599, 65613 }, + { 65616, 65629 }, + { 65664, 65786 }, + { 66176, 66204 }, + { 66208, 66256 }, + { 66304, 66335 }, + { 66349, 66368 }, + { 66370, 66377 }, + { 66384, 66421 }, + { 66432, 66461 }, + { 66464, 66499 }, + { 66504, 66511 }, + { 66640, 66717 }, + { 66816, 66855 }, + { 66864, 66915 }, + { 67072, 67382 }, + { 67392, 67413 }, + { 67424, 67431 }, + { 67584, 67589 }, + { 67592, 67592 }, + { 67594, 67637 }, + { 67639, 67640 }, + { 67644, 67644 }, + { 67647, 67669 }, + { 67680, 67702 }, + { 67712, 67742 }, + { 67808, 67826 }, + { 67828, 67829 }, + { 67840, 67861 }, + { 67872, 67897 }, + { 67968, 68023 }, + { 68030, 68031 }, + { 68096, 68096 }, + { 68112, 68115 }, + { 68117, 68119 }, + { 68121, 68149 }, + { 68192, 68220 }, + { 68224, 68252 }, + { 68288, 68295 }, + { 68297, 68324 }, + { 68352, 68405 }, + { 68416, 68437 }, + { 68448, 68466 }, + { 68480, 68497 }, + { 68608, 68680 }, + { 68864, 68899 }, + { 69248, 69289 }, + { 69296, 69297 }, + { 69376, 69404 }, + { 69415, 69415 }, + { 69424, 69445 }, { 69488, 69505 }, - { 69552, 69572 }, - { 69600, 69622 }, - { 69635, 69687 }, + { 69552, 69572 }, + { 69600, 69622 }, + { 69635, 69687 }, { 69745, 69746 }, { 69749, 69749 }, - { 69763, 69807 }, - { 69840, 69864 }, - { 69891, 69926 }, - { 69956, 69956 }, - { 69959, 69959 }, - { 69968, 70002 }, - { 70006, 70006 }, - { 70019, 70066 }, - { 70081, 70084 }, - { 70106, 70106 }, - { 70108, 70108 }, - { 
70144, 70161 }, - { 70163, 70187 }, - { 70272, 70278 }, - { 70280, 70280 }, - { 70282, 70285 }, - { 70287, 70301 }, - { 70303, 70312 }, - { 70320, 70366 }, - { 70405, 70412 }, - { 70415, 70416 }, - { 70419, 70440 }, - { 70442, 70448 }, - { 70450, 70451 }, - { 70453, 70457 }, - { 70461, 70461 }, - { 70480, 70480 }, - { 70493, 70497 }, - { 70656, 70708 }, - { 70727, 70730 }, - { 70751, 70753 }, - { 70784, 70831 }, - { 70852, 70853 }, - { 70855, 70855 }, - { 71040, 71086 }, - { 71128, 71131 }, - { 71168, 71215 }, - { 71236, 71236 }, - { 71296, 71338 }, - { 71352, 71352 }, - { 71424, 71450 }, + { 69763, 69807 }, + { 69840, 69864 }, + { 69891, 69926 }, + { 69956, 69956 }, + { 69959, 69959 }, + { 69968, 70002 }, + { 70006, 70006 }, + { 70019, 70066 }, + { 70081, 70084 }, + { 70106, 70106 }, + { 70108, 70108 }, + { 70144, 70161 }, + { 70163, 70187 }, + { 70272, 70278 }, + { 70280, 70280 }, + { 70282, 70285 }, + { 70287, 70301 }, + { 70303, 70312 }, + { 70320, 70366 }, + { 70405, 70412 }, + { 70415, 70416 }, + { 70419, 70440 }, + { 70442, 70448 }, + { 70450, 70451 }, + { 70453, 70457 }, + { 70461, 70461 }, + { 70480, 70480 }, + { 70493, 70497 }, + { 70656, 70708 }, + { 70727, 70730 }, + { 70751, 70753 }, + { 70784, 70831 }, + { 70852, 70853 }, + { 70855, 70855 }, + { 71040, 71086 }, + { 71128, 71131 }, + { 71168, 71215 }, + { 71236, 71236 }, + { 71296, 71338 }, + { 71352, 71352 }, + { 71424, 71450 }, { 71488, 71494 }, - { 71680, 71723 }, - { 71935, 71942 }, - { 71945, 71945 }, - { 71948, 71955 }, - { 71957, 71958 }, - { 71960, 71983 }, - { 71999, 71999 }, - { 72001, 72001 }, - { 72096, 72103 }, - { 72106, 72144 }, - { 72161, 72161 }, - { 72163, 72163 }, - { 72192, 72192 }, - { 72203, 72242 }, - { 72250, 72250 }, - { 72272, 72272 }, - { 72284, 72329 }, - { 72349, 72349 }, + { 71680, 71723 }, + { 71935, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71983 }, + { 71999, 71999 }, + { 72001, 72001 }, + { 72096, 72103 }, + { 72106, 72144 }, + { 
72161, 72161 }, + { 72163, 72163 }, + { 72192, 72192 }, + { 72203, 72242 }, + { 72250, 72250 }, + { 72272, 72272 }, + { 72284, 72329 }, + { 72349, 72349 }, { 72368, 72440 }, - { 72704, 72712 }, - { 72714, 72750 }, - { 72768, 72768 }, - { 72818, 72847 }, - { 72960, 72966 }, - { 72968, 72969 }, - { 72971, 73008 }, - { 73030, 73030 }, - { 73056, 73061 }, - { 73063, 73064 }, - { 73066, 73097 }, - { 73112, 73112 }, - { 73440, 73458 }, - { 73648, 73648 }, - { 73728, 74649 }, - { 74880, 75075 }, + { 72704, 72712 }, + { 72714, 72750 }, + { 72768, 72768 }, + { 72818, 72847 }, + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73008 }, + { 73030, 73030 }, + { 73056, 73061 }, + { 73063, 73064 }, + { 73066, 73097 }, + { 73112, 73112 }, + { 73440, 73458 }, + { 73648, 73648 }, + { 73728, 74649 }, + { 74880, 75075 }, { 77712, 77808 }, - { 77824, 78894 }, - { 82944, 83526 }, - { 92160, 92728 }, - { 92736, 92766 }, + { 77824, 78894 }, + { 82944, 83526 }, + { 92160, 92728 }, + { 92736, 92766 }, { 92784, 92862 }, - { 92880, 92909 }, - { 92928, 92975 }, - { 93027, 93047 }, - { 93053, 93071 }, - { 93952, 94026 }, - { 94032, 94032 }, - { 94208, 100343 }, - { 100352, 101589 }, - { 101632, 101640 }, + { 92880, 92909 }, + { 92928, 92975 }, + { 93027, 93047 }, + { 93053, 93071 }, + { 93952, 94026 }, + { 94032, 94032 }, + { 94208, 100343 }, + { 100352, 101589 }, + { 101632, 101640 }, { 110592, 110882 }, - { 110928, 110930 }, - { 110948, 110951 }, - { 110960, 111355 }, - { 113664, 113770 }, - { 113776, 113788 }, - { 113792, 113800 }, - { 113808, 113817 }, + { 110928, 110930 }, + { 110948, 110951 }, + { 110960, 111355 }, + { 113664, 113770 }, + { 113776, 113788 }, + { 113792, 113800 }, + { 113808, 113817 }, { 122634, 122634 }, - { 123136, 123180 }, - { 123214, 123214 }, + { 123136, 123180 }, + { 123214, 123214 }, { 123536, 123565 }, - { 123584, 123627 }, + { 123584, 123627 }, { 124896, 124902 }, { 124904, 124907 }, { 124909, 124910 }, { 124912, 124926 }, - { 124928, 125124 }, - { 126464, 
126467 }, - { 126469, 126495 }, - { 126497, 126498 }, - { 126500, 126500 }, - { 126503, 126503 }, - { 126505, 126514 }, - { 126516, 126519 }, - { 126521, 126521 }, - { 126523, 126523 }, - { 126530, 126530 }, - { 126535, 126535 }, - { 126537, 126537 }, - { 126539, 126539 }, - { 126541, 126543 }, - { 126545, 126546 }, - { 126548, 126548 }, - { 126551, 126551 }, - { 126553, 126553 }, - { 126555, 126555 }, - { 126557, 126557 }, - { 126559, 126559 }, - { 126561, 126562 }, - { 126564, 126564 }, - { 126567, 126570 }, - { 126572, 126578 }, - { 126580, 126583 }, - { 126585, 126588 }, - { 126590, 126590 }, - { 126592, 126601 }, - { 126603, 126619 }, - { 126625, 126627 }, - { 126629, 126633 }, - { 126635, 126651 }, + { 124928, 125124 }, + { 126464, 126467 }, + { 126469, 126495 }, + { 126497, 126498 }, + { 126500, 126500 }, + { 126503, 126503 }, + { 126505, 126514 }, + { 126516, 126519 }, + { 126521, 126521 }, + { 126523, 126523 }, + { 126530, 126530 }, + { 126535, 126535 }, + { 126537, 126537 }, + { 126539, 126539 }, + { 126541, 126543 }, + { 126545, 126546 }, + { 126548, 126548 }, + { 126551, 126551 }, + { 126553, 126553 }, + { 126555, 126555 }, + { 126557, 126557 }, + { 126559, 126559 }, + { 126561, 126562 }, + { 126564, 126564 }, + { 126567, 126570 }, + { 126572, 126578 }, + { 126580, 126583 }, + { 126585, 126588 }, + { 126590, 126590 }, + { 126592, 126601 }, + { 126603, 126619 }, + { 126625, 126627 }, + { 126629, 126633 }, + { 126635, 126651 }, { 131072, 173791 }, { 173824, 177976 }, - { 177984, 178205 }, - { 178208, 183969 }, - { 183984, 191456 }, - { 194560, 195101 }, - { 196608, 201546 }, + { 177984, 178205 }, + { 178208, 183969 }, + { 183984, 191456 }, + { 194560, 195101 }, + { 196608, 201546 }, }; static const URange16 Lt_range16[] = { { 453, 453 }, @@ -2256,8 +2256,8 @@ static const URange16 Lu_range16[] = { { 4295, 4295 }, { 4301, 4301 }, { 5024, 5109 }, - { 7312, 7354 }, - { 7357, 7359 }, + { 7312, 7354 }, + { 7357, 7359 }, { 7680, 7680 }, { 7682, 7682 }, { 7684, 
7684 }, @@ -2569,33 +2569,33 @@ static const URange16 Lu_range16[] = { { 42916, 42916 }, { 42918, 42918 }, { 42920, 42920 }, - { 42922, 42926 }, + { 42922, 42926 }, { 42928, 42932 }, { 42934, 42934 }, - { 42936, 42936 }, - { 42938, 42938 }, - { 42940, 42940 }, - { 42942, 42942 }, + { 42936, 42936 }, + { 42938, 42938 }, + { 42940, 42940 }, + { 42942, 42942 }, { 42944, 42944 }, - { 42946, 42946 }, - { 42948, 42951 }, - { 42953, 42953 }, + { 42946, 42946 }, + { 42948, 42951 }, + { 42953, 42953 }, { 42960, 42960 }, { 42966, 42966 }, { 42968, 42968 }, - { 42997, 42997 }, + { 42997, 42997 }, { 65313, 65338 }, }; static const URange32 Lu_range32[] = { { 66560, 66599 }, - { 66736, 66771 }, + { 66736, 66771 }, { 66928, 66938 }, { 66940, 66954 }, { 66956, 66962 }, { 66964, 66965 }, { 68736, 68786 }, { 71840, 71871 }, - { 93760, 93791 }, + { 93760, 93791 }, { 119808, 119833 }, { 119860, 119885 }, { 119912, 119937 }, @@ -2627,310 +2627,310 @@ static const URange32 Lu_range32[] = { { 120662, 120686 }, { 120720, 120744 }, { 120778, 120778 }, - { 125184, 125217 }, -}; -static const URange16 M_range16[] = { - { 768, 879 }, - { 1155, 1161 }, - { 1425, 1469 }, - { 1471, 1471 }, - { 1473, 1474 }, - { 1476, 1477 }, - { 1479, 1479 }, - { 1552, 1562 }, - { 1611, 1631 }, - { 1648, 1648 }, - { 1750, 1756 }, - { 1759, 1764 }, - { 1767, 1768 }, - { 1770, 1773 }, - { 1809, 1809 }, - { 1840, 1866 }, - { 1958, 1968 }, - { 2027, 2035 }, - { 2045, 2045 }, - { 2070, 2073 }, - { 2075, 2083 }, - { 2085, 2087 }, - { 2089, 2093 }, - { 2137, 2139 }, + { 125184, 125217 }, +}; +static const URange16 M_range16[] = { + { 768, 879 }, + { 1155, 1161 }, + { 1425, 1469 }, + { 1471, 1471 }, + { 1473, 1474 }, + { 1476, 1477 }, + { 1479, 1479 }, + { 1552, 1562 }, + { 1611, 1631 }, + { 1648, 1648 }, + { 1750, 1756 }, + { 1759, 1764 }, + { 1767, 1768 }, + { 1770, 1773 }, + { 1809, 1809 }, + { 1840, 1866 }, + { 1958, 1968 }, + { 2027, 2035 }, + { 2045, 2045 }, + { 2070, 2073 }, + { 2075, 2083 }, + { 2085, 2087 }, + 
{ 2089, 2093 }, + { 2137, 2139 }, { 2200, 2207 }, { 2250, 2273 }, - { 2275, 2307 }, - { 2362, 2364 }, - { 2366, 2383 }, - { 2385, 2391 }, - { 2402, 2403 }, - { 2433, 2435 }, - { 2492, 2492 }, - { 2494, 2500 }, - { 2503, 2504 }, - { 2507, 2509 }, - { 2519, 2519 }, - { 2530, 2531 }, - { 2558, 2558 }, - { 2561, 2563 }, - { 2620, 2620 }, - { 2622, 2626 }, - { 2631, 2632 }, - { 2635, 2637 }, - { 2641, 2641 }, - { 2672, 2673 }, - { 2677, 2677 }, - { 2689, 2691 }, - { 2748, 2748 }, - { 2750, 2757 }, - { 2759, 2761 }, - { 2763, 2765 }, - { 2786, 2787 }, - { 2810, 2815 }, - { 2817, 2819 }, - { 2876, 2876 }, - { 2878, 2884 }, - { 2887, 2888 }, - { 2891, 2893 }, - { 2901, 2903 }, - { 2914, 2915 }, - { 2946, 2946 }, - { 3006, 3010 }, - { 3014, 3016 }, - { 3018, 3021 }, - { 3031, 3031 }, - { 3072, 3076 }, + { 2275, 2307 }, + { 2362, 2364 }, + { 2366, 2383 }, + { 2385, 2391 }, + { 2402, 2403 }, + { 2433, 2435 }, + { 2492, 2492 }, + { 2494, 2500 }, + { 2503, 2504 }, + { 2507, 2509 }, + { 2519, 2519 }, + { 2530, 2531 }, + { 2558, 2558 }, + { 2561, 2563 }, + { 2620, 2620 }, + { 2622, 2626 }, + { 2631, 2632 }, + { 2635, 2637 }, + { 2641, 2641 }, + { 2672, 2673 }, + { 2677, 2677 }, + { 2689, 2691 }, + { 2748, 2748 }, + { 2750, 2757 }, + { 2759, 2761 }, + { 2763, 2765 }, + { 2786, 2787 }, + { 2810, 2815 }, + { 2817, 2819 }, + { 2876, 2876 }, + { 2878, 2884 }, + { 2887, 2888 }, + { 2891, 2893 }, + { 2901, 2903 }, + { 2914, 2915 }, + { 2946, 2946 }, + { 3006, 3010 }, + { 3014, 3016 }, + { 3018, 3021 }, + { 3031, 3031 }, + { 3072, 3076 }, { 3132, 3132 }, - { 3134, 3140 }, - { 3142, 3144 }, - { 3146, 3149 }, - { 3157, 3158 }, - { 3170, 3171 }, - { 3201, 3203 }, - { 3260, 3260 }, - { 3262, 3268 }, - { 3270, 3272 }, - { 3274, 3277 }, - { 3285, 3286 }, - { 3298, 3299 }, - { 3328, 3331 }, - { 3387, 3388 }, - { 3390, 3396 }, - { 3398, 3400 }, - { 3402, 3405 }, - { 3415, 3415 }, - { 3426, 3427 }, - { 3457, 3459 }, - { 3530, 3530 }, - { 3535, 3540 }, - { 3542, 3542 }, - { 3544, 3551 }, - { 3570, 
3571 }, - { 3633, 3633 }, - { 3636, 3642 }, - { 3655, 3662 }, - { 3761, 3761 }, - { 3764, 3772 }, - { 3784, 3789 }, - { 3864, 3865 }, - { 3893, 3893 }, - { 3895, 3895 }, - { 3897, 3897 }, - { 3902, 3903 }, - { 3953, 3972 }, - { 3974, 3975 }, - { 3981, 3991 }, - { 3993, 4028 }, - { 4038, 4038 }, - { 4139, 4158 }, - { 4182, 4185 }, - { 4190, 4192 }, - { 4194, 4196 }, - { 4199, 4205 }, - { 4209, 4212 }, - { 4226, 4237 }, - { 4239, 4239 }, - { 4250, 4253 }, - { 4957, 4959 }, + { 3134, 3140 }, + { 3142, 3144 }, + { 3146, 3149 }, + { 3157, 3158 }, + { 3170, 3171 }, + { 3201, 3203 }, + { 3260, 3260 }, + { 3262, 3268 }, + { 3270, 3272 }, + { 3274, 3277 }, + { 3285, 3286 }, + { 3298, 3299 }, + { 3328, 3331 }, + { 3387, 3388 }, + { 3390, 3396 }, + { 3398, 3400 }, + { 3402, 3405 }, + { 3415, 3415 }, + { 3426, 3427 }, + { 3457, 3459 }, + { 3530, 3530 }, + { 3535, 3540 }, + { 3542, 3542 }, + { 3544, 3551 }, + { 3570, 3571 }, + { 3633, 3633 }, + { 3636, 3642 }, + { 3655, 3662 }, + { 3761, 3761 }, + { 3764, 3772 }, + { 3784, 3789 }, + { 3864, 3865 }, + { 3893, 3893 }, + { 3895, 3895 }, + { 3897, 3897 }, + { 3902, 3903 }, + { 3953, 3972 }, + { 3974, 3975 }, + { 3981, 3991 }, + { 3993, 4028 }, + { 4038, 4038 }, + { 4139, 4158 }, + { 4182, 4185 }, + { 4190, 4192 }, + { 4194, 4196 }, + { 4199, 4205 }, + { 4209, 4212 }, + { 4226, 4237 }, + { 4239, 4239 }, + { 4250, 4253 }, + { 4957, 4959 }, { 5906, 5909 }, - { 5938, 5940 }, - { 5970, 5971 }, - { 6002, 6003 }, - { 6068, 6099 }, - { 6109, 6109 }, - { 6155, 6157 }, + { 5938, 5940 }, + { 5970, 5971 }, + { 6002, 6003 }, + { 6068, 6099 }, + { 6109, 6109 }, + { 6155, 6157 }, { 6159, 6159 }, - { 6277, 6278 }, - { 6313, 6313 }, - { 6432, 6443 }, - { 6448, 6459 }, - { 6679, 6683 }, - { 6741, 6750 }, - { 6752, 6780 }, - { 6783, 6783 }, + { 6277, 6278 }, + { 6313, 6313 }, + { 6432, 6443 }, + { 6448, 6459 }, + { 6679, 6683 }, + { 6741, 6750 }, + { 6752, 6780 }, + { 6783, 6783 }, { 6832, 6862 }, - { 6912, 6916 }, - { 6964, 6980 }, - { 7019, 7027 }, 
- { 7040, 7042 }, - { 7073, 7085 }, - { 7142, 7155 }, - { 7204, 7223 }, - { 7376, 7378 }, - { 7380, 7400 }, - { 7405, 7405 }, - { 7412, 7412 }, - { 7415, 7417 }, + { 6912, 6916 }, + { 6964, 6980 }, + { 7019, 7027 }, + { 7040, 7042 }, + { 7073, 7085 }, + { 7142, 7155 }, + { 7204, 7223 }, + { 7376, 7378 }, + { 7380, 7400 }, + { 7405, 7405 }, + { 7412, 7412 }, + { 7415, 7417 }, { 7616, 7679 }, - { 8400, 8432 }, - { 11503, 11505 }, - { 11647, 11647 }, - { 11744, 11775 }, - { 12330, 12335 }, - { 12441, 12442 }, - { 42607, 42610 }, - { 42612, 42621 }, - { 42654, 42655 }, - { 42736, 42737 }, - { 43010, 43010 }, - { 43014, 43014 }, - { 43019, 43019 }, - { 43043, 43047 }, - { 43052, 43052 }, - { 43136, 43137 }, - { 43188, 43205 }, - { 43232, 43249 }, - { 43263, 43263 }, - { 43302, 43309 }, - { 43335, 43347 }, - { 43392, 43395 }, - { 43443, 43456 }, - { 43493, 43493 }, - { 43561, 43574 }, - { 43587, 43587 }, - { 43596, 43597 }, - { 43643, 43645 }, - { 43696, 43696 }, - { 43698, 43700 }, - { 43703, 43704 }, - { 43710, 43711 }, - { 43713, 43713 }, - { 43755, 43759 }, - { 43765, 43766 }, - { 44003, 44010 }, - { 44012, 44013 }, - { 64286, 64286 }, - { 65024, 65039 }, - { 65056, 65071 }, -}; -static const URange32 M_range32[] = { - { 66045, 66045 }, - { 66272, 66272 }, - { 66422, 66426 }, - { 68097, 68099 }, - { 68101, 68102 }, - { 68108, 68111 }, - { 68152, 68154 }, - { 68159, 68159 }, - { 68325, 68326 }, - { 68900, 68903 }, - { 69291, 69292 }, - { 69446, 69456 }, + { 8400, 8432 }, + { 11503, 11505 }, + { 11647, 11647 }, + { 11744, 11775 }, + { 12330, 12335 }, + { 12441, 12442 }, + { 42607, 42610 }, + { 42612, 42621 }, + { 42654, 42655 }, + { 42736, 42737 }, + { 43010, 43010 }, + { 43014, 43014 }, + { 43019, 43019 }, + { 43043, 43047 }, + { 43052, 43052 }, + { 43136, 43137 }, + { 43188, 43205 }, + { 43232, 43249 }, + { 43263, 43263 }, + { 43302, 43309 }, + { 43335, 43347 }, + { 43392, 43395 }, + { 43443, 43456 }, + { 43493, 43493 }, + { 43561, 43574 }, + { 43587, 43587 }, + { 
43596, 43597 }, + { 43643, 43645 }, + { 43696, 43696 }, + { 43698, 43700 }, + { 43703, 43704 }, + { 43710, 43711 }, + { 43713, 43713 }, + { 43755, 43759 }, + { 43765, 43766 }, + { 44003, 44010 }, + { 44012, 44013 }, + { 64286, 64286 }, + { 65024, 65039 }, + { 65056, 65071 }, +}; +static const URange32 M_range32[] = { + { 66045, 66045 }, + { 66272, 66272 }, + { 66422, 66426 }, + { 68097, 68099 }, + { 68101, 68102 }, + { 68108, 68111 }, + { 68152, 68154 }, + { 68159, 68159 }, + { 68325, 68326 }, + { 68900, 68903 }, + { 69291, 69292 }, + { 69446, 69456 }, { 69506, 69509 }, - { 69632, 69634 }, - { 69688, 69702 }, + { 69632, 69634 }, + { 69688, 69702 }, { 69744, 69744 }, { 69747, 69748 }, - { 69759, 69762 }, - { 69808, 69818 }, + { 69759, 69762 }, + { 69808, 69818 }, { 69826, 69826 }, - { 69888, 69890 }, - { 69927, 69940 }, - { 69957, 69958 }, - { 70003, 70003 }, - { 70016, 70018 }, - { 70067, 70080 }, - { 70089, 70092 }, - { 70094, 70095 }, - { 70188, 70199 }, - { 70206, 70206 }, - { 70367, 70378 }, - { 70400, 70403 }, - { 70459, 70460 }, - { 70462, 70468 }, - { 70471, 70472 }, - { 70475, 70477 }, - { 70487, 70487 }, - { 70498, 70499 }, - { 70502, 70508 }, - { 70512, 70516 }, - { 70709, 70726 }, - { 70750, 70750 }, - { 70832, 70851 }, - { 71087, 71093 }, - { 71096, 71104 }, - { 71132, 71133 }, - { 71216, 71232 }, - { 71339, 71351 }, - { 71453, 71467 }, - { 71724, 71738 }, - { 71984, 71989 }, - { 71991, 71992 }, - { 71995, 71998 }, - { 72000, 72000 }, - { 72002, 72003 }, - { 72145, 72151 }, - { 72154, 72160 }, - { 72164, 72164 }, - { 72193, 72202 }, - { 72243, 72249 }, - { 72251, 72254 }, - { 72263, 72263 }, - { 72273, 72283 }, - { 72330, 72345 }, - { 72751, 72758 }, - { 72760, 72767 }, - { 72850, 72871 }, - { 72873, 72886 }, - { 73009, 73014 }, - { 73018, 73018 }, - { 73020, 73021 }, - { 73023, 73029 }, - { 73031, 73031 }, - { 73098, 73102 }, - { 73104, 73105 }, - { 73107, 73111 }, - { 73459, 73462 }, - { 92912, 92916 }, - { 92976, 92982 }, - { 94031, 94031 }, - { 
94033, 94087 }, - { 94095, 94098 }, - { 94180, 94180 }, - { 94192, 94193 }, - { 113821, 113822 }, + { 69888, 69890 }, + { 69927, 69940 }, + { 69957, 69958 }, + { 70003, 70003 }, + { 70016, 70018 }, + { 70067, 70080 }, + { 70089, 70092 }, + { 70094, 70095 }, + { 70188, 70199 }, + { 70206, 70206 }, + { 70367, 70378 }, + { 70400, 70403 }, + { 70459, 70460 }, + { 70462, 70468 }, + { 70471, 70472 }, + { 70475, 70477 }, + { 70487, 70487 }, + { 70498, 70499 }, + { 70502, 70508 }, + { 70512, 70516 }, + { 70709, 70726 }, + { 70750, 70750 }, + { 70832, 70851 }, + { 71087, 71093 }, + { 71096, 71104 }, + { 71132, 71133 }, + { 71216, 71232 }, + { 71339, 71351 }, + { 71453, 71467 }, + { 71724, 71738 }, + { 71984, 71989 }, + { 71991, 71992 }, + { 71995, 71998 }, + { 72000, 72000 }, + { 72002, 72003 }, + { 72145, 72151 }, + { 72154, 72160 }, + { 72164, 72164 }, + { 72193, 72202 }, + { 72243, 72249 }, + { 72251, 72254 }, + { 72263, 72263 }, + { 72273, 72283 }, + { 72330, 72345 }, + { 72751, 72758 }, + { 72760, 72767 }, + { 72850, 72871 }, + { 72873, 72886 }, + { 73009, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73029 }, + { 73031, 73031 }, + { 73098, 73102 }, + { 73104, 73105 }, + { 73107, 73111 }, + { 73459, 73462 }, + { 92912, 92916 }, + { 92976, 92982 }, + { 94031, 94031 }, + { 94033, 94087 }, + { 94095, 94098 }, + { 94180, 94180 }, + { 94192, 94193 }, + { 113821, 113822 }, { 118528, 118573 }, { 118576, 118598 }, - { 119141, 119145 }, - { 119149, 119154 }, - { 119163, 119170 }, - { 119173, 119179 }, - { 119210, 119213 }, - { 119362, 119364 }, - { 121344, 121398 }, - { 121403, 121452 }, - { 121461, 121461 }, - { 121476, 121476 }, - { 121499, 121503 }, - { 121505, 121519 }, - { 122880, 122886 }, - { 122888, 122904 }, - { 122907, 122913 }, - { 122915, 122916 }, - { 122918, 122922 }, - { 123184, 123190 }, + { 119141, 119145 }, + { 119149, 119154 }, + { 119163, 119170 }, + { 119173, 119179 }, + { 119210, 119213 }, + { 119362, 119364 }, + { 121344, 121398 }, + { 
121403, 121452 }, + { 121461, 121461 }, + { 121476, 121476 }, + { 121499, 121503 }, + { 121505, 121519 }, + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, + { 123184, 123190 }, { 123566, 123566 }, - { 123628, 123631 }, - { 125136, 125142 }, - { 125252, 125258 }, - { 917760, 917999 }, + { 123628, 123631 }, + { 125136, 125142 }, + { 125252, 125258 }, + { 917760, 917999 }, }; static const URange16 Mc_range16[] = { { 2307, 2307 }, @@ -3021,7 +3021,7 @@ static const URange16 Mc_range16[] = { { 7204, 7211 }, { 7220, 7221 }, { 7393, 7393 }, - { 7415, 7415 }, + { 7415, 7415 }, { 12334, 12335 }, { 43043, 43044 }, { 43047, 43047 }, @@ -3031,7 +3031,7 @@ static const URange16 Mc_range16[] = { { 43395, 43395 }, { 43444, 43445 }, { 43450, 43451 }, - { 43454, 43456 }, + { 43454, 43456 }, { 43567, 43568 }, { 43571, 43572 }, { 43597, 43597 }, @@ -3052,11 +3052,11 @@ static const URange32 Mc_range32[] = { { 69808, 69810 }, { 69815, 69816 }, { 69932, 69932 }, - { 69957, 69958 }, + { 69957, 69958 }, { 70018, 70018 }, { 70067, 70069 }, { 70079, 70080 }, - { 70094, 70094 }, + { 70094, 70094 }, { 70188, 70190 }, { 70194, 70195 }, { 70197, 70197 }, @@ -3068,9 +3068,9 @@ static const URange32 Mc_range32[] = { { 70475, 70477 }, { 70487, 70487 }, { 70498, 70499 }, - { 70709, 70711 }, - { 70720, 70721 }, - { 70725, 70725 }, + { 70709, 70711 }, + { 70720, 70721 }, + { 70725, 70725 }, { 70832, 70834 }, { 70841, 70841 }, { 70843, 70846 }, @@ -3086,40 +3086,40 @@ static const URange32 Mc_range32[] = { { 71350, 71350 }, { 71456, 71457 }, { 71462, 71462 }, - { 71724, 71726 }, - { 71736, 71736 }, - { 71984, 71989 }, - { 71991, 71992 }, - { 71997, 71997 }, - { 72000, 72000 }, - { 72002, 72002 }, - { 72145, 72147 }, - { 72156, 72159 }, - { 72164, 72164 }, - { 72249, 72249 }, - { 72279, 72280 }, - { 72343, 72343 }, - { 72751, 72751 }, - { 72766, 72766 }, - { 72873, 72873 }, - { 72881, 72881 }, - { 72884, 72884 }, - { 73098, 73102 }, - { 
73107, 73108 }, - { 73110, 73110 }, - { 73461, 73462 }, - { 94033, 94087 }, - { 94192, 94193 }, + { 71724, 71726 }, + { 71736, 71736 }, + { 71984, 71989 }, + { 71991, 71992 }, + { 71997, 71997 }, + { 72000, 72000 }, + { 72002, 72002 }, + { 72145, 72147 }, + { 72156, 72159 }, + { 72164, 72164 }, + { 72249, 72249 }, + { 72279, 72280 }, + { 72343, 72343 }, + { 72751, 72751 }, + { 72766, 72766 }, + { 72873, 72873 }, + { 72881, 72881 }, + { 72884, 72884 }, + { 73098, 73102 }, + { 73107, 73108 }, + { 73110, 73110 }, + { 73461, 73462 }, + { 94033, 94087 }, + { 94192, 94193 }, { 119141, 119142 }, { 119149, 119154 }, }; -static const URange16 Me_range16[] = { - { 1160, 1161 }, - { 6846, 6846 }, - { 8413, 8416 }, - { 8418, 8420 }, - { 42608, 42610 }, -}; +static const URange16 Me_range16[] = { + { 1160, 1161 }, + { 6846, 6846 }, + { 8413, 8416 }, + { 8418, 8420 }, + { 42608, 42610 }, +}; static const URange16 Mn_range16[] = { { 768, 879 }, { 1155, 1159 }, @@ -3139,7 +3139,7 @@ static const URange16 Mn_range16[] = { { 1840, 1866 }, { 1958, 1968 }, { 2027, 2035 }, - { 2045, 2045 }, + { 2045, 2045 }, { 2070, 2073 }, { 2075, 2083 }, { 2085, 2087 }, @@ -3159,7 +3159,7 @@ static const URange16 Mn_range16[] = { { 2497, 2500 }, { 2509, 2509 }, { 2530, 2531 }, - { 2558, 2558 }, + { 2558, 2558 }, { 2561, 2562 }, { 2620, 2620 }, { 2625, 2626 }, @@ -3174,19 +3174,19 @@ static const URange16 Mn_range16[] = { { 2759, 2760 }, { 2765, 2765 }, { 2786, 2787 }, - { 2810, 2815 }, + { 2810, 2815 }, { 2817, 2817 }, { 2876, 2876 }, { 2879, 2879 }, { 2881, 2884 }, { 2893, 2893 }, - { 2901, 2902 }, + { 2901, 2902 }, { 2914, 2915 }, { 2946, 2946 }, { 3008, 3008 }, { 3021, 3021 }, { 3072, 3072 }, - { 3076, 3076 }, + { 3076, 3076 }, { 3132, 3132 }, { 3134, 3136 }, { 3142, 3144 }, @@ -3199,12 +3199,12 @@ static const URange16 Mn_range16[] = { { 3270, 3270 }, { 3276, 3277 }, { 3298, 3299 }, - { 3328, 3329 }, - { 3387, 3388 }, + { 3328, 3329 }, + { 3387, 3388 }, { 3393, 3396 }, { 3405, 3405 }, { 3426, 
3427 }, - { 3457, 3457 }, + { 3457, 3457 }, { 3530, 3530 }, { 3538, 3540 }, { 3542, 3542 }, @@ -3212,7 +3212,7 @@ static const URange16 Mn_range16[] = { { 3636, 3642 }, { 3655, 3662 }, { 3761, 3761 }, - { 3764, 3772 }, + { 3764, 3772 }, { 3784, 3789 }, { 3864, 3865 }, { 3893, 3893 }, @@ -3247,7 +3247,7 @@ static const URange16 Mn_range16[] = { { 6109, 6109 }, { 6155, 6157 }, { 6159, 6159 }, - { 6277, 6278 }, + { 6277, 6278 }, { 6313, 6313 }, { 6432, 6434 }, { 6439, 6440 }, @@ -3303,16 +3303,16 @@ static const URange16 Mn_range16[] = { { 43014, 43014 }, { 43019, 43019 }, { 43045, 43046 }, - { 43052, 43052 }, - { 43204, 43205 }, + { 43052, 43052 }, + { 43204, 43205 }, { 43232, 43249 }, - { 43263, 43263 }, + { 43263, 43263 }, { 43302, 43309 }, { 43335, 43345 }, { 43392, 43394 }, { 43443, 43443 }, { 43446, 43449 }, - { 43452, 43453 }, + { 43452, 43453 }, { 43493, 43493 }, { 43561, 43566 }, { 43569, 43570 }, @@ -3344,9 +3344,9 @@ static const URange32 Mn_range32[] = { { 68152, 68154 }, { 68159, 68159 }, { 68325, 68326 }, - { 68900, 68903 }, - { 69291, 69292 }, - { 69446, 69456 }, + { 68900, 68903 }, + { 69291, 69292 }, + { 69446, 69456 }, { 69506, 69509 }, { 69633, 69633 }, { 69688, 69702 }, @@ -3362,23 +3362,23 @@ static const URange32 Mn_range32[] = { { 70003, 70003 }, { 70016, 70017 }, { 70070, 70078 }, - { 70089, 70092 }, - { 70095, 70095 }, + { 70089, 70092 }, + { 70095, 70095 }, { 70191, 70193 }, { 70196, 70196 }, { 70198, 70199 }, - { 70206, 70206 }, + { 70206, 70206 }, { 70367, 70367 }, { 70371, 70378 }, { 70400, 70401 }, - { 70459, 70460 }, + { 70459, 70460 }, { 70464, 70464 }, { 70502, 70508 }, { 70512, 70516 }, - { 70712, 70719 }, - { 70722, 70724 }, - { 70726, 70726 }, - { 70750, 70750 }, + { 70712, 70719 }, + { 70722, 70724 }, + { 70726, 70726 }, + { 70750, 70750 }, { 70835, 70840 }, { 70842, 70842 }, { 70847, 70848 }, @@ -3397,43 +3397,43 @@ static const URange32 Mn_range32[] = { { 71453, 71455 }, { 71458, 71461 }, { 71463, 71467 }, - { 71727, 71735 }, - { 
71737, 71738 }, - { 71995, 71996 }, - { 71998, 71998 }, - { 72003, 72003 }, - { 72148, 72151 }, - { 72154, 72155 }, - { 72160, 72160 }, - { 72193, 72202 }, - { 72243, 72248 }, - { 72251, 72254 }, - { 72263, 72263 }, - { 72273, 72278 }, - { 72281, 72283 }, - { 72330, 72342 }, - { 72344, 72345 }, - { 72752, 72758 }, - { 72760, 72765 }, - { 72767, 72767 }, - { 72850, 72871 }, - { 72874, 72880 }, - { 72882, 72883 }, - { 72885, 72886 }, - { 73009, 73014 }, - { 73018, 73018 }, - { 73020, 73021 }, - { 73023, 73029 }, - { 73031, 73031 }, - { 73104, 73105 }, - { 73109, 73109 }, - { 73111, 73111 }, - { 73459, 73460 }, + { 71727, 71735 }, + { 71737, 71738 }, + { 71995, 71996 }, + { 71998, 71998 }, + { 72003, 72003 }, + { 72148, 72151 }, + { 72154, 72155 }, + { 72160, 72160 }, + { 72193, 72202 }, + { 72243, 72248 }, + { 72251, 72254 }, + { 72263, 72263 }, + { 72273, 72278 }, + { 72281, 72283 }, + { 72330, 72342 }, + { 72344, 72345 }, + { 72752, 72758 }, + { 72760, 72765 }, + { 72767, 72767 }, + { 72850, 72871 }, + { 72874, 72880 }, + { 72882, 72883 }, + { 72885, 72886 }, + { 73009, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73029 }, + { 73031, 73031 }, + { 73104, 73105 }, + { 73109, 73109 }, + { 73111, 73111 }, + { 73459, 73460 }, { 92912, 92916 }, { 92976, 92982 }, - { 94031, 94031 }, + { 94031, 94031 }, { 94095, 94098 }, - { 94180, 94180 }, + { 94180, 94180 }, { 113821, 113822 }, { 118528, 118573 }, { 118576, 118598 }, @@ -3448,16 +3448,16 @@ static const URange32 Mn_range32[] = { { 121476, 121476 }, { 121499, 121503 }, { 121505, 121519 }, - { 122880, 122886 }, - { 122888, 122904 }, - { 122907, 122913 }, - { 122915, 122916 }, - { 122918, 122922 }, - { 123184, 123190 }, + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, + { 123184, 123190 }, { 123566, 123566 }, - { 123628, 123631 }, + { 123628, 123631 }, { 125136, 125142 }, - { 125252, 125258 }, + { 125252, 125258 }, { 917760, 917999 }, }; static 
const URange16 N_range16[] = { @@ -3479,8 +3479,8 @@ static const URange16 N_range16[] = { { 3174, 3183 }, { 3192, 3198 }, { 3302, 3311 }, - { 3416, 3422 }, - { 3430, 3448 }, + { 3416, 3422 }, + { 3430, 3448 }, { 3558, 3567 }, { 3664, 3673 }, { 3792, 3801 }, @@ -3547,7 +3547,7 @@ static const URange32 N_range32[] = { { 68028, 68029 }, { 68032, 68047 }, { 68050, 68095 }, - { 68160, 68168 }, + { 68160, 68168 }, { 68221, 68222 }, { 68253, 68255 }, { 68331, 68335 }, @@ -3555,206 +3555,206 @@ static const URange32 N_range32[] = { { 68472, 68479 }, { 68521, 68527 }, { 68858, 68863 }, - { 68912, 68921 }, + { 68912, 68921 }, { 69216, 69246 }, - { 69405, 69414 }, - { 69457, 69460 }, - { 69573, 69579 }, + { 69405, 69414 }, + { 69457, 69460 }, + { 69573, 69579 }, { 69714, 69743 }, { 69872, 69881 }, { 69942, 69951 }, { 70096, 70105 }, { 70113, 70132 }, { 70384, 70393 }, - { 70736, 70745 }, + { 70736, 70745 }, { 70864, 70873 }, { 71248, 71257 }, { 71360, 71369 }, { 71472, 71483 }, { 71904, 71922 }, - { 72016, 72025 }, - { 72784, 72812 }, - { 73040, 73049 }, - { 73120, 73129 }, - { 73664, 73684 }, + { 72016, 72025 }, + { 72784, 72812 }, + { 73040, 73049 }, + { 73120, 73129 }, + { 73664, 73684 }, { 74752, 74862 }, { 92768, 92777 }, { 92864, 92873 }, { 93008, 93017 }, { 93019, 93025 }, - { 93824, 93846 }, - { 119520, 119539 }, - { 119648, 119672 }, + { 93824, 93846 }, + { 119520, 119539 }, + { 119648, 119672 }, { 120782, 120831 }, - { 123200, 123209 }, - { 123632, 123641 }, + { 123200, 123209 }, + { 123632, 123641 }, { 125127, 125135 }, - { 125264, 125273 }, - { 126065, 126123 }, - { 126125, 126127 }, - { 126129, 126132 }, - { 126209, 126253 }, - { 126255, 126269 }, + { 125264, 125273 }, + { 126065, 126123 }, + { 126125, 126127 }, + { 126129, 126132 }, + { 126209, 126253 }, + { 126255, 126269 }, { 127232, 127244 }, - { 130032, 130041 }, -}; -static const URange16 Nd_range16[] = { - { 48, 57 }, - { 1632, 1641 }, - { 1776, 1785 }, - { 1984, 1993 }, - { 2406, 2415 }, - { 2534, 2543 
}, - { 2662, 2671 }, - { 2790, 2799 }, - { 2918, 2927 }, - { 3046, 3055 }, - { 3174, 3183 }, - { 3302, 3311 }, - { 3430, 3439 }, - { 3558, 3567 }, - { 3664, 3673 }, - { 3792, 3801 }, - { 3872, 3881 }, - { 4160, 4169 }, - { 4240, 4249 }, - { 6112, 6121 }, - { 6160, 6169 }, - { 6470, 6479 }, - { 6608, 6617 }, - { 6784, 6793 }, - { 6800, 6809 }, - { 6992, 7001 }, - { 7088, 7097 }, - { 7232, 7241 }, - { 7248, 7257 }, - { 42528, 42537 }, - { 43216, 43225 }, - { 43264, 43273 }, - { 43472, 43481 }, - { 43504, 43513 }, - { 43600, 43609 }, - { 44016, 44025 }, - { 65296, 65305 }, -}; -static const URange32 Nd_range32[] = { - { 66720, 66729 }, - { 68912, 68921 }, - { 69734, 69743 }, - { 69872, 69881 }, - { 69942, 69951 }, - { 70096, 70105 }, - { 70384, 70393 }, - { 70736, 70745 }, - { 70864, 70873 }, - { 71248, 71257 }, - { 71360, 71369 }, - { 71472, 71481 }, - { 71904, 71913 }, - { 72016, 72025 }, - { 72784, 72793 }, - { 73040, 73049 }, - { 73120, 73129 }, - { 92768, 92777 }, + { 130032, 130041 }, +}; +static const URange16 Nd_range16[] = { + { 48, 57 }, + { 1632, 1641 }, + { 1776, 1785 }, + { 1984, 1993 }, + { 2406, 2415 }, + { 2534, 2543 }, + { 2662, 2671 }, + { 2790, 2799 }, + { 2918, 2927 }, + { 3046, 3055 }, + { 3174, 3183 }, + { 3302, 3311 }, + { 3430, 3439 }, + { 3558, 3567 }, + { 3664, 3673 }, + { 3792, 3801 }, + { 3872, 3881 }, + { 4160, 4169 }, + { 4240, 4249 }, + { 6112, 6121 }, + { 6160, 6169 }, + { 6470, 6479 }, + { 6608, 6617 }, + { 6784, 6793 }, + { 6800, 6809 }, + { 6992, 7001 }, + { 7088, 7097 }, + { 7232, 7241 }, + { 7248, 7257 }, + { 42528, 42537 }, + { 43216, 43225 }, + { 43264, 43273 }, + { 43472, 43481 }, + { 43504, 43513 }, + { 43600, 43609 }, + { 44016, 44025 }, + { 65296, 65305 }, +}; +static const URange32 Nd_range32[] = { + { 66720, 66729 }, + { 68912, 68921 }, + { 69734, 69743 }, + { 69872, 69881 }, + { 69942, 69951 }, + { 70096, 70105 }, + { 70384, 70393 }, + { 70736, 70745 }, + { 70864, 70873 }, + { 71248, 71257 }, + { 71360, 71369 }, + { 71472, 
71481 }, + { 71904, 71913 }, + { 72016, 72025 }, + { 72784, 72793 }, + { 73040, 73049 }, + { 73120, 73129 }, + { 92768, 92777 }, { 92864, 92873 }, - { 93008, 93017 }, - { 120782, 120831 }, - { 123200, 123209 }, - { 123632, 123641 }, - { 125264, 125273 }, - { 130032, 130041 }, -}; -static const URange16 Nl_range16[] = { - { 5870, 5872 }, - { 8544, 8578 }, - { 8581, 8584 }, - { 12295, 12295 }, - { 12321, 12329 }, - { 12344, 12346 }, - { 42726, 42735 }, -}; -static const URange32 Nl_range32[] = { - { 65856, 65908 }, - { 66369, 66369 }, - { 66378, 66378 }, - { 66513, 66517 }, - { 74752, 74862 }, -}; -static const URange16 No_range16[] = { - { 178, 179 }, - { 185, 185 }, - { 188, 190 }, - { 2548, 2553 }, - { 2930, 2935 }, - { 3056, 3058 }, - { 3192, 3198 }, - { 3416, 3422 }, - { 3440, 3448 }, - { 3882, 3891 }, - { 4969, 4988 }, - { 6128, 6137 }, - { 6618, 6618 }, - { 8304, 8304 }, - { 8308, 8313 }, - { 8320, 8329 }, - { 8528, 8543 }, - { 8585, 8585 }, - { 9312, 9371 }, - { 9450, 9471 }, - { 10102, 10131 }, - { 11517, 11517 }, - { 12690, 12693 }, - { 12832, 12841 }, - { 12872, 12879 }, - { 12881, 12895 }, - { 12928, 12937 }, - { 12977, 12991 }, - { 43056, 43061 }, -}; -static const URange32 No_range32[] = { - { 65799, 65843 }, - { 65909, 65912 }, - { 65930, 65931 }, - { 66273, 66299 }, - { 66336, 66339 }, - { 67672, 67679 }, - { 67705, 67711 }, - { 67751, 67759 }, - { 67835, 67839 }, - { 67862, 67867 }, - { 68028, 68029 }, - { 68032, 68047 }, - { 68050, 68095 }, - { 68160, 68168 }, - { 68221, 68222 }, - { 68253, 68255 }, - { 68331, 68335 }, - { 68440, 68447 }, - { 68472, 68479 }, - { 68521, 68527 }, - { 68858, 68863 }, - { 69216, 69246 }, - { 69405, 69414 }, - { 69457, 69460 }, - { 69573, 69579 }, - { 69714, 69733 }, - { 70113, 70132 }, - { 71482, 71483 }, - { 71914, 71922 }, - { 72794, 72812 }, - { 73664, 73684 }, - { 93019, 93025 }, - { 93824, 93846 }, - { 119520, 119539 }, - { 119648, 119672 }, - { 125127, 125135 }, - { 126065, 126123 }, - { 126125, 126127 }, - { 
126129, 126132 }, - { 126209, 126253 }, - { 126255, 126269 }, - { 127232, 127244 }, -}; + { 93008, 93017 }, + { 120782, 120831 }, + { 123200, 123209 }, + { 123632, 123641 }, + { 125264, 125273 }, + { 130032, 130041 }, +}; +static const URange16 Nl_range16[] = { + { 5870, 5872 }, + { 8544, 8578 }, + { 8581, 8584 }, + { 12295, 12295 }, + { 12321, 12329 }, + { 12344, 12346 }, + { 42726, 42735 }, +}; +static const URange32 Nl_range32[] = { + { 65856, 65908 }, + { 66369, 66369 }, + { 66378, 66378 }, + { 66513, 66517 }, + { 74752, 74862 }, +}; +static const URange16 No_range16[] = { + { 178, 179 }, + { 185, 185 }, + { 188, 190 }, + { 2548, 2553 }, + { 2930, 2935 }, + { 3056, 3058 }, + { 3192, 3198 }, + { 3416, 3422 }, + { 3440, 3448 }, + { 3882, 3891 }, + { 4969, 4988 }, + { 6128, 6137 }, + { 6618, 6618 }, + { 8304, 8304 }, + { 8308, 8313 }, + { 8320, 8329 }, + { 8528, 8543 }, + { 8585, 8585 }, + { 9312, 9371 }, + { 9450, 9471 }, + { 10102, 10131 }, + { 11517, 11517 }, + { 12690, 12693 }, + { 12832, 12841 }, + { 12872, 12879 }, + { 12881, 12895 }, + { 12928, 12937 }, + { 12977, 12991 }, + { 43056, 43061 }, +}; +static const URange32 No_range32[] = { + { 65799, 65843 }, + { 65909, 65912 }, + { 65930, 65931 }, + { 66273, 66299 }, + { 66336, 66339 }, + { 67672, 67679 }, + { 67705, 67711 }, + { 67751, 67759 }, + { 67835, 67839 }, + { 67862, 67867 }, + { 68028, 68029 }, + { 68032, 68047 }, + { 68050, 68095 }, + { 68160, 68168 }, + { 68221, 68222 }, + { 68253, 68255 }, + { 68331, 68335 }, + { 68440, 68447 }, + { 68472, 68479 }, + { 68521, 68527 }, + { 68858, 68863 }, + { 69216, 69246 }, + { 69405, 69414 }, + { 69457, 69460 }, + { 69573, 69579 }, + { 69714, 69733 }, + { 70113, 70132 }, + { 71482, 71483 }, + { 71914, 71922 }, + { 72794, 72812 }, + { 73664, 73684 }, + { 93019, 93025 }, + { 93824, 93846 }, + { 119520, 119539 }, + { 119648, 119672 }, + { 125127, 125135 }, + { 126065, 126123 }, + { 126125, 126127 }, + { 126129, 126132 }, + { 126209, 126253 }, + { 126255, 126269 }, + 
{ 127232, 127244 }, +}; static const URange16 P_range16[] = { { 33, 35 }, { 37, 42 }, @@ -3792,11 +3792,11 @@ static const URange16 P_range16[] = { { 2142, 2142 }, { 2404, 2405 }, { 2416, 2416 }, - { 2557, 2557 }, - { 2678, 2678 }, + { 2557, 2557 }, + { 2678, 2678 }, { 2800, 2800 }, - { 3191, 3191 }, - { 3204, 3204 }, + { 3191, 3191 }, + { 3204, 3204 }, { 3572, 3572 }, { 3663, 3663 }, { 3674, 3675 }, @@ -3810,7 +3810,7 @@ static const URange16 P_range16[] = { { 4347, 4347 }, { 4960, 4968 }, { 5120, 5120 }, - { 5742, 5742 }, + { 5742, 5742 }, { 5787, 5788 }, { 5867, 5869 }, { 5941, 5942 }, @@ -3846,7 +3846,7 @@ static const URange16 P_range16[] = { { 11518, 11519 }, { 11632, 11632 }, { 11776, 11822 }, - { 11824, 11855 }, + { 11824, 11855 }, { 11858, 11869 }, { 12289, 12291 }, { 12296, 12305 }, @@ -3903,455 +3903,455 @@ static const URange32 P_range32[] = { { 68336, 68342 }, { 68409, 68415 }, { 68505, 68508 }, - { 69293, 69293 }, - { 69461, 69465 }, + { 69293, 69293 }, + { 69461, 69465 }, { 69510, 69513 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, { 69952, 69955 }, { 70004, 70005 }, - { 70085, 70088 }, + { 70085, 70088 }, { 70093, 70093 }, { 70107, 70107 }, { 70109, 70111 }, { 70200, 70205 }, { 70313, 70313 }, - { 70731, 70735 }, - { 70746, 70747 }, - { 70749, 70749 }, + { 70731, 70735 }, + { 70746, 70747 }, + { 70749, 70749 }, { 70854, 70854 }, { 71105, 71127 }, { 71233, 71235 }, - { 71264, 71276 }, + { 71264, 71276 }, { 71353, 71353 }, { 71484, 71486 }, - { 71739, 71739 }, - { 72004, 72006 }, - { 72162, 72162 }, - { 72255, 72262 }, - { 72346, 72348 }, - { 72350, 72354 }, - { 72769, 72773 }, - { 72816, 72817 }, - { 73463, 73464 }, - { 73727, 73727 }, + { 71739, 71739 }, + { 72004, 72006 }, + { 72162, 72162 }, + { 72255, 72262 }, + { 72346, 72348 }, + { 72350, 72354 }, + { 72769, 72773 }, + { 72816, 72817 }, + { 73463, 73464 }, + { 73727, 73727 }, { 74864, 74868 }, { 77809, 77810 }, { 92782, 92783 }, { 92917, 92917 }, { 92983, 92987 }, { 92996, 92996 }, - 
{ 93847, 93850 }, - { 94178, 94178 }, + { 93847, 93850 }, + { 94178, 94178 }, { 113823, 113823 }, { 121479, 121483 }, - { 125278, 125279 }, -}; -static const URange16 Pc_range16[] = { - { 95, 95 }, - { 8255, 8256 }, - { 8276, 8276 }, - { 65075, 65076 }, - { 65101, 65103 }, - { 65343, 65343 }, -}; -static const URange16 Pd_range16[] = { - { 45, 45 }, - { 1418, 1418 }, - { 1470, 1470 }, - { 5120, 5120 }, - { 6150, 6150 }, - { 8208, 8213 }, - { 11799, 11799 }, - { 11802, 11802 }, - { 11834, 11835 }, - { 11840, 11840 }, + { 125278, 125279 }, +}; +static const URange16 Pc_range16[] = { + { 95, 95 }, + { 8255, 8256 }, + { 8276, 8276 }, + { 65075, 65076 }, + { 65101, 65103 }, + { 65343, 65343 }, +}; +static const URange16 Pd_range16[] = { + { 45, 45 }, + { 1418, 1418 }, + { 1470, 1470 }, + { 5120, 5120 }, + { 6150, 6150 }, + { 8208, 8213 }, + { 11799, 11799 }, + { 11802, 11802 }, + { 11834, 11835 }, + { 11840, 11840 }, { 11869, 11869 }, - { 12316, 12316 }, - { 12336, 12336 }, - { 12448, 12448 }, - { 65073, 65074 }, - { 65112, 65112 }, - { 65123, 65123 }, - { 65293, 65293 }, -}; -static const URange32 Pd_range32[] = { - { 69293, 69293 }, -}; -static const URange16 Pe_range16[] = { - { 41, 41 }, - { 93, 93 }, - { 125, 125 }, - { 3899, 3899 }, - { 3901, 3901 }, - { 5788, 5788 }, - { 8262, 8262 }, - { 8318, 8318 }, - { 8334, 8334 }, - { 8969, 8969 }, - { 8971, 8971 }, - { 9002, 9002 }, - { 10089, 10089 }, - { 10091, 10091 }, - { 10093, 10093 }, - { 10095, 10095 }, - { 10097, 10097 }, - { 10099, 10099 }, - { 10101, 10101 }, - { 10182, 10182 }, - { 10215, 10215 }, - { 10217, 10217 }, - { 10219, 10219 }, - { 10221, 10221 }, - { 10223, 10223 }, - { 10628, 10628 }, - { 10630, 10630 }, - { 10632, 10632 }, - { 10634, 10634 }, - { 10636, 10636 }, - { 10638, 10638 }, - { 10640, 10640 }, - { 10642, 10642 }, - { 10644, 10644 }, - { 10646, 10646 }, - { 10648, 10648 }, - { 10713, 10713 }, - { 10715, 10715 }, - { 10749, 10749 }, - { 11811, 11811 }, - { 11813, 11813 }, - { 11815, 11815 }, - 
{ 11817, 11817 }, + { 12316, 12316 }, + { 12336, 12336 }, + { 12448, 12448 }, + { 65073, 65074 }, + { 65112, 65112 }, + { 65123, 65123 }, + { 65293, 65293 }, +}; +static const URange32 Pd_range32[] = { + { 69293, 69293 }, +}; +static const URange16 Pe_range16[] = { + { 41, 41 }, + { 93, 93 }, + { 125, 125 }, + { 3899, 3899 }, + { 3901, 3901 }, + { 5788, 5788 }, + { 8262, 8262 }, + { 8318, 8318 }, + { 8334, 8334 }, + { 8969, 8969 }, + { 8971, 8971 }, + { 9002, 9002 }, + { 10089, 10089 }, + { 10091, 10091 }, + { 10093, 10093 }, + { 10095, 10095 }, + { 10097, 10097 }, + { 10099, 10099 }, + { 10101, 10101 }, + { 10182, 10182 }, + { 10215, 10215 }, + { 10217, 10217 }, + { 10219, 10219 }, + { 10221, 10221 }, + { 10223, 10223 }, + { 10628, 10628 }, + { 10630, 10630 }, + { 10632, 10632 }, + { 10634, 10634 }, + { 10636, 10636 }, + { 10638, 10638 }, + { 10640, 10640 }, + { 10642, 10642 }, + { 10644, 10644 }, + { 10646, 10646 }, + { 10648, 10648 }, + { 10713, 10713 }, + { 10715, 10715 }, + { 10749, 10749 }, + { 11811, 11811 }, + { 11813, 11813 }, + { 11815, 11815 }, + { 11817, 11817 }, { 11862, 11862 }, { 11864, 11864 }, { 11866, 11866 }, { 11868, 11868 }, - { 12297, 12297 }, - { 12299, 12299 }, - { 12301, 12301 }, - { 12303, 12303 }, - { 12305, 12305 }, - { 12309, 12309 }, - { 12311, 12311 }, - { 12313, 12313 }, - { 12315, 12315 }, - { 12318, 12319 }, - { 64830, 64830 }, - { 65048, 65048 }, - { 65078, 65078 }, - { 65080, 65080 }, - { 65082, 65082 }, - { 65084, 65084 }, - { 65086, 65086 }, - { 65088, 65088 }, - { 65090, 65090 }, - { 65092, 65092 }, - { 65096, 65096 }, - { 65114, 65114 }, - { 65116, 65116 }, - { 65118, 65118 }, - { 65289, 65289 }, - { 65341, 65341 }, - { 65373, 65373 }, - { 65376, 65376 }, - { 65379, 65379 }, -}; -static const URange16 Pf_range16[] = { - { 187, 187 }, - { 8217, 8217 }, - { 8221, 8221 }, - { 8250, 8250 }, - { 11779, 11779 }, - { 11781, 11781 }, - { 11786, 11786 }, - { 11789, 11789 }, - { 11805, 11805 }, - { 11809, 11809 }, -}; -static const 
URange16 Pi_range16[] = { - { 171, 171 }, - { 8216, 8216 }, - { 8219, 8220 }, - { 8223, 8223 }, - { 8249, 8249 }, - { 11778, 11778 }, - { 11780, 11780 }, - { 11785, 11785 }, - { 11788, 11788 }, - { 11804, 11804 }, - { 11808, 11808 }, -}; -static const URange16 Po_range16[] = { - { 33, 35 }, - { 37, 39 }, - { 42, 42 }, - { 44, 44 }, - { 46, 47 }, - { 58, 59 }, - { 63, 64 }, - { 92, 92 }, - { 161, 161 }, - { 167, 167 }, - { 182, 183 }, - { 191, 191 }, - { 894, 894 }, - { 903, 903 }, - { 1370, 1375 }, - { 1417, 1417 }, - { 1472, 1472 }, - { 1475, 1475 }, - { 1478, 1478 }, - { 1523, 1524 }, - { 1545, 1546 }, - { 1548, 1549 }, - { 1563, 1563 }, + { 12297, 12297 }, + { 12299, 12299 }, + { 12301, 12301 }, + { 12303, 12303 }, + { 12305, 12305 }, + { 12309, 12309 }, + { 12311, 12311 }, + { 12313, 12313 }, + { 12315, 12315 }, + { 12318, 12319 }, + { 64830, 64830 }, + { 65048, 65048 }, + { 65078, 65078 }, + { 65080, 65080 }, + { 65082, 65082 }, + { 65084, 65084 }, + { 65086, 65086 }, + { 65088, 65088 }, + { 65090, 65090 }, + { 65092, 65092 }, + { 65096, 65096 }, + { 65114, 65114 }, + { 65116, 65116 }, + { 65118, 65118 }, + { 65289, 65289 }, + { 65341, 65341 }, + { 65373, 65373 }, + { 65376, 65376 }, + { 65379, 65379 }, +}; +static const URange16 Pf_range16[] = { + { 187, 187 }, + { 8217, 8217 }, + { 8221, 8221 }, + { 8250, 8250 }, + { 11779, 11779 }, + { 11781, 11781 }, + { 11786, 11786 }, + { 11789, 11789 }, + { 11805, 11805 }, + { 11809, 11809 }, +}; +static const URange16 Pi_range16[] = { + { 171, 171 }, + { 8216, 8216 }, + { 8219, 8220 }, + { 8223, 8223 }, + { 8249, 8249 }, + { 11778, 11778 }, + { 11780, 11780 }, + { 11785, 11785 }, + { 11788, 11788 }, + { 11804, 11804 }, + { 11808, 11808 }, +}; +static const URange16 Po_range16[] = { + { 33, 35 }, + { 37, 39 }, + { 42, 42 }, + { 44, 44 }, + { 46, 47 }, + { 58, 59 }, + { 63, 64 }, + { 92, 92 }, + { 161, 161 }, + { 167, 167 }, + { 182, 183 }, + { 191, 191 }, + { 894, 894 }, + { 903, 903 }, + { 1370, 1375 }, + { 1417, 1417 
}, + { 1472, 1472 }, + { 1475, 1475 }, + { 1478, 1478 }, + { 1523, 1524 }, + { 1545, 1546 }, + { 1548, 1549 }, + { 1563, 1563 }, { 1565, 1567 }, - { 1642, 1645 }, - { 1748, 1748 }, - { 1792, 1805 }, - { 2039, 2041 }, - { 2096, 2110 }, - { 2142, 2142 }, - { 2404, 2405 }, - { 2416, 2416 }, - { 2557, 2557 }, - { 2678, 2678 }, - { 2800, 2800 }, - { 3191, 3191 }, - { 3204, 3204 }, - { 3572, 3572 }, - { 3663, 3663 }, - { 3674, 3675 }, - { 3844, 3858 }, - { 3860, 3860 }, - { 3973, 3973 }, - { 4048, 4052 }, - { 4057, 4058 }, - { 4170, 4175 }, - { 4347, 4347 }, - { 4960, 4968 }, - { 5742, 5742 }, - { 5867, 5869 }, - { 5941, 5942 }, - { 6100, 6102 }, - { 6104, 6106 }, - { 6144, 6149 }, - { 6151, 6154 }, - { 6468, 6469 }, - { 6686, 6687 }, - { 6816, 6822 }, - { 6824, 6829 }, - { 7002, 7008 }, + { 1642, 1645 }, + { 1748, 1748 }, + { 1792, 1805 }, + { 2039, 2041 }, + { 2096, 2110 }, + { 2142, 2142 }, + { 2404, 2405 }, + { 2416, 2416 }, + { 2557, 2557 }, + { 2678, 2678 }, + { 2800, 2800 }, + { 3191, 3191 }, + { 3204, 3204 }, + { 3572, 3572 }, + { 3663, 3663 }, + { 3674, 3675 }, + { 3844, 3858 }, + { 3860, 3860 }, + { 3973, 3973 }, + { 4048, 4052 }, + { 4057, 4058 }, + { 4170, 4175 }, + { 4347, 4347 }, + { 4960, 4968 }, + { 5742, 5742 }, + { 5867, 5869 }, + { 5941, 5942 }, + { 6100, 6102 }, + { 6104, 6106 }, + { 6144, 6149 }, + { 6151, 6154 }, + { 6468, 6469 }, + { 6686, 6687 }, + { 6816, 6822 }, + { 6824, 6829 }, + { 7002, 7008 }, { 7037, 7038 }, - { 7164, 7167 }, - { 7227, 7231 }, - { 7294, 7295 }, - { 7360, 7367 }, - { 7379, 7379 }, - { 8214, 8215 }, - { 8224, 8231 }, - { 8240, 8248 }, - { 8251, 8254 }, - { 8257, 8259 }, - { 8263, 8273 }, - { 8275, 8275 }, - { 8277, 8286 }, - { 11513, 11516 }, - { 11518, 11519 }, - { 11632, 11632 }, - { 11776, 11777 }, - { 11782, 11784 }, - { 11787, 11787 }, - { 11790, 11798 }, - { 11800, 11801 }, - { 11803, 11803 }, - { 11806, 11807 }, - { 11818, 11822 }, - { 11824, 11833 }, - { 11836, 11839 }, - { 11841, 11841 }, - { 11843, 11855 }, + { 
7164, 7167 }, + { 7227, 7231 }, + { 7294, 7295 }, + { 7360, 7367 }, + { 7379, 7379 }, + { 8214, 8215 }, + { 8224, 8231 }, + { 8240, 8248 }, + { 8251, 8254 }, + { 8257, 8259 }, + { 8263, 8273 }, + { 8275, 8275 }, + { 8277, 8286 }, + { 11513, 11516 }, + { 11518, 11519 }, + { 11632, 11632 }, + { 11776, 11777 }, + { 11782, 11784 }, + { 11787, 11787 }, + { 11790, 11798 }, + { 11800, 11801 }, + { 11803, 11803 }, + { 11806, 11807 }, + { 11818, 11822 }, + { 11824, 11833 }, + { 11836, 11839 }, + { 11841, 11841 }, + { 11843, 11855 }, { 11858, 11860 }, - { 12289, 12291 }, - { 12349, 12349 }, - { 12539, 12539 }, - { 42238, 42239 }, - { 42509, 42511 }, - { 42611, 42611 }, - { 42622, 42622 }, - { 42738, 42743 }, - { 43124, 43127 }, - { 43214, 43215 }, - { 43256, 43258 }, - { 43260, 43260 }, - { 43310, 43311 }, - { 43359, 43359 }, - { 43457, 43469 }, - { 43486, 43487 }, - { 43612, 43615 }, - { 43742, 43743 }, - { 43760, 43761 }, - { 44011, 44011 }, - { 65040, 65046 }, - { 65049, 65049 }, - { 65072, 65072 }, - { 65093, 65094 }, - { 65097, 65100 }, - { 65104, 65106 }, - { 65108, 65111 }, - { 65119, 65121 }, - { 65128, 65128 }, - { 65130, 65131 }, - { 65281, 65283 }, - { 65285, 65287 }, - { 65290, 65290 }, - { 65292, 65292 }, - { 65294, 65295 }, - { 65306, 65307 }, - { 65311, 65312 }, - { 65340, 65340 }, - { 65377, 65377 }, - { 65380, 65381 }, -}; -static const URange32 Po_range32[] = { - { 65792, 65794 }, - { 66463, 66463 }, - { 66512, 66512 }, - { 66927, 66927 }, - { 67671, 67671 }, - { 67871, 67871 }, - { 67903, 67903 }, - { 68176, 68184 }, - { 68223, 68223 }, - { 68336, 68342 }, - { 68409, 68415 }, - { 68505, 68508 }, - { 69461, 69465 }, + { 12289, 12291 }, + { 12349, 12349 }, + { 12539, 12539 }, + { 42238, 42239 }, + { 42509, 42511 }, + { 42611, 42611 }, + { 42622, 42622 }, + { 42738, 42743 }, + { 43124, 43127 }, + { 43214, 43215 }, + { 43256, 43258 }, + { 43260, 43260 }, + { 43310, 43311 }, + { 43359, 43359 }, + { 43457, 43469 }, + { 43486, 43487 }, + { 43612, 43615 }, + { 
43742, 43743 }, + { 43760, 43761 }, + { 44011, 44011 }, + { 65040, 65046 }, + { 65049, 65049 }, + { 65072, 65072 }, + { 65093, 65094 }, + { 65097, 65100 }, + { 65104, 65106 }, + { 65108, 65111 }, + { 65119, 65121 }, + { 65128, 65128 }, + { 65130, 65131 }, + { 65281, 65283 }, + { 65285, 65287 }, + { 65290, 65290 }, + { 65292, 65292 }, + { 65294, 65295 }, + { 65306, 65307 }, + { 65311, 65312 }, + { 65340, 65340 }, + { 65377, 65377 }, + { 65380, 65381 }, +}; +static const URange32 Po_range32[] = { + { 65792, 65794 }, + { 66463, 66463 }, + { 66512, 66512 }, + { 66927, 66927 }, + { 67671, 67671 }, + { 67871, 67871 }, + { 67903, 67903 }, + { 68176, 68184 }, + { 68223, 68223 }, + { 68336, 68342 }, + { 68409, 68415 }, + { 68505, 68508 }, + { 69461, 69465 }, { 69510, 69513 }, - { 69703, 69709 }, - { 69819, 69820 }, - { 69822, 69825 }, - { 69952, 69955 }, - { 70004, 70005 }, - { 70085, 70088 }, - { 70093, 70093 }, - { 70107, 70107 }, - { 70109, 70111 }, - { 70200, 70205 }, - { 70313, 70313 }, - { 70731, 70735 }, - { 70746, 70747 }, - { 70749, 70749 }, - { 70854, 70854 }, - { 71105, 71127 }, - { 71233, 71235 }, - { 71264, 71276 }, + { 69703, 69709 }, + { 69819, 69820 }, + { 69822, 69825 }, + { 69952, 69955 }, + { 70004, 70005 }, + { 70085, 70088 }, + { 70093, 70093 }, + { 70107, 70107 }, + { 70109, 70111 }, + { 70200, 70205 }, + { 70313, 70313 }, + { 70731, 70735 }, + { 70746, 70747 }, + { 70749, 70749 }, + { 70854, 70854 }, + { 71105, 71127 }, + { 71233, 71235 }, + { 71264, 71276 }, { 71353, 71353 }, - { 71484, 71486 }, - { 71739, 71739 }, - { 72004, 72006 }, - { 72162, 72162 }, - { 72255, 72262 }, - { 72346, 72348 }, - { 72350, 72354 }, - { 72769, 72773 }, - { 72816, 72817 }, - { 73463, 73464 }, - { 73727, 73727 }, - { 74864, 74868 }, + { 71484, 71486 }, + { 71739, 71739 }, + { 72004, 72006 }, + { 72162, 72162 }, + { 72255, 72262 }, + { 72346, 72348 }, + { 72350, 72354 }, + { 72769, 72773 }, + { 72816, 72817 }, + { 73463, 73464 }, + { 73727, 73727 }, + { 74864, 74868 }, { 
77809, 77810 }, - { 92782, 92783 }, - { 92917, 92917 }, - { 92983, 92987 }, - { 92996, 92996 }, - { 93847, 93850 }, - { 94178, 94178 }, - { 113823, 113823 }, - { 121479, 121483 }, - { 125278, 125279 }, -}; -static const URange16 Ps_range16[] = { - { 40, 40 }, - { 91, 91 }, - { 123, 123 }, - { 3898, 3898 }, - { 3900, 3900 }, - { 5787, 5787 }, - { 8218, 8218 }, - { 8222, 8222 }, - { 8261, 8261 }, - { 8317, 8317 }, - { 8333, 8333 }, - { 8968, 8968 }, - { 8970, 8970 }, - { 9001, 9001 }, - { 10088, 10088 }, - { 10090, 10090 }, - { 10092, 10092 }, - { 10094, 10094 }, - { 10096, 10096 }, - { 10098, 10098 }, - { 10100, 10100 }, - { 10181, 10181 }, - { 10214, 10214 }, - { 10216, 10216 }, - { 10218, 10218 }, - { 10220, 10220 }, - { 10222, 10222 }, - { 10627, 10627 }, - { 10629, 10629 }, - { 10631, 10631 }, - { 10633, 10633 }, - { 10635, 10635 }, - { 10637, 10637 }, - { 10639, 10639 }, - { 10641, 10641 }, - { 10643, 10643 }, - { 10645, 10645 }, - { 10647, 10647 }, - { 10712, 10712 }, - { 10714, 10714 }, - { 10748, 10748 }, - { 11810, 11810 }, - { 11812, 11812 }, - { 11814, 11814 }, - { 11816, 11816 }, - { 11842, 11842 }, + { 92782, 92783 }, + { 92917, 92917 }, + { 92983, 92987 }, + { 92996, 92996 }, + { 93847, 93850 }, + { 94178, 94178 }, + { 113823, 113823 }, + { 121479, 121483 }, + { 125278, 125279 }, +}; +static const URange16 Ps_range16[] = { + { 40, 40 }, + { 91, 91 }, + { 123, 123 }, + { 3898, 3898 }, + { 3900, 3900 }, + { 5787, 5787 }, + { 8218, 8218 }, + { 8222, 8222 }, + { 8261, 8261 }, + { 8317, 8317 }, + { 8333, 8333 }, + { 8968, 8968 }, + { 8970, 8970 }, + { 9001, 9001 }, + { 10088, 10088 }, + { 10090, 10090 }, + { 10092, 10092 }, + { 10094, 10094 }, + { 10096, 10096 }, + { 10098, 10098 }, + { 10100, 10100 }, + { 10181, 10181 }, + { 10214, 10214 }, + { 10216, 10216 }, + { 10218, 10218 }, + { 10220, 10220 }, + { 10222, 10222 }, + { 10627, 10627 }, + { 10629, 10629 }, + { 10631, 10631 }, + { 10633, 10633 }, + { 10635, 10635 }, + { 10637, 10637 }, + { 10639, 10639 }, 
+ { 10641, 10641 }, + { 10643, 10643 }, + { 10645, 10645 }, + { 10647, 10647 }, + { 10712, 10712 }, + { 10714, 10714 }, + { 10748, 10748 }, + { 11810, 11810 }, + { 11812, 11812 }, + { 11814, 11814 }, + { 11816, 11816 }, + { 11842, 11842 }, { 11861, 11861 }, { 11863, 11863 }, { 11865, 11865 }, { 11867, 11867 }, - { 12296, 12296 }, - { 12298, 12298 }, - { 12300, 12300 }, - { 12302, 12302 }, - { 12304, 12304 }, - { 12308, 12308 }, - { 12310, 12310 }, - { 12312, 12312 }, - { 12314, 12314 }, - { 12317, 12317 }, - { 64831, 64831 }, - { 65047, 65047 }, - { 65077, 65077 }, - { 65079, 65079 }, - { 65081, 65081 }, - { 65083, 65083 }, - { 65085, 65085 }, - { 65087, 65087 }, - { 65089, 65089 }, - { 65091, 65091 }, - { 65095, 65095 }, - { 65113, 65113 }, - { 65115, 65115 }, - { 65117, 65117 }, - { 65288, 65288 }, - { 65339, 65339 }, - { 65371, 65371 }, - { 65375, 65375 }, - { 65378, 65378 }, -}; + { 12296, 12296 }, + { 12298, 12298 }, + { 12300, 12300 }, + { 12302, 12302 }, + { 12304, 12304 }, + { 12308, 12308 }, + { 12310, 12310 }, + { 12312, 12312 }, + { 12314, 12314 }, + { 12317, 12317 }, + { 64831, 64831 }, + { 65047, 65047 }, + { 65077, 65077 }, + { 65079, 65079 }, + { 65081, 65081 }, + { 65083, 65083 }, + { 65085, 65085 }, + { 65087, 65087 }, + { 65089, 65089 }, + { 65091, 65091 }, + { 65095, 65095 }, + { 65113, 65113 }, + { 65115, 65115 }, + { 65117, 65117 }, + { 65288, 65288 }, + { 65339, 65339 }, + { 65371, 65371 }, + { 65375, 65375 }, + { 65378, 65378 }, +}; static const URange16 S_range16[] = { { 36, 36 }, { 43, 43 }, @@ -4385,7 +4385,7 @@ static const URange16 S_range16[] = { { 1769, 1769 }, { 1789, 1790 }, { 2038, 2038 }, - { 2046, 2047 }, + { 2046, 2047 }, { 2184, 2184 }, { 2546, 2547 }, { 2554, 2555 }, @@ -4393,7 +4393,7 @@ static const URange16 S_range16[] = { { 2928, 2928 }, { 3059, 3066 }, { 3199, 3199 }, - { 3407, 3407 }, + { 3407, 3407 }, { 3449, 3449 }, { 3647, 3647 }, { 3841, 3843 }, @@ -4409,7 +4409,7 @@ static const URange16 S_range16[] = { { 4053, 4056 
}, { 4254, 4255 }, { 5008, 5017 }, - { 5741, 5741 }, + { 5741, 5741 }, { 6107, 6107 }, { 6464, 6464 }, { 6622, 6655 }, @@ -4443,7 +4443,7 @@ static const URange16 S_range16[] = { { 8586, 8587 }, { 8592, 8967 }, { 8972, 9000 }, - { 9003, 9254 }, + { 9003, 9254 }, { 9280, 9290 }, { 9372, 9449 }, { 9472, 10087 }, @@ -4454,9 +4454,9 @@ static const URange16 S_range16[] = { { 10716, 10747 }, { 10750, 11123 }, { 11126, 11157 }, - { 11159, 11263 }, + { 11159, 11263 }, { 11493, 11498 }, - { 11856, 11857 }, + { 11856, 11857 }, { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, @@ -4475,7 +4475,7 @@ static const URange16 S_range16[] = { { 12880, 12880 }, { 12896, 12927 }, { 12938, 12976 }, - { 12992, 13311 }, + { 12992, 13311 }, { 19904, 19967 }, { 42128, 42182 }, { 42752, 42774 }, @@ -4485,7 +4485,7 @@ static const URange16 S_range16[] = { { 43062, 43065 }, { 43639, 43641 }, { 43867, 43867 }, - { 43882, 43883 }, + { 43882, 43883 }, { 64297, 64297 }, { 64434, 64450 }, { 64832, 64847 }, @@ -4508,14 +4508,14 @@ static const URange16 S_range16[] = { static const URange32 S_range32[] = { { 65847, 65855 }, { 65913, 65929 }, - { 65932, 65934 }, - { 65936, 65948 }, + { 65932, 65934 }, + { 65936, 65948 }, { 65952, 65952 }, { 66000, 66044 }, { 67703, 67704 }, { 68296, 68296 }, { 71487, 71487 }, - { 73685, 73713 }, + { 73685, 73713 }, { 92988, 92991 }, { 92997, 92997 }, { 113820, 113820 }, @@ -4545,11 +4545,11 @@ static const URange32 S_range32[] = { { 121453, 121460 }, { 121462, 121475 }, { 121477, 121478 }, - { 123215, 123215 }, - { 123647, 123647 }, - { 126124, 126124 }, - { 126128, 126128 }, - { 126254, 126254 }, + { 123215, 123215 }, + { 123647, 123647 }, + { 126124, 126124 }, + { 126128, 126128 }, + { 126254, 126254 }, { 126704, 126705 }, { 126976, 127019 }, { 127024, 127123 }, @@ -4557,167 +4557,167 @@ static const URange32 S_range32[] = { { 127153, 127167 }, { 127169, 127183 }, { 127185, 127221 }, - { 127245, 127405 }, + { 127245, 127405 }, { 127462, 127490 }, - { 127504, 
127547 }, + { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, - { 127584, 127589 }, - { 127744, 128727 }, + { 127584, 127589 }, + { 127744, 128727 }, { 128733, 128748 }, - { 128752, 128764 }, + { 128752, 128764 }, { 128768, 128883 }, - { 128896, 128984 }, - { 128992, 129003 }, + { 128896, 128984 }, + { 128992, 129003 }, { 129008, 129008 }, { 129024, 129035 }, { 129040, 129095 }, { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129200, 129201 }, + { 129200, 129201 }, { 129280, 129619 }, - { 129632, 129645 }, - { 129648, 129652 }, + { 129632, 129645 }, + { 129648, 129652 }, { 129656, 129660 }, - { 129664, 129670 }, + { 129664, 129670 }, { 129680, 129708 }, { 129712, 129722 }, { 129728, 129733 }, { 129744, 129753 }, { 129760, 129767 }, { 129776, 129782 }, - { 129792, 129938 }, - { 129940, 129994 }, -}; -static const URange16 Sc_range16[] = { - { 36, 36 }, - { 162, 165 }, - { 1423, 1423 }, - { 1547, 1547 }, - { 2046, 2047 }, - { 2546, 2547 }, - { 2555, 2555 }, - { 2801, 2801 }, - { 3065, 3065 }, - { 3647, 3647 }, - { 6107, 6107 }, + { 129792, 129938 }, + { 129940, 129994 }, +}; +static const URange16 Sc_range16[] = { + { 36, 36 }, + { 162, 165 }, + { 1423, 1423 }, + { 1547, 1547 }, + { 2046, 2047 }, + { 2546, 2547 }, + { 2555, 2555 }, + { 2801, 2801 }, + { 3065, 3065 }, + { 3647, 3647 }, + { 6107, 6107 }, { 8352, 8384 }, - { 43064, 43064 }, - { 65020, 65020 }, - { 65129, 65129 }, - { 65284, 65284 }, - { 65504, 65505 }, - { 65509, 65510 }, -}; -static const URange32 Sc_range32[] = { - { 73693, 73696 }, - { 123647, 123647 }, - { 126128, 126128 }, -}; -static const URange16 Sk_range16[] = { - { 94, 94 }, - { 96, 96 }, - { 168, 168 }, - { 175, 175 }, - { 180, 180 }, - { 184, 184 }, - { 706, 709 }, - { 722, 735 }, - { 741, 747 }, - { 749, 749 }, - { 751, 767 }, - { 885, 885 }, - { 900, 901 }, + { 43064, 43064 }, + { 65020, 65020 }, + { 65129, 65129 }, + { 65284, 65284 }, + { 65504, 65505 }, + { 65509, 65510 }, +}; +static const URange32 Sc_range32[] 
= { + { 73693, 73696 }, + { 123647, 123647 }, + { 126128, 126128 }, +}; +static const URange16 Sk_range16[] = { + { 94, 94 }, + { 96, 96 }, + { 168, 168 }, + { 175, 175 }, + { 180, 180 }, + { 184, 184 }, + { 706, 709 }, + { 722, 735 }, + { 741, 747 }, + { 749, 749 }, + { 751, 767 }, + { 885, 885 }, + { 900, 901 }, { 2184, 2184 }, - { 8125, 8125 }, - { 8127, 8129 }, - { 8141, 8143 }, - { 8157, 8159 }, - { 8173, 8175 }, - { 8189, 8190 }, - { 12443, 12444 }, - { 42752, 42774 }, - { 42784, 42785 }, - { 42889, 42890 }, - { 43867, 43867 }, - { 43882, 43883 }, + { 8125, 8125 }, + { 8127, 8129 }, + { 8141, 8143 }, + { 8157, 8159 }, + { 8173, 8175 }, + { 8189, 8190 }, + { 12443, 12444 }, + { 42752, 42774 }, + { 42784, 42785 }, + { 42889, 42890 }, + { 43867, 43867 }, + { 43882, 43883 }, { 64434, 64450 }, - { 65342, 65342 }, - { 65344, 65344 }, - { 65507, 65507 }, -}; -static const URange32 Sk_range32[] = { - { 127995, 127999 }, -}; -static const URange16 Sm_range16[] = { - { 43, 43 }, - { 60, 62 }, - { 124, 124 }, - { 126, 126 }, - { 172, 172 }, - { 177, 177 }, - { 215, 215 }, - { 247, 247 }, - { 1014, 1014 }, - { 1542, 1544 }, - { 8260, 8260 }, - { 8274, 8274 }, - { 8314, 8316 }, - { 8330, 8332 }, - { 8472, 8472 }, - { 8512, 8516 }, - { 8523, 8523 }, - { 8592, 8596 }, - { 8602, 8603 }, - { 8608, 8608 }, - { 8611, 8611 }, - { 8614, 8614 }, - { 8622, 8622 }, - { 8654, 8655 }, - { 8658, 8658 }, - { 8660, 8660 }, - { 8692, 8959 }, - { 8992, 8993 }, - { 9084, 9084 }, - { 9115, 9139 }, - { 9180, 9185 }, - { 9655, 9655 }, - { 9665, 9665 }, - { 9720, 9727 }, - { 9839, 9839 }, - { 10176, 10180 }, - { 10183, 10213 }, - { 10224, 10239 }, - { 10496, 10626 }, - { 10649, 10711 }, - { 10716, 10747 }, - { 10750, 11007 }, - { 11056, 11076 }, - { 11079, 11084 }, - { 64297, 64297 }, - { 65122, 65122 }, - { 65124, 65126 }, - { 65291, 65291 }, - { 65308, 65310 }, - { 65372, 65372 }, - { 65374, 65374 }, - { 65506, 65506 }, - { 65513, 65516 }, -}; -static const URange32 Sm_range32[] = { - { 
120513, 120513 }, - { 120539, 120539 }, - { 120571, 120571 }, - { 120597, 120597 }, - { 120629, 120629 }, - { 120655, 120655 }, - { 120687, 120687 }, - { 120713, 120713 }, - { 120745, 120745 }, - { 120771, 120771 }, - { 126704, 126705 }, -}; + { 65342, 65342 }, + { 65344, 65344 }, + { 65507, 65507 }, +}; +static const URange32 Sk_range32[] = { + { 127995, 127999 }, +}; +static const URange16 Sm_range16[] = { + { 43, 43 }, + { 60, 62 }, + { 124, 124 }, + { 126, 126 }, + { 172, 172 }, + { 177, 177 }, + { 215, 215 }, + { 247, 247 }, + { 1014, 1014 }, + { 1542, 1544 }, + { 8260, 8260 }, + { 8274, 8274 }, + { 8314, 8316 }, + { 8330, 8332 }, + { 8472, 8472 }, + { 8512, 8516 }, + { 8523, 8523 }, + { 8592, 8596 }, + { 8602, 8603 }, + { 8608, 8608 }, + { 8611, 8611 }, + { 8614, 8614 }, + { 8622, 8622 }, + { 8654, 8655 }, + { 8658, 8658 }, + { 8660, 8660 }, + { 8692, 8959 }, + { 8992, 8993 }, + { 9084, 9084 }, + { 9115, 9139 }, + { 9180, 9185 }, + { 9655, 9655 }, + { 9665, 9665 }, + { 9720, 9727 }, + { 9839, 9839 }, + { 10176, 10180 }, + { 10183, 10213 }, + { 10224, 10239 }, + { 10496, 10626 }, + { 10649, 10711 }, + { 10716, 10747 }, + { 10750, 11007 }, + { 11056, 11076 }, + { 11079, 11084 }, + { 64297, 64297 }, + { 65122, 65122 }, + { 65124, 65126 }, + { 65291, 65291 }, + { 65308, 65310 }, + { 65372, 65372 }, + { 65374, 65374 }, + { 65506, 65506 }, + { 65513, 65516 }, +}; +static const URange32 Sm_range32[] = { + { 120513, 120513 }, + { 120539, 120539 }, + { 120571, 120571 }, + { 120597, 120597 }, + { 120629, 120629 }, + { 120655, 120655 }, + { 120687, 120687 }, + { 120713, 120713 }, + { 120745, 120745 }, + { 120771, 120771 }, + { 126704, 126705 }, +}; static const URange16 So_range16[] = { { 166, 166 }, { 169, 169 }, @@ -4735,7 +4735,7 @@ static const URange16 So_range16[] = { { 3059, 3064 }, { 3066, 3066 }, { 3199, 3199 }, - { 3407, 3407 }, + { 3407, 3407 }, { 3449, 3449 }, { 3841, 3843 }, { 3859, 3859 }, @@ -4750,7 +4750,7 @@ static const URange16 So_range16[] = { { 
4053, 4056 }, { 4254, 4255 }, { 5008, 5017 }, - { 5741, 5741 }, + { 5741, 5741 }, { 6464, 6464 }, { 6622, 6655 }, { 7009, 7018 }, @@ -4785,7 +4785,7 @@ static const URange16 So_range16[] = { { 9003, 9083 }, { 9085, 9114 }, { 9140, 9179 }, - { 9186, 9254 }, + { 9186, 9254 }, { 9280, 9290 }, { 9372, 9449 }, { 9472, 9654 }, @@ -4799,9 +4799,9 @@ static const URange16 So_range16[] = { { 11077, 11078 }, { 11085, 11123 }, { 11126, 11157 }, - { 11159, 11263 }, + { 11159, 11263 }, { 11493, 11498 }, - { 11856, 11857 }, + { 11856, 11857 }, { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, @@ -4819,7 +4819,7 @@ static const URange16 So_range16[] = { { 12880, 12880 }, { 12896, 12927 }, { 12938, 12976 }, - { 12992, 13311 }, + { 12992, 13311 }, { 19904, 19967 }, { 42128, 42182 }, { 43048, 43051 }, @@ -4837,15 +4837,15 @@ static const URange16 So_range16[] = { static const URange32 So_range32[] = { { 65847, 65855 }, { 65913, 65929 }, - { 65932, 65934 }, - { 65936, 65948 }, + { 65932, 65934 }, + { 65936, 65948 }, { 65952, 65952 }, { 66000, 66044 }, { 67703, 67704 }, { 68296, 68296 }, { 71487, 71487 }, - { 73685, 73692 }, - { 73697, 73713 }, + { 73685, 73692 }, + { 73697, 73713 }, { 92988, 92991 }, { 92997, 92997 }, { 113820, 113820 }, @@ -4865,48 +4865,48 @@ static const URange32 So_range32[] = { { 121453, 121460 }, { 121462, 121475 }, { 121477, 121478 }, - { 123215, 123215 }, - { 126124, 126124 }, - { 126254, 126254 }, + { 123215, 123215 }, + { 126124, 126124 }, + { 126254, 126254 }, { 126976, 127019 }, { 127024, 127123 }, { 127136, 127150 }, { 127153, 127167 }, { 127169, 127183 }, { 127185, 127221 }, - { 127245, 127405 }, + { 127245, 127405 }, { 127462, 127490 }, - { 127504, 127547 }, + { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, - { 127584, 127589 }, + { 127584, 127589 }, { 127744, 127994 }, - { 128000, 128727 }, + { 128000, 128727 }, { 128733, 128748 }, - { 128752, 128764 }, + { 128752, 128764 }, { 128768, 128883 }, - { 128896, 128984 }, - { 128992, 129003 
}, + { 128896, 128984 }, + { 128992, 129003 }, { 129008, 129008 }, { 129024, 129035 }, { 129040, 129095 }, { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129200, 129201 }, + { 129200, 129201 }, { 129280, 129619 }, - { 129632, 129645 }, - { 129648, 129652 }, + { 129632, 129645 }, + { 129648, 129652 }, { 129656, 129660 }, - { 129664, 129670 }, + { 129664, 129670 }, { 129680, 129708 }, { 129712, 129722 }, { 129728, 129733 }, { 129744, 129753 }, { 129760, 129767 }, { 129776, 129782 }, - { 129792, 129938 }, - { 129940, 129994 }, + { 129792, 129938 }, + { 129940, 129994 }, }; static const URange16 Z_range16[] = { { 32, 32 }, @@ -4921,390 +4921,390 @@ static const URange16 Z_range16[] = { static const URange16 Zl_range16[] = { { 8232, 8232 }, }; -static const URange16 Zp_range16[] = { - { 8233, 8233 }, -}; -static const URange16 Zs_range16[] = { - { 32, 32 }, - { 160, 160 }, - { 5760, 5760 }, - { 8192, 8202 }, - { 8239, 8239 }, - { 8287, 8287 }, - { 12288, 12288 }, -}; -static const URange32 Adlam_range32[] = { - { 125184, 125259 }, - { 125264, 125273 }, - { 125278, 125279 }, -}; -static const URange32 Ahom_range32[] = { - { 71424, 71450 }, - { 71453, 71467 }, +static const URange16 Zp_range16[] = { + { 8233, 8233 }, +}; +static const URange16 Zs_range16[] = { + { 32, 32 }, + { 160, 160 }, + { 5760, 5760 }, + { 8192, 8202 }, + { 8239, 8239 }, + { 8287, 8287 }, + { 12288, 12288 }, +}; +static const URange32 Adlam_range32[] = { + { 125184, 125259 }, + { 125264, 125273 }, + { 125278, 125279 }, +}; +static const URange32 Ahom_range32[] = { + { 71424, 71450 }, + { 71453, 71467 }, { 71472, 71494 }, -}; -static const URange32 Anatolian_Hieroglyphs_range32[] = { - { 82944, 83526 }, -}; -static const URange16 Arabic_range16[] = { - { 1536, 1540 }, - { 1542, 1547 }, - { 1549, 1562 }, +}; +static const URange32 Anatolian_Hieroglyphs_range32[] = { + { 82944, 83526 }, +}; +static const URange16 Arabic_range16[] = { + { 1536, 1540 }, + { 1542, 1547 }, + { 1549, 1562 }, 
{ 1564, 1566 }, - { 1568, 1599 }, - { 1601, 1610 }, - { 1622, 1647 }, - { 1649, 1756 }, - { 1758, 1791 }, - { 1872, 1919 }, + { 1568, 1599 }, + { 1601, 1610 }, + { 1622, 1647 }, + { 1649, 1756 }, + { 1758, 1791 }, + { 1872, 1919 }, { 2160, 2190 }, { 2192, 2193 }, { 2200, 2273 }, - { 2275, 2303 }, + { 2275, 2303 }, { 64336, 64450 }, - { 64467, 64829 }, + { 64467, 64829 }, { 64832, 64911 }, - { 64914, 64967 }, + { 64914, 64967 }, { 64975, 64975 }, { 65008, 65023 }, - { 65136, 65140 }, - { 65142, 65276 }, -}; -static const URange32 Arabic_range32[] = { - { 69216, 69246 }, - { 126464, 126467 }, - { 126469, 126495 }, - { 126497, 126498 }, - { 126500, 126500 }, - { 126503, 126503 }, - { 126505, 126514 }, - { 126516, 126519 }, - { 126521, 126521 }, - { 126523, 126523 }, - { 126530, 126530 }, - { 126535, 126535 }, - { 126537, 126537 }, - { 126539, 126539 }, - { 126541, 126543 }, - { 126545, 126546 }, - { 126548, 126548 }, - { 126551, 126551 }, - { 126553, 126553 }, - { 126555, 126555 }, - { 126557, 126557 }, - { 126559, 126559 }, - { 126561, 126562 }, - { 126564, 126564 }, - { 126567, 126570 }, - { 126572, 126578 }, - { 126580, 126583 }, - { 126585, 126588 }, - { 126590, 126590 }, - { 126592, 126601 }, - { 126603, 126619 }, - { 126625, 126627 }, - { 126629, 126633 }, - { 126635, 126651 }, - { 126704, 126705 }, -}; -static const URange16 Armenian_range16[] = { - { 1329, 1366 }, - { 1369, 1418 }, - { 1421, 1423 }, - { 64275, 64279 }, -}; -static const URange32 Avestan_range32[] = { - { 68352, 68405 }, - { 68409, 68415 }, -}; -static const URange16 Balinese_range16[] = { + { 65136, 65140 }, + { 65142, 65276 }, +}; +static const URange32 Arabic_range32[] = { + { 69216, 69246 }, + { 126464, 126467 }, + { 126469, 126495 }, + { 126497, 126498 }, + { 126500, 126500 }, + { 126503, 126503 }, + { 126505, 126514 }, + { 126516, 126519 }, + { 126521, 126521 }, + { 126523, 126523 }, + { 126530, 126530 }, + { 126535, 126535 }, + { 126537, 126537 }, + { 126539, 126539 }, + { 126541, 126543 
}, + { 126545, 126546 }, + { 126548, 126548 }, + { 126551, 126551 }, + { 126553, 126553 }, + { 126555, 126555 }, + { 126557, 126557 }, + { 126559, 126559 }, + { 126561, 126562 }, + { 126564, 126564 }, + { 126567, 126570 }, + { 126572, 126578 }, + { 126580, 126583 }, + { 126585, 126588 }, + { 126590, 126590 }, + { 126592, 126601 }, + { 126603, 126619 }, + { 126625, 126627 }, + { 126629, 126633 }, + { 126635, 126651 }, + { 126704, 126705 }, +}; +static const URange16 Armenian_range16[] = { + { 1329, 1366 }, + { 1369, 1418 }, + { 1421, 1423 }, + { 64275, 64279 }, +}; +static const URange32 Avestan_range32[] = { + { 68352, 68405 }, + { 68409, 68415 }, +}; +static const URange16 Balinese_range16[] = { { 6912, 6988 }, { 6992, 7038 }, -}; -static const URange16 Bamum_range16[] = { - { 42656, 42743 }, -}; -static const URange32 Bamum_range32[] = { - { 92160, 92728 }, -}; -static const URange32 Bassa_Vah_range32[] = { - { 92880, 92909 }, - { 92912, 92917 }, -}; -static const URange16 Batak_range16[] = { - { 7104, 7155 }, - { 7164, 7167 }, -}; -static const URange16 Bengali_range16[] = { - { 2432, 2435 }, - { 2437, 2444 }, - { 2447, 2448 }, - { 2451, 2472 }, - { 2474, 2480 }, - { 2482, 2482 }, - { 2486, 2489 }, - { 2492, 2500 }, - { 2503, 2504 }, - { 2507, 2510 }, - { 2519, 2519 }, - { 2524, 2525 }, - { 2527, 2531 }, - { 2534, 2558 }, -}; -static const URange32 Bhaiksuki_range32[] = { - { 72704, 72712 }, - { 72714, 72758 }, - { 72760, 72773 }, - { 72784, 72812 }, -}; -static const URange16 Bopomofo_range16[] = { - { 746, 747 }, - { 12549, 12591 }, - { 12704, 12735 }, -}; -static const URange32 Brahmi_range32[] = { - { 69632, 69709 }, +}; +static const URange16 Bamum_range16[] = { + { 42656, 42743 }, +}; +static const URange32 Bamum_range32[] = { + { 92160, 92728 }, +}; +static const URange32 Bassa_Vah_range32[] = { + { 92880, 92909 }, + { 92912, 92917 }, +}; +static const URange16 Batak_range16[] = { + { 7104, 7155 }, + { 7164, 7167 }, +}; +static const URange16 
Bengali_range16[] = { + { 2432, 2435 }, + { 2437, 2444 }, + { 2447, 2448 }, + { 2451, 2472 }, + { 2474, 2480 }, + { 2482, 2482 }, + { 2486, 2489 }, + { 2492, 2500 }, + { 2503, 2504 }, + { 2507, 2510 }, + { 2519, 2519 }, + { 2524, 2525 }, + { 2527, 2531 }, + { 2534, 2558 }, +}; +static const URange32 Bhaiksuki_range32[] = { + { 72704, 72712 }, + { 72714, 72758 }, + { 72760, 72773 }, + { 72784, 72812 }, +}; +static const URange16 Bopomofo_range16[] = { + { 746, 747 }, + { 12549, 12591 }, + { 12704, 12735 }, +}; +static const URange32 Brahmi_range32[] = { + { 69632, 69709 }, { 69714, 69749 }, - { 69759, 69759 }, -}; -static const URange16 Braille_range16[] = { - { 10240, 10495 }, -}; -static const URange16 Buginese_range16[] = { - { 6656, 6683 }, - { 6686, 6687 }, -}; -static const URange16 Buhid_range16[] = { - { 5952, 5971 }, -}; -static const URange16 Canadian_Aboriginal_range16[] = { - { 5120, 5759 }, - { 6320, 6389 }, -}; + { 69759, 69759 }, +}; +static const URange16 Braille_range16[] = { + { 10240, 10495 }, +}; +static const URange16 Buginese_range16[] = { + { 6656, 6683 }, + { 6686, 6687 }, +}; +static const URange16 Buhid_range16[] = { + { 5952, 5971 }, +}; +static const URange16 Canadian_Aboriginal_range16[] = { + { 5120, 5759 }, + { 6320, 6389 }, +}; static const URange32 Canadian_Aboriginal_range32[] = { { 72368, 72383 }, }; -static const URange32 Carian_range32[] = { - { 66208, 66256 }, -}; -static const URange32 Caucasian_Albanian_range32[] = { - { 66864, 66915 }, - { 66927, 66927 }, -}; -static const URange32 Chakma_range32[] = { - { 69888, 69940 }, - { 69942, 69959 }, -}; -static const URange16 Cham_range16[] = { - { 43520, 43574 }, - { 43584, 43597 }, - { 43600, 43609 }, - { 43612, 43615 }, -}; -static const URange16 Cherokee_range16[] = { - { 5024, 5109 }, - { 5112, 5117 }, - { 43888, 43967 }, -}; -static const URange32 Chorasmian_range32[] = { - { 69552, 69579 }, -}; -static const URange16 Common_range16[] = { - { 0, 64 }, - { 91, 96 }, - { 123, 169 
}, - { 171, 185 }, - { 187, 191 }, - { 215, 215 }, - { 247, 247 }, - { 697, 735 }, - { 741, 745 }, - { 748, 767 }, - { 884, 884 }, - { 894, 894 }, - { 901, 901 }, - { 903, 903 }, - { 1541, 1541 }, - { 1548, 1548 }, - { 1563, 1563 }, - { 1567, 1567 }, - { 1600, 1600 }, +static const URange32 Carian_range32[] = { + { 66208, 66256 }, +}; +static const URange32 Caucasian_Albanian_range32[] = { + { 66864, 66915 }, + { 66927, 66927 }, +}; +static const URange32 Chakma_range32[] = { + { 69888, 69940 }, + { 69942, 69959 }, +}; +static const URange16 Cham_range16[] = { + { 43520, 43574 }, + { 43584, 43597 }, + { 43600, 43609 }, + { 43612, 43615 }, +}; +static const URange16 Cherokee_range16[] = { + { 5024, 5109 }, + { 5112, 5117 }, + { 43888, 43967 }, +}; +static const URange32 Chorasmian_range32[] = { + { 69552, 69579 }, +}; +static const URange16 Common_range16[] = { + { 0, 64 }, + { 91, 96 }, + { 123, 169 }, + { 171, 185 }, + { 187, 191 }, + { 215, 215 }, + { 247, 247 }, + { 697, 735 }, + { 741, 745 }, + { 748, 767 }, + { 884, 884 }, + { 894, 894 }, + { 901, 901 }, + { 903, 903 }, + { 1541, 1541 }, + { 1548, 1548 }, + { 1563, 1563 }, + { 1567, 1567 }, + { 1600, 1600 }, { 1757, 1757 }, - { 2274, 2274 }, - { 2404, 2405 }, - { 3647, 3647 }, - { 4053, 4056 }, - { 4347, 4347 }, - { 5867, 5869 }, - { 5941, 5942 }, - { 6146, 6147 }, - { 6149, 6149 }, - { 7379, 7379 }, - { 7393, 7393 }, - { 7401, 7404 }, - { 7406, 7411 }, - { 7413, 7415 }, - { 7418, 7418 }, - { 8192, 8203 }, - { 8206, 8292 }, - { 8294, 8304 }, - { 8308, 8318 }, - { 8320, 8334 }, + { 2274, 2274 }, + { 2404, 2405 }, + { 3647, 3647 }, + { 4053, 4056 }, + { 4347, 4347 }, + { 5867, 5869 }, + { 5941, 5942 }, + { 6146, 6147 }, + { 6149, 6149 }, + { 7379, 7379 }, + { 7393, 7393 }, + { 7401, 7404 }, + { 7406, 7411 }, + { 7413, 7415 }, + { 7418, 7418 }, + { 8192, 8203 }, + { 8206, 8292 }, + { 8294, 8304 }, + { 8308, 8318 }, + { 8320, 8334 }, { 8352, 8384 }, - { 8448, 8485 }, - { 8487, 8489 }, - { 8492, 8497 }, - { 8499, 
8525 }, - { 8527, 8543 }, - { 8585, 8587 }, - { 8592, 9254 }, - { 9280, 9290 }, - { 9312, 10239 }, - { 10496, 11123 }, - { 11126, 11157 }, - { 11159, 11263 }, + { 8448, 8485 }, + { 8487, 8489 }, + { 8492, 8497 }, + { 8499, 8525 }, + { 8527, 8543 }, + { 8585, 8587 }, + { 8592, 9254 }, + { 9280, 9290 }, + { 9312, 10239 }, + { 10496, 11123 }, + { 11126, 11157 }, + { 11159, 11263 }, { 11776, 11869 }, - { 12272, 12283 }, - { 12288, 12292 }, - { 12294, 12294 }, - { 12296, 12320 }, - { 12336, 12343 }, - { 12348, 12351 }, - { 12443, 12444 }, - { 12448, 12448 }, - { 12539, 12540 }, - { 12688, 12703 }, - { 12736, 12771 }, - { 12832, 12895 }, - { 12927, 13007 }, - { 13055, 13055 }, - { 13144, 13311 }, - { 19904, 19967 }, - { 42752, 42785 }, - { 42888, 42890 }, - { 43056, 43065 }, - { 43310, 43310 }, - { 43471, 43471 }, - { 43867, 43867 }, - { 43882, 43883 }, - { 64830, 64831 }, - { 65040, 65049 }, - { 65072, 65106 }, - { 65108, 65126 }, - { 65128, 65131 }, + { 12272, 12283 }, + { 12288, 12292 }, + { 12294, 12294 }, + { 12296, 12320 }, + { 12336, 12343 }, + { 12348, 12351 }, + { 12443, 12444 }, + { 12448, 12448 }, + { 12539, 12540 }, + { 12688, 12703 }, + { 12736, 12771 }, + { 12832, 12895 }, + { 12927, 13007 }, + { 13055, 13055 }, + { 13144, 13311 }, + { 19904, 19967 }, + { 42752, 42785 }, + { 42888, 42890 }, + { 43056, 43065 }, + { 43310, 43310 }, + { 43471, 43471 }, + { 43867, 43867 }, + { 43882, 43883 }, + { 64830, 64831 }, + { 65040, 65049 }, + { 65072, 65106 }, + { 65108, 65126 }, + { 65128, 65131 }, { 65279, 65279 }, - { 65281, 65312 }, - { 65339, 65344 }, - { 65371, 65381 }, - { 65392, 65392 }, - { 65438, 65439 }, - { 65504, 65510 }, - { 65512, 65518 }, - { 65529, 65533 }, -}; -static const URange32 Common_range32[] = { - { 65792, 65794 }, - { 65799, 65843 }, - { 65847, 65855 }, - { 65936, 65948 }, - { 66000, 66044 }, - { 66273, 66299 }, + { 65281, 65312 }, + { 65339, 65344 }, + { 65371, 65381 }, + { 65392, 65392 }, + { 65438, 65439 }, + { 65504, 65510 }, + { 65512, 
65518 }, + { 65529, 65533 }, +}; +static const URange32 Common_range32[] = { + { 65792, 65794 }, + { 65799, 65843 }, + { 65847, 65855 }, + { 65936, 65948 }, + { 66000, 66044 }, + { 66273, 66299 }, { 113824, 113827 }, { 118608, 118723 }, - { 118784, 119029 }, - { 119040, 119078 }, - { 119081, 119142 }, - { 119146, 119162 }, - { 119171, 119172 }, - { 119180, 119209 }, + { 118784, 119029 }, + { 119040, 119078 }, + { 119081, 119142 }, + { 119146, 119162 }, + { 119171, 119172 }, + { 119180, 119209 }, { 119214, 119274 }, - { 119520, 119539 }, - { 119552, 119638 }, - { 119648, 119672 }, - { 119808, 119892 }, - { 119894, 119964 }, - { 119966, 119967 }, - { 119970, 119970 }, - { 119973, 119974 }, - { 119977, 119980 }, - { 119982, 119993 }, - { 119995, 119995 }, - { 119997, 120003 }, - { 120005, 120069 }, - { 120071, 120074 }, - { 120077, 120084 }, - { 120086, 120092 }, - { 120094, 120121 }, - { 120123, 120126 }, - { 120128, 120132 }, - { 120134, 120134 }, - { 120138, 120144 }, - { 120146, 120485 }, - { 120488, 120779 }, - { 120782, 120831 }, - { 126065, 126132 }, - { 126209, 126269 }, - { 126976, 127019 }, - { 127024, 127123 }, - { 127136, 127150 }, - { 127153, 127167 }, - { 127169, 127183 }, - { 127185, 127221 }, - { 127232, 127405 }, - { 127462, 127487 }, - { 127489, 127490 }, - { 127504, 127547 }, - { 127552, 127560 }, - { 127568, 127569 }, - { 127584, 127589 }, - { 127744, 128727 }, + { 119520, 119539 }, + { 119552, 119638 }, + { 119648, 119672 }, + { 119808, 119892 }, + { 119894, 119964 }, + { 119966, 119967 }, + { 119970, 119970 }, + { 119973, 119974 }, + { 119977, 119980 }, + { 119982, 119993 }, + { 119995, 119995 }, + { 119997, 120003 }, + { 120005, 120069 }, + { 120071, 120074 }, + { 120077, 120084 }, + { 120086, 120092 }, + { 120094, 120121 }, + { 120123, 120126 }, + { 120128, 120132 }, + { 120134, 120134 }, + { 120138, 120144 }, + { 120146, 120485 }, + { 120488, 120779 }, + { 120782, 120831 }, + { 126065, 126132 }, + { 126209, 126269 }, + { 126976, 127019 }, + { 
127024, 127123 }, + { 127136, 127150 }, + { 127153, 127167 }, + { 127169, 127183 }, + { 127185, 127221 }, + { 127232, 127405 }, + { 127462, 127487 }, + { 127489, 127490 }, + { 127504, 127547 }, + { 127552, 127560 }, + { 127568, 127569 }, + { 127584, 127589 }, + { 127744, 128727 }, { 128733, 128748 }, - { 128752, 128764 }, - { 128768, 128883 }, - { 128896, 128984 }, - { 128992, 129003 }, + { 128752, 128764 }, + { 128768, 128883 }, + { 128896, 128984 }, + { 128992, 129003 }, { 129008, 129008 }, - { 129024, 129035 }, - { 129040, 129095 }, - { 129104, 129113 }, - { 129120, 129159 }, - { 129168, 129197 }, - { 129200, 129201 }, + { 129024, 129035 }, + { 129040, 129095 }, + { 129104, 129113 }, + { 129120, 129159 }, + { 129168, 129197 }, + { 129200, 129201 }, { 129280, 129619 }, - { 129632, 129645 }, - { 129648, 129652 }, + { 129632, 129645 }, + { 129648, 129652 }, { 129656, 129660 }, - { 129664, 129670 }, + { 129664, 129670 }, { 129680, 129708 }, { 129712, 129722 }, { 129728, 129733 }, { 129744, 129753 }, { 129760, 129767 }, { 129776, 129782 }, - { 129792, 129938 }, - { 129940, 129994 }, - { 130032, 130041 }, + { 129792, 129938 }, + { 129940, 129994 }, + { 130032, 130041 }, { 917505, 917505 }, { 917536, 917631 }, }; -static const URange16 Coptic_range16[] = { - { 994, 1007 }, - { 11392, 11507 }, - { 11513, 11519 }, -}; -static const URange32 Cuneiform_range32[] = { - { 73728, 74649 }, - { 74752, 74862 }, - { 74864, 74868 }, - { 74880, 75075 }, -}; -static const URange32 Cypriot_range32[] = { - { 67584, 67589 }, - { 67592, 67592 }, - { 67594, 67637 }, - { 67639, 67640 }, - { 67644, 67644 }, - { 67647, 67647 }, +static const URange16 Coptic_range16[] = { + { 994, 1007 }, + { 11392, 11507 }, + { 11513, 11519 }, +}; +static const URange32 Cuneiform_range32[] = { + { 73728, 74649 }, + { 74752, 74862 }, + { 74864, 74868 }, + { 74880, 75075 }, +}; +static const URange32 Cypriot_range32[] = { + { 67584, 67589 }, + { 67592, 67592 }, + { 67594, 67637 }, + { 67639, 67640 }, + { 
67644, 67644 }, + { 67647, 67647 }, }; static const URange32 Cypro_Minoan_range32[] = { { 77712, 77810 }, @@ -5312,52 +5312,52 @@ static const URange32 Cypro_Minoan_range32[] = { static const URange16 Cyrillic_range16[] = { { 1024, 1156 }, { 1159, 1327 }, - { 7296, 7304 }, + { 7296, 7304 }, { 7467, 7467 }, { 7544, 7544 }, { 11744, 11775 }, { 42560, 42655 }, { 65070, 65071 }, }; -static const URange32 Deseret_range32[] = { - { 66560, 66639 }, -}; -static const URange16 Devanagari_range16[] = { - { 2304, 2384 }, - { 2389, 2403 }, - { 2406, 2431 }, - { 43232, 43263 }, -}; -static const URange32 Dives_Akuru_range32[] = { - { 71936, 71942 }, - { 71945, 71945 }, - { 71948, 71955 }, - { 71957, 71958 }, - { 71960, 71989 }, - { 71991, 71992 }, - { 71995, 72006 }, - { 72016, 72025 }, -}; -static const URange32 Dogra_range32[] = { - { 71680, 71739 }, -}; -static const URange32 Duployan_range32[] = { - { 113664, 113770 }, - { 113776, 113788 }, - { 113792, 113800 }, - { 113808, 113817 }, - { 113820, 113823 }, -}; -static const URange32 Egyptian_Hieroglyphs_range32[] = { - { 77824, 78894 }, - { 78896, 78904 }, -}; -static const URange32 Elbasan_range32[] = { - { 66816, 66855 }, -}; -static const URange32 Elymaic_range32[] = { - { 69600, 69622 }, -}; +static const URange32 Deseret_range32[] = { + { 66560, 66639 }, +}; +static const URange16 Devanagari_range16[] = { + { 2304, 2384 }, + { 2389, 2403 }, + { 2406, 2431 }, + { 43232, 43263 }, +}; +static const URange32 Dives_Akuru_range32[] = { + { 71936, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71989 }, + { 71991, 71992 }, + { 71995, 72006 }, + { 72016, 72025 }, +}; +static const URange32 Dogra_range32[] = { + { 71680, 71739 }, +}; +static const URange32 Duployan_range32[] = { + { 113664, 113770 }, + { 113776, 113788 }, + { 113792, 113800 }, + { 113808, 113817 }, + { 113820, 113823 }, +}; +static const URange32 Egyptian_Hieroglyphs_range32[] = { + { 77824, 78894 }, + { 78896, 78904 }, +}; 
+static const URange32 Elbasan_range32[] = { + { 66816, 66855 }, +}; +static const URange32 Elymaic_range32[] = { + { 69600, 69622 }, +}; static const URange16 Ethiopic_range16[] = { { 4608, 4680 }, { 4682, 4685 }, @@ -5398,130 +5398,130 @@ static const URange32 Ethiopic_range32[] = { { 124909, 124910 }, { 124912, 124926 }, }; -static const URange16 Georgian_range16[] = { - { 4256, 4293 }, - { 4295, 4295 }, - { 4301, 4301 }, - { 4304, 4346 }, - { 4348, 4351 }, - { 7312, 7354 }, - { 7357, 7359 }, - { 11520, 11557 }, - { 11559, 11559 }, - { 11565, 11565 }, -}; -static const URange16 Glagolitic_range16[] = { +static const URange16 Georgian_range16[] = { + { 4256, 4293 }, + { 4295, 4295 }, + { 4301, 4301 }, + { 4304, 4346 }, + { 4348, 4351 }, + { 7312, 7354 }, + { 7357, 7359 }, + { 11520, 11557 }, + { 11559, 11559 }, + { 11565, 11565 }, +}; +static const URange16 Glagolitic_range16[] = { { 11264, 11359 }, }; -static const URange32 Glagolitic_range32[] = { - { 122880, 122886 }, - { 122888, 122904 }, - { 122907, 122913 }, - { 122915, 122916 }, - { 122918, 122922 }, -}; -static const URange32 Gothic_range32[] = { - { 66352, 66378 }, -}; -static const URange32 Grantha_range32[] = { - { 70400, 70403 }, - { 70405, 70412 }, - { 70415, 70416 }, - { 70419, 70440 }, - { 70442, 70448 }, - { 70450, 70451 }, - { 70453, 70457 }, - { 70460, 70468 }, - { 70471, 70472 }, - { 70475, 70477 }, - { 70480, 70480 }, - { 70487, 70487 }, - { 70493, 70499 }, - { 70502, 70508 }, - { 70512, 70516 }, -}; -static const URange16 Greek_range16[] = { - { 880, 883 }, - { 885, 887 }, - { 890, 893 }, - { 895, 895 }, - { 900, 900 }, - { 902, 902 }, - { 904, 906 }, - { 908, 908 }, - { 910, 929 }, - { 931, 993 }, - { 1008, 1023 }, - { 7462, 7466 }, - { 7517, 7521 }, - { 7526, 7530 }, - { 7615, 7615 }, - { 7936, 7957 }, - { 7960, 7965 }, - { 7968, 8005 }, - { 8008, 8013 }, - { 8016, 8023 }, - { 8025, 8025 }, - { 8027, 8027 }, - { 8029, 8029 }, - { 8031, 8061 }, - { 8064, 8116 }, - { 8118, 8132 }, - { 8134, 
8147 }, - { 8150, 8155 }, - { 8157, 8175 }, - { 8178, 8180 }, - { 8182, 8190 }, - { 8486, 8486 }, - { 43877, 43877 }, -}; -static const URange32 Greek_range32[] = { - { 65856, 65934 }, - { 65952, 65952 }, - { 119296, 119365 }, -}; -static const URange16 Gujarati_range16[] = { - { 2689, 2691 }, - { 2693, 2701 }, - { 2703, 2705 }, - { 2707, 2728 }, - { 2730, 2736 }, - { 2738, 2739 }, - { 2741, 2745 }, - { 2748, 2757 }, - { 2759, 2761 }, - { 2763, 2765 }, - { 2768, 2768 }, - { 2784, 2787 }, - { 2790, 2801 }, - { 2809, 2815 }, -}; -static const URange32 Gunjala_Gondi_range32[] = { - { 73056, 73061 }, - { 73063, 73064 }, - { 73066, 73102 }, - { 73104, 73105 }, - { 73107, 73112 }, - { 73120, 73129 }, -}; -static const URange16 Gurmukhi_range16[] = { - { 2561, 2563 }, - { 2565, 2570 }, - { 2575, 2576 }, - { 2579, 2600 }, - { 2602, 2608 }, - { 2610, 2611 }, - { 2613, 2614 }, - { 2616, 2617 }, - { 2620, 2620 }, - { 2622, 2626 }, - { 2631, 2632 }, - { 2635, 2637 }, - { 2641, 2641 }, - { 2649, 2652 }, - { 2654, 2654 }, - { 2662, 2678 }, -}; +static const URange32 Glagolitic_range32[] = { + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, +}; +static const URange32 Gothic_range32[] = { + { 66352, 66378 }, +}; +static const URange32 Grantha_range32[] = { + { 70400, 70403 }, + { 70405, 70412 }, + { 70415, 70416 }, + { 70419, 70440 }, + { 70442, 70448 }, + { 70450, 70451 }, + { 70453, 70457 }, + { 70460, 70468 }, + { 70471, 70472 }, + { 70475, 70477 }, + { 70480, 70480 }, + { 70487, 70487 }, + { 70493, 70499 }, + { 70502, 70508 }, + { 70512, 70516 }, +}; +static const URange16 Greek_range16[] = { + { 880, 883 }, + { 885, 887 }, + { 890, 893 }, + { 895, 895 }, + { 900, 900 }, + { 902, 902 }, + { 904, 906 }, + { 908, 908 }, + { 910, 929 }, + { 931, 993 }, + { 1008, 1023 }, + { 7462, 7466 }, + { 7517, 7521 }, + { 7526, 7530 }, + { 7615, 7615 }, + { 7936, 7957 }, + { 7960, 7965 }, + { 7968, 8005 }, + { 8008, 8013 }, + { 8016, 
8023 }, + { 8025, 8025 }, + { 8027, 8027 }, + { 8029, 8029 }, + { 8031, 8061 }, + { 8064, 8116 }, + { 8118, 8132 }, + { 8134, 8147 }, + { 8150, 8155 }, + { 8157, 8175 }, + { 8178, 8180 }, + { 8182, 8190 }, + { 8486, 8486 }, + { 43877, 43877 }, +}; +static const URange32 Greek_range32[] = { + { 65856, 65934 }, + { 65952, 65952 }, + { 119296, 119365 }, +}; +static const URange16 Gujarati_range16[] = { + { 2689, 2691 }, + { 2693, 2701 }, + { 2703, 2705 }, + { 2707, 2728 }, + { 2730, 2736 }, + { 2738, 2739 }, + { 2741, 2745 }, + { 2748, 2757 }, + { 2759, 2761 }, + { 2763, 2765 }, + { 2768, 2768 }, + { 2784, 2787 }, + { 2790, 2801 }, + { 2809, 2815 }, +}; +static const URange32 Gunjala_Gondi_range32[] = { + { 73056, 73061 }, + { 73063, 73064 }, + { 73066, 73102 }, + { 73104, 73105 }, + { 73107, 73112 }, + { 73120, 73129 }, +}; +static const URange16 Gurmukhi_range16[] = { + { 2561, 2563 }, + { 2565, 2570 }, + { 2575, 2576 }, + { 2579, 2600 }, + { 2602, 2608 }, + { 2610, 2611 }, + { 2613, 2614 }, + { 2616, 2617 }, + { 2620, 2620 }, + { 2622, 2626 }, + { 2631, 2632 }, + { 2635, 2637 }, + { 2641, 2641 }, + { 2649, 2652 }, + { 2654, 2654 }, + { 2662, 2678 }, +}; static const URange16 Han_range16[] = { { 11904, 11929 }, { 11931, 12019 }, @@ -5530,60 +5530,60 @@ static const URange16 Han_range16[] = { { 12295, 12295 }, { 12321, 12329 }, { 12344, 12347 }, - { 13312, 19903 }, + { 13312, 19903 }, { 19968, 40959 }, { 63744, 64109 }, { 64112, 64217 }, }; static const URange32 Han_range32[] = { { 94178, 94179 }, - { 94192, 94193 }, + { 94192, 94193 }, { 131072, 173791 }, { 173824, 177976 }, { 177984, 178205 }, { 178208, 183969 }, - { 183984, 191456 }, + { 183984, 191456 }, { 194560, 195101 }, - { 196608, 201546 }, -}; -static const URange16 Hangul_range16[] = { - { 4352, 4607 }, - { 12334, 12335 }, - { 12593, 12686 }, - { 12800, 12830 }, - { 12896, 12926 }, - { 43360, 43388 }, - { 44032, 55203 }, - { 55216, 55238 }, - { 55243, 55291 }, - { 65440, 65470 }, - { 65474, 65479 }, - { 
65482, 65487 }, - { 65490, 65495 }, - { 65498, 65500 }, -}; -static const URange32 Hanifi_Rohingya_range32[] = { - { 68864, 68903 }, - { 68912, 68921 }, -}; -static const URange16 Hanunoo_range16[] = { - { 5920, 5940 }, -}; -static const URange32 Hatran_range32[] = { - { 67808, 67826 }, - { 67828, 67829 }, - { 67835, 67839 }, -}; -static const URange16 Hebrew_range16[] = { - { 1425, 1479 }, - { 1488, 1514 }, - { 1519, 1524 }, - { 64285, 64310 }, - { 64312, 64316 }, - { 64318, 64318 }, - { 64320, 64321 }, - { 64323, 64324 }, - { 64326, 64335 }, + { 196608, 201546 }, +}; +static const URange16 Hangul_range16[] = { + { 4352, 4607 }, + { 12334, 12335 }, + { 12593, 12686 }, + { 12800, 12830 }, + { 12896, 12926 }, + { 43360, 43388 }, + { 44032, 55203 }, + { 55216, 55238 }, + { 55243, 55291 }, + { 65440, 65470 }, + { 65474, 65479 }, + { 65482, 65487 }, + { 65490, 65495 }, + { 65498, 65500 }, +}; +static const URange32 Hanifi_Rohingya_range32[] = { + { 68864, 68903 }, + { 68912, 68921 }, +}; +static const URange16 Hanunoo_range16[] = { + { 5920, 5940 }, +}; +static const URange32 Hatran_range32[] = { + { 67808, 67826 }, + { 67828, 67829 }, + { 67835, 67839 }, +}; +static const URange16 Hebrew_range16[] = { + { 1425, 1479 }, + { 1488, 1514 }, + { 1519, 1524 }, + { 64285, 64310 }, + { 64312, 64316 }, + { 64318, 64318 }, + { 64320, 64321 }, + { 64323, 64324 }, + { 64326, 64335 }, }; static const URange16 Hiragana_range16[] = { { 12353, 12438 }, @@ -5591,95 +5591,95 @@ static const URange16 Hiragana_range16[] = { }; static const URange32 Hiragana_range32[] = { { 110593, 110879 }, - { 110928, 110930 }, + { 110928, 110930 }, { 127488, 127488 }, }; -static const URange32 Imperial_Aramaic_range32[] = { - { 67648, 67669 }, - { 67671, 67679 }, +static const URange32 Imperial_Aramaic_range32[] = { + { 67648, 67669 }, + { 67671, 67679 }, }; -static const URange16 Inherited_range16[] = { - { 768, 879 }, - { 1157, 1158 }, - { 1611, 1621 }, - { 1648, 1648 }, - { 2385, 2388 }, +static 
const URange16 Inherited_range16[] = { + { 768, 879 }, + { 1157, 1158 }, + { 1611, 1621 }, + { 1648, 1648 }, + { 2385, 2388 }, { 6832, 6862 }, - { 7376, 7378 }, - { 7380, 7392 }, - { 7394, 7400 }, - { 7405, 7405 }, - { 7412, 7412 }, - { 7416, 7417 }, + { 7376, 7378 }, + { 7380, 7392 }, + { 7394, 7400 }, + { 7405, 7405 }, + { 7412, 7412 }, + { 7416, 7417 }, { 7616, 7679 }, - { 8204, 8205 }, - { 8400, 8432 }, - { 12330, 12333 }, - { 12441, 12442 }, - { 65024, 65039 }, - { 65056, 65069 }, -}; -static const URange32 Inherited_range32[] = { - { 66045, 66045 }, - { 66272, 66272 }, - { 70459, 70459 }, + { 8204, 8205 }, + { 8400, 8432 }, + { 12330, 12333 }, + { 12441, 12442 }, + { 65024, 65039 }, + { 65056, 65069 }, +}; +static const URange32 Inherited_range32[] = { + { 66045, 66045 }, + { 66272, 66272 }, + { 70459, 70459 }, { 118528, 118573 }, { 118576, 118598 }, - { 119143, 119145 }, - { 119163, 119170 }, - { 119173, 119179 }, - { 119210, 119213 }, - { 917760, 917999 }, -}; -static const URange32 Inscriptional_Pahlavi_range32[] = { - { 68448, 68466 }, - { 68472, 68479 }, -}; -static const URange32 Inscriptional_Parthian_range32[] = { - { 68416, 68437 }, - { 68440, 68447 }, -}; -static const URange16 Javanese_range16[] = { - { 43392, 43469 }, - { 43472, 43481 }, - { 43486, 43487 }, -}; -static const URange32 Kaithi_range32[] = { + { 119143, 119145 }, + { 119163, 119170 }, + { 119173, 119179 }, + { 119210, 119213 }, + { 917760, 917999 }, +}; +static const URange32 Inscriptional_Pahlavi_range32[] = { + { 68448, 68466 }, + { 68472, 68479 }, +}; +static const URange32 Inscriptional_Parthian_range32[] = { + { 68416, 68437 }, + { 68440, 68447 }, +}; +static const URange16 Javanese_range16[] = { + { 43392, 43469 }, + { 43472, 43481 }, + { 43486, 43487 }, +}; +static const URange32 Kaithi_range32[] = { { 69760, 69826 }, - { 69837, 69837 }, -}; -static const URange16 Kannada_range16[] = { - { 3200, 3212 }, - { 3214, 3216 }, - { 3218, 3240 }, - { 3242, 3251 }, - { 3253, 3257 }, - { 
3260, 3268 }, - { 3270, 3272 }, - { 3274, 3277 }, - { 3285, 3286 }, + { 69837, 69837 }, +}; +static const URange16 Kannada_range16[] = { + { 3200, 3212 }, + { 3214, 3216 }, + { 3218, 3240 }, + { 3242, 3251 }, + { 3253, 3257 }, + { 3260, 3268 }, + { 3270, 3272 }, + { 3274, 3277 }, + { 3285, 3286 }, { 3293, 3294 }, - { 3296, 3299 }, - { 3302, 3311 }, - { 3313, 3314 }, -}; -static const URange16 Katakana_range16[] = { - { 12449, 12538 }, - { 12541, 12543 }, - { 12784, 12799 }, - { 13008, 13054 }, - { 13056, 13143 }, - { 65382, 65391 }, - { 65393, 65437 }, -}; -static const URange32 Katakana_range32[] = { + { 3296, 3299 }, + { 3302, 3311 }, + { 3313, 3314 }, +}; +static const URange16 Katakana_range16[] = { + { 12449, 12538 }, + { 12541, 12543 }, + { 12784, 12799 }, + { 13008, 13054 }, + { 13056, 13143 }, + { 65382, 65391 }, + { 65393, 65437 }, +}; +static const URange32 Katakana_range32[] = { { 110576, 110579 }, { 110581, 110587 }, { 110589, 110590 }, - { 110592, 110592 }, + { 110592, 110592 }, { 110880, 110882 }, - { 110948, 110951 }, -}; + { 110948, 110951 }, +}; static const URange16 Kayah_Li_range16[] = { { 43264, 43309 }, { 43311, 43311 }, @@ -5689,42 +5689,42 @@ static const URange32 Kharoshthi_range32[] = { { 68101, 68102 }, { 68108, 68115 }, { 68117, 68119 }, - { 68121, 68149 }, + { 68121, 68149 }, { 68152, 68154 }, - { 68159, 68168 }, + { 68159, 68168 }, { 68176, 68184 }, }; -static const URange32 Khitan_Small_Script_range32[] = { - { 94180, 94180 }, - { 101120, 101589 }, -}; -static const URange16 Khmer_range16[] = { - { 6016, 6109 }, - { 6112, 6121 }, - { 6128, 6137 }, - { 6624, 6655 }, -}; -static const URange32 Khojki_range32[] = { - { 70144, 70161 }, - { 70163, 70206 }, -}; -static const URange32 Khudawadi_range32[] = { - { 70320, 70378 }, - { 70384, 70393 }, -}; -static const URange16 Lao_range16[] = { - { 3713, 3714 }, - { 3716, 3716 }, - { 3718, 3722 }, - { 3724, 3747 }, - { 3749, 3749 }, - { 3751, 3773 }, - { 3776, 3780 }, - { 3782, 3782 }, - { 3784, 
3789 }, - { 3792, 3801 }, - { 3804, 3807 }, -}; +static const URange32 Khitan_Small_Script_range32[] = { + { 94180, 94180 }, + { 101120, 101589 }, +}; +static const URange16 Khmer_range16[] = { + { 6016, 6109 }, + { 6112, 6121 }, + { 6128, 6137 }, + { 6624, 6655 }, +}; +static const URange32 Khojki_range32[] = { + { 70144, 70161 }, + { 70163, 70206 }, +}; +static const URange32 Khudawadi_range32[] = { + { 70320, 70378 }, + { 70384, 70393 }, +}; +static const URange16 Lao_range16[] = { + { 3713, 3714 }, + { 3716, 3716 }, + { 3718, 3722 }, + { 3724, 3747 }, + { 3749, 3749 }, + { 3751, 3773 }, + { 3776, 3780 }, + { 3782, 3782 }, + { 3784, 3789 }, + { 3792, 3801 }, + { 3804, 3807 }, +}; static const URange16 Latin_range16[] = { { 65, 90 }, { 97, 122 }, @@ -5756,7 +5756,7 @@ static const URange16 Latin_range16[] = { { 42994, 43007 }, { 43824, 43866 }, { 43868, 43876 }, - { 43878, 43881 }, + { 43878, 43881 }, { 64256, 64262 }, { 65313, 65338 }, { 65345, 65370 }, @@ -5767,146 +5767,146 @@ static const URange32 Latin_range32[] = { { 67506, 67514 }, { 122624, 122654 }, }; -static const URange16 Lepcha_range16[] = { - { 7168, 7223 }, - { 7227, 7241 }, - { 7245, 7247 }, -}; -static const URange16 Limbu_range16[] = { - { 6400, 6430 }, - { 6432, 6443 }, - { 6448, 6459 }, - { 6464, 6464 }, - { 6468, 6479 }, -}; -static const URange32 Linear_A_range32[] = { - { 67072, 67382 }, - { 67392, 67413 }, - { 67424, 67431 }, -}; -static const URange32 Linear_B_range32[] = { - { 65536, 65547 }, - { 65549, 65574 }, - { 65576, 65594 }, - { 65596, 65597 }, - { 65599, 65613 }, - { 65616, 65629 }, - { 65664, 65786 }, -}; -static const URange16 Lisu_range16[] = { - { 42192, 42239 }, -}; -static const URange32 Lisu_range32[] = { - { 73648, 73648 }, -}; -static const URange32 Lycian_range32[] = { - { 66176, 66204 }, -}; -static const URange32 Lydian_range32[] = { - { 67872, 67897 }, - { 67903, 67903 }, -}; -static const URange32 Mahajani_range32[] = { - { 69968, 70006 }, -}; -static const URange32 
Makasar_range32[] = { - { 73440, 73464 }, -}; -static const URange16 Malayalam_range16[] = { - { 3328, 3340 }, - { 3342, 3344 }, - { 3346, 3396 }, - { 3398, 3400 }, - { 3402, 3407 }, - { 3412, 3427 }, - { 3430, 3455 }, -}; -static const URange16 Mandaic_range16[] = { - { 2112, 2139 }, - { 2142, 2142 }, -}; -static const URange32 Manichaean_range32[] = { - { 68288, 68326 }, - { 68331, 68342 }, -}; -static const URange32 Marchen_range32[] = { - { 72816, 72847 }, - { 72850, 72871 }, - { 72873, 72886 }, -}; -static const URange32 Masaram_Gondi_range32[] = { - { 72960, 72966 }, - { 72968, 72969 }, - { 72971, 73014 }, - { 73018, 73018 }, - { 73020, 73021 }, - { 73023, 73031 }, - { 73040, 73049 }, -}; -static const URange32 Medefaidrin_range32[] = { - { 93760, 93850 }, -}; -static const URange16 Meetei_Mayek_range16[] = { - { 43744, 43766 }, - { 43968, 44013 }, - { 44016, 44025 }, -}; +static const URange16 Lepcha_range16[] = { + { 7168, 7223 }, + { 7227, 7241 }, + { 7245, 7247 }, +}; +static const URange16 Limbu_range16[] = { + { 6400, 6430 }, + { 6432, 6443 }, + { 6448, 6459 }, + { 6464, 6464 }, + { 6468, 6479 }, +}; +static const URange32 Linear_A_range32[] = { + { 67072, 67382 }, + { 67392, 67413 }, + { 67424, 67431 }, +}; +static const URange32 Linear_B_range32[] = { + { 65536, 65547 }, + { 65549, 65574 }, + { 65576, 65594 }, + { 65596, 65597 }, + { 65599, 65613 }, + { 65616, 65629 }, + { 65664, 65786 }, +}; +static const URange16 Lisu_range16[] = { + { 42192, 42239 }, +}; +static const URange32 Lisu_range32[] = { + { 73648, 73648 }, +}; +static const URange32 Lycian_range32[] = { + { 66176, 66204 }, +}; +static const URange32 Lydian_range32[] = { + { 67872, 67897 }, + { 67903, 67903 }, +}; +static const URange32 Mahajani_range32[] = { + { 69968, 70006 }, +}; +static const URange32 Makasar_range32[] = { + { 73440, 73464 }, +}; +static const URange16 Malayalam_range16[] = { + { 3328, 3340 }, + { 3342, 3344 }, + { 3346, 3396 }, + { 3398, 3400 }, + { 3402, 3407 }, + { 
3412, 3427 }, + { 3430, 3455 }, +}; +static const URange16 Mandaic_range16[] = { + { 2112, 2139 }, + { 2142, 2142 }, +}; +static const URange32 Manichaean_range32[] = { + { 68288, 68326 }, + { 68331, 68342 }, +}; +static const URange32 Marchen_range32[] = { + { 72816, 72847 }, + { 72850, 72871 }, + { 72873, 72886 }, +}; +static const URange32 Masaram_Gondi_range32[] = { + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73031 }, + { 73040, 73049 }, +}; +static const URange32 Medefaidrin_range32[] = { + { 93760, 93850 }, +}; +static const URange16 Meetei_Mayek_range16[] = { + { 43744, 43766 }, + { 43968, 44013 }, + { 44016, 44025 }, +}; static const URange32 Mende_Kikakui_range32[] = { { 124928, 125124 }, { 125127, 125142 }, }; -static const URange32 Meroitic_Cursive_range32[] = { - { 68000, 68023 }, - { 68028, 68047 }, - { 68050, 68095 }, +static const URange32 Meroitic_Cursive_range32[] = { + { 68000, 68023 }, + { 68028, 68047 }, + { 68050, 68095 }, }; -static const URange32 Meroitic_Hieroglyphs_range32[] = { - { 67968, 67999 }, +static const URange32 Meroitic_Hieroglyphs_range32[] = { + { 67968, 67999 }, }; -static const URange32 Miao_range32[] = { - { 93952, 94026 }, - { 94031, 94087 }, - { 94095, 94111 }, +static const URange32 Miao_range32[] = { + { 93952, 94026 }, + { 94031, 94087 }, + { 94095, 94111 }, }; -static const URange32 Modi_range32[] = { - { 71168, 71236 }, - { 71248, 71257 }, +static const URange32 Modi_range32[] = { + { 71168, 71236 }, + { 71248, 71257 }, }; -static const URange16 Mongolian_range16[] = { - { 6144, 6145 }, - { 6148, 6148 }, +static const URange16 Mongolian_range16[] = { + { 6144, 6145 }, + { 6148, 6148 }, { 6150, 6169 }, - { 6176, 6264 }, - { 6272, 6314 }, -}; -static const URange32 Mongolian_range32[] = { - { 71264, 71276 }, -}; -static const URange32 Mro_range32[] = { - { 92736, 92766 }, - { 92768, 92777 }, - { 92782, 92783 }, -}; -static const URange32 Multani_range32[] 
= { - { 70272, 70278 }, - { 70280, 70280 }, - { 70282, 70285 }, - { 70287, 70301 }, - { 70303, 70313 }, -}; -static const URange16 Myanmar_range16[] = { - { 4096, 4255 }, - { 43488, 43518 }, - { 43616, 43647 }, -}; -static const URange32 Nabataean_range32[] = { - { 67712, 67742 }, - { 67751, 67759 }, -}; -static const URange32 Nandinagari_range32[] = { - { 72096, 72103 }, - { 72106, 72151 }, - { 72154, 72164 }, + { 6176, 6264 }, + { 6272, 6314 }, +}; +static const URange32 Mongolian_range32[] = { + { 71264, 71276 }, +}; +static const URange32 Mro_range32[] = { + { 92736, 92766 }, + { 92768, 92777 }, + { 92782, 92783 }, +}; +static const URange32 Multani_range32[] = { + { 70272, 70278 }, + { 70280, 70280 }, + { 70282, 70285 }, + { 70287, 70301 }, + { 70303, 70313 }, +}; +static const URange16 Myanmar_range16[] = { + { 4096, 4255 }, + { 43488, 43518 }, + { 43616, 43647 }, +}; +static const URange32 Nabataean_range32[] = { + { 67712, 67742 }, + { 67751, 67759 }, +}; +static const URange32 Nandinagari_range32[] = { + { 72096, 72103 }, + { 72106, 72151 }, + { 72154, 72164 }, }; static const URange16 New_Tai_Lue_range16[] = { { 6528, 6571 }, @@ -5914,58 +5914,58 @@ static const URange16 New_Tai_Lue_range16[] = { { 6608, 6618 }, { 6622, 6623 }, }; -static const URange32 Newa_range32[] = { - { 70656, 70747 }, - { 70749, 70753 }, -}; -static const URange16 Nko_range16[] = { - { 1984, 2042 }, - { 2045, 2047 }, -}; -static const URange32 Nushu_range32[] = { - { 94177, 94177 }, - { 110960, 111355 }, -}; -static const URange32 Nyiakeng_Puachue_Hmong_range32[] = { - { 123136, 123180 }, - { 123184, 123197 }, - { 123200, 123209 }, - { 123214, 123215 }, -}; -static const URange16 Ogham_range16[] = { - { 5760, 5788 }, -}; +static const URange32 Newa_range32[] = { + { 70656, 70747 }, + { 70749, 70753 }, +}; +static const URange16 Nko_range16[] = { + { 1984, 2042 }, + { 2045, 2047 }, +}; +static const URange32 Nushu_range32[] = { + { 94177, 94177 }, + { 110960, 111355 }, +}; +static 
const URange32 Nyiakeng_Puachue_Hmong_range32[] = { + { 123136, 123180 }, + { 123184, 123197 }, + { 123200, 123209 }, + { 123214, 123215 }, +}; +static const URange16 Ogham_range16[] = { + { 5760, 5788 }, +}; static const URange16 Ol_Chiki_range16[] = { { 7248, 7295 }, }; -static const URange32 Old_Hungarian_range32[] = { - { 68736, 68786 }, - { 68800, 68850 }, - { 68858, 68863 }, -}; -static const URange32 Old_Italic_range32[] = { - { 66304, 66339 }, - { 66349, 66351 }, -}; -static const URange32 Old_North_Arabian_range32[] = { - { 68224, 68255 }, -}; -static const URange32 Old_Permic_range32[] = { - { 66384, 66426 }, -}; -static const URange32 Old_Persian_range32[] = { - { 66464, 66499 }, - { 66504, 66517 }, -}; -static const URange32 Old_Sogdian_range32[] = { - { 69376, 69415 }, -}; -static const URange32 Old_South_Arabian_range32[] = { - { 68192, 68223 }, -}; -static const URange32 Old_Turkic_range32[] = { - { 68608, 68680 }, -}; +static const URange32 Old_Hungarian_range32[] = { + { 68736, 68786 }, + { 68800, 68850 }, + { 68858, 68863 }, +}; +static const URange32 Old_Italic_range32[] = { + { 66304, 66339 }, + { 66349, 66351 }, +}; +static const URange32 Old_North_Arabian_range32[] = { + { 68224, 68255 }, +}; +static const URange32 Old_Permic_range32[] = { + { 66384, 66426 }, +}; +static const URange32 Old_Persian_range32[] = { + { 66464, 66499 }, + { 66504, 66517 }, +}; +static const URange32 Old_Sogdian_range32[] = { + { 69376, 69415 }, +}; +static const URange32 Old_South_Arabian_range32[] = { + { 68192, 68223 }, +}; +static const URange32 Old_Turkic_range32[] = { + { 68608, 68680 }, +}; static const URange32 Old_Uyghur_range32[] = { { 69488, 69513 }, }; @@ -5980,77 +5980,77 @@ static const URange16 Oriya_range16[] = { { 2876, 2884 }, { 2887, 2888 }, { 2891, 2893 }, - { 2901, 2903 }, + { 2901, 2903 }, { 2908, 2909 }, { 2911, 2915 }, { 2918, 2935 }, }; -static const URange32 Osage_range32[] = { - { 66736, 66771 }, - { 66776, 66811 }, +static const URange32 
Osage_range32[] = { + { 66736, 66771 }, + { 66776, 66811 }, }; -static const URange32 Osmanya_range32[] = { - { 66688, 66717 }, - { 66720, 66729 }, +static const URange32 Osmanya_range32[] = { + { 66688, 66717 }, + { 66720, 66729 }, }; -static const URange32 Pahawh_Hmong_range32[] = { - { 92928, 92997 }, - { 93008, 93017 }, - { 93019, 93025 }, - { 93027, 93047 }, - { 93053, 93071 }, +static const URange32 Pahawh_Hmong_range32[] = { + { 92928, 92997 }, + { 93008, 93017 }, + { 93019, 93025 }, + { 93027, 93047 }, + { 93053, 93071 }, }; -static const URange32 Palmyrene_range32[] = { - { 67680, 67711 }, +static const URange32 Palmyrene_range32[] = { + { 67680, 67711 }, }; -static const URange32 Pau_Cin_Hau_range32[] = { - { 72384, 72440 }, +static const URange32 Pau_Cin_Hau_range32[] = { + { 72384, 72440 }, }; -static const URange16 Phags_Pa_range16[] = { - { 43072, 43127 }, +static const URange16 Phags_Pa_range16[] = { + { 43072, 43127 }, }; static const URange32 Phoenician_range32[] = { { 67840, 67867 }, { 67871, 67871 }, }; -static const URange32 Psalter_Pahlavi_range32[] = { - { 68480, 68497 }, - { 68505, 68508 }, - { 68521, 68527 }, +static const URange32 Psalter_Pahlavi_range32[] = { + { 68480, 68497 }, + { 68505, 68508 }, + { 68521, 68527 }, }; -static const URange16 Rejang_range16[] = { - { 43312, 43347 }, - { 43359, 43359 }, +static const URange16 Rejang_range16[] = { + { 43312, 43347 }, + { 43359, 43359 }, }; -static const URange16 Runic_range16[] = { - { 5792, 5866 }, - { 5870, 5880 }, +static const URange16 Runic_range16[] = { + { 5792, 5866 }, + { 5870, 5880 }, }; -static const URange16 Samaritan_range16[] = { - { 2048, 2093 }, - { 2096, 2110 }, +static const URange16 Samaritan_range16[] = { + { 2048, 2093 }, + { 2096, 2110 }, }; -static const URange16 Saurashtra_range16[] = { - { 43136, 43205 }, - { 43214, 43225 }, +static const URange16 Saurashtra_range16[] = { + { 43136, 43205 }, + { 43214, 43225 }, }; -static const URange32 Sharada_range32[] = { - { 
70016, 70111 }, +static const URange32 Sharada_range32[] = { + { 70016, 70111 }, }; -static const URange32 Shavian_range32[] = { - { 66640, 66687 }, +static const URange32 Shavian_range32[] = { + { 66640, 66687 }, }; -static const URange32 Siddham_range32[] = { - { 71040, 71093 }, - { 71096, 71133 }, +static const URange32 Siddham_range32[] = { + { 71040, 71093 }, + { 71096, 71133 }, }; -static const URange32 SignWriting_range32[] = { - { 120832, 121483 }, - { 121499, 121503 }, - { 121505, 121519 }, +static const URange32 SignWriting_range32[] = { + { 120832, 121483 }, + { 121499, 121503 }, + { 121505, 121519 }, }; static const URange16 Sinhala_range16[] = { - { 3457, 3459 }, + { 3457, 3459 }, { 3461, 3478 }, { 3482, 3505 }, { 3507, 3515 }, @@ -6066,124 +6066,124 @@ static const URange16 Sinhala_range16[] = { static const URange32 Sinhala_range32[] = { { 70113, 70132 }, }; -static const URange32 Sogdian_range32[] = { - { 69424, 69465 }, +static const URange32 Sogdian_range32[] = { + { 69424, 69465 }, }; -static const URange32 Sora_Sompeng_range32[] = { - { 69840, 69864 }, - { 69872, 69881 }, +static const URange32 Sora_Sompeng_range32[] = { + { 69840, 69864 }, + { 69872, 69881 }, }; -static const URange32 Soyombo_range32[] = { - { 72272, 72354 }, +static const URange32 Soyombo_range32[] = { + { 72272, 72354 }, }; -static const URange16 Sundanese_range16[] = { - { 7040, 7103 }, - { 7360, 7367 }, +static const URange16 Sundanese_range16[] = { + { 7040, 7103 }, + { 7360, 7367 }, }; -static const URange16 Syloti_Nagri_range16[] = { - { 43008, 43052 }, +static const URange16 Syloti_Nagri_range16[] = { + { 43008, 43052 }, }; -static const URange16 Syriac_range16[] = { - { 1792, 1805 }, - { 1807, 1866 }, - { 1869, 1871 }, - { 2144, 2154 }, +static const URange16 Syriac_range16[] = { + { 1792, 1805 }, + { 1807, 1866 }, + { 1869, 1871 }, + { 2144, 2154 }, }; -static const URange16 Tagalog_range16[] = { +static const URange16 Tagalog_range16[] = { { 5888, 5909 }, { 5919, 
5919 }, }; -static const URange16 Tagbanwa_range16[] = { - { 5984, 5996 }, - { 5998, 6000 }, - { 6002, 6003 }, +static const URange16 Tagbanwa_range16[] = { + { 5984, 5996 }, + { 5998, 6000 }, + { 6002, 6003 }, }; -static const URange16 Tai_Le_range16[] = { - { 6480, 6509 }, - { 6512, 6516 }, +static const URange16 Tai_Le_range16[] = { + { 6480, 6509 }, + { 6512, 6516 }, }; -static const URange16 Tai_Tham_range16[] = { - { 6688, 6750 }, - { 6752, 6780 }, - { 6783, 6793 }, - { 6800, 6809 }, - { 6816, 6829 }, +static const URange16 Tai_Tham_range16[] = { + { 6688, 6750 }, + { 6752, 6780 }, + { 6783, 6793 }, + { 6800, 6809 }, + { 6816, 6829 }, }; -static const URange16 Tai_Viet_range16[] = { - { 43648, 43714 }, - { 43739, 43743 }, +static const URange16 Tai_Viet_range16[] = { + { 43648, 43714 }, + { 43739, 43743 }, }; -static const URange32 Takri_range32[] = { +static const URange32 Takri_range32[] = { { 71296, 71353 }, - { 71360, 71369 }, -}; -static const URange16 Tamil_range16[] = { - { 2946, 2947 }, - { 2949, 2954 }, - { 2958, 2960 }, - { 2962, 2965 }, - { 2969, 2970 }, - { 2972, 2972 }, - { 2974, 2975 }, - { 2979, 2980 }, - { 2984, 2986 }, - { 2990, 3001 }, - { 3006, 3010 }, - { 3014, 3016 }, - { 3018, 3021 }, - { 3024, 3024 }, - { 3031, 3031 }, - { 3046, 3066 }, -}; -static const URange32 Tamil_range32[] = { - { 73664, 73713 }, - { 73727, 73727 }, + { 71360, 71369 }, +}; +static const URange16 Tamil_range16[] = { + { 2946, 2947 }, + { 2949, 2954 }, + { 2958, 2960 }, + { 2962, 2965 }, + { 2969, 2970 }, + { 2972, 2972 }, + { 2974, 2975 }, + { 2979, 2980 }, + { 2984, 2986 }, + { 2990, 3001 }, + { 3006, 3010 }, + { 3014, 3016 }, + { 3018, 3021 }, + { 3024, 3024 }, + { 3031, 3031 }, + { 3046, 3066 }, +}; +static const URange32 Tamil_range32[] = { + { 73664, 73713 }, + { 73727, 73727 }, }; static const URange32 Tangsa_range32[] = { { 92784, 92862 }, { 92864, 92873 }, }; -static const URange32 Tangut_range32[] = { - { 94176, 94176 }, - { 94208, 100343 }, - { 100352, 
101119 }, - { 101632, 101640 }, -}; -static const URange16 Telugu_range16[] = { - { 3072, 3084 }, - { 3086, 3088 }, - { 3090, 3112 }, - { 3114, 3129 }, +static const URange32 Tangut_range32[] = { + { 94176, 94176 }, + { 94208, 100343 }, + { 100352, 101119 }, + { 101632, 101640 }, +}; +static const URange16 Telugu_range16[] = { + { 3072, 3084 }, + { 3086, 3088 }, + { 3090, 3112 }, + { 3114, 3129 }, { 3132, 3140 }, - { 3142, 3144 }, - { 3146, 3149 }, - { 3157, 3158 }, - { 3160, 3162 }, + { 3142, 3144 }, + { 3146, 3149 }, + { 3157, 3158 }, + { 3160, 3162 }, { 3165, 3165 }, - { 3168, 3171 }, - { 3174, 3183 }, - { 3191, 3199 }, -}; -static const URange16 Thaana_range16[] = { - { 1920, 1969 }, -}; -static const URange16 Thai_range16[] = { - { 3585, 3642 }, - { 3648, 3675 }, -}; -static const URange16 Tibetan_range16[] = { - { 3840, 3911 }, - { 3913, 3948 }, - { 3953, 3991 }, - { 3993, 4028 }, - { 4030, 4044 }, - { 4046, 4052 }, - { 4057, 4058 }, -}; -static const URange16 Tifinagh_range16[] = { - { 11568, 11623 }, - { 11631, 11632 }, - { 11647, 11647 }, + { 3168, 3171 }, + { 3174, 3183 }, + { 3191, 3199 }, +}; +static const URange16 Thaana_range16[] = { + { 1920, 1969 }, +}; +static const URange16 Thai_range16[] = { + { 3585, 3642 }, + { 3648, 3675 }, +}; +static const URange16 Tibetan_range16[] = { + { 3840, 3911 }, + { 3913, 3948 }, + { 3953, 3991 }, + { 3993, 4028 }, + { 4030, 4044 }, + { 4046, 4052 }, + { 4057, 4058 }, +}; +static const URange16 Tifinagh_range16[] = { + { 11568, 11623 }, + { 11631, 11632 }, + { 11647, 11647 }, }; static const URange32 Tirhuta_range32[] = { { 70784, 70855 }, @@ -6192,12 +6192,12 @@ static const URange32 Tirhuta_range32[] = { static const URange32 Toto_range32[] = { { 123536, 123566 }, }; -static const URange32 Ugaritic_range32[] = { - { 66432, 66461 }, - { 66463, 66463 }, +static const URange32 Ugaritic_range32[] = { + { 66432, 66461 }, + { 66463, 66463 }, }; -static const URange16 Vai_range16[] = { - { 42240, 42539 }, +static const 
URange16 Vai_range16[] = { + { 42240, 42539 }, }; static const URange32 Vithkuqi_range32[] = { { 66928, 66938 }, @@ -6209,40 +6209,40 @@ static const URange32 Vithkuqi_range32[] = { { 66995, 67001 }, { 67003, 67004 }, }; -static const URange32 Wancho_range32[] = { - { 123584, 123641 }, - { 123647, 123647 }, +static const URange32 Wancho_range32[] = { + { 123584, 123641 }, + { 123647, 123647 }, }; -static const URange32 Warang_Citi_range32[] = { - { 71840, 71922 }, - { 71935, 71935 }, +static const URange32 Warang_Citi_range32[] = { + { 71840, 71922 }, + { 71935, 71935 }, }; -static const URange32 Yezidi_range32[] = { - { 69248, 69289 }, - { 69291, 69293 }, - { 69296, 69297 }, +static const URange32 Yezidi_range32[] = { + { 69248, 69289 }, + { 69291, 69293 }, + { 69296, 69297 }, }; -static const URange16 Yi_range16[] = { - { 40960, 42124 }, - { 42128, 42182 }, +static const URange16 Yi_range16[] = { + { 40960, 42124 }, + { 42128, 42182 }, }; -static const URange32 Zanabazar_Square_range32[] = { - { 72192, 72263 }, +static const URange32 Zanabazar_Square_range32[] = { + { 72192, 72263 }, }; // 4038 16-bit ranges, 1712 32-bit ranges const UGroup unicode_groups[] = { - { "Adlam", +1, 0, 0, Adlam_range32, 3 }, + { "Adlam", +1, 0, 0, Adlam_range32, 3 }, { "Ahom", +1, 0, 0, Ahom_range32, 3 }, { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 }, - { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, - { "Armenian", +1, Armenian_range16, 4, 0, 0 }, + { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, + { "Armenian", +1, Armenian_range16, 4, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, { "Bamum", +1, Bamum_range16, 1, Bamum_range32, 1 }, { "Bassa_Vah", +1, 0, 0, Bassa_Vah_range32, 2 }, { "Batak", +1, Batak_range16, 2, 0, 0 }, { "Bengali", +1, Bengali_range16, 14, 0, 0 }, - { "Bhaiksuki", +1, 0, 0, Bhaiksuki_range32, 4 }, + { "Bhaiksuki", +1, 0, 0, Bhaiksuki_range32, 4 }, { "Bopomofo", +1, 
Bopomofo_range16, 3, 0, 0 }, { "Brahmi", +1, 0, 0, Brahmi_range32, 3 }, { "Braille", +1, Braille_range16, 1, 0, 0 }, @@ -6257,7 +6257,7 @@ const UGroup unicode_groups[] = { { "Chakma", +1, 0, 0, Chakma_range32, 2 }, { "Cham", +1, Cham_range16, 4, 0, 0 }, { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, - { "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 }, + { "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, { "Common", +1, Common_range16, 91, Common_range32, 83 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, @@ -6265,53 +6265,53 @@ const UGroup unicode_groups[] = { { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, { "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, { "Cypro_Minoan", +1, 0, 0, Cypro_Minoan_range32, 1 }, - { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, + { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, - { "Dives_Akuru", +1, 0, 0, Dives_Akuru_range32, 8 }, - { "Dogra", +1, 0, 0, Dogra_range32, 1 }, + { "Dives_Akuru", +1, 0, 0, Dives_Akuru_range32, 8 }, + { "Dogra", +1, 0, 0, Dogra_range32, 1 }, { "Duployan", +1, 0, 0, Duployan_range32, 5 }, - { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 2 }, + { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 2 }, { "Elbasan", +1, 0, 0, Elbasan_range32, 1 }, - { "Elymaic", +1, 0, 0, Elymaic_range32, 1 }, + { "Elymaic", +1, 0, 0, Elymaic_range32, 1 }, { "Ethiopic", +1, Ethiopic_range16, 32, Ethiopic_range32, 4 }, - { "Georgian", +1, Georgian_range16, 10, 0, 0 }, + { "Georgian", +1, Georgian_range16, 10, 0, 0 }, { "Glagolitic", +1, Glagolitic_range16, 1, Glagolitic_range32, 5 }, { "Gothic", +1, 0, 0, Gothic_range32, 1 }, { "Grantha", +1, 0, 0, Grantha_range32, 15 }, { "Greek", +1, Greek_range16, 33, Greek_range32, 3 }, { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, - { "Gunjala_Gondi", +1, 0, 0, Gunjala_Gondi_range32, 6 }, + { "Gunjala_Gondi", +1, 0, 0, 
Gunjala_Gondi_range32, 6 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, { "Han", +1, Han_range16, 11, Han_range32, 9 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, - { "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 }, + { "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, { "Hatran", +1, 0, 0, Hatran_range32, 3 }, { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, - { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 3 }, + { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 3 }, { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 }, { "Inherited", +1, Inherited_range16, 19, Inherited_range32, 10 }, { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 }, { "Inscriptional_Parthian", +1, 0, 0, Inscriptional_Parthian_range32, 2 }, { "Javanese", +1, Javanese_range16, 3, 0, 0 }, - { "Kaithi", +1, 0, 0, Kaithi_range32, 2 }, - { "Kannada", +1, Kannada_range16, 13, 0, 0 }, + { "Kaithi", +1, 0, 0, Kaithi_range32, 2 }, + { "Kannada", +1, Kannada_range16, 13, 0, 0 }, { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 6 }, { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, - { "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 }, + { "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, { "Khojki", +1, 0, 0, Khojki_range32, 2 }, { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, { "L", +1, L_range16, 380, L_range32, 268 }, - { "Lao", +1, Lao_range16, 11, 0, 0 }, + { "Lao", +1, Lao_range16, 11, 0, 0 }, { "Latin", +1, Latin_range16, 34, Latin_range32, 4 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, - { "Lisu", +1, Lisu_range16, 1, Lisu_range32, 1 }, + { "Lisu", +1, Lisu_range16, 1, Lisu_range32, 1 }, { "Ll", +1, Ll_range16, 617, 
Ll_range32, 40 }, { "Lm", +1, Lm_range16, 57, Lm_range32, 12 }, { "Lo", +1, Lo_range16, 290, Lo_range32, 211 }, @@ -6321,15 +6321,15 @@ const UGroup unicode_groups[] = { { "Lydian", +1, 0, 0, Lydian_range32, 2 }, { "M", +1, M_range16, 189, M_range32, 110 }, { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, - { "Makasar", +1, 0, 0, Makasar_range32, 1 }, - { "Malayalam", +1, Malayalam_range16, 7, 0, 0 }, + { "Makasar", +1, 0, 0, Makasar_range32, 1 }, + { "Malayalam", +1, Malayalam_range16, 7, 0, 0 }, { "Mandaic", +1, Mandaic_range16, 2, 0, 0 }, { "Manichaean", +1, 0, 0, Manichaean_range32, 2 }, - { "Marchen", +1, 0, 0, Marchen_range32, 3 }, - { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, + { "Marchen", +1, 0, 0, Marchen_range32, 3 }, + { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, { "Mc", +1, Mc_range16, 111, Mc_range32, 66 }, { "Me", +1, Me_range16, 5, 0, 0 }, - { "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 }, + { "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, { "Mende_Kikakui", +1, 0, 0, Mende_Kikakui_range32, 2 }, { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, @@ -6343,28 +6343,28 @@ const UGroup unicode_groups[] = { { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, { "N", +1, N_range16, 67, N_range32, 67 }, { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, - { "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 }, + { "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 }, { "Nd", +1, Nd_range16, 37, Nd_range32, 25 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, - { "Newa", +1, 0, 0, Newa_range32, 2 }, - { "Nko", +1, Nko_range16, 2, 0, 0 }, + { "Newa", +1, 0, 0, Newa_range32, 2 }, + { "Nko", +1, Nko_range16, 2, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 29, No_range32, 42 }, - { "Nushu", +1, 0, 0, Nushu_range32, 2 }, - { "Nyiakeng_Puachue_Hmong", +1, 0, 0, Nyiakeng_Puachue_Hmong_range32, 4 }, + { "No", +1, No_range16, 29, No_range32, 42 }, 
+ { "Nushu", +1, 0, 0, Nushu_range32, 2 }, + { "Nyiakeng_Puachue_Hmong", +1, 0, 0, Nyiakeng_Puachue_Hmong_range32, 4 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, { "Ol_Chiki", +1, Ol_Chiki_range16, 1, 0, 0 }, { "Old_Hungarian", +1, 0, 0, Old_Hungarian_range32, 3 }, - { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 }, + { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 }, { "Old_North_Arabian", +1, 0, 0, Old_North_Arabian_range32, 1 }, { "Old_Permic", +1, 0, 0, Old_Permic_range32, 1 }, { "Old_Persian", +1, 0, 0, Old_Persian_range32, 2 }, - { "Old_Sogdian", +1, 0, 0, Old_Sogdian_range32, 1 }, + { "Old_Sogdian", +1, 0, 0, Old_Sogdian_range32, 1 }, { "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 1 }, { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 }, { "Old_Uyghur", +1, 0, 0, Old_Uyghur_range32, 1 }, { "Oriya", +1, Oriya_range16, 14, 0, 0 }, - { "Osage", +1, 0, 0, Osage_range32, 2 }, + { "Osage", +1, 0, 0, Osage_range32, 2 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, { "P", +1, P_range16, 133, P_range32, 56 }, { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, @@ -6385,8 +6385,8 @@ const UGroup unicode_groups[] = { { "S", +1, S_range16, 151, S_range32, 83 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, - { "Sc", +1, Sc_range16, 18, Sc_range32, 3 }, - { "Sharada", +1, 0, 0, Sharada_range32, 1 }, + { "Sc", +1, Sc_range16, 18, Sc_range32, 3 }, + { "Sharada", +1, 0, 0, Sharada_range32, 1 }, { "Shavian", +1, 0, 0, Shavian_range32, 1 }, { "Siddham", +1, 0, 0, Siddham_range32, 2 }, { "SignWriting", +1, 0, 0, SignWriting_range32, 3 }, @@ -6394,21 +6394,21 @@ const UGroup unicode_groups[] = { { "Sk", +1, Sk_range16, 30, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, { "So", +1, So_range16, 114, So_range32, 72 }, - { "Sogdian", +1, 0, 0, Sogdian_range32, 1 }, + { "Sogdian", +1, 0, 0, Sogdian_range32, 1 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, - { "Soyombo", +1, 0, 0, 
Soyombo_range32, 1 }, + { "Soyombo", +1, 0, 0, Soyombo_range32, 1 }, { "Sundanese", +1, Sundanese_range16, 2, 0, 0 }, { "Syloti_Nagri", +1, Syloti_Nagri_range16, 1, 0, 0 }, - { "Syriac", +1, Syriac_range16, 4, 0, 0 }, + { "Syriac", +1, Syriac_range16, 4, 0, 0 }, { "Tagalog", +1, Tagalog_range16, 2, 0, 0 }, { "Tagbanwa", +1, Tagbanwa_range16, 3, 0, 0 }, { "Tai_Le", +1, Tai_Le_range16, 2, 0, 0 }, { "Tai_Tham", +1, Tai_Tham_range16, 5, 0, 0 }, { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, { "Takri", +1, 0, 0, Takri_range32, 2 }, - { "Tamil", +1, Tamil_range16, 16, Tamil_range32, 2 }, + { "Tamil", +1, Tamil_range16, 16, Tamil_range32, 2 }, { "Tangsa", +1, 0, 0, Tangsa_range32, 2 }, - { "Tangut", +1, 0, 0, Tangut_range32, 4 }, + { "Tangut", +1, 0, 0, Tangut_range32, 4 }, { "Telugu", +1, Telugu_range16, 13, 0, 0 }, { "Thaana", +1, Thaana_range16, 1, 0, 0 }, { "Thai", +1, Thai_range16, 2, 0, 0 }, @@ -6419,12 +6419,12 @@ const UGroup unicode_groups[] = { { "Ugaritic", +1, 0, 0, Ugaritic_range32, 2 }, { "Vai", +1, Vai_range16, 1, 0, 0 }, { "Vithkuqi", +1, 0, 0, Vithkuqi_range32, 8 }, - { "Wancho", +1, 0, 0, Wancho_range32, 2 }, + { "Wancho", +1, 0, 0, Wancho_range32, 2 }, { "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 }, - { "Yezidi", +1, 0, 0, Yezidi_range32, 3 }, + { "Yezidi", +1, 0, 0, Yezidi_range32, 3 }, { "Yi", +1, Yi_range16, 2, 0, 0 }, { "Z", +1, Z_range16, 8, 0, 0 }, - { "Zanabazar_Square", +1, 0, 0, Zanabazar_Square_range32, 1 }, + { "Zanabazar_Square", +1, 0, 0, Zanabazar_Square_range32, 1 }, { "Zl", +1, Zl_range16, 1, 0, 0 }, { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, diff --git a/contrib/libs/re2/re2/walker-inl.h b/contrib/libs/re2/re2/walker-inl.h index 336b9a3167..4d064a0970 100644 --- a/contrib/libs/re2/re2/walker-inl.h +++ b/contrib/libs/re2/re2/walker-inl.h @@ -89,7 +89,7 @@ template<typename T> class Regexp::Walker { private: // Walk state for the entire traversal. 
- std::stack<WalkState<T>> stack_; + std::stack<WalkState<T>> stack_; bool stopped_early_; int max_visits_; @@ -119,7 +119,7 @@ template<typename T> T Regexp::Walker<T>::Copy(T arg) { // State about a single level in the traversal. template<typename T> struct WalkState { - WalkState(Regexp* re, T parent) + WalkState(Regexp* re, T parent) : re(re), n(-1), parent_arg(parent), @@ -145,12 +145,12 @@ template<typename T> Regexp::Walker<T>::~Walker() { // Walk always enters and exits with an empty stack. // Logs DFATAL if stack is not already clear. template<typename T> void Regexp::Walker<T>::Reset() { - if (!stack_.empty()) { + if (!stack_.empty()) { LOG(DFATAL) << "Stack not empty."; - while (!stack_.empty()) { + while (!stack_.empty()) { if (stack_.top().re->nsub_ > 1) delete[] stack_.top().child_args; - stack_.pop(); + stack_.pop(); } } } @@ -164,12 +164,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, return top_arg; } - stack_.push(WalkState<T>(re, top_arg)); + stack_.push(WalkState<T>(re, top_arg)); WalkState<T>* s; for (;;) { T t; - s = &stack_.top(); + s = &stack_.top(); re = s->re; switch (s->n) { case -1: { @@ -200,7 +200,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, s->child_args[s->n] = Copy(s->child_args[s->n - 1]); s->n++; } else { - stack_.push(WalkState<T>(sub[s->n], s->pre_arg)); + stack_.push(WalkState<T>(sub[s->n], s->pre_arg)); } continue; } @@ -213,12 +213,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, } } - // We've finished stack_.top(). + // We've finished stack_.top(). // Update next guy down. 
- stack_.pop(); - if (stack_.empty()) + stack_.pop(); + if (stack_.empty()) return t; - s = &stack_.top(); + s = &stack_.top(); if (s->child_args != NULL) s->child_args[s->n] = t; else diff --git a/contrib/libs/re2/util/flags.h b/contrib/libs/re2/util/flags.h index a3d5fc1234..3386b729d4 100644 --- a/contrib/libs/re2/util/flags.h +++ b/contrib/libs/re2/util/flags.h @@ -1,26 +1,26 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_FLAGS_H_ -#define UTIL_FLAGS_H_ - -// Simplified version of Google's command line flags. -// Does not support parsing the command line. -// If you want to do that, see -// https://gflags.github.io/gflags/ - -#define DEFINE_FLAG(type, name, deflt, desc) \ - namespace re2 { type FLAGS_##name = deflt; } - -#define DECLARE_FLAG(type, name) \ - namespace re2 { extern type FLAGS_##name; } - -namespace re2 { -template <typename T> -T GetFlag(const T& flag) { - return flag; -} -} // namespace re2 - -#endif // UTIL_FLAGS_H_ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_FLAGS_H_ +#define UTIL_FLAGS_H_ + +// Simplified version of Google's command line flags. +// Does not support parsing the command line. 
+// If you want to do that, see +// https://gflags.github.io/gflags/ + +#define DEFINE_FLAG(type, name, deflt, desc) \ + namespace re2 { type FLAGS_##name = deflt; } + +#define DECLARE_FLAG(type, name) \ + namespace re2 { extern type FLAGS_##name; } + +namespace re2 { +template <typename T> +T GetFlag(const T& flag) { + return flag; +} +} // namespace re2 + +#endif // UTIL_FLAGS_H_ diff --git a/contrib/libs/re2/util/logging.h b/contrib/libs/re2/util/logging.h index be5b4d4dbb..5b2217f29c 100644 --- a/contrib/libs/re2/util/logging.h +++ b/contrib/libs/re2/util/logging.h @@ -62,7 +62,7 @@ class LogMessage { } void Flush() { stream() << "\n"; - std::string s = str_.str(); + std::string s = str_.str(); size_t n = s.size(); if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc flushed_ = true; @@ -93,7 +93,7 @@ class LogMessageFatal : public LogMessage { public: LogMessageFatal(const char* file, int line) : LogMessage(file, line) {} - ATTRIBUTE_NORETURN ~LogMessageFatal() { + ATTRIBUTE_NORETURN ~LogMessageFatal() { Flush(); abort(); } diff --git a/contrib/libs/re2/util/mutex.h b/contrib/libs/re2/util/mutex.h index 0ad97ff1eb..158046bb5c 100644 --- a/contrib/libs/re2/util/mutex.h +++ b/contrib/libs/re2/util/mutex.h @@ -10,13 +10,13 @@ * You should assume the locks are *not* re-entrant. */ -#ifdef _WIN32 -// Requires Windows Vista or Windows Server 2008 at minimum. -#include <windows.h> -#if defined(WINVER) && WINVER >= 0x0600 -#define MUTEX_IS_WIN32_SRWLOCK -#endif -#else +#ifdef _WIN32 +// Requires Windows Vista or Windows Server 2008 at minimum. 
+#include <windows.h> +#if defined(WINVER) && WINVER >= 0x0600 +#define MUTEX_IS_WIN32_SRWLOCK +#endif +#else #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif @@ -26,9 +26,9 @@ #endif #endif -#if defined(MUTEX_IS_WIN32_SRWLOCK) -typedef SRWLOCK MutexType; -#elif defined(MUTEX_IS_PTHREAD_RWLOCK) +#if defined(MUTEX_IS_WIN32_SRWLOCK) +typedef SRWLOCK MutexType; +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) #include <pthread.h> #include <stdlib.h> typedef pthread_rwlock_t MutexType; @@ -64,17 +64,17 @@ class Mutex { Mutex& operator=(const Mutex&) = delete; }; -#if defined(MUTEX_IS_WIN32_SRWLOCK) +#if defined(MUTEX_IS_WIN32_SRWLOCK) Mutex::Mutex() : mutex_(SRWLOCK_INIT) { } -Mutex::~Mutex() { } -void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } -void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } -void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } -void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } - -#elif defined(MUTEX_IS_PTHREAD_RWLOCK) - +Mutex::~Mutex() { } +void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } +void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } +void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } +void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } + +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) + #define SAFE_PTHREAD(fncall) \ do { \ if ((fncall) != 0) abort(); \ diff --git a/contrib/libs/re2/util/pcre.cc b/contrib/libs/re2/util/pcre.cc index 93ffe9421b..b68985144f 100644 --- a/contrib/libs/re2/util/pcre.cc +++ b/contrib/libs/re2/util/pcre.cc @@ -1,1025 +1,1025 @@ -// Copyright 2003-2009 Google Inc. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This is a variant of PCRE's pcrecpp.cc, originally written at Google. -// The main changes are the addition of the HitLimit method and -// compilation as PCRE in namespace re2. 
- -#include <assert.h> -#include <ctype.h> -#include <errno.h> -#include <stdlib.h> -#include <string.h> -#include <limits> -#include <string> -#include <utility> - -#include "util/util.h" -#include "util/flags.h" -#include "util/logging.h" -#include "util/pcre.h" -#include "util/strutil.h" - -// Silence warnings about the wacky formatting in the operator() functions. -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 -#pragma GCC diagnostic ignored "-Wmisleading-indentation" -#endif - -#define PCREPORT(level) LOG(level) - -// Default PCRE limits. -// Defaults chosen to allow a plausible amount of CPU and -// not exceed main thread stacks. Note that other threads -// often have smaller stacks, and therefore tightening -// regexp_stack_limit may frequently be necessary. -DEFINE_FLAG(int, regexp_stack_limit, 256 << 10, - "default PCRE stack limit (bytes)"); -DEFINE_FLAG(int, regexp_match_limit, 1000000, - "default PCRE match limit (function calls)"); - -#ifndef USEPCRE - -// Fake just enough of the PCRE API to allow this file to build. 
:) - -struct pcre_extra { - int flags; - int match_limit; - int match_limit_recursion; -}; - -#define PCRE_EXTRA_MATCH_LIMIT 0 -#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 -#define PCRE_ANCHORED 0 -#define PCRE_NOTEMPTY 0 -#define PCRE_ERROR_NOMATCH 1 -#define PCRE_ERROR_MATCHLIMIT 2 -#define PCRE_ERROR_RECURSIONLIMIT 3 -#define PCRE_INFO_CAPTURECOUNT 0 - -void pcre_free(void*) { -} - -pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { - return NULL; -} - -int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) { - return 0; -} - -int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { - return 0; -} - -#endif - -namespace re2 { - -// Maximum number of args we can set -static const int kMaxArgs = 16; -static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace - -// Approximate size of a recursive invocation of PCRE's -// internal "match()" frame. This varies depending on the -// compiler and architecture, of course, so the constant is -// just a conservative estimate. To find the exact number, -// run regexp_unittest with --regexp_stack_limit=0 under -// a debugger and look at the frames when it crashes. -// The exact frame size was 656 in production on 2008/02/03. -static const int kPCREFrameSize = 700; - -// Special name for missing C++ arguments. 
-PCRE::Arg PCRE::no_more_args((void*)NULL); - -const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; -const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; -const PCRE::ConsumeFunctor PCRE::Consume = { }; -const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; - -// If a regular expression has no error, its error_ field points here -static const std::string empty_string; - -void PCRE::Init(const char* pattern, Option options, int match_limit, - int stack_limit, bool report_errors) { - pattern_ = pattern; - options_ = options; - match_limit_ = match_limit; - stack_limit_ = stack_limit; - hit_limit_ = false; - error_ = &empty_string; - report_errors_ = report_errors; - re_full_ = NULL; - re_partial_ = NULL; - - if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { - error_ = new std::string("illegal regexp option"); - PCREPORT(ERROR) - << "Error compiling '" << pattern << "': illegal regexp option"; - } else { - re_partial_ = Compile(UNANCHORED); - if (re_partial_ != NULL) { - re_full_ = Compile(ANCHOR_BOTH); - } - } -} - -PCRE::PCRE(const char* pattern) { - Init(pattern, None, 0, 0, true); -} -PCRE::PCRE(const char* pattern, Option option) { - Init(pattern, option, 0, 0, true); -} -PCRE::PCRE(const std::string& pattern) { - Init(pattern.c_str(), None, 0, 0, true); -} -PCRE::PCRE(const std::string& pattern, Option option) { - Init(pattern.c_str(), option, 0, 0, true); -} -PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) { - Init(pattern.c_str(), re_option.option(), re_option.match_limit(), - re_option.stack_limit(), re_option.report_errors()); -} - -PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { - Init(pattern, re_option.option(), re_option.match_limit(), - re_option.stack_limit(), re_option.report_errors()); -} - -PCRE::~PCRE() { - if (re_full_ != NULL) pcre_free(re_full_); - if (re_partial_ != NULL) pcre_free(re_partial_); - if (error_ != &empty_string) delete error_; -} - -pcre* PCRE::Compile(Anchor anchor) 
{ - // Special treatment for anchoring. This is needed because at - // runtime pcre only provides an option for anchoring at the - // beginning of a string. - // - // There are three types of anchoring we want: - // UNANCHORED Compile the original pattern, and use - // a pcre unanchored match. - // ANCHOR_START Compile the original pattern, and use - // a pcre anchored match. - // ANCHOR_BOTH Tack a "\z" to the end of the original pattern - // and use a pcre anchored match. - - const char* error = ""; - int eoffset; - pcre* re; - if (anchor != ANCHOR_BOTH) { - re = pcre_compile(pattern_.c_str(), - (options_ & EnabledCompileOptions), - &error, &eoffset, NULL); - } else { - // Tack a '\z' at the end of PCRE. Parenthesize it first so that - // the '\z' applies to all top-level alternatives in the regexp. - std::string wrapped = "(?:"; // A non-counting grouping operator - wrapped += pattern_; - wrapped += ")\\z"; - re = pcre_compile(wrapped.c_str(), - (options_ & EnabledCompileOptions), - &error, &eoffset, NULL); - } - if (re == NULL) { - if (error_ == &empty_string) error_ = new std::string(error); - PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; - } - return re; -} - -/***** Convenience interfaces *****/ - -bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, - const PCRE& re, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { - const Arg* args[kMaxArgs]; - int n = 0; - if (&a0 == &no_more_args) goto done; args[n++] = &a0; - if (&a1 == &no_more_args) goto done; args[n++] = &a1; - if (&a2 == &no_more_args) goto done; args[n++] = &a2; - if (&a3 == &no_more_args) goto done; args[n++] = &a3; - if (&a4 == &no_more_args) goto done; args[n++] = &a4; - if (&a5 == &no_more_args) goto done; args[n++] = &a5; - 
if (&a6 == &no_more_args) goto done; args[n++] = &a6; - if (&a7 == &no_more_args) goto done; args[n++] = &a7; - if (&a8 == &no_more_args) goto done; args[n++] = &a8; - if (&a9 == &no_more_args) goto done; args[n++] = &a9; - if (&a10 == &no_more_args) goto done; args[n++] = &a10; - if (&a11 == &no_more_args) goto done; args[n++] = &a11; - if (&a12 == &no_more_args) goto done; args[n++] = &a12; - if (&a13 == &no_more_args) goto done; args[n++] = &a13; - if (&a14 == &no_more_args) goto done; args[n++] = &a14; - if (&a15 == &no_more_args) goto done; args[n++] = &a15; -done: - - size_t consumed; - int vec[kVecSize] = {}; - return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); -} - -bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, - const PCRE& re, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { - const Arg* args[kMaxArgs]; - int n = 0; - if (&a0 == &no_more_args) goto done; args[n++] = &a0; - if (&a1 == &no_more_args) goto done; args[n++] = &a1; - if (&a2 == &no_more_args) goto done; args[n++] = &a2; - if (&a3 == &no_more_args) goto done; args[n++] = &a3; - if (&a4 == &no_more_args) goto done; args[n++] = &a4; - if (&a5 == &no_more_args) goto done; args[n++] = &a5; - if (&a6 == &no_more_args) goto done; args[n++] = &a6; - if (&a7 == &no_more_args) goto done; args[n++] = &a7; - if (&a8 == &no_more_args) goto done; args[n++] = &a8; - if (&a9 == &no_more_args) goto done; args[n++] = &a9; - if (&a10 == &no_more_args) goto done; args[n++] = &a10; - if (&a11 == &no_more_args) goto done; args[n++] = &a11; - if (&a12 == &no_more_args) goto done; args[n++] = &a12; - if (&a13 == &no_more_args) goto done; args[n++] = &a13; - if (&a14 == &no_more_args) goto done; args[n++] = &a14; - if (&a15 == 
&no_more_args) goto done; args[n++] = &a15; -done: - - size_t consumed; - int vec[kVecSize] = {}; - return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); -} - -bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, - const PCRE& pattern, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { - const Arg* args[kMaxArgs]; - int n = 0; - if (&a0 == &no_more_args) goto done; args[n++] = &a0; - if (&a1 == &no_more_args) goto done; args[n++] = &a1; - if (&a2 == &no_more_args) goto done; args[n++] = &a2; - if (&a3 == &no_more_args) goto done; args[n++] = &a3; - if (&a4 == &no_more_args) goto done; args[n++] = &a4; - if (&a5 == &no_more_args) goto done; args[n++] = &a5; - if (&a6 == &no_more_args) goto done; args[n++] = &a6; - if (&a7 == &no_more_args) goto done; args[n++] = &a7; - if (&a8 == &no_more_args) goto done; args[n++] = &a8; - if (&a9 == &no_more_args) goto done; args[n++] = &a9; - if (&a10 == &no_more_args) goto done; args[n++] = &a10; - if (&a11 == &no_more_args) goto done; args[n++] = &a11; - if (&a12 == &no_more_args) goto done; args[n++] = &a12; - if (&a13 == &no_more_args) goto done; args[n++] = &a13; - if (&a14 == &no_more_args) goto done; args[n++] = &a14; - if (&a15 == &no_more_args) goto done; args[n++] = &a15; -done: - - size_t consumed; - int vec[kVecSize] = {}; - if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, - args, n, vec, kVecSize)) { - input->remove_prefix(consumed); - return true; - } else { - return false; - } -} - -bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, - const PCRE& pattern, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& 
a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { - const Arg* args[kMaxArgs]; - int n = 0; - if (&a0 == &no_more_args) goto done; args[n++] = &a0; - if (&a1 == &no_more_args) goto done; args[n++] = &a1; - if (&a2 == &no_more_args) goto done; args[n++] = &a2; - if (&a3 == &no_more_args) goto done; args[n++] = &a3; - if (&a4 == &no_more_args) goto done; args[n++] = &a4; - if (&a5 == &no_more_args) goto done; args[n++] = &a5; - if (&a6 == &no_more_args) goto done; args[n++] = &a6; - if (&a7 == &no_more_args) goto done; args[n++] = &a7; - if (&a8 == &no_more_args) goto done; args[n++] = &a8; - if (&a9 == &no_more_args) goto done; args[n++] = &a9; - if (&a10 == &no_more_args) goto done; args[n++] = &a10; - if (&a11 == &no_more_args) goto done; args[n++] = &a11; - if (&a12 == &no_more_args) goto done; args[n++] = &a12; - if (&a13 == &no_more_args) goto done; args[n++] = &a13; - if (&a14 == &no_more_args) goto done; args[n++] = &a14; - if (&a15 == &no_more_args) goto done; args[n++] = &a15; -done: - - size_t consumed; - int vec[kVecSize] = {}; - if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, - args, n, vec, kVecSize)) { - input->remove_prefix(consumed); - return true; - } else { - return false; - } -} - -bool PCRE::Replace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite) { - int vec[kVecSize] = {}; - int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); - if (matches == 0) - return false; - - std::string s; - if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) - return false; - - assert(vec[0] >= 0); - assert(vec[1] >= 0); - str->replace(vec[0], vec[1] - vec[0], s); - return true; -} - -int PCRE::GlobalReplace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite) { - int count = 0; - int vec[kVecSize] = {}; - std::string out; - size_t start = 0; - bool last_match_was_empty_string = false; - - while (start <= str->size()) { - // If the previous match 
was for the empty string, we shouldn't - // just match again: we'll match in the same way and get an - // infinite loop. Instead, we do the match in a special way: - // anchored -- to force another try at the same position -- - // and with a flag saying that this time, ignore empty matches. - // If this special match returns, that means there's a non-empty - // match at this position as well, and we can continue. If not, - // we do what perl does, and just advance by one. - // Notice that perl prints '@@@' for this; - // perl -le '$_ = "aa"; s/b*|aa/@/g; print' - int matches; - if (last_match_was_empty_string) { - matches = pattern.TryMatch(*str, start, ANCHOR_START, false, - vec, kVecSize); - if (matches <= 0) { - if (start < str->size()) - out.push_back((*str)[start]); - start++; - last_match_was_empty_string = false; - continue; - } - } else { - matches = pattern.TryMatch(*str, start, UNANCHORED, true, - vec, kVecSize); - if (matches <= 0) - break; - } - size_t matchstart = vec[0], matchend = vec[1]; - assert(matchstart >= start); - assert(matchend >= matchstart); - - out.append(*str, start, matchstart - start); - pattern.Rewrite(&out, rewrite, *str, vec, matches); - start = matchend; - count++; - last_match_was_empty_string = (matchstart == matchend); - } - - if (count == 0) - return 0; - - if (start < str->size()) - out.append(*str, start, str->size() - start); - using std::swap; - swap(out, *str); - return count; -} - -bool PCRE::Extract(const StringPiece &text, - const PCRE& pattern, - const StringPiece &rewrite, - std::string *out) { - int vec[kVecSize] = {}; - int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); - if (matches == 0) - return false; - out->clear(); - return pattern.Rewrite(out, rewrite, text, vec, matches); -} - -std::string PCRE::QuoteMeta(const StringPiece& unquoted) { - std::string result; - result.reserve(unquoted.size() << 1); - - // Escape any ascii character not in [A-Za-z_0-9]. 
- // - // Note that it's legal to escape a character even if it has no - // special meaning in a regular expression -- so this function does - // that. (This also makes it identical to the perl function of the - // same name except for the null-character special case; - // see `perldoc -f quotemeta`.) - for (size_t ii = 0; ii < unquoted.size(); ++ii) { - // Note that using 'isalnum' here raises the benchmark time from - // 32ns to 58ns: - if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && - (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && - (unquoted[ii] < '0' || unquoted[ii] > '9') && - unquoted[ii] != '_' && - // If this is the part of a UTF8 or Latin1 character, we need - // to copy this byte without escaping. Experimentally this is - // what works correctly with the regexp library. - !(unquoted[ii] & 128)) { - if (unquoted[ii] == '\0') { // Special handling for null chars. - // Can't use "\\0" since the next character might be a digit. - result += "\\x00"; - continue; - } - result += '\\'; - } - result += unquoted[ii]; - } - - return result; -} - -/***** Actual matching and rewriting code *****/ - -bool PCRE::HitLimit() { - return hit_limit_ != 0; -} - -void PCRE::ClearHitLimit() { - hit_limit_ = 0; -} - -int PCRE::TryMatch(const StringPiece& text, - size_t startpos, - Anchor anchor, - bool empty_ok, - int *vec, - int vecsize) const { - pcre* re = (anchor == ANCHOR_BOTH) ? 
re_full_ : re_partial_; - if (re == NULL) { - PCREPORT(ERROR) << "Matching against invalid re: " << *error_; - return 0; - } - - int match_limit = match_limit_; - if (match_limit <= 0) { - match_limit = GetFlag(FLAGS_regexp_match_limit); - } - - int stack_limit = stack_limit_; - if (stack_limit <= 0) { - stack_limit = GetFlag(FLAGS_regexp_stack_limit); - } - - pcre_extra extra = { 0 }; - if (match_limit > 0) { - extra.flags |= PCRE_EXTRA_MATCH_LIMIT; - extra.match_limit = match_limit; - } - if (stack_limit > 0) { - extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; - extra.match_limit_recursion = stack_limit / kPCREFrameSize; - } - - int options = 0; - if (anchor != UNANCHORED) - options |= PCRE_ANCHORED; - if (!empty_ok) - options |= PCRE_NOTEMPTY; - - int rc = pcre_exec(re, // The regular expression object - &extra, - (text.data() == NULL) ? "" : text.data(), - static_cast<int>(text.size()), - static_cast<int>(startpos), - options, - vec, - vecsize); - - // Handle errors - if (rc == 0) { - // pcre_exec() returns 0 as a special case when the number of - // capturing subpatterns exceeds the size of the vector. - // When this happens, there is a match and the output vector - // is filled, but we miss out on the positions of the extra subpatterns. - rc = vecsize / 2; - } else if (rc < 0) { - switch (rc) { - case PCRE_ERROR_NOMATCH: - return 0; - case PCRE_ERROR_MATCHLIMIT: - // Writing to hit_limit is not safe if multiple threads - // are using the PCRE, but the flag is only intended - // for use by unit tests anyway, so we let it go. - hit_limit_ = true; - PCREPORT(WARNING) << "Exceeded match limit of " << match_limit - << " when matching '" << pattern_ << "'" - << " against text that is " << text.size() << " bytes."; - return 0; - case PCRE_ERROR_RECURSIONLIMIT: - // See comment about hit_limit above. 
- hit_limit_ = true; - PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit - << " when matching '" << pattern_ << "'" - << " against text that is " << text.size() << " bytes."; - return 0; - default: - // There are other return codes from pcre.h : - // PCRE_ERROR_NULL (-2) - // PCRE_ERROR_BADOPTION (-3) - // PCRE_ERROR_BADMAGIC (-4) - // PCRE_ERROR_UNKNOWN_NODE (-5) - // PCRE_ERROR_NOMEMORY (-6) - // PCRE_ERROR_NOSUBSTRING (-7) - // ... - PCREPORT(ERROR) << "Unexpected return code: " << rc - << " when matching '" << pattern_ << "'" - << ", re=" << re - << ", text=" << text - << ", vec=" << vec - << ", vecsize=" << vecsize; - return 0; - } - } - - return rc; -} - -bool PCRE::DoMatchImpl(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const* args, - int n, - int* vec, - int vecsize) const { - assert((1 + n) * 3 <= vecsize); // results + PCRE workspace - if (NumberOfCapturingGroups() < n) { - // RE has fewer capturing groups than number of Arg pointers passed in. - return false; - } - - int matches = TryMatch(text, 0, anchor, true, vec, vecsize); - assert(matches >= 0); // TryMatch never returns negatives - if (matches == 0) - return false; - - *consumed = vec[1]; - - if (n == 0 || args == NULL) { - // We are not interested in results - return true; - } - - // If we got here, we must have matched the whole pattern. - // We do not need (can not do) any more checks on the value of 'matches' here - // -- see the comment for TryMatch. - for (int i = 0; i < n; i++) { - const int start = vec[2*(i+1)]; - const int limit = vec[2*(i+1)+1]; - - // Avoid invoking undefined behavior when text.data() happens - // to be null and start happens to be -1, the latter being the - // case for an unmatched subexpression. Even if text.data() is - // not null, pointing one byte before was a longstanding bug. 
- const char* addr = NULL; - if (start != -1) { - addr = text.data() + start; - } - - if (!args[i]->Parse(addr, limit-start)) { - // TODO: Should we indicate what the error was? - return false; - } - } - - return true; -} - -bool PCRE::DoMatch(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const args[], - int n) const { - assert(n >= 0); - const int vecsize = (1 + n) * 3; // results + PCRE workspace - // (as for kVecSize) - int* vec = new int[vecsize]; - bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); - delete[] vec; - return b; -} - -bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite, - const StringPiece &text, int *vec, int veclen) const { - int number_of_capturing_groups = NumberOfCapturingGroups(); - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - int c = *s; - if (c == '\\') { - c = *++s; - if (isdigit(c)) { - int n = (c - '0'); - if (n >= veclen) { - if (n <= number_of_capturing_groups) { - // unmatched optional capturing group. treat - // its value as empty string; i.e., nothing to append. 
- } else { - PCREPORT(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); - return false; - } - } - int start = vec[2 * n]; - if (start >= 0) - out->append(text.data() + start, vec[2 * n + 1] - start); - } else if (c == '\\') { - out->push_back('\\'); - } else { - PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); - return false; - } - } else { - out->push_back(c); - } - } - return true; -} - -bool PCRE::CheckRewriteString(const StringPiece& rewrite, - std::string* error) const { - int max_token = -1; - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - int c = *s; - if (c != '\\') { - continue; - } - if (++s == end) { - *error = "Rewrite schema error: '\\' not allowed at end."; - return false; - } - c = *s; - if (c == '\\') { - continue; - } - if (!isdigit(c)) { - *error = "Rewrite schema error: " - "'\\' must be followed by a digit or '\\'."; - return false; - } - int n = (c - '0'); - if (max_token < n) { - max_token = n; - } - } - - if (max_token > NumberOfCapturingGroups()) { - *error = StringPrintf( - "Rewrite schema requests %d matches, but the regexp only has %d " - "parenthesized subexpressions.", - max_token, NumberOfCapturingGroups()); - return false; - } - return true; -} - - -// Return the number of capturing subpatterns, or -1 if the -// regexp wasn't valid on construction. 
-int PCRE::NumberOfCapturingGroups() const { - if (re_partial_ == NULL) return -1; - - int result; - int rc = pcre_fullinfo(re_partial_, // The regular expression object - NULL, // We did not study the pattern - PCRE_INFO_CAPTURECOUNT, - &result); - if (rc != 0) { - PCREPORT(ERROR) << "Unexpected return code: " << rc; - return -1; - } - return result; -} - - -/***** Parsers for various types *****/ - -bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) { - // We fail if somebody asked us to store into a non-NULL void* pointer - return (dest == NULL); -} - -bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - reinterpret_cast<std::string*>(dest)->assign(str, n); - return true; -} - -bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n); - return true; -} - -bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) { - if (n != 1) return false; - if (dest == NULL) return true; - *(reinterpret_cast<char*>(dest)) = str[0]; - return true; -} - -bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) { - if (n != 1) return false; - if (dest == NULL) return true; - *(reinterpret_cast<signed char*>(dest)) = str[0]; - return true; -} - -bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) { - if (n != 1) return false; - if (dest == NULL) return true; - *(reinterpret_cast<unsigned char*>(dest)) = str[0]; - return true; -} - -// Largest number spec that we are willing to parse -static const int kMaxNumberLength = 32; - -// PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 -// PCREQUIPCRES "n > 0" -// Copies "str" into "buf" and null-terminates if necessary. -// Returns one of: -// a. "str" if no termination is needed -// b. "buf" if the string was copied and null-terminated -// c. 
"" if the input was invalid and has no hope of being parsed -static const char* TerminateNumber(char* buf, const char* str, size_t n) { - if ((n > 0) && isspace(*str)) { - // We are less forgiving than the strtoxxx() routines and do not - // allow leading spaces. - return ""; - } - - // See if the character right after the input text may potentially - // look like a digit. - if (isdigit(str[n]) || - ((str[n] >= 'a') && (str[n] <= 'f')) || - ((str[n] >= 'A') && (str[n] <= 'F'))) { - if (n > kMaxNumberLength) return ""; // Input too big to be a valid number - memcpy(buf, str, n); - buf[n] = '\0'; - return buf; - } else { - // We can parse right out of the supplied string, so return it. - return str; - } -} - -bool PCRE::Arg::parse_long_radix(const char* str, - size_t n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); - char* end; - errno = 0; - long r = strtol(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *(reinterpret_cast<long*>(dest)) = r; - return true; -} - -bool PCRE::Arg::parse_ulong_radix(const char* str, - size_t n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); - if (str[0] == '-') { - // strtoul() will silently accept negative numbers and parse - // them. This module is more strict and treats them as errors. 
- return false; - } - - char* end; - errno = 0; - unsigned long r = strtoul(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *(reinterpret_cast<unsigned long*>(dest)) = r; - return true; -} - -bool PCRE::Arg::parse_short_radix(const char* str, - size_t n, - void* dest, - int radix) { - long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range - if (dest == NULL) return true; - *(reinterpret_cast<short*>(dest)) = (short)r; - return true; -} - -bool PCRE::Arg::parse_ushort_radix(const char* str, - size_t n, - void* dest, - int radix) { - unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((unsigned short)r != r) return false; // Out of range - if (dest == NULL) return true; - *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r; - return true; -} - -bool PCRE::Arg::parse_int_radix(const char* str, - size_t n, - void* dest, - int radix) { - long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range - if (dest == NULL) return true; - *(reinterpret_cast<int*>(dest)) = (int)r; - return true; -} - -bool PCRE::Arg::parse_uint_radix(const char* str, - size_t n, - void* dest, - int radix) { - unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((unsigned int)r != r) return false; // Out of range - if (dest == NULL) return true; - *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r; - return true; -} - -bool PCRE::Arg::parse_longlong_radix(const char* str, - size_t n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); - char* end; - errno = 0; - long long r = strtoll(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) 
return false; - if (dest == NULL) return true; - *(reinterpret_cast<long long*>(dest)) = r; - return true; -} - -bool PCRE::Arg::parse_ulonglong_radix(const char* str, - size_t n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); - if (str[0] == '-') { - // strtoull() will silently accept negative numbers and parse - // them. This module is more strict and treats them as errors. - return false; - } - char* end; - errno = 0; - unsigned long long r = strtoull(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *(reinterpret_cast<unsigned long long*>(dest)) = r; - return true; -} - -static bool parse_double_float(const char* str, size_t n, bool isfloat, - void* dest) { - if (n == 0) return false; - static const int kMaxLength = 200; - char buf[kMaxLength]; - if (n >= kMaxLength) return false; - memcpy(buf, str, n); - buf[n] = '\0'; - char* end; - errno = 0; - double r; - if (isfloat) { - r = strtof(buf, &end); - } else { - r = strtod(buf, &end); - } - if (end != buf + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - if (isfloat) { - *(reinterpret_cast<float*>(dest)) = (float)r; - } else { - *(reinterpret_cast<double*>(dest)) = r; - } - return true; -} - -bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) { - return parse_double_float(str, n, false, dest); -} - -bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) { - return parse_double_float(str, n, true, dest); -} - -#define DEFINE_INTEGER_PARSER(name) \ - bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 10); \ - } \ - bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 16); \ - } \ - bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \ 
- void* dest) { \ - return parse_##name##_radix(str, n, dest, 8); \ - } \ - bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \ - void* dest) { \ - return parse_##name##_radix(str, n, dest, 0); \ - } - -DEFINE_INTEGER_PARSER(short); -DEFINE_INTEGER_PARSER(ushort); -DEFINE_INTEGER_PARSER(int); -DEFINE_INTEGER_PARSER(uint); -DEFINE_INTEGER_PARSER(long); -DEFINE_INTEGER_PARSER(ulong); -DEFINE_INTEGER_PARSER(longlong); -DEFINE_INTEGER_PARSER(ulonglong); - -#undef DEFINE_INTEGER_PARSER - -} // namespace re2 +// Copyright 2003-2009 Google Inc. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is a variant of PCRE's pcrecpp.cc, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <limits> +#include <string> +#include <utility> + +#include "util/util.h" +#include "util/flags.h" +#include "util/logging.h" +#include "util/pcre.h" +#include "util/strutil.h" + +// Silence warnings about the wacky formatting in the operator() functions. +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#endif + +#define PCREPORT(level) LOG(level) + +// Default PCRE limits. +// Defaults chosen to allow a plausible amount of CPU and +// not exceed main thread stacks. Note that other threads +// often have smaller stacks, and therefore tightening +// regexp_stack_limit may frequently be necessary. +DEFINE_FLAG(int, regexp_stack_limit, 256 << 10, + "default PCRE stack limit (bytes)"); +DEFINE_FLAG(int, regexp_match_limit, 1000000, + "default PCRE match limit (function calls)"); + +#ifndef USEPCRE + +// Fake just enough of the PCRE API to allow this file to build. 
:) + +struct pcre_extra { + int flags; + int match_limit; + int match_limit_recursion; +}; + +#define PCRE_EXTRA_MATCH_LIMIT 0 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 +#define PCRE_ANCHORED 0 +#define PCRE_NOTEMPTY 0 +#define PCRE_ERROR_NOMATCH 1 +#define PCRE_ERROR_MATCHLIMIT 2 +#define PCRE_ERROR_RECURSIONLIMIT 3 +#define PCRE_INFO_CAPTURECOUNT 0 + +void pcre_free(void*) { +} + +pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { + return NULL; +} + +int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) { + return 0; +} + +int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { + return 0; +} + +#endif + +namespace re2 { + +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace + +// Approximate size of a recursive invocation of PCRE's +// internal "match()" frame. This varies depending on the +// compiler and architecture, of course, so the constant is +// just a conservative estimate. To find the exact number, +// run regexp_unittest with --regexp_stack_limit=0 under +// a debugger and look at the frames when it crashes. +// The exact frame size was 656 in production on 2008/02/03. +static const int kPCREFrameSize = 700; + +// Special name for missing C++ arguments. 
+PCRE::Arg PCRE::no_more_args((void*)NULL); + +const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; +const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; +const PCRE::ConsumeFunctor PCRE::Consume = { }; +const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; + +// If a regular expression has no error, its error_ field points here +static const std::string empty_string; + +void PCRE::Init(const char* pattern, Option options, int match_limit, + int stack_limit, bool report_errors) { + pattern_ = pattern; + options_ = options; + match_limit_ = match_limit; + stack_limit_ = stack_limit; + hit_limit_ = false; + error_ = &empty_string; + report_errors_ = report_errors; + re_full_ = NULL; + re_partial_ = NULL; + + if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { + error_ = new std::string("illegal regexp option"); + PCREPORT(ERROR) + << "Error compiling '" << pattern << "': illegal regexp option"; + } else { + re_partial_ = Compile(UNANCHORED); + if (re_partial_ != NULL) { + re_full_ = Compile(ANCHOR_BOTH); + } + } +} + +PCRE::PCRE(const char* pattern) { + Init(pattern, None, 0, 0, true); +} +PCRE::PCRE(const char* pattern, Option option) { + Init(pattern, option, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern) { + Init(pattern.c_str(), None, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern, Option option) { + Init(pattern.c_str(), option, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) { + Init(pattern.c_str(), re_option.option(), re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { + Init(pattern, re_option.option(), re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::~PCRE() { + if (re_full_ != NULL) pcre_free(re_full_); + if (re_partial_ != NULL) pcre_free(re_partial_); + if (error_ != &empty_string) delete error_; +} + +pcre* PCRE::Compile(Anchor anchor) 
{ + // Special treatment for anchoring. This is needed because at + // runtime pcre only provides an option for anchoring at the + // beginning of a string. + // + // There are three types of anchoring we want: + // UNANCHORED Compile the original pattern, and use + // a pcre unanchored match. + // ANCHOR_START Compile the original pattern, and use + // a pcre anchored match. + // ANCHOR_BOTH Tack a "\z" to the end of the original pattern + // and use a pcre anchored match. + + const char* error = ""; + int eoffset; + pcre* re; + if (anchor != ANCHOR_BOTH) { + re = pcre_compile(pattern_.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } else { + // Tack a '\z' at the end of PCRE. Parenthesize it first so that + // the '\z' applies to all top-level alternatives in the regexp. + std::string wrapped = "(?:"; // A non-counting grouping operator + wrapped += pattern_; + wrapped += ")\\z"; + re = pcre_compile(wrapped.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } + if (re == NULL) { + if (error_ == &empty_string) error_ = new std::string(error); + PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; + } + return re; +} + +/***** Convenience interfaces *****/ + +bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + 
if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == 
&no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& 
a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::Replace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int vec[kVecSize] = {}; + int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + + std::string s; + if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) + return false; + + assert(vec[0] >= 0); + assert(vec[1] >= 0); + str->replace(vec[0], vec[1] - vec[0], s); + return true; +} + +int PCRE::GlobalReplace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int count = 0; + int vec[kVecSize] = {}; + std::string out; + size_t start = 0; + bool last_match_was_empty_string = false; + + while (start <= str->size()) { + // If the previous match 
was for the empty string, we shouldn't + // just match again: we'll match in the same way and get an + // infinite loop. Instead, we do the match in a special way: + // anchored -- to force another try at the same position -- + // and with a flag saying that this time, ignore empty matches. + // If this special match returns, that means there's a non-empty + // match at this position as well, and we can continue. If not, + // we do what perl does, and just advance by one. + // Notice that perl prints '@@@' for this; + // perl -le '$_ = "aa"; s/b*|aa/@/g; print' + int matches; + if (last_match_was_empty_string) { + matches = pattern.TryMatch(*str, start, ANCHOR_START, false, + vec, kVecSize); + if (matches <= 0) { + if (start < str->size()) + out.push_back((*str)[start]); + start++; + last_match_was_empty_string = false; + continue; + } + } else { + matches = pattern.TryMatch(*str, start, UNANCHORED, true, + vec, kVecSize); + if (matches <= 0) + break; + } + size_t matchstart = vec[0], matchend = vec[1]; + assert(matchstart >= start); + assert(matchend >= matchstart); + + out.append(*str, start, matchstart - start); + pattern.Rewrite(&out, rewrite, *str, vec, matches); + start = matchend; + count++; + last_match_was_empty_string = (matchstart == matchend); + } + + if (count == 0) + return 0; + + if (start < str->size()) + out.append(*str, start, str->size() - start); + using std::swap; + swap(out, *str); + return count; +} + +bool PCRE::Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + std::string *out) { + int vec[kVecSize] = {}; + int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + out->clear(); + return pattern.Rewrite(out, rewrite, text, vec, matches); +} + +std::string PCRE::QuoteMeta(const StringPiece& unquoted) { + std::string result; + result.reserve(unquoted.size() << 1); + + // Escape any ascii character not in [A-Za-z_0-9]. 
+ // + // Note that it's legal to escape a character even if it has no + // special meaning in a regular expression -- so this function does + // that. (This also makes it identical to the perl function of the + // same name except for the null-character special case; + // see `perldoc -f quotemeta`.) + for (size_t ii = 0; ii < unquoted.size(); ++ii) { + // Note that using 'isalnum' here raises the benchmark time from + // 32ns to 58ns: + if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { // Special handling for null chars. + // Can't use "\\0" since the next character might be a digit. + result += "\\x00"; + continue; + } + result += '\\'; + } + result += unquoted[ii]; + } + + return result; +} + +/***** Actual matching and rewriting code *****/ + +bool PCRE::HitLimit() { + return hit_limit_ != 0; +} + +void PCRE::ClearHitLimit() { + hit_limit_ = 0; +} + +int PCRE::TryMatch(const StringPiece& text, + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const { + pcre* re = (anchor == ANCHOR_BOTH) ? 
re_full_ : re_partial_; + if (re == NULL) { + PCREPORT(ERROR) << "Matching against invalid re: " << *error_; + return 0; + } + + int match_limit = match_limit_; + if (match_limit <= 0) { + match_limit = GetFlag(FLAGS_regexp_match_limit); + } + + int stack_limit = stack_limit_; + if (stack_limit <= 0) { + stack_limit = GetFlag(FLAGS_regexp_stack_limit); + } + + pcre_extra extra = { 0 }; + if (match_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT; + extra.match_limit = match_limit; + } + if (stack_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; + extra.match_limit_recursion = stack_limit / kPCREFrameSize; + } + + int options = 0; + if (anchor != UNANCHORED) + options |= PCRE_ANCHORED; + if (!empty_ok) + options |= PCRE_NOTEMPTY; + + int rc = pcre_exec(re, // The regular expression object + &extra, + (text.data() == NULL) ? "" : text.data(), + static_cast<int>(text.size()), + static_cast<int>(startpos), + options, + vec, + vecsize); + + // Handle errors + if (rc == 0) { + // pcre_exec() returns 0 as a special case when the number of + // capturing subpatterns exceeds the size of the vector. + // When this happens, there is a match and the output vector + // is filled, but we miss out on the positions of the extra subpatterns. + rc = vecsize / 2; + } else if (rc < 0) { + switch (rc) { + case PCRE_ERROR_NOMATCH: + return 0; + case PCRE_ERROR_MATCHLIMIT: + // Writing to hit_limit is not safe if multiple threads + // are using the PCRE, but the flag is only intended + // for use by unit tests anyway, so we let it go. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded match limit of " << match_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + case PCRE_ERROR_RECURSIONLIMIT: + // See comment about hit_limit above. 
+ hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + default: + // There are other return codes from pcre.h : + // PCRE_ERROR_NULL (-2) + // PCRE_ERROR_BADOPTION (-3) + // PCRE_ERROR_BADMAGIC (-4) + // PCRE_ERROR_UNKNOWN_NODE (-5) + // PCRE_ERROR_NOMEMORY (-6) + // PCRE_ERROR_NOSUBSTRING (-7) + // ... + PCREPORT(ERROR) << "Unexpected return code: " << rc + << " when matching '" << pattern_ << "'" + << ", re=" << re + << ", text=" << text + << ", vec=" << vec + << ", vecsize=" << vecsize; + return 0; + } + } + + return rc; +} + +bool PCRE::DoMatchImpl(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const* args, + int n, + int* vec, + int vecsize) const { + assert((1 + n) * 3 <= vecsize); // results + PCRE workspace + if (NumberOfCapturingGroups() < n) { + // RE has fewer capturing groups than number of Arg pointers passed in. + return false; + } + + int matches = TryMatch(text, 0, anchor, true, vec, vecsize); + assert(matches >= 0); // TryMatch never returns negatives + if (matches == 0) + return false; + + *consumed = vec[1]; + + if (n == 0 || args == NULL) { + // We are not interested in results + return true; + } + + // If we got here, we must have matched the whole pattern. + // We do not need (can not do) any more checks on the value of 'matches' here + // -- see the comment for TryMatch. + for (int i = 0; i < n; i++) { + const int start = vec[2*(i+1)]; + const int limit = vec[2*(i+1)+1]; + + // Avoid invoking undefined behavior when text.data() happens + // to be null and start happens to be -1, the latter being the + // case for an unmatched subexpression. Even if text.data() is + // not null, pointing one byte before was a longstanding bug. 
+ const char* addr = NULL; + if (start != -1) { + addr = text.data() + start; + } + + if (!args[i]->Parse(addr, limit-start)) { + // TODO: Should we indicate what the error was? + return false; + } + } + + return true; +} + +bool PCRE::DoMatch(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n) const { + assert(n >= 0); + const int vecsize = (1 + n) * 3; // results + PCRE workspace + // (as for kVecSize) + int* vec = new int[vecsize]; + bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); + delete[] vec; + return b; +} + +bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite, + const StringPiece &text, int *vec, int veclen) const { + int number_of_capturing_groups = NumberOfCapturingGroups(); + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c == '\\') { + c = *++s; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (n <= number_of_capturing_groups) { + // unmatched optional capturing group. treat + // its value as empty string; i.e., nothing to append. 
+ } else { + PCREPORT(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + return false; + } + } + int start = vec[2 * n]; + if (start >= 0) + out->append(text.data() + start, vec[2 * n + 1] - start); + } else if (c == '\\') { + out->push_back('\\'); + } else { + PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } else { + out->push_back(c); + } + } + return true; +} + +bool PCRE::CheckRewriteString(const StringPiece& rewrite, + std::string* error) const { + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c != '\\') { + continue; + } + if (++s == end) { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') { + continue; + } + if (!isdigit(c)) { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) { + max_token = n; + } + } + + if (max_token > NumberOfCapturingGroups()) { + *error = StringPrintf( + "Rewrite schema requests %d matches, but the regexp only has %d " + "parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); + return false; + } + return true; +} + + +// Return the number of capturing subpatterns, or -1 if the +// regexp wasn't valid on construction. 
+int PCRE::NumberOfCapturingGroups() const { + if (re_partial_ == NULL) return -1; + + int result; + int rc = pcre_fullinfo(re_partial_, // The regular expression object + NULL, // We did not study the pattern + PCRE_INFO_CAPTURECOUNT, + &result); + if (rc != 0) { + PCREPORT(ERROR) << "Unexpected return code: " << rc; + return -1; + } + return result; +} + + +/***** Parsers for various types *****/ + +bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) { + // We fail if somebody asked us to store into a non-NULL void* pointer + return (dest == NULL); +} + +bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast<std::string*>(dest)->assign(str, n); + return true; +} + +bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n); + return true; +} + +bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<char*>(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<signed char*>(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned char*>(dest)) = str[0]; + return true; +} + +// Largest number spec that we are willing to parse +static const int kMaxNumberLength = 32; + +// PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 +// PCREQUIPCRES "n > 0" +// Copies "str" into "buf" and null-terminates if necessary. +// Returns one of: +// a. "str" if no termination is needed +// b. "buf" if the string was copied and null-terminated +// c. 
"" if the input was invalid and has no hope of being parsed +static const char* TerminateNumber(char* buf, const char* str, size_t n) { + if ((n > 0) && isspace(*str)) { + // We are less forgiving than the strtoxxx() routines and do not + // allow leading spaces. + return ""; + } + + // See if the character right after the input text may potentially + // look like a digit. + if (isdigit(str[n]) || + ((str[n] >= 'a') && (str[n] <= 'f')) || + ((str[n] >= 'A') && (str[n] <= 'F'))) { + if (n > kMaxNumberLength) return ""; // Input too big to be a valid number + memcpy(buf, str, n); + buf[n] = '\0'; + return buf; + } else { + // We can parse right out of the supplied string, so return it. + return str; + } +} + +bool PCRE::Arg::parse_long_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long r = strtol(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. 
+ return false; + } + + char* end; + errno = 0; + unsigned long r = strtoul(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_short_radix(const char* str, + size_t n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<short*>(dest)) = (short)r; + return true; +} + +bool PCRE::Arg::parse_ushort_radix(const char* str, + size_t n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r; + return true; +} + +bool PCRE::Arg::parse_int_radix(const char* str, + size_t n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<int*>(dest)) = (int)r; + return true; +} + +bool PCRE::Arg::parse_uint_radix(const char* str, + size_t n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r; + return true; +} + +bool PCRE::Arg::parse_longlong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long long r = strtoll(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) 
return false; + if (dest == NULL) return true; + *(reinterpret_cast<long long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulonglong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoull() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + char* end; + errno = 0; + unsigned long long r = strtoull(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long long*>(dest)) = r; + return true; +} + +static bool parse_double_float(const char* str, size_t n, bool isfloat, + void* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength]; + if (n >= kMaxLength) return false; + memcpy(buf, str, n); + buf[n] = '\0'; + char* end; + errno = 0; + double r; + if (isfloat) { + r = strtof(buf, &end); + } else { + r = strtod(buf, &end); + } + if (end != buf + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + if (isfloat) { + *(reinterpret_cast<float*>(dest)) = (float)r; + } else { + *(reinterpret_cast<double*>(dest)) = r; + } + return true; +} + +bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, false, dest); +} + +bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, true, dest); +} + +#define DEFINE_INTEGER_PARSER(name) \ + bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \ 
+ void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 0); \ + } + +DEFINE_INTEGER_PARSER(short); +DEFINE_INTEGER_PARSER(ushort); +DEFINE_INTEGER_PARSER(int); +DEFINE_INTEGER_PARSER(uint); +DEFINE_INTEGER_PARSER(long); +DEFINE_INTEGER_PARSER(ulong); +DEFINE_INTEGER_PARSER(longlong); +DEFINE_INTEGER_PARSER(ulonglong); + +#undef DEFINE_INTEGER_PARSER + +} // namespace re2 diff --git a/contrib/libs/re2/util/pcre.h b/contrib/libs/re2/util/pcre.h index 500c56d283..896b0bdf89 100644 --- a/contrib/libs/re2/util/pcre.h +++ b/contrib/libs/re2/util/pcre.h @@ -1,681 +1,681 @@ -// Copyright 2003-2010 Google Inc. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_PCRE_H_ -#define UTIL_PCRE_H_ - -// This is a variant of PCRE's pcrecpp.h, originally written at Google. -// The main changes are the addition of the HitLimit method and -// compilation as PCRE in namespace re2. - -// C++ interface to the pcre regular-expression library. PCRE supports -// Perl-style regular expressions (with extensions like \d, \w, \s, -// ...). -// -// ----------------------------------------------------------------------- -// REGEXP SYNTAX: -// -// This module uses the pcre library and hence supports its syntax -// for regular expressions: -// -// http://www.google.com/search?q=pcre -// -// The syntax is pretty similar to Perl's. For those not familiar -// with Perl's regular expressions, here are some examples of the most -// commonly used extensions: -// -// "hello (\\w+) world" -- \w matches a "word" character -// "version (\\d+)" -- \d matches a digit -// "hello\\s+world" -- \s matches any whitespace character -// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary -// "(?i)hello" -- (?i) turns on case-insensitive matching -// "/\\*(.*?)\\*/" -- .*? 
matches . minimum no. of times possible -// -// ----------------------------------------------------------------------- -// MATCHING INTERFACE: -// -// The "FullMatch" operation checks that supplied text matches a -// supplied pattern exactly. -// -// Example: successful match -// CHECK(PCRE::FullMatch("hello", "h.*o")); -// -// Example: unsuccessful match (requires full match): -// CHECK(!PCRE::FullMatch("hello", "e")); -// -// ----------------------------------------------------------------------- -// UTF-8 AND THE MATCHING INTERFACE: -// -// By default, pattern and text are plain text, one byte per character. -// The UTF8 flag, passed to the constructor, causes both pattern -// and string to be treated as UTF-8 text, still a byte stream but -// potentially multiple bytes per character. In practice, the text -// is likelier to be UTF-8 than the pattern, but the match returned -// may depend on the UTF8 flag, so always use it when matching -// UTF8 text. E.g., "." will match one byte normally but with UTF8 -// set may match up to three bytes of a multi-byte character. -// -// Example: -// PCRE re(utf8_pattern, PCRE::UTF8); -// CHECK(PCRE::FullMatch(utf8_string, re)); -// -// ----------------------------------------------------------------------- -// MATCHING WITH SUBSTRING EXTRACTION: -// -// You can supply extra pointer arguments to extract matched substrings. 
-// -// Example: extracts "ruby" into "s" and 1234 into "i" -// int i; -// std::string s; -// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); -// -// Example: fails because string cannot be stored in integer -// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i)); -// -// Example: fails because there aren't enough sub-patterns: -// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s)); -// -// Example: does not try to extract any extra sub-patterns -// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); -// -// Example: does not try to extract into NULL -// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); -// -// Example: integer overflow causes failure -// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); -// -// ----------------------------------------------------------------------- -// PARTIAL MATCHES -// -// You can use the "PartialMatch" operation when you want the pattern -// to match any substring of the text. -// -// Example: simple search for a string: -// CHECK(PCRE::PartialMatch("hello", "ell")); -// -// Example: find first number in a string -// int number; -// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number)); -// CHECK_EQ(number, 100); -// -// ----------------------------------------------------------------------- -// PPCRE-COMPILED PCREGULAR EXPPCRESSIONS -// -// PCRE makes it easy to use any string as a regular expression, without -// requiring a separate compilation step. -// -// If speed is of the essence, you can create a pre-compiled "PCRE" -// object from the pattern and use it multiple times. If you do so, -// you can typically parse text faster than with sscanf. 
-// -// Example: precompile pattern for faster matching: -// PCRE pattern("h.*o"); -// while (ReadLine(&str)) { -// if (PCRE::FullMatch(str, pattern)) ...; -// } -// -// ----------------------------------------------------------------------- -// SCANNING TEXT INCPCREMENTALLY -// -// The "Consume" operation may be useful if you want to repeatedly -// match regular expressions at the front of a string and skip over -// them as they match. This requires use of the "StringPiece" type, -// which represents a sub-range of a real string. -// -// Example: read lines of the form "var = value" from a string. -// std::string contents = ...; // Fill string somehow -// StringPiece input(contents); // Wrap a StringPiece around it -// -// std::string var; -// int value; -// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { -// ...; -// } -// -// Each successful call to "Consume" will set "var/value", and also -// advance "input" so it points past the matched text. Note that if the -// regular expression matches an empty string, input will advance -// by 0 bytes. If the regular expression being used might match -// an empty string, the loop body must check for this case and either -// advance the string or break out of the loop. -// -// The "FindAndConsume" operation is similar to "Consume" but does not -// anchor your match at the beginning of the string. For example, you -// could extract all words from a string by repeatedly calling -// PCRE::FindAndConsume(&input, "(\\w+)", &word) -// -// ----------------------------------------------------------------------- -// PARSING HEX/OCTAL/C-RADIX NUMBERS -// -// By default, if you pass a pointer to a numeric value, the -// corresponding text is interpreted as a base-10 number. You can -// instead wrap the pointer with a call to one of the operators Hex(), -// Octal(), or CRadix() to interpret the text in another base. 
The -// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) -// prefixes, but defaults to base-10. -// -// Example: -// int a, b, c, d; -// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", -// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)); -// will leave 64 in a, b, c, and d. - -#include "util/util.h" -#include "re2/stringpiece.h" - -#ifdef USEPCRE -#include <pcre.h> -namespace re2 { -const bool UsingPCRE = true; -} // namespace re2 -#else -struct pcre; // opaque -namespace re2 { -const bool UsingPCRE = false; -} // namespace re2 -#endif - -namespace re2 { - -class PCRE_Options; - -// Interface for regular expression matching. Also corresponds to a -// pre-compiled regular expression. An "PCRE" object is safe for -// concurrent use by multiple threads. -class PCRE { - public: - // We convert user-passed pointers into special Arg objects - class Arg; - - // Marks end of arg list. - // ONLY USE IN OPTIONAL ARG DEFAULTS. - // DO NOT PASS EXPLICITLY. - static Arg no_more_args; - - // Options are same value as those in pcre. We provide them here - // to avoid users needing to include pcre.h and also to isolate - // users from pcre should we change the underlying library. - // Only those needed by Google programs are exposed here to - // avoid collision with options employed internally by regexp.cc - // Note that some options have equivalents that can be specified in - // the regexp itself. For example, prefixing your regexp with - // "(?s)" has the same effect as the PCRE_DOTALL option. - enum Option { - None = 0x0000, - UTF8 = 0x0800, // == PCRE_UTF8 - EnabledCompileOptions = UTF8, - EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag - }; - - // We provide implicit conversions from strings so that users can - // pass in a string or a "const char*" wherever an "PCRE" is expected. 
- PCRE(const char* pattern); - PCRE(const char* pattern, Option option); - PCRE(const std::string& pattern); - PCRE(const std::string& pattern, Option option); - PCRE(const char *pattern, const PCRE_Options& re_option); - PCRE(const std::string& pattern, const PCRE_Options& re_option); - - ~PCRE(); - - // The string specification for this PCRE. E.g. - // PCRE re("ab*c?d+"); - // re.pattern(); // "ab*c?d+" - const std::string& pattern() const { return pattern_; } - - // If PCRE could not be created properly, returns an error string. - // Else returns the empty string. - const std::string& error() const { return *error_; } - - // Whether the PCRE has hit a match limit during execution. - // Not thread safe. Intended only for testing. - // If hitting match limits is a problem, - // you should be using PCRE2 (re2/re2.h) - // instead of checking this flag. - bool HitLimit(); - void ClearHitLimit(); - - /***** The useful part: the matching interface *****/ - - // Matches "text" against "pattern". If pointer arguments are - // supplied, copies matched sub-patterns into them. - // - // You can pass in a "const char*" or a "std::string" for "text". - // You can pass in a "const char*" or a "std::string" or a "PCRE" for "pattern". - // - // The provided pointer arguments can be pointers to any scalar numeric - // type, or one of: - // std::string (matched piece is copied to string) - // StringPiece (StringPiece is mutated to point to matched piece) - // T (where "bool T::ParseFrom(const char*, size_t)" exists) - // (void*)NULL (the corresponding matched sub-pattern is not copied) - // - // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "pattern" exactly - // b. The number of matched sub-patterns is >= number of supplied pointers - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. 
If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, "i"th captured sub-pattern is - // ignored. - // - // CAVEAT: An optional sub-pattern that does not exist in the - // matched string is assigned the empty string. Therefore, the - // following will return false (because the empty string is not a - // valid number): - // int number; - // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number); - struct FullMatchFunctor { - bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args - const Arg& ptr1 = no_more_args, - const Arg& ptr2 = no_more_args, - const Arg& ptr3 = no_more_args, - const Arg& ptr4 = no_more_args, - const Arg& ptr5 = no_more_args, - const Arg& ptr6 = no_more_args, - const Arg& ptr7 = no_more_args, - const Arg& ptr8 = no_more_args, - const Arg& ptr9 = no_more_args, - const Arg& ptr10 = no_more_args, - const Arg& ptr11 = no_more_args, - const Arg& ptr12 = no_more_args, - const Arg& ptr13 = no_more_args, - const Arg& ptr14 = no_more_args, - const Arg& ptr15 = no_more_args, - const Arg& ptr16 = no_more_args) const; - }; - - static const FullMatchFunctor FullMatch; - - // Exactly like FullMatch(), except that "pattern" is allowed to match - // a substring of "text". 
- struct PartialMatchFunctor { - bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args - const Arg& ptr1 = no_more_args, - const Arg& ptr2 = no_more_args, - const Arg& ptr3 = no_more_args, - const Arg& ptr4 = no_more_args, - const Arg& ptr5 = no_more_args, - const Arg& ptr6 = no_more_args, - const Arg& ptr7 = no_more_args, - const Arg& ptr8 = no_more_args, - const Arg& ptr9 = no_more_args, - const Arg& ptr10 = no_more_args, - const Arg& ptr11 = no_more_args, - const Arg& ptr12 = no_more_args, - const Arg& ptr13 = no_more_args, - const Arg& ptr14 = no_more_args, - const Arg& ptr15 = no_more_args, - const Arg& ptr16 = no_more_args) const; - }; - - static const PartialMatchFunctor PartialMatch; - - // Like FullMatch() and PartialMatch(), except that pattern has to - // match a prefix of "text", and "input" is advanced past the matched - // text. Note: "input" is modified iff this routine returns true. - struct ConsumeFunctor { - bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args - const Arg& ptr1 = no_more_args, - const Arg& ptr2 = no_more_args, - const Arg& ptr3 = no_more_args, - const Arg& ptr4 = no_more_args, - const Arg& ptr5 = no_more_args, - const Arg& ptr6 = no_more_args, - const Arg& ptr7 = no_more_args, - const Arg& ptr8 = no_more_args, - const Arg& ptr9 = no_more_args, - const Arg& ptr10 = no_more_args, - const Arg& ptr11 = no_more_args, - const Arg& ptr12 = no_more_args, - const Arg& ptr13 = no_more_args, - const Arg& ptr14 = no_more_args, - const Arg& ptr15 = no_more_args, - const Arg& ptr16 = no_more_args) const; - }; - - static const ConsumeFunctor Consume; - - // Like Consume(..), but does not anchor the match at the beginning of the - // string. That is, "pattern" need not start its match at the beginning of - // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next - // word in "s" and stores it in "word". 
- struct FindAndConsumeFunctor { - bool operator ()(StringPiece* input, const PCRE& pattern, - const Arg& ptr1 = no_more_args, - const Arg& ptr2 = no_more_args, - const Arg& ptr3 = no_more_args, - const Arg& ptr4 = no_more_args, - const Arg& ptr5 = no_more_args, - const Arg& ptr6 = no_more_args, - const Arg& ptr7 = no_more_args, - const Arg& ptr8 = no_more_args, - const Arg& ptr9 = no_more_args, - const Arg& ptr10 = no_more_args, - const Arg& ptr11 = no_more_args, - const Arg& ptr12 = no_more_args, - const Arg& ptr13 = no_more_args, - const Arg& ptr14 = no_more_args, - const Arg& ptr15 = no_more_args, - const Arg& ptr16 = no_more_args) const; - }; - - static const FindAndConsumeFunctor FindAndConsume; - - // Replace the first match of "pattern" in "str" with "rewrite". - // Within "rewrite", backslash-escaped digits (\1 to \9) can be - // used to insert text matching corresponding parenthesized group - // from the pattern. \0 in "rewrite" refers to the entire matching - // text. E.g., - // - // std::string s = "yabba dabba doo"; - // CHECK(PCRE::Replace(&s, "b+", "d")); - // - // will leave "s" containing "yada dabba doo" - // - // Returns true if the pattern matches and a replacement occurs, - // false otherwise. - static bool Replace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite); - - // Like Replace(), except replaces all occurrences of the pattern in - // the string with the rewrite. Replacements are not subject to - // re-matching. E.g., - // - // std::string s = "yabba dabba doo"; - // CHECK(PCRE::GlobalReplace(&s, "b+", "d")); - // - // will leave "s" containing "yada dada doo" - // - // Returns the number of replacements made. - static int GlobalReplace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite); - - // Like Replace, except that if the pattern matches, "rewrite" - // is copied into "out" with substitutions. The non-matching - // portions of "text" are ignored. 
- // - // Returns true iff a match occurred and the extraction happened - // successfully; if no match occurs, the string is left unaffected. - static bool Extract(const StringPiece &text, - const PCRE& pattern, - const StringPiece &rewrite, - std::string *out); - - // Check that the given @p rewrite string is suitable for use with - // this PCRE. It checks that: - // * The PCRE has enough parenthesized subexpressions to satisfy all - // of the \N tokens in @p rewrite, and - // * The @p rewrite string doesn't have any syntax errors - // ('\' followed by anything besides [0-9] and '\'). - // Making this test will guarantee that "replace" and "extract" - // operations won't LOG(ERROR) or fail because of a bad rewrite - // string. - // @param rewrite The proposed rewrite string. - // @param error An error message is recorded here, iff we return false. - // Otherwise, it is unchanged. - // @return true, iff @p rewrite is suitable for use with the PCRE. - bool CheckRewriteString(const StringPiece& rewrite, - std::string* error) const; - - // Returns a copy of 'unquoted' with all potentially meaningful - // regexp characters backslash-escaped. The returned string, used - // as a regular expression, will exactly match the original string. - // For example, - // 1.5-2.0? - // becomes: - // 1\.5\-2\.0\? - static std::string QuoteMeta(const StringPiece& unquoted); - - /***** Generic matching interface (not so nice to use) *****/ - - // Type of match (TODO: Should be restructured as an Option) - enum Anchor { - UNANCHORED, // No anchoring - ANCHOR_START, // Anchor at start only - ANCHOR_BOTH, // Anchor at start and end - }; - - // General matching routine. Stores the length of the match in - // "*consumed" if successful. - bool DoMatch(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const* args, int n) const; - - // Return the number of capturing subpatterns, or -1 if the - // regexp wasn't valid on construction. 
- int NumberOfCapturingGroups() const; - - private: - void Init(const char* pattern, Option option, int match_limit, - int stack_limit, bool report_errors); - - // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with - // pairs of integers for the beginning and end positions of matched - // text. The first pair corresponds to the entire matched text; - // subsequent pairs correspond, in order, to parentheses-captured - // matches. Returns the number of pairs (one more than the number of - // the last subpattern with a match) if matching was successful - // and zero if the match failed. - // I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching - // against "foo", "bar", and "baz" respectively. - // When matching PCRE("(foo)|hello") against "hello", it will return 1. - // But the values for all subpattern are filled in into "vec". - int TryMatch(const StringPiece& text, - size_t startpos, - Anchor anchor, - bool empty_ok, - int *vec, - int vecsize) const; - - // Append the "rewrite" string, with backslash subsitutions from "text" - // and "vec", to string "out". 
- bool Rewrite(std::string *out, - const StringPiece &rewrite, - const StringPiece &text, - int *vec, - int veclen) const; - - // internal implementation for DoMatch - bool DoMatchImpl(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const args[], - int n, - int* vec, - int vecsize) const; - - // Compile the regexp for the specified anchoring mode - pcre* Compile(Anchor anchor); - - std::string pattern_; - Option options_; - pcre* re_full_; // For full matches - pcre* re_partial_; // For partial matches - const std::string* error_; // Error indicator (or empty string) - bool report_errors_; // Silences error logging if false - int match_limit_; // Limit on execution resources - int stack_limit_; // Limit on stack resources (bytes) - mutable int32_t hit_limit_; // Hit limit during execution (bool) - - PCRE(const PCRE&) = delete; - PCRE& operator=(const PCRE&) = delete; -}; - -// PCRE_Options allow you to set the PCRE::Options, plus any pcre -// "extra" options. The only extras are match_limit, which limits -// the CPU time of a match, and stack_limit, which limits the -// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default -// that should not cause too many problems in production code. -// If PCRE hits a limit during a match, it may return a false negative, -// but (hopefully) it won't crash. -// -// NOTE: If you are handling regular expressions specified by -// (external or internal) users, rather than hard-coded ones, -// you should be using PCRE2, which uses an alternate implementation -// that avoids these issues. See http://go/re2quick. 
-class PCRE_Options { - public: - // constructor - PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {} - // accessors - PCRE::Option option() const { return option_; } - void set_option(PCRE::Option option) { - option_ = option; - } - int match_limit() const { return match_limit_; } - void set_match_limit(int match_limit) { - match_limit_ = match_limit; - } - int stack_limit() const { return stack_limit_; } - void set_stack_limit(int stack_limit) { - stack_limit_ = stack_limit; - } - - // If the regular expression is malformed, an error message will be printed - // iff report_errors() is true. Default: true. - bool report_errors() const { return report_errors_; } - void set_report_errors(bool report_errors) { - report_errors_ = report_errors; - } - private: - PCRE::Option option_; - int match_limit_; - int stack_limit_; - bool report_errors_; -}; - - -/***** Implementation details *****/ - -// Hex/Octal/Binary? - -// Special class for parsing into objects that define a ParseFrom() method -template <typename T> -class _PCRE_MatchObject { - public: - static inline bool Parse(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - T* object = reinterpret_cast<T*>(dest); - return object->ParseFrom(str, n); - } -}; - -class PCRE::Arg { - public: - // Empty constructor so we can declare arrays of PCRE::Arg - Arg(); - - // Constructor specially designed for NULL arguments - Arg(void*); - - typedef bool (*Parser)(const char* str, size_t n, void* dest); - -// Type-specific parsers -#define MAKE_PARSER(type, name) \ - Arg(type* p) : arg_(p), parser_(name) {} \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} - - MAKE_PARSER(char, parse_char); - MAKE_PARSER(signed char, parse_schar); - MAKE_PARSER(unsigned char, parse_uchar); - MAKE_PARSER(float, parse_float); - MAKE_PARSER(double, parse_double); - MAKE_PARSER(std::string, parse_string); - MAKE_PARSER(StringPiece, parse_stringpiece); - - 
MAKE_PARSER(short, parse_short); - MAKE_PARSER(unsigned short, parse_ushort); - MAKE_PARSER(int, parse_int); - MAKE_PARSER(unsigned int, parse_uint); - MAKE_PARSER(long, parse_long); - MAKE_PARSER(unsigned long, parse_ulong); - MAKE_PARSER(long long, parse_longlong); - MAKE_PARSER(unsigned long long, parse_ulonglong); - -#undef MAKE_PARSER - - // Generic constructor - template <typename T> Arg(T*, Parser parser); - // Generic constructor template - template <typename T> Arg(T* p) - : arg_(p), parser_(_PCRE_MatchObject<T>::Parse) { - } - - // Parse the data - bool Parse(const char* str, size_t n) const; - - private: - void* arg_; - Parser parser_; - - static bool parse_null (const char* str, size_t n, void* dest); - static bool parse_char (const char* str, size_t n, void* dest); - static bool parse_schar (const char* str, size_t n, void* dest); - static bool parse_uchar (const char* str, size_t n, void* dest); - static bool parse_float (const char* str, size_t n, void* dest); - static bool parse_double (const char* str, size_t n, void* dest); - static bool parse_string (const char* str, size_t n, void* dest); - static bool parse_stringpiece (const char* str, size_t n, void* dest); - -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_##name(const char* str, size_t n, void* dest); \ - static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ - int radix); \ - \ - public: \ - static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ - static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ - static bool parse_##name##_cradix(const char* str, size_t n, void* dest) - - DECLARE_INTEGER_PARSER(short); - DECLARE_INTEGER_PARSER(ushort); - DECLARE_INTEGER_PARSER(int); - DECLARE_INTEGER_PARSER(uint); - DECLARE_INTEGER_PARSER(long); - DECLARE_INTEGER_PARSER(ulong); - DECLARE_INTEGER_PARSER(longlong); - DECLARE_INTEGER_PARSER(ulonglong); - -#undef DECLARE_INTEGER_PARSER - -}; - -inline PCRE::Arg::Arg() : 
arg_(NULL), parser_(parse_null) { } -inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } - -inline bool PCRE::Arg::Parse(const char* str, size_t n) const { - return (*parser_)(str, n, arg_); -} - -// This part of the parser, appropriate only for ints, deals with bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline PCRE::Arg Hex(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \ - } \ - inline PCRE::Arg Octal(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \ - } \ - inline PCRE::Arg CRadix(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \ - } - -MAKE_INTEGER_PARSER(short, short); -MAKE_INTEGER_PARSER(unsigned short, ushort); -MAKE_INTEGER_PARSER(int, int); -MAKE_INTEGER_PARSER(unsigned int, uint); -MAKE_INTEGER_PARSER(long, long); -MAKE_INTEGER_PARSER(unsigned long, ulong); -MAKE_INTEGER_PARSER(long long, longlong); -MAKE_INTEGER_PARSER(unsigned long long, ulonglong); - -#undef MAKE_INTEGER_PARSER - -} // namespace re2 - -#endif // UTIL_PCRE_H_ +// Copyright 2003-2010 Google Inc. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_PCRE_H_ +#define UTIL_PCRE_H_ + +// This is a variant of PCRE's pcrecpp.h, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +// C++ interface to the pcre regular-expression library. PCRE supports +// Perl-style regular expressions (with extensions like \d, \w, \s, +// ...). +// +// ----------------------------------------------------------------------- +// REGEXP SYNTAX: +// +// This module uses the pcre library and hence supports its syntax +// for regular expressions: +// +// http://www.google.com/search?q=pcre +// +// The syntax is pretty similar to Perl's. 
For those not familiar +// with Perl's regular expressions, here are some examples of the most +// commonly used extensions: +// +// "hello (\\w+) world" -- \w matches a "word" character +// "version (\\d+)" -- \d matches a digit +// "hello\\s+world" -- \s matches any whitespace character +// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary +// "(?i)hello" -- (?i) turns on case-insensitive matching +// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible +// +// ----------------------------------------------------------------------- +// MATCHING INTERFACE: +// +// The "FullMatch" operation checks that supplied text matches a +// supplied pattern exactly. +// +// Example: successful match +// CHECK(PCRE::FullMatch("hello", "h.*o")); +// +// Example: unsuccessful match (requires full match): +// CHECK(!PCRE::FullMatch("hello", "e")); +// +// ----------------------------------------------------------------------- +// UTF-8 AND THE MATCHING INTERFACE: +// +// By default, pattern and text are plain text, one byte per character. +// The UTF8 flag, passed to the constructor, causes both pattern +// and string to be treated as UTF-8 text, still a byte stream but +// potentially multiple bytes per character. In practice, the text +// is likelier to be UTF-8 than the pattern, but the match returned +// may depend on the UTF8 flag, so always use it when matching +// UTF8 text. E.g., "." will match one byte normally but with UTF8 +// set may match up to three bytes of a multi-byte character. +// +// Example: +// PCRE re(utf8_pattern, PCRE::UTF8); +// CHECK(PCRE::FullMatch(utf8_string, re)); +// +// ----------------------------------------------------------------------- +// MATCHING WITH SUBSTRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched substrings. 
+// +// Example: extracts "ruby" into "s" and 1234 into "i" +// int i; +// std::string s; +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); +// +// Example: fails because string cannot be stored in integer +// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i)); +// +// Example: fails because there aren't enough sub-patterns: +// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s)); +// +// Example: does not try to extract any extra sub-patterns +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); +// +// Example: does not try to extract into NULL +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); +// +// Example: integer overflow causes failure +// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); +// +// ----------------------------------------------------------------------- +// PARTIAL MATCHES +// +// You can use the "PartialMatch" operation when you want the pattern +// to match any substring of the text. +// +// Example: simple search for a string: +// CHECK(PCRE::PartialMatch("hello", "ell")); +// +// Example: find first number in a string +// int number; +// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number)); +// CHECK_EQ(number, 100); +// +// ----------------------------------------------------------------------- +// PPCRE-COMPILED PCREGULAR EXPPCRESSIONS +// +// PCRE makes it easy to use any string as a regular expression, without +// requiring a separate compilation step. +// +// If speed is of the essence, you can create a pre-compiled "PCRE" +// object from the pattern and use it multiple times. If you do so, +// you can typically parse text faster than with sscanf. 
+// +// Example: precompile pattern for faster matching: +// PCRE pattern("h.*o"); +// while (ReadLine(&str)) { +// if (PCRE::FullMatch(str, pattern)) ...; +// } +// +// ----------------------------------------------------------------------- +// SCANNING TEXT INCPCREMENTALLY +// +// The "Consume" operation may be useful if you want to repeatedly +// match regular expressions at the front of a string and skip over +// them as they match. This requires use of the "StringPiece" type, +// which represents a sub-range of a real string. +// +// Example: read lines of the form "var = value" from a string. +// std::string contents = ...; // Fill string somehow +// StringPiece input(contents); // Wrap a StringPiece around it +// +// std::string var; +// int value; +// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { +// ...; +// } +// +// Each successful call to "Consume" will set "var/value", and also +// advance "input" so it points past the matched text. Note that if the +// regular expression matches an empty string, input will advance +// by 0 bytes. If the regular expression being used might match +// an empty string, the loop body must check for this case and either +// advance the string or break out of the loop. +// +// The "FindAndConsume" operation is similar to "Consume" but does not +// anchor your match at the beginning of the string. For example, you +// could extract all words from a string by repeatedly calling +// PCRE::FindAndConsume(&input, "(\\w+)", &word) +// +// ----------------------------------------------------------------------- +// PARSING HEX/OCTAL/C-RADIX NUMBERS +// +// By default, if you pass a pointer to a numeric value, the +// corresponding text is interpreted as a base-10 number. You can +// instead wrap the pointer with a call to one of the operators Hex(), +// Octal(), or CRadix() to interpret the text in another base. 
The +// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) +// prefixes, but defaults to base-10. +// +// Example: +// int a, b, c, d; +// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", +// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)); +// will leave 64 in a, b, c, and d. + +#include "util/util.h" +#include "re2/stringpiece.h" + +#ifdef USEPCRE +#include <pcre.h> +namespace re2 { +const bool UsingPCRE = true; +} // namespace re2 +#else +struct pcre; // opaque +namespace re2 { +const bool UsingPCRE = false; +} // namespace re2 +#endif + +namespace re2 { + +class PCRE_Options; + +// Interface for regular expression matching. Also corresponds to a +// pre-compiled regular expression. An "PCRE" object is safe for +// concurrent use by multiple threads. +class PCRE { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + + // Marks end of arg list. + // ONLY USE IN OPTIONAL ARG DEFAULTS. + // DO NOT PASS EXPLICITLY. + static Arg no_more_args; + + // Options are same value as those in pcre. We provide them here + // to avoid users needing to include pcre.h and also to isolate + // users from pcre should we change the underlying library. + // Only those needed by Google programs are exposed here to + // avoid collision with options employed internally by regexp.cc + // Note that some options have equivalents that can be specified in + // the regexp itself. For example, prefixing your regexp with + // "(?s)" has the same effect as the PCRE_DOTALL option. + enum Option { + None = 0x0000, + UTF8 = 0x0800, // == PCRE_UTF8 + EnabledCompileOptions = UTF8, + EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag + }; + + // We provide implicit conversions from strings so that users can + // pass in a string or a "const char*" wherever an "PCRE" is expected. 
+ PCRE(const char* pattern); + PCRE(const char* pattern, Option option); + PCRE(const std::string& pattern); + PCRE(const std::string& pattern, Option option); + PCRE(const char *pattern, const PCRE_Options& re_option); + PCRE(const std::string& pattern, const PCRE_Options& re_option); + + ~PCRE(); + + // The string specification for this PCRE. E.g. + // PCRE re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const std::string& pattern() const { return pattern_; } + + // If PCRE could not be created properly, returns an error string. + // Else returns the empty string. + const std::string& error() const { return *error_; } + + // Whether the PCRE has hit a match limit during execution. + // Not thread safe. Intended only for testing. + // If hitting match limits is a problem, + // you should be using PCRE2 (re2/re2.h) + // instead of checking this flag. + bool HitLimit(); + void ClearHitLimit(); + + /***** The useful part: the matching interface *****/ + + // Matches "text" against "pattern". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "std::string" for "text". + // You can pass in a "const char*" or a "std::string" or a "PCRE" for "pattern". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // std::string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, size_t)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "pattern" exactly + // b. The number of matched sub-patterns is >= number of supplied pointers + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. 
If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number); + struct FullMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FullMatchFunctor FullMatch; + + // Exactly like FullMatch(), except that "pattern" is allowed to match + // a substring of "text". 
+ struct PartialMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const PartialMatchFunctor PartialMatch; + + // Like FullMatch() and PartialMatch(), except that pattern has to + // match a prefix of "text", and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true. + struct ConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const ConsumeFunctor Consume; + + // Like Consume(..), but does not anchor the match at the beginning of the + // string. That is, "pattern" need not start its match at the beginning of + // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next + // word in "s" and stores it in "word". 
+ struct FindAndConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FindAndConsumeFunctor FindAndConsume; + + // Replace the first match of "pattern" in "str" with "rewrite". + // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(PCRE::Replace(&s, "b+", "d")); + // + // will leave "s" containing "yada dabba doo" + // + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. + static bool Replace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace(), except replaces all occurrences of the pattern in + // the string with the rewrite. Replacements are not subject to + // re-matching. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(PCRE::GlobalReplace(&s, "b+", "d")); + // + // will leave "s" containing "yada dada doo" + // + // Returns the number of replacements made. + static int GlobalReplace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. 
+ // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + static bool Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + std::string *out); + + // Check that the given @p rewrite string is suitable for use with + // this PCRE. It checks that: + // * The PCRE has enough parenthesized subexpressions to satisfy all + // of the \N tokens in @p rewrite, and + // * The @p rewrite string doesn't have any syntax errors + // ('\' followed by anything besides [0-9] and '\'). + // Making this test will guarantee that "replace" and "extract" + // operations won't LOG(ERROR) or fail because of a bad rewrite + // string. + // @param rewrite The proposed rewrite string. + // @param error An error message is recorded here, iff we return false. + // Otherwise, it is unchanged. + // @return true, iff @p rewrite is suitable for use with the PCRE. + bool CheckRewriteString(const StringPiece& rewrite, + std::string* error) const; + + // Returns a copy of 'unquoted' with all potentially meaningful + // regexp characters backslash-escaped. The returned string, used + // as a regular expression, will exactly match the original string. + // For example, + // 1.5-2.0? + // becomes: + // 1\.5\-2\.0\? + static std::string QuoteMeta(const StringPiece& unquoted); + + /***** Generic matching interface (not so nice to use) *****/ + + // Type of match (TODO: Should be restructured as an Option) + enum Anchor { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH, // Anchor at start and end + }; + + // General matching routine. Stores the length of the match in + // "*consumed" if successful. + bool DoMatch(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const* args, int n) const; + + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. 
+ int NumberOfCapturingGroups() const; + + private: + void Init(const char* pattern, Option option, int match_limit, + int stack_limit, bool report_errors); + + // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with + // pairs of integers for the beginning and end positions of matched + // text. The first pair corresponds to the entire matched text; + // subsequent pairs correspond, in order, to parentheses-captured + // matches. Returns the number of pairs (one more than the number of + // the last subpattern with a match) if matching was successful + // and zero if the match failed. + // I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching + // against "foo", "bar", and "baz" respectively. + // When matching PCRE("(foo)|hello") against "hello", it will return 1. + // But the values for all subpattern are filled in into "vec". + int TryMatch(const StringPiece& text, + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const; + + // Append the "rewrite" string, with backslash subsitutions from "text" + // and "vec", to string "out". 
+ bool Rewrite(std::string *out, + const StringPiece &rewrite, + const StringPiece &text, + int *vec, + int veclen) const; + + // internal implementation for DoMatch + bool DoMatchImpl(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n, + int* vec, + int vecsize) const; + + // Compile the regexp for the specified anchoring mode + pcre* Compile(Anchor anchor); + + std::string pattern_; + Option options_; + pcre* re_full_; // For full matches + pcre* re_partial_; // For partial matches + const std::string* error_; // Error indicator (or empty string) + bool report_errors_; // Silences error logging if false + int match_limit_; // Limit on execution resources + int stack_limit_; // Limit on stack resources (bytes) + mutable int32_t hit_limit_; // Hit limit during execution (bool) + + PCRE(const PCRE&) = delete; + PCRE& operator=(const PCRE&) = delete; +}; + +// PCRE_Options allow you to set the PCRE::Options, plus any pcre +// "extra" options. The only extras are match_limit, which limits +// the CPU time of a match, and stack_limit, which limits the +// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default +// that should not cause too many problems in production code. +// If PCRE hits a limit during a match, it may return a false negative, +// but (hopefully) it won't crash. +// +// NOTE: If you are handling regular expressions specified by +// (external or internal) users, rather than hard-coded ones, +// you should be using PCRE2, which uses an alternate implementation +// that avoids these issues. See http://go/re2quick. 
+class PCRE_Options { + public: + // constructor + PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {} + // accessors + PCRE::Option option() const { return option_; } + void set_option(PCRE::Option option) { + option_ = option; + } + int match_limit() const { return match_limit_; } + void set_match_limit(int match_limit) { + match_limit_ = match_limit; + } + int stack_limit() const { return stack_limit_; } + void set_stack_limit(int stack_limit) { + stack_limit_ = stack_limit; + } + + // If the regular expression is malformed, an error message will be printed + // iff report_errors() is true. Default: true. + bool report_errors() const { return report_errors_; } + void set_report_errors(bool report_errors) { + report_errors_ = report_errors; + } + private: + PCRE::Option option_; + int match_limit_; + int stack_limit_; + bool report_errors_; +}; + + +/***** Implementation details *****/ + +// Hex/Octal/Binary? + +// Special class for parsing into objects that define a ParseFrom() method +template <typename T> +class _PCRE_MatchObject { + public: + static inline bool Parse(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + T* object = reinterpret_cast<T*>(dest); + return object->ParseFrom(str, n); + } +}; + +class PCRE::Arg { + public: + // Empty constructor so we can declare arrays of PCRE::Arg + Arg(); + + // Constructor specially designed for NULL arguments + Arg(void*); + + typedef bool (*Parser)(const char* str, size_t n, void* dest); + +// Type-specific parsers +#define MAKE_PARSER(type, name) \ + Arg(type* p) : arg_(p), parser_(name) {} \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} + + MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_schar); + MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(std::string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + + 
MAKE_PARSER(short, parse_short); + MAKE_PARSER(unsigned short, parse_ushort); + MAKE_PARSER(int, parse_int); + MAKE_PARSER(unsigned int, parse_uint); + MAKE_PARSER(long, parse_long); + MAKE_PARSER(unsigned long, parse_ulong); + MAKE_PARSER(long long, parse_longlong); + MAKE_PARSER(unsigned long long, parse_ulonglong); + +#undef MAKE_PARSER + + // Generic constructor + template <typename T> Arg(T*, Parser parser); + // Generic constructor template + template <typename T> Arg(T* p) + : arg_(p), parser_(_PCRE_MatchObject<T>::Parse) { + } + + // Parse the data + bool Parse(const char* str, size_t n) const; + + private: + void* arg_; + Parser parser_; + + static bool parse_null (const char* str, size_t n, void* dest); + static bool parse_char (const char* str, size_t n, void* dest); + static bool parse_schar (const char* str, size_t n, void* dest); + static bool parse_uchar (const char* str, size_t n, void* dest); + static bool parse_float (const char* str, size_t n, void* dest); + static bool parse_double (const char* str, size_t n, void* dest); + static bool parse_string (const char* str, size_t n, void* dest); + static bool parse_stringpiece (const char* str, size_t n, void* dest); + +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_##name(const char* str, size_t n, void* dest); \ + static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ + int radix); \ + \ + public: \ + static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ + static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ + static bool parse_##name##_cradix(const char* str, size_t n, void* dest) + + DECLARE_INTEGER_PARSER(short); + DECLARE_INTEGER_PARSER(ushort); + DECLARE_INTEGER_PARSER(int); + DECLARE_INTEGER_PARSER(uint); + DECLARE_INTEGER_PARSER(long); + DECLARE_INTEGER_PARSER(ulong); + DECLARE_INTEGER_PARSER(longlong); + DECLARE_INTEGER_PARSER(ulonglong); + +#undef DECLARE_INTEGER_PARSER + +}; + +inline PCRE::Arg::Arg() : 
arg_(NULL), parser_(parse_null) { } +inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } + +inline bool PCRE::Arg::Parse(const char* str, size_t n) const { + return (*parser_)(str, n, arg_); +} + +// This part of the parser, appropriate only for ints, deals with bases +#define MAKE_INTEGER_PARSER(type, name) \ + inline PCRE::Arg Hex(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \ + } \ + inline PCRE::Arg Octal(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \ + } \ + inline PCRE::Arg CRadix(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \ + } + +MAKE_INTEGER_PARSER(short, short); +MAKE_INTEGER_PARSER(unsigned short, ushort); +MAKE_INTEGER_PARSER(int, int); +MAKE_INTEGER_PARSER(unsigned int, uint); +MAKE_INTEGER_PARSER(long, long); +MAKE_INTEGER_PARSER(unsigned long, ulong); +MAKE_INTEGER_PARSER(long long, longlong); +MAKE_INTEGER_PARSER(unsigned long long, ulonglong); + +#undef MAKE_INTEGER_PARSER + +} // namespace re2 + +#endif // UTIL_PCRE_H_ diff --git a/contrib/libs/re2/util/strutil.cc b/contrib/libs/re2/util/strutil.cc index f9af3a442c..fb7e6b1b0c 100644 --- a/contrib/libs/re2/util/strutil.cc +++ b/contrib/libs/re2/util/strutil.cc @@ -65,34 +65,34 @@ static size_t CEscapeString(const char* src, size_t src_len, // Copies 'src' to result, escaping dangerous characters using // C-style escape sequences. 'src' and 'dest' should not overlap. 
// ---------------------------------------------------------------------- -std::string CEscape(const StringPiece& src) { +std::string CEscape(const StringPiece& src) { const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion char* dest = new char[dest_len]; const size_t used = CEscapeString(src.data(), src.size(), dest, dest_len); - std::string s = std::string(dest, used); + std::string s = std::string(dest, used); delete[] dest; return s; } -void PrefixSuccessor(std::string* prefix) { +void PrefixSuccessor(std::string* prefix) { // We can increment the last character in the string and be done // unless that character is 255, in which case we have to erase the // last character and increment the previous character, unless that // is 255, etc. If the string is empty or consists entirely of // 255's, we just return the empty string. - while (!prefix->empty()) { - char& c = prefix->back(); - if (c == '\xff') { // char literal avoids signed/unsigned. - prefix->pop_back(); + while (!prefix->empty()) { + char& c = prefix->back(); + if (c == '\xff') { // char literal avoids signed/unsigned. + prefix->pop_back(); } else { - ++c; - break; + ++c; + break; } } } -static void StringAppendV(std::string* dst, const char* format, va_list ap) { +static void StringAppendV(std::string* dst, const char* format, va_list ap) { // First try with a small fixed size buffer char space[1024]; @@ -137,10 +137,10 @@ static void StringAppendV(std::string* dst, const char* format, va_list ap) { } } -std::string StringPrintf(const char* format, ...) { +std::string StringPrintf(const char* format, ...) 
{ va_list ap; va_start(ap, format); - std::string result; + std::string result; StringAppendV(&result, format, ap); va_end(ap); return result; diff --git a/contrib/libs/re2/util/strutil.h b/contrib/libs/re2/util/strutil.h index 16631b0833..a69908a0dd 100644 --- a/contrib/libs/re2/util/strutil.h +++ b/contrib/libs/re2/util/strutil.h @@ -12,10 +12,10 @@ namespace re2 { -std::string CEscape(const StringPiece& src); -void PrefixSuccessor(std::string* prefix); -std::string StringPrintf(const char* format, ...); +std::string CEscape(const StringPiece& src); +void PrefixSuccessor(std::string* prefix); +std::string StringPrintf(const char* format, ...); } // namespace re2 - + #endif // UTIL_STRUTIL_H_ diff --git a/contrib/libs/re2/util/test.cc b/contrib/libs/re2/util/test.cc index 855295f5bf..028616b359 100644 --- a/contrib/libs/re2/util/test.cc +++ b/contrib/libs/re2/util/test.cc @@ -1,34 +1,34 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include <stdio.h> -#include <string> - -#include "util/test.h" - -namespace testing { -std::string TempDir() { return "/tmp/"; } -} // namespace testing - -struct Test { - void (*fn)(void); - const char *name; -}; - -static Test tests[10000]; -static int ntests; - -void RegisterTest(void (*fn)(void), const char *name) { - tests[ntests].fn = fn; - tests[ntests++].name = name; -} - -int main(int argc, char** argv) { - for (int i = 0; i < ntests; i++) { - printf("%s\n", tests[i].name); - tests[i].fn(); - } - printf("PASS\n"); - return 0; -} +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include <stdio.h> +#include <string> + +#include "util/test.h" + +namespace testing { +std::string TempDir() { return "/tmp/"; } +} // namespace testing + +struct Test { + void (*fn)(void); + const char *name; +}; + +static Test tests[10000]; +static int ntests; + +void RegisterTest(void (*fn)(void), const char *name) { + tests[ntests].fn = fn; + tests[ntests++].name = name; +} + +int main(int argc, char** argv) { + for (int i = 0; i < ntests; i++) { + printf("%s\n", tests[i].name); + tests[i].fn(); + } + printf("PASS\n"); + return 0; +} diff --git a/contrib/libs/re2/util/test.h b/contrib/libs/re2/util/test.h index 40978b8fae..54e6f8fbbb 100644 --- a/contrib/libs/re2/util/test.h +++ b/contrib/libs/re2/util/test.h @@ -1,50 +1,50 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_TEST_H_ -#define UTIL_TEST_H_ - -#include "util/util.h" -#include "util/logging.h" - -namespace testing { -std::string TempDir(); -} // namespace testing - -#define TEST(x, y) \ - void x##y(void); \ - TestRegisterer r##x##y(x##y, # x "." # y); \ - void x##y(void) - -void RegisterTest(void (*)(void), const char*); - -class TestRegisterer { - public: - TestRegisterer(void (*fn)(void), const char *s) { - RegisterTest(fn, s); - } -}; - -// fatal assertions -#define ASSERT_TRUE CHECK -#define ASSERT_FALSE(x) CHECK(!(x)) -#define ASSERT_EQ CHECK_EQ -#define ASSERT_NE CHECK_NE -#define ASSERT_LT CHECK_LT -#define ASSERT_LE CHECK_LE -#define ASSERT_GT CHECK_GT -#define ASSERT_GE CHECK_GE - -// nonfatal assertions -// TODO(rsc): Do a better job? -#define EXPECT_TRUE CHECK -#define EXPECT_FALSE(x) CHECK(!(x)) -#define EXPECT_EQ CHECK_EQ -#define EXPECT_NE CHECK_NE -#define EXPECT_LT CHECK_LT -#define EXPECT_LE CHECK_LE -#define EXPECT_GT CHECK_GT -#define EXPECT_GE CHECK_GE - -#endif // UTIL_TEST_H_ +// Copyright 2009 The RE2 Authors. All Rights Reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_TEST_H_ +#define UTIL_TEST_H_ + +#include "util/util.h" +#include "util/logging.h" + +namespace testing { +std::string TempDir(); +} // namespace testing + +#define TEST(x, y) \ + void x##y(void); \ + TestRegisterer r##x##y(x##y, # x "." # y); \ + void x##y(void) + +void RegisterTest(void (*)(void), const char*); + +class TestRegisterer { + public: + TestRegisterer(void (*fn)(void), const char *s) { + RegisterTest(fn, s); + } +}; + +// fatal assertions +#define ASSERT_TRUE CHECK +#define ASSERT_FALSE(x) CHECK(!(x)) +#define ASSERT_EQ CHECK_EQ +#define ASSERT_NE CHECK_NE +#define ASSERT_LT CHECK_LT +#define ASSERT_LE CHECK_LE +#define ASSERT_GT CHECK_GT +#define ASSERT_GE CHECK_GE + +// nonfatal assertions +// TODO(rsc): Do a better job? +#define EXPECT_TRUE CHECK +#define EXPECT_FALSE(x) CHECK(!(x)) +#define EXPECT_EQ CHECK_EQ +#define EXPECT_NE CHECK_NE +#define EXPECT_LT CHECK_LT +#define EXPECT_LE CHECK_LE +#define EXPECT_GT CHECK_GT +#define EXPECT_GE CHECK_GE + +#endif // UTIL_TEST_H_ diff --git a/contrib/libs/re2/util/util.h b/contrib/libs/re2/util/util.h index 0d28a8ca74..56e46c1a33 100644 --- a/contrib/libs/re2/util/util.h +++ b/contrib/libs/re2/util/util.h @@ -5,35 +5,35 @@ #ifndef UTIL_UTIL_H_ #define UTIL_UTIL_H_ -#define arraysize(array) (sizeof(array)/sizeof((array)[0])) +#define arraysize(array) (sizeof(array)/sizeof((array)[0])) -#ifndef ATTRIBUTE_NORETURN -#if defined(__GNUC__) -#define ATTRIBUTE_NORETURN __attribute__((noreturn)) -#elif defined(_MSC_VER) -#define ATTRIBUTE_NORETURN __declspec(noreturn) -#else -#define ATTRIBUTE_NORETURN -#endif -#endif +#ifndef ATTRIBUTE_NORETURN +#if defined(__GNUC__) +#define ATTRIBUTE_NORETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define ATTRIBUTE_NORETURN __declspec(noreturn) +#else +#define ATTRIBUTE_NORETURN +#endif +#endif + +#ifndef ATTRIBUTE_UNUSED +#if defined(__GNUC__) 
+#define ATTRIBUTE_UNUSED __attribute__((unused)) +#else +#define ATTRIBUTE_UNUSED +#endif +#endif -#ifndef ATTRIBUTE_UNUSED -#if defined(__GNUC__) -#define ATTRIBUTE_UNUSED __attribute__((unused)) -#else -#define ATTRIBUTE_UNUSED -#endif -#endif - #ifndef FALLTHROUGH_INTENDED -#if defined(__clang__) -#define FALLTHROUGH_INTENDED [[clang::fallthrough]] -#elif defined(__GNUC__) && __GNUC__ >= 7 -#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] -#else -#define FALLTHROUGH_INTENDED do {} while (0) +#if defined(__clang__) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#else +#define FALLTHROUGH_INTENDED do {} while (0) +#endif #endif -#endif #ifndef NO_THREAD_SAFETY_ANALYSIS #define NO_THREAD_SAFETY_ANALYSIS diff --git a/contrib/libs/re2/ya.make b/contrib/libs/re2/ya.make index 3219b0fd7e..8072de2eb2 100644 --- a/contrib/libs/re2/ya.make +++ b/contrib/libs/re2/ya.make @@ -1,8 +1,8 @@ # Generated by devtools/yamaker from nixpkgs 21.11. - + LIBRARY() -OWNER(g:cpp-contrib) +OWNER(g:cpp-contrib) VERSION(2022-02-01) @@ -12,22 +12,22 @@ LICENSE( BSD-3-Clause AND X11-Lucent ) - + LICENSE_TEXTS(.yandex_meta/licenses.list.txt) ADDINCL( - GLOBAL contrib/libs/re2/include - contrib/libs/re2 + GLOBAL contrib/libs/re2/include + contrib/libs/re2 ) -NO_COMPILER_WARNINGS() +NO_COMPILER_WARNINGS() -IF (WITH_VALGRIND) +IF (WITH_VALGRIND) CFLAGS( GLOBAL -DRE2_ON_VALGRIND ) -ENDIF() - +ENDIF() + SRCS( re2/bitstate.cc re2/compile.cc @@ -45,16 +45,16 @@ SRCS( re2/regexp.cc re2/set.cc re2/simplify.cc - re2/stringpiece.cc + re2/stringpiece.cc re2/tostring.cc - re2/unicode_casefold.cc - re2/unicode_groups.cc - util/rune.cc - util/strutil.cc + re2/unicode_casefold.cc + re2/unicode_groups.cc + util/rune.cc + util/strutil.cc ) END() - -RECURSE( - re2/testing -) + +RECURSE( + re2/testing +) |