aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/re2
diff options
context:
space:
mode:
authorantonovvk <antonovvk@yandex-team.ru>2022-02-10 16:47:52 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:47:52 +0300
commit37de222addabbef336dcaaea5f7c7645a629fc6d (patch)
treec0748b5dcbade83af788c0abfa89c0383d6b779c /contrib/libs/re2
parent37a63debdc21e372d99e1808cdd31aecf75018c3 (diff)
downloadydb-37de222addabbef336dcaaea5f7c7645a629fc6d.tar.gz
Restoring authorship annotation for <antonovvk@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/re2')
-rw-r--r--contrib/libs/re2/re2/bitstate.cc420
-rw-r--r--contrib/libs/re2/re2/compile.cc1494
-rw-r--r--contrib/libs/re2/re2/dfa.cc3022
-rw-r--r--contrib/libs/re2/re2/filtered_re2.cc150
-rw-r--r--contrib/libs/re2/re2/filtered_re2.h132
-rw-r--r--contrib/libs/re2/re2/mimics_pcre.cc346
-rw-r--r--contrib/libs/re2/re2/nfa.cc722
-rw-r--r--contrib/libs/re2/re2/onepass.cc914
-rw-r--r--contrib/libs/re2/re2/parse.cc3794
-rw-r--r--contrib/libs/re2/re2/perl_groups.cc128
-rw-r--r--contrib/libs/re2/re2/prefilter.cc1124
-rw-r--r--contrib/libs/re2/re2/prefilter.h178
-rw-r--r--contrib/libs/re2/re2/prefilter_tree.cc466
-rw-r--r--contrib/libs/re2/re2/prefilter_tree.h198
-rw-r--r--contrib/libs/re2/re2/prog.cc446
-rw-r--r--contrib/libs/re2/re2/prog.h500
-rw-r--r--contrib/libs/re2/re2/re2.cc1384
-rw-r--r--contrib/libs/re2/re2/regexp.cc1554
-rw-r--r--contrib/libs/re2/re2/regexp.h1054
-rw-r--r--contrib/libs/re2/re2/set.cc124
-rw-r--r--contrib/libs/re2/re2/simplify.cc694
-rw-r--r--contrib/libs/re2/re2/sparse_array.h346
-rw-r--r--contrib/libs/re2/re2/sparse_set.h128
-rw-r--r--contrib/libs/re2/re2/tostring.cc636
-rw-r--r--contrib/libs/re2/re2/unicode_casefold.h110
-rw-r--r--contrib/libs/re2/re2/unicode_groups.h96
-rw-r--r--contrib/libs/re2/re2/walker-inl.h446
-rw-r--r--contrib/libs/re2/util/rune.cc506
-rw-r--r--contrib/libs/re2/util/strutil.cc126
-rw-r--r--contrib/libs/re2/util/utf.h78
-rw-r--r--contrib/libs/re2/ya.make16
31 files changed, 10666 insertions, 10666 deletions
diff --git a/contrib/libs/re2/re2/bitstate.cc b/contrib/libs/re2/re2/bitstate.cc
index ab4e75f6e5..877e548234 100644
--- a/contrib/libs/re2/re2/bitstate.cc
+++ b/contrib/libs/re2/re2/bitstate.cc
@@ -1,22 +1,22 @@
-// Copyright 2008 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Tested by search_test.cc, exhaustive_test.cc, tester.cc
-
-// Prog::SearchBitState is a regular expression search with submatch
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc, exhaustive_test.cc, tester.cc
+
+// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Similarly to
// testing/backtrack.cc, it allocates a bitmap with (count of
// lists) * (length of text) bits to make sure it never explores the
// same (instruction list, character position) multiple times. This
-// limits the search to run in time linear in the length of the text.
-//
-// Unlike testing/backtrack.cc, SearchBitState is not recursive
-// on the text.
-//
-// SearchBitState is a fast replacement for the NFA code on small
-// regexps and texts when SearchOnePass cannot be used.
-
+// limits the search to run in time linear in the length of the text.
+//
+// Unlike testing/backtrack.cc, SearchBitState is not recursive
+// on the text.
+//
+// SearchBitState is a fast replacement for the NFA code on small
+// regexps and texts when SearchOnePass cannot be used.
+
#include <stddef.h>
#include <stdint.h>
#include <string.h>
@@ -25,84 +25,84 @@
#include "util/logging.h"
#include "re2/pod_array.h"
-#include "re2/prog.h"
-#include "re2/regexp.h"
-
-namespace re2 {
-
-struct Job {
- int id;
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct Job {
+ int id;
int rle; // run length encoding
- const char* p;
-};
-
-class BitState {
- public:
- explicit BitState(Prog* prog);
-
- // The usual Search prototype.
- // Can only call Search once per BitState.
- bool Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch);
-
- private:
- inline bool ShouldVisit(int id, const char* p);
+ const char* p;
+};
+
+class BitState {
+ public:
+ explicit BitState(Prog* prog);
+
+ // The usual Search prototype.
+ // Can only call Search once per BitState.
+ bool Search(const StringPiece& text, const StringPiece& context,
+ bool anchored, bool longest,
+ StringPiece* submatch, int nsubmatch);
+
+ private:
+ inline bool ShouldVisit(int id, const char* p);
void Push(int id, const char* p);
void GrowStack();
- bool TrySearch(int id, const char* p);
-
- // Search parameters
- Prog* prog_; // program being run
- StringPiece text_; // text being searched
- StringPiece context_; // greater context of text being searched
- bool anchored_; // whether search is anchored at text.begin()
- bool longest_; // whether search wants leftmost-longest match
- bool endmatch_; // whether match must end at text.end()
+ bool TrySearch(int id, const char* p);
+
+ // Search parameters
+ Prog* prog_; // program being run
+ StringPiece text_; // text being searched
+ StringPiece context_; // greater context of text being searched
+ bool anchored_; // whether search is anchored at text.begin()
+ bool longest_; // whether search wants leftmost-longest match
+ bool endmatch_; // whether match must end at text.end()
StringPiece* submatch_; // submatches to fill in
- int nsubmatch_; // # of submatches to fill in
-
- // Search state
+ int nsubmatch_; // # of submatches to fill in
+
+ // Search state
static constexpr int kVisitedBits = 64;
PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
PODArray<const char*> cap_; // capture registers
PODArray<Job> job_; // stack of text positions to explore
int njob_; // stack size
-
+
BitState(const BitState&) = delete;
BitState& operator=(const BitState&) = delete;
-};
-
-BitState::BitState(Prog* prog)
- : prog_(prog),
- anchored_(false),
- longest_(false),
- endmatch_(false),
- submatch_(NULL),
- nsubmatch_(0),
+};
+
+BitState::BitState(Prog* prog)
+ : prog_(prog),
+ anchored_(false),
+ longest_(false),
+ endmatch_(false),
+ submatch_(NULL),
+ nsubmatch_(0),
njob_(0) {
-}
-
+}
+
// Given id, which *must* be a list head, we can look up its list ID.
// Then the question is: Should the search visit the (list ID, p) pair?
-// If so, remember that it was visited so that the next time,
-// we don't repeat the visit.
-bool BitState::ShouldVisit(int id, const char* p) {
+// If so, remember that it was visited so that the next time,
+// we don't repeat the visit.
+bool BitState::ShouldVisit(int id, const char* p) {
int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
static_cast<int>(p-text_.data());
if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
- return false;
+ return false;
visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
- return true;
-}
-
-// Grow the stack.
+ return true;
+}
+
+// Grow the stack.
void BitState::GrowStack() {
PODArray<Job> tmp(2*job_.size());
memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
job_ = std::move(tmp);
-}
-
+}
+
// Push (id, p) onto the stack, growing it if necessary.
void BitState::Push(int id, const char* p) {
if (njob_ >= job_.size()) {
@@ -111,10 +111,10 @@ void BitState::Push(int id, const char* p) {
LOG(DFATAL) << "GrowStack() failed: "
<< "njob_ = " << njob_ << ", "
<< "job_.size() = " << job_.size();
- return;
+ return;
}
- }
-
+ }
+
// If id < 0, it's undoing a Capture,
// so we mustn't interfere with that.
if (id >= 0 && njob_ > 0) {
@@ -126,30 +126,30 @@ void BitState::Push(int id, const char* p) {
return;
}
}
-
+
Job* top = &job_[njob_++];
top->id = id;
top->rle = 0;
top->p = p;
-}
-
-// Try a search from instruction id0 in state p0.
-// Return whether it succeeded.
-bool BitState::TrySearch(int id0, const char* p0) {
- bool matched = false;
+}
+
+// Try a search from instruction id0 in state p0.
+// Return whether it succeeded.
+bool BitState::TrySearch(int id0, const char* p0) {
+ bool matched = false;
const char* end = text_.data() + text_.size();
- njob_ = 0;
+ njob_ = 0;
// Push() no longer checks ShouldVisit(),
// so we must perform the check ourselves.
if (ShouldVisit(id0, p0))
Push(id0, p0);
- while (njob_ > 0) {
- // Pop job off stack.
- --njob_;
- int id = job_[njob_].id;
+ while (njob_ > 0) {
+ // Pop job off stack.
+ --njob_;
+ int id = job_[njob_].id;
int& rle = job_[njob_].rle;
- const char* p = job_[njob_].p;
-
+ const char* p = job_[njob_].p;
+
if (id < 0) {
// Undo the Capture.
cap_[prog_->inst(-id)->cap()] = p;
@@ -161,16 +161,16 @@ bool BitState::TrySearch(int id0, const char* p0) {
// Revivify job on stack.
--rle;
++njob_;
- }
-
+ }
+
Loop:
// Visit id, p.
- Prog::Inst* ip = prog_->inst(id);
- switch (ip->opcode()) {
- default:
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
- return false;
-
+ return false;
+
case kInstFail:
break;
@@ -180,7 +180,7 @@ bool BitState::TrySearch(int id0, const char* p0) {
id = ip->out1();
p = end;
goto Loop;
- }
+ }
if (longest_) {
// ip must be non-greedy...
// out is the Match instruction.
@@ -189,11 +189,11 @@ bool BitState::TrySearch(int id0, const char* p0) {
goto Loop;
}
goto Next;
-
- case kInstByteRange: {
- int c = -1;
- if (p < end)
- c = *p & 0xFF;
+
+ case kInstByteRange: {
+ int c = -1;
+ if (p < end)
+ c = *p & 0xFF;
if (!ip->Matches(c))
goto Next;
@@ -202,9 +202,9 @@ bool BitState::TrySearch(int id0, const char* p0) {
id = ip->out();
p++;
goto CheckAndLoop;
- }
-
- case kInstCapture:
+ }
+
+ case kInstCapture:
if (!ip->last())
Push(id+1, p); // try the next when we're done
@@ -217,20 +217,20 @@ bool BitState::TrySearch(int id0, const char* p0) {
id = ip->out();
goto CheckAndLoop;
- case kInstEmptyWidth:
- if (ip->empty() & ~Prog::EmptyFlags(context_, p))
+ case kInstEmptyWidth:
+ if (ip->empty() & ~Prog::EmptyFlags(context_, p))
goto Next;
if (!ip->last())
Push(id+1, p); // try the next when we're done
- id = ip->out();
- goto CheckAndLoop;
-
- case kInstNop:
+ id = ip->out();
+ goto CheckAndLoop;
+
+ case kInstNop:
if (!ip->last())
Push(id+1, p); // try the next when we're done
- id = ip->out();
-
+ id = ip->out();
+
CheckAndLoop:
// Sanity check: id is the head of its list, which must
// be the case if id-1 is the last of *its* list. :)
@@ -239,37 +239,37 @@ bool BitState::TrySearch(int id0, const char* p0) {
goto Loop;
break;
- case kInstMatch: {
+ case kInstMatch: {
if (endmatch_ && p != end)
goto Next;
-
- // We found a match. If the caller doesn't care
- // where the match is, no point going further.
- if (nsubmatch_ == 0)
- return true;
-
- // Record best match so far.
- // Only need to check end point, because this entire
- // call is only considering one start position.
- matched = true;
- cap_[1] = p;
- if (submatch_[0].data() == NULL ||
+
+ // We found a match. If the caller doesn't care
+ // where the match is, no point going further.
+ if (nsubmatch_ == 0)
+ return true;
+
+ // Record best match so far.
+ // Only need to check end point, because this entire
+ // call is only considering one start position.
+ matched = true;
+ cap_[1] = p;
+ if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].data() + submatch_[0].size())) {
- for (int i = 0; i < nsubmatch_; i++)
+ for (int i = 0; i < nsubmatch_; i++)
submatch_[i] =
StringPiece(cap_[2 * i],
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
- }
-
- // If going for first match, we're done.
- if (!longest_)
- return true;
-
- // If we used the entire text, no longer match is possible.
+ }
+
+ // If going for first match, we're done.
+ if (!longest_)
+ return true;
+
+ // If we used the entire text, no longer match is possible.
if (p == end)
- return true;
-
- // Otherwise, continue on in hope of a longer match.
+ return true;
+
+ // Otherwise, continue on in hope of a longer match.
// Note the absence of the ShouldVisit() check here
// due to execution remaining in the same list.
Next:
@@ -278,60 +278,60 @@ bool BitState::TrySearch(int id0, const char* p0) {
goto Loop;
}
break;
- }
- }
- }
- return matched;
-}
-
-// Search text (within context) for prog_.
-bool BitState::Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch) {
- // Search parameters.
- text_ = text;
- context_ = context;
+ }
+ }
+ }
+ return matched;
+}
+
+// Search text (within context) for prog_.
+bool BitState::Search(const StringPiece& text, const StringPiece& context,
+ bool anchored, bool longest,
+ StringPiece* submatch, int nsubmatch) {
+ // Search parameters.
+ text_ = text;
+ context_ = context;
if (context_.data() == NULL)
- context_ = text;
+ context_ = text;
if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
- return false;
+ return false;
if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
- return false;
- anchored_ = anchored || prog_->anchor_start();
- longest_ = longest || prog_->anchor_end();
- endmatch_ = prog_->anchor_end();
- submatch_ = submatch;
- nsubmatch_ = nsubmatch;
- for (int i = 0; i < nsubmatch_; i++)
+ return false;
+ anchored_ = anchored || prog_->anchor_start();
+ longest_ = longest || prog_->anchor_end();
+ endmatch_ = prog_->anchor_end();
+ submatch_ = submatch;
+ nsubmatch_ = nsubmatch;
+ for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece();
-
- // Allocate scratch space.
+
+ // Allocate scratch space.
int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
visited_ = PODArray<uint64_t>(nvisited);
memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
-
+
int ncap = 2*nsubmatch;
if (ncap < 2)
ncap = 2;
cap_ = PODArray<const char*>(ncap);
memset(cap_.data(), 0, ncap*sizeof cap_[0]);
-
+
// When sizeof(Job) == 16, we start with a nice round 1KiB. :)
job_ = PODArray<Job>(64);
-
- // Anchored search must start at text.begin().
- if (anchored_) {
+
+ // Anchored search must start at text.begin().
+ if (anchored_) {
cap_[0] = text.data();
return TrySearch(prog_->start(), text.data());
- }
-
- // Unanchored search, starting from each possible text position.
- // Notice that we have to try the empty string at the end of
- // the text, so the loop condition is p <= text.end(), not p < text.end().
- // This looks like it's quadratic in the size of the text,
- // but we are not clearing visited_ between calls to TrySearch,
- // so no work is duplicated and it ends up still being linear.
+ }
+
+ // Unanchored search, starting from each possible text position.
+ // Notice that we have to try the empty string at the end of
+ // the text, so the loop condition is p <= text.end(), not p < text.end().
+ // This looks like it's quadratic in the size of the text,
+ // but we are not clearing visited_ between calls to TrySearch,
+ // so no work is duplicated and it ends up still being linear.
const char* etext = text.data() + text.size();
for (const char* p = text.data(); p <= etext; p++) {
// Try to use prefix accel (e.g. memchr) to skip ahead.
@@ -341,45 +341,45 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
p = etext;
}
- cap_[0] = p;
- if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
- return true;
+ cap_[0] = p;
+ if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
+ return true;
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by simply not continuing the loop.
if (p == NULL)
break;
- }
- return false;
-}
-
-// Bit-state search.
-bool Prog::SearchBitState(const StringPiece& text,
- const StringPiece& context,
- Anchor anchor,
- MatchKind kind,
- StringPiece* match,
- int nmatch) {
- // If full match, we ask for an anchored longest match
- // and then check that match[0] == text.
- // So make sure match[0] exists.
- StringPiece sp0;
- if (kind == kFullMatch) {
- anchor = kAnchored;
- if (nmatch < 1) {
- match = &sp0;
- nmatch = 1;
- }
- }
-
- // Run the search.
- BitState b(this);
- bool anchored = anchor == kAnchored;
- bool longest = kind != kFirstMatch;
- if (!b.Search(text, context, anchored, longest, match, nmatch))
- return false;
+ }
+ return false;
+}
+
+// Bit-state search.
+bool Prog::SearchBitState(const StringPiece& text,
+ const StringPiece& context,
+ Anchor anchor,
+ MatchKind kind,
+ StringPiece* match,
+ int nmatch) {
+ // If full match, we ask for an anchored longest match
+ // and then check that match[0] == text.
+ // So make sure match[0] exists.
+ StringPiece sp0;
+ if (kind == kFullMatch) {
+ anchor = kAnchored;
+ if (nmatch < 1) {
+ match = &sp0;
+ nmatch = 1;
+ }
+ }
+
+ // Run the search.
+ BitState b(this);
+ bool anchored = anchor == kAnchored;
+ bool longest = kind != kFirstMatch;
+ if (!b.Search(text, context, anchored, longest, match, nmatch))
+ return false;
if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
- return false;
- return true;
-}
-
-} // namespace re2
+ return false;
+ return true;
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/compile.cc b/contrib/libs/re2/re2/compile.cc
index 36c902044b..61d801a630 100644
--- a/contrib/libs/re2/re2/compile.cc
+++ b/contrib/libs/re2/re2/compile.cc
@@ -1,13 +1,13 @@
-// Copyright 2007 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Compile regular expression to Prog.
-//
-// Prog and Inst are defined in prog.h.
-// This file's external interface is just Regexp::CompileToProg.
-// The Compiler class defined in this file is private.
-
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Compile regular expression to Prog.
+//
+// Prog and Inst are defined in prog.h.
+// This file's external interface is just Regexp::CompileToProg.
+// The Compiler class defined in this file is private.
+
#include <stdint.h>
#include <string.h>
#include <unordered_map>
@@ -16,32 +16,32 @@
#include "util/logging.h"
#include "util/utf.h"
#include "re2/pod_array.h"
-#include "re2/prog.h"
+#include "re2/prog.h"
#include "re2/re2.h"
-#include "re2/regexp.h"
-#include "re2/walker-inl.h"
-
-namespace re2 {
-
-// List of pointers to Inst* that need to be filled in (patched).
-// Because the Inst* haven't been filled in yet,
-// we can use the Inst* word to hold the list's "next" pointer.
-// It's kind of sleazy, but it works well in practice.
-// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
-//
-// Because the out and out1 fields in Inst are no longer pointers,
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// List of pointers to Inst* that need to be filled in (patched).
+// Because the Inst* haven't been filled in yet,
+// we can use the Inst* word to hold the list's "next" pointer.
+// It's kind of sleazy, but it works well in practice.
+// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
+//
+// Because the out and out1 fields in Inst are no longer pointers,
// we can't use pointers directly here either. Instead, head refers
// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
// head == 0 represents the NULL list. This is okay because instruction #0
-// is always the fail instruction, which never appears on a list.
-struct PatchList {
- // Returns patch list containing just p.
+// is always the fail instruction, which never appears on a list.
+struct PatchList {
+ // Returns patch list containing just p.
static PatchList Mk(uint32_t p) {
return {p, p};
}
-
+
// Patches all the entries on l to have value p.
- // Caller must not ever use patch list again.
+ // Caller must not ever use patch list again.
static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
while (l.head != 0) {
Prog::Inst* ip = &inst0[l.head>>1];
@@ -52,9 +52,9 @@ struct PatchList {
l.head = ip->out();
ip->set_out(p);
}
- }
- }
-
+ }
+ }
+
// Appends two patch lists and returns result.
static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
if (l1.head == 0)
@@ -67,113 +67,113 @@ struct PatchList {
else
ip->set_out(l2.head);
return {l1.head, l2.tail};
- }
-
+ }
+
uint32_t head;
uint32_t tail; // for constant-time append
};
-
+
static const PatchList kNullPatchList = {0, 0};
-
-// Compiled program fragment.
-struct Frag {
+
+// Compiled program fragment.
+struct Frag {
uint32_t begin;
- PatchList end;
+ PatchList end;
bool nullable;
-
+
Frag() : begin(0), end(kNullPatchList), nullable(false) {}
Frag(uint32_t begin, PatchList end, bool nullable)
: begin(begin), end(end), nullable(nullable) {}
-};
-
-// Input encodings.
-enum Encoding {
- kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
+};
+
+// Input encodings.
+enum Encoding {
+ kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
kEncodingLatin1, // Latin-1 (0-FF)
-};
-
-class Compiler : public Regexp::Walker<Frag> {
- public:
- explicit Compiler();
- ~Compiler();
-
- // Compiles Regexp to a new Prog.
- // Caller is responsible for deleting Prog when finished with it.
- // If reversed is true, compiles for walking over the input
- // string backward (reverses all concatenations).
+};
+
+class Compiler : public Regexp::Walker<Frag> {
+ public:
+ explicit Compiler();
+ ~Compiler();
+
+ // Compiles Regexp to a new Prog.
+ // Caller is responsible for deleting Prog when finished with it.
+ // If reversed is true, compiles for walking over the input
+ // string backward (reverses all concatenations).
static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
-
- // Compiles alternation of all the re to a new Prog.
- // Each re has a match with an id equal to its index in the vector.
+
+ // Compiles alternation of all the re to a new Prog.
+ // Each re has a match with an id equal to its index in the vector.
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
-
- // Interface for Regexp::Walker, which helps traverse the Regexp.
- // The walk is purely post-recursive: given the machines for the
- // children, PostVisit combines them to create the machine for
- // the current node. The child_args are Frags.
- // The Compiler traverses the Regexp parse tree, visiting
- // each node in depth-first order. It invokes PreVisit before
- // visiting the node's children and PostVisit after visiting
- // the children.
- Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
- Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
- int nchild_args);
- Frag ShortVisit(Regexp* re, Frag parent_arg);
- Frag Copy(Frag arg);
-
- // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
- Frag Plus(Frag a, bool nongreedy);
- Frag Star(Frag a, bool nongreedy);
- Frag Quest(Frag a, bool nongreedy);
-
- // Given fragment a, returns (a) capturing as \n.
- Frag Capture(Frag a, int n);
-
- // Given fragments a and b, returns ab; a|b
- Frag Cat(Frag a, Frag b);
- Frag Alt(Frag a, Frag b);
-
- // Returns a fragment that can't match anything.
- Frag NoMatch();
-
- // Returns a fragment that matches the empty string.
+
+ // Interface for Regexp::Walker, which helps traverse the Regexp.
+ // The walk is purely post-recursive: given the machines for the
+ // children, PostVisit combines them to create the machine for
+ // the current node. The child_args are Frags.
+ // The Compiler traverses the Regexp parse tree, visiting
+ // each node in depth-first order. It invokes PreVisit before
+ // visiting the node's children and PostVisit after visiting
+ // the children.
+ Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
+ Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
+ int nchild_args);
+ Frag ShortVisit(Regexp* re, Frag parent_arg);
+ Frag Copy(Frag arg);
+
+ // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
+ Frag Plus(Frag a, bool nongreedy);
+ Frag Star(Frag a, bool nongreedy);
+ Frag Quest(Frag a, bool nongreedy);
+
+ // Given fragment a, returns (a) capturing as \n.
+ Frag Capture(Frag a, int n);
+
+ // Given fragments a and b, returns ab; a|b
+ Frag Cat(Frag a, Frag b);
+ Frag Alt(Frag a, Frag b);
+
+ // Returns a fragment that can't match anything.
+ Frag NoMatch();
+
+ // Returns a fragment that matches the empty string.
Frag Match(int32_t id);
-
- // Returns a no-op fragment.
- Frag Nop();
-
- // Returns a fragment matching the byte range lo-hi.
- Frag ByteRange(int lo, int hi, bool foldcase);
-
- // Returns a fragment matching an empty-width special op.
- Frag EmptyWidth(EmptyOp op);
-
- // Adds n instructions to the program.
- // Returns the index of the first one.
- // Returns -1 if no more instructions are available.
- int AllocInst(int n);
-
- // Rune range compiler.
-
- // Begins a new alternation.
- void BeginRange();
-
- // Adds a fragment matching the rune range lo-hi.
- void AddRuneRange(Rune lo, Rune hi, bool foldcase);
- void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
- void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
- void Add_80_10ffff();
-
- // New suffix that matches the byte range lo-hi, then goes to next.
+
+ // Returns a no-op fragment.
+ Frag Nop();
+
+ // Returns a fragment matching the byte range lo-hi.
+ Frag ByteRange(int lo, int hi, bool foldcase);
+
+ // Returns a fragment matching an empty-width special op.
+ Frag EmptyWidth(EmptyOp op);
+
+ // Adds n instructions to the program.
+ // Returns the index of the first one.
+ // Returns -1 if no more instructions are available.
+ int AllocInst(int n);
+
+ // Rune range compiler.
+
+ // Begins a new alternation.
+ void BeginRange();
+
+ // Adds a fragment matching the rune range lo-hi.
+ void AddRuneRange(Rune lo, Rune hi, bool foldcase);
+ void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
+ void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
+ void Add_80_10ffff();
+
+ // New suffix that matches the byte range lo-hi, then goes to next.
int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
-
+
// Returns true iff the suffix is cached.
bool IsCachedRuneByteSuffix(int id);
- // Adds a suffix to alternation.
- void AddSuffix(int id);
-
+ // Adds a suffix to alternation.
+ void AddSuffix(int id);
+
// Adds a suffix to the trie starting from the given root node.
// Returns zero iff allocating an instruction fails. Otherwise, returns
// the current root node, which might be different from what was given.
@@ -187,62 +187,62 @@ class Compiler : public Regexp::Walker<Frag> {
// Compares two ByteRanges and returns true iff they are equal.
bool ByteRangeEqual(int id1, int id2);
- // Returns the alternation of all the added suffixes.
- Frag EndRange();
-
- // Single rune.
- Frag Literal(Rune r, bool foldcase);
-
+ // Returns the alternation of all the added suffixes.
+ Frag EndRange();
+
+ // Single rune.
+ Frag Literal(Rune r, bool foldcase);
+
void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
Prog* Finish(Regexp* re);
-
- // Returns .* where dot = any byte
- Frag DotStar();
-
- private:
- Prog* prog_; // Program being built.
- bool failed_; // Did we give up compiling?
- Encoding encoding_; // Input encoding
- bool reversed_; // Should program run backward over text?
-
+
+ // Returns .* where dot = any byte
+ Frag DotStar();
+
+ private:
+ Prog* prog_; // Program being built.
+ bool failed_; // Did we give up compiling?
+ Encoding encoding_; // Input encoding
+ bool reversed_; // Should program run backward over text?
+
PODArray<Prog::Inst> inst_;
int ninst_; // Number of instructions used.
int max_ninst_; // Maximum number of instructions.
-
+
int64_t max_mem_; // Total memory budget.
-
+
std::unordered_map<uint64_t, int> rune_cache_;
- Frag rune_range_;
-
- RE2::Anchor anchor_; // anchor mode for RE2::Set
-
+ Frag rune_range_;
+
+ RE2::Anchor anchor_; // anchor mode for RE2::Set
+
Compiler(const Compiler&) = delete;
Compiler& operator=(const Compiler&) = delete;
-};
-
-Compiler::Compiler() {
- prog_ = new Prog();
- failed_ = false;
- encoding_ = kEncodingUTF8;
- reversed_ = false;
+};
+
+Compiler::Compiler() {
+ prog_ = new Prog();
+ failed_ = false;
+ encoding_ = kEncodingUTF8;
+ reversed_ = false;
ninst_ = 0;
max_ninst_ = 1; // make AllocInst for fail instruction okay
- max_mem_ = 0;
- int fail = AllocInst(1);
- inst_[fail].InitFail();
+ max_mem_ = 0;
+ int fail = AllocInst(1);
+ inst_[fail].InitFail();
max_ninst_ = 0; // Caller must change
-}
-
-Compiler::~Compiler() {
- delete prog_;
-}
-
-int Compiler::AllocInst(int n) {
+}
+
+Compiler::~Compiler() {
+ delete prog_;
+}
+
+int Compiler::AllocInst(int n) {
if (failed_ || ninst_ + n > max_ninst_) {
- failed_ = true;
- return -1;
- }
-
+ failed_ = true;
+ return -1;
+ }
+
if (ninst_ + n > inst_.size()) {
int cap = inst_.size();
if (cap == 0)
@@ -254,92 +254,92 @@ int Compiler::AllocInst(int n) {
memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
inst_ = std::move(inst);
- }
+ }
int id = ninst_;
ninst_ += n;
- return id;
-}
-
-// These routines are somewhat hard to visualize in text --
-// see http://swtch.com/~rsc/regexp/regexp1.html for
-// pictures explaining what is going on here.
-
-// Returns an unmatchable fragment.
-Frag Compiler::NoMatch() {
+ return id;
+}
+
+// These routines are somewhat hard to visualize in text --
+// see http://swtch.com/~rsc/regexp/regexp1.html for
+// pictures explaining what is going on here.
+
+// Returns an unmatchable fragment.
+Frag Compiler::NoMatch() {
return Frag();
-}
-
-// Is a an unmatchable fragment?
-static bool IsNoMatch(Frag a) {
- return a.begin == 0;
-}
-
-// Given fragments a and b, returns fragment for ab.
-Frag Compiler::Cat(Frag a, Frag b) {
- if (IsNoMatch(a) || IsNoMatch(b))
- return NoMatch();
-
- // Elide no-op.
- Prog::Inst* begin = &inst_[a.begin];
- if (begin->opcode() == kInstNop &&
+}
+
+// Is a an unmatchable fragment?
+static bool IsNoMatch(Frag a) {
+ return a.begin == 0;
+}
+
+// Given fragments a and b, returns fragment for ab.
+Frag Compiler::Cat(Frag a, Frag b) {
+ if (IsNoMatch(a) || IsNoMatch(b))
+ return NoMatch();
+
+ // Elide no-op.
+ Prog::Inst* begin = &inst_[a.begin];
+ if (begin->opcode() == kInstNop &&
a.end.head == (a.begin << 1) &&
- begin->out() == 0) {
+ begin->out() == 0) {
// in case refs to a somewhere
PatchList::Patch(inst_.data(), a.end, b.begin);
- return b;
- }
-
- // To run backward over string, reverse all concatenations.
- if (reversed_) {
+ return b;
+ }
+
+ // To run backward over string, reverse all concatenations.
+ if (reversed_) {
PatchList::Patch(inst_.data(), b.end, a.begin);
return Frag(b.begin, a.end, b.nullable && a.nullable);
- }
-
+ }
+
PatchList::Patch(inst_.data(), a.end, b.begin);
return Frag(a.begin, b.end, a.nullable && b.nullable);
-}
-
-// Given fragments for a and b, returns fragment for a|b.
-Frag Compiler::Alt(Frag a, Frag b) {
- // Special case for convenience in loops.
- if (IsNoMatch(a))
- return b;
- if (IsNoMatch(b))
- return a;
-
- int id = AllocInst(1);
- if (id < 0)
- return NoMatch();
-
- inst_[id].InitAlt(a.begin, b.begin);
+}
+
+// Given fragments for a and b, returns fragment for a|b.
+Frag Compiler::Alt(Frag a, Frag b) {
+ // Special case for convenience in loops.
+ if (IsNoMatch(a))
+ return b;
+ if (IsNoMatch(b))
+ return a;
+
+ int id = AllocInst(1);
+ if (id < 0)
+ return NoMatch();
+
+ inst_[id].InitAlt(a.begin, b.begin);
return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
a.nullable || b.nullable);
-}
-
-// When capturing submatches in like-Perl mode, a kOpAlt Inst
-// treats out_ as the first choice, out1_ as the second.
-//
-// For *, +, and ?, if out_ causes another repetition,
-// then the operator is greedy. If out1_ is the repetition
-// (and out_ moves forward), then the operator is non-greedy.
-
+}
+
+// When capturing submatches in like-Perl mode, a kOpAlt Inst
+// treats out_ as the first choice, out1_ as the second.
+//
+// For *, +, and ?, if out_ causes another repetition,
+// then the operator is greedy. If out1_ is the repetition
+// (and out_ moves forward), then the operator is non-greedy.
+
// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Frag Compiler::Plus(Frag a, bool nongreedy) {
- int id = AllocInst(1);
- if (id < 0)
- return NoMatch();
+ int id = AllocInst(1);
+ if (id < 0)
+ return NoMatch();
PatchList pl;
- if (nongreedy) {
+ if (nongreedy) {
inst_[id].InitAlt(0, a.begin);
pl = PatchList::Mk(id << 1);
- } else {
+ } else {
inst_[id].InitAlt(a.begin, 0);
pl = PatchList::Mk((id << 1) | 1);
- }
+ }
PatchList::Patch(inst_.data(), a.end, id);
return Frag(a.begin, pl, a.nullable);
-}
-
+}
+
// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
// When the subexpression is nullable, one Alt isn't enough to guarantee
@@ -361,112 +361,112 @@ Frag Compiler::Star(Frag a, bool nongreedy) {
}
PatchList::Patch(inst_.data(), a.end, id);
return Frag(id, pl, true);
-}
-
-// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
-Frag Compiler::Quest(Frag a, bool nongreedy) {
+}
+
+// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
+Frag Compiler::Quest(Frag a, bool nongreedy) {
if (IsNoMatch(a))
return Nop();
- int id = AllocInst(1);
- if (id < 0)
- return NoMatch();
- PatchList pl;
- if (nongreedy) {
- inst_[id].InitAlt(0, a.begin);
- pl = PatchList::Mk(id << 1);
- } else {
- inst_[id].InitAlt(a.begin, 0);
- pl = PatchList::Mk((id << 1) | 1);
- }
+ int id = AllocInst(1);
+ if (id < 0)
+ return NoMatch();
+ PatchList pl;
+ if (nongreedy) {
+ inst_[id].InitAlt(0, a.begin);
+ pl = PatchList::Mk(id << 1);
+ } else {
+ inst_[id].InitAlt(a.begin, 0);
+ pl = PatchList::Mk((id << 1) | 1);
+ }
return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
-}
-
-// Returns a fragment for the byte range lo-hi.
-Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
- int id = AllocInst(1);
- if (id < 0)
- return NoMatch();
- inst_[id].InitByteRange(lo, hi, foldcase, 0);
+}
+
+// Returns a fragment for the byte range lo-hi.
+Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
+ int id = AllocInst(1);
+ if (id < 0)
+ return NoMatch();
+ inst_[id].InitByteRange(lo, hi, foldcase, 0);
return Frag(id, PatchList::Mk(id << 1), false);
-}
-
-// Returns a no-op fragment. Sometimes unavoidable.
-Frag Compiler::Nop() {
- int id = AllocInst(1);
- if (id < 0)
- return NoMatch();
- inst_[id].InitNop(0);
+}
+
+// Returns a no-op fragment. Sometimes unavoidable.
+Frag Compiler::Nop() {
+ int id = AllocInst(1);
+ if (id < 0)
+ return NoMatch();
+ inst_[id].InitNop(0);
return Frag(id, PatchList::Mk(id << 1), true);
-}
-
-// Returns a fragment that signals a match.
+}
+
+// Returns a fragment that signals a match.
Frag Compiler::Match(int32_t match_id) {
- int id = AllocInst(1);
- if (id < 0)
- return NoMatch();
- inst_[id].InitMatch(match_id);
+ int id = AllocInst(1);
+ if (id < 0)
+ return NoMatch();
+ inst_[id].InitMatch(match_id);
return Frag(id, kNullPatchList, false);
-}
-
-// Returns a fragment matching a particular empty-width op (like ^ or $)
-Frag Compiler::EmptyWidth(EmptyOp empty) {
- int id = AllocInst(1);
- if (id < 0)
- return NoMatch();
- inst_[id].InitEmptyWidth(empty, 0);
+}
+
+// Returns a fragment matching a particular empty-width op (like ^ or $)
+Frag Compiler::EmptyWidth(EmptyOp empty) {
+ int id = AllocInst(1);
+ if (id < 0)
+ return NoMatch();
+ inst_[id].InitEmptyWidth(empty, 0);
return Frag(id, PatchList::Mk(id << 1), true);
-}
-
-// Given a fragment a, returns a fragment with capturing parens around a.
-Frag Compiler::Capture(Frag a, int n) {
+}
+
+// Given a fragment a, returns a fragment with capturing parens around a.
+Frag Compiler::Capture(Frag a, int n) {
if (IsNoMatch(a))
return NoMatch();
- int id = AllocInst(2);
- if (id < 0)
- return NoMatch();
- inst_[id].InitCapture(2*n, a.begin);
- inst_[id+1].InitCapture(2*n+1, 0);
+ int id = AllocInst(2);
+ if (id < 0)
+ return NoMatch();
+ inst_[id].InitCapture(2*n, a.begin);
+ inst_[id+1].InitCapture(2*n+1, 0);
PatchList::Patch(inst_.data(), a.end, id+1);
-
+
return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
-}
-
-// A Rune is a name for a Unicode code point.
-// Returns maximum rune encoded by UTF-8 sequence of length len.
-static int MaxRune(int len) {
+}
+
+// A Rune is a name for a Unicode code point.
+// Returns maximum rune encoded by UTF-8 sequence of length len.
+static int MaxRune(int len) {
int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
- if (len == 1)
- b = 7;
- else
- b = 8-(len+1) + 6*(len-1);
- return (1<<b) - 1; // maximum Rune for b bits.
-}
-
-// The rune range compiler caches common suffix fragments,
-// which are very common in UTF-8 (e.g., [80-bf]).
-// The fragment suffixes are identified by their start
-// instructions. NULL denotes the eventual end match.
-// The Frag accumulates in rune_range_. Caching common
-// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
-// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
-
-void Compiler::BeginRange() {
- rune_cache_.clear();
- rune_range_.begin = 0;
+ if (len == 1)
+ b = 7;
+ else
+ b = 8-(len+1) + 6*(len-1);
+ return (1<<b) - 1; // maximum Rune for b bits.
+}
+
+// The rune range compiler caches common suffix fragments,
+// which are very common in UTF-8 (e.g., [80-bf]).
+// The fragment suffixes are identified by their start
+// instructions. NULL denotes the eventual end match.
+// The Frag accumulates in rune_range_. Caching common
+// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
+// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
+
+void Compiler::BeginRange() {
+ rune_cache_.clear();
+ rune_range_.begin = 0;
rune_range_.end = kNullPatchList;
-}
-
+}
+
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
- int next) {
- Frag f = ByteRange(lo, hi, foldcase);
- if (next != 0) {
+ int next) {
+ Frag f = ByteRange(lo, hi, foldcase);
+ if (next != 0) {
PatchList::Patch(inst_.data(), f.end, next);
- } else {
+ } else {
rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
- }
- return f.begin;
-}
-
+ }
+ return f.begin;
+}
+
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
int next) {
return (uint64_t)next << 17 |
@@ -474,18 +474,18 @@ static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
(uint64_t)hi << 1 |
(uint64_t)foldcase;
}
-
+
int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
int next) {
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
- if (it != rune_cache_.end())
- return it->second;
- int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
- rune_cache_[key] = id;
- return id;
-}
-
+ if (it != rune_cache_.end())
+ return it->second;
+ int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
+ rune_cache_[key] = id;
+ return id;
+}
+
bool Compiler::IsCachedRuneByteSuffix(int id) {
uint8_t lo = inst_[id].lo_;
uint8_t hi = inst_[id].hi_;
@@ -496,30 +496,30 @@ bool Compiler::IsCachedRuneByteSuffix(int id) {
return rune_cache_.find(key) != rune_cache_.end();
}
-void Compiler::AddSuffix(int id) {
+void Compiler::AddSuffix(int id) {
if (failed_)
return;
- if (rune_range_.begin == 0) {
- rune_range_.begin = id;
- return;
- }
-
+ if (rune_range_.begin == 0) {
+ rune_range_.begin = id;
+ return;
+ }
+
if (encoding_ == kEncodingUTF8) {
// Build a trie in order to reduce fanout.
rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
return;
}
- int alt = AllocInst(1);
- if (alt < 0) {
- rune_range_.begin = 0;
- return;
- }
- inst_[alt].InitAlt(rune_range_.begin, id);
- rune_range_.begin = alt;
-}
-
+ int alt = AllocInst(1);
+ if (alt < 0) {
+ rune_range_.begin = 0;
+ return;
+ }
+ inst_[alt].InitAlt(rune_range_.begin, id);
+ rune_range_.begin = alt;
+}
+
int Compiler::AddSuffixRecursive(int root, int id) {
DCHECK(inst_[root].opcode() == kInstAlt ||
inst_[root].opcode() == kInstByteRange);
@@ -616,38 +616,38 @@ Frag Compiler::FindByteRange(int root, int id) {
return NoMatch();
}
-Frag Compiler::EndRange() {
- return rune_range_;
-}
-
-// Converts rune range lo-hi into a fragment that recognizes
-// the bytes that would make up those runes in the current
-// encoding (Latin 1 or UTF-8).
-// This lets the machine work byte-by-byte even when
-// using multibyte encodings.
-
-void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
- switch (encoding_) {
- default:
- case kEncodingUTF8:
- AddRuneRangeUTF8(lo, hi, foldcase);
- break;
- case kEncodingLatin1:
- AddRuneRangeLatin1(lo, hi, foldcase);
- break;
- }
-}
-
-void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
+Frag Compiler::EndRange() {
+ return rune_range_;
+}
+
+// Converts rune range lo-hi into a fragment that recognizes
+// the bytes that would make up those runes in the current
+// encoding (Latin 1 or UTF-8).
+// This lets the machine work byte-by-byte even when
+// using multibyte encodings.
+
+void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
+ switch (encoding_) {
+ default:
+ case kEncodingUTF8:
+ AddRuneRangeUTF8(lo, hi, foldcase);
+ break;
+ case kEncodingLatin1:
+ AddRuneRangeLatin1(lo, hi, foldcase);
+ break;
+ }
+}
+
+void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
// Latin-1 is easy: runes *are* bytes.
- if (lo > hi || lo > 0xFF)
- return;
- if (hi > 0xFF)
- hi = 0xFF;
+ if (lo > hi || lo > 0xFF)
+ return;
+ if (hi > 0xFF)
+ hi = 0xFF;
AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
static_cast<uint8_t>(hi), foldcase, 0));
-}
-
+}
+
void Compiler::Add_80_10ffff() {
// The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
// (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
@@ -661,12 +661,12 @@ void Compiler::Add_80_10ffff() {
id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
AddSuffix(id);
-
+
id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
AddSuffix(id);
-
+
id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
@@ -677,7 +677,7 @@ void Compiler::Add_80_10ffff() {
int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
AddSuffix(id);
-
+
int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
AddSuffix(id);
@@ -685,60 +685,60 @@ void Compiler::Add_80_10ffff() {
int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
AddSuffix(id);
- }
-}
-
-void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
- if (lo > hi)
- return;
-
+ }
+}
+
+void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
+ if (lo > hi)
+ return;
+
// Pick off 80-10FFFF as a common special case.
if (lo == 0x80 && hi == 0x10ffff) {
- Add_80_10ffff();
- return;
- }
-
- // Split range into same-length sized ranges.
- for (int i = 1; i < UTFmax; i++) {
- Rune max = MaxRune(i);
- if (lo <= max && max < hi) {
- AddRuneRangeUTF8(lo, max, foldcase);
- AddRuneRangeUTF8(max+1, hi, foldcase);
- return;
- }
- }
-
- // ASCII range is always a special case.
- if (hi < Runeself) {
+ Add_80_10ffff();
+ return;
+ }
+
+ // Split range into same-length sized ranges.
+ for (int i = 1; i < UTFmax; i++) {
+ Rune max = MaxRune(i);
+ if (lo <= max && max < hi) {
+ AddRuneRangeUTF8(lo, max, foldcase);
+ AddRuneRangeUTF8(max+1, hi, foldcase);
+ return;
+ }
+ }
+
+ // ASCII range is always a special case.
+ if (hi < Runeself) {
AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
static_cast<uint8_t>(hi), foldcase, 0));
- return;
- }
-
- // Split range into sections that agree on leading bytes.
- for (int i = 1; i < UTFmax; i++) {
+ return;
+ }
+
+ // Split range into sections that agree on leading bytes.
+ for (int i = 1; i < UTFmax; i++) {
uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
- if ((lo & ~m) != (hi & ~m)) {
- if ((lo & m) != 0) {
- AddRuneRangeUTF8(lo, lo|m, foldcase);
- AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
- return;
- }
- if ((hi & m) != m) {
- AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
- AddRuneRangeUTF8(hi&~m, hi, foldcase);
- return;
- }
- }
- }
-
- // Finally. Generate byte matching equivalent for lo-hi.
+ if ((lo & ~m) != (hi & ~m)) {
+ if ((lo & m) != 0) {
+ AddRuneRangeUTF8(lo, lo|m, foldcase);
+ AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
+ return;
+ }
+ if ((hi & m) != m) {
+ AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
+ AddRuneRangeUTF8(hi&~m, hi, foldcase);
+ return;
+ }
+ }
+ }
+
+ // Finally. Generate byte matching equivalent for lo-hi.
uint8_t ulo[UTFmax], uhi[UTFmax];
- int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
- int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
- (void)m; // USED(m)
- DCHECK_EQ(n, m);
-
+ int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
+ int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
+ (void)m; // USED(m)
+ DCHECK_EQ(n, m);
+
// The logic below encodes this thinking:
//
// 1. When we have built the whole suffix, we know that it cannot
@@ -763,8 +763,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
// is more likely so; in reverse mode, a byte range is unlikely to
// be part of a common suffix whereas a single byte is more likely
// so. The same benefit versus cost argument applies here.
- int id = 0;
- if (reversed_) {
+ int id = 0;
+ if (reversed_) {
for (int i = 0; i < n; i++) {
// In reverse UTF-8 mode: cache the leading byte; don't cache the last
// continuation byte; cache anything else iff it's a single byte (XX-XX).
@@ -773,7 +773,7 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
else
id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
}
- } else {
+ } else {
for (int i = n-1; i >= 0; i--) {
// In forward UTF-8 mode: don't cache the leading byte; cache the last
// continuation byte; cache anything else iff it's a byte range (XX-YY).
@@ -782,206 +782,206 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
else
id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
}
- }
- AddSuffix(id);
-}
-
-// Should not be called.
-Frag Compiler::Copy(Frag arg) {
- // We're using WalkExponential; there should be no copying.
- LOG(DFATAL) << "Compiler::Copy called!";
- failed_ = true;
- return NoMatch();
-}
-
-// Visits a node quickly; called once WalkExponential has
-// decided to cut this walk short.
-Frag Compiler::ShortVisit(Regexp* re, Frag) {
- failed_ = true;
- return NoMatch();
-}
-
-// Called before traversing a node's children during the walk.
-Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
- // Cut off walk if we've already failed.
- if (failed_)
- *stop = true;
-
+ }
+ AddSuffix(id);
+}
+
+// Should not be called.
+Frag Compiler::Copy(Frag arg) {
+ // We're using WalkExponential; there should be no copying.
+ LOG(DFATAL) << "Compiler::Copy called!";
+ failed_ = true;
+ return NoMatch();
+}
+
+// Visits a node quickly; called once WalkExponential has
+// decided to cut this walk short.
+Frag Compiler::ShortVisit(Regexp* re, Frag) {
+ failed_ = true;
+ return NoMatch();
+}
+
+// Called before traversing a node's children during the walk.
+Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
+ // Cut off walk if we've already failed.
+ if (failed_)
+ *stop = true;
+
return Frag(); // not used by caller
-}
-
-Frag Compiler::Literal(Rune r, bool foldcase) {
- switch (encoding_) {
- default:
+}
+
+Frag Compiler::Literal(Rune r, bool foldcase) {
+ switch (encoding_) {
+ default:
return Frag();
-
- case kEncodingLatin1:
- return ByteRange(r, r, foldcase);
-
- case kEncodingUTF8: {
- if (r < Runeself) // Make common case fast.
- return ByteRange(r, r, foldcase);
+
+ case kEncodingLatin1:
+ return ByteRange(r, r, foldcase);
+
+ case kEncodingUTF8: {
+ if (r < Runeself) // Make common case fast.
+ return ByteRange(r, r, foldcase);
uint8_t buf[UTFmax];
- int n = runetochar(reinterpret_cast<char*>(buf), &r);
+ int n = runetochar(reinterpret_cast<char*>(buf), &r);
Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
- for (int i = 1; i < n; i++)
+ for (int i = 1; i < n; i++)
f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
- return f;
- }
- }
-}
-
-// Called after traversing the node's children during the walk.
-// Given their frags, build and return the frag for this re.
-Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
- int nchild_frags) {
- // If a child failed, don't bother going forward, especially
- // since the child_frags might contain Frags with NULLs in them.
- if (failed_)
- return NoMatch();
-
- // Given the child fragments, return the fragment for this node.
- switch (re->op()) {
- case kRegexpRepeat:
- // Should not see; code at bottom of function will print error
- break;
-
- case kRegexpNoMatch:
- return NoMatch();
-
- case kRegexpEmptyMatch:
- return Nop();
-
- case kRegexpHaveMatch: {
- Frag f = Match(re->match_id());
+ return f;
+ }
+ }
+}
+
+// Called after traversing the node's children during the walk.
+// Given their frags, build and return the frag for this re.
+Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
+ int nchild_frags) {
+ // If a child failed, don't bother going forward, especially
+ // since the child_frags might contain Frags with NULLs in them.
+ if (failed_)
+ return NoMatch();
+
+ // Given the child fragments, return the fragment for this node.
+ switch (re->op()) {
+ case kRegexpRepeat:
+ // Should not see; code at bottom of function will print error
+ break;
+
+ case kRegexpNoMatch:
+ return NoMatch();
+
+ case kRegexpEmptyMatch:
+ return Nop();
+
+ case kRegexpHaveMatch: {
+ Frag f = Match(re->match_id());
if (anchor_ == RE2::ANCHOR_BOTH) {
// Append \z or else the subexpression will effectively be unanchored.
// Complemented by the UNANCHORED case in CompileSet().
f = Cat(EmptyWidth(kEmptyEndText), f);
}
- return f;
- }
-
- case kRegexpConcat: {
- Frag f = child_frags[0];
- for (int i = 1; i < nchild_frags; i++)
- f = Cat(f, child_frags[i]);
- return f;
- }
-
- case kRegexpAlternate: {
- Frag f = child_frags[0];
- for (int i = 1; i < nchild_frags; i++)
- f = Alt(f, child_frags[i]);
- return f;
- }
-
- case kRegexpStar:
+ return f;
+ }
+
+ case kRegexpConcat: {
+ Frag f = child_frags[0];
+ for (int i = 1; i < nchild_frags; i++)
+ f = Cat(f, child_frags[i]);
+ return f;
+ }
+
+ case kRegexpAlternate: {
+ Frag f = child_frags[0];
+ for (int i = 1; i < nchild_frags; i++)
+ f = Alt(f, child_frags[i]);
+ return f;
+ }
+
+ case kRegexpStar:
return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
-
- case kRegexpPlus:
+
+ case kRegexpPlus:
return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
-
- case kRegexpQuest:
+
+ case kRegexpQuest:
return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
-
- case kRegexpLiteral:
+
+ case kRegexpLiteral:
return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
-
- case kRegexpLiteralString: {
- // Concatenation of literals.
- if (re->nrunes() == 0)
- return Nop();
- Frag f;
- for (int i = 0; i < re->nrunes(); i++) {
+
+ case kRegexpLiteralString: {
+ // Concatenation of literals.
+ if (re->nrunes() == 0)
+ return Nop();
+ Frag f;
+ for (int i = 0; i < re->nrunes(); i++) {
Frag f1 = Literal(re->runes()[i],
(re->parse_flags()&Regexp::FoldCase) != 0);
- if (i == 0)
- f = f1;
- else
- f = Cat(f, f1);
- }
- return f;
- }
-
- case kRegexpAnyChar:
- BeginRange();
- AddRuneRange(0, Runemax, false);
- return EndRange();
-
- case kRegexpAnyByte:
- return ByteRange(0x00, 0xFF, false);
-
- case kRegexpCharClass: {
- CharClass* cc = re->cc();
- if (cc->empty()) {
- // This can't happen.
- LOG(DFATAL) << "No ranges in char class";
- failed_ = true;
- return NoMatch();
- }
-
- // ASCII case-folding optimization: if the char class
- // behaves the same on A-Z as it does on a-z,
- // discard any ranges wholly contained in A-Z
- // and mark the other ranges as foldascii.
- // This reduces the size of a program for
- // (?i)abc from 3 insts per letter to 1 per letter.
- bool foldascii = cc->FoldsASCII();
-
- // Character class is just a big OR of the different
- // character ranges in the class.
- BeginRange();
- for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
- // ASCII case-folding optimization (see above).
- if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
- continue;
-
- // If this range contains all of A-Za-z or none of it,
- // the fold flag is unnecessary; don't bother.
- bool fold = foldascii;
+ if (i == 0)
+ f = f1;
+ else
+ f = Cat(f, f1);
+ }
+ return f;
+ }
+
+ case kRegexpAnyChar:
+ BeginRange();
+ AddRuneRange(0, Runemax, false);
+ return EndRange();
+
+ case kRegexpAnyByte:
+ return ByteRange(0x00, 0xFF, false);
+
+ case kRegexpCharClass: {
+ CharClass* cc = re->cc();
+ if (cc->empty()) {
+ // This can't happen.
+ LOG(DFATAL) << "No ranges in char class";
+ failed_ = true;
+ return NoMatch();
+ }
+
+ // ASCII case-folding optimization: if the char class
+ // behaves the same on A-Z as it does on a-z,
+ // discard any ranges wholly contained in A-Z
+ // and mark the other ranges as foldascii.
+ // This reduces the size of a program for
+ // (?i)abc from 3 insts per letter to 1 per letter.
+ bool foldascii = cc->FoldsASCII();
+
+ // Character class is just a big OR of the different
+ // character ranges in the class.
+ BeginRange();
+ for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
+ // ASCII case-folding optimization (see above).
+ if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
+ continue;
+
+ // If this range contains all of A-Za-z or none of it,
+ // the fold flag is unnecessary; don't bother.
+ bool fold = foldascii;
if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
('Z' < i->lo && i->hi < 'a'))
- fold = false;
-
- AddRuneRange(i->lo, i->hi, fold);
- }
- return EndRange();
- }
-
- case kRegexpCapture:
- // If this is a non-capturing parenthesis -- (?:foo) --
- // just use the inner expression.
- if (re->cap() < 0)
- return child_frags[0];
- return Capture(child_frags[0], re->cap());
-
- case kRegexpBeginLine:
- return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
-
- case kRegexpEndLine:
- return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
-
- case kRegexpBeginText:
- return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
-
- case kRegexpEndText:
- return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
-
- case kRegexpWordBoundary:
- return EmptyWidth(kEmptyWordBoundary);
-
- case kRegexpNoWordBoundary:
- return EmptyWidth(kEmptyNonWordBoundary);
- }
- LOG(DFATAL) << "Missing case in Compiler: " << re->op();
- failed_ = true;
- return NoMatch();
-}
-
-// Is this regexp required to start at the beginning of the text?
-// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
-// but handles (\A(a|b)). Could use the Walker to write a more exact one.
+ fold = false;
+
+ AddRuneRange(i->lo, i->hi, fold);
+ }
+ return EndRange();
+ }
+
+ case kRegexpCapture:
+ // If this is a non-capturing parenthesis -- (?:foo) --
+ // just use the inner expression.
+ if (re->cap() < 0)
+ return child_frags[0];
+ return Capture(child_frags[0], re->cap());
+
+ case kRegexpBeginLine:
+ return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
+
+ case kRegexpEndLine:
+ return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
+
+ case kRegexpBeginText:
+ return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
+
+ case kRegexpEndText:
+ return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
+
+ case kRegexpWordBoundary:
+ return EmptyWidth(kEmptyWordBoundary);
+
+ case kRegexpNoWordBoundary:
+ return EmptyWidth(kEmptyNonWordBoundary);
+ }
+ LOG(DFATAL) << "Missing case in Compiler: " << re->op();
+ failed_ = true;
+ return NoMatch();
+}
+
+// Is this regexp required to start at the beginning of the text?
+// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
+// but handles (\A(a|b)). Could use the Walker to write a more exact one.
static bool IsAnchorStart(Regexp** pre, int depth) {
Regexp* re = *pre;
Regexp* sub;
@@ -1005,7 +1005,7 @@ static bool IsAnchorStart(Regexp** pre, int depth) {
*pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
re->Decref();
return true;
- }
+ }
sub->Decref();
}
break;
@@ -1013,8 +1013,8 @@ static bool IsAnchorStart(Regexp** pre, int depth) {
sub = re->sub()[0]->Incref();
if (IsAnchorStart(&sub, depth+1)) {
*pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
- re->Decref();
- return true;
+ re->Decref();
+ return true;
}
sub->Decref();
break;
@@ -1022,13 +1022,13 @@ static bool IsAnchorStart(Regexp** pre, int depth) {
*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
re->Decref();
return true;
- }
+ }
return false;
-}
-
-// Is this regexp required to start at the end of the text?
-// Only approximate; can return false for complicated regexps like (a\z|b\z),
-// but handles ((a|b)\z). Could use the Walker to write a more exact one.
+}
+
+// Is this regexp required to start at the end of the text?
+// Only approximate; can return false for complicated regexps like (a\z|b\z),
+// but handles ((a|b)\z). Could use the Walker to write a more exact one.
static bool IsAnchorEnd(Regexp** pre, int depth) {
Regexp* re = *pre;
Regexp* sub;
@@ -1052,7 +1052,7 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
*pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
re->Decref();
return true;
- }
+ }
sub->Decref();
}
break;
@@ -1060,8 +1060,8 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
sub = re->sub()[0]->Incref();
if (IsAnchorEnd(&sub, depth+1)) {
*pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
- re->Decref();
- return true;
+ re->Decref();
+ return true;
}
sub->Decref();
break;
@@ -1069,110 +1069,110 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
*pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
re->Decref();
return true;
- }
+ }
return false;
-}
-
+}
+
void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
- RE2::Anchor anchor) {
- if (flags & Regexp::Latin1)
- encoding_ = kEncodingLatin1;
- max_mem_ = max_mem;
- if (max_mem <= 0) {
+ RE2::Anchor anchor) {
+ if (flags & Regexp::Latin1)
+ encoding_ = kEncodingLatin1;
+ max_mem_ = max_mem;
+ if (max_mem <= 0) {
max_ninst_ = 100000; // more than enough
} else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
- // No room for anything.
+ // No room for anything.
max_ninst_ = 0;
- } else {
+ } else {
int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
- // Limit instruction count so that inst->id() fits nicely in an int.
- // SparseArray also assumes that the indices (inst->id()) are ints.
+ // Limit instruction count so that inst->id() fits nicely in an int.
+ // SparseArray also assumes that the indices (inst->id()) are ints.
// The call to WalkExponential uses 2*max_ninst_ below,
- // and other places in the code use 2 or 3 * prog->size().
- // Limiting to 2^24 should avoid overflow in those places.
- // (The point of allowing more than 32 bits of memory is to
- // have plenty of room for the DFA states, not to use it up
- // on the program.)
- if (m >= 1<<24)
- m = 1<<24;
- // Inst imposes its own limit (currently bigger than 2^24 but be safe).
- if (m > Prog::Inst::kMaxInst)
- m = Prog::Inst::kMaxInst;
+ // and other places in the code use 2 or 3 * prog->size().
+ // Limiting to 2^24 should avoid overflow in those places.
+ // (The point of allowing more than 32 bits of memory is to
+ // have plenty of room for the DFA states, not to use it up
+ // on the program.)
+ if (m >= 1<<24)
+ m = 1<<24;
+ // Inst imposes its own limit (currently bigger than 2^24 but be safe).
+ if (m > Prog::Inst::kMaxInst)
+ m = Prog::Inst::kMaxInst;
max_ninst_ = static_cast<int>(m);
- }
- anchor_ = anchor;
-}
-
-// Compiles re, returning program.
-// Caller is responsible for deleting prog_.
-// If reversed is true, compiles a program that expects
-// to run over the input string backward (reverses all concatenations).
-// The reversed flag is also recorded in the returned program.
+ }
+ anchor_ = anchor;
+}
+
+// Compiles re, returning program.
+// Caller is responsible for deleting prog_.
+// If reversed is true, compiles a program that expects
+// to run over the input string backward (reverses all concatenations).
+// The reversed flag is also recorded in the returned program.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
- Compiler c;
+ Compiler c;
c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
- c.reversed_ = reversed;
-
- // Simplify to remove things like counted repetitions
- // and character classes like \d.
- Regexp* sre = re->Simplify();
- if (sre == NULL)
- return NULL;
-
- // Record whether prog is anchored, removing the anchors.
- // (They get in the way of other optimizations.)
+ c.reversed_ = reversed;
+
+ // Simplify to remove things like counted repetitions
+ // and character classes like \d.
+ Regexp* sre = re->Simplify();
+ if (sre == NULL)
+ return NULL;
+
+ // Record whether prog is anchored, removing the anchors.
+ // (They get in the way of other optimizations.)
bool is_anchor_start = IsAnchorStart(&sre, 0);
bool is_anchor_end = IsAnchorEnd(&sre, 0);
-
- // Generate fragment for entire regexp.
+
+ // Generate fragment for entire regexp.
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
- sre->Decref();
- if (c.failed_)
- return NULL;
-
- // Success! Finish by putting Match node at end, and record start.
- // Turn off c.reversed_ (if it is set) to force the remaining concatenations
- // to behave normally.
- c.reversed_ = false;
+ sre->Decref();
+ if (c.failed_)
+ return NULL;
+
+ // Success! Finish by putting Match node at end, and record start.
+ // Turn off c.reversed_ (if it is set) to force the remaining concatenations
+ // to behave normally.
+ c.reversed_ = false;
all = c.Cat(all, c.Match(0));
-
+
c.prog_->set_reversed(reversed);
if (c.prog_->reversed()) {
- c.prog_->set_anchor_start(is_anchor_end);
- c.prog_->set_anchor_end(is_anchor_start);
- } else {
- c.prog_->set_anchor_start(is_anchor_start);
- c.prog_->set_anchor_end(is_anchor_end);
- }
-
+ c.prog_->set_anchor_start(is_anchor_end);
+ c.prog_->set_anchor_end(is_anchor_start);
+ } else {
+ c.prog_->set_anchor_start(is_anchor_start);
+ c.prog_->set_anchor_end(is_anchor_end);
+ }
+
c.prog_->set_start(all.begin);
if (!c.prog_->anchor_start()) {
// Also create unanchored version, which starts with a .*? loop.
all = c.Cat(c.DotStar(), all);
- }
+ }
c.prog_->set_start_unanchored(all.begin);
-
- // Hand ownership of prog_ to caller.
+
+ // Hand ownership of prog_ to caller.
return c.Finish(re);
-}
-
+}
+
Prog* Compiler::Finish(Regexp* re) {
- if (failed_)
- return NULL;
-
- if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
- // No possible matches; keep Fail instruction only.
+ if (failed_)
+ return NULL;
+
+ if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
+ // No possible matches; keep Fail instruction only.
ninst_ = 1;
- }
-
+ }
+
// Hand off the array to Prog.
prog_->inst_ = std::move(inst_);
prog_->size_ = ninst_;
-
+
prog_->Optimize();
prog_->Flatten();
- prog_->ComputeByteMap();
-
+ prog_->ComputeByteMap();
+
if (!prog_->reversed()) {
std::string prefix;
bool prefix_foldcase;
@@ -1180,82 +1180,82 @@ Prog* Compiler::Finish(Regexp* re) {
prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
}
- // Record remaining memory for DFA.
- if (max_mem_ <= 0) {
- prog_->set_dfa_mem(1<<20);
- } else {
+ // Record remaining memory for DFA.
+ if (max_mem_ <= 0) {
+ prog_->set_dfa_mem(1<<20);
+ } else {
int64_t m = max_mem_ - sizeof(Prog);
m -= prog_->size_*sizeof(Prog::Inst); // account for inst_
if (prog_->CanBitState())
m -= prog_->size_*sizeof(uint16_t); // account for list_heads_
- if (m < 0)
- m = 0;
- prog_->set_dfa_mem(m);
- }
-
- Prog* p = prog_;
- prog_ = NULL;
- return p;
-}
-
-// Converts Regexp to Prog.
+ if (m < 0)
+ m = 0;
+ prog_->set_dfa_mem(m);
+ }
+
+ Prog* p = prog_;
+ prog_ = NULL;
+ return p;
+}
+
+// Converts Regexp to Prog.
Prog* Regexp::CompileToProg(int64_t max_mem) {
- return Compiler::Compile(this, false, max_mem);
-}
-
+ return Compiler::Compile(this, false, max_mem);
+}
+
Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
- return Compiler::Compile(this, true, max_mem);
-}
-
-Frag Compiler::DotStar() {
- return Star(ByteRange(0x00, 0xff, false), true);
-}
-
-// Compiles RE set to Prog.
+ return Compiler::Compile(this, true, max_mem);
+}
+
+Frag Compiler::DotStar() {
+ return Star(ByteRange(0x00, 0xff, false), true);
+}
+
+// Compiles RE set to Prog.
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
- Compiler c;
+ Compiler c;
c.Setup(re->parse_flags(), max_mem, anchor);
-
+
Regexp* sre = re->Simplify();
if (sre == NULL)
return NULL;
-
+
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
sre->Decref();
- if (c.failed_)
- return NULL;
-
+ if (c.failed_)
+ return NULL;
+
c.prog_->set_anchor_start(true);
c.prog_->set_anchor_end(true);
- if (anchor == RE2::UNANCHORED) {
+ if (anchor == RE2::UNANCHORED) {
// Prepend .* or else the expression will effectively be anchored.
// Complemented by the ANCHOR_BOTH case in PostVisit().
- all = c.Cat(c.DotStar(), all);
- }
- c.prog_->set_start(all.begin);
- c.prog_->set_start_unanchored(all.begin);
-
+ all = c.Cat(c.DotStar(), all);
+ }
+ c.prog_->set_start(all.begin);
+ c.prog_->set_start_unanchored(all.begin);
+
Prog* prog = c.Finish(re);
- if (prog == NULL)
- return NULL;
-
- // Make sure DFA has enough memory to operate,
- // since we're not going to fall back to the NFA.
+ if (prog == NULL)
+ return NULL;
+
+ // Make sure DFA has enough memory to operate,
+ // since we're not going to fall back to the NFA.
bool dfa_failed = false;
- StringPiece sp = "hello, world";
- prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
+ StringPiece sp = "hello, world";
+ prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
NULL, &dfa_failed, NULL);
if (dfa_failed) {
- delete prog;
- return NULL;
- }
-
- return prog;
-}
-
+ delete prog;
+ return NULL;
+ }
+
+ return prog;
+}
+
Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
return Compiler::CompileSet(re, anchor, max_mem);
-}
-
-} // namespace re2
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/dfa.cc b/contrib/libs/re2/re2/dfa.cc
index c02e5730cc..d47c7d50a7 100644
--- a/contrib/libs/re2/re2/dfa.cc
+++ b/contrib/libs/re2/re2/dfa.cc
@@ -1,26 +1,26 @@
-// Copyright 2008 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// A DFA (deterministic finite automaton)-based regular expression search.
-//
-// The DFA search has two main parts: the construction of the automaton,
-// which is represented by a graph of State structures, and the execution
-// of the automaton over a given input string.
-//
-// The basic idea is that the State graph is constructed so that the
-// execution can simply start with a state s, and then for each byte c in
-// the input string, execute "s = s->next[c]", checking at each point whether
-// the current s represents a matching state.
-//
-// The simple explanation just given does convey the essence of this code,
-// but it omits the details of how the State graph gets constructed as well
-// as some performance-driven optimizations to the execution of the automaton.
-// All these details are explained in the comments for the code following
-// the definition of class DFA.
-//
-// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
-
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// A DFA (deterministic finite automaton)-based regular expression search.
+//
+// The DFA search has two main parts: the construction of the automaton,
+// which is represented by a graph of State structures, and the execution
+// of the automaton over a given input string.
+//
+// The basic idea is that the State graph is constructed so that the
+// execution can simply start with a state s, and then for each byte c in
+// the input string, execute "s = s->next[c]", checking at each point whether
+// the current s represents a matching state.
+//
+// The simple explanation just given does convey the essence of this code,
+// but it omits the details of how the State graph gets constructed as well
+// as some performance-driven optimizations to the execution of the automaton.
+// All these details are explained in the comments for the code following
+// the definition of class DFA.
+//
+// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
+
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
@@ -41,18 +41,18 @@
#include "util/mutex.h"
#include "util/strutil.h"
#include "re2/pod_array.h"
-#include "re2/prog.h"
+#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/sparse_set.h"
#include "re2/stringpiece.h"
-
+
// Silence "zero-sized array in struct/union" warning for DFA::State::next_.
#ifdef _MSC_VER
#pragma warning(disable: 4200)
#endif
-
-namespace re2 {
-
+
+namespace re2 {
+
// Controls whether the DFA should bail out early if the NFA would be faster.
static bool dfa_should_bail_when_slow = true;
@@ -60,65 +60,65 @@ void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) {
dfa_should_bail_when_slow = b;
}
-// Changing this to true compiles in prints that trace execution of the DFA.
-// Generates a lot of output -- only useful for debugging.
+// Changing this to true compiles in prints that trace execution of the DFA.
+// Generates a lot of output -- only useful for debugging.
static const bool ExtraDebug = false;
-
-// A DFA implementation of a regular expression program.
-// Since this is entirely a forward declaration mandated by C++,
-// some of the comments here are better understood after reading
-// the comments in the sections that follow the DFA definition.
-class DFA {
- public:
+
+// A DFA implementation of a regular expression program.
+// Since this is entirely a forward declaration mandated by C++,
+// some of the comments here are better understood after reading
+// the comments in the sections that follow the DFA definition.
+class DFA {
+ public:
DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem);
- ~DFA();
- bool ok() const { return !init_failed_; }
- Prog::MatchKind kind() { return kind_; }
-
- // Searches for the regular expression in text, which is considered
- // as a subsection of context for the purposes of interpreting flags
- // like ^ and $ and \A and \z.
- // Returns whether a match was found.
- // If a match is found, sets *ep to the end point of the best match in text.
- // If "anchored", the match must begin at the start of text.
- // If "want_earliest_match", the match that ends first is used, not
- // necessarily the best one.
- // If "run_forward" is true, the DFA runs from text.begin() to text.end().
- // If it is false, the DFA runs from text.end() to text.begin(),
- // returning the leftmost end of the match instead of the rightmost one.
- // If the DFA cannot complete the search (for example, if it is out of
- // memory), it sets *failed and returns false.
- bool Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool want_earliest_match, bool run_forward,
+ ~DFA();
+ bool ok() const { return !init_failed_; }
+ Prog::MatchKind kind() { return kind_; }
+
+ // Searches for the regular expression in text, which is considered
+ // as a subsection of context for the purposes of interpreting flags
+ // like ^ and $ and \A and \z.
+ // Returns whether a match was found.
+ // If a match is found, sets *ep to the end point of the best match in text.
+ // If "anchored", the match must begin at the start of text.
+ // If "want_earliest_match", the match that ends first is used, not
+ // necessarily the best one.
+ // If "run_forward" is true, the DFA runs from text.begin() to text.end().
+ // If it is false, the DFA runs from text.end() to text.begin(),
+ // returning the leftmost end of the match instead of the rightmost one.
+ // If the DFA cannot complete the search (for example, if it is out of
+ // memory), it sets *failed and returns false.
+ bool Search(const StringPiece& text, const StringPiece& context,
+ bool anchored, bool want_earliest_match, bool run_forward,
bool* failed, const char** ep, SparseSet* matches);
-
+
// Builds out all states for the entire DFA.
// If cb is not empty, it receives one callback per state built.
// Returns the number of states built.
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
int BuildAllStates(const Prog::DFAStateCallback& cb);
-
- // Computes min and max for matching strings. Won't return strings
- // bigger than maxlen.
+
+ // Computes min and max for matching strings. Won't return strings
+ // bigger than maxlen.
bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
-
- // These data structures are logically private, but C++ makes it too
- // difficult to mark them as such.
- class RWLocker;
- class StateSaver;
+
+ // These data structures are logically private, but C++ makes it too
+ // difficult to mark them as such.
+ class RWLocker;
+ class StateSaver;
class Workq;
-
- // A single DFA state. The DFA is represented as a graph of these
- // States, linked by the next_ pointers. If in state s and reading
- // byte c, the next state should be s->next_[c].
- struct State {
+
+ // A single DFA state. The DFA is represented as a graph of these
+ // States, linked by the next_ pointers. If in state s and reading
+ // byte c, the next state should be s->next_[c].
+ struct State {
inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; }
-
- int* inst_; // Instruction pointers in the state.
- int ninst_; // # of inst_ pointers.
+
+ int* inst_; // Instruction pointers in the state.
+ int ninst_; // # of inst_ pointers.
uint32_t flag_; // Empty string bitfield flags in effect on the way
- // into this state, along with kFlagMatch if this
- // is a matching state.
+ // into this state, along with kFlagMatch if this
+ // is a matching state.
// Work around the bug affecting flexible array members in GCC 6.x (for x >= 1).
// (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932)
@@ -128,18 +128,18 @@ class DFA {
std::atomic<State*> next_[]; // Outgoing arrows from State,
#endif
- // one per input byte class
- };
-
- enum {
- kByteEndText = 256, // imaginary byte at end of text
-
+ // one per input byte class
+ };
+
+ enum {
+ kByteEndText = 256, // imaginary byte at end of text
+
kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags
kFlagMatch = 0x0100, // State.flag_: this is a matching state
kFlagLastWord = 0x0200, // State.flag_: last byte was a word char
- kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left
- };
-
+ kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left
+ };
+
struct StateHash {
size_t operator()(const State* a) const {
DCHECK(a != NULL);
@@ -151,285 +151,285 @@ class DFA {
}
};
- struct StateEqual {
- bool operator()(const State* a, const State* b) const {
+ struct StateEqual {
+ bool operator()(const State* a, const State* b) const {
DCHECK(a != NULL);
DCHECK(b != NULL);
- if (a == b)
- return true;
+ if (a == b)
+ return true;
if (a->flag_ != b->flag_)
- return false;
- if (a->ninst_ != b->ninst_)
- return false;
- for (int i = 0; i < a->ninst_; i++)
- if (a->inst_[i] != b->inst_[i])
- return false;
+ return false;
+ if (a->ninst_ != b->ninst_)
+ return false;
+ for (int i = 0; i < a->ninst_; i++)
+ if (a->inst_[i] != b->inst_[i])
+ return false;
return true;
- }
- };
-
+ }
+ };
+
typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
-
- private:
+
+ private:
// Make it easier to swap in a scalable reader-writer mutex.
using CacheMutex = Mutex;
- enum {
- // Indices into start_ for unanchored searches.
- // Add kStartAnchored for anchored searches.
- kStartBeginText = 0, // text at beginning of context
- kStartBeginLine = 2, // text at beginning of line
- kStartAfterWordChar = 4, // text follows a word character
- kStartAfterNonWordChar = 6, // text follows non-word character
- kMaxStart = 8,
-
- kStartAnchored = 1,
- };
-
- // Resets the DFA State cache, flushing all saved State* information.
- // Releases and reacquires cache_mutex_ via cache_lock, so any
- // State* existing before the call are not valid after the call.
- // Use a StateSaver to preserve important states across the call.
- // cache_mutex_.r <= L < mutex_
- // After: cache_mutex_.w <= L < mutex_
- void ResetCache(RWLocker* cache_lock);
-
- // Looks up and returns the State corresponding to a Workq.
- // L >= mutex_
+ enum {
+ // Indices into start_ for unanchored searches.
+ // Add kStartAnchored for anchored searches.
+ kStartBeginText = 0, // text at beginning of context
+ kStartBeginLine = 2, // text at beginning of line
+ kStartAfterWordChar = 4, // text follows a word character
+ kStartAfterNonWordChar = 6, // text follows non-word character
+ kMaxStart = 8,
+
+ kStartAnchored = 1,
+ };
+
+ // Resets the DFA State cache, flushing all saved State* information.
+ // Releases and reacquires cache_mutex_ via cache_lock, so any
+ // State* existing before the call are not valid after the call.
+ // Use a StateSaver to preserve important states across the call.
+ // cache_mutex_.r <= L < mutex_
+ // After: cache_mutex_.w <= L < mutex_
+ void ResetCache(RWLocker* cache_lock);
+
+ // Looks up and returns the State corresponding to a Workq.
+ // L >= mutex_
State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag);
-
- // Looks up and returns a State matching the inst, ninst, and flag.
- // L >= mutex_
+
+ // Looks up and returns a State matching the inst, ninst, and flag.
+ // L >= mutex_
State* CachedState(int* inst, int ninst, uint32_t flag);
-
- // Clear the cache entirely.
- // Must hold cache_mutex_.w or be in destructor.
- void ClearCache();
-
- // Converts a State into a Workq: the opposite of WorkqToCachedState.
- // L >= mutex_
+
+ // Clear the cache entirely.
+ // Must hold cache_mutex_.w or be in destructor.
+ void ClearCache();
+
+ // Converts a State into a Workq: the opposite of WorkqToCachedState.
+ // L >= mutex_
void StateToWorkq(State* s, Workq* q);
-
- // Runs a State on a given byte, returning the next state.
- State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_
- State* RunStateOnByte(State*, int); // L >= mutex_
-
- // Runs a Workq on a given byte followed by a set of empty-string flags,
- // producing a new Workq in nq. If a match instruction is encountered,
- // sets *ismatch to true.
- // L >= mutex_
- void RunWorkqOnByte(Workq* q, Workq* nq,
+
+ // Runs a State on a given byte, returning the next state.
+ State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_
+ State* RunStateOnByte(State*, int); // L >= mutex_
+
+ // Runs a Workq on a given byte followed by a set of empty-string flags,
+ // producing a new Workq in nq. If a match instruction is encountered,
+ // sets *ismatch to true.
+ // L >= mutex_
+ void RunWorkqOnByte(Workq* q, Workq* nq,
int c, uint32_t flag, bool* ismatch);
-
- // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
- // L >= mutex_
+
+ // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
+ // L >= mutex_
void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag);
-
- // Adds the instruction id to the Workq, following empty arrows
- // according to flag.
- // L >= mutex_
+
+ // Adds the instruction id to the Workq, following empty arrows
+ // according to flag.
+ // L >= mutex_
void AddToQueue(Workq* q, int id, uint32_t flag);
-
- // For debugging, returns a text representation of State.
+
+ // For debugging, returns a text representation of State.
static std::string DumpState(State* state);
-
- // For debugging, returns a text representation of a Workq.
+
+ // For debugging, returns a text representation of a Workq.
static std::string DumpWorkq(Workq* q);
-
- // Search parameters
- struct SearchParams {
- SearchParams(const StringPiece& text, const StringPiece& context,
- RWLocker* cache_lock)
+
+ // Search parameters
+ struct SearchParams {
+ SearchParams(const StringPiece& text, const StringPiece& context,
+ RWLocker* cache_lock)
: text(text),
context(context),
- anchored(false),
+ anchored(false),
can_prefix_accel(false),
- want_earliest_match(false),
- run_forward(false),
- start(NULL),
- cache_lock(cache_lock),
- failed(false),
- ep(NULL),
+ want_earliest_match(false),
+ run_forward(false),
+ start(NULL),
+ cache_lock(cache_lock),
+ failed(false),
+ ep(NULL),
matches(NULL) {}
-
- StringPiece text;
- StringPiece context;
- bool anchored;
+
+ StringPiece text;
+ StringPiece context;
+ bool anchored;
bool can_prefix_accel;
- bool want_earliest_match;
- bool run_forward;
- State* start;
+ bool want_earliest_match;
+ bool run_forward;
+ State* start;
RWLocker* cache_lock;
- bool failed; // "out" parameter: whether search gave up
- const char* ep; // "out" parameter: end pointer for match
+ bool failed; // "out" parameter: whether search gave up
+ const char* ep; // "out" parameter: end pointer for match
SparseSet* matches;
-
- private:
+
+ private:
SearchParams(const SearchParams&) = delete;
SearchParams& operator=(const SearchParams&) = delete;
- };
-
- // Before each search, the parameters to Search are analyzed by
+ };
+
+ // Before each search, the parameters to Search are analyzed by
// AnalyzeSearch to determine the state in which to start.
- struct StartInfo {
+ struct StartInfo {
StartInfo() : start(NULL) {}
std::atomic<State*> start;
- };
-
+ };
+
// Fills in params->start and params->can_prefix_accel using
- // the other search parameters. Returns true on success,
- // false on failure.
- // cache_mutex_.r <= L < mutex_
- bool AnalyzeSearch(SearchParams* params);
+ // the other search parameters. Returns true on success,
+ // false on failure.
+ // cache_mutex_.r <= L < mutex_
+ bool AnalyzeSearch(SearchParams* params);
bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
uint32_t flags);
-
- // The generic search loop, inlined to create specialized versions.
- // cache_mutex_.r <= L < mutex_
- // Might unlock and relock cache_mutex_ via params->cache_lock.
+
+ // The generic search loop, inlined to create specialized versions.
+ // cache_mutex_.r <= L < mutex_
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
template <bool can_prefix_accel,
bool want_earliest_match,
bool run_forward>
inline bool InlinedSearchLoop(SearchParams* params);
-
- // The specialized versions of InlinedSearchLoop. The three letters
- // at the ends of the name denote the true/false values used as the
- // last three parameters of InlinedSearchLoop.
- // cache_mutex_.r <= L < mutex_
- // Might unlock and relock cache_mutex_ via params->cache_lock.
- bool SearchFFF(SearchParams* params);
- bool SearchFFT(SearchParams* params);
- bool SearchFTF(SearchParams* params);
- bool SearchFTT(SearchParams* params);
- bool SearchTFF(SearchParams* params);
- bool SearchTFT(SearchParams* params);
- bool SearchTTF(SearchParams* params);
- bool SearchTTT(SearchParams* params);
-
- // The main search loop: calls an appropriate specialized version of
- // InlinedSearchLoop.
- // cache_mutex_.r <= L < mutex_
- // Might unlock and relock cache_mutex_ via params->cache_lock.
- bool FastSearchLoop(SearchParams* params);
-
-
- // Looks up bytes in bytemap_ but handles case c == kByteEndText too.
- int ByteMap(int c) {
- if (c == kByteEndText)
- return prog_->bytemap_range();
- return prog_->bytemap()[c];
- }
-
- // Constant after initialization.
- Prog* prog_; // The regular expression program to run.
- Prog::MatchKind kind_; // The kind of DFA.
- bool init_failed_; // initialization failed (out of memory)
-
- Mutex mutex_; // mutex_ >= cache_mutex_.r
-
- // Scratch areas, protected by mutex_.
- Workq* q0_; // Two pre-allocated work queues.
- Workq* q1_;
+
+ // The specialized versions of InlinedSearchLoop. The three letters
+ // at the ends of the name denote the true/false values used as the
+ // last three parameters of InlinedSearchLoop.
+ // cache_mutex_.r <= L < mutex_
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
+ bool SearchFFF(SearchParams* params);
+ bool SearchFFT(SearchParams* params);
+ bool SearchFTF(SearchParams* params);
+ bool SearchFTT(SearchParams* params);
+ bool SearchTFF(SearchParams* params);
+ bool SearchTFT(SearchParams* params);
+ bool SearchTTF(SearchParams* params);
+ bool SearchTTT(SearchParams* params);
+
+ // The main search loop: calls an appropriate specialized version of
+ // InlinedSearchLoop.
+ // cache_mutex_.r <= L < mutex_
+ // Might unlock and relock cache_mutex_ via params->cache_lock.
+ bool FastSearchLoop(SearchParams* params);
+
+
+ // Looks up bytes in bytemap_ but handles case c == kByteEndText too.
+ int ByteMap(int c) {
+ if (c == kByteEndText)
+ return prog_->bytemap_range();
+ return prog_->bytemap()[c];
+ }
+
+ // Constant after initialization.
+ Prog* prog_; // The regular expression program to run.
+ Prog::MatchKind kind_; // The kind of DFA.
+ bool init_failed_; // initialization failed (out of memory)
+
+ Mutex mutex_; // mutex_ >= cache_mutex_.r
+
+ // Scratch areas, protected by mutex_.
+ Workq* q0_; // Two pre-allocated work queues.
+ Workq* q1_;
PODArray<int> stack_; // Pre-allocated stack for AddToQueue
-
- // State* cache. Many threads use and add to the cache simultaneously,
- // holding cache_mutex_ for reading and mutex_ (above) when adding.
- // If the cache fills and needs to be discarded, the discarding is done
- // while holding cache_mutex_ for writing, to avoid interrupting other
- // readers. Any State* pointers are only valid while cache_mutex_
- // is held.
+
+ // State* cache. Many threads use and add to the cache simultaneously,
+ // holding cache_mutex_ for reading and mutex_ (above) when adding.
+ // If the cache fills and needs to be discarded, the discarding is done
+ // while holding cache_mutex_ for writing, to avoid interrupting other
+ // readers. Any State* pointers are only valid while cache_mutex_
+ // is held.
CacheMutex cache_mutex_;
int64_t mem_budget_; // Total memory budget for all States.
int64_t state_budget_; // Amount of memory remaining for new States.
- StateSet state_cache_; // All States computed so far.
- StartInfo start_[kMaxStart];
+ StateSet state_cache_; // All States computed so far.
+ StartInfo start_[kMaxStart];
DFA(const DFA&) = delete;
DFA& operator=(const DFA&) = delete;
-};
-
+};
+
// Shorthand for casting to uint8_t*.
static inline const uint8_t* BytePtr(const void* v) {
return reinterpret_cast<const uint8_t*>(v);
-}
-
-// Work queues
-
-// Marks separate thread groups of different priority
-// in the work queue when in leftmost-longest matching mode.
-#define Mark (-1)
-
+}
+
+// Work queues
+
+// Marks separate thread groups of different priority
+// in the work queue when in leftmost-longest matching mode.
+#define Mark (-1)
+
// Separates the match IDs from the instructions in inst_.
// Used only for "many match" DFA states.
#define MatchSep (-2)
-// Internally, the DFA uses a sparse array of
-// program instruction pointers as a work queue.
-// In leftmost longest mode, marks separate sections
-// of workq that started executing at different
-// locations in the string (earlier locations first).
-class DFA::Workq : public SparseSet {
- public:
- // Constructor: n is number of normal slots, maxmark number of mark slots.
- Workq(int n, int maxmark) :
- SparseSet(n+maxmark),
- n_(n),
- maxmark_(maxmark),
- nextmark_(n),
- last_was_mark_(true) {
- }
-
- bool is_mark(int i) { return i >= n_; }
-
- int maxmark() { return maxmark_; }
-
- void clear() {
- SparseSet::clear();
- nextmark_ = n_;
- }
-
- void mark() {
- if (last_was_mark_)
- return;
- last_was_mark_ = false;
- SparseSet::insert_new(nextmark_++);
- }
-
- int size() {
- return n_ + maxmark_;
- }
-
- void insert(int id) {
- if (contains(id))
- return;
- insert_new(id);
- }
-
- void insert_new(int id) {
- last_was_mark_ = false;
- SparseSet::insert_new(id);
- }
-
- private:
- int n_; // size excluding marks
- int maxmark_; // maximum number of marks
- int nextmark_; // id of next mark
- bool last_was_mark_; // last inserted was mark
+// Internally, the DFA uses a sparse array of
+// program instruction pointers as a work queue.
+// In leftmost longest mode, marks separate sections
+// of workq that started executing at different
+// locations in the string (earlier locations first).
+class DFA::Workq : public SparseSet {
+ public:
+ // Constructor: n is number of normal slots, maxmark number of mark slots.
+ Workq(int n, int maxmark) :
+ SparseSet(n+maxmark),
+ n_(n),
+ maxmark_(maxmark),
+ nextmark_(n),
+ last_was_mark_(true) {
+ }
+
+ bool is_mark(int i) { return i >= n_; }
+
+ int maxmark() { return maxmark_; }
+
+ void clear() {
+ SparseSet::clear();
+ nextmark_ = n_;
+ }
+
+ void mark() {
+ if (last_was_mark_)
+ return;
+ last_was_mark_ = false;
+ SparseSet::insert_new(nextmark_++);
+ }
+
+ int size() {
+ return n_ + maxmark_;
+ }
+
+ void insert(int id) {
+ if (contains(id))
+ return;
+ insert_new(id);
+ }
+
+ void insert_new(int id) {
+ last_was_mark_ = false;
+ SparseSet::insert_new(id);
+ }
+
+ private:
+ int n_; // size excluding marks
+ int maxmark_; // maximum number of marks
+ int nextmark_; // id of next mark
+ bool last_was_mark_; // last inserted was mark
Workq(const Workq&) = delete;
Workq& operator=(const Workq&) = delete;
-};
-
+};
+
DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
- : prog_(prog),
- kind_(kind),
- init_failed_(false),
- q0_(NULL),
- q1_(NULL),
+ : prog_(prog),
+ kind_(kind),
+ init_failed_(false),
+ q0_(NULL),
+ q1_(NULL),
mem_budget_(max_mem) {
if (ExtraDebug)
fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str());
- int nmark = 0;
+ int nmark = 0;
if (kind_ == Prog::kLongestMatch)
nmark = prog_->size();
// See DFA::AddToQueue() for why this is so.
@@ -437,266 +437,266 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
prog_->inst_count(kInstEmptyWidth) +
prog_->inst_count(kInstNop) +
nmark + 1; // + 1 for start inst
-
+
// Account for space needed for DFA, q0, q1, stack.
- mem_budget_ -= sizeof(DFA);
- mem_budget_ -= (prog_->size() + nmark) *
- (sizeof(int)+sizeof(int)) * 2; // q0, q1
+ mem_budget_ -= sizeof(DFA);
+ mem_budget_ -= (prog_->size() + nmark) *
+ (sizeof(int)+sizeof(int)) * 2; // q0, q1
mem_budget_ -= nstack * sizeof(int); // stack
- if (mem_budget_ < 0) {
- init_failed_ = true;
- return;
- }
-
- state_budget_ = mem_budget_;
-
- // Make sure there is a reasonable amount of working room left.
- // At minimum, the search requires room for two states in order
- // to limp along, restarting frequently. We'll get better performance
- // if there is room for a larger number of states, say 20.
+ if (mem_budget_ < 0) {
+ init_failed_ = true;
+ return;
+ }
+
+ state_budget_ = mem_budget_;
+
+ // Make sure there is a reasonable amount of working room left.
+ // At minimum, the search requires room for two states in order
+ // to limp along, restarting frequently. We'll get better performance
+ // if there is room for a larger number of states, say 20.
// Note that a state stores list heads only, so we use the program
// list count for the upper bound, not the program size.
int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
(prog_->list_count()+nmark)*sizeof(int);
- if (state_budget_ < 20*one_state) {
- init_failed_ = true;
- return;
- }
-
+ if (state_budget_ < 20*one_state) {
+ init_failed_ = true;
+ return;
+ }
+
q0_ = new Workq(prog_->size(), nmark);
q1_ = new Workq(prog_->size(), nmark);
stack_ = PODArray<int>(nstack);
-}
-
-DFA::~DFA() {
- delete q0_;
- delete q1_;
- ClearCache();
-}
-
-// In the DFA state graph, s->next[c] == NULL means that the
-// state has not yet been computed and needs to be. We need
-// a different special value to signal that s->next[c] is a
-// state that can never lead to a match (and thus the search
-// can be called off). Hence DeadState.
-#define DeadState reinterpret_cast<State*>(1)
-
-// Signals that the rest of the string matches no matter what it is.
-#define FullMatchState reinterpret_cast<State*>(2)
-
-#define SpecialStateMax FullMatchState
-
-// Debugging printouts
-
-// For debugging, returns a string representation of the work queue.
+}
+
+DFA::~DFA() {
+ delete q0_;
+ delete q1_;
+ ClearCache();
+}
+
+// In the DFA state graph, s->next[c] == NULL means that the
+// state has not yet been computed and needs to be. We need
+// a different special value to signal that s->next[c] is a
+// state that can never lead to a match (and thus the search
+// can be called off). Hence DeadState.
+#define DeadState reinterpret_cast<State*>(1)
+
+// Signals that the rest of the string matches no matter what it is.
+#define FullMatchState reinterpret_cast<State*>(2)
+
+#define SpecialStateMax FullMatchState
+
+// Debugging printouts
+
+// For debugging, returns a string representation of the work queue.
std::string DFA::DumpWorkq(Workq* q) {
std::string s;
- const char* sep = "";
+ const char* sep = "";
for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
- if (q->is_mark(*it)) {
+ if (q->is_mark(*it)) {
s += "|";
- sep = "";
- } else {
+ sep = "";
+ } else {
s += StringPrintf("%s%d", sep, *it);
- sep = ",";
- }
- }
- return s;
-}
-
-// For debugging, returns a string representation of the state.
+ sep = ",";
+ }
+ }
+ return s;
+}
+
+// For debugging, returns a string representation of the state.
std::string DFA::DumpState(State* state) {
- if (state == NULL)
- return "_";
- if (state == DeadState)
- return "X";
- if (state == FullMatchState)
- return "*";
+ if (state == NULL)
+ return "_";
+ if (state == DeadState)
+ return "X";
+ if (state == FullMatchState)
+ return "*";
std::string s;
- const char* sep = "";
+ const char* sep = "";
s += StringPrintf("(%p)", state);
- for (int i = 0; i < state->ninst_; i++) {
- if (state->inst_[i] == Mark) {
+ for (int i = 0; i < state->ninst_; i++) {
+ if (state->inst_[i] == Mark) {
s += "|";
- sep = "";
+ sep = "";
} else if (state->inst_[i] == MatchSep) {
s += "||";
sep = "";
- } else {
+ } else {
s += StringPrintf("%s%d", sep, state->inst_[i]);
- sep = ",";
- }
- }
+ sep = ",";
+ }
+ }
s += StringPrintf(" flag=%#x", state->flag_);
- return s;
-}
-
-//////////////////////////////////////////////////////////////////////
-//
-// DFA state graph construction.
-//
-// The DFA state graph is a heavily-linked collection of State* structures.
-// The state_cache_ is a set of all the State structures ever allocated,
-// so that if the same state is reached by two different paths,
-// the same State structure can be used. This reduces allocation
-// requirements and also avoids duplication of effort across the two
-// identical states.
-//
-// A State is defined by an ordered list of instruction ids and a flag word.
-//
-// The choice of an ordered list of instructions differs from a typical
-// textbook DFA implementation, which would use an unordered set.
-// Textbook descriptions, however, only care about whether
-// the DFA matches, not where it matches in the text. To decide where the
-// DFA matches, we need to mimic the behavior of the dominant backtracking
-// implementations like PCRE, which try one possible regular expression
-// execution, then another, then another, stopping when one of them succeeds.
-// The DFA execution tries these many executions in parallel, representing
-// each by an instruction id. These pointers are ordered in the State.inst_
-// list in the same order that the executions would happen in a backtracking
-// search: if a match is found during execution of inst_[2], inst_[i] for i>=3
-// can be discarded.
-//
-// Textbooks also typically do not consider context-aware empty string operators
-// like ^ or $. These are handled by the flag word, which specifies the set
-// of empty-string operators that should be matched when executing at the
-// current text position. These flag bits are defined in prog.h.
-// The flag word also contains two DFA-specific bits: kFlagMatch if the state
-// is a matching state (one that reached a kInstMatch in the program)
-// and kFlagLastWord if the last processed byte was a word character, for the
-// implementation of \B and \b.
-//
-// The flag word also contains, shifted up 16 bits, the bits looked for by
-// any kInstEmptyWidth instructions in the state. These provide a useful
-// summary indicating when new flags might be useful.
-//
-// The permanent representation of a State's instruction ids is just an array,
-// but while a state is being analyzed, these instruction ids are represented
-// as a Workq, which is an array that allows iteration in insertion order.
-
-// NOTE(rsc): The choice of State construction determines whether the DFA
-// mimics backtracking implementations (so-called leftmost first matching) or
-// traditional DFA implementations (so-called leftmost longest matching as
-// prescribed by POSIX). This implementation chooses to mimic the
-// backtracking implementations, because we want to replace PCRE. To get
-// POSIX behavior, the states would need to be considered not as a simple
-// ordered list of instruction ids, but as a list of unordered sets of instruction
-// ids. A match by a state in one set would inhibit the running of sets
-// farther down the list but not other instruction ids in the same set. Each
-// set would correspond to matches beginning at a given point in the string.
-// This is implemented by separating different sets with Mark pointers.
-
-// Looks in the State cache for a State matching q, flag.
-// If one is found, returns it. If one is not found, allocates one,
-// inserts it in the cache, and returns it.
+ return s;
+}
+
+//////////////////////////////////////////////////////////////////////
+//
+// DFA state graph construction.
+//
+// The DFA state graph is a heavily-linked collection of State* structures.
+// The state_cache_ is a set of all the State structures ever allocated,
+// so that if the same state is reached by two different paths,
+// the same State structure can be used. This reduces allocation
+// requirements and also avoids duplication of effort across the two
+// identical states.
+//
+// A State is defined by an ordered list of instruction ids and a flag word.
+//
+// The choice of an ordered list of instructions differs from a typical
+// textbook DFA implementation, which would use an unordered set.
+// Textbook descriptions, however, only care about whether
+// the DFA matches, not where it matches in the text. To decide where the
+// DFA matches, we need to mimic the behavior of the dominant backtracking
+// implementations like PCRE, which try one possible regular expression
+// execution, then another, then another, stopping when one of them succeeds.
+// The DFA execution tries these many executions in parallel, representing
+// each by an instruction id. These ids are ordered in the State.inst_
+// list in the same order that the executions would happen in a backtracking
+// search: if a match is found during execution of inst_[2], inst_[i] for i>=3
+// can be discarded.
+//
+// Textbooks also typically do not consider context-aware empty string operators
+// like ^ or $. These are handled by the flag word, which specifies the set
+// of empty-string operators that should be matched when executing at the
+// current text position. These flag bits are defined in prog.h.
+// The flag word also contains two DFA-specific bits: kFlagMatch if the state
+// is a matching state (one that reached a kInstMatch in the program)
+// and kFlagLastWord if the last processed byte was a word character, for the
+// implementation of \B and \b.
+//
+// The flag word also contains, shifted up 16 bits, the bits looked for by
+// any kInstEmptyWidth instructions in the state. These provide a useful
+// summary indicating when new flags might be useful.
+//
+// The permanent representation of a State's instruction ids is just an array,
+// but while a state is being analyzed, these instruction ids are represented
+// as a Workq, which is an array that allows iteration in insertion order.
+
+// NOTE(rsc): The choice of State construction determines whether the DFA
+// mimics backtracking implementations (so-called leftmost first matching) or
+// traditional DFA implementations (so-called leftmost longest matching as
+// prescribed by POSIX). This implementation chooses to mimic the
+// backtracking implementations, because we want to replace PCRE. To get
+// POSIX behavior, the states would need to be considered not as a simple
+// ordered list of instruction ids, but as a list of unordered sets of instruction
+// ids. A match by a state in one set would inhibit the running of sets
+// farther down the list but not other instruction ids in the same set. Each
+// set would correspond to matches beginning at a given point in the string.
+// This is implemented by separating different sets with Mark entries.
+
+// Looks in the State cache for a State matching q, flag.
+// If one is found, returns it. If one is not found, allocates one,
+// inserts it in the cache, and returns it.
// If mq is not null, MatchSep and the match IDs in mq will be appended
// to the State.
DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
//mutex_.AssertHeld();
-
- // Construct array of instruction ids for the new state.
- // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
- // those are the only operators with any effect in
- // RunWorkqOnEmptyString or RunWorkqOnByte.
+
+ // Construct array of instruction ids for the new state.
+ // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
+ // those are the only operators with any effect in
+ // RunWorkqOnEmptyString or RunWorkqOnByte.
PODArray<int> inst(q->size());
- int n = 0;
+ int n = 0;
uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions
bool sawmatch = false; // whether queue contains guaranteed kInstMatch
bool sawmark = false; // whether queue contains a Mark
if (ExtraDebug)
- fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
- for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
- int id = *it;
- if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
- break;
- if (q->is_mark(id)) {
- if (n > 0 && inst[n-1] != Mark) {
- sawmark = true;
- inst[n++] = Mark;
- }
- continue;
- }
- Prog::Inst* ip = prog_->inst(id);
- switch (ip->opcode()) {
- case kInstAltMatch:
- // This state will continue to a match no matter what
- // the rest of the input is. If it is the highest priority match
- // being considered, return the special FullMatchState
- // to indicate that it's all matches from here out.
- if (kind_ != Prog::kManyMatch &&
- (kind_ != Prog::kFirstMatch ||
- (it == q->begin() && ip->greedy(prog_))) &&
+ fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
+ for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
+ int id = *it;
+ if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
+ break;
+ if (q->is_mark(id)) {
+ if (n > 0 && inst[n-1] != Mark) {
+ sawmark = true;
+ inst[n++] = Mark;
+ }
+ continue;
+ }
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ case kInstAltMatch:
+ // This state will continue to a match no matter what
+ // the rest of the input is. If it is the highest priority match
+ // being considered, return the special FullMatchState
+ // to indicate that it's all matches from here out.
+ if (kind_ != Prog::kManyMatch &&
+ (kind_ != Prog::kFirstMatch ||
+ (it == q->begin() && ip->greedy(prog_))) &&
(kind_ != Prog::kLongestMatch || !sawmark) &&
(flag & kFlagMatch)) {
if (ExtraDebug)
- fprintf(stderr, " -> FullMatchState\n");
- return FullMatchState;
- }
+ fprintf(stderr, " -> FullMatchState\n");
+ return FullMatchState;
+ }
FALLTHROUGH_INTENDED;
default:
// Record iff id is the head of its list, which must
// be the case if id-1 is the last of *its* list. :)
if (prog_->inst(id-1)->last())
inst[n++] = *it;
- if (ip->opcode() == kInstEmptyWidth)
- needflags |= ip->empty();
- if (ip->opcode() == kInstMatch && !prog_->anchor_end())
- sawmatch = true;
- break;
- }
- }
- DCHECK_LE(n, q->size());
- if (n > 0 && inst[n-1] == Mark)
- n--;
-
- // If there are no empty-width instructions waiting to execute,
- // then the extra flag bits will not be used, so there is no
- // point in saving them. (Discarding them reduces the number
- // of distinct states.)
- if (needflags == 0)
- flag &= kFlagMatch;
-
- // NOTE(rsc): The code above cannot do flag &= needflags,
- // because if the right flags were present to pass the current
- // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
- // might be reached that in turn need different flags.
- // The only sure thing is that if there are no kInstEmptyWidth
- // instructions at all, no flags will be needed.
- // We could do the extra work to figure out the full set of
- // possibly needed flags by exploring past the kInstEmptyWidth
- // instructions, but the check above -- are any flags needed
- // at all? -- handles the most common case. More fine-grained
- // analysis can only be justified by measurements showing that
- // too many redundant states are being allocated.
-
- // If there are no Insts in the list, it's a dead state,
- // which is useful to signal with a special pointer so that
- // the execution loop can stop early. This is only okay
- // if the state is *not* a matching state.
- if (n == 0 && flag == 0) {
+ if (ip->opcode() == kInstEmptyWidth)
+ needflags |= ip->empty();
+ if (ip->opcode() == kInstMatch && !prog_->anchor_end())
+ sawmatch = true;
+ break;
+ }
+ }
+ DCHECK_LE(n, q->size());
+ if (n > 0 && inst[n-1] == Mark)
+ n--;
+
+ // If there are no empty-width instructions waiting to execute,
+ // then the extra flag bits will not be used, so there is no
+ // point in saving them. (Discarding them reduces the number
+ // of distinct states.)
+ if (needflags == 0)
+ flag &= kFlagMatch;
+
+ // NOTE(rsc): The code above cannot do flag &= needflags,
+ // because if the right flags were present to pass the current
+ // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
+ // might be reached that in turn need different flags.
+ // The only sure thing is that if there are no kInstEmptyWidth
+ // instructions at all, no flags will be needed.
+ // We could do the extra work to figure out the full set of
+ // possibly needed flags by exploring past the kInstEmptyWidth
+ // instructions, but the check above -- are any flags needed
+ // at all? -- handles the most common case. More fine-grained
+ // analysis can only be justified by measurements showing that
+ // too many redundant states are being allocated.
+
+ // If there are no Insts in the list, it's a dead state,
+ // which is useful to signal with a special pointer so that
+ // the execution loop can stop early. This is only okay
+ // if the state is *not* a matching state.
+ if (n == 0 && flag == 0) {
if (ExtraDebug)
- fprintf(stderr, " -> DeadState\n");
- return DeadState;
- }
-
- // If we're in longest match mode, the state is a sequence of
- // unordered state sets separated by Marks. Sort each set
- // to canonicalize, to reduce the number of distinct sets stored.
- if (kind_ == Prog::kLongestMatch) {
+ fprintf(stderr, " -> DeadState\n");
+ return DeadState;
+ }
+
+ // If we're in longest match mode, the state is a sequence of
+ // unordered state sets separated by Marks. Sort each set
+ // to canonicalize, to reduce the number of distinct sets stored.
+ if (kind_ == Prog::kLongestMatch) {
int* ip = inst.data();
- int* ep = ip + n;
- while (ip < ep) {
- int* markp = ip;
- while (markp < ep && *markp != Mark)
- markp++;
+ int* ep = ip + n;
+ while (ip < ep) {
+ int* markp = ip;
+ while (markp < ep && *markp != Mark)
+ markp++;
std::sort(ip, markp);
- if (markp < ep)
- markp++;
- ip = markp;
- }
- }
-
+ if (markp < ep)
+ markp++;
+ ip = markp;
+ }
+ }
+
// If we're in many match mode, canonicalize for similar reasons:
// we have an unordered set of states (i.e. we don't have Marks)
// and sorting will reduce the number of distinct sets stored.
@@ -717,47 +717,47 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
}
}
- // Save the needed empty-width flags in the top bits for use later.
- flag |= needflags << kFlagNeedShift;
-
+ // Save the needed empty-width flags in the top bits for use later.
+ flag |= needflags << kFlagNeedShift;
+
State* state = CachedState(inst.data(), n, flag);
- return state;
-}
-
-// Looks in the State cache for a State matching inst, ninst, flag.
-// If one is found, returns it. If one is not found, allocates one,
-// inserts it in the cache, and returns it.
+ return state;
+}
+
+// Looks in the State cache for a State matching inst, ninst, flag.
+// If one is found, returns it. If one is not found, allocates one,
+// inserts it in the cache, and returns it.
DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
//mutex_.AssertHeld();
-
- // Look in the cache for a pre-existing state.
+
+ // Look in the cache for a pre-existing state.
// We have to initialise the struct like this because otherwise
// MSVC will complain about the flexible array member. :(
State state;
state.inst_ = inst;
state.ninst_ = ninst;
state.flag_ = flag;
- StateSet::iterator it = state_cache_.find(&state);
- if (it != state_cache_.end()) {
+ StateSet::iterator it = state_cache_.find(&state);
+ if (it != state_cache_.end()) {
if (ExtraDebug)
- fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());
- return *it;
- }
-
- // Must have enough memory for new state.
- // In addition to what we're going to allocate,
+ fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());
+ return *it;
+ }
+
+ // Must have enough memory for new state.
+ // In addition to what we're going to allocate,
// the state cache hash table seems to incur about 40 bytes per
- // State*, empirically.
+ // State*, empirically.
const int kStateCacheOverhead = 40;
- int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
+ int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
ninst*sizeof(int);
- if (mem_budget_ < mem + kStateCacheOverhead) {
- mem_budget_ = -1;
- return NULL;
- }
- mem_budget_ -= mem + kStateCacheOverhead;
-
+ if (mem_budget_ < mem + kStateCacheOverhead) {
+ mem_budget_ = -1;
+ return NULL;
+ }
+ mem_budget_ -= mem + kStateCacheOverhead;
+
// Allocate new state along with room for next_ and inst_.
char* space = std::allocator<char>().allocate(mem);
State* s = new (space) State;
@@ -767,19 +767,19 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
for (int i = 0; i < nnext; i++)
(void) new (s->next_ + i) std::atomic<State*>(NULL);
s->inst_ = new (s->next_ + nnext) int[ninst];
- memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
- s->ninst_ = ninst;
- s->flag_ = flag;
+ memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
+ s->ninst_ = ninst;
+ s->flag_ = flag;
if (ExtraDebug)
- fprintf(stderr, " -> %s\n", DumpState(s).c_str());
-
- // Put state in cache and return it.
- state_cache_.insert(s);
- return s;
-}
-
-// Clear the cache. Must hold cache_mutex_.w or be in destructor.
-void DFA::ClearCache() {
+ fprintf(stderr, " -> %s\n", DumpState(s).c_str());
+
+ // Put state in cache and return it.
+ state_cache_.insert(s);
+ return s;
+}
+
+// Clear the cache. Must hold cache_mutex_.w or be in destructor.
+void DFA::ClearCache() {
StateSet::iterator begin = state_cache_.begin();
StateSet::iterator end = state_cache_.end();
while (begin != end) {
@@ -793,15 +793,15 @@ void DFA::ClearCache() {
ninst*sizeof(int);
std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem);
}
- state_cache_.clear();
-}
-
-// Copies insts in state s to the work queue q.
-void DFA::StateToWorkq(State* s, Workq* q) {
- q->clear();
- for (int i = 0; i < s->ninst_; i++) {
+ state_cache_.clear();
+}
+
+// Copies insts in state s to the work queue q.
+void DFA::StateToWorkq(State* s, Workq* q) {
+ q->clear();
+ for (int i = 0; i < s->ninst_; i++) {
if (s->inst_[i] == Mark) {
- q->mark();
+ q->mark();
} else if (s->inst_[i] == MatchSep) {
// Nothing after this is an instruction!
break;
@@ -809,12 +809,12 @@ void DFA::StateToWorkq(State* s, Workq* q) {
// Explore from the head of the list.
AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask);
}
- }
-}
-
+ }
+}
+
// Adds ip to the work queue, following empty arrows according to flag.
void DFA::AddToQueue(Workq* q, int id, uint32_t flag) {
-
+
// Use stack_ to hold our stack of instructions yet to process.
// It was preallocated as follows:
// one entry per Capture;
@@ -825,66 +825,66 @@ void DFA::AddToQueue(Workq* q, int id, uint32_t flag) {
// When using marks, we also added nmark == prog_->size().
// (Otherwise, nmark == 0.)
int* stk = stack_.data();
- int nstk = 0;
-
- stk[nstk++] = id;
- while (nstk > 0) {
+ int nstk = 0;
+
+ stk[nstk++] = id;
+ while (nstk > 0) {
DCHECK_LE(nstk, stack_.size());
- id = stk[--nstk];
-
+ id = stk[--nstk];
+
Loop:
- if (id == Mark) {
- q->mark();
- continue;
- }
-
- if (id == 0)
- continue;
-
- // If ip is already on the queue, nothing to do.
+ if (id == Mark) {
+ q->mark();
+ continue;
+ }
+
+ if (id == 0)
+ continue;
+
+ // If id is already on the queue, nothing to do.
// Otherwise add it. We don't actually keep all the
// ones that get added, but adding all of them here
- // increases the likelihood of q->contains(id),
- // reducing the amount of duplicated work.
- if (q->contains(id))
- continue;
- q->insert_new(id);
-
- // Process instruction.
- Prog::Inst* ip = prog_->inst(id);
- switch (ip->opcode()) {
+ // increases the likelihood of q->contains(id),
+ // reducing the amount of duplicated work.
+ if (q->contains(id))
+ continue;
+ q->insert_new(id);
+
+ // Process instruction.
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
- break;
-
- case kInstByteRange: // just save these on the queue
- case kInstMatch:
+ break;
+
+ case kInstByteRange: // just save these on the queue
+ case kInstMatch:
if (ip->last())
break;
id = id+1;
goto Loop;
-
- case kInstCapture: // DFA treats captures as no-ops.
- case kInstNop:
+
+ case kInstCapture: // DFA treats captures as no-ops.
+ case kInstNop:
if (!ip->last())
stk[nstk++] = id+1;
-
+
// If this instruction is the [00-FF]* loop at the beginning of
// a leftmost-longest unanchored search, separate with a Mark so
// that future threads (which will start farther to the right in
// the input string) are lower priority than current threads.
if (ip->opcode() == kInstNop && q->maxmark() > 0 &&
- id == prog_->start_unanchored() && id != prog_->start())
- stk[nstk++] = Mark;
+ id == prog_->start_unanchored() && id != prog_->start())
+ stk[nstk++] = Mark;
id = ip->out();
goto Loop;
-
+
case kInstAltMatch:
DCHECK(!ip->last());
id = id+1;
goto Loop;
- case kInstEmptyWidth:
+ case kInstEmptyWidth:
if (!ip->last())
stk[nstk++] = id+1;
@@ -893,67 +893,67 @@ void DFA::AddToQueue(Workq* q, int id, uint32_t flag) {
break;
id = ip->out();
goto Loop;
- }
- }
-}
-
-// Running of work queues. In the work queue, order matters:
-// the queue is sorted in priority order. If instruction i comes before j,
-// then the instructions that i produces during the run must come before
-// the ones that j produces. In order to keep this invariant, all the
-// work queue runners have to take an old queue to process and then
-// also a new queue to fill in. It's not acceptable to add to the end of
-// an existing queue, because new instructions will not end up in the
-// correct position.
-
-// Runs the work queue, processing the empty strings indicated by flag.
-// For example, flag == kEmptyBeginLine|kEmptyEndLine means to match
-// both ^ and $. It is important that callers pass all flags at once:
-// processing both ^ and $ is not the same as first processing only ^
-// and then processing only $. Doing the two-step sequence won't match
-// ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior
-// exhibited by existing implementations).
+ }
+ }
+}
+
+// Running of work queues. In the work queue, order matters:
+// the queue is sorted in priority order. If instruction i comes before j,
+// then the instructions that i produces during the run must come before
+// the ones that j produces. In order to keep this invariant, all the
+// work queue runners have to take an old queue to process and then
+// also a new queue to fill in. It's not acceptable to add to the end of
+// an existing queue, because new instructions will not end up in the
+// correct position.
+
+// Runs the work queue, processing the empty strings indicated by flag.
+// For example, flag == kEmptyBeginLine|kEmptyEndLine means to match
+// both ^ and $. It is important that callers pass all flags at once:
+// processing both ^ and $ is not the same as first processing only ^
+// and then processing only $. Doing the two-step sequence won't match
+// ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior
+// exhibited by existing implementations).
void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) {
- newq->clear();
- for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
- if (oldq->is_mark(*i))
- AddToQueue(newq, Mark, flag);
- else
- AddToQueue(newq, *i, flag);
- }
-}
-
-// Runs the work queue, processing the single byte c followed by any empty
-// strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine,
-// means to match c$. Sets the bool *ismatch to true if the end of the
-// regular expression program has been reached (the regexp has matched).
-void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
+ newq->clear();
+ for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
+ if (oldq->is_mark(*i))
+ AddToQueue(newq, Mark, flag);
+ else
+ AddToQueue(newq, *i, flag);
+ }
+}
+
+// Runs the work queue, processing the single byte c followed by any empty
+// strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine,
+// means to match c$. Sets the bool *ismatch to true if the end of the
+// regular expression program has been reached (the regexp has matched).
+void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
int c, uint32_t flag, bool* ismatch) {
//mutex_.AssertHeld();
-
- newq->clear();
- for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
- if (oldq->is_mark(*i)) {
- if (*ismatch)
- return;
- newq->mark();
- continue;
- }
- int id = *i;
- Prog::Inst* ip = prog_->inst(id);
- switch (ip->opcode()) {
+
+ newq->clear();
+ for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
+ if (oldq->is_mark(*i)) {
+ if (*ismatch)
+ return;
+ newq->mark();
+ continue;
+ }
+ int id = *i;
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
- case kInstFail: // never succeeds
- case kInstCapture: // already followed
- case kInstNop: // already followed
- case kInstAltMatch: // already followed
- case kInstEmptyWidth: // already followed
- break;
-
- case kInstByteRange: // can follow if c is in range
+ case kInstFail: // never succeeds
+ case kInstCapture: // already followed
+ case kInstNop: // already followed
+ case kInstAltMatch: // already followed
+ case kInstEmptyWidth: // already followed
+ break;
+
+ case kInstByteRange: // can follow if c is in range
if (!ip->Matches(c))
break;
AddToQueue(newq, ip->out(), flag);
@@ -969,363 +969,363 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
++ip;
i += ip - ip0;
}
- break;
-
- case kInstMatch:
+ break;
+
+ case kInstMatch:
if (prog_->anchor_end() && c != kByteEndText &&
kind_ != Prog::kManyMatch)
- break;
- *ismatch = true;
+ break;
+ *ismatch = true;
if (kind_ == Prog::kFirstMatch) {
- // Can stop processing work queue since we found a match.
- return;
- }
- break;
- }
- }
-
+ // Can stop processing work queue since we found a match.
+ return;
+ }
+ break;
+ }
+ }
+
if (ExtraDebug)
fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n",
DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch);
-}
-
-// Processes input byte c in state, returning new state.
-// Caller does not hold mutex.
-DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
- // Keep only one RunStateOnByte going
- // even if the DFA is being run by multiple threads.
- MutexLock l(&mutex_);
- return RunStateOnByte(state, c);
-}
-
-// Processes input byte c in state, returning new state.
-DFA::State* DFA::RunStateOnByte(State* state, int c) {
+}
+
+// Processes input byte c in state, returning new state.
+// Caller does not hold mutex.
+DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
+ // Keep only one RunStateOnByte going
+ // even if the DFA is being run by multiple threads.
+ MutexLock l(&mutex_);
+ return RunStateOnByte(state, c);
+}
+
+// Processes input byte c in state, returning new state.
+DFA::State* DFA::RunStateOnByte(State* state, int c) {
//mutex_.AssertHeld();
- if (state <= SpecialStateMax) {
- if (state == FullMatchState) {
- // It is convenient for routines like PossibleMatchRange
- // if we implement RunStateOnByte for FullMatchState:
- // once you get into this state you never get out,
- // so it's pretty easy.
- return FullMatchState;
- }
- if (state == DeadState) {
- LOG(DFATAL) << "DeadState in RunStateOnByte";
- return NULL;
- }
- if (state == NULL) {
- LOG(DFATAL) << "NULL state in RunStateOnByte";
- return NULL;
- }
- LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
- return NULL;
- }
-
- // If someone else already computed this, return it.
+ if (state <= SpecialStateMax) {
+ if (state == FullMatchState) {
+ // It is convenient for routines like PossibleMatchRange
+ // if we implement RunStateOnByte for FullMatchState:
+ // once you get into this state you never get out,
+ // so it's pretty easy.
+ return FullMatchState;
+ }
+ if (state == DeadState) {
+ LOG(DFATAL) << "DeadState in RunStateOnByte";
+ return NULL;
+ }
+ if (state == NULL) {
+ LOG(DFATAL) << "NULL state in RunStateOnByte";
+ return NULL;
+ }
+ LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
+ return NULL;
+ }
+
+ // If someone else already computed this, return it.
State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed);
if (ns != NULL)
return ns;
-
- // Convert state into Workq.
- StateToWorkq(state, q0_);
-
- // Flags marking the kinds of empty-width things (^ $ etc)
- // around this byte. Before the byte we have the flags recorded
- // in the State structure itself. After the byte we have
- // nothing yet (but that will change: read on).
+
+ // Convert state into Workq.
+ StateToWorkq(state, q0_);
+
+ // Flags marking the kinds of empty-width things (^ $ etc)
+ // around this byte. Before the byte we have the flags recorded
+ // in the State structure itself. After the byte we have
+ // nothing yet (but that will change: read on).
uint32_t needflag = state->flag_ >> kFlagNeedShift;
uint32_t beforeflag = state->flag_ & kFlagEmptyMask;
uint32_t oldbeforeflag = beforeflag;
uint32_t afterflag = 0;
-
- if (c == '\n') {
- // Insert implicit $ and ^ around \n
- beforeflag |= kEmptyEndLine;
- afterflag |= kEmptyBeginLine;
- }
-
- if (c == kByteEndText) {
- // Insert implicit $ and \z before the fake "end text" byte.
- beforeflag |= kEmptyEndLine | kEmptyEndText;
- }
-
- // The state flag kFlagLastWord says whether the last
- // byte processed was a word character. Use that info to
- // insert empty-width (non-)word boundaries.
+
+ if (c == '\n') {
+ // Insert implicit $ and ^ around \n
+ beforeflag |= kEmptyEndLine;
+ afterflag |= kEmptyBeginLine;
+ }
+
+ if (c == kByteEndText) {
+ // Insert implicit $ and \z before the fake "end text" byte.
+ beforeflag |= kEmptyEndLine | kEmptyEndText;
+ }
+
+ // The state flag kFlagLastWord says whether the last
+ // byte processed was a word character. Use that info to
+ // insert empty-width (non-)word boundaries.
bool islastword = (state->flag_ & kFlagLastWord) != 0;
bool isword = c != kByteEndText && Prog::IsWordChar(static_cast<uint8_t>(c));
- if (isword == islastword)
- beforeflag |= kEmptyNonWordBoundary;
- else
- beforeflag |= kEmptyWordBoundary;
-
- // Okay, finally ready to run.
- // Only useful to rerun on empty string if there are new, useful flags.
- if (beforeflag & ~oldbeforeflag & needflag) {
- RunWorkqOnEmptyString(q0_, q1_, beforeflag);
+ if (isword == islastword)
+ beforeflag |= kEmptyNonWordBoundary;
+ else
+ beforeflag |= kEmptyWordBoundary;
+
+ // Okay, finally ready to run.
+ // Only useful to rerun on empty string if there are new, useful flags.
+ if (beforeflag & ~oldbeforeflag & needflag) {
+ RunWorkqOnEmptyString(q0_, q1_, beforeflag);
using std::swap;
- swap(q0_, q1_);
- }
- bool ismatch = false;
+ swap(q0_, q1_);
+ }
+ bool ismatch = false;
RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch);
using std::swap;
swap(q0_, q1_);
-
- // Save afterflag along with ismatch and isword in new state.
+
+ // Save afterflag along with ismatch and isword in new state.
uint32_t flag = afterflag;
- if (ismatch)
- flag |= kFlagMatch;
- if (isword)
- flag |= kFlagLastWord;
-
+ if (ismatch)
+ flag |= kFlagMatch;
+ if (isword)
+ flag |= kFlagLastWord;
+
if (ismatch && kind_ == Prog::kManyMatch)
ns = WorkqToCachedState(q0_, q1_, flag);
else
ns = WorkqToCachedState(q0_, NULL, flag);
-
+
// Flush ns before linking to it.
- // Write barrier before updating state->next_ so that the
- // main search loop can proceed without any locking, for speed.
- // (Otherwise it would need one mutex operation per input byte.)
+ // Write barrier before updating state->next_ so that the
+ // main search loop can proceed without any locking, for speed.
+ // (Otherwise it would need one mutex operation per input byte.)
state->next_[ByteMap(c)].store(ns, std::memory_order_release);
- return ns;
-}
-
-
-//////////////////////////////////////////////////////////////////////
-// DFA cache reset.
-
-// Reader-writer lock helper.
-//
-// The DFA uses a reader-writer mutex to protect the state graph itself.
-// Traversing the state graph requires holding the mutex for reading,
-// and discarding the state graph and starting over requires holding the
-// lock for writing. If a search needs to expand the graph but is out
-// of memory, it will need to drop its read lock and then acquire the
-// write lock. Since it cannot then atomically downgrade from write lock
-// to read lock, it runs the rest of the search holding the write lock.
-// (This probably helps avoid repeated contention, but really the decision
-// is forced by the Mutex interface.) It's a bit complicated to keep
-// track of whether the lock is held for reading or writing and thread
-// that through the search, so instead we encapsulate it in the RWLocker
-// and pass that around.
-
-class DFA::RWLocker {
- public:
+ return ns;
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// DFA cache reset.
+
+// Reader-writer lock helper.
+//
+// The DFA uses a reader-writer mutex to protect the state graph itself.
+// Traversing the state graph requires holding the mutex for reading,
+// and discarding the state graph and starting over requires holding the
+// lock for writing. If a search needs to expand the graph but is out
+// of memory, it will need to drop its read lock and then acquire the
+// write lock. Since it cannot then atomically downgrade from write lock
+// to read lock, it runs the rest of the search holding the write lock.
+// (This probably helps avoid repeated contention, but really the decision
+// is forced by the Mutex interface.) It's a bit complicated to keep
+// track of whether the lock is held for reading or writing and thread
+// that through the search, so instead we encapsulate it in the RWLocker
+// and pass that around.
+
+class DFA::RWLocker {
+ public:
explicit RWLocker(CacheMutex* mu);
- ~RWLocker();
-
- // If the lock is only held for reading right now,
- // drop the read lock and re-acquire for writing.
- // Subsequent calls to LockForWriting are no-ops.
- // Notice that the lock is *released* temporarily.
- void LockForWriting();
-
- private:
+ ~RWLocker();
+
+ // If the lock is only held for reading right now,
+ // drop the read lock and re-acquire for writing.
+ // Subsequent calls to LockForWriting are no-ops.
+ // Notice that the lock is *released* temporarily.
+ void LockForWriting();
+
+ private:
CacheMutex* mu_;
- bool writing_;
-
+ bool writing_;
+
RWLocker(const RWLocker&) = delete;
RWLocker& operator=(const RWLocker&) = delete;
-};
-
+};
+
DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) {
- mu_->ReaderLock();
-}
-
+ mu_->ReaderLock();
+}
+
// This function is marked as NO_THREAD_SAFETY_ANALYSIS because
// the annotations don't support lock upgrade.
-void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
- if (!writing_) {
- mu_->ReaderUnlock();
+void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
+ if (!writing_) {
+ mu_->ReaderUnlock();
mu_->WriterLock();
- writing_ = true;
- }
-}
-
-DFA::RWLocker::~RWLocker() {
+ writing_ = true;
+ }
+}
+
+DFA::RWLocker::~RWLocker() {
if (!writing_)
mu_->ReaderUnlock();
else
- mu_->WriterUnlock();
-}
-
-
-// When the DFA's State cache fills, we discard all the states in the
-// cache and start over. Many threads can be using and adding to the
-// cache at the same time, so we synchronize using the cache_mutex_
-// to keep from stepping on other threads. Specifically, all the
-// threads using the current cache hold cache_mutex_ for reading.
-// When a thread decides to flush the cache, it drops cache_mutex_
-// and then re-acquires it for writing. That ensures there are no
-// other threads accessing the cache anymore. The rest of the search
-// runs holding cache_mutex_ for writing, avoiding any contention
-// with or cache pollution caused by other threads.
-
-void DFA::ResetCache(RWLocker* cache_lock) {
- // Re-acquire the cache_mutex_ for writing (exclusive use).
- cache_lock->LockForWriting();
-
+ mu_->WriterUnlock();
+}
+
+
+// When the DFA's State cache fills, we discard all the states in the
+// cache and start over. Many threads can be using and adding to the
+// cache at the same time, so we synchronize using the cache_mutex_
+// to keep from stepping on other threads. Specifically, all the
+// threads using the current cache hold cache_mutex_ for reading.
+// When a thread decides to flush the cache, it drops cache_mutex_
+// and then re-acquires it for writing. That ensures there are no
+// other threads accessing the cache anymore. The rest of the search
+// runs holding cache_mutex_ for writing, avoiding any contention
+// with or cache pollution caused by other threads.
+
+void DFA::ResetCache(RWLocker* cache_lock) {
+ // Re-acquire the cache_mutex_ for writing (exclusive use).
+ cache_lock->LockForWriting();
+
hooks::GetDFAStateCacheResetHook()({
state_budget_,
state_cache_.size(),
});
- // Clear the cache, reset the memory budget.
+ // Clear the cache, reset the memory budget.
for (int i = 0; i < kMaxStart; i++)
start_[i].start.store(NULL, std::memory_order_relaxed);
- ClearCache();
- mem_budget_ = state_budget_;
-}
-
-// Typically, a couple States do need to be preserved across a cache
-// reset, like the State at the current point in the search.
-// The StateSaver class helps keep States across cache resets.
-// It makes a copy of the state's guts outside the cache (before the reset)
-// and then can be asked, after the reset, to recreate the State
-// in the new cache. For example, in a DFA method ("this" is a DFA):
-//
-// StateSaver saver(this, s);
-// ResetCache(cache_lock);
-// s = saver.Restore();
-//
-// The saver should always have room in the cache to re-create the state,
-// because resetting the cache locks out all other threads, and the cache
-// is known to have room for at least a couple states (otherwise the DFA
-// constructor fails).
-
-class DFA::StateSaver {
- public:
- explicit StateSaver(DFA* dfa, State* state);
- ~StateSaver();
-
- // Recreates and returns a state equivalent to the
- // original state passed to the constructor.
- // Returns NULL if the cache has filled, but
- // since the DFA guarantees to have room in the cache
- // for a couple states, should never return NULL
- // if used right after ResetCache.
- State* Restore();
-
- private:
- DFA* dfa_; // the DFA to use
- int* inst_; // saved info from State
- int ninst_;
+ ClearCache();
+ mem_budget_ = state_budget_;
+}
+
+// Typically, a couple States do need to be preserved across a cache
+// reset, like the State at the current point in the search.
+// The StateSaver class helps keep States across cache resets.
+// It makes a copy of the state's guts outside the cache (before the reset)
+// and then can be asked, after the reset, to recreate the State
+// in the new cache. For example, in a DFA method ("this" is a DFA):
+//
+// StateSaver saver(this, s);
+// ResetCache(cache_lock);
+// s = saver.Restore();
+//
+// The saver should always have room in the cache to re-create the state,
+// because resetting the cache locks out all other threads, and the cache
+// is known to have room for at least a couple states (otherwise the DFA
+// constructor fails).
+
+class DFA::StateSaver {
+ public:
+ explicit StateSaver(DFA* dfa, State* state);
+ ~StateSaver();
+
+ // Recreates and returns a state equivalent to the
+ // original state passed to the constructor.
+ // Returns NULL if the cache has filled, but
+ // since the DFA guarantees to have room in the cache
+ // for a couple states, should never return NULL
+ // if used right after ResetCache.
+ State* Restore();
+
+ private:
+ DFA* dfa_; // the DFA to use
+ int* inst_; // saved info from State
+ int ninst_;
uint32_t flag_;
- bool is_special_; // whether original state was special
- State* special_; // if is_special_, the original state
-
+ bool is_special_; // whether original state was special
+ State* special_; // if is_special_, the original state
+
StateSaver(const StateSaver&) = delete;
StateSaver& operator=(const StateSaver&) = delete;
-};
-
-DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
- dfa_ = dfa;
- if (state <= SpecialStateMax) {
- inst_ = NULL;
- ninst_ = 0;
- flag_ = 0;
- is_special_ = true;
- special_ = state;
- return;
- }
- is_special_ = false;
- special_ = NULL;
- flag_ = state->flag_;
- ninst_ = state->ninst_;
- inst_ = new int[ninst_];
- memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
-}
-
-DFA::StateSaver::~StateSaver() {
- if (!is_special_)
- delete[] inst_;
-}
-
-DFA::State* DFA::StateSaver::Restore() {
- if (is_special_)
- return special_;
- MutexLock l(&dfa_->mutex_);
- State* s = dfa_->CachedState(inst_, ninst_, flag_);
- if (s == NULL)
- LOG(DFATAL) << "StateSaver failed to restore state.";
- return s;
-}
-
-
-//////////////////////////////////////////////////////////////////////
-//
-// DFA execution.
-//
-// The basic search loop is easy: start in a state s and then for each
-// byte c in the input, s = s->next[c].
-//
-// This simple description omits a few efficiency-driven complications.
-//
-// First, the State graph is constructed incrementally: it is possible
-// that s->next[c] is null, indicating that that state has not been
-// fully explored. In this case, RunStateOnByte must be invoked to
-// determine the next state, which is cached in s->next[c] to save
-// future effort. An alternative reason for s->next[c] to be null is
-// that the DFA has reached a so-called "dead state", in which any match
-// is no longer possible. In this case RunStateOnByte will return NULL
-// and the processing of the string can stop early.
-//
-// Second, a 256-element pointer array for s->next_ makes each State
-// quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[]
-// maps from bytes to "byte classes" and then next_ only needs to have
-// as many pointers as there are byte classes. A byte class is simply a
-// range of bytes that the regexp never distinguishes between.
-// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
-// 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit
-// but in exchange we typically cut the size of a State (and thus our
-// memory footprint) by about 5-10x. The comments still refer to
-// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
-//
-// Third, it is common for a DFA for an unanchored match to begin in a
-// state in which only one particular byte value can take the DFA to a
-// different state. That is, s->next[c] != s for only one c. In this
-// situation, the DFA can do better than executing the simple loop.
-// Instead, it can call memchr to search very quickly for the byte c.
-// Whether the start state has this property is determined during a
+};
+
+DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
+ dfa_ = dfa;
+ if (state <= SpecialStateMax) {
+ inst_ = NULL;
+ ninst_ = 0;
+ flag_ = 0;
+ is_special_ = true;
+ special_ = state;
+ return;
+ }
+ is_special_ = false;
+ special_ = NULL;
+ flag_ = state->flag_;
+ ninst_ = state->ninst_;
+ inst_ = new int[ninst_];
+ memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
+}
+
+DFA::StateSaver::~StateSaver() {
+ if (!is_special_)
+ delete[] inst_;
+}
+
+DFA::State* DFA::StateSaver::Restore() {
+ if (is_special_)
+ return special_;
+ MutexLock l(&dfa_->mutex_);
+ State* s = dfa_->CachedState(inst_, ninst_, flag_);
+ if (s == NULL)
+ LOG(DFATAL) << "StateSaver failed to restore state.";
+ return s;
+}
+
+
+//////////////////////////////////////////////////////////////////////
+//
+// DFA execution.
+//
+// The basic search loop is easy: start in a state s and then for each
+// byte c in the input, s = s->next[c].
+//
+// This simple description omits a few efficiency-driven complications.
+//
+// First, the State graph is constructed incrementally: it is possible
+// that s->next[c] is null, indicating that that state has not been
+// fully explored. In this case, RunStateOnByte must be invoked to
+// determine the next state, which is cached in s->next[c] to save
+// future effort. An alternative reason for s->next[c] to be null is
+// that the DFA has reached a so-called "dead state", in which any match
+// is no longer possible. In this case RunStateOnByte will return NULL
+// and the processing of the string can stop early.
+//
+// Second, a 256-element pointer array for s->next_ makes each State
+// quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[]
+// maps from bytes to "byte classes" and then next_ only needs to have
+// as many pointers as there are byte classes. A byte class is simply a
+// range of bytes that the regexp never distinguishes between.
+// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
+// 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit
+// but in exchange we typically cut the size of a State (and thus our
+// memory footprint) by about 5-10x. The comments still refer to
+// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
+//
+// Third, it is common for a DFA for an unanchored match to begin in a
+// state in which only one particular byte value can take the DFA to a
+// different state. That is, s->next[c] != s for only one c. In this
+// situation, the DFA can do better than executing the simple loop.
+// Instead, it can call memchr to search very quickly for the byte c.
+// Whether the start state has this property is determined during a
// pre-compilation pass and the "can_prefix_accel" argument is set.
-//
-// Fourth, the desired behavior is to search for the leftmost-best match
-// (approximately, the same one that Perl would find), which is not
-// necessarily the match ending earliest in the string. Each time a
-// match is found, it must be noted, but the DFA must continue on in
-// hope of finding a higher-priority match. In some cases, the caller only
-// cares whether there is any match at all, not which one is found.
-// The "want_earliest_match" flag causes the search to stop at the first
-// match found.
-//
-// Fifth, one algorithm that uses the DFA needs it to run over the
-// input string backward, beginning at the end and ending at the beginning.
-// Passing false for the "run_forward" flag causes the DFA to run backward.
-//
-// The checks for these last three cases, which in a naive implementation
-// would be performed once per input byte, slow the general loop enough
-// to merit specialized versions of the search loop for each of the
-// eight possible settings of the three booleans. Rather than write
-// eight different functions, we write one general implementation and then
-// inline it to create the specialized ones.
-//
-// Note that matches are delayed by one byte, to make it easier to
-// accomodate match conditions depending on the next input byte (like $ and \b).
-// When s->next[c]->IsMatch(), it means that there is a match ending just
-// *before* byte c.
-
-// The generic search loop. Searches text for a match, returning
-// the pointer to the end of the chosen match, or NULL if no match.
-// The bools are equal to the same-named variables in params, but
-// making them function arguments lets the inliner specialize
-// this function to each combination (see two paragraphs above).
+//
+// Fourth, the desired behavior is to search for the leftmost-best match
+// (approximately, the same one that Perl would find), which is not
+// necessarily the match ending earliest in the string. Each time a
+// match is found, it must be noted, but the DFA must continue on in
+// hope of finding a higher-priority match. In some cases, the caller only
+// cares whether there is any match at all, not which one is found.
+// The "want_earliest_match" flag causes the search to stop at the first
+// match found.
+//
+// Fifth, one algorithm that uses the DFA needs it to run over the
+// input string backward, beginning at the end and ending at the beginning.
+// Passing false for the "run_forward" flag causes the DFA to run backward.
+//
+// The checks for these last three cases, which in a naive implementation
+// would be performed once per input byte, slow the general loop enough
+// to merit specialized versions of the search loop for each of the
+// eight possible settings of the three booleans. Rather than write
+// eight different functions, we write one general implementation and then
+// inline it to create the specialized ones.
+//
+// Note that matches are delayed by one byte, to make it easier to
+// accomodate match conditions depending on the next input byte (like $ and \b).
+// When s->next[c]->IsMatch(), it means that there is a match ending just
+// *before* byte c.
+
+// The generic search loop. Searches text for a match, returning
+// the pointer to the end of the chosen match, or NULL if no match.
+// The bools are equal to the same-named variables in params, but
+// making them function arguments lets the inliner specialize
+// this function to each combination (see two paragraphs above).
template <bool can_prefix_accel,
bool want_earliest_match,
bool run_forward>
inline bool DFA::InlinedSearchLoop(SearchParams* params) {
- State* start = params->start;
+ State* start = params->start;
const uint8_t* bp = BytePtr(params->text.data()); // start of text
const uint8_t* p = bp; // text scanning point
const uint8_t* ep = BytePtr(params->text.data() +
@@ -1333,20 +1333,20 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
const uint8_t* resetp = NULL; // p at last cache reset
if (!run_forward) {
using std::swap;
- swap(p, ep);
+ swap(p, ep);
}
-
+
const uint8_t* bytemap = prog_->bytemap();
const uint8_t* lastmatch = NULL; // most recent matching position in text
- bool matched = false;
+ bool matched = false;
- State* s = start;
+ State* s = start;
if (ExtraDebug)
fprintf(stderr, "@stx: %s\n", DumpState(s).c_str());
-
- if (s->IsMatch()) {
- matched = true;
- lastmatch = p;
+
+ if (s->IsMatch()) {
+ matched = true;
+ lastmatch = p;
if (ExtraDebug)
fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str());
if (params->matches != NULL && kind_ == Prog::kManyMatch) {
@@ -1357,13 +1357,13 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
params->matches->insert(id);
}
}
- if (want_earliest_match) {
- params->ep = reinterpret_cast<const char*>(lastmatch);
- return true;
- }
- }
-
- while (p != ep) {
+ if (want_earliest_match) {
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return true;
+ }
+ }
+
+ while (p != ep) {
if (ExtraDebug)
fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str());
@@ -1375,95 +1375,95 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
if (p == NULL) {
p = ep;
break;
- }
- }
-
- int c;
- if (run_forward)
- c = *p++;
- else
- c = *--p;
-
- // Note that multiple threads might be consulting
- // s->next_[bytemap[c]] simultaneously.
- // RunStateOnByte takes care of the appropriate locking,
- // including a memory barrier so that the unlocked access
- // (sometimes known as "double-checked locking") is safe.
- // The alternative would be either one DFA per thread
- // or one mutex operation per input byte.
- //
- // ns == DeadState means the state is known to be dead
- // (no more matches are possible).
- // ns == NULL means the state has not yet been computed
- // (need to call RunStateOnByteUnlocked).
- // RunStateOnByte returns ns == NULL if it is out of memory.
- // ns == FullMatchState means the rest of the string matches.
- //
- // Okay to use bytemap[] not ByteMap() here, because
- // c is known to be an actual byte and not kByteEndText.
-
+ }
+ }
+
+ int c;
+ if (run_forward)
+ c = *p++;
+ else
+ c = *--p;
+
+ // Note that multiple threads might be consulting
+ // s->next_[bytemap[c]] simultaneously.
+ // RunStateOnByte takes care of the appropriate locking,
+ // including a memory barrier so that the unlocked access
+ // (sometimes known as "double-checked locking") is safe.
+ // The alternative would be either one DFA per thread
+ // or one mutex operation per input byte.
+ //
+ // ns == DeadState means the state is known to be dead
+ // (no more matches are possible).
+ // ns == NULL means the state has not yet been computed
+ // (need to call RunStateOnByteUnlocked).
+ // RunStateOnByte returns ns == NULL if it is out of memory.
+ // ns == FullMatchState means the rest of the string matches.
+ //
+ // Okay to use bytemap[] not ByteMap() here, because
+ // c is known to be an actual byte and not kByteEndText.
+
State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire);
- if (ns == NULL) {
- ns = RunStateOnByteUnlocked(s, c);
- if (ns == NULL) {
- // After we reset the cache, we hold cache_mutex exclusively,
- // so if resetp != NULL, it means we filled the DFA state
- // cache with this search alone (without any other threads).
- // Benchmarks show that doing a state computation on every
- // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
- // same at about 2 MB/s. Unless we're processing an average
- // of 10 bytes per state computation, fail so that RE2 can
+ if (ns == NULL) {
+ ns = RunStateOnByteUnlocked(s, c);
+ if (ns == NULL) {
+ // After we reset the cache, we hold cache_mutex exclusively,
+ // so if resetp != NULL, it means we filled the DFA state
+ // cache with this search alone (without any other threads).
+ // Benchmarks show that doing a state computation on every
+ // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
+ // same at about 2 MB/s. Unless we're processing an average
+ // of 10 bytes per state computation, fail so that RE2 can
// fall back to the NFA. However, RE2::Set cannot fall back,
// so we just have to keep on keeping on in that case.
if (dfa_should_bail_when_slow && resetp != NULL &&
static_cast<size_t>(p - resetp) < 10*state_cache_.size() &&
kind_ != Prog::kManyMatch) {
- params->failed = true;
- return false;
- }
- resetp = p;
-
- // Prepare to save start and s across the reset.
- StateSaver save_start(this, start);
- StateSaver save_s(this, s);
-
- // Discard all the States in the cache.
- ResetCache(params->cache_lock);
-
- // Restore start and s so we can continue.
- if ((start = save_start.Restore()) == NULL ||
- (s = save_s.Restore()) == NULL) {
- // Restore already did LOG(DFATAL).
- params->failed = true;
- return false;
- }
- ns = RunStateOnByteUnlocked(s, c);
- if (ns == NULL) {
- LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
- params->failed = true;
- return false;
- }
- }
- }
- if (ns <= SpecialStateMax) {
- if (ns == DeadState) {
- params->ep = reinterpret_cast<const char*>(lastmatch);
- return matched;
- }
- // FullMatchState
- params->ep = reinterpret_cast<const char*>(ep);
- return true;
- }
-
- s = ns;
- if (s->IsMatch()) {
- matched = true;
- // The DFA notices the match one byte late,
- // so adjust p before using it in the match.
- if (run_forward)
- lastmatch = p - 1;
- else
- lastmatch = p + 1;
+ params->failed = true;
+ return false;
+ }
+ resetp = p;
+
+ // Prepare to save start and s across the reset.
+ StateSaver save_start(this, start);
+ StateSaver save_s(this, s);
+
+ // Discard all the States in the cache.
+ ResetCache(params->cache_lock);
+
+ // Restore start and s so we can continue.
+ if ((start = save_start.Restore()) == NULL ||
+ (s = save_s.Restore()) == NULL) {
+ // Restore already did LOG(DFATAL).
+ params->failed = true;
+ return false;
+ }
+ ns = RunStateOnByteUnlocked(s, c);
+ if (ns == NULL) {
+ LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
+ params->failed = true;
+ return false;
+ }
+ }
+ }
+ if (ns <= SpecialStateMax) {
+ if (ns == DeadState) {
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return matched;
+ }
+ // FullMatchState
+ params->ep = reinterpret_cast<const char*>(ep);
+ return true;
+ }
+
+ s = ns;
+ if (s->IsMatch()) {
+ matched = true;
+ // The DFA notices the match one byte late,
+ // so adjust p before using it in the match.
+ if (run_forward)
+ lastmatch = p - 1;
+ else
+ lastmatch = p + 1;
if (ExtraDebug)
fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str());
if (params->matches != NULL && kind_ == Prog::kManyMatch) {
@@ -1474,63 +1474,63 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
params->matches->insert(id);
}
}
- if (want_earliest_match) {
- params->ep = reinterpret_cast<const char*>(lastmatch);
- return true;
- }
- }
- }
-
- // Process one more byte to see if it triggers a match.
- // (Remember, matches are delayed one byte.)
+ if (want_earliest_match) {
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return true;
+ }
+ }
+ }
+
+ // Process one more byte to see if it triggers a match.
+ // (Remember, matches are delayed one byte.)
if (ExtraDebug)
fprintf(stderr, "@etx: %s\n", DumpState(s).c_str());
- int lastbyte;
- if (run_forward) {
+ int lastbyte;
+ if (run_forward) {
if (EndPtr(params->text) == EndPtr(params->context))
- lastbyte = kByteEndText;
- else
+ lastbyte = kByteEndText;
+ else
lastbyte = EndPtr(params->text)[0] & 0xFF;
- } else {
+ } else {
if (BeginPtr(params->text) == BeginPtr(params->context))
- lastbyte = kByteEndText;
- else
+ lastbyte = kByteEndText;
+ else
lastbyte = BeginPtr(params->text)[-1] & 0xFF;
- }
-
+ }
+
State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire);
- if (ns == NULL) {
- ns = RunStateOnByteUnlocked(s, lastbyte);
- if (ns == NULL) {
- StateSaver save_s(this, s);
- ResetCache(params->cache_lock);
- if ((s = save_s.Restore()) == NULL) {
- params->failed = true;
- return false;
- }
- ns = RunStateOnByteUnlocked(s, lastbyte);
- if (ns == NULL) {
- LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
- params->failed = true;
- return false;
- }
- }
- }
+ if (ns == NULL) {
+ ns = RunStateOnByteUnlocked(s, lastbyte);
+ if (ns == NULL) {
+ StateSaver save_s(this, s);
+ ResetCache(params->cache_lock);
+ if ((s = save_s.Restore()) == NULL) {
+ params->failed = true;
+ return false;
+ }
+ ns = RunStateOnByteUnlocked(s, lastbyte);
+ if (ns == NULL) {
+ LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
+ params->failed = true;
+ return false;
+ }
+ }
+ }
if (ns <= SpecialStateMax) {
if (ns == DeadState) {
params->ep = reinterpret_cast<const char*>(lastmatch);
return matched;
}
// FullMatchState
- params->ep = reinterpret_cast<const char*>(ep);
- return true;
- }
+ params->ep = reinterpret_cast<const char*>(ep);
+ return true;
+ }
s = ns;
if (s->IsMatch()) {
- matched = true;
- lastmatch = p;
+ matched = true;
+ lastmatch = p;
if (ExtraDebug)
fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str());
if (params->matches != NULL && kind_ == Prog::kManyMatch) {
@@ -1541,146 +1541,146 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
params->matches->insert(id);
}
}
- }
-
- params->ep = reinterpret_cast<const char*>(lastmatch);
- return matched;
-}
-
-// Inline specializations of the general loop.
-bool DFA::SearchFFF(SearchParams* params) {
+ }
+
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return matched;
+}
+
+// Inline specializations of the general loop.
+bool DFA::SearchFFF(SearchParams* params) {
return InlinedSearchLoop<false, false, false>(params);
-}
-bool DFA::SearchFFT(SearchParams* params) {
+}
+bool DFA::SearchFFT(SearchParams* params) {
return InlinedSearchLoop<false, false, true>(params);
-}
-bool DFA::SearchFTF(SearchParams* params) {
+}
+bool DFA::SearchFTF(SearchParams* params) {
return InlinedSearchLoop<false, true, false>(params);
-}
-bool DFA::SearchFTT(SearchParams* params) {
+}
+bool DFA::SearchFTT(SearchParams* params) {
return InlinedSearchLoop<false, true, true>(params);
-}
-bool DFA::SearchTFF(SearchParams* params) {
+}
+bool DFA::SearchTFF(SearchParams* params) {
return InlinedSearchLoop<true, false, false>(params);
-}
-bool DFA::SearchTFT(SearchParams* params) {
+}
+bool DFA::SearchTFT(SearchParams* params) {
return InlinedSearchLoop<true, false, true>(params);
-}
-bool DFA::SearchTTF(SearchParams* params) {
+}
+bool DFA::SearchTTF(SearchParams* params) {
return InlinedSearchLoop<true, true, false>(params);
-}
-bool DFA::SearchTTT(SearchParams* params) {
+}
+bool DFA::SearchTTT(SearchParams* params) {
return InlinedSearchLoop<true, true, true>(params);
-}
-
-// For performance, calls the appropriate specialized version
-// of InlinedSearchLoop.
-bool DFA::FastSearchLoop(SearchParams* params) {
- // Because the methods are private, the Searches array
- // cannot be declared at top level.
- static bool (DFA::*Searches[])(SearchParams*) = {
- &DFA::SearchFFF,
- &DFA::SearchFFT,
- &DFA::SearchFTF,
- &DFA::SearchFTT,
- &DFA::SearchTFF,
- &DFA::SearchTFT,
- &DFA::SearchTTF,
- &DFA::SearchTTT,
- };
-
+}
+
+// For performance, calls the appropriate specialized version
+// of InlinedSearchLoop.
+bool DFA::FastSearchLoop(SearchParams* params) {
+ // Because the methods are private, the Searches array
+ // cannot be declared at top level.
+ static bool (DFA::*Searches[])(SearchParams*) = {
+ &DFA::SearchFFF,
+ &DFA::SearchFFT,
+ &DFA::SearchFTF,
+ &DFA::SearchFTT,
+ &DFA::SearchTFF,
+ &DFA::SearchTFT,
+ &DFA::SearchTTF,
+ &DFA::SearchTTT,
+ };
+
int index = 4 * params->can_prefix_accel +
- 2 * params->want_earliest_match +
- 1 * params->run_forward;
- return (this->*Searches[index])(params);
-}
-
-
-// The discussion of DFA execution above ignored the question of how
-// to determine the initial state for the search loop. There are two
-// factors that influence the choice of start state.
-//
-// The first factor is whether the search is anchored or not.
-// The regexp program (Prog*) itself has
-// two different entry points: one for anchored searches and one for
-// unanchored searches. (The unanchored version starts with a leading ".*?"
-// and then jumps to the anchored one.)
-//
-// The second factor is where text appears in the larger context, which
-// determines which empty-string operators can be matched at the beginning
-// of execution. If text is at the very beginning of context, \A and ^ match.
-// Otherwise if text is at the beginning of a line, then ^ matches.
-// Otherwise it matters whether the character before text is a word character
-// or a non-word character.
-//
-// The two cases (unanchored vs not) and four cases (empty-string flags)
-// combine to make the eight cases recorded in the DFA's begin_text_[2],
-// begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached
-// StartInfos. The start state for each is filled in the first time it
-// is used for an actual search.
-
-// Examines text, context, and anchored to determine the right start
-// state for the DFA search loop. Fills in params and returns true on success.
-// Returns false on failure.
-bool DFA::AnalyzeSearch(SearchParams* params) {
- const StringPiece& text = params->text;
- const StringPiece& context = params->context;
-
- // Sanity check: make sure that text lies within context.
+ 2 * params->want_earliest_match +
+ 1 * params->run_forward;
+ return (this->*Searches[index])(params);
+}
+
+
+// The discussion of DFA execution above ignored the question of how
+// to determine the initial state for the search loop. There are two
+// factors that influence the choice of start state.
+//
+// The first factor is whether the search is anchored or not.
+// The regexp program (Prog*) itself has
+// two different entry points: one for anchored searches and one for
+// unanchored searches. (The unanchored version starts with a leading ".*?"
+// and then jumps to the anchored one.)
+//
+// The second factor is where text appears in the larger context, which
+// determines which empty-string operators can be matched at the beginning
+// of execution. If text is at the very beginning of context, \A and ^ match.
+// Otherwise if text is at the beginning of a line, then ^ matches.
+// Otherwise it matters whether the character before text is a word character
+// or a non-word character.
+//
+// The two cases (unanchored vs not) and four cases (empty-string flags)
+// combine to make the eight cases recorded in the DFA's begin_text_[2],
+// begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached
+// StartInfos. The start state for each is filled in the first time it
+// is used for an actual search.
+
+// Examines text, context, and anchored to determine the right start
+// state for the DFA search loop. Fills in params and returns true on success.
+// Returns false on failure.
+bool DFA::AnalyzeSearch(SearchParams* params) {
+ const StringPiece& text = params->text;
+ const StringPiece& context = params->context;
+
+ // Sanity check: make sure that text lies within context.
if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
LOG(DFATAL) << "context does not contain text";
- params->start = DeadState;
- return true;
- }
-
- // Determine correct search type.
- int start;
+ params->start = DeadState;
+ return true;
+ }
+
+ // Determine correct search type.
+ int start;
uint32_t flags;
- if (params->run_forward) {
+ if (params->run_forward) {
if (BeginPtr(text) == BeginPtr(context)) {
- start = kStartBeginText;
- flags = kEmptyBeginText|kEmptyBeginLine;
+ start = kStartBeginText;
+ flags = kEmptyBeginText|kEmptyBeginLine;
} else if (BeginPtr(text)[-1] == '\n') {
- start = kStartBeginLine;
- flags = kEmptyBeginLine;
+ start = kStartBeginLine;
+ flags = kEmptyBeginLine;
} else if (Prog::IsWordChar(BeginPtr(text)[-1] & 0xFF)) {
- start = kStartAfterWordChar;
- flags = kFlagLastWord;
- } else {
- start = kStartAfterNonWordChar;
- flags = 0;
- }
- } else {
+ start = kStartAfterWordChar;
+ flags = kFlagLastWord;
+ } else {
+ start = kStartAfterNonWordChar;
+ flags = 0;
+ }
+ } else {
if (EndPtr(text) == EndPtr(context)) {
- start = kStartBeginText;
- flags = kEmptyBeginText|kEmptyBeginLine;
+ start = kStartBeginText;
+ flags = kEmptyBeginText|kEmptyBeginLine;
} else if (EndPtr(text)[0] == '\n') {
- start = kStartBeginLine;
- flags = kEmptyBeginLine;
+ start = kStartBeginLine;
+ flags = kEmptyBeginLine;
} else if (Prog::IsWordChar(EndPtr(text)[0] & 0xFF)) {
- start = kStartAfterWordChar;
- flags = kFlagLastWord;
- } else {
- start = kStartAfterNonWordChar;
- flags = 0;
- }
- }
+ start = kStartAfterWordChar;
+ flags = kFlagLastWord;
+ } else {
+ start = kStartAfterNonWordChar;
+ flags = 0;
+ }
+ }
if (params->anchored)
- start |= kStartAnchored;
- StartInfo* info = &start_[start];
-
- // Try once without cache_lock for writing.
- // Try again after resetting the cache
- // (ResetCache will relock cache_lock for writing).
- if (!AnalyzeSearchHelper(params, info, flags)) {
- ResetCache(params->cache_lock);
- if (!AnalyzeSearchHelper(params, info, flags)) {
- LOG(DFATAL) << "Failed to analyze start state.";
- params->failed = true;
- return false;
- }
- }
-
+ start |= kStartAnchored;
+ StartInfo* info = &start_[start];
+
+ // Try once without cache_lock for writing.
+ // Try again after resetting the cache
+ // (ResetCache will relock cache_lock for writing).
+ if (!AnalyzeSearchHelper(params, info, flags)) {
+ ResetCache(params->cache_lock);
+ if (!AnalyzeSearchHelper(params, info, flags)) {
+ LOG(DFATAL) << "Failed to analyze start state.";
+ params->failed = true;
+ return false;
+ }
+ }
+
params->start = info->start.load(std::memory_order_acquire);
// Even if we could prefix accel, we cannot do so when anchored and,
@@ -1695,99 +1695,99 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
if (ExtraDebug)
fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
- params->anchored, params->run_forward, flags,
+ params->anchored, params->run_forward, flags,
DumpState(params->start).c_str(), params->can_prefix_accel);
-
- return true;
-}
-
-// Fills in info if needed. Returns true on success, false on failure.
-bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
+
+ return true;
+}
+
+// Fills in info if needed. Returns true on success, false on failure.
+bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
uint32_t flags) {
// Quick check.
State* start = info->start.load(std::memory_order_acquire);
if (start != NULL)
- return true;
-
- MutexLock l(&mutex_);
+ return true;
+
+ MutexLock l(&mutex_);
start = info->start.load(std::memory_order_relaxed);
if (start != NULL)
- return true;
-
- q0_->clear();
- AddToQueue(q0_,
- params->anchored ? prog_->start() : prog_->start_unanchored(),
- flags);
+ return true;
+
+ q0_->clear();
+ AddToQueue(q0_,
+ params->anchored ? prog_->start() : prog_->start_unanchored(),
+ flags);
start = WorkqToCachedState(q0_, NULL, flags);
if (start == NULL)
- return false;
-
+ return false;
+
// Synchronize with "quick check" above.
info->start.store(start, std::memory_order_release);
- return true;
-}
-
-// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
-bool DFA::Search(const StringPiece& text,
- const StringPiece& context,
- bool anchored,
- bool want_earliest_match,
- bool run_forward,
- bool* failed,
- const char** epp,
+ return true;
+}
+
+// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
+bool DFA::Search(const StringPiece& text,
+ const StringPiece& context,
+ bool anchored,
+ bool want_earliest_match,
+ bool run_forward,
+ bool* failed,
+ const char** epp,
SparseSet* matches) {
- *epp = NULL;
- if (!ok()) {
- *failed = true;
- return false;
- }
- *failed = false;
-
+ *epp = NULL;
+ if (!ok()) {
+ *failed = true;
+ return false;
+ }
+ *failed = false;
+
if (ExtraDebug) {
- fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
- fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
+ fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
+ fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_);
- }
-
- RWLocker l(&cache_mutex_);
- SearchParams params(text, context, &l);
- params.anchored = anchored;
- params.want_earliest_match = want_earliest_match;
- params.run_forward = run_forward;
- params.matches = matches;
-
- if (!AnalyzeSearch(&params)) {
- *failed = true;
- return false;
- }
- if (params.start == DeadState)
+ }
+
+ RWLocker l(&cache_mutex_);
+ SearchParams params(text, context, &l);
+ params.anchored = anchored;
+ params.want_earliest_match = want_earliest_match;
+ params.run_forward = run_forward;
+ params.matches = matches;
+
+ if (!AnalyzeSearch(&params)) {
+ *failed = true;
+ return false;
+ }
+ if (params.start == DeadState)
return false;
- if (params.start == FullMatchState) {
- if (run_forward == want_earliest_match)
+ if (params.start == FullMatchState) {
+ if (run_forward == want_earliest_match)
*epp = text.data();
- else
+ else
*epp = text.data() + text.size();
- return true;
- }
+ return true;
+ }
if (ExtraDebug)
- fprintf(stderr, "start %s\n", DumpState(params.start).c_str());
- bool ret = FastSearchLoop(&params);
- if (params.failed) {
- *failed = true;
- return false;
- }
- *epp = params.ep;
- return ret;
-}
-
-DFA* Prog::GetDFA(MatchKind kind) {
- // For a forward DFA, half the memory goes to each DFA.
+ fprintf(stderr, "start %s\n", DumpState(params.start).c_str());
+ bool ret = FastSearchLoop(&params);
+ if (params.failed) {
+ *failed = true;
+ return false;
+ }
+ *epp = params.ep;
+ return ret;
+}
+
+DFA* Prog::GetDFA(MatchKind kind) {
+ // For a forward DFA, half the memory goes to each DFA.
// However, if it is a "many match" DFA, then there is
// no counterpart with which the memory must be shared.
//
- // For a reverse DFA, all the memory goes to the
- // "longest match" DFA, because RE2 never does reverse
- // "first match" searches.
+ // For a reverse DFA, all the memory goes to the
+ // "longest match" DFA, because RE2 never does reverse
+ // "first match" searches.
if (kind == kFirstMatch) {
std::call_once(dfa_first_once_, [](Prog* prog) {
prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2);
@@ -1806,55 +1806,55 @@ DFA* Prog::GetDFA(MatchKind kind) {
prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_);
}, this);
return dfa_longest_;
- }
+ }
}
-
+
void Prog::DeleteDFA(DFA* dfa) {
delete dfa;
-}
-
-// Executes the regexp program to search in text,
-// which itself is inside the larger context. (As a convenience,
-// passing a NULL context is equivalent to passing text.)
-// Returns true if a match is found, false if not.
-// If a match is found, fills in match0->end() to point at the end of the match
-// and sets match0->begin() to text.begin(), since the DFA can't track
-// where the match actually began.
-//
-// This is the only external interface (class DFA only exists in this file).
-//
-bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
+}
+
+// Executes the regexp program to search in text,
+// which itself is inside the larger context. (As a convenience,
+// passing a NULL context is equivalent to passing text.)
+// Returns true if a match is found, false if not.
+// If a match is found, fills in match0->end() to point at the end of the match
+// and sets match0->begin() to text.begin(), since the DFA can't track
+// where the match actually began.
+//
+// This is the only external interface (class DFA only exists in this file).
+//
+bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
Anchor anchor, MatchKind kind, StringPiece* match0,
bool* failed, SparseSet* matches) {
- *failed = false;
-
- StringPiece context = const_context;
+ *failed = false;
+
+ StringPiece context = const_context;
if (context.data() == NULL)
- context = text;
+ context = text;
bool caret = anchor_start();
- bool dollar = anchor_end();
- if (reversed_) {
+ bool dollar = anchor_end();
+ if (reversed_) {
using std::swap;
swap(caret, dollar);
- }
+ }
if (caret && BeginPtr(context) != BeginPtr(text))
- return false;
+ return false;
if (dollar && EndPtr(context) != EndPtr(text))
- return false;
-
- // Handle full match by running an anchored longest match
- // and then checking if it covers all of text.
- bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
- bool endmatch = false;
- if (kind == kManyMatch) {
+ return false;
+
+ // Handle full match by running an anchored longest match
+ // and then checking if it covers all of text.
+ bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
+ bool endmatch = false;
+ if (kind == kManyMatch) {
// This is split out in order to avoid clobbering kind.
- } else if (kind == kFullMatch || anchor_end()) {
- endmatch = true;
- kind = kLongestMatch;
- }
-
- // If the caller doesn't care where the match is (just whether one exists),
- // then we can stop at the very first match we find, the so-called
+ } else if (kind == kFullMatch || anchor_end()) {
+ endmatch = true;
+ kind = kLongestMatch;
+ }
+
+ // If the caller doesn't care where the match is (just whether one exists),
+ // then we can stop at the very first match we find, the so-called
// "earliest match".
bool want_earliest_match = false;
if (kind == kManyMatch) {
@@ -1864,62 +1864,62 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
}
} else if (match0 == NULL && !endmatch) {
want_earliest_match = true;
- kind = kLongestMatch;
- }
-
- DFA* dfa = GetDFA(kind);
- const char* ep;
- bool matched = dfa->Search(text, context, anchored,
+ kind = kLongestMatch;
+ }
+
+ DFA* dfa = GetDFA(kind);
+ const char* ep;
+ bool matched = dfa->Search(text, context, anchored,
want_earliest_match, !reversed_,
- failed, &ep, matches);
+ failed, &ep, matches);
if (*failed) {
hooks::GetDFASearchFailureHook()({
// Nothing yet...
});
- return false;
+ return false;
}
- if (!matched)
- return false;
+ if (!matched)
+ return false;
if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size()))
- return false;
-
- // If caller cares, record the boundary of the match.
- // We only know where it ends, so use the boundary of text
- // as the beginning.
- if (match0) {
- if (reversed_)
+ return false;
+
+ // If caller cares, record the boundary of the match.
+ // We only know where it ends, so use the boundary of text
+ // as the beginning.
+ if (match0) {
+ if (reversed_)
*match0 =
StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep));
- else
+ else
*match0 =
StringPiece(text.data(), static_cast<size_t>(ep - text.data()));
- }
- return true;
-}
-
-// Build out all states in DFA. Returns number of states.
+ }
+ return true;
+}
+
+// Build out all states in DFA. Returns number of states.
int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
- if (!ok())
- return 0;
-
- // Pick out start state for unanchored search
- // at beginning of text.
- RWLocker l(&cache_mutex_);
+ if (!ok())
+ return 0;
+
+ // Pick out start state for unanchored search
+ // at beginning of text.
+ RWLocker l(&cache_mutex_);
SearchParams params(StringPiece(), StringPiece(), &l);
- params.anchored = false;
+ params.anchored = false;
if (!AnalyzeSearch(&params) ||
params.start == NULL ||
params.start == DeadState)
- return 0;
-
- // Add start state to work queue.
+ return 0;
+
+ // Add start state to work queue.
// Note that any State* that we handle here must point into the cache,
// so we can simply depend on pointer-as-a-number hashing and equality.
std::unordered_map<State*, int> m;
std::deque<State*> q;
m.emplace(params.start, static_cast<int>(m.size()));
- q.push_back(params.start);
-
+ q.push_back(params.start);
+
// Compute the input bytes needed to cover all of the next pointers.
int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
std::vector<int> input(nnext);
@@ -1934,13 +1934,13 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
// Scratch space for the output.
std::vector<int> output(nnext);
- // Flood to expand every state.
+ // Flood to expand every state.
bool oom = false;
while (!q.empty()) {
State* s = q.front();
q.pop_front();
for (int c : input) {
- State* ns = RunStateOnByteUnlocked(s, c);
+ State* ns = RunStateOnByteUnlocked(s, c);
if (ns == NULL) {
oom = true;
break;
@@ -1951,168 +1951,168 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
}
if (m.find(ns) == m.end()) {
m.emplace(ns, static_cast<int>(m.size()));
- q.push_back(ns);
- }
+ q.push_back(ns);
+ }
output[ByteMap(c)] = m[ns];
- }
+ }
if (cb)
cb(oom ? NULL : output.data(),
s == FullMatchState || s->IsMatch());
if (oom)
break;
- }
-
+ }
+
return static_cast<int>(m.size());
-}
-
-// Build out all states in DFA for kind. Returns number of states.
+}
+
+// Build out all states in DFA for kind. Returns number of states.
int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) {
return GetDFA(kind)->BuildAllStates(cb);
-}
-
-// Computes min and max for matching string.
-// Won't return strings bigger than maxlen.
+}
+
+// Computes min and max for matching string.
+// Won't return strings bigger than maxlen.
bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
- if (!ok())
- return false;
-
- // NOTE: if future users of PossibleMatchRange want more precision when
- // presented with infinitely repeated elements, consider making this a
- // parameter to PossibleMatchRange.
- static int kMaxEltRepetitions = 0;
-
- // Keep track of the number of times we've visited states previously. We only
- // revisit a given state if it's part of a repeated group, so if the value
- // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
- // |*max| to |PrefixSuccessor(*max)|.
- //
- // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
- // tradition, implicitly insert a '0' value at first use. We take advantage
- // of that property below.
+ if (!ok())
+ return false;
+
+ // NOTE: if future users of PossibleMatchRange want more precision when
+ // presented with infinitely repeated elements, consider making this a
+ // parameter to PossibleMatchRange.
+ static int kMaxEltRepetitions = 0;
+
+ // Keep track of the number of times we've visited states previously. We only
+ // revisit a given state if it's part of a repeated group, so if the value
+ // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
+ // |*max| to |PrefixSuccessor(*max)|.
+ //
+ // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
+ // tradition, implicitly insert a '0' value at first use. We take advantage
+ // of that property below.
std::unordered_map<State*, int> previously_visited_states;
-
- // Pick out start state for anchored search at beginning of text.
- RWLocker l(&cache_mutex_);
+
+ // Pick out start state for anchored search at beginning of text.
+ RWLocker l(&cache_mutex_);
SearchParams params(StringPiece(), StringPiece(), &l);
- params.anchored = true;
- if (!AnalyzeSearch(&params))
- return false;
- if (params.start == DeadState) { // No matching strings
- *min = "";
- *max = "";
- return true;
- }
- if (params.start == FullMatchState) // Every string matches: no max
- return false;
-
- // The DFA is essentially a big graph rooted at params.start,
- // and paths in the graph correspond to accepted strings.
- // Each node in the graph has potentially 256+1 arrows
- // coming out, one for each byte plus the magic end of
- // text character kByteEndText.
-
- // To find the smallest possible prefix of an accepted
- // string, we just walk the graph preferring to follow
- // arrows with the lowest bytes possible. To find the
- // largest possible prefix, we follow the largest bytes
- // possible.
-
- // The test for whether there is an arrow from s on byte j is
- // ns = RunStateOnByteUnlocked(s, j);
- // if (ns == NULL)
- // return false;
- // if (ns != DeadState && ns->ninst > 0)
- // The RunStateOnByteUnlocked call asks the DFA to build out the graph.
- // It returns NULL only if the DFA has run out of memory,
- // in which case we can't be sure of anything.
- // The second check sees whether there was graph built
- // and whether it is interesting graph. Nodes might have
- // ns->ninst == 0 if they exist only to represent the fact
- // that a match was found on the previous byte.
-
- // Build minimum prefix.
- State* s = params.start;
- min->clear();
+ params.anchored = true;
+ if (!AnalyzeSearch(&params))
+ return false;
+ if (params.start == DeadState) { // No matching strings
+ *min = "";
+ *max = "";
+ return true;
+ }
+ if (params.start == FullMatchState) // Every string matches: no max
+ return false;
+
+ // The DFA is essentially a big graph rooted at params.start,
+ // and paths in the graph correspond to accepted strings.
+ // Each node in the graph has potentially 256+1 arrows
+ // coming out, one for each byte plus the magic end of
+ // text character kByteEndText.
+
+ // To find the smallest possible prefix of an accepted
+ // string, we just walk the graph preferring to follow
+ // arrows with the lowest bytes possible. To find the
+ // largest possible prefix, we follow the largest bytes
+ // possible.
+
+ // The test for whether there is an arrow from s on byte j is
+ // ns = RunStateOnByteUnlocked(s, j);
+ // if (ns == NULL)
+ // return false;
+ // if (ns != DeadState && ns->ninst > 0)
+ // The RunStateOnByteUnlocked call asks the DFA to build out the graph.
+ // It returns NULL only if the DFA has run out of memory,
+ // in which case we can't be sure of anything.
+ // The second check sees whether there was graph built
+ // and whether it is interesting graph. Nodes might have
+ // ns->ninst == 0 if they exist only to represent the fact
+ // that a match was found on the previous byte.
+
+ // Build minimum prefix.
+ State* s = params.start;
+ min->clear();
MutexLock lock(&mutex_);
- for (int i = 0; i < maxlen; i++) {
+ for (int i = 0; i < maxlen; i++) {
if (previously_visited_states[s] > kMaxEltRepetitions)
- break;
- previously_visited_states[s]++;
-
- // Stop if min is a match.
+ break;
+ previously_visited_states[s]++;
+
+ // Stop if min is a match.
State* ns = RunStateOnByte(s, kByteEndText);
- if (ns == NULL) // DFA out of memory
- return false;
- if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
- break;
-
- // Try to extend the string with low bytes.
- bool extended = false;
- for (int j = 0; j < 256; j++) {
+ if (ns == NULL) // DFA out of memory
+ return false;
+ if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
+ break;
+
+ // Try to extend the string with low bytes.
+ bool extended = false;
+ for (int j = 0; j < 256; j++) {
ns = RunStateOnByte(s, j);
- if (ns == NULL) // DFA out of memory
- return false;
- if (ns == FullMatchState ||
- (ns > SpecialStateMax && ns->ninst_ > 0)) {
- extended = true;
+ if (ns == NULL) // DFA out of memory
+ return false;
+ if (ns == FullMatchState ||
+ (ns > SpecialStateMax && ns->ninst_ > 0)) {
+ extended = true;
min->append(1, static_cast<char>(j));
- s = ns;
- break;
- }
- }
- if (!extended)
- break;
- }
-
- // Build maximum prefix.
- previously_visited_states.clear();
- s = params.start;
- max->clear();
- for (int i = 0; i < maxlen; i++) {
+ s = ns;
+ break;
+ }
+ }
+ if (!extended)
+ break;
+ }
+
+ // Build maximum prefix.
+ previously_visited_states.clear();
+ s = params.start;
+ max->clear();
+ for (int i = 0; i < maxlen; i++) {
if (previously_visited_states[s] > kMaxEltRepetitions)
- break;
- previously_visited_states[s] += 1;
-
- // Try to extend the string with high bytes.
- bool extended = false;
- for (int j = 255; j >= 0; j--) {
+ break;
+ previously_visited_states[s] += 1;
+
+ // Try to extend the string with high bytes.
+ bool extended = false;
+ for (int j = 255; j >= 0; j--) {
State* ns = RunStateOnByte(s, j);
- if (ns == NULL)
- return false;
- if (ns == FullMatchState ||
- (ns > SpecialStateMax && ns->ninst_ > 0)) {
- extended = true;
+ if (ns == NULL)
+ return false;
+ if (ns == FullMatchState ||
+ (ns > SpecialStateMax && ns->ninst_ > 0)) {
+ extended = true;
max->append(1, static_cast<char>(j));
- s = ns;
- break;
- }
- }
- if (!extended) {
- // Done, no need for PrefixSuccessor.
- return true;
- }
- }
-
- // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
+ s = ns;
+ break;
+ }
+ }
+ if (!extended) {
+ // Done, no need for PrefixSuccessor.
+ return true;
+ }
+ }
+
+ // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
PrefixSuccessor(max);
-
- // If there are no bytes left, we have no way to say "there is no maximum
- // string". We could make the interface more complicated and be able to
- // return "there is no maximum but here is a minimum", but that seems like
- // overkill -- the most common no-max case is all possible strings, so not
- // telling the caller that the empty string is the minimum match isn't a
- // great loss.
- if (max->empty())
- return false;
-
- return true;
-}
-
-// PossibleMatchRange for a Prog.
+
+ // If there are no bytes left, we have no way to say "there is no maximum
+ // string". We could make the interface more complicated and be able to
+ // return "there is no maximum but here is a minimum", but that seems like
+ // overkill -- the most common no-max case is all possible strings, so not
+ // telling the caller that the empty string is the minimum match isn't a
+ // great loss.
+ if (max->empty())
+ return false;
+
+ return true;
+}
+
+// PossibleMatchRange for a Prog.
bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
// Have to use dfa_longest_ to get all strings for full matches.
// For example, (a|aa) never matches aa in first-match mode.
return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen);
-}
-
-} // namespace re2
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/filtered_re2.cc b/contrib/libs/re2/re2/filtered_re2.cc
index 3de2ec8124..5df97456e2 100644
--- a/contrib/libs/re2/re2/filtered_re2.cc
+++ b/contrib/libs/re2/re2/filtered_re2.cc
@@ -1,8 +1,8 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "re2/filtered_re2.h"
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/filtered_re2.h"
#include <stddef.h>
#include <string>
@@ -10,26 +10,26 @@
#include "util/util.h"
#include "util/logging.h"
-#include "re2/prefilter.h"
-#include "re2/prefilter_tree.h"
-
-namespace re2 {
-
-FilteredRE2::FilteredRE2()
- : compiled_(false),
- prefilter_tree_(new PrefilterTree()) {
-}
-
+#include "re2/prefilter.h"
+#include "re2/prefilter_tree.h"
+
+namespace re2 {
+
+FilteredRE2::FilteredRE2()
+ : compiled_(false),
+ prefilter_tree_(new PrefilterTree()) {
+}
+
FilteredRE2::FilteredRE2(int min_atom_len)
: compiled_(false),
prefilter_tree_(new PrefilterTree(min_atom_len)) {
}
-FilteredRE2::~FilteredRE2() {
+FilteredRE2::~FilteredRE2() {
for (size_t i = 0; i < re2_vec_.size(); i++)
- delete re2_vec_[i];
-}
-
+ delete re2_vec_[i];
+}
+
FilteredRE2::FilteredRE2(FilteredRE2&& other)
: re2_vec_(std::move(other.re2_vec_)),
compiled_(other.compiled_),
@@ -46,79 +46,79 @@ FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) {
return *this;
}
-RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
- const RE2::Options& options, int* id) {
- RE2* re = new RE2(pattern, options);
- RE2::ErrorCode code = re->error_code();
-
- if (!re->ok()) {
+RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
+ const RE2::Options& options, int* id) {
+ RE2* re = new RE2(pattern, options);
+ RE2::ErrorCode code = re->error_code();
+
+ if (!re->ok()) {
if (options.log_errors()) {
LOG(ERROR) << "Couldn't compile regular expression, skipping: "
<< pattern << " due to error " << re->error();
}
- delete re;
- } else {
+ delete re;
+ } else {
*id = static_cast<int>(re2_vec_.size());
- re2_vec_.push_back(re);
- }
-
- return code;
-}
-
+ re2_vec_.push_back(re);
+ }
+
+ return code;
+}
+
void FilteredRE2::Compile(std::vector<std::string>* atoms) {
if (compiled_) {
LOG(ERROR) << "Compile called already.";
- return;
- }
-
+ return;
+ }
+
if (re2_vec_.empty()) {
LOG(ERROR) << "Compile called before Add.";
return;
}
for (size_t i = 0; i < re2_vec_.size(); i++) {
- Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
- prefilter_tree_->Add(prefilter);
- }
- atoms->clear();
- prefilter_tree_->Compile(atoms);
- compiled_ = true;
-}
-
-int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
+ Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
+ prefilter_tree_->Add(prefilter);
+ }
+ atoms->clear();
+ prefilter_tree_->Compile(atoms);
+ compiled_ = true;
+}
+
+int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
for (size_t i = 0; i < re2_vec_.size(); i++)
- if (RE2::PartialMatch(text, *re2_vec_[i]))
+ if (RE2::PartialMatch(text, *re2_vec_[i]))
return static_cast<int>(i);
- return -1;
-}
-
-int FilteredRE2::FirstMatch(const StringPiece& text,
+ return -1;
+}
+
+int FilteredRE2::FirstMatch(const StringPiece& text,
const std::vector<int>& atoms) const {
- if (!compiled_) {
+ if (!compiled_) {
LOG(DFATAL) << "FirstMatch called before Compile.";
- return -1;
- }
+ return -1;
+ }
std::vector<int> regexps;
- prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
+ prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (size_t i = 0; i < regexps.size(); i++)
- if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
- return regexps[i];
- return -1;
-}
-
-bool FilteredRE2::AllMatches(
- const StringPiece& text,
+ if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
+ return regexps[i];
+ return -1;
+}
+
+bool FilteredRE2::AllMatches(
+ const StringPiece& text,
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const {
- matching_regexps->clear();
+ matching_regexps->clear();
std::vector<int> regexps;
- prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
+ prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (size_t i = 0; i < regexps.size(); i++)
- if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
- matching_regexps->push_back(regexps[i]);
- return !matching_regexps->empty();
-}
-
+ if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
+ matching_regexps->push_back(regexps[i]);
+ return !matching_regexps->empty();
+}
+
void FilteredRE2::AllPotentials(
const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const {
@@ -127,11 +127,11 @@ void FilteredRE2::AllPotentials(
void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps) {
- prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
-}
-
-void FilteredRE2::PrintPrefilter(int regexpid) {
- prefilter_tree_->PrintPrefilter(regexpid);
-}
-
+ prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
+}
+
+void FilteredRE2::PrintPrefilter(int regexpid) {
+ prefilter_tree_->PrintPrefilter(regexpid);
+}
+
} // namespace re2
diff --git a/contrib/libs/re2/re2/filtered_re2.h b/contrib/libs/re2/re2/filtered_re2.h
index c436b2eca2..dd618c70e8 100644
--- a/contrib/libs/re2/re2/filtered_re2.h
+++ b/contrib/libs/re2/re2/filtered_re2.h
@@ -1,17 +1,17 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
-// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
-// It provides a prefilter mechanism that helps in cutting down the
-// number of regexps that need to be actually searched.
-//
-// By design, it does not include a string matching engine. This is to
+// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
+// It provides a prefilter mechanism that helps in cutting down the
+// number of regexps that need to be actually searched.
+//
+// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string matching
-// engine. The overall flow is: Add all the regexps using Add, then
+// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. Compile returns strings that need to be
// matched. Note that the returned strings are lowercased and distinct.
// For applying regexps to a search text, the caller does the string
@@ -20,23 +20,23 @@
// on a lowercased version of the search text. Then call FirstMatch
// or AllMatches with a vector of indices of strings that were found
// in the text to get the actual regexp matches.
-
+
#include <memory>
#include <string>
#include <vector>
-
+
#include "re2/re2.h"
-
-namespace re2 {
-
-class PrefilterTree;
-
-class FilteredRE2 {
- public:
- FilteredRE2();
+
+namespace re2 {
+
+class PrefilterTree;
+
+class FilteredRE2 {
+ public:
+ FilteredRE2();
explicit FilteredRE2(int min_atom_len);
- ~FilteredRE2();
-
+ ~FilteredRE2();
+
// Not copyable.
FilteredRE2(const FilteredRE2&) = delete;
FilteredRE2& operator=(const FilteredRE2&) = delete;
@@ -44,39 +44,39 @@ class FilteredRE2 {
FilteredRE2(FilteredRE2&& other);
FilteredRE2& operator=(FilteredRE2&& other);
- // Uses RE2 constructor to create a RE2 object (re). Returns
- // re->error_code(). If error_code is other than NoError, then re is
- // deleted and not added to re2_vec_.
- RE2::ErrorCode Add(const StringPiece& pattern,
- const RE2::Options& options,
+ // Uses RE2 constructor to create a RE2 object (re). Returns
+ // re->error_code(). If error_code is other than NoError, then re is
+ // deleted and not added to re2_vec_.
+ RE2::ErrorCode Add(const StringPiece& pattern,
+ const RE2::Options& options,
int* id);
-
- // Prepares the regexps added by Add for filtering. Returns a set
- // of strings that the caller should check for in candidate texts.
+
+ // Prepares the regexps added by Add for filtering. Returns a set
+ // of strings that the caller should check for in candidate texts.
// The returned strings are lowercased and distinct. When doing
// string matching, it should be performed in a case-insensitive
// way or the search text should be lowercased first. Call after
- // all Add calls are done.
+ // all Add calls are done.
void Compile(std::vector<std::string>* strings_to_match);
-
- // Returns the index of the first matching regexp.
- // Returns -1 on no match. Can be called prior to Compile.
- // Does not do any filtering: simply tries to Match the
- // regexps in a loop.
- int SlowFirstMatch(const StringPiece& text) const;
-
- // Returns the index of the first matching regexp.
- // Returns -1 on no match. Compile has to be called before
- // calling this.
- int FirstMatch(const StringPiece& text,
+
+ // Returns the index of the first matching regexp.
+ // Returns -1 on no match. Can be called prior to Compile.
+ // Does not do any filtering: simply tries to Match the
+ // regexps in a loop.
+ int SlowFirstMatch(const StringPiece& text) const;
+
+ // Returns the index of the first matching regexp.
+ // Returns -1 on no match. Compile has to be called before
+ // calling this.
+ int FirstMatch(const StringPiece& text,
const std::vector<int>& atoms) const;
-
- // Returns the indices of all matching regexps, after first clearing
- // matched_regexps.
- bool AllMatches(const StringPiece& text,
+
+ // Returns the indices of all matching regexps, after first clearing
+ // matched_regexps.
+ bool AllMatches(const StringPiece& text,
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const;
-
+
// Returns the indices of all potentially matching regexps after first
// clearing potential_regexps.
// A regexp is potentially matching if it passes the filter.
@@ -85,30 +85,30 @@ class FilteredRE2 {
void AllPotentials(const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const;
- // The number of regexps added.
+ // The number of regexps added.
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
-
+
// Get the individual RE2 objects.
const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; }
- private:
- // Print prefilter.
- void PrintPrefilter(int regexpid);
-
- // Useful for testing and debugging.
+ private:
+ // Print prefilter.
+ void PrintPrefilter(int regexpid);
+
+ // Useful for testing and debugging.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps);
-
- // All the regexps in the FilteredRE2.
+
+ // All the regexps in the FilteredRE2.
std::vector<RE2*> re2_vec_;
-
- // Has the FilteredRE2 been compiled using Compile()
- bool compiled_;
-
- // An AND-OR tree of string atoms used for filtering regexps.
+
+ // Has the FilteredRE2 been compiled using Compile()
+ bool compiled_;
+
+ // An AND-OR tree of string atoms used for filtering regexps.
std::unique_ptr<PrefilterTree> prefilter_tree_;
-};
-
-} // namespace re2
-
-#endif // RE2_FILTERED_RE2_H_
+};
+
+} // namespace re2
+
+#endif // RE2_FILTERED_RE2_H_
diff --git a/contrib/libs/re2/re2/mimics_pcre.cc b/contrib/libs/re2/re2/mimics_pcre.cc
index 7be60e4212..b1d6a51228 100644
--- a/contrib/libs/re2/re2/mimics_pcre.cc
+++ b/contrib/libs/re2/re2/mimics_pcre.cc
@@ -1,44 +1,44 @@
-// Copyright 2008 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Determine whether this library should match PCRE exactly
-// for a particular Regexp. (If so, the testing framework can
-// check that it does.)
-//
-// This library matches PCRE except in these cases:
-// * the regexp contains a repetition of an empty string,
-// like (a*)* or (a*)+. In this case, PCRE will treat
-// the repetition sequence as ending with an empty string,
-// while this library does not.
-// * Perl and PCRE differ on whether \v matches \n.
-// For historical reasons, this library implements the Perl behavior.
-// * Perl and PCRE allow $ in one-line mode to match either the very
-// end of the text or just before a \n at the end of the text.
-// This library requires it to match only the end of the text.
-// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
-// match the end of the text if the last character is a \n.
-// This library does allow it.
-//
-// Regexp::MimicsPCRE checks for any of these conditions.
-
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Determine whether this library should match PCRE exactly
+// for a particular Regexp. (If so, the testing framework can
+// check that it does.)
+//
+// This library matches PCRE except in these cases:
+// * the regexp contains a repetition of an empty string,
+// like (a*)* or (a*)+. In this case, PCRE will treat
+// the repetition sequence as ending with an empty string,
+// while this library does not.
+// * Perl and PCRE differ on whether \v matches \n.
+// For historical reasons, this library implements the Perl behavior.
+// * Perl and PCRE allow $ in one-line mode to match either the very
+// end of the text or just before a \n at the end of the text.
+// This library requires it to match only the end of the text.
+// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
+// match the end of the text if the last character is a \n.
+// This library does allow it.
+//
+// Regexp::MimicsPCRE checks for any of these conditions.
+
#include "util/util.h"
#include "util/logging.h"
-#include "re2/regexp.h"
-#include "re2/walker-inl.h"
-
-namespace re2 {
-
-// Returns whether re might match an empty string.
-static bool CanBeEmptyString(Regexp *re);
-
-// Walker class to compute whether library handles a regexp
-// exactly as PCRE would. See comment at top for conditions.
-
-class PCREWalker : public Regexp::Walker<bool> {
- public:
- PCREWalker() {}
-
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Returns whether re might match an empty string.
+static bool CanBeEmptyString(Regexp *re);
+
+// Walker class to compute whether library handles a regexp
+// exactly as PCRE would. See comment at top for conditions.
+
+class PCREWalker : public Regexp::Walker<bool> {
+ public:
+ PCREWalker() {}
+
virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
@@ -47,151 +47,151 @@ class PCREWalker : public Regexp::Walker<bool> {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "PCREWalker::ShortVisit called";
#endif
- return a;
- }
+ return a;
+ }
private:
PCREWalker(const PCREWalker&) = delete;
PCREWalker& operator=(const PCREWalker&) = delete;
-};
-
-// Called after visiting each of re's children and accumulating
-// the return values in child_args. So child_args contains whether
-// this library mimics PCRE for those subexpressions.
-bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
- bool* child_args, int nchild_args) {
- // If children failed, so do we.
- for (int i = 0; i < nchild_args; i++)
- if (!child_args[i])
- return false;
-
- // Otherwise look for other reasons to fail.
- switch (re->op()) {
- // Look for repeated empty string.
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- if (CanBeEmptyString(re->sub()[0]))
- return false;
- break;
- case kRegexpRepeat:
- if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
- return false;
- break;
-
- // Look for \v
- case kRegexpLiteral:
- if (re->rune() == '\v')
- return false;
- break;
-
- // Look for $ in single-line mode.
- case kRegexpEndText:
- case kRegexpEmptyMatch:
- if (re->parse_flags() & Regexp::WasDollar)
- return false;
- break;
-
- // Look for ^ in multi-line mode.
- case kRegexpBeginLine:
- // No condition: in single-line mode ^ becomes kRegexpBeginText.
- return false;
-
- default:
- break;
- }
-
- // Not proven guilty.
- return true;
-}
-
-// Returns whether this regexp's behavior will mimic PCRE's exactly.
-bool Regexp::MimicsPCRE() {
- PCREWalker w;
- return w.Walk(this, true);
-}
-
-
-// Walker class to compute whether a Regexp can match an empty string.
-// It is okay to overestimate. For example, \b\B cannot match an empty
-// string, because \b and \B are mutually exclusive, but this isn't
-// that smart and will say it can. Spurious empty strings
-// will reduce the number of regexps we sanity check against PCRE,
-// but they won't break anything.
-
-class EmptyStringWalker : public Regexp::Walker<bool> {
- public:
+};
+
+// Called after visiting each of re's children and accumulating
+// the return values in child_args. So child_args contains whether
+// this library mimics PCRE for those subexpressions.
+bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args) {
+ // If children failed, so do we.
+ for (int i = 0; i < nchild_args; i++)
+ if (!child_args[i])
+ return false;
+
+ // Otherwise look for other reasons to fail.
+ switch (re->op()) {
+ // Look for repeated empty string.
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ if (CanBeEmptyString(re->sub()[0]))
+ return false;
+ break;
+ case kRegexpRepeat:
+ if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
+ return false;
+ break;
+
+ // Look for \v
+ case kRegexpLiteral:
+ if (re->rune() == '\v')
+ return false;
+ break;
+
+ // Look for $ in single-line mode.
+ case kRegexpEndText:
+ case kRegexpEmptyMatch:
+ if (re->parse_flags() & Regexp::WasDollar)
+ return false;
+ break;
+
+ // Look for ^ in multi-line mode.
+ case kRegexpBeginLine:
+ // No condition: in single-line mode ^ becomes kRegexpBeginText.
+ return false;
+
+ default:
+ break;
+ }
+
+ // Not proven guilty.
+ return true;
+}
+
+// Returns whether this regexp's behavior will mimic PCRE's exactly.
+bool Regexp::MimicsPCRE() {
+ PCREWalker w;
+ return w.Walk(this, true);
+}
+
+
+// Walker class to compute whether a Regexp can match an empty string.
+// It is okay to overestimate. For example, \b\B cannot match an empty
+// string, because \b and \B are mutually exclusive, but this isn't
+// that smart and will say it can. Spurious empty strings
+// will reduce the number of regexps we sanity check against PCRE,
+// but they won't break anything.
+
+class EmptyStringWalker : public Regexp::Walker<bool> {
+ public:
EmptyStringWalker() {}
-
+
virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
virtual bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
- LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
+ LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
#endif
- return a;
- }
-
- private:
+ return a;
+ }
+
+ private:
EmptyStringWalker(const EmptyStringWalker&) = delete;
EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
-};
-
-// Called after visiting re's children. child_args contains the return
-// value from each of the children's PostVisits (i.e., whether each child
-// can match an empty string). Returns whether this clause can match an
-// empty string.
-bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
- bool* child_args, int nchild_args) {
- switch (re->op()) {
- case kRegexpNoMatch: // never empty
- case kRegexpLiteral:
- case kRegexpAnyChar:
- case kRegexpAnyByte:
- case kRegexpCharClass:
- case kRegexpLiteralString:
- return false;
-
- case kRegexpEmptyMatch: // always empty
- case kRegexpBeginLine: // always empty, when they match
- case kRegexpEndLine:
- case kRegexpNoWordBoundary:
- case kRegexpWordBoundary:
- case kRegexpBeginText:
- case kRegexpEndText:
- case kRegexpStar: // can always be empty
- case kRegexpQuest:
- case kRegexpHaveMatch:
- return true;
-
- case kRegexpConcat: // can be empty if all children can
- for (int i = 0; i < nchild_args; i++)
- if (!child_args[i])
- return false;
- return true;
-
- case kRegexpAlternate: // can be empty if any child can
- for (int i = 0; i < nchild_args; i++)
- if (child_args[i])
- return true;
- return false;
-
- case kRegexpPlus: // can be empty if the child can
- case kRegexpCapture:
- return child_args[0];
-
- case kRegexpRepeat: // can be empty if child can or is x{0}
- return child_args[0] || re->min() == 0;
- }
- return false;
-}
-
-// Returns whether re can match an empty string.
-static bool CanBeEmptyString(Regexp* re) {
- EmptyStringWalker w;
- return w.Walk(re, true);
-}
-
-} // namespace re2
+};
+
+// Called after visiting re's children. child_args contains the return
+// value from each of the children's PostVisits (i.e., whether each child
+// can match an empty string). Returns whether this clause can match an
+// empty string.
+bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+ bool* child_args, int nchild_args) {
+ switch (re->op()) {
+ case kRegexpNoMatch: // never empty
+ case kRegexpLiteral:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpCharClass:
+ case kRegexpLiteralString:
+ return false;
+
+ case kRegexpEmptyMatch: // always empty
+ case kRegexpBeginLine: // always empty, when they match
+ case kRegexpEndLine:
+ case kRegexpNoWordBoundary:
+ case kRegexpWordBoundary:
+ case kRegexpBeginText:
+ case kRegexpEndText:
+ case kRegexpStar: // can always be empty
+ case kRegexpQuest:
+ case kRegexpHaveMatch:
+ return true;
+
+ case kRegexpConcat: // can be empty if all children can
+ for (int i = 0; i < nchild_args; i++)
+ if (!child_args[i])
+ return false;
+ return true;
+
+ case kRegexpAlternate: // can be empty if any child can
+ for (int i = 0; i < nchild_args; i++)
+ if (child_args[i])
+ return true;
+ return false;
+
+ case kRegexpPlus: // can be empty if the child can
+ case kRegexpCapture:
+ return child_args[0];
+
+ case kRegexpRepeat: // can be empty if child can or is x{0}
+ return child_args[0] || re->min() == 0;
+ }
+ return false;
+}
+
+// Returns whether re can match an empty string.
+static bool CanBeEmptyString(Regexp* re) {
+ EmptyStringWalker w;
+ return w.Walk(re, true);
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/nfa.cc b/contrib/libs/re2/re2/nfa.cc
index 3c0ed1f60e..c7339f8ffd 100644
--- a/contrib/libs/re2/re2/nfa.cc
+++ b/contrib/libs/re2/re2/nfa.cc
@@ -1,29 +1,29 @@
-// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Tested by search_test.cc.
-//
-// Prog::SearchNFA, an NFA search.
-// This is an actual NFA like the theorists talk about,
-// not the pseudo-NFA found in backtracking regexp implementations.
-//
-// IMPLEMENTATION
-//
-// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
-// which is a variant of the one described in Thompson's 1968 CACM paper.
-// See http://swtch.com/~rsc/regexp/ for various history. The main feature
-// over the DFA implementation is that it tracks submatch boundaries.
-//
-// When the choice of submatch boundaries is ambiguous, this particular
-// implementation makes the same choices that traditional backtracking
-// implementations (in particular, Perl and PCRE) do.
-// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
-// time in the length of the input.
-//
-// Like Thompson's original machine and like the DFA implementation, this
-// implementation notices a match only once it is one byte past it.
-
+// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc.
+//
+// Prog::SearchNFA, an NFA search.
+// This is an actual NFA like the theorists talk about,
+// not the pseudo-NFA found in backtracking regexp implementations.
+//
+// IMPLEMENTATION
+//
+// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
+// which is a variant of the one described in Thompson's 1968 CACM paper.
+// See http://swtch.com/~rsc/regexp/ for various history. The main feature
+// over the DFA implementation is that it tracks submatch boundaries.
+//
+// When the choice of submatch boundaries is ambiguous, this particular
+// implementation makes the same choices that traditional backtracking
+// implementations (in particular, Perl and PCRE) do.
+// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
+// time in the length of the input.
+//
+// Like Thompson's original machine and like the DFA implementation, this
+// implementation notices a match only once it is one byte past it.
+
#include <stdio.h>
#include <string.h>
#include <algorithm>
@@ -35,68 +35,68 @@
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/pod_array.h"
-#include "re2/prog.h"
-#include "re2/regexp.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"
-
-namespace re2 {
-
+
+namespace re2 {
+
static const bool ExtraDebug = false;
-class NFA {
- public:
- NFA(Prog* prog);
- ~NFA();
-
- // Searches for a matching string.
- // * If anchored is true, only considers matches starting at offset.
- // Otherwise finds lefmost match at or after offset.
- // * If longest is true, returns the longest match starting
- // at the chosen start point. Otherwise returns the so-called
- // left-biased match, the one traditional backtracking engines
- // (like Perl and PCRE) find.
- // Records submatch boundaries in submatch[1..nsubmatch-1].
- // Submatch[0] is the entire match. When there is a choice in
- // which text matches each subexpression, the submatch boundaries
- // are chosen to match what a backtracking implementation would choose.
- bool Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch);
-
- private:
- struct Thread {
- union {
+class NFA {
+ public:
+ NFA(Prog* prog);
+ ~NFA();
+
+ // Searches for a matching string.
+ // * If anchored is true, only considers matches starting at offset.
+ // Otherwise finds lefmost match at or after offset.
+ // * If longest is true, returns the longest match starting
+ // at the chosen start point. Otherwise returns the so-called
+ // left-biased match, the one traditional backtracking engines
+ // (like Perl and PCRE) find.
+ // Records submatch boundaries in submatch[1..nsubmatch-1].
+ // Submatch[0] is the entire match. When there is a choice in
+ // which text matches each subexpression, the submatch boundaries
+ // are chosen to match what a backtracking implementation would choose.
+ bool Search(const StringPiece& text, const StringPiece& context,
+ bool anchored, bool longest,
+ StringPiece* submatch, int nsubmatch);
+
+ private:
+ struct Thread {
+ union {
int ref;
- Thread* next; // when on free list
- };
- const char** capture;
- };
-
- // State for explicit stack in AddToThreadq.
- struct AddState {
+ Thread* next; // when on free list
+ };
+ const char** capture;
+ };
+
+ // State for explicit stack in AddToThreadq.
+ struct AddState {
int id; // Inst to process
Thread* t; // if not null, set t0 = t before processing id
- };
-
- // Threadq is a list of threads. The list is sorted by the order
- // in which Perl would explore that particular state -- the earlier
- // choices appear earlier in the list.
- typedef SparseArray<Thread*> Threadq;
-
- inline Thread* AllocThread();
+ };
+
+ // Threadq is a list of threads. The list is sorted by the order
+ // in which Perl would explore that particular state -- the earlier
+ // choices appear earlier in the list.
+ typedef SparseArray<Thread*> Threadq;
+
+ inline Thread* AllocThread();
inline Thread* Incref(Thread* t);
inline void Decref(Thread* t);
-
+
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0);
-
- // Run runq on byte c, appending new states to nextq.
- // Updates matched_ and match_ as new, better matches are found.
+
+ // Run runq on byte c, appending new states to nextq.
+ // Updates matched_ and match_ as new, better matches are found.
// context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
@@ -104,14 +104,14 @@ class NFA {
// If there is a shortcut to the end, returns that shortcut.
int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
const char* p);
-
- // Returns text version of capture information, for debugging.
+
+ // Returns text version of capture information, for debugging.
std::string FormatCapture(const char** capture);
-
+
void CopyCapture(const char** dst, const char** src) {
memmove(dst, src, ncapture_*sizeof src[0]);
}
-
+
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
@@ -125,53 +125,53 @@ class NFA {
Thread* freelist_; // thread freelist
const char** match_; // best match so far
bool matched_; // any match so far?
-
+
NFA(const NFA&) = delete;
NFA& operator=(const NFA&) = delete;
-};
-
-NFA::NFA(Prog* prog) {
- prog_ = prog;
+};
+
+NFA::NFA(Prog* prog) {
+ prog_ = prog;
start_ = prog_->start();
- ncapture_ = 0;
- longest_ = false;
- endmatch_ = false;
- btext_ = NULL;
- etext_ = NULL;
- q0_.resize(prog_->size());
- q1_.resize(prog_->size());
+ ncapture_ = 0;
+ longest_ = false;
+ endmatch_ = false;
+ btext_ = NULL;
+ etext_ = NULL;
+ q0_.resize(prog_->size());
+ q1_.resize(prog_->size());
// See NFA::AddToThreadq() for why this is so.
int nstack = 2*prog_->inst_count(kInstCapture) +
prog_->inst_count(kInstEmptyWidth) +
prog_->inst_count(kInstNop) + 1; // + 1 for start inst
stack_ = PODArray<AddState>(nstack);
freelist_ = NULL;
- match_ = NULL;
- matched_ = false;
-}
-
-NFA::~NFA() {
- delete[] match_;
+ match_ = NULL;
+ matched_ = false;
+}
+
+NFA::~NFA() {
+ delete[] match_;
for (const Thread& t : arena_)
delete[] t.capture;
-}
-
-NFA::Thread* NFA::AllocThread() {
+}
+
+NFA::Thread* NFA::AllocThread() {
Thread* t = freelist_;
if (t != NULL) {
freelist_ = t->next;
t->ref = 1;
// We don't need to touch t->capture because
// the caller will immediately overwrite it.
- return t;
- }
+ return t;
+ }
arena_.emplace_back();
t = &arena_.back();
t->ref = 1;
t->capture = new const char*[ncapture_];
- return t;
-}
-
+ return t;
+}
+
NFA::Thread* NFA::Incref(Thread* t) {
DCHECK(t != NULL);
t->ref++;
@@ -194,9 +194,9 @@ void NFA::Decref(Thread* t) {
// p is the current input position, and t0 is the current thread.
void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0) {
- if (id0 == 0)
- return;
-
+ if (id0 == 0)
+ return;
+
// Use stack_ to hold our stack of instructions yet to process.
// It was preallocated as follows:
// two entries per Capture;
@@ -206,12 +206,12 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
// perform. (Each instruction can be processed at most once.)
AddState* stk = stack_.data();
int nstk = 0;
-
+
stk[nstk++] = {id0, NULL};
- while (nstk > 0) {
+ while (nstk > 0) {
DCHECK_LE(nstk, stack_.size());
AddState a = stk[--nstk];
-
+
Loop:
if (a.t != NULL) {
// t0 was a thread that we allocated and copied in order to
@@ -220,76 +220,76 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
t0 = a.t;
}
- int id = a.id;
- if (id == 0)
- continue;
- if (q->has_index(id)) {
+ int id = a.id;
+ if (id == 0)
+ continue;
+ if (q->has_index(id)) {
if (ExtraDebug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
- continue;
- }
-
- // Create entry in q no matter what. We might fill it in below,
- // or we might not. Even if not, it is necessary to have it,
+ continue;
+ }
+
+ // Create entry in q no matter what. We might fill it in below,
+ // or we might not. Even if not, it is necessary to have it,
// so that we don't revisit id0 during the recursion.
- q->set_new(id, NULL);
+ q->set_new(id, NULL);
Thread** tp = &q->get_existing(id);
- int j;
- Thread* t;
- Prog::Inst* ip = prog_->inst(id);
- switch (ip->opcode()) {
- default:
- LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
- break;
-
- case kInstFail:
- break;
-
- case kInstAltMatch:
- // Save state; will pick up at next byte.
+ int j;
+ Thread* t;
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
+ break;
+
+ case kInstFail:
+ break;
+
+ case kInstAltMatch:
+ // Save state; will pick up at next byte.
t = Incref(t0);
- *tp = t;
-
+ *tp = t;
+
DCHECK(!ip->last());
a = {id+1, NULL};
goto Loop;
-
- case kInstNop:
+
+ case kInstNop:
if (!ip->last())
stk[nstk++] = {id+1, NULL};
- // Continue on.
+ // Continue on.
a = {ip->out(), NULL};
goto Loop;
-
- case kInstCapture:
+
+ case kInstCapture:
if (!ip->last())
stk[nstk++] = {id+1, NULL};
- if ((j=ip->cap()) < ncapture_) {
+ if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore t0
- // once we finish exploring this possibility.
+ // once we finish exploring this possibility.
stk[nstk++] = {0, t0};
-
- // Record capture.
+
+ // Record capture.
t = AllocThread();
CopyCapture(t->capture, t0->capture);
t->capture[j] = p;
t0 = t;
- }
+ }
a = {ip->out(), NULL};
goto Loop;
-
+
case kInstByteRange:
if (!ip->Matches(c))
goto Next;
- // Save state; will pick up at next byte.
+ // Save state; will pick up at next byte.
t = Incref(t0);
- *tp = t;
+ *tp = t;
if (ExtraDebug)
fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
-
+
if (ip->hint() == 0)
break;
a = {id+ip->hint(), NULL};
@@ -308,61 +308,61 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
a = {id+1, NULL};
goto Loop;
- case kInstEmptyWidth:
+ case kInstEmptyWidth:
if (!ip->last())
stk[nstk++] = {id+1, NULL};
- // Continue on if we have all the right flag bits.
+ // Continue on if we have all the right flag bits.
if (ip->empty() & ~Prog::EmptyFlags(context, p))
- break;
+ break;
a = {ip->out(), NULL};
goto Loop;
- }
- }
-}
-
-// Run runq on byte c, appending new states to nextq.
+ }
+ }
+}
+
+// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
-// Frees all the threads on runq.
-// If there is a shortcut to the end, returns that shortcut.
+// Frees all the threads on runq.
+// If there is a shortcut to the end, returns that shortcut.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
const char* p) {
- nextq->clear();
-
- for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+ nextq->clear();
+
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->value();
- if (t == NULL)
- continue;
-
- if (longest_) {
- // Can skip any threads started after our current best match.
- if (matched_ && match_[0] < t->capture[0]) {
+ if (t == NULL)
+ continue;
+
+ if (longest_) {
+ // Can skip any threads started after our current best match.
+ if (matched_ && match_[0] < t->capture[0]) {
Decref(t);
- continue;
- }
- }
-
+ continue;
+ }
+ }
+
int id = i->index();
- Prog::Inst* ip = prog_->inst(id);
-
- switch (ip->opcode()) {
- default:
- // Should only see the values handled below.
- LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
- break;
-
- case kInstByteRange:
+ Prog::Inst* ip = prog_->inst(id);
+
+ switch (ip->opcode()) {
+ default:
+ // Should only see the values handled below.
+ LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
+ break;
+
+ case kInstByteRange:
AddToThreadq(nextq, ip->out(), c, context, p, t);
- break;
-
- case kInstAltMatch:
- if (i != runq->begin())
- break;
- // The match is ours if we want it.
- if (ip->greedy(prog_) || longest_) {
+ break;
+
+ case kInstAltMatch:
+ if (i != runq->begin())
+ break;
+ // The match is ours if we want it.
+ if (ip->greedy(prog_) || longest_) {
CopyCapture(match_, t->capture);
matched_ = true;
@@ -371,13 +371,13 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
if (i->value() != NULL)
Decref(i->value());
}
- runq->clear();
- if (ip->greedy(prog_))
- return ip->out1();
- return ip->out();
- }
- break;
-
+ runq->clear();
+ if (ip->greedy(prog_))
+ return ip->out1();
+ return ip->out();
+ }
+ break;
+
case kInstMatch: {
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by storing p instead of p-1. (What would the latter even mean?!)
@@ -386,127 +386,127 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
CopyCapture(match_, t->capture);
match_[1] = p;
matched_ = true;
- break;
+ break;
}
-
+
if (endmatch_ && p-1 != etext_)
break;
- if (longest_) {
- // Leftmost-longest mode: save this match only if
- // it is either farther to the left or at the same
- // point but longer than an existing match.
- if (!matched_ || t->capture[0] < match_[0] ||
+ if (longest_) {
+ // Leftmost-longest mode: save this match only if
+ // it is either farther to the left or at the same
+ // point but longer than an existing match.
+ if (!matched_ || t->capture[0] < match_[0] ||
(t->capture[0] == match_[0] && p-1 > match_[1])) {
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
}
- } else {
- // Leftmost-biased mode: this match is by definition
- // better than what we've already found (see next line).
+ } else {
+ // Leftmost-biased mode: this match is by definition
+ // better than what we've already found (see next line).
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
-
- // Cut off the threads that can only find matches
- // worse than the one we just found: don't run the
- // rest of the current Threadq.
+
+ // Cut off the threads that can only find matches
+ // worse than the one we just found: don't run the
+ // rest of the current Threadq.
Decref(t);
for (++i; i != runq->end(); ++i) {
if (i->value() != NULL)
Decref(i->value());
}
- runq->clear();
- return 0;
- }
- break;
+ runq->clear();
+ return 0;
+ }
+ break;
}
- }
+ }
Decref(t);
- }
- runq->clear();
- return 0;
-}
-
+ }
+ runq->clear();
+ return 0;
+}
+
std::string NFA::FormatCapture(const char** capture) {
std::string s;
- for (int i = 0; i < ncapture_; i+=2) {
- if (capture[i] == NULL)
+ for (int i = 0; i < ncapture_; i+=2) {
+ if (capture[i] == NULL)
s += "(?,?)";
- else if (capture[i+1] == NULL)
+ else if (capture[i+1] == NULL)
s += StringPrintf("(%td,?)",
capture[i] - btext_);
- else
+ else
s += StringPrintf("(%td,%td)",
capture[i] - btext_,
capture[i+1] - btext_);
- }
- return s;
-}
-
-bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch) {
- if (start_ == 0)
- return false;
-
- StringPiece context = const_context;
+ }
+ return s;
+}
+
+bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
+ bool anchored, bool longest,
+ StringPiece* submatch, int nsubmatch) {
+ if (start_ == 0)
+ return false;
+
+ StringPiece context = const_context;
if (context.data() == NULL)
- context = text;
-
+ context = text;
+
// Sanity check: make sure that text lies within context.
if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
LOG(DFATAL) << "context does not contain text";
- return false;
- }
-
+ return false;
+ }
+
if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text))
- return false;
+ return false;
if (prog_->anchor_end() && EndPtr(context) != EndPtr(text))
- return false;
- anchored |= prog_->anchor_start();
- if (prog_->anchor_end()) {
- longest = true;
- endmatch_ = true;
- }
-
- if (nsubmatch < 0) {
- LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
- return false;
- }
-
- // Save search parameters.
- ncapture_ = 2*nsubmatch;
- longest_ = longest;
-
- if (nsubmatch == 0) {
- // We need to maintain match[0], both to distinguish the
- // longest match (if longest is true) and also to tell
- // whether we've seen any matches at all.
- ncapture_ = 2;
- }
-
- match_ = new const char*[ncapture_];
+ return false;
+ anchored |= prog_->anchor_start();
+ if (prog_->anchor_end()) {
+ longest = true;
+ endmatch_ = true;
+ }
+
+ if (nsubmatch < 0) {
+ LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
+ return false;
+ }
+
+ // Save search parameters.
+ ncapture_ = 2*nsubmatch;
+ longest_ = longest;
+
+ if (nsubmatch == 0) {
+ // We need to maintain match[0], both to distinguish the
+ // longest match (if longest is true) and also to tell
+ // whether we've seen any matches at all.
+ ncapture_ = 2;
+ }
+
+ match_ = new const char*[ncapture_];
memset(match_, 0, ncapture_*sizeof match_[0]);
- matched_ = false;
-
- // For debugging prints.
+ matched_ = false;
+
+ // For debugging prints.
btext_ = context.data();
// For convenience.
etext_ = text.data() + text.size();
-
+
if (ExtraDebug)
- fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
+ fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
std::string(text).c_str(), std::string(context).c_str(), anchored, longest);
-
- // Set up search.
- Threadq* runq = &q0_;
- Threadq* nextq = &q1_;
- runq->clear();
- nextq->clear();
-
- // Loop over the text, stepping the machine.
+
+ // Set up search.
+ Threadq* runq = &q0_;
+ Threadq* nextq = &q1_;
+ runq->clear();
+ nextq->clear();
+
+ // Loop over the text, stepping the machine.
for (const char* p = text.data();; p++) {
if (ExtraDebug) {
int c = 0;
@@ -518,58 +518,58 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
c = p[0] & 0xFF;
fprintf(stderr, "%c:", c);
- for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->value();
- if (t == NULL)
- continue;
+ if (t == NULL)
+ continue;
fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
- }
- fprintf(stderr, "\n");
- }
-
+ }
+ fprintf(stderr, "\n");
+ }
+
// This is a no-op the first time around the loop because runq is empty.
int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
- DCHECK_EQ(runq->size(), 0);
+ DCHECK_EQ(runq->size(), 0);
using std::swap;
- swap(nextq, runq);
- nextq->clear();
- if (id != 0) {
- // We're done: full match ahead.
+ swap(nextq, runq);
+ nextq->clear();
+ if (id != 0) {
+ // We're done: full match ahead.
p = etext_;
- for (;;) {
- Prog::Inst* ip = prog_->inst(id);
- switch (ip->opcode()) {
- default:
- LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
- break;
-
- case kInstCapture:
+ for (;;) {
+ Prog::Inst* ip = prog_->inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
+ break;
+
+ case kInstCapture:
if (ip->cap() < ncapture_)
match_[ip->cap()] = p;
- id = ip->out();
- continue;
-
- case kInstNop:
- id = ip->out();
- continue;
-
- case kInstMatch:
- match_[1] = p;
- matched_ = true;
- break;
- }
- break;
- }
- break;
- }
-
+ id = ip->out();
+ continue;
+
+ case kInstNop:
+ id = ip->out();
+ continue;
+
+ case kInstMatch:
+ match_[1] = p;
+ matched_ = true;
+ break;
+ }
+ break;
+ }
+ break;
+ }
+
if (p > etext_)
- break;
-
- // Start a new thread if there have not been any matches.
- // (No point in starting a new thread if there have been
- // matches, since it would be to the right of the match
- // we already found.)
+ break;
+
+ // Start a new thread if there have not been any matches.
+ // (No point in starting a new thread if there have been
+ // matches, since it would be to the right of the match
+ // we already found.)
if (!matched_ && (!anchored || p == text.data())) {
// Try to use prefix accel (e.g. memchr) to skip ahead.
// The search must be unanchored and there must be zero
@@ -579,23 +579,23 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
if (p == NULL)
p = etext_;
- }
-
+ }
+
Thread* t = AllocThread();
CopyCapture(t->capture, match_);
t->capture[0] = p;
AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
t);
Decref(t);
- }
-
- // If all the threads have died, stop early.
- if (runq->size() == 0) {
+ }
+
+ // If all the threads have died, stop early.
+ if (runq->size() == 0) {
if (ExtraDebug)
- fprintf(stderr, "dead\n");
- break;
- }
-
+ fprintf(stderr, "dead\n");
+ break;
+ }
+
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by simply not continuing the loop.
// This complements the special case in NFA::Step().
@@ -607,15 +607,15 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
nextq->clear();
break;
}
- }
-
+ }
+
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
if (i->value() != NULL)
Decref(i->value());
}
-
- if (matched_) {
- for (int i = 0; i < nsubmatch; i++)
+
+ if (matched_) {
+ for (int i = 0; i < nsubmatch; i++)
submatch[i] =
StringPiece(match_[2 * i],
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
@@ -623,34 +623,34 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
fprintf(stderr, "match (%td,%td)\n",
match_[0] - btext_,
match_[1] - btext_);
- return true;
- }
- return false;
-}
-
-bool
-Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch) {
+ return true;
+ }
+ return false;
+}
+
+bool
+Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch) {
if (ExtraDebug)
- Dump();
-
- NFA nfa(this);
- StringPiece sp;
- if (kind == kFullMatch) {
- anchor = kAnchored;
- if (nmatch == 0) {
- match = &sp;
- nmatch = 1;
- }
- }
- if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
- return false;
+ Dump();
+
+ NFA nfa(this);
+ StringPiece sp;
+ if (kind == kFullMatch) {
+ anchor = kAnchored;
+ if (nmatch == 0) {
+ match = &sp;
+ nmatch = 1;
+ }
+ }
+ if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
+ return false;
if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
- return false;
- return true;
-}
-
+ return false;
+ return true;
+}
+
// For each instruction i in the program reachable from the start, compute the
// number of instructions reachable from i by following only empty transitions
// and record that count as fanout[i].
@@ -710,4 +710,4 @@ void Prog::Fanout(SparseArray<int>* fanout) {
}
}
-} // namespace re2
+} // namespace re2
diff --git a/contrib/libs/re2/re2/onepass.cc b/contrib/libs/re2/re2/onepass.cc
index ff53b54e59..263974654d 100644
--- a/contrib/libs/re2/re2/onepass.cc
+++ b/contrib/libs/re2/re2/onepass.cc
@@ -1,59 +1,59 @@
-// Copyright 2008 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Tested by search_test.cc.
-//
-// Prog::SearchOnePass is an efficient implementation of
-// regular expression search with submatch tracking for
-// what I call "one-pass regular expressions". (An alternate
-// name might be "backtracking-free regular expressions".)
-//
-// One-pass regular expressions have the property that
-// at each input byte during an anchored match, there may be
-// multiple alternatives but only one can proceed for any
-// given input byte.
-//
-// For example, the regexp /x*yx*/ is one-pass: you read
-// x's until a y, then you read the y, then you keep reading x's.
-// At no point do you have to guess what to do or back up
-// and try a different guess.
-//
-// On the other hand, /x*x/ is not one-pass: when you're
-// looking at an input "x", it's not clear whether you should
-// use it to extend the x* or as the final x.
-//
-// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
-// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
-//
-// A simple intuition for identifying one-pass regular expressions
-// is that it's always immediately obvious when a repetition ends.
-// It must also be immediately obvious which branch of an | to take:
-//
-// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
-//
-// The NFA-based search in nfa.cc does some bookkeeping to
-// avoid the need for backtracking and its associated exponential blowup.
-// But if we have a one-pass regular expression, there is no
-// possibility of backtracking, so there is no need for the
-// extra bookkeeping. Hence, this code.
-//
-// On a one-pass regular expression, the NFA code in nfa.cc
-// runs at about 1/20 of the backtracking-based PCRE speed.
-// In contrast, the code in this file runs at about the same
-// speed as PCRE.
-//
-// One-pass regular expressions get used a lot when RE is
-// used for parsing simple strings, so it pays off to
-// notice them and handle them efficiently.
-//
-// See also Anne Brüggemann-Klein and Derick Wood,
-// "One-unambiguous regular languages", Information and Computation 142(2).
-
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc.
+//
+// Prog::SearchOnePass is an efficient implementation of
+// regular expression search with submatch tracking for
+// what I call "one-pass regular expressions". (An alternate
+// name might be "backtracking-free regular expressions".)
+//
+// One-pass regular expressions have the property that
+// at each input byte during an anchored match, there may be
+// multiple alternatives but only one can proceed for any
+// given input byte.
+//
+// For example, the regexp /x*yx*/ is one-pass: you read
+// x's until a y, then you read the y, then you keep reading x's.
+// At no point do you have to guess what to do or back up
+// and try a different guess.
+//
+// On the other hand, /x*x/ is not one-pass: when you're
+// looking at an input "x", it's not clear whether you should
+// use it to extend the x* or as the final x.
+//
+// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
+// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
+//
+// A simple intuition for identifying one-pass regular expressions
+// is that it's always immediately obvious when a repetition ends.
+// It must also be immediately obvious which branch of an | to take:
+//
+// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
+//
+// The NFA-based search in nfa.cc does some bookkeeping to
+// avoid the need for backtracking and its associated exponential blowup.
+// But if we have a one-pass regular expression, there is no
+// possibility of backtracking, so there is no need for the
+// extra bookkeeping. Hence, this code.
+//
+// On a one-pass regular expression, the NFA code in nfa.cc
+// runs at about 1/20 of the backtracking-based PCRE speed.
+// In contrast, the code in this file runs at about the same
+// speed as PCRE.
+//
+// One-pass regular expressions get used a lot when RE is
+// used for parsing simple strings, so it pays off to
+// notice them and handle them efficiently.
+//
+// See also Anne Brüggemann-Klein and Derick Wood,
+// "One-unambiguous regular languages", Information and Computation 142(2).
+
#include <stdint.h>
-#include <string.h>
+#include <string.h>
#include <algorithm>
-#include <map>
+#include <map>
#include <string>
#include <vector>
@@ -62,188 +62,188 @@
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/pod_array.h"
-#include "re2/prog.h"
+#include "re2/prog.h"
#include "re2/sparse_set.h"
#include "re2/stringpiece.h"
-
+
// Silence "zero-sized array in struct/union" warning for OneState::action.
#ifdef _MSC_VER
#pragma warning(disable: 4200)
#endif
-namespace re2 {
-
+namespace re2 {
+
static const bool ExtraDebug = false;
-
-// The key insight behind this implementation is that the
-// non-determinism in an NFA for a one-pass regular expression
-// is contained. To explain what that means, first a
-// refresher about what regular expression programs look like
-// and how the usual NFA execution runs.
-//
-// In a regular expression program, only the kInstByteRange
-// instruction processes an input byte c and moves on to the
-// next byte in the string (it does so if c is in the given range).
-// The kInstByteRange instructions correspond to literal characters
-// and character classes in the regular expression.
-//
-// The kInstAlt instructions are used as wiring to connect the
-// kInstByteRange instructions together in interesting ways when
-// implementing | + and *.
-// The kInstAlt instruction forks execution, like a goto that
-// jumps to ip->out() and ip->out1() in parallel. Each of the
-// resulting computation paths is called a thread.
-//
-// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
-// are interesting in their own right but like kInstAlt they don't
-// advance the input pointer. Only kInstByteRange does.
-//
-// The automaton execution in nfa.cc runs all the possible
-// threads of execution in lock-step over the input. To process
-// a particular byte, each thread gets run until it either dies
-// or finds a kInstByteRange instruction matching the byte.
-// If the latter happens, the thread stops just past the
-// kInstByteRange instruction (at ip->out()) and waits for
-// the other threads to finish processing the input byte.
-// Then, once all the threads have processed that input byte,
-// the whole process repeats. The kInstAlt state instruction
-// might create new threads during input processing, but no
-// matter what, all the threads stop after a kInstByteRange
-// and wait for the other threads to "catch up".
-// Running in lock step like this ensures that the NFA reads
-// the input string only once.
-//
-// Each thread maintains its own set of capture registers
-// (the string positions at which it executed the kInstCapture
-// instructions corresponding to capturing parentheses in the
-// regular expression). Repeated copying of the capture registers
-// is the main performance bottleneck in the NFA implementation.
-//
-// A regular expression program is "one-pass" if, no matter what
-// the input string, there is only one thread that makes it
-// past a kInstByteRange instruction at each input byte. This means
-// that there is in some sense only one active thread throughout
-// the execution. Other threads might be created during the
-// processing of an input byte, but they are ephemeral: only one
-// thread is left to start processing the next input byte.
-// This is what I meant above when I said the non-determinism
-// was "contained".
-//
-// To execute a one-pass regular expression program, we can build
-// a DFA (no non-determinism) that has at most as many states as
-// the NFA (compare this to the possibly exponential number of states
-// in the general case). Each state records, for each possible
-// input byte, the next state along with the conditions required
-// before entering that state -- empty-width flags that must be true
-// and capture operations that must be performed. It also records
-// whether a set of conditions required to finish a match at that
-// point in the input rather than process the next byte.
-
-// A state in the one-pass NFA - just an array of actions indexed
-// by the bytemap_[] of the next input byte. (The bytemap
-// maps next input bytes into equivalence classes, to reduce
-// the memory footprint.)
-struct OneState {
+
+// The key insight behind this implementation is that the
+// non-determinism in an NFA for a one-pass regular expression
+// is contained. To explain what that means, first a
+// refresher about what regular expression programs look like
+// and how the usual NFA execution runs.
+//
+// In a regular expression program, only the kInstByteRange
+// instruction processes an input byte c and moves on to the
+// next byte in the string (it does so if c is in the given range).
+// The kInstByteRange instructions correspond to literal characters
+// and character classes in the regular expression.
+//
+// The kInstAlt instructions are used as wiring to connect the
+// kInstByteRange instructions together in interesting ways when
+// implementing | + and *.
+// The kInstAlt instruction forks execution, like a goto that
+// jumps to ip->out() and ip->out1() in parallel. Each of the
+// resulting computation paths is called a thread.
+//
+// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
+// are interesting in their own right but like kInstAlt they don't
+// advance the input pointer. Only kInstByteRange does.
+//
+// The automaton execution in nfa.cc runs all the possible
+// threads of execution in lock-step over the input. To process
+// a particular byte, each thread gets run until it either dies
+// or finds a kInstByteRange instruction matching the byte.
+// If the latter happens, the thread stops just past the
+// kInstByteRange instruction (at ip->out()) and waits for
+// the other threads to finish processing the input byte.
+// Then, once all the threads have processed that input byte,
+// the whole process repeats. The kInstAlt state instruction
+// might create new threads during input processing, but no
+// matter what, all the threads stop after a kInstByteRange
+// and wait for the other threads to "catch up".
+// Running in lock step like this ensures that the NFA reads
+// the input string only once.
+//
+// Each thread maintains its own set of capture registers
+// (the string positions at which it executed the kInstCapture
+// instructions corresponding to capturing parentheses in the
+// regular expression). Repeated copying of the capture registers
+// is the main performance bottleneck in the NFA implementation.
+//
+// A regular expression program is "one-pass" if, no matter what
+// the input string, there is only one thread that makes it
+// past a kInstByteRange instruction at each input byte. This means
+// that there is in some sense only one active thread throughout
+// the execution. Other threads might be created during the
+// processing of an input byte, but they are ephemeral: only one
+// thread is left to start processing the next input byte.
+// This is what I meant above when I said the non-determinism
+// was "contained".
+//
+// To execute a one-pass regular expression program, we can build
+// a DFA (no non-determinism) that has at most as many states as
+// the NFA (compare this to the possibly exponential number of states
+// in the general case). Each state records, for each possible
+// input byte, the next state along with the conditions required
+// before entering that state -- empty-width flags that must be true
+// and capture operations that must be performed. It also records
+// whether a set of conditions required to finish a match at that
+// point in the input rather than process the next byte.
+
+// A state in the one-pass NFA - just an array of actions indexed
+// by the bytemap_[] of the next input byte. (The bytemap
+// maps next input bytes into equivalence classes, to reduce
+// the memory footprint.)
+struct OneState {
uint32_t matchcond; // conditions to match right now.
uint32_t action[];
-};
-
+};
+
// The uint32_t conditions in the action are a combination of
-// condition and capture bits and the next state. The bottom 16 bits
-// are the condition and capture bits, and the top 16 are the index of
-// the next state.
-//
-// Bits 0-5 are the empty-width flags from prog.h.
-// Bit 6 is kMatchWins, which means the match takes
-// priority over moving to next in a first-match search.
-// The remaining bits mark capture registers that should
-// be set to the current input position. The capture bits
-// start at index 2, since the search loop can take care of
-// cap[0], cap[1] (the overall match position).
-// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
-// No input position can satisfy both kEmptyWordBoundary
-// and kEmptyNonWordBoundary, so we can use that as a sentinel
-// instead of needing an extra bit.
-
+// condition and capture bits and the next state. The bottom 16 bits
+// are the condition and capture bits, and the top 16 are the index of
+// the next state.
+//
+// Bits 0-5 are the empty-width flags from prog.h.
+// Bit 6 is kMatchWins, which means the match takes
+// priority over moving to next in a first-match search.
+// The remaining bits mark capture registers that should
+// be set to the current input position. The capture bits
+// start at index 2, since the search loop can take care of
+// cap[0], cap[1] (the overall match position).
+// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
+// No input position can satisfy both kEmptyWordBoundary
+// and kEmptyNonWordBoundary, so we can use that as a sentinel
+// instead of needing an extra bit.
+
static const int kIndexShift = 16; // number of bits below index
static const int kEmptyShift = 6; // number of empty flags in prog.h
-static const int kRealCapShift = kEmptyShift + 1;
-static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
-
-// Parameters used to skip over cap[0], cap[1].
-static const int kCapShift = kRealCapShift - 2;
-static const int kMaxCap = kRealMaxCap + 2;
-
+static const int kRealCapShift = kEmptyShift + 1;
+static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
+
+// Parameters used to skip over cap[0], cap[1].
+static const int kCapShift = kRealCapShift - 2;
+static const int kMaxCap = kRealMaxCap + 2;
+
static const uint32_t kMatchWins = 1 << kEmptyShift;
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
-
+
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
-
-// Check, at compile time, that prog.h agrees with math above.
-// This function is never called.
-void OnePass_Checks() {
+
+// Check, at compile time, that prog.h agrees with math above.
+// This function is never called.
+void OnePass_Checks() {
static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
"kEmptyShift disagrees with kEmptyAllFlags");
- // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
+ // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
"kMaxCap disagrees with kMaxOnePassCapture");
-}
-
+}
+
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
uint32_t satisfied = Prog::EmptyFlags(context, p);
- if (cond & kEmptyAllFlags & ~satisfied)
- return false;
- return true;
-}
-
-// Apply the capture bits in cond, saving p to the appropriate
-// locations in cap[].
+ if (cond & kEmptyAllFlags & ~satisfied)
+ return false;
+ return true;
+}
+
+// Apply the capture bits in cond, saving p to the appropriate
+// locations in cap[].
static void ApplyCaptures(uint32_t cond, const char* p,
- const char** cap, int ncap) {
- for (int i = 2; i < ncap; i++)
- if (cond & (1 << kCapShift << i))
- cap[i] = p;
-}
-
+ const char** cap, int ncap) {
+ for (int i = 2; i < ncap; i++)
+ if (cond & (1 << kCapShift << i))
+ cap[i] = p;
+}
+
// Computes the OneState* for the given nodeindex.
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
- int nodeindex) {
+ int nodeindex) {
return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
-}
-
-bool Prog::SearchOnePass(const StringPiece& text,
- const StringPiece& const_context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch) {
- if (anchor != kAnchored && kind != kFullMatch) {
- LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
- return false;
- }
-
- // Make sure we have at least cap[1],
- // because we use it to tell if we matched.
- int ncap = 2*nmatch;
- if (ncap < 2)
- ncap = 2;
-
- const char* cap[kMaxCap];
- for (int i = 0; i < ncap; i++)
- cap[i] = NULL;
-
- const char* matchcap[kMaxCap];
- for (int i = 0; i < ncap; i++)
- matchcap[i] = NULL;
-
- StringPiece context = const_context;
+}
+
+bool Prog::SearchOnePass(const StringPiece& text,
+ const StringPiece& const_context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch) {
+ if (anchor != kAnchored && kind != kFullMatch) {
+ LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
+ return false;
+ }
+
+ // Make sure we have at least cap[1],
+ // because we use it to tell if we matched.
+ int ncap = 2*nmatch;
+ if (ncap < 2)
+ ncap = 2;
+
+ const char* cap[kMaxCap];
+ for (int i = 0; i < ncap; i++)
+ cap[i] = NULL;
+
+ const char* matchcap[kMaxCap];
+ for (int i = 0; i < ncap; i++)
+ matchcap[i] = NULL;
+
+ StringPiece context = const_context;
if (context.data() == NULL)
- context = text;
+ context = text;
if (anchor_start() && BeginPtr(context) != BeginPtr(text))
- return false;
+ return false;
if (anchor_end() && EndPtr(context) != EndPtr(text))
- return false;
- if (anchor_end())
- kind = kFullMatch;
-
+ return false;
+ if (anchor_end())
+ kind = kFullMatch;
+
uint8_t* nodes = onepass_nodes_.data();
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
// start() is always mapped to the zeroth OneState.
@@ -251,231 +251,231 @@ bool Prog::SearchOnePass(const StringPiece& text,
uint8_t* bytemap = bytemap_;
const char* bp = text.data();
const char* ep = text.data() + text.size();
- const char* p;
- bool matched = false;
- matchcap[0] = bp;
- cap[0] = bp;
+ const char* p;
+ bool matched = false;
+ matchcap[0] = bp;
+ cap[0] = bp;
uint32_t nextmatchcond = state->matchcond;
- for (p = bp; p < ep; p++) {
- int c = bytemap[*p & 0xFF];
+ for (p = bp; p < ep; p++) {
+ int c = bytemap[*p & 0xFF];
uint32_t matchcond = nextmatchcond;
uint32_t cond = state->action[c];
-
- // Determine whether we can reach act->next.
- // If so, advance state and nextmatchcond.
- if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
+
+ // Determine whether we can reach act->next.
+ // If so, advance state and nextmatchcond.
+ if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
uint32_t nextindex = cond >> kIndexShift;
- state = IndexToNode(nodes, statesize, nextindex);
- nextmatchcond = state->matchcond;
- } else {
- state = NULL;
- nextmatchcond = kImpossible;
- }
-
- // This code section is carefully tuned.
- // The goto sequence is about 10% faster than the
- // obvious rewrite as a large if statement in the
- // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
-
- // Saving the match capture registers is expensive.
- // Is this intermediate match worth thinking about?
-
- // Not if we want a full match.
- if (kind == kFullMatch)
- goto skipmatch;
-
- // Not if it's impossible.
- if (matchcond == kImpossible)
- goto skipmatch;
-
- // Not if the possible match is beaten by the certain
- // match at the next byte. When this test is useless
- // (e.g., HTTPPartialMatchRE2) it slows the loop by
- // about 10%, but when it avoids work (e.g., DotMatchRE2),
- // it cuts the loop execution by about 45%.
- if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
- goto skipmatch;
-
- // Finally, the match conditions must be satisfied.
- if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
- for (int i = 2; i < 2*nmatch; i++)
- matchcap[i] = cap[i];
- if (nmatch > 1 && (matchcond & kCapMask))
- ApplyCaptures(matchcond, p, matchcap, ncap);
- matchcap[1] = p;
- matched = true;
-
- // If we're in longest match mode, we have to keep
- // going and see if we find a longer match.
- // In first match mode, we can stop if the match
- // takes priority over the next state for this input byte.
- // That bit is per-input byte and thus in cond, not matchcond.
- if (kind == kFirstMatch && (cond & kMatchWins))
- goto done;
- }
-
- skipmatch:
- if (state == NULL)
- goto done;
- if ((cond & kCapMask) && nmatch > 1)
- ApplyCaptures(cond, p, cap, ncap);
- }
-
- // Look for match at end of input.
- {
+ state = IndexToNode(nodes, statesize, nextindex);
+ nextmatchcond = state->matchcond;
+ } else {
+ state = NULL;
+ nextmatchcond = kImpossible;
+ }
+
+ // This code section is carefully tuned.
+ // The goto sequence is about 10% faster than the
+ // obvious rewrite as a large if statement in the
+ // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
+
+ // Saving the match capture registers is expensive.
+ // Is this intermediate match worth thinking about?
+
+ // Not if we want a full match.
+ if (kind == kFullMatch)
+ goto skipmatch;
+
+ // Not if it's impossible.
+ if (matchcond == kImpossible)
+ goto skipmatch;
+
+ // Not if the possible match is beaten by the certain
+ // match at the next byte. When this test is useless
+ // (e.g., HTTPPartialMatchRE2) it slows the loop by
+ // about 10%, but when it avoids work (e.g., DotMatchRE2),
+ // it cuts the loop execution by about 45%.
+ if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
+ goto skipmatch;
+
+ // Finally, the match conditions must be satisfied.
+ if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
+ for (int i = 2; i < 2*nmatch; i++)
+ matchcap[i] = cap[i];
+ if (nmatch > 1 && (matchcond & kCapMask))
+ ApplyCaptures(matchcond, p, matchcap, ncap);
+ matchcap[1] = p;
+ matched = true;
+
+ // If we're in longest match mode, we have to keep
+ // going and see if we find a longer match.
+ // In first match mode, we can stop if the match
+ // takes priority over the next state for this input byte.
+ // That bit is per-input byte and thus in cond, not matchcond.
+ if (kind == kFirstMatch && (cond & kMatchWins))
+ goto done;
+ }
+
+ skipmatch:
+ if (state == NULL)
+ goto done;
+ if ((cond & kCapMask) && nmatch > 1)
+ ApplyCaptures(cond, p, cap, ncap);
+ }
+
+ // Look for match at end of input.
+ {
uint32_t matchcond = state->matchcond;
- if (matchcond != kImpossible &&
- ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
- if (nmatch > 1 && (matchcond & kCapMask))
- ApplyCaptures(matchcond, p, cap, ncap);
- for (int i = 2; i < ncap; i++)
- matchcap[i] = cap[i];
- matchcap[1] = p;
- matched = true;
- }
- }
-
-done:
- if (!matched)
- return false;
- for (int i = 0; i < nmatch; i++)
+ if (matchcond != kImpossible &&
+ ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
+ if (nmatch > 1 && (matchcond & kCapMask))
+ ApplyCaptures(matchcond, p, cap, ncap);
+ for (int i = 2; i < ncap; i++)
+ matchcap[i] = cap[i];
+ matchcap[1] = p;
+ matched = true;
+ }
+ }
+
+done:
+ if (!matched)
+ return false;
+ for (int i = 0; i < nmatch; i++)
match[i] =
StringPiece(matchcap[2 * i],
static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
- return true;
-}
-
-
-// Analysis to determine whether a given regexp program is one-pass.
-
-// If ip is not on workq, adds ip to work queue and returns true.
-// If ip is already on work queue, does nothing and returns false.
-// If ip is NULL, does nothing and returns true (pretends to add it).
-typedef SparseSet Instq;
-static bool AddQ(Instq *q, int id) {
- if (id == 0)
- return true;
- if (q->contains(id))
- return false;
- q->insert(id);
- return true;
-}
-
-struct InstCond {
- int id;
+ return true;
+}
+
+
+// Analysis to determine whether a given regexp program is one-pass.
+
+// If ip is not on workq, adds ip to work queue and returns true.
+// If ip is already on work queue, does nothing and returns false.
+// If ip is NULL, does nothing and returns true (pretends to add it).
+typedef SparseSet Instq;
+static bool AddQ(Instq *q, int id) {
+ if (id == 0)
+ return true;
+ if (q->contains(id))
+ return false;
+ q->insert(id);
+ return true;
+}
+
+struct InstCond {
+ int id;
uint32_t cond;
-};
-
-// Returns whether this is a one-pass program; that is,
-// returns whether it is safe to use SearchOnePass on this program.
-// These conditions must be true for any instruction ip:
-//
-// (1) for any other Inst nip, there is at most one input-free
-// path from ip to nip.
-// (2) there is at most one kInstByte instruction reachable from
-// ip that matches any particular byte c.
-// (3) there is at most one input-free path from ip to a kInstMatch
-// instruction.
-//
-// This is actually just a conservative approximation: it might
-// return false when the answer is true, when kInstEmptyWidth
-// instructions are involved.
-// Constructs and saves corresponding one-pass NFA on success.
-bool Prog::IsOnePass() {
- if (did_onepass_)
+};
+
+// Returns whether this is a one-pass program; that is,
+// returns whether it is safe to use SearchOnePass on this program.
+// These conditions must be true for any instruction ip:
+//
+// (1) for any other Inst nip, there is at most one input-free
+// path from ip to nip.
+// (2) there is at most one kInstByte instruction reachable from
+// ip that matches any particular byte c.
+// (3) there is at most one input-free path from ip to a kInstMatch
+// instruction.
+//
+// This is actually just a conservative approximation: it might
+// return false when the answer is true, when kInstEmptyWidth
+// instructions are involved.
+// Constructs and saves corresponding one-pass NFA on success.
+bool Prog::IsOnePass() {
+ if (did_onepass_)
return onepass_nodes_.data() != NULL;
- did_onepass_ = true;
-
- if (start() == 0) // no match
- return false;
-
- // Steal memory for the one-pass NFA from the overall DFA budget.
- // Willing to use at most 1/4 of the DFA budget (heuristic).
- // Limit max node count to 65000 as a conservative estimate to
- // avoid overflowing 16-bit node index in encoding.
+ did_onepass_ = true;
+
+ if (start() == 0) // no match
+ return false;
+
+ // Steal memory for the one-pass NFA from the overall DFA budget.
+ // Willing to use at most 1/4 of the DFA budget (heuristic).
+ // Limit max node count to 65000 as a conservative estimate to
+ // avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + inst_count(kInstByteRange);
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
- if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
- return false;
-
- // Flood the graph starting at the start state, and check
- // that in each reachable state, each possible byte leads
- // to a unique next state.
+ if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
+ return false;
+
+ // Flood the graph starting at the start state, and check
+ // that in each reachable state, each possible byte leads
+ // to a unique next state.
int stacksize = inst_count(kInstCapture) +
inst_count(kInstEmptyWidth) +
inst_count(kInstNop) + 1; // + 1 for start inst
PODArray<InstCond> stack(stacksize);
- int size = this->size();
+ int size = this->size();
PODArray<int> nodebyid(size); // indexed by ip
memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]);
-
+
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
// unnecessarily optimistic: why allocate a large amount of memory
// upfront for a large program when it is unlikely to be one-pass?
std::vector<uint8_t> nodes;
-
- Instq tovisit(size), workq(size);
- AddQ(&tovisit, start());
- nodebyid[start()] = 0;
- int nalloc = 1;
+
+ Instq tovisit(size), workq(size);
+ AddQ(&tovisit, start());
+ nodebyid[start()] = 0;
+ int nalloc = 1;
nodes.insert(nodes.end(), statesize, 0);
- for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
- int id = *it;
- int nodeindex = nodebyid[id];
+ for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
+ int id = *it;
+ int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
-
- // Flood graph using manual stack, filling in actions as found.
- // Default is none.
- for (int b = 0; b < bytemap_range_; b++)
- node->action[b] = kImpossible;
- node->matchcond = kImpossible;
-
- workq.clear();
- bool matched = false;
- int nstack = 0;
- stack[nstack].id = id;
- stack[nstack++].cond = 0;
- while (nstack > 0) {
- int id = stack[--nstack].id;
+
+ // Flood graph using manual stack, filling in actions as found.
+ // Default is none.
+ for (int b = 0; b < bytemap_range_; b++)
+ node->action[b] = kImpossible;
+ node->matchcond = kImpossible;
+
+ workq.clear();
+ bool matched = false;
+ int nstack = 0;
+ stack[nstack].id = id;
+ stack[nstack++].cond = 0;
+ while (nstack > 0) {
+ int id = stack[--nstack].id;
uint32_t cond = stack[nstack].cond;
Loop:
- Prog::Inst* ip = inst(id);
- switch (ip->opcode()) {
+ Prog::Inst* ip = inst(id);
+ switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
- case kInstAltMatch:
- // TODO(rsc): Ignoring kInstAltMatch optimization.
- // Should implement it in this engine, but it's subtle.
+ case kInstAltMatch:
+ // TODO(rsc): Ignoring kInstAltMatch optimization.
+ // Should implement it in this engine, but it's subtle.
DCHECK(!ip->last());
- // If already on work queue, (1) is violated: bail out.
+ // If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
- goto fail;
+ goto fail;
id = id+1;
goto Loop;
-
- case kInstByteRange: {
- int nextindex = nodebyid[ip->out()];
- if (nextindex == -1) {
- if (nalloc >= maxnodes) {
+
+ case kInstByteRange: {
+ int nextindex = nodebyid[ip->out()];
+ if (nextindex == -1) {
+ if (nalloc >= maxnodes) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
- goto fail;
- }
- nextindex = nalloc;
+ goto fail;
+ }
+ nextindex = nalloc;
AddQ(&tovisit, ip->out());
nodebyid[ip->out()] = nalloc;
- nalloc++;
+ nalloc++;
nodes.insert(nodes.end(), statesize, 0);
// Update node because it might have been invalidated.
node = IndexToNode(nodes.data(), statesize, nodeindex);
- }
- for (int c = ip->lo(); c <= ip->hi(); c++) {
- int b = bytemap_[c];
+ }
+ for (int c = ip->lo(); c <= ip->hi(); c++) {
+ int b = bytemap_[c];
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
@@ -483,20 +483,20 @@ bool Prog::IsOnePass() {
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
- if ((act & kImpossible) == kImpossible) {
- node->action[b] = newact;
- } else if (act != newact) {
+ if ((act & kImpossible) == kImpossible) {
+ node->action[b] = newact;
+ } else if (act != newact) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
- goto fail;
- }
- }
- if (ip->foldcase()) {
+ goto fail;
+ }
+ }
+ if (ip->foldcase()) {
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
- for (int c = lo; c <= hi; c++) {
- int b = bytemap_[c];
+ for (int c = lo; c <= hi; c++) {
+ int b = bytemap_[c];
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
@@ -504,16 +504,16 @@ bool Prog::IsOnePass() {
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
- if ((act & kImpossible) == kImpossible) {
- node->action[b] = newact;
- } else if (act != newact) {
+ if ((act & kImpossible) == kImpossible) {
+ node->action[b] = newact;
+ } else if (act != newact) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
- goto fail;
- }
- }
- }
+ goto fail;
+ }
+ }
+ }
if (ip->last())
break;
@@ -522,9 +522,9 @@ bool Prog::IsOnePass() {
goto fail;
id = id+1;
goto Loop;
- }
-
- case kInstCapture:
+ }
+
+ case kInstCapture:
case kInstEmptyWidth:
case kInstNop:
if (!ip->last()) {
@@ -536,37 +536,37 @@ bool Prog::IsOnePass() {
}
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
- cond |= (1 << kCapShift) << ip->cap();
+ cond |= (1 << kCapShift) << ip->cap();
if (ip->opcode() == kInstEmptyWidth)
cond |= ip->empty();
-
- // kInstCapture and kInstNop always proceed to ip->out().
- // kInstEmptyWidth only sometimes proceeds to ip->out(),
- // but as a conservative approximation we assume it always does.
- // We could be a little more precise by looking at what c
- // is, but that seems like overkill.
-
- // If already on work queue, (1) is violated: bail out.
- if (!AddQ(&workq, ip->out())) {
+
+ // kInstCapture and kInstNop always proceed to ip->out().
+ // kInstEmptyWidth only sometimes proceeds to ip->out(),
+ // but as a conservative approximation we assume it always does.
+ // We could be a little more precise by looking at what c
+ // is, but that seems like overkill.
+
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, ip->out())) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple paths %d -> %d", *it, ip->out());
- goto fail;
- }
+ goto fail;
+ }
id = ip->out();
goto Loop;
-
- case kInstMatch:
- if (matched) {
- // (3) is violated
+
+ case kInstMatch:
+ if (matched) {
+ // (3) is violated
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple matches from %d", *it);
- goto fail;
- }
- matched = true;
- node->matchcond = cond;
-
+ goto fail;
+ }
+ matched = true;
+ node->matchcond = cond;
+
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
@@ -575,49 +575,49 @@ bool Prog::IsOnePass() {
id = id+1;
goto Loop;
- case kInstFail:
- break;
- }
- }
- }
-
+ case kInstFail:
+ break;
+ }
+ }
+ }
+
if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
LOG(ERROR) << "bytemap:\n" << DumpByteMap();
LOG(ERROR) << "prog:\n" << Dump();
std::map<int, int> idmap;
- for (int i = 0; i < size; i++)
- if (nodebyid[i] != -1)
- idmap[nodebyid[i]] = i;
-
+ for (int i = 0; i < size; i++)
+ if (nodebyid[i] != -1)
+ idmap[nodebyid[i]] = i;
+
std::string dump;
- for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
- int id = *it;
- int nodeindex = nodebyid[id];
- if (nodeindex == -1)
+ for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
+ int id = *it;
+ int nodeindex = nodebyid[id];
+ if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
dump += StringPrintf("node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
- for (int i = 0; i < bytemap_range_; i++) {
- if ((node->action[i] & kImpossible) == kImpossible)
- continue;
+ for (int i = 0; i < bytemap_range_; i++) {
+ if ((node->action[i] & kImpossible) == kImpossible)
+ continue;
dump += StringPrintf(" %d cond %#x -> %d id=%d\n",
i, node->action[i] & 0xFFFF,
node->action[i] >> kIndexShift,
idmap[node->action[i] >> kIndexShift]);
- }
- }
+ }
+ }
LOG(ERROR) << "nodes:\n" << dump;
- }
-
- dfa_mem_ -= nalloc*statesize;
+ }
+
+ dfa_mem_ -= nalloc*statesize;
onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);
memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);
- return true;
-
-fail:
- return false;
-}
-
-} // namespace re2
+ return true;
+
+fail:
+ return false;
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/parse.cc b/contrib/libs/re2/re2/parse.cc
index ed7c34db16..85f16f060b 100644
--- a/contrib/libs/re2/re2/parse.cc
+++ b/contrib/libs/re2/re2/parse.cc
@@ -1,21 +1,21 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Regular expression parser.
-
-// The parser is a simple precedence-based parser with a
-// manual stack. The parsing work is done by the methods
-// of the ParseState class. The Regexp::Parse function is
-// essentially just a lexer that calls the ParseState method
-// for each token.
-
-// The parser recognizes POSIX extended regular expressions
-// excluding backreferences, collating elements, and collating
-// classes. It also allows the empty string as a regular expression
-// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
-// See regexp.h for rationale.
-
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression parser.
+
+// The parser is a simple precedence-based parser with a
+// manual stack. The parsing work is done by the methods
+// of the ParseState class. The Regexp::Parse function is
+// essentially just a lexer that calls the ParseState method
+// for each token.
+
+// The parser recognizes POSIX extended regular expressions
+// excluding backreferences, collating elements, and collating
+// classes. It also allows the empty string as a regular expression
+// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
+// See regexp.h for rationale.
+
#include <ctype.h>
#include <stddef.h>
#include <stdint.h>
@@ -30,20 +30,20 @@
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/pod_array.h"
-#include "re2/regexp.h"
+#include "re2/regexp.h"
#include "re2/stringpiece.h"
-#include "re2/unicode_casefold.h"
-#include "re2/unicode_groups.h"
+#include "re2/unicode_casefold.h"
+#include "re2/unicode_groups.h"
#include "re2/walker-inl.h"
-
+
#if defined(RE2_USE_ICU)
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/utypes.h"
#endif
-namespace re2 {
-
+namespace re2 {
+
// Controls the maximum repeat count permitted by the parser.
static int maximum_repeat_count = 1000;
@@ -51,437 +51,437 @@ void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) {
maximum_repeat_count = i;
}
-// Regular expression parse state.
-// The list of parsed regexps so far is maintained as a vector of
-// Regexp pointers called the stack. Left parenthesis and vertical
-// bar markers are also placed on the stack, as Regexps with
-// non-standard opcodes.
-// Scanning a left parenthesis causes the parser to push a left parenthesis
-// marker on the stack.
-// Scanning a vertical bar causes the parser to pop the stack until it finds a
-// vertical bar or left parenthesis marker (not popping the marker),
-// concatenate all the popped results, and push them back on
-// the stack (DoConcatenation).
-// Scanning a right parenthesis causes the parser to act as though it
-// has seen a vertical bar, which then leaves the top of the stack in the
-// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
-// The parser pops all this off the stack and creates an alternation of the
-// regexps (DoAlternation).
-
-class Regexp::ParseState {
- public:
- ParseState(ParseFlags flags, const StringPiece& whole_regexp,
- RegexpStatus* status);
- ~ParseState();
-
- ParseFlags flags() { return flags_; }
- int rune_max() { return rune_max_; }
-
- // Parse methods. All public methods return a bool saying
- // whether parsing should continue. If a method returns
- // false, it has set fields in *status_, and the parser
- // should return NULL.
-
- // Pushes the given regular expression onto the stack.
- // Could check for too much memory used here.
- bool PushRegexp(Regexp* re);
-
- // Pushes the literal rune r onto the stack.
- bool PushLiteral(Rune r);
-
- // Pushes a regexp with the given op (and no args) onto the stack.
- bool PushSimpleOp(RegexpOp op);
-
- // Pushes a ^ onto the stack.
+// Regular expression parse state.
+// The list of parsed regexps so far is maintained as a vector of
+// Regexp pointers called the stack. Left parenthesis and vertical
+// bar markers are also placed on the stack, as Regexps with
+// non-standard opcodes.
+// Scanning a left parenthesis causes the parser to push a left parenthesis
+// marker on the stack.
+// Scanning a vertical bar causes the parser to pop the stack until it finds a
+// vertical bar or left parenthesis marker (not popping the marker),
+// concatenate all the popped results, and push them back on
+// the stack (DoConcatenation).
+// Scanning a right parenthesis causes the parser to act as though it
+// has seen a vertical bar, which then leaves the top of the stack in the
+// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
+// The parser pops all this off the stack and creates an alternation of the
+// regexps (DoAlternation).
+
+class Regexp::ParseState {
+ public:
+ ParseState(ParseFlags flags, const StringPiece& whole_regexp,
+ RegexpStatus* status);
+ ~ParseState();
+
+ ParseFlags flags() { return flags_; }
+ int rune_max() { return rune_max_; }
+
+ // Parse methods. All public methods return a bool saying
+ // whether parsing should continue. If a method returns
+ // false, it has set fields in *status_, and the parser
+ // should return NULL.
+
+ // Pushes the given regular expression onto the stack.
+ // Could check for too much memory used here.
+ bool PushRegexp(Regexp* re);
+
+ // Pushes the literal rune r onto the stack.
+ bool PushLiteral(Rune r);
+
+ // Pushes a regexp with the given op (and no args) onto the stack.
+ bool PushSimpleOp(RegexpOp op);
+
+ // Pushes a ^ onto the stack.
bool PushCaret();
-
- // Pushes a \b (word == true) or \B (word == false) onto the stack.
- bool PushWordBoundary(bool word);
-
- // Pushes a $ onto the stack.
- bool PushDollar();
-
- // Pushes a . onto the stack
- bool PushDot();
-
- // Pushes a repeat operator regexp onto the stack.
- // A valid argument for the operator must already be on the stack.
- // s is the name of the operator, for use in error messages.
- bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy);
-
- // Pushes a repetition regexp onto the stack.
- // A valid argument for the operator must already be on the stack.
- bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy);
-
- // Checks whether a particular regexp op is a marker.
- bool IsMarker(RegexpOp op);
-
- // Processes a left parenthesis in the input.
- // Pushes a marker onto the stack.
- bool DoLeftParen(const StringPiece& name);
- bool DoLeftParenNoCapture();
-
- // Processes a vertical bar in the input.
- bool DoVerticalBar();
-
- // Processes a right parenthesis in the input.
- bool DoRightParen();
-
- // Processes the end of input, returning the final regexp.
- Regexp* DoFinish();
-
- // Finishes the regexp if necessary, preparing it for use
- // in a more complicated expression.
- // If it is a CharClassBuilder, converts into a CharClass.
- Regexp* FinishRegexp(Regexp*);
-
- // These routines don't manipulate the parse stack
- // directly, but they do need to look at flags_.
- // ParseCharClass also manipulates the internals of Regexp
- // while creating *out_re.
-
- // Parse a character class into *out_re.
- // Removes parsed text from s.
- bool ParseCharClass(StringPiece* s, Regexp** out_re,
- RegexpStatus* status);
-
- // Parse a character class character into *rp.
- // Removes parsed text from s.
- bool ParseCCCharacter(StringPiece* s, Rune *rp,
- const StringPiece& whole_class,
- RegexpStatus* status);
-
- // Parse a character class range into rr.
- // Removes parsed text from s.
- bool ParseCCRange(StringPiece* s, RuneRange* rr,
- const StringPiece& whole_class,
- RegexpStatus* status);
-
- // Parse a Perl flag set or non-capturing group from s.
- bool ParsePerlFlags(StringPiece* s);
-
-
- // Finishes the current concatenation,
- // collapsing it into a single regexp on the stack.
- void DoConcatenation();
-
- // Finishes the current alternation,
- // collapsing it to a single regexp on the stack.
- void DoAlternation();
-
- // Generalized DoAlternation/DoConcatenation.
- void DoCollapse(RegexpOp op);
-
- // Maybe concatenate Literals into LiteralString.
- bool MaybeConcatString(int r, ParseFlags flags);
-
-private:
- ParseFlags flags_;
- StringPiece whole_regexp_;
- RegexpStatus* status_;
- Regexp* stacktop_;
- int ncap_; // number of capturing parens seen
- int rune_max_; // maximum char value for this encoding
-
+
+ // Pushes a \b (word == true) or \B (word == false) onto the stack.
+ bool PushWordBoundary(bool word);
+
+ // Pushes a $ onto the stack.
+ bool PushDollar();
+
+ // Pushes a . onto the stack
+ bool PushDot();
+
+ // Pushes a repeat operator regexp onto the stack.
+ // A valid argument for the operator must already be on the stack.
+ // s is the name of the operator, for use in error messages.
+ bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy);
+
+ // Pushes a repetition regexp onto the stack.
+ // A valid argument for the operator must already be on the stack.
+ bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy);
+
+ // Checks whether a particular regexp op is a marker.
+ bool IsMarker(RegexpOp op);
+
+ // Processes a left parenthesis in the input.
+ // Pushes a marker onto the stack.
+ bool DoLeftParen(const StringPiece& name);
+ bool DoLeftParenNoCapture();
+
+ // Processes a vertical bar in the input.
+ bool DoVerticalBar();
+
+ // Processes a right parenthesis in the input.
+ bool DoRightParen();
+
+ // Processes the end of input, returning the final regexp.
+ Regexp* DoFinish();
+
+ // Finishes the regexp if necessary, preparing it for use
+ // in a more complicated expression.
+ // If it is a CharClassBuilder, converts into a CharClass.
+ Regexp* FinishRegexp(Regexp*);
+
+ // These routines don't manipulate the parse stack
+ // directly, but they do need to look at flags_.
+ // ParseCharClass also manipulates the internals of Regexp
+ // while creating *out_re.
+
+ // Parse a character class into *out_re.
+ // Removes parsed text from s.
+ bool ParseCharClass(StringPiece* s, Regexp** out_re,
+ RegexpStatus* status);
+
+ // Parse a character class character into *rp.
+ // Removes parsed text from s.
+ bool ParseCCCharacter(StringPiece* s, Rune *rp,
+ const StringPiece& whole_class,
+ RegexpStatus* status);
+
+ // Parse a character class range into rr.
+ // Removes parsed text from s.
+ bool ParseCCRange(StringPiece* s, RuneRange* rr,
+ const StringPiece& whole_class,
+ RegexpStatus* status);
+
+ // Parse a Perl flag set or non-capturing group from s.
+ bool ParsePerlFlags(StringPiece* s);
+
+
+ // Finishes the current concatenation,
+ // collapsing it into a single regexp on the stack.
+ void DoConcatenation();
+
+ // Finishes the current alternation,
+ // collapsing it to a single regexp on the stack.
+ void DoAlternation();
+
+ // Generalized DoAlternation/DoConcatenation.
+ void DoCollapse(RegexpOp op);
+
+ // Maybe concatenate Literals into LiteralString.
+ bool MaybeConcatString(int r, ParseFlags flags);
+
+private:
+ ParseFlags flags_;
+ StringPiece whole_regexp_;
+ RegexpStatus* status_;
+ Regexp* stacktop_;
+ int ncap_; // number of capturing parens seen
+ int rune_max_; // maximum char value for this encoding
+
ParseState(const ParseState&) = delete;
ParseState& operator=(const ParseState&) = delete;
-};
-
-// Pseudo-operators - only on parse stack.
-const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
-const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
-
-Regexp::ParseState::ParseState(ParseFlags flags,
- const StringPiece& whole_regexp,
- RegexpStatus* status)
- : flags_(flags), whole_regexp_(whole_regexp),
- status_(status), stacktop_(NULL), ncap_(0) {
- if (flags_ & Latin1)
- rune_max_ = 0xFF;
- else
- rune_max_ = Runemax;
-}
-
-// Cleans up by freeing all the regexps on the stack.
-Regexp::ParseState::~ParseState() {
- Regexp* next;
- for (Regexp* re = stacktop_; re != NULL; re = next) {
- next = re->down_;
- re->down_ = NULL;
- if (re->op() == kLeftParen)
- delete re->name_;
- re->Decref();
- }
-}
-
-// Finishes the regexp if necessary, preparing it for use in
-// a more complex expression.
-// If it is a CharClassBuilder, converts into a CharClass.
-Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
- if (re == NULL)
- return NULL;
- re->down_ = NULL;
-
- if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
- CharClassBuilder* ccb = re->ccb_;
- re->ccb_ = NULL;
- re->cc_ = ccb->GetCharClass();
- delete ccb;
- }
-
- return re;
-}
-
-// Pushes the given regular expression onto the stack.
-// Could check for too much memory used here.
-bool Regexp::ParseState::PushRegexp(Regexp* re) {
- MaybeConcatString(-1, NoParseFlags);
-
- // Special case: a character class of one character is just
- // a literal. This is a common idiom for escaping
- // single characters (e.g., [.] instead of \.), and some
- // analysis does better with fewer character classes.
- // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
+};
+
+// Pseudo-operators - only on parse stack.
+const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
+const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
+
+Regexp::ParseState::ParseState(ParseFlags flags,
+ const StringPiece& whole_regexp,
+ RegexpStatus* status)
+ : flags_(flags), whole_regexp_(whole_regexp),
+ status_(status), stacktop_(NULL), ncap_(0) {
+ if (flags_ & Latin1)
+ rune_max_ = 0xFF;
+ else
+ rune_max_ = Runemax;
+}
+
+// Cleans up by freeing all the regexps on the stack.
+Regexp::ParseState::~ParseState() {
+ Regexp* next;
+ for (Regexp* re = stacktop_; re != NULL; re = next) {
+ next = re->down_;
+ re->down_ = NULL;
+ if (re->op() == kLeftParen)
+ delete re->name_;
+ re->Decref();
+ }
+}
+
+// Finishes the regexp if necessary, preparing it for use in
+// a more complex expression.
+// If it is a CharClassBuilder, converts into a CharClass.
+Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
+ if (re == NULL)
+ return NULL;
+ re->down_ = NULL;
+
+ if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
+ CharClassBuilder* ccb = re->ccb_;
+ re->ccb_ = NULL;
+ re->cc_ = ccb->GetCharClass();
+ delete ccb;
+ }
+
+ return re;
+}
+
+// Pushes the given regular expression onto the stack.
+// Could check for too much memory used here.
+bool Regexp::ParseState::PushRegexp(Regexp* re) {
+ MaybeConcatString(-1, NoParseFlags);
+
+ // Special case: a character class of one character is just
+ // a literal. This is a common idiom for escaping
+ // single characters (e.g., [.] instead of \.), and some
+ // analysis does better with fewer character classes.
+ // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
re->ccb_->RemoveAbove(rune_max_);
- if (re->ccb_->size() == 1) {
- Rune r = re->ccb_->begin()->lo;
- re->Decref();
- re = new Regexp(kRegexpLiteral, flags_);
- re->rune_ = r;
- } else if (re->ccb_->size() == 2) {
- Rune r = re->ccb_->begin()->lo;
- if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
- re->Decref();
- re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
- re->rune_ = r + 'a' - 'A';
- }
- }
- }
-
- if (!IsMarker(re->op()))
- re->simple_ = re->ComputeSimple();
- re->down_ = stacktop_;
- stacktop_ = re;
- return true;
-}
-
-// Searches the case folding tables and returns the CaseFold* that contains r.
-// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
-// If there isn't one, returns NULL.
+ if (re->ccb_->size() == 1) {
+ Rune r = re->ccb_->begin()->lo;
+ re->Decref();
+ re = new Regexp(kRegexpLiteral, flags_);
+ re->rune_ = r;
+ } else if (re->ccb_->size() == 2) {
+ Rune r = re->ccb_->begin()->lo;
+ if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
+ re->Decref();
+ re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
+ re->rune_ = r + 'a' - 'A';
+ }
+ }
+ }
+
+ if (!IsMarker(re->op()))
+ re->simple_ = re->ComputeSimple();
+ re->down_ = stacktop_;
+ stacktop_ = re;
+ return true;
+}
+
+// Searches the case folding tables and returns the CaseFold* that contains r.
+// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
+// If there isn't one, returns NULL.
const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) {
const CaseFold* ef = f + n;
-
- // Binary search for entry containing r.
- while (n > 0) {
+
+ // Binary search for entry containing r.
+ while (n > 0) {
int m = n/2;
- if (f[m].lo <= r && r <= f[m].hi)
- return &f[m];
- if (r < f[m].lo) {
- n = m;
- } else {
- f += m+1;
- n -= m+1;
- }
- }
-
- // There is no entry that contains r, but f points
+ if (f[m].lo <= r && r <= f[m].hi)
+ return &f[m];
+ if (r < f[m].lo) {
+ n = m;
+ } else {
+ f += m+1;
+ n -= m+1;
+ }
+ }
+
+ // There is no entry that contains r, but f points
// where it would have been. Unless f points at
- // the end of the array, it points at the next entry
- // after r.
+ // the end of the array, it points at the next entry
+ // after r.
if (f < ef)
- return f;
-
- // No entry contains r; no entry contains runes > r.
- return NULL;
-}
-
-// Returns the result of applying the fold f to the rune r.
+ return f;
+
+ // No entry contains r; no entry contains runes > r.
+ return NULL;
+}
+
+// Returns the result of applying the fold f to the rune r.
Rune ApplyFold(const CaseFold *f, Rune r) {
- switch (f->delta) {
- default:
- return r + f->delta;
-
+ switch (f->delta) {
+ default:
+ return r + f->delta;
+
case EvenOddSkip: // even <-> odd but only applies to every other
if ((r - f->lo) % 2)
return r;
FALLTHROUGH_INTENDED;
- case EvenOdd: // even <-> odd
- if (r%2 == 0)
- return r + 1;
- return r - 1;
-
+ case EvenOdd: // even <-> odd
+ if (r%2 == 0)
+ return r + 1;
+ return r - 1;
+
case OddEvenSkip: // odd <-> even but only applies to every other
if ((r - f->lo) % 2)
return r;
FALLTHROUGH_INTENDED;
- case OddEven: // odd <-> even
- if (r%2 == 1)
- return r + 1;
- return r - 1;
- }
-}
-
-// Returns the next Rune in r's folding cycle (see unicode_casefold.h).
-// Examples:
-// CycleFoldRune('A') = 'a'
-// CycleFoldRune('a') = 'A'
-//
-// CycleFoldRune('K') = 'k'
-// CycleFoldRune('k') = 0x212A (Kelvin)
-// CycleFoldRune(0x212A) = 'K'
-//
-// CycleFoldRune('?') = '?'
-Rune CycleFoldRune(Rune r) {
+ case OddEven: // odd <-> even
+ if (r%2 == 1)
+ return r + 1;
+ return r - 1;
+ }
+}
+
+// Returns the next Rune in r's folding cycle (see unicode_casefold.h).
+// Examples:
+// CycleFoldRune('A') = 'a'
+// CycleFoldRune('a') = 'A'
+//
+// CycleFoldRune('K') = 'k'
+// CycleFoldRune('k') = 0x212A (Kelvin)
+// CycleFoldRune(0x212A) = 'K'
+//
+// CycleFoldRune('?') = '?'
+Rune CycleFoldRune(Rune r) {
const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
- if (f == NULL || r < f->lo)
- return r;
- return ApplyFold(f, r);
-}
-
-// Add lo-hi to the class, along with their fold-equivalent characters.
-// If lo-hi is already in the class, assume that the fold-equivalent
-// chars are there too, so there's no work to do.
-static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
- // AddFoldedRange calls itself recursively for each rune in the fold cycle.
- // Most folding cycles are small: there aren't any bigger than four in the
- // current Unicode tables. make_unicode_casefold.py checks that
- // the cycles are not too long, and we double-check here using depth.
- if (depth > 10) {
- LOG(DFATAL) << "AddFoldedRange recurses too much.";
- return;
- }
-
- if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done
- return;
-
- while (lo <= hi) {
+ if (f == NULL || r < f->lo)
+ return r;
+ return ApplyFold(f, r);
+}
+
+// Add lo-hi to the class, along with their fold-equivalent characters.
+// If lo-hi is already in the class, assume that the fold-equivalent
+// chars are there too, so there's no work to do.
+static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
+ // AddFoldedRange calls itself recursively for each rune in the fold cycle.
+ // Most folding cycles are small: there aren't any bigger than four in the
+ // current Unicode tables. make_unicode_casefold.py checks that
+ // the cycles are not too long, and we double-check here using depth.
+ if (depth > 10) {
+ LOG(DFATAL) << "AddFoldedRange recurses too much.";
+ return;
+ }
+
+ if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done
+ return;
+
+ while (lo <= hi) {
const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
- if (f == NULL) // lo has no fold, nor does anything above lo
- break;
- if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
- lo = f->lo;
- continue;
- }
-
- // Add in the result of folding the range lo - f->hi
- // and that range's fold, recursively.
- Rune lo1 = lo;
+ if (f == NULL) // lo has no fold, nor does anything above lo
+ break;
+ if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
+ lo = f->lo;
+ continue;
+ }
+
+ // Add in the result of folding the range lo - f->hi
+ // and that range's fold, recursively.
+ Rune lo1 = lo;
Rune hi1 = std::min<Rune>(hi, f->hi);
- switch (f->delta) {
- default:
- lo1 += f->delta;
- hi1 += f->delta;
- break;
- case EvenOdd:
- if (lo1%2 == 1)
- lo1--;
- if (hi1%2 == 0)
- hi1++;
- break;
- case OddEven:
- if (lo1%2 == 0)
- lo1--;
- if (hi1%2 == 1)
- hi1++;
- break;
- }
- AddFoldedRange(cc, lo1, hi1, depth+1);
-
- // Pick up where this fold left off.
- lo = f->hi + 1;
- }
-}
-
-// Pushes the literal rune r onto the stack.
-bool Regexp::ParseState::PushLiteral(Rune r) {
- // Do case folding if needed.
- if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
- Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
- re->ccb_ = new CharClassBuilder;
- Rune r1 = r;
- do {
- if (!(flags_ & NeverNL) || r != '\n') {
- re->ccb_->AddRange(r, r);
- }
- r = CycleFoldRune(r);
- } while (r != r1);
- return PushRegexp(re);
- }
-
- // Exclude newline if applicable.
- if ((flags_ & NeverNL) && r == '\n')
- return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
-
- // No fancy stuff worked. Ordinary literal.
- if (MaybeConcatString(r, flags_))
- return true;
-
- Regexp* re = new Regexp(kRegexpLiteral, flags_);
- re->rune_ = r;
- return PushRegexp(re);
-}
-
-// Pushes a ^ onto the stack.
+ switch (f->delta) {
+ default:
+ lo1 += f->delta;
+ hi1 += f->delta;
+ break;
+ case EvenOdd:
+ if (lo1%2 == 1)
+ lo1--;
+ if (hi1%2 == 0)
+ hi1++;
+ break;
+ case OddEven:
+ if (lo1%2 == 0)
+ lo1--;
+ if (hi1%2 == 1)
+ hi1++;
+ break;
+ }
+ AddFoldedRange(cc, lo1, hi1, depth+1);
+
+ // Pick up where this fold left off.
+ lo = f->hi + 1;
+ }
+}
+
+// Pushes the literal rune r onto the stack.
+bool Regexp::ParseState::PushLiteral(Rune r) {
+ // Do case folding if needed.
+ if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ Rune r1 = r;
+ do {
+ if (!(flags_ & NeverNL) || r != '\n') {
+ re->ccb_->AddRange(r, r);
+ }
+ r = CycleFoldRune(r);
+ } while (r != r1);
+ return PushRegexp(re);
+ }
+
+ // Exclude newline if applicable.
+ if ((flags_ & NeverNL) && r == '\n')
+ return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
+
+ // No fancy stuff worked. Ordinary literal.
+ if (MaybeConcatString(r, flags_))
+ return true;
+
+ Regexp* re = new Regexp(kRegexpLiteral, flags_);
+ re->rune_ = r;
+ return PushRegexp(re);
+}
+
+// Pushes a ^ onto the stack.
bool Regexp::ParseState::PushCaret() {
- if (flags_ & OneLine) {
- return PushSimpleOp(kRegexpBeginText);
- }
- return PushSimpleOp(kRegexpBeginLine);
-}
-
-// Pushes a \b or \B onto the stack.
-bool Regexp::ParseState::PushWordBoundary(bool word) {
- if (word)
- return PushSimpleOp(kRegexpWordBoundary);
- return PushSimpleOp(kRegexpNoWordBoundary);
-}
-
-// Pushes a $ onto the stack.
-bool Regexp::ParseState::PushDollar() {
- if (flags_ & OneLine) {
- // Clumsy marker so that MimicsPCRE() can tell whether
- // this kRegexpEndText was a $ and not a \z.
- Regexp::ParseFlags oflags = flags_;
- flags_ = flags_ | WasDollar;
- bool ret = PushSimpleOp(kRegexpEndText);
- flags_ = oflags;
- return ret;
- }
- return PushSimpleOp(kRegexpEndLine);
-}
-
-// Pushes a . onto the stack.
-bool Regexp::ParseState::PushDot() {
- if ((flags_ & DotNL) && !(flags_ & NeverNL))
- return PushSimpleOp(kRegexpAnyChar);
- // Rewrite . into [^\n]
- Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
- re->ccb_ = new CharClassBuilder;
- re->ccb_->AddRange(0, '\n' - 1);
- re->ccb_->AddRange('\n' + 1, rune_max_);
- return PushRegexp(re);
-}
-
-// Pushes a regexp with the given op (and no args) onto the stack.
-bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
- Regexp* re = new Regexp(op, flags_);
- return PushRegexp(re);
-}
-
-// Pushes a repeat operator regexp onto the stack.
-// A valid argument for the operator must already be on the stack.
-// The char c is the name of the operator, for use in error messages.
-bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
- bool nongreedy) {
- if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
- status_->set_code(kRegexpRepeatArgument);
- status_->set_error_arg(s);
- return false;
- }
- Regexp::ParseFlags fl = flags_;
- if (nongreedy)
- fl = fl ^ NonGreedy;
+ if (flags_ & OneLine) {
+ return PushSimpleOp(kRegexpBeginText);
+ }
+ return PushSimpleOp(kRegexpBeginLine);
+}
+
+// Pushes a \b or \B onto the stack.
+bool Regexp::ParseState::PushWordBoundary(bool word) {
+ if (word)
+ return PushSimpleOp(kRegexpWordBoundary);
+ return PushSimpleOp(kRegexpNoWordBoundary);
+}
+
+// Pushes a $ onto the stack.
+bool Regexp::ParseState::PushDollar() {
+ if (flags_ & OneLine) {
+ // Clumsy marker so that MimicsPCRE() can tell whether
+ // this kRegexpEndText was a $ and not a \z.
+ Regexp::ParseFlags oflags = flags_;
+ flags_ = flags_ | WasDollar;
+ bool ret = PushSimpleOp(kRegexpEndText);
+ flags_ = oflags;
+ return ret;
+ }
+ return PushSimpleOp(kRegexpEndLine);
+}
+
+// Pushes a . onto the stack.
+bool Regexp::ParseState::PushDot() {
+ if ((flags_ & DotNL) && !(flags_ & NeverNL))
+ return PushSimpleOp(kRegexpAnyChar);
+ // Rewrite . into [^\n]
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ re->ccb_->AddRange(0, '\n' - 1);
+ re->ccb_->AddRange('\n' + 1, rune_max_);
+ return PushRegexp(re);
+}
+
+// Pushes a regexp with the given op (and no args) onto the stack.
+bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
+ Regexp* re = new Regexp(op, flags_);
+ return PushRegexp(re);
+}
+
+// Pushes a repeat operator regexp onto the stack.
+// A valid argument for the operator must already be on the stack.
+// The char c is the name of the operator, for use in error messages.
+bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
+ bool nongreedy) {
+ if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
+ status_->set_code(kRegexpRepeatArgument);
+ status_->set_error_arg(s);
+ return false;
+ }
+ Regexp::ParseFlags fl = flags_;
+ if (nongreedy)
+ fl = fl ^ NonGreedy;
// Squash **, ++ and ??. Regexp::Star() et al. handle this too, but
// they're mostly for use during simplification, not during parsing.
@@ -499,15 +499,15 @@ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
return true;
}
- Regexp* re = new Regexp(op, fl);
- re->AllocSub(1);
- re->down_ = stacktop_->down_;
- re->sub()[0] = FinishRegexp(stacktop_);
- re->simple_ = re->ComputeSimple();
- stacktop_ = re;
- return true;
-}
-
+ Regexp* re = new Regexp(op, fl);
+ re->AllocSub(1);
+ re->down_ = stacktop_->down_;
+ re->sub()[0] = FinishRegexp(stacktop_);
+ re->simple_ = re->ComputeSimple();
+ stacktop_ = re;
+ return true;
+}
+
// RepetitionWalker reports whether the repetition regexp is valid.
// Valid means that the combination of the top-level repetition
// and any inner repetitions does not exceed n copies of the
@@ -563,34 +563,34 @@ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
return 0;
}
-// Pushes a repetition regexp onto the stack.
-// A valid argument for the operator must already be on the stack.
-bool Regexp::ParseState::PushRepetition(int min, int max,
- const StringPiece& s,
- bool nongreedy) {
+// Pushes a repetition regexp onto the stack.
+// A valid argument for the operator must already be on the stack.
+bool Regexp::ParseState::PushRepetition(int min, int max,
+ const StringPiece& s,
+ bool nongreedy) {
if ((max != -1 && max < min) ||
min > maximum_repeat_count ||
max > maximum_repeat_count) {
- status_->set_code(kRegexpRepeatSize);
- status_->set_error_arg(s);
- return false;
- }
- if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
- status_->set_code(kRegexpRepeatArgument);
- status_->set_error_arg(s);
- return false;
- }
- Regexp::ParseFlags fl = flags_;
- if (nongreedy)
- fl = fl ^ NonGreedy;
- Regexp* re = new Regexp(kRegexpRepeat, fl);
- re->min_ = min;
- re->max_ = max;
- re->AllocSub(1);
- re->down_ = stacktop_->down_;
- re->sub()[0] = FinishRegexp(stacktop_);
- re->simple_ = re->ComputeSimple();
- stacktop_ = re;
+ status_->set_code(kRegexpRepeatSize);
+ status_->set_error_arg(s);
+ return false;
+ }
+ if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
+ status_->set_code(kRegexpRepeatArgument);
+ status_->set_error_arg(s);
+ return false;
+ }
+ Regexp::ParseFlags fl = flags_;
+ if (nongreedy)
+ fl = fl ^ NonGreedy;
+ Regexp* re = new Regexp(kRegexpRepeat, fl);
+ re->min_ = min;
+ re->max_ = max;
+ re->AllocSub(1);
+ re->down_ = stacktop_->down_;
+ re->sub()[0] = FinishRegexp(stacktop_);
+ re->simple_ = re->ComputeSimple();
+ stacktop_ = re;
if (min >= 2 || max >= 2) {
RepetitionWalker w;
if (w.Walk(stacktop_, maximum_repeat_count) == 0) {
@@ -599,47 +599,47 @@ bool Regexp::ParseState::PushRepetition(int min, int max,
return false;
}
}
- return true;
-}
-
-// Checks whether a particular regexp op is a marker.
-bool Regexp::ParseState::IsMarker(RegexpOp op) {
- return op >= kLeftParen;
-}
-
-// Processes a left parenthesis in the input.
-// Pushes a marker onto the stack.
-bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {
- Regexp* re = new Regexp(kLeftParen, flags_);
- re->cap_ = ++ncap_;
- if (name.data() != NULL)
+ return true;
+}
+
+// Checks whether a particular regexp op is a marker.
+bool Regexp::ParseState::IsMarker(RegexpOp op) {
+ return op >= kLeftParen;
+}
+
+// Processes a left parenthesis in the input.
+// Pushes a marker onto the stack.
+bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {
+ Regexp* re = new Regexp(kLeftParen, flags_);
+ re->cap_ = ++ncap_;
+ if (name.data() != NULL)
re->name_ = new std::string(name);
- return PushRegexp(re);
-}
-
-// Pushes a non-capturing marker onto the stack.
-bool Regexp::ParseState::DoLeftParenNoCapture() {
- Regexp* re = new Regexp(kLeftParen, flags_);
- re->cap_ = -1;
- return PushRegexp(re);
-}
-
-// Processes a vertical bar in the input.
-bool Regexp::ParseState::DoVerticalBar() {
- MaybeConcatString(-1, NoParseFlags);
- DoConcatenation();
-
- // Below the vertical bar is a list to alternate.
- // Above the vertical bar is a list to concatenate.
- // We just did the concatenation, so either swap
- // the result below the vertical bar or push a new
- // vertical bar on the stack.
- Regexp* r1;
- Regexp* r2;
- if ((r1 = stacktop_) != NULL &&
+ return PushRegexp(re);
+}
+
+// Pushes a non-capturing marker onto the stack.
+bool Regexp::ParseState::DoLeftParenNoCapture() {
+ Regexp* re = new Regexp(kLeftParen, flags_);
+ re->cap_ = -1;
+ return PushRegexp(re);
+}
+
+// Processes a vertical bar in the input.
+bool Regexp::ParseState::DoVerticalBar() {
+ MaybeConcatString(-1, NoParseFlags);
+ DoConcatenation();
+
+ // Below the vertical bar is a list to alternate.
+ // Above the vertical bar is a list to concatenate.
+ // We just did the concatenation, so either swap
+ // the result below the vertical bar or push a new
+ // vertical bar on the stack.
+ Regexp* r1;
+ Regexp* r2;
+ if ((r1 = stacktop_) != NULL &&
(r2 = r1->down_) != NULL &&
- r2->op() == kVerticalBar) {
- Regexp* r3;
+ r2->op() == kVerticalBar) {
+ Regexp* r3;
if ((r3 = r2->down_) != NULL &&
(r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
// AnyChar is above or below the vertical bar. Let it subsume
@@ -652,7 +652,7 @@ bool Regexp::ParseState::DoVerticalBar() {
stacktop_ = r2;
r1->Decref();
return true;
- }
+ }
if (r1->op() == kRegexpAnyChar &&
(r3->op() == kRegexpLiteral ||
r3->op() == kRegexpCharClass ||
@@ -664,212 +664,212 @@ bool Regexp::ParseState::DoVerticalBar() {
r3->Decref();
return true;
}
- }
- // Swap r1 below vertical bar (r2).
- r1->down_ = r2->down_;
- r2->down_ = r1;
- stacktop_ = r2;
- return true;
- }
- return PushSimpleOp(kVerticalBar);
-}
-
-// Processes a right parenthesis in the input.
-bool Regexp::ParseState::DoRightParen() {
- // Finish the current concatenation and alternation.
- DoAlternation();
-
- // The stack should be: LeftParen regexp
- // Remove the LeftParen, leaving the regexp,
- // parenthesized.
- Regexp* r1;
- Regexp* r2;
- if ((r1 = stacktop_) == NULL ||
- (r2 = r1->down_) == NULL ||
- r2->op() != kLeftParen) {
+ }
+ // Swap r1 below vertical bar (r2).
+ r1->down_ = r2->down_;
+ r2->down_ = r1;
+ stacktop_ = r2;
+ return true;
+ }
+ return PushSimpleOp(kVerticalBar);
+}
+
+// Processes a right parenthesis in the input.
+bool Regexp::ParseState::DoRightParen() {
+ // Finish the current concatenation and alternation.
+ DoAlternation();
+
+ // The stack should be: LeftParen regexp
+ // Remove the LeftParen, leaving the regexp,
+ // parenthesized.
+ Regexp* r1;
+ Regexp* r2;
+ if ((r1 = stacktop_) == NULL ||
+ (r2 = r1->down_) == NULL ||
+ r2->op() != kLeftParen) {
status_->set_code(kRegexpUnexpectedParen);
- status_->set_error_arg(whole_regexp_);
- return false;
- }
-
- // Pop off r1, r2. Will Decref or reuse below.
- stacktop_ = r2->down_;
-
- // Restore flags from when paren opened.
- Regexp* re = r2;
- flags_ = re->parse_flags();
-
- // Rewrite LeftParen as capture if needed.
- if (re->cap_ > 0) {
- re->op_ = kRegexpCapture;
- // re->cap_ is already set
- re->AllocSub(1);
- re->sub()[0] = FinishRegexp(r1);
- re->simple_ = re->ComputeSimple();
- } else {
- re->Decref();
- re = r1;
- }
- return PushRegexp(re);
-}
-
-// Processes the end of input, returning the final regexp.
-Regexp* Regexp::ParseState::DoFinish() {
- DoAlternation();
- Regexp* re = stacktop_;
- if (re != NULL && re->down_ != NULL) {
- status_->set_code(kRegexpMissingParen);
- status_->set_error_arg(whole_regexp_);
- return NULL;
- }
- stacktop_ = NULL;
- return FinishRegexp(re);
-}
-
-// Returns the leading regexp that re starts with.
-// The returned Regexp* points into a piece of re,
-// so it must not be used after the caller calls re->Decref().
-Regexp* Regexp::LeadingRegexp(Regexp* re) {
- if (re->op() == kRegexpEmptyMatch)
- return NULL;
- if (re->op() == kRegexpConcat && re->nsub() >= 2) {
- Regexp** sub = re->sub();
- if (sub[0]->op() == kRegexpEmptyMatch)
- return NULL;
- return sub[0];
- }
- return re;
-}
-
-// Removes LeadingRegexp(re) from re and returns what's left.
-// Consumes the reference to re and may edit it in place.
-// If caller wants to hold on to LeadingRegexp(re),
-// must have already Incref'ed it.
-Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
- if (re->op() == kRegexpEmptyMatch)
- return re;
- if (re->op() == kRegexpConcat && re->nsub() >= 2) {
- Regexp** sub = re->sub();
- if (sub[0]->op() == kRegexpEmptyMatch)
- return re;
- sub[0]->Decref();
- sub[0] = NULL;
- if (re->nsub() == 2) {
- // Collapse concatenation to single regexp.
- Regexp* nre = sub[1];
- sub[1] = NULL;
- re->Decref();
- return nre;
- }
- // 3 or more -> 2 or more.
- re->nsub_--;
- memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
- return re;
- }
- Regexp::ParseFlags pf = re->parse_flags();
- re->Decref();
- return new Regexp(kRegexpEmptyMatch, pf);
-}
-
-// Returns the leading string that re starts with.
-// The returned Rune* points into a piece of re,
-// so it must not be used after the caller calls re->Decref().
-Rune* Regexp::LeadingString(Regexp* re, int *nrune,
- Regexp::ParseFlags *flags) {
- while (re->op() == kRegexpConcat && re->nsub() > 0)
- re = re->sub()[0];
-
- *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
-
- if (re->op() == kRegexpLiteral) {
- *nrune = 1;
- return &re->rune_;
- }
-
- if (re->op() == kRegexpLiteralString) {
- *nrune = re->nrunes_;
- return re->runes_;
- }
-
- *nrune = 0;
- return NULL;
-}
-
-// Removes the first n leading runes from the beginning of re.
-// Edits re in place.
-void Regexp::RemoveLeadingString(Regexp* re, int n) {
- // Chase down concats to find first string.
- // For regexps generated by parser, nested concats are
- // flattened except when doing so would overflow the 16-bit
- // limit on the size of a concatenation, so we should never
- // see more than two here.
- Regexp* stk[4];
+ status_->set_error_arg(whole_regexp_);
+ return false;
+ }
+
+ // Pop off r1, r2. Will Decref or reuse below.
+ stacktop_ = r2->down_;
+
+ // Restore flags from when paren opened.
+ Regexp* re = r2;
+ flags_ = re->parse_flags();
+
+ // Rewrite LeftParen as capture if needed.
+ if (re->cap_ > 0) {
+ re->op_ = kRegexpCapture;
+ // re->cap_ is already set
+ re->AllocSub(1);
+ re->sub()[0] = FinishRegexp(r1);
+ re->simple_ = re->ComputeSimple();
+ } else {
+ re->Decref();
+ re = r1;
+ }
+ return PushRegexp(re);
+}
+
+// Processes the end of input, returning the final regexp.
+Regexp* Regexp::ParseState::DoFinish() {
+ DoAlternation();
+ Regexp* re = stacktop_;
+ if (re != NULL && re->down_ != NULL) {
+ status_->set_code(kRegexpMissingParen);
+ status_->set_error_arg(whole_regexp_);
+ return NULL;
+ }
+ stacktop_ = NULL;
+ return FinishRegexp(re);
+}
+
+// Returns the leading regexp that re starts with.
+// The returned Regexp* points into a piece of re,
+// so it must not be used after the caller calls re->Decref().
+Regexp* Regexp::LeadingRegexp(Regexp* re) {
+ if (re->op() == kRegexpEmptyMatch)
+ return NULL;
+ if (re->op() == kRegexpConcat && re->nsub() >= 2) {
+ Regexp** sub = re->sub();
+ if (sub[0]->op() == kRegexpEmptyMatch)
+ return NULL;
+ return sub[0];
+ }
+ return re;
+}
+
+// Removes LeadingRegexp(re) from re and returns what's left.
+// Consumes the reference to re and may edit it in place.
+// If caller wants to hold on to LeadingRegexp(re),
+// must have already Incref'ed it.
+Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
+ if (re->op() == kRegexpEmptyMatch)
+ return re;
+ if (re->op() == kRegexpConcat && re->nsub() >= 2) {
+ Regexp** sub = re->sub();
+ if (sub[0]->op() == kRegexpEmptyMatch)
+ return re;
+ sub[0]->Decref();
+ sub[0] = NULL;
+ if (re->nsub() == 2) {
+ // Collapse concatenation to single regexp.
+ Regexp* nre = sub[1];
+ sub[1] = NULL;
+ re->Decref();
+ return nre;
+ }
+ // 3 or more -> 2 or more.
+ re->nsub_--;
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
+ return re;
+ }
+ Regexp::ParseFlags pf = re->parse_flags();
+ re->Decref();
+ return new Regexp(kRegexpEmptyMatch, pf);
+}
+
+// Returns the leading string that re starts with.
+// The returned Rune* points into a piece of re,
+// so it must not be used after the caller calls re->Decref().
+Rune* Regexp::LeadingString(Regexp* re, int *nrune,
+ Regexp::ParseFlags *flags) {
+ while (re->op() == kRegexpConcat && re->nsub() > 0)
+ re = re->sub()[0];
+
+ *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
+
+ if (re->op() == kRegexpLiteral) {
+ *nrune = 1;
+ return &re->rune_;
+ }
+
+ if (re->op() == kRegexpLiteralString) {
+ *nrune = re->nrunes_;
+ return re->runes_;
+ }
+
+ *nrune = 0;
+ return NULL;
+}
+
+// Removes the first n leading runes from the beginning of re.
+// Edits re in place.
+void Regexp::RemoveLeadingString(Regexp* re, int n) {
+ // Chase down concats to find first string.
+ // For regexps generated by parser, nested concats are
+ // flattened except when doing so would overflow the 16-bit
+ // limit on the size of a concatenation, so we should never
+ // see more than two here.
+ Regexp* stk[4];
size_t d = 0;
- while (re->op() == kRegexpConcat) {
- if (d < arraysize(stk))
- stk[d++] = re;
- re = re->sub()[0];
- }
-
- // Remove leading string from re.
- if (re->op() == kRegexpLiteral) {
- re->rune_ = 0;
- re->op_ = kRegexpEmptyMatch;
- } else if (re->op() == kRegexpLiteralString) {
- if (n >= re->nrunes_) {
- delete[] re->runes_;
- re->runes_ = NULL;
- re->nrunes_ = 0;
- re->op_ = kRegexpEmptyMatch;
- } else if (n == re->nrunes_ - 1) {
- Rune rune = re->runes_[re->nrunes_ - 1];
- delete[] re->runes_;
- re->runes_ = NULL;
- re->nrunes_ = 0;
- re->rune_ = rune;
- re->op_ = kRegexpLiteral;
- } else {
- re->nrunes_ -= n;
- memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
- }
- }
-
- // If re is now empty, concatenations might simplify too.
+ while (re->op() == kRegexpConcat) {
+ if (d < arraysize(stk))
+ stk[d++] = re;
+ re = re->sub()[0];
+ }
+
+ // Remove leading string from re.
+ if (re->op() == kRegexpLiteral) {
+ re->rune_ = 0;
+ re->op_ = kRegexpEmptyMatch;
+ } else if (re->op() == kRegexpLiteralString) {
+ if (n >= re->nrunes_) {
+ delete[] re->runes_;
+ re->runes_ = NULL;
+ re->nrunes_ = 0;
+ re->op_ = kRegexpEmptyMatch;
+ } else if (n == re->nrunes_ - 1) {
+ Rune rune = re->runes_[re->nrunes_ - 1];
+ delete[] re->runes_;
+ re->runes_ = NULL;
+ re->nrunes_ = 0;
+ re->rune_ = rune;
+ re->op_ = kRegexpLiteral;
+ } else {
+ re->nrunes_ -= n;
+ memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
+ }
+ }
+
+ // If re is now empty, concatenations might simplify too.
while (d > 0) {
re = stk[--d];
- Regexp** sub = re->sub();
- if (sub[0]->op() == kRegexpEmptyMatch) {
- sub[0]->Decref();
- sub[0] = NULL;
- // Delete first element of concat.
- switch (re->nsub()) {
- case 0:
- case 1:
- // Impossible.
- LOG(DFATAL) << "Concat of " << re->nsub();
- re->submany_ = NULL;
- re->op_ = kRegexpEmptyMatch;
- break;
-
- case 2: {
- // Replace re with sub[1].
- Regexp* old = sub[1];
- sub[1] = NULL;
- re->Swap(old);
- old->Decref();
- break;
- }
-
- default:
- // Slide down.
- re->nsub_--;
- memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
- break;
- }
- }
- }
-}
-
+ Regexp** sub = re->sub();
+ if (sub[0]->op() == kRegexpEmptyMatch) {
+ sub[0]->Decref();
+ sub[0] = NULL;
+ // Delete first element of concat.
+ switch (re->nsub()) {
+ case 0:
+ case 1:
+ // Impossible.
+ LOG(DFATAL) << "Concat of " << re->nsub();
+ re->submany_ = NULL;
+ re->op_ = kRegexpEmptyMatch;
+ break;
+
+ case 2: {
+ // Replace re with sub[1].
+ Regexp* old = sub[1];
+ sub[1] = NULL;
+ re->Swap(old);
+ old->Decref();
+ break;
+ }
+
+ default:
+ // Slide down.
+ re->nsub_--;
+ memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
+ break;
+ }
+ }
+ }
+}
+
// In the context of factoring alternations, a Splice is: a factored prefix or
// merged character class computed by one iteration of one round of factoring;
// the span of subexpressions of the alternation to be "spliced" (i.e. removed
@@ -921,28 +921,28 @@ class FactorAlternationImpl {
std::vector<Splice>* splices);
};
-// Factors common prefixes from alternation.
-// For example,
-// ABC|ABD|AEF|BCX|BCY
-// simplifies to
-// A(B(C|D)|EF)|BC(X|Y)
+// Factors common prefixes from alternation.
+// For example,
+// ABC|ABD|AEF|BCX|BCY
+// simplifies to
+// A(B(C|D)|EF)|BC(X|Y)
// and thence to
-// A(B[CD]|EF)|BC[XY]
-//
-// Rewrites sub to contain simplified list to alternate and returns
-// the new length of sub. Adjusts reference counts accordingly
-// (incoming sub[i] decremented, outgoing sub[i] incremented).
+// A(B[CD]|EF)|BC[XY]
+//
+// Rewrites sub to contain simplified list to alternate and returns
+// the new length of sub. Adjusts reference counts accordingly
+// (incoming sub[i] decremented, outgoing sub[i] incremented).
int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
std::vector<Frame> stk;
stk.emplace_back(sub, nsub);
-
+
for (;;) {
auto& sub = stk.back().sub;
auto& nsub = stk.back().nsub;
auto& round = stk.back().round;
auto& splices = stk.back().splices;
auto& spliceidx = stk.back().spliceidx;
-
+
if (splices.empty()) {
// Advance to the next round of factoring. Note that this covers
// the initialised state: when splices is empty and round is 0.
@@ -990,7 +990,7 @@ int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
// Advance to the next round of factoring.
round++;
}
-
+
switch (round) {
case 1:
FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
@@ -1018,7 +1018,7 @@ int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
LOG(DFATAL) << "unknown round: " << round;
break;
}
-
+
// Set spliceidx depending on whether we have Splices to factor.
if (splices.empty() || round == 3) {
spliceidx = static_cast<int>(splices.size());
@@ -1027,59 +1027,59 @@ int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
}
}
}
-
+
void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
Regexp::ParseFlags flags,
std::vector<Splice>* splices) {
- // Round 1: Factor out common literal prefixes.
+ // Round 1: Factor out common literal prefixes.
int start = 0;
Rune* rune = NULL;
- int nrune = 0;
- Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
+ int nrune = 0;
+ Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
for (int i = 0; i <= nsub; i++) {
// Invariant: sub[start:i] consists of regexps that all
// begin with rune[0:nrune].
- Rune* rune_i = NULL;
- int nrune_i = 0;
- Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
+ Rune* rune_i = NULL;
+ int nrune_i = 0;
+ Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
if (i < nsub) {
rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);
- if (runeflags_i == runeflags) {
- int same = 0;
- while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
- same++;
- if (same > 0) {
- // Matches at least one rune in current range. Keep going around.
- nrune = same;
- continue;
- }
- }
- }
-
- // Found end of a run with common leading literal string:
+ if (runeflags_i == runeflags) {
+ int same = 0;
+ while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
+ same++;
+ if (same > 0) {
+ // Matches at least one rune in current range. Keep going around.
+ nrune = same;
+ continue;
+ }
+ }
+ }
+
+ // Found end of a run with common leading literal string:
// sub[start:i] all begin with rune[0:nrune],
// but sub[i] does not even begin with rune[0].
- if (i == start) {
- // Nothing to do - first iteration.
- } else if (i == start+1) {
- // Just one: don't bother factoring.
- } else {
+ if (i == start) {
+ // Nothing to do - first iteration.
+ } else if (i == start+1) {
+ // Just one: don't bother factoring.
+ } else {
Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags);
- for (int j = start; j < i; j++)
+ for (int j = start; j < i; j++)
Regexp::RemoveLeadingString(sub[j], nrune);
splices->emplace_back(prefix, sub + start, i - start);
- }
-
+ }
+
// Prepare for next iteration (if there is one).
if (i < nsub) {
- start = i;
- rune = rune_i;
- nrune = nrune_i;
- runeflags = runeflags_i;
- }
- }
+ start = i;
+ rune = rune_i;
+ nrune = nrune_i;
+ runeflags = runeflags_i;
+ }
+ }
}
-
+
void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
Regexp::ParseFlags flags,
std::vector<Splice>* splices) {
@@ -1092,11 +1092,11 @@ void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
// distinct paths through the automaton, which affects
// correctness in some cases.
int start = 0;
- Regexp* first = NULL;
+ Regexp* first = NULL;
for (int i = 0; i <= nsub; i++) {
// Invariant: sub[start:i] consists of regexps that all
// begin with first.
- Regexp* first_i = NULL;
+ Regexp* first_i = NULL;
if (i < nsub) {
first_i = Regexp::LeadingRegexp(sub[i]);
if (first != NULL &&
@@ -1119,31 +1119,31 @@ void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
first->sub()[0]->op() == kRegexpAnyChar ||
first->sub()[0]->op() == kRegexpAnyByte))) &&
Regexp::Equal(first, first_i))
- continue;
- }
-
- // Found end of a run with common leading regexp:
+ continue;
+ }
+
+ // Found end of a run with common leading regexp:
// sub[start:i] all begin with first,
// but sub[i] does not.
- if (i == start) {
- // Nothing to do - first iteration.
- } else if (i == start+1) {
- // Just one: don't bother factoring.
- } else {
+ if (i == start) {
+ // Nothing to do - first iteration.
+ } else if (i == start+1) {
+ // Just one: don't bother factoring.
+ } else {
Regexp* prefix = first->Incref();
- for (int j = start; j < i; j++)
+ for (int j = start; j < i; j++)
sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);
splices->emplace_back(prefix, sub + start, i - start);
- }
-
+ }
+
// Prepare for next iteration (if there is one).
if (i < nsub) {
- start = i;
- first = first_i;
- }
- }
+ start = i;
+ first = first_i;
+ }
+ }
}
-
+
void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
Regexp::ParseFlags flags,
std::vector<Splice>* splices) {
@@ -1163,234 +1163,234 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
first_i->op() == kRegexpCharClass))
continue;
}
-
+
// Found end of a run of Literal/CharClass:
// sub[start:i] all are either one or the other,
// but sub[i] is not.
- if (i == start) {
+ if (i == start) {
// Nothing to do - first iteration.
- } else if (i == start+1) {
+ } else if (i == start+1) {
// Just one: don't bother factoring.
- } else {
- CharClassBuilder ccb;
- for (int j = start; j < i; j++) {
- Regexp* re = sub[j];
- if (re->op() == kRegexpCharClass) {
- CharClass* cc = re->cc();
- for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
- ccb.AddRange(it->lo, it->hi);
- } else if (re->op() == kRegexpLiteral) {
- ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
- } else {
- LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
- << re->ToString();
- }
- re->Decref();
- }
+ } else {
+ CharClassBuilder ccb;
+ for (int j = start; j < i; j++) {
+ Regexp* re = sub[j];
+ if (re->op() == kRegexpCharClass) {
+ CharClass* cc = re->cc();
+ for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
+ ccb.AddRange(it->lo, it->hi);
+ } else if (re->op() == kRegexpLiteral) {
+ ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
+ } else {
+ LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
+ << re->ToString();
+ }
+ re->Decref();
+ }
Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
splices->emplace_back(re, sub + start, i - start);
- }
-
+ }
+
// Prepare for next iteration (if there is one).
if (i < nsub) {
start = i;
first = first_i;
- }
- }
-}
-
-// Collapse the regexps on top of the stack, down to the
-// first marker, into a new op node (op == kRegexpAlternate
-// or op == kRegexpConcat).
-void Regexp::ParseState::DoCollapse(RegexpOp op) {
- // Scan backward to marker, counting children of composite.
- int n = 0;
- Regexp* next = NULL;
- Regexp* sub;
- for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
- next = sub->down_;
- if (sub->op_ == op)
- n += sub->nsub_;
- else
- n++;
- }
-
- // If there's just one child, leave it alone.
- // (Concat of one thing is that one thing; alternate of one thing is same.)
- if (stacktop_ != NULL && stacktop_->down_ == next)
- return;
-
- // Construct op (alternation or concatenation), flattening op of op.
+ }
+ }
+}
+
+// Collapse the regexps on top of the stack, down to the
+// first marker, into a new op node (op == kRegexpAlternate
+// or op == kRegexpConcat).
+void Regexp::ParseState::DoCollapse(RegexpOp op) {
+ // Scan backward to marker, counting children of composite.
+ int n = 0;
+ Regexp* next = NULL;
+ Regexp* sub;
+ for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
+ next = sub->down_;
+ if (sub->op_ == op)
+ n += sub->nsub_;
+ else
+ n++;
+ }
+
+ // If there's just one child, leave it alone.
+ // (Concat of one thing is that one thing; alternate of one thing is same.)
+ if (stacktop_ != NULL && stacktop_->down_ == next)
+ return;
+
+ // Construct op (alternation or concatenation), flattening op of op.
PODArray<Regexp*> subs(n);
- next = NULL;
- int i = n;
- for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
- next = sub->down_;
- if (sub->op_ == op) {
- Regexp** sub_subs = sub->sub();
- for (int k = sub->nsub_ - 1; k >= 0; k--)
- subs[--i] = sub_subs[k]->Incref();
- sub->Decref();
- } else {
- subs[--i] = FinishRegexp(sub);
- }
- }
-
+ next = NULL;
+ int i = n;
+ for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
+ next = sub->down_;
+ if (sub->op_ == op) {
+ Regexp** sub_subs = sub->sub();
+ for (int k = sub->nsub_ - 1; k >= 0; k--)
+ subs[--i] = sub_subs[k]->Incref();
+ sub->Decref();
+ } else {
+ subs[--i] = FinishRegexp(sub);
+ }
+ }
+
Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true);
- re->simple_ = re->ComputeSimple();
- re->down_ = next;
- stacktop_ = re;
-}
-
-// Finishes the current concatenation,
-// collapsing it into a single regexp on the stack.
-void Regexp::ParseState::DoConcatenation() {
- Regexp* r1 = stacktop_;
- if (r1 == NULL || IsMarker(r1->op())) {
- // empty concatenation is special case
- Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);
- PushRegexp(re);
- }
- DoCollapse(kRegexpConcat);
-}
-
-// Finishes the current alternation,
-// collapsing it to a single regexp on the stack.
-void Regexp::ParseState::DoAlternation() {
- DoVerticalBar();
- // Now stack top is kVerticalBar.
- Regexp* r1 = stacktop_;
- stacktop_ = r1->down_;
- r1->Decref();
- DoCollapse(kRegexpAlternate);
-}
-
-// Incremental conversion of concatenated literals into strings.
-// If top two elements on stack are both literal or string,
-// collapse into single string.
-// Don't walk down the stack -- the parser calls this frequently
-// enough that below the bottom two is known to be collapsed.
-// Only called when another regexp is about to be pushed
-// on the stack, so that the topmost literal is not being considered.
-// (Otherwise ab* would turn into (ab)*.)
-// If r >= 0, consider pushing a literal r on the stack.
-// Return whether that happened.
-bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
- Regexp* re1;
- Regexp* re2;
- if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
- return false;
-
- if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
- return false;
- if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
- return false;
- if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
- return false;
-
- if (re2->op_ == kRegexpLiteral) {
- // convert into string
- Rune rune = re2->rune_;
- re2->op_ = kRegexpLiteralString;
- re2->nrunes_ = 0;
- re2->runes_ = NULL;
- re2->AddRuneToString(rune);
- }
-
- // push re1 into re2.
- if (re1->op_ == kRegexpLiteral) {
- re2->AddRuneToString(re1->rune_);
- } else {
- for (int i = 0; i < re1->nrunes_; i++)
- re2->AddRuneToString(re1->runes_[i]);
- re1->nrunes_ = 0;
- delete[] re1->runes_;
- re1->runes_ = NULL;
- }
-
- // reuse re1 if possible
- if (r >= 0) {
- re1->op_ = kRegexpLiteral;
- re1->rune_ = r;
+ re->simple_ = re->ComputeSimple();
+ re->down_ = next;
+ stacktop_ = re;
+}
+
+// Finishes the current concatenation,
+// collapsing it into a single regexp on the stack.
+void Regexp::ParseState::DoConcatenation() {
+ Regexp* r1 = stacktop_;
+ if (r1 == NULL || IsMarker(r1->op())) {
+ // empty concatenation is special case
+ Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);
+ PushRegexp(re);
+ }
+ DoCollapse(kRegexpConcat);
+}
+
+// Finishes the current alternation,
+// collapsing it to a single regexp on the stack.
+void Regexp::ParseState::DoAlternation() {
+ DoVerticalBar();
+ // Now stack top is kVerticalBar.
+ Regexp* r1 = stacktop_;
+ stacktop_ = r1->down_;
+ r1->Decref();
+ DoCollapse(kRegexpAlternate);
+}
+
+// Incremental conversion of concatenated literals into strings.
+// If top two elements on stack are both literal or string,
+// collapse into single string.
+// Don't walk down the stack -- the parser calls this frequently
+// enough that below the bottom two is known to be collapsed.
+// Only called when another regexp is about to be pushed
+// on the stack, so that the topmost literal is not being considered.
+// (Otherwise ab* would turn into (ab)*.)
+// If r >= 0, consider pushing a literal r on the stack.
+// Return whether that happened.
+bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
+ Regexp* re1;
+ Regexp* re2;
+ if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
+ return false;
+
+ if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
+ return false;
+ if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
+ return false;
+ if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
+ return false;
+
+ if (re2->op_ == kRegexpLiteral) {
+ // convert into string
+ Rune rune = re2->rune_;
+ re2->op_ = kRegexpLiteralString;
+ re2->nrunes_ = 0;
+ re2->runes_ = NULL;
+ re2->AddRuneToString(rune);
+ }
+
+ // push re1 into re2.
+ if (re1->op_ == kRegexpLiteral) {
+ re2->AddRuneToString(re1->rune_);
+ } else {
+ for (int i = 0; i < re1->nrunes_; i++)
+ re2->AddRuneToString(re1->runes_[i]);
+ re1->nrunes_ = 0;
+ delete[] re1->runes_;
+ re1->runes_ = NULL;
+ }
+
+ // reuse re1 if possible
+ if (r >= 0) {
+ re1->op_ = kRegexpLiteral;
+ re1->rune_ = r;
re1->parse_flags_ = static_cast<uint16_t>(flags);
- return true;
- }
-
- stacktop_ = re2;
- re1->Decref();
- return false;
-}
-
-// Lexing routines.
-
+ return true;
+ }
+
+ stacktop_ = re2;
+ re1->Decref();
+ return false;
+}
+
+// Lexing routines.
+
// Parses a decimal integer, storing it in *np.
-// Sets *s to span the remainder of the string.
-static bool ParseInteger(StringPiece* s, int* np) {
+// Sets *s to span the remainder of the string.
+static bool ParseInteger(StringPiece* s, int* np) {
if (s->empty() || !isdigit((*s)[0] & 0xFF))
- return false;
- // Disallow leading zeros.
+ return false;
+ // Disallow leading zeros.
if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF))
- return false;
- int n = 0;
- int c;
+ return false;
+ int n = 0;
+ int c;
while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) {
- // Avoid overflow.
- if (n >= 100000000)
- return false;
- n = n*10 + c - '0';
- s->remove_prefix(1); // digit
- }
- *np = n;
- return true;
-}
-
-// Parses a repetition suffix like {1,2} or {2} or {2,}.
-// Sets *s to span the remainder of the string on success.
-// Sets *lo and *hi to the given range.
-// In the case of {2,}, the high number is unbounded;
-// sets *hi to -1 to signify this.
-// {,2} is NOT a valid suffix.
-// The Maybe in the name signifies that the regexp parse
-// doesn't fail even if ParseRepetition does, so the StringPiece
-// s must NOT be edited unless MaybeParseRepetition returns true.
-static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
- StringPiece s = *sp;
+ // Avoid overflow.
+ if (n >= 100000000)
+ return false;
+ n = n*10 + c - '0';
+ s->remove_prefix(1); // digit
+ }
+ *np = n;
+ return true;
+}
+
+// Parses a repetition suffix like {1,2} or {2} or {2,}.
+// Sets *s to span the remainder of the string on success.
+// Sets *lo and *hi to the given range.
+// In the case of {2,}, the high number is unbounded;
+// sets *hi to -1 to signify this.
+// {,2} is NOT a valid suffix.
+// The Maybe in the name signifies that the regexp parse
+// doesn't fail even if ParseRepetition does, so the StringPiece
+// s must NOT be edited unless MaybeParseRepetition returns true.
+static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
+ StringPiece s = *sp;
if (s.empty() || s[0] != '{')
- return false;
- s.remove_prefix(1); // '{'
- if (!ParseInteger(&s, lo))
- return false;
+ return false;
+ s.remove_prefix(1); // '{'
+ if (!ParseInteger(&s, lo))
+ return false;
if (s.empty())
- return false;
- if (s[0] == ',') {
- s.remove_prefix(1); // ','
+ return false;
+ if (s[0] == ',') {
+ s.remove_prefix(1); // ','
if (s.empty())
- return false;
- if (s[0] == '}') {
- // {2,} means at least 2
- *hi = -1;
- } else {
- // {2,4} means 2, 3, or 4.
- if (!ParseInteger(&s, hi))
- return false;
- }
- } else {
- // {2} means exactly two
- *hi = *lo;
- }
+ return false;
+ if (s[0] == '}') {
+ // {2,} means at least 2
+ *hi = -1;
+ } else {
+ // {2,4} means 2, 3, or 4.
+ if (!ParseInteger(&s, hi))
+ return false;
+ }
+ } else {
+ // {2} means exactly two
+ *hi = *lo;
+ }
if (s.empty() || s[0] != '}')
- return false;
- s.remove_prefix(1); // '}'
- *sp = s;
- return true;
-}
-
-// Removes the next Rune from the StringPiece and stores it in *r.
-// Returns number of bytes removed from sp.
-// Behaves as though there is a terminating NUL at the end of sp.
-// Argument order is backwards from usual Google style
-// but consistent with chartorune.
-static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) {
+ return false;
+ s.remove_prefix(1); // '}'
+ *sp = s;
+ return true;
+}
+
+// Removes the next Rune from the StringPiece and stores it in *r.
+// Returns number of bytes removed from sp.
+// Behaves as though there is a terminating NUL at the end of sp.
+// Argument order is backwards from usual Google style
+// but consistent with chartorune.
+static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) {
// fullrune() takes int, not size_t. However, it just looks
// at the leading byte and treats any length >= 4 the same.
if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) {
@@ -1403,278 +1403,278 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) {
n = 1;
*r = Runeerror;
}
- if (!(n == 1 && *r == Runeerror)) { // no decoding error
- sp->remove_prefix(n);
- return n;
- }
- }
-
+ if (!(n == 1 && *r == Runeerror)) { // no decoding error
+ sp->remove_prefix(n);
+ return n;
+ }
+ }
+
if (status != NULL) {
status->set_code(kRegexpBadUTF8);
status->set_error_arg(StringPiece());
}
- return -1;
-}
-
+ return -1;
+}
+
// Returns whether name is valid UTF-8.
// If not, sets status to kRegexpBadUTF8.
-static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
- StringPiece t = s;
- Rune r;
+static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
+ StringPiece t = s;
+ Rune r;
while (!t.empty()) {
- if (StringPieceToRune(&r, &t, status) < 0)
- return false;
- }
- return true;
-}
-
-// Is c a hex digit?
-static int IsHex(int c) {
- return ('0' <= c && c <= '9') ||
- ('A' <= c && c <= 'F') ||
- ('a' <= c && c <= 'f');
-}
-
-// Convert hex digit to value.
-static int UnHex(int c) {
- if ('0' <= c && c <= '9')
- return c - '0';
- if ('A' <= c && c <= 'F')
- return c - 'A' + 10;
- if ('a' <= c && c <= 'f')
- return c - 'a' + 10;
- LOG(DFATAL) << "Bad hex digit " << c;
- return 0;
-}
-
-// Parse an escape sequence (e.g., \n, \{).
-// Sets *s to span the remainder of the string.
-// Sets *rp to the named character.
-static bool ParseEscape(StringPiece* s, Rune* rp,
- RegexpStatus* status, int rune_max) {
+ if (StringPieceToRune(&r, &t, status) < 0)
+ return false;
+ }
+ return true;
+}
+
+// Is c a hex digit?
+static int IsHex(int c) {
+ return ('0' <= c && c <= '9') ||
+ ('A' <= c && c <= 'F') ||
+ ('a' <= c && c <= 'f');
+}
+
+// Convert hex digit to value.
+static int UnHex(int c) {
+ if ('0' <= c && c <= '9')
+ return c - '0';
+ if ('A' <= c && c <= 'F')
+ return c - 'A' + 10;
+ if ('a' <= c && c <= 'f')
+ return c - 'a' + 10;
+ LOG(DFATAL) << "Bad hex digit " << c;
+ return 0;
+}
+
+// Parse an escape sequence (e.g., \n, \{).
+// Sets *s to span the remainder of the string.
+// Sets *rp to the named character.
+static bool ParseEscape(StringPiece* s, Rune* rp,
+ RegexpStatus* status, int rune_max) {
const char* begin = s->data();
if (s->empty() || (*s)[0] != '\\') {
- // Should not happen - caller always checks.
- status->set_code(kRegexpInternalError);
+ // Should not happen - caller always checks.
+ status->set_code(kRegexpInternalError);
status->set_error_arg(StringPiece());
- return false;
- }
+ return false;
+ }
if (s->size() == 1) {
- status->set_code(kRegexpTrailingBackslash);
+ status->set_code(kRegexpTrailingBackslash);
status->set_error_arg(StringPiece());
- return false;
- }
- Rune c, c1;
- s->remove_prefix(1); // backslash
- if (StringPieceToRune(&c, s, status) < 0)
- return false;
- int code;
- switch (c) {
- default:
+ return false;
+ }
+ Rune c, c1;
+ s->remove_prefix(1); // backslash
+ if (StringPieceToRune(&c, s, status) < 0)
+ return false;
+ int code;
+ switch (c) {
+ default:
if (c < Runeself && !isalpha(c) && !isdigit(c)) {
- // Escaped non-word characters are always themselves.
- // PCRE is not quite so rigorous: it accepts things like
- // \q, but we don't. We once rejected \_, but too many
- // programs and people insist on using it, so allow \_.
- *rp = c;
- return true;
- }
- goto BadEscape;
-
- // Octal escapes.
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- // Single non-zero octal digit is a backreference; not supported.
+ // Escaped non-word characters are always themselves.
+ // PCRE is not quite so rigorous: it accepts things like
+ // \q, but we don't. We once rejected \_, but too many
+ // programs and people insist on using it, so allow \_.
+ *rp = c;
+ return true;
+ }
+ goto BadEscape;
+
+ // Octal escapes.
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ // Single non-zero octal digit is a backreference; not supported.
if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7')
- goto BadEscape;
+ goto BadEscape;
FALLTHROUGH_INTENDED;
- case '0':
- // consume up to three octal digits; already have one.
- code = c - '0';
+ case '0':
+ // consume up to three octal digits; already have one.
+ code = c - '0';
if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') {
- code = code * 8 + c - '0';
- s->remove_prefix(1); // digit
+ code = code * 8 + c - '0';
+ s->remove_prefix(1); // digit
if (!s->empty()) {
- c = (*s)[0];
- if ('0' <= c && c <= '7') {
- code = code * 8 + c - '0';
- s->remove_prefix(1); // digit
- }
- }
- }
+ c = (*s)[0];
+ if ('0' <= c && c <= '7') {
+ code = code * 8 + c - '0';
+ s->remove_prefix(1); // digit
+ }
+ }
+ }
if (code > rune_max)
goto BadEscape;
- *rp = code;
- return true;
-
- // Hexadecimal escapes
- case 'x':
+ *rp = code;
+ return true;
+
+ // Hexadecimal escapes
+ case 'x':
if (s->empty())
- goto BadEscape;
- if (StringPieceToRune(&c, s, status) < 0)
- return false;
- if (c == '{') {
- // Any number of digits in braces.
- // Update n as we consume the string, so that
- // the whole thing gets shown in the error message.
- // Perl accepts any text at all; it ignores all text
- // after the first non-hex digit. We require only hex digits,
- // and at least one.
- if (StringPieceToRune(&c, s, status) < 0)
- return false;
- int nhex = 0;
- code = 0;
- while (IsHex(c)) {
- nhex++;
- code = code * 16 + UnHex(c);
- if (code > rune_max)
- goto BadEscape;
+ goto BadEscape;
+ if (StringPieceToRune(&c, s, status) < 0)
+ return false;
+ if (c == '{') {
+ // Any number of digits in braces.
+ // Update n as we consume the string, so that
+ // the whole thing gets shown in the error message.
+ // Perl accepts any text at all; it ignores all text
+ // after the first non-hex digit. We require only hex digits,
+ // and at least one.
+ if (StringPieceToRune(&c, s, status) < 0)
+ return false;
+ int nhex = 0;
+ code = 0;
+ while (IsHex(c)) {
+ nhex++;
+ code = code * 16 + UnHex(c);
+ if (code > rune_max)
+ goto BadEscape;
if (s->empty())
- goto BadEscape;
- if (StringPieceToRune(&c, s, status) < 0)
- return false;
- }
- if (c != '}' || nhex == 0)
- goto BadEscape;
- *rp = code;
- return true;
- }
- // Easy case: two hex digits.
+ goto BadEscape;
+ if (StringPieceToRune(&c, s, status) < 0)
+ return false;
+ }
+ if (c != '}' || nhex == 0)
+ goto BadEscape;
+ *rp = code;
+ return true;
+ }
+ // Easy case: two hex digits.
if (s->empty())
- goto BadEscape;
- if (StringPieceToRune(&c1, s, status) < 0)
- return false;
- if (!IsHex(c) || !IsHex(c1))
- goto BadEscape;
- *rp = UnHex(c) * 16 + UnHex(c1);
- return true;
-
- // C escapes.
- case 'n':
- *rp = '\n';
- return true;
- case 'r':
- *rp = '\r';
- return true;
- case 't':
- *rp = '\t';
- return true;
-
- // Less common C escapes.
- case 'a':
- *rp = '\a';
- return true;
- case 'f':
- *rp = '\f';
- return true;
- case 'v':
- *rp = '\v';
- return true;
-
- // This code is disabled to avoid misparsing
- // the Perl word-boundary \b as a backspace
- // when in POSIX regexp mode. Surprisingly,
- // in Perl, \b means word-boundary but [\b]
- // means backspace. We don't support that:
- // if you want a backspace embed a literal
+ goto BadEscape;
+ if (StringPieceToRune(&c1, s, status) < 0)
+ return false;
+ if (!IsHex(c) || !IsHex(c1))
+ goto BadEscape;
+ *rp = UnHex(c) * 16 + UnHex(c1);
+ return true;
+
+ // C escapes.
+ case 'n':
+ *rp = '\n';
+ return true;
+ case 'r':
+ *rp = '\r';
+ return true;
+ case 't':
+ *rp = '\t';
+ return true;
+
+ // Less common C escapes.
+ case 'a':
+ *rp = '\a';
+ return true;
+ case 'f':
+ *rp = '\f';
+ return true;
+ case 'v':
+ *rp = '\v';
+ return true;
+
+ // This code is disabled to avoid misparsing
+ // the Perl word-boundary \b as a backspace
+ // when in POSIX regexp mode. Surprisingly,
+ // in Perl, \b means word-boundary but [\b]
+ // means backspace. We don't support that:
+ // if you want a backspace embed a literal
// backspace character or use \x08.
- //
- // case 'b':
- // *rp = '\b';
- // return true;
- }
-
- LOG(DFATAL) << "Not reached in ParseEscape.";
-
-BadEscape:
- // Unrecognized escape sequence.
- status->set_code(kRegexpBadEscape);
+ //
+ // case 'b':
+ // *rp = '\b';
+ // return true;
+ }
+
+ LOG(DFATAL) << "Not reached in ParseEscape.";
+
+BadEscape:
+ // Unrecognized escape sequence.
+ status->set_code(kRegexpBadEscape);
status->set_error_arg(
StringPiece(begin, static_cast<size_t>(s->data() - begin)));
- return false;
-}
-
-// Add a range to the character class, but exclude newline if asked.
-// Also handle case folding.
-void CharClassBuilder::AddRangeFlags(
- Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
-
- // Take out \n if the flags say so.
- bool cutnl = !(parse_flags & Regexp::ClassNL) ||
- (parse_flags & Regexp::NeverNL);
- if (cutnl && lo <= '\n' && '\n' <= hi) {
- if (lo < '\n')
- AddRangeFlags(lo, '\n' - 1, parse_flags);
- if (hi > '\n')
- AddRangeFlags('\n' + 1, hi, parse_flags);
- return;
- }
-
- // If folding case, add fold-equivalent characters too.
- if (parse_flags & Regexp::FoldCase)
- AddFoldedRange(this, lo, hi, 0);
- else
- AddRange(lo, hi);
-}
-
-// Look for a group with the given name.
+ return false;
+}
+
+// Add a range to the character class, but exclude newline if asked.
+// Also handle case folding.
+void CharClassBuilder::AddRangeFlags(
+ Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
+
+ // Take out \n if the flags say so.
+ bool cutnl = !(parse_flags & Regexp::ClassNL) ||
+ (parse_flags & Regexp::NeverNL);
+ if (cutnl && lo <= '\n' && '\n' <= hi) {
+ if (lo < '\n')
+ AddRangeFlags(lo, '\n' - 1, parse_flags);
+ if (hi > '\n')
+ AddRangeFlags('\n' + 1, hi, parse_flags);
+ return;
+ }
+
+ // If folding case, add fold-equivalent characters too.
+ if (parse_flags & Regexp::FoldCase)
+ AddFoldedRange(this, lo, hi, 0);
+ else
+ AddRange(lo, hi);
+}
+
+// Look for a group with the given name.
static const UGroup* LookupGroup(const StringPiece& name,
const UGroup *groups, int ngroups) {
- // Simple name lookup.
- for (int i = 0; i < ngroups; i++)
- if (StringPiece(groups[i].name) == name)
- return &groups[i];
- return NULL;
-}
-
-// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
+ // Simple name lookup.
+ for (int i = 0; i < ngroups; i++)
+ if (StringPiece(groups[i].name) == name)
+ return &groups[i];
+ return NULL;
+}
+
+// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
static const UGroup* LookupPosixGroup(const StringPiece& name) {
- return LookupGroup(name, posix_groups, num_posix_groups);
-}
-
+ return LookupGroup(name, posix_groups, num_posix_groups);
+}
+
static const UGroup* LookupPerlGroup(const StringPiece& name) {
- return LookupGroup(name, perl_groups, num_perl_groups);
-}
-
+ return LookupGroup(name, perl_groups, num_perl_groups);
+}
+
#if !defined(RE2_USE_ICU)
// Fake UGroup containing all Runes
static URange16 any16[] = { { 0, 65535 } };
static URange32 any32[] = { { 65536, Runemax } };
static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
-// Look for a Unicode group with the given name (e.g., "Han")
+// Look for a Unicode group with the given name (e.g., "Han")
static const UGroup* LookupUnicodeGroup(const StringPiece& name) {
- // Special case: "Any" means any.
- if (name == StringPiece("Any"))
- return &anygroup;
- return LookupGroup(name, unicode_groups, num_unicode_groups);
-}
+ // Special case: "Any" means any.
+ if (name == StringPiece("Any"))
+ return &anygroup;
+ return LookupGroup(name, unicode_groups, num_unicode_groups);
+}
#endif
-
-// Add a UGroup or its negation to the character class.
+
+// Add a UGroup or its negation to the character class.
static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,
Regexp::ParseFlags parse_flags) {
- if (sign == +1) {
- for (int i = 0; i < g->nr16; i++) {
- cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
- }
- for (int i = 0; i < g->nr32; i++) {
- cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
- }
- } else {
- if (parse_flags & Regexp::FoldCase) {
- // Normally adding a case-folded group means
- // adding all the extra fold-equivalent runes too.
- // But if we're adding the negation of the group,
- // we have to exclude all the runes that are fold-equivalent
- // to what's already missing. Too hard, so do in two steps.
- CharClassBuilder ccb1;
- AddUGroup(&ccb1, g, +1, parse_flags);
+ if (sign == +1) {
+ for (int i = 0; i < g->nr16; i++) {
+ cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
+ }
+ for (int i = 0; i < g->nr32; i++) {
+ cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
+ }
+ } else {
+ if (parse_flags & Regexp::FoldCase) {
+ // Normally adding a case-folded group means
+ // adding all the extra fold-equivalent runes too.
+ // But if we're adding the negation of the group,
+ // we have to exclude all the runes that are fold-equivalent
+ // to what's already missing. Too hard, so do in two steps.
+ CharClassBuilder ccb1;
+ AddUGroup(&ccb1, g, +1, parse_flags);
// If the flags say to take out \n, put it in, so that negating will take it out.
// Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
bool cutnl = !(parse_flags & Regexp::ClassNL) ||
@@ -1682,115 +1682,115 @@ static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,
if (cutnl) {
ccb1.AddRange('\n', '\n');
}
- ccb1.Negate();
- cc->AddCharClass(&ccb1);
- return;
- }
- int next = 0;
- for (int i = 0; i < g->nr16; i++) {
- if (next < g->r16[i].lo)
- cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
- next = g->r16[i].hi + 1;
- }
- for (int i = 0; i < g->nr32; i++) {
- if (next < g->r32[i].lo)
- cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
- next = g->r32[i].hi + 1;
- }
- if (next <= Runemax)
- cc->AddRangeFlags(next, Runemax, parse_flags);
- }
-}
-
-// Maybe parse a Perl character class escape sequence.
-// Only recognizes the Perl character classes (\d \s \w \D \S \W),
-// not the Perl empty-string classes (\b \B \A \Z \z).
-// On success, sets *s to span the remainder of the string
-// and returns the corresponding UGroup.
-// The StringPiece must *NOT* be edited unless the call succeeds.
+ ccb1.Negate();
+ cc->AddCharClass(&ccb1);
+ return;
+ }
+ int next = 0;
+ for (int i = 0; i < g->nr16; i++) {
+ if (next < g->r16[i].lo)
+ cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
+ next = g->r16[i].hi + 1;
+ }
+ for (int i = 0; i < g->nr32; i++) {
+ if (next < g->r32[i].lo)
+ cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
+ next = g->r32[i].hi + 1;
+ }
+ if (next <= Runemax)
+ cc->AddRangeFlags(next, Runemax, parse_flags);
+ }
+}
+
+// Maybe parse a Perl character class escape sequence.
+// Only recognizes the Perl character classes (\d \s \w \D \S \W),
+// not the Perl empty-string classes (\b \B \A \Z \z).
+// On success, sets *s to span the remainder of the string
+// and returns the corresponding UGroup.
+// The StringPiece must *NOT* be edited unless the call succeeds.
const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
- if (!(parse_flags & Regexp::PerlClasses))
- return NULL;
- if (s->size() < 2 || (*s)[0] != '\\')
- return NULL;
- // Could use StringPieceToRune, but there aren't
- // any non-ASCII Perl group names.
+ if (!(parse_flags & Regexp::PerlClasses))
+ return NULL;
+ if (s->size() < 2 || (*s)[0] != '\\')
+ return NULL;
+ // Could use StringPieceToRune, but there aren't
+ // any non-ASCII Perl group names.
StringPiece name(s->data(), 2);
const UGroup *g = LookupPerlGroup(name);
- if (g == NULL)
- return NULL;
- s->remove_prefix(name.size());
- return g;
-}
-
-enum ParseStatus {
- kParseOk, // Did some parsing.
- kParseError, // Found an error.
- kParseNothing, // Decided not to parse.
-};
-
-// Maybe parses a Unicode character group like \p{Han} or \P{Han}
-// (the latter is a negated group).
-ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
- CharClassBuilder *cc,
- RegexpStatus* status) {
- // Decide whether to parse.
- if (!(parse_flags & Regexp::UnicodeGroups))
- return kParseNothing;
- if (s->size() < 2 || (*s)[0] != '\\')
- return kParseNothing;
- Rune c = (*s)[1];
- if (c != 'p' && c != 'P')
- return kParseNothing;
-
- // Committed to parse. Results:
- int sign = +1; // -1 = negated char class
- if (c == 'P')
+ if (g == NULL)
+ return NULL;
+ s->remove_prefix(name.size());
+ return g;
+}
+
+enum ParseStatus {
+ kParseOk, // Did some parsing.
+ kParseError, // Found an error.
+ kParseNothing, // Decided not to parse.
+};
+
+// Maybe parses a Unicode character group like \p{Han} or \P{Han}
+// (the latter is a negated group).
+ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
+ CharClassBuilder *cc,
+ RegexpStatus* status) {
+ // Decide whether to parse.
+ if (!(parse_flags & Regexp::UnicodeGroups))
+ return kParseNothing;
+ if (s->size() < 2 || (*s)[0] != '\\')
+ return kParseNothing;
+ Rune c = (*s)[1];
+ if (c != 'p' && c != 'P')
+ return kParseNothing;
+
+ // Committed to parse. Results:
+ int sign = +1; // -1 = negated char class
+ if (c == 'P')
sign = -sign;
- StringPiece seq = *s; // \p{Han} or \pL
- StringPiece name; // Han or L
- s->remove_prefix(2); // '\\', 'p'
-
- if (!StringPieceToRune(&c, s, status))
- return kParseError;
- if (c != '{') {
- // Name is the bit of string we just skipped over for c.
+ StringPiece seq = *s; // \p{Han} or \pL
+ StringPiece name; // Han or L
+ s->remove_prefix(2); // '\\', 'p'
+
+ if (!StringPieceToRune(&c, s, status))
+ return kParseError;
+ if (c != '{') {
+ // Name is the bit of string we just skipped over for c.
const char* p = seq.data() + 2;
name = StringPiece(p, static_cast<size_t>(s->data() - p));
- } else {
- // Name is in braces. Look for closing }
+ } else {
+ // Name is in braces. Look for closing }
size_t end = s->find('}', 0);
if (end == StringPiece::npos) {
- if (!IsValidUTF8(seq, status))
- return kParseError;
- status->set_code(kRegexpBadCharRange);
- status->set_error_arg(seq);
- return kParseError;
- }
+ if (!IsValidUTF8(seq, status))
+ return kParseError;
+ status->set_code(kRegexpBadCharRange);
+ status->set_error_arg(seq);
+ return kParseError;
+ }
name = StringPiece(s->data(), end); // without '}'
- s->remove_prefix(end + 1); // with '}'
- if (!IsValidUTF8(name, status))
- return kParseError;
- }
-
- // Chop seq where s now begins.
+ s->remove_prefix(end + 1); // with '}'
+ if (!IsValidUTF8(name, status))
+ return kParseError;
+ }
+
+ // Chop seq where s now begins.
seq = StringPiece(seq.data(), static_cast<size_t>(s->data() - seq.data()));
-
+
if (!name.empty() && name[0] == '^') {
- sign = -sign;
- name.remove_prefix(1); // '^'
- }
+ sign = -sign;
+ name.remove_prefix(1); // '^'
+ }
#if !defined(RE2_USE_ICU)
// Look up the group in the RE2 Unicode data.
const UGroup *g = LookupUnicodeGroup(name);
- if (g == NULL) {
- status->set_code(kRegexpBadCharRange);
- status->set_error_arg(seq);
- return kParseError;
- }
-
- AddUGroup(cc, g, sign, parse_flags);
+ if (g == NULL) {
+ status->set_code(kRegexpBadCharRange);
+ status->set_error_arg(seq);
+ return kParseError;
+ }
+
+ AddUGroup(cc, g, sign, parse_flags);
#else
// Look up the group in the ICU Unicode data. Because ICU provides full
// Unicode properties support, this could be more than a lookup by name.
@@ -1815,210 +1815,210 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
AddUGroup(cc, &g, sign, parse_flags);
#endif
- return kParseOk;
-}
-
-// Parses a character class name like [:alnum:].
-// Sets *s to span the remainder of the string.
-// Adds the ranges corresponding to the class to ranges.
-static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
- CharClassBuilder *cc,
- RegexpStatus* status) {
- // Check begins with [:
- const char* p = s->data();
- const char* ep = s->data() + s->size();
- if (ep - p < 2 || p[0] != '[' || p[1] != ':')
- return kParseNothing;
-
- // Look for closing :].
- const char* q;
- for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)
- ;
-
- // If no closing :], then ignore.
- if (q > ep-2)
- return kParseNothing;
-
- // Got it. Check that it's valid.
- q += 2;
+ return kParseOk;
+}
+
+// Parses a character class name like [:alnum:].
+// Sets *s to span the remainder of the string.
+// Adds the ranges corresponding to the class to ranges.
+static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
+ CharClassBuilder *cc,
+ RegexpStatus* status) {
+ // Check begins with [:
+ const char* p = s->data();
+ const char* ep = s->data() + s->size();
+ if (ep - p < 2 || p[0] != '[' || p[1] != ':')
+ return kParseNothing;
+
+ // Look for closing :].
+ const char* q;
+ for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)
+ ;
+
+ // If no closing :], then ignore.
+ if (q > ep-2)
+ return kParseNothing;
+
+ // Got it. Check that it's valid.
+ q += 2;
StringPiece name(p, static_cast<size_t>(q - p));
-
+
const UGroup *g = LookupPosixGroup(name);
- if (g == NULL) {
- status->set_code(kRegexpBadCharRange);
- status->set_error_arg(name);
- return kParseError;
- }
-
- s->remove_prefix(name.size());
- AddUGroup(cc, g, g->sign, parse_flags);
- return kParseOk;
-}
-
-// Parses a character inside a character class.
-// There are fewer special characters here than in the rest of the regexp.
-// Sets *s to span the remainder of the string.
-// Sets *rp to the character.
-bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
- const StringPiece& whole_class,
- RegexpStatus* status) {
+ if (g == NULL) {
+ status->set_code(kRegexpBadCharRange);
+ status->set_error_arg(name);
+ return kParseError;
+ }
+
+ s->remove_prefix(name.size());
+ AddUGroup(cc, g, g->sign, parse_flags);
+ return kParseOk;
+}
+
+// Parses a character inside a character class.
+// There are fewer special characters here than in the rest of the regexp.
+// Sets *s to span the remainder of the string.
+// Sets *rp to the character.
+bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
+ const StringPiece& whole_class,
+ RegexpStatus* status) {
if (s->empty()) {
- status->set_code(kRegexpMissingBracket);
- status->set_error_arg(whole_class);
- return false;
- }
-
- // Allow regular escape sequences even though
- // many need not be escaped in this context.
+ status->set_code(kRegexpMissingBracket);
+ status->set_error_arg(whole_class);
+ return false;
+ }
+
+ // Allow regular escape sequences even though
+ // many need not be escaped in this context.
if ((*s)[0] == '\\')
- return ParseEscape(s, rp, status, rune_max_);
-
- // Otherwise take the next rune.
- return StringPieceToRune(rp, s, status) >= 0;
-}
-
-// Parses a character class character, or, if the character
-// is followed by a hyphen, parses a character class range.
-// For single characters, rr->lo == rr->hi.
-// Sets *s to span the remainder of the string.
-// Sets *rp to the character.
-bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,
- const StringPiece& whole_class,
- RegexpStatus* status) {
- StringPiece os = *s;
- if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
- return false;
- // [a-] means (a|-), so check for final ].
- if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') {
- s->remove_prefix(1); // '-'
- if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
- return false;
- if (rr->hi < rr->lo) {
- status->set_code(kRegexpBadCharRange);
+ return ParseEscape(s, rp, status, rune_max_);
+
+ // Otherwise take the next rune.
+ return StringPieceToRune(rp, s, status) >= 0;
+}
+
+// Parses a character class character, or, if the character
+// is followed by a hyphen, parses a character class range.
+// For single characters, rr->lo == rr->hi.
+// Sets *s to span the remainder of the string.
+// Sets *rp to the character.
+bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,
+ const StringPiece& whole_class,
+ RegexpStatus* status) {
+ StringPiece os = *s;
+ if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
+ return false;
+ // [a-] means (a|-), so check for final ].
+ if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') {
+ s->remove_prefix(1); // '-'
+ if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
+ return false;
+ if (rr->hi < rr->lo) {
+ status->set_code(kRegexpBadCharRange);
status->set_error_arg(
StringPiece(os.data(), static_cast<size_t>(s->data() - os.data())));
- return false;
- }
- } else {
- rr->hi = rr->lo;
- }
- return true;
-}
-
-// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
-// Sets *s to span the remainder of the string.
-// Sets *out_re to the regexp for the class.
-bool Regexp::ParseState::ParseCharClass(StringPiece* s,
- Regexp** out_re,
- RegexpStatus* status) {
- StringPiece whole_class = *s;
+ return false;
+ }
+ } else {
+ rr->hi = rr->lo;
+ }
+ return true;
+}
+
+// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
+// Sets *s to span the remainder of the string.
+// Sets *out_re to the regexp for the class.
+bool Regexp::ParseState::ParseCharClass(StringPiece* s,
+ Regexp** out_re,
+ RegexpStatus* status) {
+ StringPiece whole_class = *s;
if (s->empty() || (*s)[0] != '[') {
- // Caller checked this.
- status->set_code(kRegexpInternalError);
+ // Caller checked this.
+ status->set_code(kRegexpInternalError);
status->set_error_arg(StringPiece());
- return false;
- }
- bool negated = false;
- Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
- re->ccb_ = new CharClassBuilder;
- s->remove_prefix(1); // '['
+ return false;
+ }
+ bool negated = false;
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ s->remove_prefix(1); // '['
if (!s->empty() && (*s)[0] == '^') {
- s->remove_prefix(1); // '^'
- negated = true;
- if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
- // If NL can't match implicitly, then pretend
- // negated classes include a leading \n.
- re->ccb_->AddRange('\n', '\n');
- }
- }
- bool first = true; // ] is okay as first char in class
+ s->remove_prefix(1); // '^'
+ negated = true;
+ if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
+ // If NL can't match implicitly, then pretend
+ // negated classes include a leading \n.
+ re->ccb_->AddRange('\n', '\n');
+ }
+ }
+ bool first = true; // ] is okay as first char in class
while (!s->empty() && ((*s)[0] != ']' || first)) {
- // - is only okay unescaped as first or last in class.
- // Except that Perl allows - anywhere.
- if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
- (s->size() == 1 || (*s)[1] != ']')) {
- StringPiece t = *s;
- t.remove_prefix(1); // '-'
- Rune r;
- int n = StringPieceToRune(&r, &t, status);
- if (n < 0) {
- re->Decref();
- return false;
- }
- status->set_code(kRegexpBadCharRange);
- status->set_error_arg(StringPiece(s->data(), 1+n));
- re->Decref();
- return false;
- }
- first = false;
-
- // Look for [:alnum:] etc.
- if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
- switch (ParseCCName(s, flags_, re->ccb_, status)) {
- case kParseOk:
- continue;
- case kParseError:
- re->Decref();
- return false;
- case kParseNothing:
- break;
- }
- }
-
- // Look for Unicode character group like \p{Han}
- if (s->size() > 2 &&
- (*s)[0] == '\\' &&
- ((*s)[1] == 'p' || (*s)[1] == 'P')) {
- switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
- case kParseOk:
- continue;
- case kParseError:
- re->Decref();
- return false;
- case kParseNothing:
- break;
- }
- }
-
- // Look for Perl character class symbols (extension).
+ // - is only okay unescaped as first or last in class.
+ // Except that Perl allows - anywhere.
+ if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
+ (s->size() == 1 || (*s)[1] != ']')) {
+ StringPiece t = *s;
+ t.remove_prefix(1); // '-'
+ Rune r;
+ int n = StringPieceToRune(&r, &t, status);
+ if (n < 0) {
+ re->Decref();
+ return false;
+ }
+ status->set_code(kRegexpBadCharRange);
+ status->set_error_arg(StringPiece(s->data(), 1+n));
+ re->Decref();
+ return false;
+ }
+ first = false;
+
+ // Look for [:alnum:] etc.
+ if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
+ switch (ParseCCName(s, flags_, re->ccb_, status)) {
+ case kParseOk:
+ continue;
+ case kParseError:
+ re->Decref();
+ return false;
+ case kParseNothing:
+ break;
+ }
+ }
+
+ // Look for Unicode character group like \p{Han}
+ if (s->size() > 2 &&
+ (*s)[0] == '\\' &&
+ ((*s)[1] == 'p' || (*s)[1] == 'P')) {
+ switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
+ case kParseOk:
+ continue;
+ case kParseError:
+ re->Decref();
+ return false;
+ case kParseNothing:
+ break;
+ }
+ }
+
+ // Look for Perl character class symbols (extension).
const UGroup *g = MaybeParsePerlCCEscape(s, flags_);
- if (g != NULL) {
- AddUGroup(re->ccb_, g, g->sign, flags_);
- continue;
- }
-
- // Otherwise assume single character or simple range.
- RuneRange rr;
- if (!ParseCCRange(s, &rr, whole_class, status)) {
- re->Decref();
- return false;
- }
- // AddRangeFlags is usually called in response to a class like
- // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
- // Regexp::ClassNL is set. In an explicit range or singleton
- // like we just parsed, we do not filter \n out, so set ClassNL
- // in the flags.
- re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
- }
+ if (g != NULL) {
+ AddUGroup(re->ccb_, g, g->sign, flags_);
+ continue;
+ }
+
+ // Otherwise assume single character or simple range.
+ RuneRange rr;
+ if (!ParseCCRange(s, &rr, whole_class, status)) {
+ re->Decref();
+ return false;
+ }
+ // AddRangeFlags is usually called in response to a class like
+ // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
+ // Regexp::ClassNL is set. In an explicit range or singleton
+ // like we just parsed, we do not filter \n out, so set ClassNL
+ // in the flags.
+ re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
+ }
if (s->empty()) {
- status->set_code(kRegexpMissingBracket);
- status->set_error_arg(whole_class);
- re->Decref();
- return false;
- }
- s->remove_prefix(1); // ']'
-
- if (negated)
- re->ccb_->Negate();
-
- *out_re = re;
- return true;
-}
-
+ status->set_code(kRegexpMissingBracket);
+ status->set_error_arg(whole_class);
+ re->Decref();
+ return false;
+ }
+ s->remove_prefix(1); // ']'
+
+ if (negated)
+ re->ccb_->Negate();
+
+ *out_re = re;
+ return true;
+}
+
// Returns whether name is a valid capture name.
-static bool IsValidCaptureName(const StringPiece& name) {
+static bool IsValidCaptureName(const StringPiece& name) {
if (name.empty())
- return false;
+ return false;
// Historically, we effectively used [0-9A-Za-z_]+ to validate; that
// followed Python 2 except for not restricting the first character.
@@ -2043,230 +2043,230 @@ static bool IsValidCaptureName(const StringPiece& name) {
if (StringPieceToRune(&r, &t, NULL) < 0)
return false;
if (cc->Contains(r))
- continue;
- return false;
- }
- return true;
-}
-
-// Parses a Perl flag setting or non-capturing group or both,
-// like (?i) or (?: or (?i:. Removes from s, updates parse state.
-// The caller must check that s begins with "(?".
-// Returns true on success. If the Perl flag is not
-// well-formed or not supported, sets status_ and returns false.
-bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
- StringPiece t = *s;
-
- // Caller is supposed to check this.
- if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
- LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
- status_->set_code(kRegexpInternalError);
- return false;
- }
-
- t.remove_prefix(2); // "(?"
-
- // Check for named captures, first introduced in Python's regexp library.
- // As usual, there are three slightly different syntaxes:
- //
- // (?P<name>expr) the original, introduced by Python
- // (?<name>expr) the .NET alteration, adopted by Perl 5.10
- // (?'name'expr) another .NET alteration, adopted by Perl 5.10
- //
- // Perl 5.10 gave in and implemented the Python version too,
- // but they claim that the last two are the preferred forms.
- // PCRE and languages based on it (specifically, PHP and Ruby)
- // support all three as well. EcmaScript 4 uses only the Python form.
- //
- // In both the open source world (via Code Search) and the
- // Google source tree, (?P<expr>name) is the dominant form,
- // so that's the one we implement. One is enough.
- if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
- // Pull out name.
+ continue;
+ return false;
+ }
+ return true;
+}
+
+// Parses a Perl flag setting or non-capturing group or both,
+// like (?i) or (?: or (?i:. Removes from s, updates parse state.
+// The caller must check that s begins with "(?".
+// Returns true on success. If the Perl flag is not
+// well-formed or not supported, sets status_ and returns false.
+bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
+ StringPiece t = *s;
+
+ // Caller is supposed to check this.
+ if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
+ LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
+ status_->set_code(kRegexpInternalError);
+ return false;
+ }
+
+ t.remove_prefix(2); // "(?"
+
+ // Check for named captures, first introduced in Python's regexp library.
+ // As usual, there are three slightly different syntaxes:
+ //
+ // (?P<name>expr) the original, introduced by Python
+ // (?<name>expr) the .NET alteration, adopted by Perl 5.10
+ // (?'name'expr) another .NET alteration, adopted by Perl 5.10
+ //
+ // Perl 5.10 gave in and implemented the Python version too,
+ // but they claim that the last two are the preferred forms.
+ // PCRE and languages based on it (specifically, PHP and Ruby)
+ // support all three as well. EcmaScript 4 uses only the Python form.
+ //
+ // In both the open source world (via Code Search) and the
+ // Google source tree, (?P<expr>name) is the dominant form,
+ // so that's the one we implement. One is enough.
+ if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
+ // Pull out name.
size_t end = t.find('>', 2);
if (end == StringPiece::npos) {
- if (!IsValidUTF8(*s, status_))
- return false;
- status_->set_code(kRegexpBadNamedCapture);
- status_->set_error_arg(*s);
- return false;
- }
-
- // t is "P<name>...", t[end] == '>'
+ if (!IsValidUTF8(*s, status_))
+ return false;
+ status_->set_code(kRegexpBadNamedCapture);
+ status_->set_error_arg(*s);
+ return false;
+ }
+
+ // t is "P<name>...", t[end] == '>'
StringPiece capture(t.data()-2, end+3); // "(?P<name>"
StringPiece name(t.data()+2, end-2); // "name"
- if (!IsValidUTF8(name, status_))
- return false;
- if (!IsValidCaptureName(name)) {
- status_->set_code(kRegexpBadNamedCapture);
- status_->set_error_arg(capture);
- return false;
- }
-
- if (!DoLeftParen(name)) {
- // DoLeftParen's failure set status_.
- return false;
- }
-
+ if (!IsValidUTF8(name, status_))
+ return false;
+ if (!IsValidCaptureName(name)) {
+ status_->set_code(kRegexpBadNamedCapture);
+ status_->set_error_arg(capture);
+ return false;
+ }
+
+ if (!DoLeftParen(name)) {
+ // DoLeftParen's failure set status_.
+ return false;
+ }
+
s->remove_prefix(
static_cast<size_t>(capture.data() + capture.size() - s->data()));
- return true;
- }
-
- bool negated = false;
- bool sawflags = false;
- int nflags = flags_;
- Rune c;
- for (bool done = false; !done; ) {
+ return true;
+ }
+
+ bool negated = false;
+ bool sawflags = false;
+ int nflags = flags_;
+ Rune c;
+ for (bool done = false; !done; ) {
if (t.empty())
- goto BadPerlOp;
- if (StringPieceToRune(&c, &t, status_) < 0)
- return false;
- switch (c) {
- default:
- goto BadPerlOp;
-
- // Parse flags.
- case 'i':
- sawflags = true;
- if (negated)
- nflags &= ~FoldCase;
- else
- nflags |= FoldCase;
- break;
-
- case 'm': // opposite of our OneLine
- sawflags = true;
- if (negated)
- nflags |= OneLine;
- else
- nflags &= ~OneLine;
- break;
-
- case 's':
- sawflags = true;
- if (negated)
- nflags &= ~DotNL;
- else
- nflags |= DotNL;
- break;
-
- case 'U':
- sawflags = true;
- if (negated)
- nflags &= ~NonGreedy;
- else
- nflags |= NonGreedy;
- break;
-
- // Negation
- case '-':
- if (negated)
- goto BadPerlOp;
- negated = true;
- sawflags = false;
- break;
-
- // Open new group.
- case ':':
- if (!DoLeftParenNoCapture()) {
- // DoLeftParenNoCapture's failure set status_.
- return false;
- }
- done = true;
- break;
-
- // Finish flags.
- case ')':
- done = true;
- break;
- }
- }
-
- if (negated && !sawflags)
- goto BadPerlOp;
-
- flags_ = static_cast<Regexp::ParseFlags>(nflags);
- *s = t;
- return true;
-
-BadPerlOp:
- status_->set_code(kRegexpBadPerlOp);
+ goto BadPerlOp;
+ if (StringPieceToRune(&c, &t, status_) < 0)
+ return false;
+ switch (c) {
+ default:
+ goto BadPerlOp;
+
+ // Parse flags.
+ case 'i':
+ sawflags = true;
+ if (negated)
+ nflags &= ~FoldCase;
+ else
+ nflags |= FoldCase;
+ break;
+
+ case 'm': // opposite of our OneLine
+ sawflags = true;
+ if (negated)
+ nflags |= OneLine;
+ else
+ nflags &= ~OneLine;
+ break;
+
+ case 's':
+ sawflags = true;
+ if (negated)
+ nflags &= ~DotNL;
+ else
+ nflags |= DotNL;
+ break;
+
+ case 'U':
+ sawflags = true;
+ if (negated)
+ nflags &= ~NonGreedy;
+ else
+ nflags |= NonGreedy;
+ break;
+
+ // Negation
+ case '-':
+ if (negated)
+ goto BadPerlOp;
+ negated = true;
+ sawflags = false;
+ break;
+
+ // Open new group.
+ case ':':
+ if (!DoLeftParenNoCapture()) {
+ // DoLeftParenNoCapture's failure set status_.
+ return false;
+ }
+ done = true;
+ break;
+
+ // Finish flags.
+ case ')':
+ done = true;
+ break;
+ }
+ }
+
+ if (negated && !sawflags)
+ goto BadPerlOp;
+
+ flags_ = static_cast<Regexp::ParseFlags>(nflags);
+ *s = t;
+ return true;
+
+BadPerlOp:
+ status_->set_code(kRegexpBadPerlOp);
status_->set_error_arg(
StringPiece(s->data(), static_cast<size_t>(t.data() - s->data())));
- return false;
-}
-
-// Converts latin1 (assumed to be encoded as Latin1 bytes)
-// into UTF8 encoding in string.
-// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
-// deprecated and because it rejects code points 0x80-0x9F.
+ return false;
+}
+
+// Converts latin1 (assumed to be encoded as Latin1 bytes)
+// into UTF8 encoding in string.
+// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
+// deprecated and because it rejects code points 0x80-0x9F.
void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) {
- char buf[UTFmax];
-
- utf->clear();
+ char buf[UTFmax];
+
+ utf->clear();
for (size_t i = 0; i < latin1.size(); i++) {
- Rune r = latin1[i] & 0xFF;
- int n = runetochar(buf, &r);
- utf->append(buf, n);
- }
-}
-
-// Parses the regular expression given by s,
-// returning the corresponding Regexp tree.
-// The caller must Decref the return value when done with it.
-// Returns NULL on error.
-Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
- RegexpStatus* status) {
- // Make status non-NULL (easier on everyone else).
- RegexpStatus xstatus;
- if (status == NULL)
- status = &xstatus;
-
- ParseState ps(global_flags, s, status);
- StringPiece t = s;
-
- // Convert regexp to UTF-8 (easier on the rest of the parser).
- if (global_flags & Latin1) {
+ Rune r = latin1[i] & 0xFF;
+ int n = runetochar(buf, &r);
+ utf->append(buf, n);
+ }
+}
+
+// Parses the regular expression given by s,
+// returning the corresponding Regexp tree.
+// The caller must Decref the return value when done with it.
+// Returns NULL on error.
+Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
+ RegexpStatus* status) {
+ // Make status non-NULL (easier on everyone else).
+ RegexpStatus xstatus;
+ if (status == NULL)
+ status = &xstatus;
+
+ ParseState ps(global_flags, s, status);
+ StringPiece t = s;
+
+ // Convert regexp to UTF-8 (easier on the rest of the parser).
+ if (global_flags & Latin1) {
std::string* tmp = new std::string;
- ConvertLatin1ToUTF8(t, tmp);
- status->set_tmp(tmp);
- t = *tmp;
- }
-
- if (global_flags & Literal) {
- // Special parse loop for literal string.
+ ConvertLatin1ToUTF8(t, tmp);
+ status->set_tmp(tmp);
+ t = *tmp;
+ }
+
+ if (global_flags & Literal) {
+ // Special parse loop for literal string.
while (!t.empty()) {
- Rune r;
- if (StringPieceToRune(&r, &t, status) < 0)
- return NULL;
- if (!ps.PushLiteral(r))
- return NULL;
- }
- return ps.DoFinish();
- }
-
+ Rune r;
+ if (StringPieceToRune(&r, &t, status) < 0)
+ return NULL;
+ if (!ps.PushLiteral(r))
+ return NULL;
+ }
+ return ps.DoFinish();
+ }
+
StringPiece lastunary = StringPiece();
while (!t.empty()) {
StringPiece isunary = StringPiece();
- switch (t[0]) {
- default: {
- Rune r;
- if (StringPieceToRune(&r, &t, status) < 0)
- return NULL;
- if (!ps.PushLiteral(r))
- return NULL;
- break;
- }
-
- case '(':
- // "(?" introduces Perl escape.
- if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {
- // Flag changes and non-capturing groups.
- if (!ps.ParsePerlFlags(&t))
- return NULL;
- break;
- }
+ switch (t[0]) {
+ default: {
+ Rune r;
+ if (StringPieceToRune(&r, &t, status) < 0)
+ return NULL;
+ if (!ps.PushLiteral(r))
+ return NULL;
+ break;
+ }
+
+ case '(':
+ // "(?" introduces Perl escape.
+ if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {
+ // Flag changes and non-capturing groups.
+ if (!ps.ParsePerlFlags(&t))
+ return NULL;
+ break;
+ }
if (ps.flags() & NeverCapture) {
if (!ps.DoLeftParenNoCapture())
return NULL;
@@ -2274,210 +2274,210 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
if (!ps.DoLeftParen(StringPiece()))
return NULL;
}
- t.remove_prefix(1); // '('
- break;
-
- case '|':
- if (!ps.DoVerticalBar())
- return NULL;
- t.remove_prefix(1); // '|'
- break;
-
- case ')':
- if (!ps.DoRightParen())
- return NULL;
- t.remove_prefix(1); // ')'
- break;
-
- case '^': // Beginning of line.
+ t.remove_prefix(1); // '('
+ break;
+
+ case '|':
+ if (!ps.DoVerticalBar())
+ return NULL;
+ t.remove_prefix(1); // '|'
+ break;
+
+ case ')':
+ if (!ps.DoRightParen())
+ return NULL;
+ t.remove_prefix(1); // ')'
+ break;
+
+ case '^': // Beginning of line.
if (!ps.PushCaret())
- return NULL;
- t.remove_prefix(1); // '^'
- break;
-
- case '$': // End of line.
- if (!ps.PushDollar())
- return NULL;
- t.remove_prefix(1); // '$'
- break;
-
- case '.': // Any character (possibly except newline).
- if (!ps.PushDot())
- return NULL;
- t.remove_prefix(1); // '.'
- break;
-
- case '[': { // Character class.
- Regexp* re;
- if (!ps.ParseCharClass(&t, &re, status))
- return NULL;
- if (!ps.PushRegexp(re))
- return NULL;
- break;
- }
-
- case '*': { // Zero or more.
- RegexpOp op;
- op = kRegexpStar;
- goto Rep;
- case '+': // One or more.
- op = kRegexpPlus;
- goto Rep;
- case '?': // Zero or one.
- op = kRegexpQuest;
- goto Rep;
- Rep:
- StringPiece opstr = t;
- bool nongreedy = false;
- t.remove_prefix(1); // '*' or '+' or '?'
- if (ps.flags() & PerlX) {
+ return NULL;
+ t.remove_prefix(1); // '^'
+ break;
+
+ case '$': // End of line.
+ if (!ps.PushDollar())
+ return NULL;
+ t.remove_prefix(1); // '$'
+ break;
+
+ case '.': // Any character (possibly except newline).
+ if (!ps.PushDot())
+ return NULL;
+ t.remove_prefix(1); // '.'
+ break;
+
+ case '[': { // Character class.
+ Regexp* re;
+ if (!ps.ParseCharClass(&t, &re, status))
+ return NULL;
+ if (!ps.PushRegexp(re))
+ return NULL;
+ break;
+ }
+
+ case '*': { // Zero or more.
+ RegexpOp op;
+ op = kRegexpStar;
+ goto Rep;
+ case '+': // One or more.
+ op = kRegexpPlus;
+ goto Rep;
+ case '?': // Zero or one.
+ op = kRegexpQuest;
+ goto Rep;
+ Rep:
+ StringPiece opstr = t;
+ bool nongreedy = false;
+ t.remove_prefix(1); // '*' or '+' or '?'
+ if (ps.flags() & PerlX) {
if (!t.empty() && t[0] == '?') {
- nongreedy = true;
- t.remove_prefix(1); // '?'
- }
+ nongreedy = true;
+ t.remove_prefix(1); // '?'
+ }
if (!lastunary.empty()) {
- // In Perl it is not allowed to stack repetition operators:
- // a** is a syntax error, not a double-star.
- // (and a++ means something else entirely, which we don't support!)
- status->set_code(kRegexpRepeatOp);
+ // In Perl it is not allowed to stack repetition operators:
+ // a** is a syntax error, not a double-star.
+ // (and a++ means something else entirely, which we don't support!)
+ status->set_code(kRegexpRepeatOp);
status->set_error_arg(StringPiece(
lastunary.data(),
static_cast<size_t>(t.data() - lastunary.data())));
- return NULL;
- }
- }
+ return NULL;
+ }
+ }
opstr = StringPiece(opstr.data(),
static_cast<size_t>(t.data() - opstr.data()));
- if (!ps.PushRepeatOp(op, opstr, nongreedy))
- return NULL;
- isunary = opstr;
- break;
- }
-
- case '{': { // Counted repetition.
- int lo, hi;
- StringPiece opstr = t;
- if (!MaybeParseRepetition(&t, &lo, &hi)) {
- // Treat like a literal.
- if (!ps.PushLiteral('{'))
- return NULL;
- t.remove_prefix(1); // '{'
- break;
- }
- bool nongreedy = false;
- if (ps.flags() & PerlX) {
+ if (!ps.PushRepeatOp(op, opstr, nongreedy))
+ return NULL;
+ isunary = opstr;
+ break;
+ }
+
+ case '{': { // Counted repetition.
+ int lo, hi;
+ StringPiece opstr = t;
+ if (!MaybeParseRepetition(&t, &lo, &hi)) {
+ // Treat like a literal.
+ if (!ps.PushLiteral('{'))
+ return NULL;
+ t.remove_prefix(1); // '{'
+ break;
+ }
+ bool nongreedy = false;
+ if (ps.flags() & PerlX) {
if (!t.empty() && t[0] == '?') {
- nongreedy = true;
- t.remove_prefix(1); // '?'
- }
+ nongreedy = true;
+ t.remove_prefix(1); // '?'
+ }
if (!lastunary.empty()) {
- // Not allowed to stack repetition operators.
- status->set_code(kRegexpRepeatOp);
+ // Not allowed to stack repetition operators.
+ status->set_code(kRegexpRepeatOp);
status->set_error_arg(StringPiece(
lastunary.data(),
static_cast<size_t>(t.data() - lastunary.data())));
- return NULL;
- }
- }
+ return NULL;
+ }
+ }
opstr = StringPiece(opstr.data(),
static_cast<size_t>(t.data() - opstr.data()));
- if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
- return NULL;
- isunary = opstr;
- break;
- }
-
- case '\\': { // Escaped character or Perl sequence.
- // \b and \B: word boundary or not
- if ((ps.flags() & Regexp::PerlB) &&
- t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) {
- if (!ps.PushWordBoundary(t[1] == 'b'))
- return NULL;
- t.remove_prefix(2); // '\\', 'b'
- break;
- }
-
- if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {
- if (t[1] == 'A') {
- if (!ps.PushSimpleOp(kRegexpBeginText))
- return NULL;
- t.remove_prefix(2); // '\\', 'A'
- break;
- }
- if (t[1] == 'z') {
- if (!ps.PushSimpleOp(kRegexpEndText))
- return NULL;
- t.remove_prefix(2); // '\\', 'z'
- break;
- }
- // Do not recognize \Z, because this library can't
- // implement the exact Perl/PCRE semantics.
- // (This library treats "(?-m)$" as \z, even though
- // in Perl and PCRE it is equivalent to \Z.)
-
- if (t[1] == 'C') { // \C: any byte [sic]
- if (!ps.PushSimpleOp(kRegexpAnyByte))
- return NULL;
- t.remove_prefix(2); // '\\', 'C'
- break;
- }
-
- if (t[1] == 'Q') { // \Q ... \E: the ... is always literals
- t.remove_prefix(2); // '\\', 'Q'
+ if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
+ return NULL;
+ isunary = opstr;
+ break;
+ }
+
+ case '\\': { // Escaped character or Perl sequence.
+ // \b and \B: word boundary or not
+ if ((ps.flags() & Regexp::PerlB) &&
+ t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) {
+ if (!ps.PushWordBoundary(t[1] == 'b'))
+ return NULL;
+ t.remove_prefix(2); // '\\', 'b'
+ break;
+ }
+
+ if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {
+ if (t[1] == 'A') {
+ if (!ps.PushSimpleOp(kRegexpBeginText))
+ return NULL;
+ t.remove_prefix(2); // '\\', 'A'
+ break;
+ }
+ if (t[1] == 'z') {
+ if (!ps.PushSimpleOp(kRegexpEndText))
+ return NULL;
+ t.remove_prefix(2); // '\\', 'z'
+ break;
+ }
+ // Do not recognize \Z, because this library can't
+ // implement the exact Perl/PCRE semantics.
+ // (This library treats "(?-m)$" as \z, even though
+ // in Perl and PCRE it is equivalent to \Z.)
+
+ if (t[1] == 'C') { // \C: any byte [sic]
+ if (!ps.PushSimpleOp(kRegexpAnyByte))
+ return NULL;
+ t.remove_prefix(2); // '\\', 'C'
+ break;
+ }
+
+ if (t[1] == 'Q') { // \Q ... \E: the ... is always literals
+ t.remove_prefix(2); // '\\', 'Q'
while (!t.empty()) {
- if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
- t.remove_prefix(2); // '\\', 'E'
- break;
- }
- Rune r;
- if (StringPieceToRune(&r, &t, status) < 0)
- return NULL;
- if (!ps.PushLiteral(r))
- return NULL;
- }
- break;
- }
- }
-
- if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) {
- Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
- re->ccb_ = new CharClassBuilder;
- switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
- case kParseOk:
- if (!ps.PushRegexp(re))
- return NULL;
- goto Break2;
- case kParseError:
- re->Decref();
- return NULL;
- case kParseNothing:
- re->Decref();
- break;
- }
- }
-
+ if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
+ t.remove_prefix(2); // '\\', 'E'
+ break;
+ }
+ Rune r;
+ if (StringPieceToRune(&r, &t, status) < 0)
+ return NULL;
+ if (!ps.PushLiteral(r))
+ return NULL;
+ }
+ break;
+ }
+ }
+
+ if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) {
+ Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
+ case kParseOk:
+ if (!ps.PushRegexp(re))
+ return NULL;
+ goto Break2;
+ case kParseError:
+ re->Decref();
+ return NULL;
+ case kParseNothing:
+ re->Decref();
+ break;
+ }
+ }
+
const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
- if (g != NULL) {
- Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
- re->ccb_ = new CharClassBuilder;
- AddUGroup(re->ccb_, g, g->sign, ps.flags());
- if (!ps.PushRegexp(re))
- return NULL;
- break;
- }
-
- Rune r;
- if (!ParseEscape(&t, &r, status, ps.rune_max()))
- return NULL;
- if (!ps.PushLiteral(r))
- return NULL;
- break;
- }
- }
- Break2:
- lastunary = isunary;
- }
- return ps.DoFinish();
-}
-
-} // namespace re2
+ if (g != NULL) {
+ Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ AddUGroup(re->ccb_, g, g->sign, ps.flags());
+ if (!ps.PushRegexp(re))
+ return NULL;
+ break;
+ }
+
+ Rune r;
+ if (!ParseEscape(&t, &r, status, ps.rune_max()))
+ return NULL;
+ if (!ps.PushLiteral(r))
+ return NULL;
+ break;
+ }
+ }
+ Break2:
+ lastunary = isunary;
+ }
+ return ps.DoFinish();
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/perl_groups.cc b/contrib/libs/re2/re2/perl_groups.cc
index c8f4dbde5e..4687444581 100644
--- a/contrib/libs/re2/re2/perl_groups.cc
+++ b/contrib/libs/re2/re2/perl_groups.cc
@@ -1,24 +1,24 @@
-// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
-// make_perl_groups.pl >perl_groups.cc
-
-#include "re2/unicode_groups.h"
-
-namespace re2 {
-
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
static const URange16 code1[] = { /* \d */
- { 0x30, 0x39 },
-};
+ { 0x30, 0x39 },
+};
static const URange16 code2[] = { /* \s */
- { 0x9, 0xa },
- { 0xc, 0xd },
- { 0x20, 0x20 },
-};
+ { 0x9, 0xa },
+ { 0xc, 0xd },
+ { 0x20, 0x20 },
+};
static const URange16 code3[] = { /* \w */
- { 0x30, 0x39 },
- { 0x41, 0x5a },
- { 0x5f, 0x5f },
- { 0x61, 0x7a },
-};
+ { 0x30, 0x39 },
+ { 0x41, 0x5a },
+ { 0x5f, 0x5f },
+ { 0x61, 0x7a },
+};
const UGroup perl_groups[] = {
{ "\\d", +1, code1, 1, 0, 0 },
{ "\\D", -1, code1, 1, 0, 0 },
@@ -26,64 +26,64 @@ const UGroup perl_groups[] = {
{ "\\S", -1, code2, 3, 0, 0 },
{ "\\w", +1, code3, 4, 0, 0 },
{ "\\W", -1, code3, 4, 0, 0 },
-};
+};
const int num_perl_groups = 6;
static const URange16 code4[] = { /* [:alnum:] */
- { 0x30, 0x39 },
- { 0x41, 0x5a },
- { 0x61, 0x7a },
-};
+ { 0x30, 0x39 },
+ { 0x41, 0x5a },
+ { 0x61, 0x7a },
+};
static const URange16 code5[] = { /* [:alpha:] */
- { 0x41, 0x5a },
- { 0x61, 0x7a },
-};
+ { 0x41, 0x5a },
+ { 0x61, 0x7a },
+};
static const URange16 code6[] = { /* [:ascii:] */
- { 0x0, 0x7f },
-};
+ { 0x0, 0x7f },
+};
static const URange16 code7[] = { /* [:blank:] */
- { 0x9, 0x9 },
- { 0x20, 0x20 },
-};
+ { 0x9, 0x9 },
+ { 0x20, 0x20 },
+};
static const URange16 code8[] = { /* [:cntrl:] */
- { 0x0, 0x1f },
- { 0x7f, 0x7f },
-};
+ { 0x0, 0x1f },
+ { 0x7f, 0x7f },
+};
static const URange16 code9[] = { /* [:digit:] */
- { 0x30, 0x39 },
-};
+ { 0x30, 0x39 },
+};
static const URange16 code10[] = { /* [:graph:] */
- { 0x21, 0x7e },
-};
+ { 0x21, 0x7e },
+};
static const URange16 code11[] = { /* [:lower:] */
- { 0x61, 0x7a },
-};
+ { 0x61, 0x7a },
+};
static const URange16 code12[] = { /* [:print:] */
- { 0x20, 0x7e },
-};
+ { 0x20, 0x7e },
+};
static const URange16 code13[] = { /* [:punct:] */
- { 0x21, 0x2f },
- { 0x3a, 0x40 },
- { 0x5b, 0x60 },
- { 0x7b, 0x7e },
-};
+ { 0x21, 0x2f },
+ { 0x3a, 0x40 },
+ { 0x5b, 0x60 },
+ { 0x7b, 0x7e },
+};
static const URange16 code14[] = { /* [:space:] */
- { 0x9, 0xd },
- { 0x20, 0x20 },
-};
+ { 0x9, 0xd },
+ { 0x20, 0x20 },
+};
static const URange16 code15[] = { /* [:upper:] */
- { 0x41, 0x5a },
-};
+ { 0x41, 0x5a },
+};
static const URange16 code16[] = { /* [:word:] */
- { 0x30, 0x39 },
- { 0x41, 0x5a },
- { 0x5f, 0x5f },
- { 0x61, 0x7a },
-};
+ { 0x30, 0x39 },
+ { 0x41, 0x5a },
+ { 0x5f, 0x5f },
+ { 0x61, 0x7a },
+};
static const URange16 code17[] = { /* [:xdigit:] */
- { 0x30, 0x39 },
- { 0x41, 0x46 },
- { 0x61, 0x66 },
-};
+ { 0x30, 0x39 },
+ { 0x41, 0x46 },
+ { 0x61, 0x66 },
+};
const UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3, 0, 0 },
{ "[:^alnum:]", -1, code4, 3, 0, 0 },
@@ -113,7 +113,7 @@ const UGroup posix_groups[] = {
{ "[:^word:]", -1, code16, 4, 0, 0 },
{ "[:xdigit:]", +1, code17, 3, 0, 0 },
{ "[:^xdigit:]", -1, code17, 3, 0, 0 },
-};
+};
const int num_posix_groups = 28;
-
-} // namespace re2
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/prefilter.cc b/contrib/libs/re2/re2/prefilter.cc
index 6a9a670381..a47b3120fb 100644
--- a/contrib/libs/re2/re2/prefilter.cc
+++ b/contrib/libs/re2/re2/prefilter.cc
@@ -1,8 +1,8 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "re2/prefilter.h"
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/prefilter.h"
#include <stddef.h>
#include <stdint.h>
@@ -15,163 +15,163 @@
#include "util/utf.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
-#include "re2/walker-inl.h"
-
-namespace re2 {
-
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
static const bool ExtraDebug = false;
-
+
typedef std::set<std::string>::iterator SSIter;
typedef std::set<std::string>::const_iterator ConstSSIter;
-
-// Initializes a Prefilter, allocating subs_ as necessary.
-Prefilter::Prefilter(Op op) {
- op_ = op;
- subs_ = NULL;
- if (op_ == AND || op_ == OR)
+
+// Initializes a Prefilter, allocating subs_ as necessary.
+Prefilter::Prefilter(Op op) {
+ op_ = op;
+ subs_ = NULL;
+ if (op_ == AND || op_ == OR)
subs_ = new std::vector<Prefilter*>;
-}
-
-// Destroys a Prefilter.
-Prefilter::~Prefilter() {
- if (subs_) {
+}
+
+// Destroys a Prefilter.
+Prefilter::~Prefilter() {
+ if (subs_) {
for (size_t i = 0; i < subs_->size(); i++)
- delete (*subs_)[i];
- delete subs_;
- subs_ = NULL;
- }
-}
-
-// Simplify if the node is an empty Or or And.
-Prefilter* Prefilter::Simplify() {
- if (op_ != AND && op_ != OR) {
- return this;
- }
-
- // Nothing left in the AND/OR.
+ delete (*subs_)[i];
+ delete subs_;
+ subs_ = NULL;
+ }
+}
+
+// Simplify if the node is an empty Or or And.
+Prefilter* Prefilter::Simplify() {
+ if (op_ != AND && op_ != OR) {
+ return this;
+ }
+
+ // Nothing left in the AND/OR.
if (subs_->empty()) {
- if (op_ == AND)
- op_ = ALL; // AND of nothing is true
- else
- op_ = NONE; // OR of nothing is false
-
- return this;
- }
-
- // Just one subnode: throw away wrapper.
- if (subs_->size() == 1) {
- Prefilter* a = (*subs_)[0];
- subs_->clear();
- delete this;
- return a->Simplify();
- }
-
- return this;
-}
-
-// Combines two Prefilters together to create an "op" (AND or OR).
-// The passed Prefilters will be part of the returned Prefilter or deleted.
-// Does lots of work to avoid creating unnecessarily complicated structures.
-Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
- // If a, b can be rewritten as op, do so.
- a = a->Simplify();
- b = b->Simplify();
-
- // Canonicalize: a->op <= b->op.
- if (a->op() > b->op()) {
- Prefilter* t = a;
- a = b;
- b = t;
- }
-
- // Trivial cases.
- // ALL AND b = b
- // NONE OR b = b
- // ALL OR b = ALL
- // NONE AND b = NONE
- // Don't need to look at b, because of canonicalization above.
- // ALL and NONE are smallest opcodes.
- if (a->op() == ALL || a->op() == NONE) {
- if ((a->op() == ALL && op == AND) ||
- (a->op() == NONE && op == OR)) {
- delete a;
- return b;
- } else {
- delete b;
- return a;
- }
- }
-
- // If a and b match op, merge their contents.
- if (a->op() == op && b->op() == op) {
+ if (op_ == AND)
+ op_ = ALL; // AND of nothing is true
+ else
+ op_ = NONE; // OR of nothing is false
+
+ return this;
+ }
+
+ // Just one subnode: throw away wrapper.
+ if (subs_->size() == 1) {
+ Prefilter* a = (*subs_)[0];
+ subs_->clear();
+ delete this;
+ return a->Simplify();
+ }
+
+ return this;
+}
+
+// Combines two Prefilters together to create an "op" (AND or OR).
+// The passed Prefilters will be part of the returned Prefilter or deleted.
+// Does lots of work to avoid creating unnecessarily complicated structures.
+Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
+ // If a, b can be rewritten as op, do so.
+ a = a->Simplify();
+ b = b->Simplify();
+
+ // Canonicalize: a->op <= b->op.
+ if (a->op() > b->op()) {
+ Prefilter* t = a;
+ a = b;
+ b = t;
+ }
+
+ // Trivial cases.
+ // ALL AND b = b
+ // NONE OR b = b
+ // ALL OR b = ALL
+ // NONE AND b = NONE
+ // Don't need to look at b, because of canonicalization above.
+ // ALL and NONE are smallest opcodes.
+ if (a->op() == ALL || a->op() == NONE) {
+ if ((a->op() == ALL && op == AND) ||
+ (a->op() == NONE && op == OR)) {
+ delete a;
+ return b;
+ } else {
+ delete b;
+ return a;
+ }
+ }
+
+ // If a and b match op, merge their contents.
+ if (a->op() == op && b->op() == op) {
for (size_t i = 0; i < b->subs()->size(); i++) {
- Prefilter* bb = (*b->subs())[i];
- a->subs()->push_back(bb);
- }
- b->subs()->clear();
- delete b;
- return a;
- }
-
- // If a already has the same op as the op that is under construction
- // add in b (similarly if b already has the same op, add in a).
- if (b->op() == op) {
- Prefilter* t = a;
- a = b;
- b = t;
- }
- if (a->op() == op) {
- a->subs()->push_back(b);
- return a;
- }
-
- // Otherwise just return the op.
- Prefilter* c = new Prefilter(op);
- c->subs()->push_back(a);
- c->subs()->push_back(b);
- return c;
-}
-
-Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
- return AndOr(AND, a, b);
-}
-
-Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
- return AndOr(OR, a, b);
-}
-
+ Prefilter* bb = (*b->subs())[i];
+ a->subs()->push_back(bb);
+ }
+ b->subs()->clear();
+ delete b;
+ return a;
+ }
+
+ // If a already has the same op as the op that is under construction
+ // add in b (similarly if b already has the same op, add in a).
+ if (b->op() == op) {
+ Prefilter* t = a;
+ a = b;
+ b = t;
+ }
+ if (a->op() == op) {
+ a->subs()->push_back(b);
+ return a;
+ }
+
+ // Otherwise just return the op.
+ Prefilter* c = new Prefilter(op);
+ c->subs()->push_back(a);
+ c->subs()->push_back(b);
+ return c;
+}
+
+Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
+ return AndOr(AND, a, b);
+}
+
+Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
+ return AndOr(OR, a, b);
+}
+
static void SimplifyStringSet(std::set<std::string>* ss) {
- // Now make sure that the strings aren't redundant. For example, if
- // we know "ab" is a required string, then it doesn't help at all to
- // know that "abc" is also a required string, so delete "abc". This
- // is because, when we are performing a string search to filter
+ // Now make sure that the strings aren't redundant. For example, if
+ // we know "ab" is a required string, then it doesn't help at all to
+ // know that "abc" is also a required string, so delete "abc". This
+ // is because, when we are performing a string search to filter
// regexps, matching "ab" will already allow this regexp to be a
// candidate for match, so further matching "abc" is redundant.
// Note that we must ignore "" because find() would find it at the
// start of everything and thus we would end up erasing everything.
- for (SSIter i = ss->begin(); i != ss->end(); ++i) {
+ for (SSIter i = ss->begin(); i != ss->end(); ++i) {
if (i->empty())
continue;
- SSIter j = i;
- ++j;
- while (j != ss->end()) {
+ SSIter j = i;
+ ++j;
+ while (j != ss->end()) {
if (j->find(*i) != std::string::npos) {
j = ss->erase(j);
continue;
}
- ++j;
- }
- }
-}
-
+ ++j;
+ }
+ }
+}
+
Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) {
Prefilter* or_prefilter = new Prefilter(NONE);
- SimplifyStringSet(ss);
+ SimplifyStringSet(ss);
for (SSIter i = ss->begin(); i != ss->end(); ++i)
or_prefilter = Or(or_prefilter, FromString(*i));
- return or_prefilter;
-}
-
+ return or_prefilter;
+}
+
static Rune ToLowerRune(Rune r) {
if (r < Runeself) {
if ('A' <= r && r <= 'Z')
@@ -192,221 +192,221 @@ static Rune ToLowerRuneLatin1(Rune r) {
}
Prefilter* Prefilter::FromString(const std::string& str) {
- Prefilter* m = new Prefilter(Prefilter::ATOM);
+ Prefilter* m = new Prefilter(Prefilter::ATOM);
m->atom_ = str;
- return m;
-}
-
-// Information about a regexp used during computation of Prefilter.
-// Can be thought of as information about the set of strings matching
-// the given regular expression.
-class Prefilter::Info {
- public:
- Info();
- ~Info();
-
- // More constructors. They delete their Info* arguments.
- static Info* Alt(Info* a, Info* b);
- static Info* Concat(Info* a, Info* b);
- static Info* And(Info* a, Info* b);
- static Info* Star(Info* a);
- static Info* Plus(Info* a);
- static Info* Quest(Info* a);
- static Info* EmptyString();
- static Info* NoMatch();
+ return m;
+}
+
+// Information about a regexp used during computation of Prefilter.
+// Can be thought of as information about the set of strings matching
+// the given regular expression.
+class Prefilter::Info {
+ public:
+ Info();
+ ~Info();
+
+ // More constructors. They delete their Info* arguments.
+ static Info* Alt(Info* a, Info* b);
+ static Info* Concat(Info* a, Info* b);
+ static Info* And(Info* a, Info* b);
+ static Info* Star(Info* a);
+ static Info* Plus(Info* a);
+ static Info* Quest(Info* a);
+ static Info* EmptyString();
+ static Info* NoMatch();
static Info* AnyCharOrAnyByte();
static Info* CClass(CharClass* cc, bool latin1);
- static Info* Literal(Rune r);
+ static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
- static Info* AnyMatch();
-
- // Format Info as a string.
+ static Info* AnyMatch();
+
+ // Format Info as a string.
std::string ToString();
-
- // Caller takes ownership of the Prefilter.
- Prefilter* TakeMatch();
-
+
+ // Caller takes ownership of the Prefilter.
+ Prefilter* TakeMatch();
+
std::set<std::string>& exact() { return exact_; }
-
- bool is_exact() const { return is_exact_; }
-
- class Walker;
-
- private:
+
+ bool is_exact() const { return is_exact_; }
+
+ class Walker;
+
+ private:
std::set<std::string> exact_;
-
- // When is_exact_ is true, the strings that match
- // are placed in exact_. When it is no longer an exact
- // set of strings that match this RE, then is_exact_
- // is false and the match_ contains the required match
- // criteria.
- bool is_exact_;
-
- // Accumulated Prefilter query that any
- // match for this regexp is guaranteed to match.
- Prefilter* match_;
-};
-
-
-Prefilter::Info::Info()
- : is_exact_(false),
- match_(NULL) {
-}
-
-Prefilter::Info::~Info() {
- delete match_;
-}
-
-Prefilter* Prefilter::Info::TakeMatch() {
- if (is_exact_) {
- match_ = Prefilter::OrStrings(&exact_);
- is_exact_ = false;
- }
- Prefilter* m = match_;
- match_ = NULL;
- return m;
-}
-
-// Format a Info in string form.
+
+ // When is_exact_ is true, the strings that match
+ // are placed in exact_. When it is no longer an exact
+ // set of strings that match this RE, then is_exact_
+ // is false and the match_ contains the required match
+ // criteria.
+ bool is_exact_;
+
+ // Accumulated Prefilter query that any
+ // match for this regexp is guaranteed to match.
+ Prefilter* match_;
+};
+
+
+Prefilter::Info::Info()
+ : is_exact_(false),
+ match_(NULL) {
+}
+
+Prefilter::Info::~Info() {
+ delete match_;
+}
+
+Prefilter* Prefilter::Info::TakeMatch() {
+ if (is_exact_) {
+ match_ = Prefilter::OrStrings(&exact_);
+ is_exact_ = false;
+ }
+ Prefilter* m = match_;
+ match_ = NULL;
+ return m;
+}
+
+// Format a Info in string form.
std::string Prefilter::Info::ToString() {
- if (is_exact_) {
- int n = 0;
+ if (is_exact_) {
+ int n = 0;
std::string s;
for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
- if (n++ > 0)
- s += ",";
- s += *i;
- }
- return s;
- }
-
- if (match_)
- return match_->DebugString();
-
- return "";
-}
-
-// Add the strings from src to dst.
+ if (n++ > 0)
+ s += ",";
+ s += *i;
+ }
+ return s;
+ }
+
+ if (match_)
+ return match_->DebugString();
+
+ return "";
+}
+
+// Add the strings from src to dst.
static void CopyIn(const std::set<std::string>& src,
std::set<std::string>* dst) {
- for (ConstSSIter i = src.begin(); i != src.end(); ++i)
- dst->insert(*i);
-}
-
-// Add the cross-product of a and b to dst.
-// (For each string i in a and j in b, add i+j.)
+ for (ConstSSIter i = src.begin(); i != src.end(); ++i)
+ dst->insert(*i);
+}
+
+// Add the cross-product of a and b to dst.
+// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const std::set<std::string>& a,
const std::set<std::string>& b,
std::set<std::string>* dst) {
- for (ConstSSIter i = a.begin(); i != a.end(); ++i)
- for (ConstSSIter j = b.begin(); j != b.end(); ++j)
- dst->insert(*i + *j);
-}
-
-// Concats a and b. Requires that both are exact sets.
-// Forms an exact set that is a crossproduct of a and b.
-Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
- if (a == NULL)
- return b;
- DCHECK(a->is_exact_);
- DCHECK(b && b->is_exact_);
- Info *ab = new Info();
-
- CrossProduct(a->exact_, b->exact_, &ab->exact_);
- ab->is_exact_ = true;
-
- delete a;
- delete b;
- return ab;
-}
-
-// Constructs an inexact Info for ab given a and b.
-// Used only when a or b is not exact or when the
-// exact cross product is likely to be too big.
-Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
- if (a == NULL)
- return b;
- if (b == NULL)
- return a;
-
- Info *ab = new Info();
-
- ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
- ab->is_exact_ = false;
- delete a;
- delete b;
- return ab;
-}
-
-// Constructs Info for a|b given a and b.
-Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
- Info *ab = new Info();
-
- if (a->is_exact_ && b->is_exact_) {
- CopyIn(a->exact_, &ab->exact_);
- CopyIn(b->exact_, &ab->exact_);
- ab->is_exact_ = true;
- } else {
- // Either a or b has is_exact_ = false. If the other
- // one has is_exact_ = true, we move it to match_ and
- // then create a OR of a,b. The resulting Info has
- // is_exact_ = false.
- ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
- ab->is_exact_ = false;
- }
-
- delete a;
- delete b;
- return ab;
-}
-
-// Constructs Info for a? given a.
-Prefilter::Info* Prefilter::Info::Quest(Info *a) {
- Info *ab = new Info();
-
- ab->is_exact_ = false;
- ab->match_ = new Prefilter(ALL);
- delete a;
- return ab;
-}
-
-// Constructs Info for a* given a.
-// Same as a? -- not much to do.
-Prefilter::Info* Prefilter::Info::Star(Info *a) {
- return Quest(a);
-}
-
-// Constructs Info for a+ given a. If a was exact set, it isn't
-// anymore.
-Prefilter::Info* Prefilter::Info::Plus(Info *a) {
- Info *ab = new Info();
-
- ab->match_ = a->TakeMatch();
- ab->is_exact_ = false;
-
- delete a;
- return ab;
-}
-
+ for (ConstSSIter i = a.begin(); i != a.end(); ++i)
+ for (ConstSSIter j = b.begin(); j != b.end(); ++j)
+ dst->insert(*i + *j);
+}
+
+// Concats a and b. Requires that both are exact sets.
+// Forms an exact set that is a crossproduct of a and b.
+Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
+ if (a == NULL)
+ return b;
+ DCHECK(a->is_exact_);
+ DCHECK(b && b->is_exact_);
+ Info *ab = new Info();
+
+ CrossProduct(a->exact_, b->exact_, &ab->exact_);
+ ab->is_exact_ = true;
+
+ delete a;
+ delete b;
+ return ab;
+}
+
+// Constructs an inexact Info for ab given a and b.
+// Used only when a or b is not exact or when the
+// exact cross product is likely to be too big.
+Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
+ if (a == NULL)
+ return b;
+ if (b == NULL)
+ return a;
+
+ Info *ab = new Info();
+
+ ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
+ ab->is_exact_ = false;
+ delete a;
+ delete b;
+ return ab;
+}
+
+// Constructs Info for a|b given a and b.
+Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
+ Info *ab = new Info();
+
+ if (a->is_exact_ && b->is_exact_) {
+ CopyIn(a->exact_, &ab->exact_);
+ CopyIn(b->exact_, &ab->exact_);
+ ab->is_exact_ = true;
+ } else {
+ // Either a or b has is_exact_ = false. If the other
+ // one has is_exact_ = true, we move it to match_ and
+ // then create a OR of a,b. The resulting Info has
+ // is_exact_ = false.
+ ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
+ ab->is_exact_ = false;
+ }
+
+ delete a;
+ delete b;
+ return ab;
+}
+
+// Constructs Info for a? given a.
+Prefilter::Info* Prefilter::Info::Quest(Info *a) {
+ Info *ab = new Info();
+
+ ab->is_exact_ = false;
+ ab->match_ = new Prefilter(ALL);
+ delete a;
+ return ab;
+}
+
+// Constructs Info for a* given a.
+// Same as a? -- not much to do.
+Prefilter::Info* Prefilter::Info::Star(Info *a) {
+ return Quest(a);
+}
+
+// Constructs Info for a+ given a. If a was exact set, it isn't
+// anymore.
+Prefilter::Info* Prefilter::Info::Plus(Info *a) {
+ Info *ab = new Info();
+
+ ab->match_ = a->TakeMatch();
+ ab->is_exact_ = false;
+
+ delete a;
+ return ab;
+}
+
static std::string RuneToString(Rune r) {
- char buf[UTFmax];
- int n = runetochar(buf, &r);
+ char buf[UTFmax];
+ int n = runetochar(buf, &r);
return std::string(buf, n);
-}
-
+}
+
static std::string RuneToStringLatin1(Rune r) {
char c = r & 0xff;
return std::string(&c, 1);
}
-// Constructs Info for literal rune.
-Prefilter::Info* Prefilter::Info::Literal(Rune r) {
- Info* info = new Info();
+// Constructs Info for literal rune.
+Prefilter::Info* Prefilter::Info::Literal(Rune r) {
+ Info* info = new Info();
info->exact_.insert(RuneToString(ToLowerRune(r)));
- info->is_exact_ = true;
- return info;
-}
-
+ info->is_exact_ = true;
+ return info;
+}
+
// Constructs Info for literal rune for Latin1 encoded string.
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
Info* info = new Info();
@@ -417,52 +417,52 @@ Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
// Constructs Info for dot (any character) or \C (any byte).
Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
- Prefilter::Info* info = new Prefilter::Info();
- info->match_ = new Prefilter(ALL);
- return info;
-}
-
-// Constructs Prefilter::Info for no possible match.
-Prefilter::Info* Prefilter::Info::NoMatch() {
- Prefilter::Info* info = new Prefilter::Info();
- info->match_ = new Prefilter(NONE);
- return info;
-}
-
-// Constructs Prefilter::Info for any possible match.
-// This Prefilter::Info is valid for any regular expression,
-// since it makes no assertions whatsoever about the
-// strings being matched.
-Prefilter::Info* Prefilter::Info::AnyMatch() {
- Prefilter::Info *info = new Prefilter::Info();
- info->match_ = new Prefilter(ALL);
- return info;
-}
-
-// Constructs Prefilter::Info for just the empty string.
-Prefilter::Info* Prefilter::Info::EmptyString() {
- Prefilter::Info* info = new Prefilter::Info();
- info->is_exact_ = true;
- info->exact_.insert("");
- return info;
-}
-
-// Constructs Prefilter::Info for a character class.
-typedef CharClass::iterator CCIter;
+ Prefilter::Info* info = new Prefilter::Info();
+ info->match_ = new Prefilter(ALL);
+ return info;
+}
+
+// Constructs Prefilter::Info for no possible match.
+Prefilter::Info* Prefilter::Info::NoMatch() {
+ Prefilter::Info* info = new Prefilter::Info();
+ info->match_ = new Prefilter(NONE);
+ return info;
+}
+
+// Constructs Prefilter::Info for any possible match.
+// This Prefilter::Info is valid for any regular expression,
+// since it makes no assertions whatsoever about the
+// strings being matched.
+Prefilter::Info* Prefilter::Info::AnyMatch() {
+ Prefilter::Info *info = new Prefilter::Info();
+ info->match_ = new Prefilter(ALL);
+ return info;
+}
+
+// Constructs Prefilter::Info for just the empty string.
+Prefilter::Info* Prefilter::Info::EmptyString() {
+ Prefilter::Info* info = new Prefilter::Info();
+ info->is_exact_ = true;
+ info->exact_.insert("");
+ return info;
+}
+
+// Constructs Prefilter::Info for a character class.
+typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
bool latin1) {
if (ExtraDebug) {
LOG(ERROR) << "CharClassInfo:";
- for (CCIter i = cc->begin(); i != cc->end(); ++i)
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
LOG(ERROR) << " " << i->lo << "-" << i->hi;
- }
-
- // If the class is too large, it's okay to overestimate.
- if (cc->size() > 10)
+ }
+
+ // If the class is too large, it's okay to overestimate.
+ if (cc->size() > 10)
return AnyCharOrAnyByte();
-
- Prefilter::Info *a = new Prefilter::Info();
- for (CCIter i = cc->begin(); i != cc->end(); ++i)
+
+ Prefilter::Info *a = new Prefilter::Info();
+ for (CCIter i = cc->begin(); i != cc->end(); ++i)
for (Rune r = i->lo; r <= i->hi; r++) {
if (latin1) {
a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
@@ -470,101 +470,101 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
a->exact_.insert(RuneToString(ToLowerRune(r)));
}
}
-
- a->is_exact_ = true;
-
+
+ a->is_exact_ = true;
+
if (ExtraDebug)
LOG(ERROR) << " = " << a->ToString();
-
- return a;
-}
-
-class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
- public:
+
+ return a;
+}
+
+class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
+ public:
Walker(bool latin1) : latin1_(latin1) {}
-
- virtual Info* PostVisit(
- Regexp* re, Info* parent_arg,
- Info* pre_arg,
- Info** child_args, int nchild_args);
-
- virtual Info* ShortVisit(
- Regexp* re,
- Info* parent_arg);
-
+
+ virtual Info* PostVisit(
+ Regexp* re, Info* parent_arg,
+ Info* pre_arg,
+ Info** child_args, int nchild_args);
+
+ virtual Info* ShortVisit(
+ Regexp* re,
+ Info* parent_arg);
+
bool latin1() { return latin1_; }
- private:
+ private:
bool latin1_;
Walker(const Walker&) = delete;
Walker& operator=(const Walker&) = delete;
-};
-
-Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
+};
+
+Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
if (ExtraDebug)
LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
Prefilter::Info::Walker w(latin1);
- Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
-
- if (w.stopped_early()) {
- delete info;
- return NULL;
- }
-
- return info;
-}
-
-Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
- Regexp* re, Prefilter::Info* parent_arg) {
- return AnyMatch();
-}
-
-// Constructs the Prefilter::Info for the given regular expression.
-// Assumes re is simplified.
-Prefilter::Info* Prefilter::Info::Walker::PostVisit(
- Regexp* re, Prefilter::Info* parent_arg,
- Prefilter::Info* pre_arg, Prefilter::Info** child_args,
- int nchild_args) {
- Prefilter::Info *info;
- switch (re->op()) {
- default:
- case kRegexpRepeat:
- LOG(DFATAL) << "Bad regexp op " << re->op();
- info = EmptyString();
- break;
-
- case kRegexpNoMatch:
- info = NoMatch();
- break;
-
- // These ops match the empty string:
- case kRegexpEmptyMatch: // anywhere
- case kRegexpBeginLine: // at beginning of line
- case kRegexpEndLine: // at end of line
- case kRegexpBeginText: // at beginning of text
- case kRegexpEndText: // at end of text
- case kRegexpWordBoundary: // at word boundary
- case kRegexpNoWordBoundary: // not at word boundary
- info = EmptyString();
- break;
-
- case kRegexpLiteral:
+ Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
+
+ if (w.stopped_early()) {
+ delete info;
+ return NULL;
+ }
+
+ return info;
+}
+
+Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
+ Regexp* re, Prefilter::Info* parent_arg) {
+ return AnyMatch();
+}
+
+// Constructs the Prefilter::Info for the given regular expression.
+// Assumes re is simplified.
+Prefilter::Info* Prefilter::Info::Walker::PostVisit(
+ Regexp* re, Prefilter::Info* parent_arg,
+ Prefilter::Info* pre_arg, Prefilter::Info** child_args,
+ int nchild_args) {
+ Prefilter::Info *info;
+ switch (re->op()) {
+ default:
+ case kRegexpRepeat:
+ LOG(DFATAL) << "Bad regexp op " << re->op();
+ info = EmptyString();
+ break;
+
+ case kRegexpNoMatch:
+ info = NoMatch();
+ break;
+
+ // These ops match the empty string:
+ case kRegexpEmptyMatch: // anywhere
+ case kRegexpBeginLine: // at beginning of line
+ case kRegexpEndLine: // at end of line
+ case kRegexpBeginText: // at beginning of text
+ case kRegexpEndText: // at end of text
+ case kRegexpWordBoundary: // at word boundary
+ case kRegexpNoWordBoundary: // not at word boundary
+ info = EmptyString();
+ break;
+
+ case kRegexpLiteral:
if (latin1()) {
info = LiteralLatin1(re->rune());
}
else {
info = Literal(re->rune());
}
- break;
-
- case kRegexpLiteralString:
- if (re->nrunes() == 0) {
- info = NoMatch();
- break;
- }
+ break;
+
+ case kRegexpLiteralString:
+ if (re->nrunes() == 0) {
+ info = NoMatch();
+ break;
+ }
if (latin1()) {
info = LiteralLatin1(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
@@ -576,136 +576,136 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
info = Concat(info, Literal(re->runes()[i]));
}
}
- break;
-
- case kRegexpConcat: {
- // Accumulate in info.
- // Exact is concat of recent contiguous exact nodes.
- info = NULL;
- Info* exact = NULL;
- for (int i = 0; i < nchild_args; i++) {
- Info* ci = child_args[i]; // child info
- if (!ci->is_exact() ||
- (exact && ci->exact().size() * exact->exact().size() > 16)) {
- // Exact run is over.
- info = And(info, exact);
- exact = NULL;
- // Add this child's info.
- info = And(info, ci);
- } else {
- // Append to exact run.
- exact = Concat(exact, ci);
- }
- }
- info = And(info, exact);
- }
- break;
-
- case kRegexpAlternate:
- info = child_args[0];
- for (int i = 1; i < nchild_args; i++)
- info = Alt(info, child_args[i]);
- break;
-
- case kRegexpStar:
- info = Star(child_args[0]);
- break;
-
- case kRegexpQuest:
- info = Quest(child_args[0]);
- break;
-
- case kRegexpPlus:
- info = Plus(child_args[0]);
- break;
-
- case kRegexpAnyChar:
+ break;
+
+ case kRegexpConcat: {
+ // Accumulate in info.
+ // Exact is concat of recent contiguous exact nodes.
+ info = NULL;
+ Info* exact = NULL;
+ for (int i = 0; i < nchild_args; i++) {
+ Info* ci = child_args[i]; // child info
+ if (!ci->is_exact() ||
+ (exact && ci->exact().size() * exact->exact().size() > 16)) {
+ // Exact run is over.
+ info = And(info, exact);
+ exact = NULL;
+ // Add this child's info.
+ info = And(info, ci);
+ } else {
+ // Append to exact run.
+ exact = Concat(exact, ci);
+ }
+ }
+ info = And(info, exact);
+ }
+ break;
+
+ case kRegexpAlternate:
+ info = child_args[0];
+ for (int i = 1; i < nchild_args; i++)
+ info = Alt(info, child_args[i]);
+ break;
+
+ case kRegexpStar:
+ info = Star(child_args[0]);
+ break;
+
+ case kRegexpQuest:
+ info = Quest(child_args[0]);
+ break;
+
+ case kRegexpPlus:
+ info = Plus(child_args[0]);
+ break;
+
+ case kRegexpAnyChar:
case kRegexpAnyByte:
- // Claim nothing, except that it's not empty.
+ // Claim nothing, except that it's not empty.
info = AnyCharOrAnyByte();
- break;
-
- case kRegexpCharClass:
+ break;
+
+ case kRegexpCharClass:
info = CClass(re->cc(), latin1());
- break;
-
- case kRegexpCapture:
- // These don't affect the set of matching strings.
- info = child_args[0];
- break;
- }
-
+ break;
+
+ case kRegexpCapture:
+ // These don't affect the set of matching strings.
+ info = child_args[0];
+ break;
+ }
+
if (ExtraDebug)
LOG(ERROR) << "BuildInfo " << re->ToString()
<< ": " << (info ? info->ToString() : "");
-
- return info;
-}
-
-
-Prefilter* Prefilter::FromRegexp(Regexp* re) {
- if (re == NULL)
- return NULL;
-
- Regexp* simple = re->Simplify();
+
+ return info;
+}
+
+
+Prefilter* Prefilter::FromRegexp(Regexp* re) {
+ if (re == NULL)
+ return NULL;
+
+ Regexp* simple = re->Simplify();
if (simple == NULL)
return NULL;
-
+
Prefilter::Info* info = BuildInfo(simple);
- simple->Decref();
- if (info == NULL)
- return NULL;
-
- Prefilter* m = info->TakeMatch();
- delete info;
- return m;
-}
-
+ simple->Decref();
+ if (info == NULL)
+ return NULL;
+
+ Prefilter* m = info->TakeMatch();
+ delete info;
+ return m;
+}
+
std::string Prefilter::DebugString() const {
- switch (op_) {
- default:
- LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
- return StringPrintf("op%d", op_);
- case NONE:
- return "*no-matches*";
- case ATOM:
- return atom_;
- case ALL:
- return "";
- case AND: {
+ switch (op_) {
+ default:
+ LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
+ return StringPrintf("op%d", op_);
+ case NONE:
+ return "*no-matches*";
+ case ATOM:
+ return atom_;
+ case ALL:
+ return "";
+ case AND: {
std::string s = "";
for (size_t i = 0; i < subs_->size(); i++) {
- if (i > 0)
- s += " ";
+ if (i > 0)
+ s += " ";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
- }
- return s;
- }
- case OR: {
+ }
+ return s;
+ }
+ case OR: {
std::string s = "(";
for (size_t i = 0; i < subs_->size(); i++) {
- if (i > 0)
- s += "|";
+ if (i > 0)
+ s += "|";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
- }
- s += ")";
- return s;
- }
- }
-}
-
-Prefilter* Prefilter::FromRE2(const RE2* re2) {
- if (re2 == NULL)
- return NULL;
-
- Regexp* regexp = re2->Regexp();
- if (regexp == NULL)
- return NULL;
-
- return FromRegexp(regexp);
-}
-
-
-} // namespace re2
+ }
+ s += ")";
+ return s;
+ }
+ }
+}
+
+Prefilter* Prefilter::FromRE2(const RE2* re2) {
+ if (re2 == NULL)
+ return NULL;
+
+ Regexp* regexp = re2->Regexp();
+ if (regexp == NULL)
+ return NULL;
+
+ return FromRegexp(regexp);
+}
+
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/prefilter.h b/contrib/libs/re2/re2/prefilter.h
index 1ce0b63c76..4fedeb4a7c 100644
--- a/contrib/libs/re2/re2/prefilter.h
+++ b/contrib/libs/re2/re2/prefilter.h
@@ -1,108 +1,108 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
-// Prefilter is the class used to extract string guards from regexps.
-// Rather than using Prefilter class directly, use FilteredRE2.
-// See filtered_re2.h
-
+// Prefilter is the class used to extract string guards from regexps.
+// Rather than using Prefilter class directly, use FilteredRE2.
+// See filtered_re2.h
+
#include <set>
#include <string>
#include <vector>
-
+
#include "util/util.h"
#include "util/logging.h"
-
-namespace re2 {
-
-class RE2;
-
-class Regexp;
-
-class Prefilter {
- // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
- public:
- enum Op {
- ALL = 0, // Everything matches
- NONE, // Nothing matches
- ATOM, // The string atom() must match
- AND, // All in subs() must match
- OR, // One of subs() must match
- };
-
- explicit Prefilter(Op op);
- ~Prefilter();
-
- Op op() { return op_; }
+
+namespace re2 {
+
+class RE2;
+
+class Regexp;
+
+class Prefilter {
+ // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
+ public:
+ enum Op {
+ ALL = 0, // Everything matches
+ NONE, // Nothing matches
+ ATOM, // The string atom() must match
+ AND, // All in subs() must match
+ OR, // One of subs() must match
+ };
+
+ explicit Prefilter(Op op);
+ ~Prefilter();
+
+ Op op() { return op_; }
const std::string& atom() const { return atom_; }
- void set_unique_id(int id) { unique_id_ = id; }
- int unique_id() const { return unique_id_; }
-
- // The children of the Prefilter node.
+ void set_unique_id(int id) { unique_id_ = id; }
+ int unique_id() const { return unique_id_; }
+
+ // The children of the Prefilter node.
std::vector<Prefilter*>* subs() {
DCHECK(op_ == AND || op_ == OR);
- return subs_;
- }
-
- // Set the children vector. Prefilter takes ownership of subs and
- // subs_ will be deleted when Prefilter is deleted.
+ return subs_;
+ }
+
+ // Set the children vector. Prefilter takes ownership of subs and
+ // subs_ will be deleted when Prefilter is deleted.
void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
-
- // Given a RE2, return a Prefilter. The caller takes ownership of
- // the Prefilter and should deallocate it. Returns NULL if Prefilter
- // cannot be formed.
- static Prefilter* FromRE2(const RE2* re2);
-
- // Returns a readable debug string of the prefilter.
+
+ // Given a RE2, return a Prefilter. The caller takes ownership of
+ // the Prefilter and should deallocate it. Returns NULL if Prefilter
+ // cannot be formed.
+ static Prefilter* FromRE2(const RE2* re2);
+
+ // Returns a readable debug string of the prefilter.
std::string DebugString() const;
-
- private:
- class Info;
-
- // Combines two prefilters together to create an AND. The passed
- // Prefilters will be part of the returned Prefilter or deleted.
- static Prefilter* And(Prefilter* a, Prefilter* b);
-
- // Combines two prefilters together to create an OR. The passed
- // Prefilters will be part of the returned Prefilter or deleted.
- static Prefilter* Or(Prefilter* a, Prefilter* b);
-
- // Generalized And/Or
- static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
-
- static Prefilter* FromRegexp(Regexp* a);
-
+
+ private:
+ class Info;
+
+ // Combines two prefilters together to create an AND. The passed
+ // Prefilters will be part of the returned Prefilter or deleted.
+ static Prefilter* And(Prefilter* a, Prefilter* b);
+
+ // Combines two prefilters together to create an OR. The passed
+ // Prefilters will be part of the returned Prefilter or deleted.
+ static Prefilter* Or(Prefilter* a, Prefilter* b);
+
+ // Generalized And/Or
+ static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
+
+ static Prefilter* FromRegexp(Regexp* a);
+
static Prefilter* FromString(const std::string& str);
-
+
static Prefilter* OrStrings(std::set<std::string>* ss);
-
- static Info* BuildInfo(Regexp* re);
-
- Prefilter* Simplify();
-
- // Kind of Prefilter.
- Op op_;
-
- // Sub-matches for AND or OR Prefilter.
+
+ static Info* BuildInfo(Regexp* re);
+
+ Prefilter* Simplify();
+
+ // Kind of Prefilter.
+ Op op_;
+
+ // Sub-matches for AND or OR Prefilter.
std::vector<Prefilter*>* subs_;
-
- // Actual string to match in leaf node.
+
+ // Actual string to match in leaf node.
std::string atom_;
-
- // If different prefilters have the same string atom, or if they are
- // structurally the same (e.g., OR of same atom strings) they are
- // considered the same unique nodes. This is the id for each unique
- // node. This field is populated with a unique id for every node,
- // and -1 for duplicate nodes.
- int unique_id_;
-
+
+ // If different prefilters have the same string atom, or if they are
+ // structurally the same (e.g., OR of same atom strings) they are
+ // considered the same unique nodes. This is the id for each unique
+ // node. This field is populated with a unique id for every node,
+ // and -1 for duplicate nodes.
+ int unique_id_;
+
Prefilter(const Prefilter&) = delete;
Prefilter& operator=(const Prefilter&) = delete;
-};
-
-} // namespace re2
-
-#endif // RE2_PREFILTER_H_
+};
+
+} // namespace re2
+
+#endif // RE2_PREFILTER_H_
diff --git a/contrib/libs/re2/re2/prefilter_tree.cc b/contrib/libs/re2/re2/prefilter_tree.cc
index 1d24198590..fdf4e083c9 100644
--- a/contrib/libs/re2/re2/prefilter_tree.cc
+++ b/contrib/libs/re2/re2/prefilter_tree.cc
@@ -1,9 +1,9 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#include "re2/prefilter_tree.h"
-
+
#include <stddef.h>
#include <algorithm>
#include <map>
@@ -16,118 +16,118 @@
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
-#include "re2/prefilter.h"
+#include "re2/prefilter.h"
#include "re2/re2.h"
-
-namespace re2 {
-
+
+namespace re2 {
+
static const bool ExtraDebug = false;
-PrefilterTree::PrefilterTree()
+PrefilterTree::PrefilterTree()
: compiled_(false),
min_atom_len_(3) {
-}
-
+}
+
PrefilterTree::PrefilterTree(int min_atom_len)
: compiled_(false),
min_atom_len_(min_atom_len) {
}
-PrefilterTree::~PrefilterTree() {
+PrefilterTree::~PrefilterTree() {
for (size_t i = 0; i < prefilter_vec_.size(); i++)
- delete prefilter_vec_[i];
-
+ delete prefilter_vec_[i];
+
for (size_t i = 0; i < entries_.size(); i++)
- delete entries_[i].parents;
-}
-
+ delete entries_[i].parents;
+}
+
void PrefilterTree::Add(Prefilter* prefilter) {
- if (compiled_) {
+ if (compiled_) {
LOG(DFATAL) << "Add called after Compile.";
- return;
- }
+ return;
+ }
if (prefilter != NULL && !KeepNode(prefilter)) {
delete prefilter;
prefilter = NULL;
- }
-
+ }
+
prefilter_vec_.push_back(prefilter);
-}
-
+}
+
void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
- if (compiled_) {
+ if (compiled_) {
LOG(DFATAL) << "Compile called already.";
- return;
- }
-
+ return;
+ }
+
// Some legacy users of PrefilterTree call Compile() before
// adding any regexps and expect Compile() to have no effect.
- if (prefilter_vec_.empty())
- return;
-
- compiled_ = true;
-
+ if (prefilter_vec_.empty())
+ return;
+
+ compiled_ = true;
+
// TODO(junyer): Use std::unordered_set<Prefilter*> instead?
NodeMap nodes;
AssignUniqueIds(&nodes, atom_vec);
-
- // Identify nodes that are too common among prefilters and are
- // triggering too many parents. Then get rid of them if possible.
- // Note that getting rid of a prefilter node simply means they are
- // no longer necessary for their parent to trigger; that is, we do
- // not miss out on any regexps triggering by getting rid of a
- // prefilter node.
+
+ // Identify nodes that are too common among prefilters and are
+ // triggering too many parents. Then get rid of them if possible.
+ // Note that getting rid of a prefilter node simply means they are
+ // no longer necessary for their parent to trigger; that is, we do
+ // not miss out on any regexps triggering by getting rid of a
+ // prefilter node.
for (size_t i = 0; i < entries_.size(); i++) {
StdIntMap* parents = entries_[i].parents;
- if (parents->size() > 8) {
- // This one triggers too many things. If all the parents are AND
- // nodes and have other things guarding them, then get rid of
- // this trigger. TODO(vsri): Adjust the threshold appropriately,
- // make it a function of total number of nodes?
- bool have_other_guard = true;
+ if (parents->size() > 8) {
+ // This one triggers too many things. If all the parents are AND
+ // nodes and have other things guarding them, then get rid of
+ // this trigger. TODO(vsri): Adjust the threshold appropriately,
+ // make it a function of total number of nodes?
+ bool have_other_guard = true;
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it) {
- have_other_guard = have_other_guard &&
+ have_other_guard = have_other_guard &&
(entries_[it->first].propagate_up_at_count > 1);
}
-
- if (have_other_guard) {
+
+ if (have_other_guard) {
for (StdIntMap::iterator it = parents->begin();
- it != parents->end(); ++it)
+ it != parents->end(); ++it)
entries_[it->first].propagate_up_at_count -= 1;
-
- parents->clear(); // Forget the parents
- }
- }
- }
-
+
+ parents->clear(); // Forget the parents
+ }
+ }
+ }
+
if (ExtraDebug)
PrintDebugInfo(&nodes);
-}
-
+}
+
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
std::string node_string = NodeString(node);
NodeMap::iterator iter = nodes->find(node_string);
if (iter == nodes->end())
- return NULL;
- return (*iter).second;
-}
-
+ return NULL;
+ return (*iter).second;
+}
+
std::string PrefilterTree::NodeString(Prefilter* node) const {
- // Adding the operation disambiguates AND/OR/atom nodes.
+ // Adding the operation disambiguates AND/OR/atom nodes.
std::string s = StringPrintf("%d", node->op()) + ":";
- if (node->op() == Prefilter::ATOM) {
- s += node->atom();
- } else {
+ if (node->op() == Prefilter::ATOM) {
+ s += node->atom();
+ } else {
for (size_t i = 0; i < node->subs()->size(); i++) {
- if (i > 0)
- s += ',';
+ if (i > 0)
+ s += ',';
s += StringPrintf("%d", (*node->subs())[i]->unique_id());
- }
- }
- return s;
-}
-
+ }
+ }
+ return s;
+}
+
bool PrefilterTree::KeepNode(Prefilter* node) const {
if (node == NULL)
return false;
@@ -167,137 +167,137 @@ bool PrefilterTree::KeepNode(Prefilter* node) const {
void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
std::vector<std::string>* atom_vec) {
- atom_vec->clear();
-
- // Build vector of all filter nodes, sorted topologically
- // from top to bottom in v.
+ atom_vec->clear();
+
+ // Build vector of all filter nodes, sorted topologically
+ // from top to bottom in v.
std::vector<Prefilter*> v;
-
- // Add the top level nodes of each regexp prefilter.
+
+ // Add the top level nodes of each regexp prefilter.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
- Prefilter* f = prefilter_vec_[i];
- if (f == NULL)
+ Prefilter* f = prefilter_vec_[i];
+ if (f == NULL)
unfiltered_.push_back(static_cast<int>(i));
-
- // We push NULL also on to v, so that we maintain the
- // mapping of index==regexpid for level=0 prefilter nodes.
- v.push_back(f);
- }
-
- // Now add all the descendant nodes.
+
+ // We push NULL also on to v, so that we maintain the
+ // mapping of index==regexpid for level=0 prefilter nodes.
+ v.push_back(f);
+ }
+
+ // Now add all the descendant nodes.
for (size_t i = 0; i < v.size(); i++) {
- Prefilter* f = v[i];
- if (f == NULL)
- continue;
- if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
+ Prefilter* f = v[i];
+ if (f == NULL)
+ continue;
+ if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const std::vector<Prefilter*>& subs = *f->subs();
for (size_t j = 0; j < subs.size(); j++)
- v.push_back(subs[j]);
- }
- }
-
- // Identify unique nodes.
- int unique_id = 0;
+ v.push_back(subs[j]);
+ }
+ }
+
+ // Identify unique nodes.
+ int unique_id = 0;
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
- Prefilter *node = v[i];
- if (node == NULL)
- continue;
- node->set_unique_id(-1);
+ Prefilter *node = v[i];
+ if (node == NULL)
+ continue;
+ node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(nodes, node);
- if (canonical == NULL) {
- // Any further nodes that have the same node string
- // will find this node as the canonical node.
+ if (canonical == NULL) {
+ // Any further nodes that have the same node string
+ // will find this node as the canonical node.
nodes->emplace(NodeString(node), node);
- if (node->op() == Prefilter::ATOM) {
- atom_vec->push_back(node->atom());
- atom_index_to_id_.push_back(unique_id);
- }
- node->set_unique_id(unique_id++);
- } else {
- node->set_unique_id(canonical->unique_id());
- }
- }
+ if (node->op() == Prefilter::ATOM) {
+ atom_vec->push_back(node->atom());
+ atom_index_to_id_.push_back(unique_id);
+ }
+ node->set_unique_id(unique_id++);
+ } else {
+ node->set_unique_id(canonical->unique_id());
+ }
+ }
entries_.resize(nodes->size());
-
+
// Create parent StdIntMap for the entries.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
- Prefilter* prefilter = v[i];
- if (prefilter == NULL)
- continue;
-
+ Prefilter* prefilter = v[i];
+ if (prefilter == NULL)
+ continue;
+
if (CanonicalNode(nodes, prefilter) != prefilter)
- continue;
-
- Entry* entry = &entries_[prefilter->unique_id()];
+ continue;
+
+ Entry* entry = &entries_[prefilter->unique_id()];
entry->parents = new StdIntMap();
- }
-
- // Fill the entries.
+ }
+
+ // Fill the entries.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
- Prefilter* prefilter = v[i];
- if (prefilter == NULL)
- continue;
-
+ Prefilter* prefilter = v[i];
+ if (prefilter == NULL)
+ continue;
+
if (CanonicalNode(nodes, prefilter) != prefilter)
- continue;
-
- Entry* entry = &entries_[prefilter->unique_id()];
-
- switch (prefilter->op()) {
- default:
- case Prefilter::ALL:
- LOG(DFATAL) << "Unexpected op: " << prefilter->op();
- return;
-
- case Prefilter::ATOM:
- entry->propagate_up_at_count = 1;
- break;
-
- case Prefilter::OR:
- case Prefilter::AND: {
+ continue;
+
+ Entry* entry = &entries_[prefilter->unique_id()];
+
+ switch (prefilter->op()) {
+ default:
+ case Prefilter::ALL:
+ LOG(DFATAL) << "Unexpected op: " << prefilter->op();
+ return;
+
+ case Prefilter::ATOM:
+ entry->propagate_up_at_count = 1;
+ break;
+
+ case Prefilter::OR:
+ case Prefilter::AND: {
std::set<int> uniq_child;
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
- Prefilter* child = (*prefilter->subs())[j];
+ Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(nodes, child);
- if (canonical == NULL) {
- LOG(DFATAL) << "Null canonical node";
- return;
- }
- int child_id = canonical->unique_id();
+ if (canonical == NULL) {
+ LOG(DFATAL) << "Null canonical node";
+ return;
+ }
+ int child_id = canonical->unique_id();
uniq_child.insert(child_id);
- // To the child, we want to add to parent indices.
- Entry* child_entry = &entries_[child_id];
+ // To the child, we want to add to parent indices.
+ Entry* child_entry = &entries_[child_id];
if (child_entry->parents->find(prefilter->unique_id()) ==
child_entry->parents->end()) {
(*child_entry->parents)[prefilter->unique_id()] = 1;
}
- }
+ }
entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
? static_cast<int>(uniq_child.size())
: 1;
-
- break;
- }
- }
- }
-
- // For top level nodes, populate regexp id.
+
+ break;
+ }
+ }
+ }
+
+ // For top level nodes, populate regexp id.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
- if (prefilter_vec_[i] == NULL)
- continue;
+ if (prefilter_vec_[i] == NULL)
+ continue;
int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
- DCHECK_LE(0, id);
- Entry* entry = &entries_[id];
+ DCHECK_LE(0, id);
+ Entry* entry = &entries_[id];
entry->regexps.push_back(static_cast<int>(i));
- }
-}
-
-// Functions for triggering during search.
-void PrefilterTree::RegexpsGivenStrings(
+ }
+}
+
+// Functions for triggering during search.
+void PrefilterTree::RegexpsGivenStrings(
const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const {
- regexps->clear();
- if (!compiled_) {
+ regexps->clear();
+ if (!compiled_) {
// Some legacy users of PrefilterTree call Compile() before
// adding any regexps and expect Compile() to have no effect.
// This kludge is a counterpart to that kludge.
@@ -307,7 +307,7 @@ void PrefilterTree::RegexpsGivenStrings(
LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
for (size_t i = 0; i < prefilter_vec_.size(); i++)
regexps->push_back(static_cast<int>(i));
- } else {
+ } else {
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
std::vector<int> matched_atom_ids;
for (size_t j = 0; j < matched_atoms.size(); j++)
@@ -317,57 +317,57 @@ void PrefilterTree::RegexpsGivenStrings(
it != regexps_map.end();
++it)
regexps->push_back(it->index());
-
+
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
- }
+ }
std::sort(regexps->begin(), regexps->end());
-}
-
+}
+
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
- IntMap* regexps) const {
+ IntMap* regexps) const {
IntMap count(static_cast<int>(entries_.size()));
IntMap work(static_cast<int>(entries_.size()));
for (size_t i = 0; i < atom_ids.size(); i++)
- work.set(atom_ids[i], 1);
- for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
- const Entry& entry = entries_[it->index()];
- // Record regexps triggered.
+ work.set(atom_ids[i], 1);
+ for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
+ const Entry& entry = entries_[it->index()];
+ // Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++)
- regexps->set(entry.regexps[i], 1);
- int c;
- // Pass trigger up to parents.
+ regexps->set(entry.regexps[i], 1);
+ int c;
+ // Pass trigger up to parents.
for (StdIntMap::iterator it = entry.parents->begin();
- it != entry.parents->end();
- ++it) {
+ it != entry.parents->end();
+ ++it) {
int j = it->first;
- const Entry& parent = entries_[j];
- // Delay until all the children have succeeded.
- if (parent.propagate_up_at_count > 1) {
- if (count.has_index(j)) {
- c = count.get_existing(j) + 1;
- count.set_existing(j, c);
- } else {
- c = 1;
- count.set_new(j, c);
- }
- if (c < parent.propagate_up_at_count)
- continue;
- }
- // Trigger the parent.
- work.set(j, 1);
- }
- }
-}
-
-// Debugging help.
-void PrefilterTree::PrintPrefilter(int regexpid) {
+ const Entry& parent = entries_[j];
+ // Delay until all the children have succeeded.
+ if (parent.propagate_up_at_count > 1) {
+ if (count.has_index(j)) {
+ c = count.get_existing(j) + 1;
+ count.set_existing(j, c);
+ } else {
+ c = 1;
+ count.set_new(j, c);
+ }
+ if (c < parent.propagate_up_at_count)
+ continue;
+ }
+ // Trigger the parent.
+ work.set(j, 1);
+ }
+ }
+}
+
+// Debugging help.
+void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
-}
-
+}
+
void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
LOG(ERROR) << "#Unique Nodes: " << entries_.size();
-
+
for (size_t i = 0; i < entries_.size(); i++) {
StdIntMap* parents = entries_[i].parents;
const std::vector<int>& regexps = entries_[i].regexps;
@@ -375,33 +375,33 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
<< " N: " << parents->size() << " R: " << regexps.size();
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
LOG(ERROR) << it->first;
- }
+ }
LOG(ERROR) << "Map:";
for (NodeMap::const_iterator iter = nodes->begin();
iter != nodes->end(); ++iter)
LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
-}
-
+}
+
std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
std::string node_string = "";
- if (node->op() == Prefilter::ATOM) {
- DCHECK(!node->atom().empty());
- node_string += node->atom();
- } else {
- // Adding the operation disambiguates AND and OR nodes.
- node_string += node->op() == Prefilter::AND ? "AND" : "OR";
- node_string += "(";
+ if (node->op() == Prefilter::ATOM) {
+ DCHECK(!node->atom().empty());
+ node_string += node->atom();
+ } else {
+ // Adding the operation disambiguates AND and OR nodes.
+ node_string += node->op() == Prefilter::AND ? "AND" : "OR";
+ node_string += "(";
for (size_t i = 0; i < node->subs()->size(); i++) {
- if (i > 0)
- node_string += ',';
+ if (i > 0)
+ node_string += ',';
node_string += StringPrintf("%d", (*node->subs())[i]->unique_id());
- node_string += ":";
- node_string += DebugNodeString((*node->subs())[i]);
- }
- node_string += ")";
- }
- return node_string;
-}
-
-} // namespace re2
+ node_string += ":";
+ node_string += DebugNodeString((*node->subs())[i]);
+ }
+ node_string += ")";
+ }
+ return node_string;
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/prefilter_tree.h b/contrib/libs/re2/re2/prefilter_tree.h
index 2d30fbd717..5d73074d97 100644
--- a/contrib/libs/re2/re2/prefilter_tree.h
+++ b/contrib/libs/re2/re2/prefilter_tree.h
@@ -1,21 +1,21 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
-// The PrefilterTree class is used to form an AND-OR tree of strings
-// that would trigger each regexp. The 'prefilter' of each regexp is
+// The PrefilterTree class is used to form an AND-OR tree of strings
+// that would trigger each regexp. The 'prefilter' of each regexp is
// added to PrefilterTree, and then PrefilterTree is used to find all
-// the unique strings across the prefilters. During search, by using
-// matches from a string matching engine, PrefilterTree deduces the
-// set of regexps that are to be triggered. The 'string matching
-// engine' itself is outside of this class, and the caller can use any
-// favorite engine. PrefilterTree provides a set of strings (called
-// atoms) that the user of this class should use to do the string
-// matching.
-
+// the unique strings across the prefilters. During search, by using
+// matches from a string matching engine, PrefilterTree deduces the
+// set of regexps that are to be triggered. The 'string matching
+// engine' itself is outside of this class, and the caller can use any
+// favorite engine. PrefilterTree provides a set of strings (called
+// atoms) that the user of this class should use to do the string
+// matching.
+
#include <map>
#include <string>
#include <vector>
@@ -23,117 +23,117 @@
#include "util/util.h"
#include "re2/prefilter.h"
#include "re2/sparse_array.h"
-
-namespace re2 {
-
-class PrefilterTree {
- public:
- PrefilterTree();
+
+namespace re2 {
+
+class PrefilterTree {
+ public:
+ PrefilterTree();
explicit PrefilterTree(int min_atom_len);
- ~PrefilterTree();
-
- // Adds the prefilter for the next regexp. Note that we assume that
- // Add called sequentially for all regexps. All Add calls
- // must precede Compile.
- void Add(Prefilter* prefilter);
-
- // The Compile returns a vector of string in atom_vec.
- // Call this after all the prefilters are added through Add.
- // No calls to Add after Compile are allowed.
- // The caller should use the returned set of strings to do string matching.
- // Each time a string matches, the corresponding index then has to be
- // and passed to RegexpsGivenStrings below.
+ ~PrefilterTree();
+
+ // Adds the prefilter for the next regexp. Note that we assume that
+ // Add called sequentially for all regexps. All Add calls
+ // must precede Compile.
+ void Add(Prefilter* prefilter);
+
+ // The Compile returns a vector of string in atom_vec.
+ // Call this after all the prefilters are added through Add.
+ // No calls to Add after Compile are allowed.
+ // The caller should use the returned set of strings to do string matching.
+ // Each time a string matches, the corresponding index then has to be
+ // and passed to RegexpsGivenStrings below.
void Compile(std::vector<std::string>* atom_vec);
-
- // Given the indices of the atoms that matched, returns the indexes
- // of regexps that should be searched. The matched_atoms should
- // contain all the ids of string atoms that were found to match the
- // content. The caller can use any string match engine to perform
- // this function. This function is thread safe.
+
+ // Given the indices of the atoms that matched, returns the indexes
+ // of regexps that should be searched. The matched_atoms should
+ // contain all the ids of string atoms that were found to match the
+ // content. The caller can use any string match engine to perform
+ // this function. This function is thread safe.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const;
-
- // Print debug prefilter. Also prints unique ids associated with
- // nodes of the prefilter of the regexp.
- void PrintPrefilter(int regexpid);
-
+
+ // Print debug prefilter. Also prints unique ids associated with
+ // nodes of the prefilter of the regexp.
+ void PrintPrefilter(int regexpid);
+
private:
typedef SparseArray<int> IntMap;
typedef std::map<int, int> StdIntMap;
typedef std::map<std::string, Prefilter*> NodeMap;
-
- // Each unique node has a corresponding Entry that helps in
- // passing the matching trigger information along the tree.
- struct Entry {
- public:
- // How many children should match before this node triggers the
- // parent. For an atom and an OR node, this is 1 and for an AND
- // node, it is the number of unique children.
- int propagate_up_at_count;
-
- // When this node is ready to trigger the parent, what are the indices
- // of the parent nodes to trigger. The reason there may be more than
- // one is because of sharing. For example (abc | def) and (xyz | def)
- // are two different nodes, but they share the atom 'def'. So when
- // 'def' matches, it triggers two parents, corresponding to the two
- // different OR nodes.
+
+ // Each unique node has a corresponding Entry that helps in
+ // passing the matching trigger information along the tree.
+ struct Entry {
+ public:
+ // How many children should match before this node triggers the
+ // parent. For an atom and an OR node, this is 1 and for an AND
+ // node, it is the number of unique children.
+ int propagate_up_at_count;
+
+ // When this node is ready to trigger the parent, what are the indices
+ // of the parent nodes to trigger. The reason there may be more than
+ // one is because of sharing. For example (abc | def) and (xyz | def)
+ // are two different nodes, but they share the atom 'def'. So when
+ // 'def' matches, it triggers two parents, corresponding to the two
+ // different OR nodes.
StdIntMap* parents;
-
- // When this node is ready to trigger the parent, what are the
- // regexps that are triggered.
+
+ // When this node is ready to trigger the parent, what are the
+ // regexps that are triggered.
std::vector<int> regexps;
- };
-
+ };
+
// Returns true if the prefilter node should be kept.
bool KeepNode(Prefilter* node) const;
- // This function assigns unique ids to various parts of the
- // prefilter, by looking at if these nodes are already in the
- // PrefilterTree.
+ // This function assigns unique ids to various parts of the
+ // prefilter, by looking at if these nodes are already in the
+ // PrefilterTree.
void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec);
-
- // Given the matching atoms, find the regexps to be triggered.
+
+ // Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const std::vector<int>& atom_ids,
- IntMap* regexps) const;
-
- // Returns the prefilter node that has the same NodeString as this
- // node. For the canonical node, returns node.
+ IntMap* regexps) const;
+
+ // Returns the prefilter node that has the same NodeString as this
+ // node. For the canonical node, returns node.
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
-
- // A string that uniquely identifies the node. Assumes that the
- // children of node has already been assigned unique ids.
+
+ // A string that uniquely identifies the node. Assumes that the
+ // children of node has already been assigned unique ids.
std::string NodeString(Prefilter* node) const;
-
- // Recursively constructs a readable prefilter string.
+
+ // Recursively constructs a readable prefilter string.
std::string DebugNodeString(Prefilter* node) const;
-
- // Used for debugging.
+
+ // Used for debugging.
void PrintDebugInfo(NodeMap* nodes);
-
- // These are all the nodes formed by Compile. Essentially, there is
- // one node for each unique atom and each unique AND/OR node.
+
+ // These are all the nodes formed by Compile. Essentially, there is
+ // one node for each unique atom and each unique AND/OR node.
std::vector<Entry> entries_;
-
- // indices of regexps that always pass through the filter (since we
- // found no required literals in these regexps).
+
+ // indices of regexps that always pass through the filter (since we
+ // found no required literals in these regexps).
std::vector<int> unfiltered_;
-
- // vector of Prefilter for all regexps.
+
+ // vector of Prefilter for all regexps.
std::vector<Prefilter*> prefilter_vec_;
-
- // Atom index in returned strings to entry id mapping.
+
+ // Atom index in returned strings to entry id mapping.
std::vector<int> atom_index_to_id_;
-
- // Has the prefilter tree been compiled.
- bool compiled_;
-
+
+ // Has the prefilter tree been compiled.
+ bool compiled_;
+
// Strings less than this length are not stored as atoms.
const int min_atom_len_;
PrefilterTree(const PrefilterTree&) = delete;
PrefilterTree& operator=(const PrefilterTree&) = delete;
-};
-
+};
+
} // namespace
-
-#endif // RE2_PREFILTER_TREE_H_
+
+#endif // RE2_PREFILTER_TREE_H_
diff --git a/contrib/libs/re2/re2/prog.cc b/contrib/libs/re2/re2/prog.cc
index 3756c67f66..a700d35de3 100644
--- a/contrib/libs/re2/re2/prog.cc
+++ b/contrib/libs/re2/re2/prog.cc
@@ -1,12 +1,12 @@
-// Copyright 2007 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Compiled regular expression representation.
-// Tested by compile_test.cc
-
-#include "re2/prog.h"
-
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Compiled regular expression representation.
+// Tested by compile_test.cc
+
+#include "re2/prog.h"
+
#if defined(__AVX2__)
#include <immintrin.h>
#ifdef _MSC_VER
@@ -25,132 +25,132 @@
#include "re2/bitmap256.h"
#include "re2/stringpiece.h"
-namespace re2 {
-
-// Constructors per Inst opcode
-
+namespace re2 {
+
+// Constructors per Inst opcode
+
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
- DCHECK_EQ(out_opcode_, 0);
- set_out_opcode(out, kInstAlt);
- out1_ = out1;
-}
-
+ DCHECK_EQ(out_opcode_, 0);
+ set_out_opcode(out, kInstAlt);
+ out1_ = out1;
+}
+
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
- DCHECK_EQ(out_opcode_, 0);
- set_out_opcode(out, kInstByteRange);
- lo_ = lo & 0xFF;
- hi_ = hi & 0xFF;
+ DCHECK_EQ(out_opcode_, 0);
+ set_out_opcode(out, kInstByteRange);
+ lo_ = lo & 0xFF;
+ hi_ = hi & 0xFF;
hint_foldcase_ = foldcase&1;
-}
-
+}
+
void Prog::Inst::InitCapture(int cap, uint32_t out) {
- DCHECK_EQ(out_opcode_, 0);
- set_out_opcode(out, kInstCapture);
- cap_ = cap;
-}
-
+ DCHECK_EQ(out_opcode_, 0);
+ set_out_opcode(out, kInstCapture);
+ cap_ = cap;
+}
+
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
- DCHECK_EQ(out_opcode_, 0);
- set_out_opcode(out, kInstEmptyWidth);
- empty_ = empty;
-}
-
+ DCHECK_EQ(out_opcode_, 0);
+ set_out_opcode(out, kInstEmptyWidth);
+ empty_ = empty;
+}
+
void Prog::Inst::InitMatch(int32_t id) {
- DCHECK_EQ(out_opcode_, 0);
- set_opcode(kInstMatch);
- match_id_ = id;
-}
-
+ DCHECK_EQ(out_opcode_, 0);
+ set_opcode(kInstMatch);
+ match_id_ = id;
+}
+
void Prog::Inst::InitNop(uint32_t out) {
- DCHECK_EQ(out_opcode_, 0);
- set_opcode(kInstNop);
-}
-
-void Prog::Inst::InitFail() {
- DCHECK_EQ(out_opcode_, 0);
- set_opcode(kInstFail);
-}
-
+ DCHECK_EQ(out_opcode_, 0);
+ set_opcode(kInstNop);
+}
+
+void Prog::Inst::InitFail() {
+ DCHECK_EQ(out_opcode_, 0);
+ set_opcode(kInstFail);
+}
+
std::string Prog::Inst::Dump() {
- switch (opcode()) {
- default:
- return StringPrintf("opcode %d", static_cast<int>(opcode()));
-
- case kInstAlt:
- return StringPrintf("alt -> %d | %d", out(), out1_);
-
- case kInstAltMatch:
- return StringPrintf("altmatch -> %d | %d", out(), out1_);
-
- case kInstByteRange:
+ switch (opcode()) {
+ default:
+ return StringPrintf("opcode %d", static_cast<int>(opcode()));
+
+ case kInstAlt:
+ return StringPrintf("alt -> %d | %d", out(), out1_);
+
+ case kInstAltMatch:
+ return StringPrintf("altmatch -> %d | %d", out(), out1_);
+
+ case kInstByteRange:
return StringPrintf("byte%s [%02x-%02x] %d -> %d",
foldcase() ? "/i" : "",
lo_, hi_, hint(), out());
-
- case kInstCapture:
- return StringPrintf("capture %d -> %d", cap_, out());
-
- case kInstEmptyWidth:
- return StringPrintf("emptywidth %#x -> %d",
- static_cast<int>(empty_), out());
-
- case kInstMatch:
- return StringPrintf("match! %d", match_id());
-
- case kInstNop:
- return StringPrintf("nop -> %d", out());
-
- case kInstFail:
- return StringPrintf("fail");
- }
-}
-
-Prog::Prog()
- : anchor_start_(false),
- anchor_end_(false),
- reversed_(false),
+
+ case kInstCapture:
+ return StringPrintf("capture %d -> %d", cap_, out());
+
+ case kInstEmptyWidth:
+ return StringPrintf("emptywidth %#x -> %d",
+ static_cast<int>(empty_), out());
+
+ case kInstMatch:
+ return StringPrintf("match! %d", match_id());
+
+ case kInstNop:
+ return StringPrintf("nop -> %d", out());
+
+ case kInstFail:
+ return StringPrintf("fail");
+ }
+}
+
+Prog::Prog()
+ : anchor_start_(false),
+ anchor_end_(false),
+ reversed_(false),
did_flatten_(false),
- did_onepass_(false),
- start_(0),
- start_unanchored_(0),
- size_(0),
- bytemap_range_(0),
+ did_onepass_(false),
+ start_(0),
+ start_unanchored_(0),
+ size_(0),
+ bytemap_range_(0),
prefix_foldcase_(false),
prefix_size_(0),
list_count_(0),
bit_state_text_max_size_(0),
dfa_mem_(0),
- dfa_first_(NULL),
+ dfa_first_(NULL),
dfa_longest_(NULL) {
-}
-
-Prog::~Prog() {
+}
+
+Prog::~Prog() {
DeleteDFA(dfa_longest_);
DeleteDFA(dfa_first_);
if (prefix_foldcase_)
delete[] prefix_dfa_;
-}
-
-typedef SparseSet Workq;
-
-static inline void AddToQueue(Workq* q, int id) {
- if (id != 0)
- q->insert(id);
-}
-
+}
+
+typedef SparseSet Workq;
+
+static inline void AddToQueue(Workq* q, int id) {
+ if (id != 0)
+ q->insert(id);
+}
+
static std::string ProgToString(Prog* prog, Workq* q) {
std::string s;
- for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
- int id = *i;
- Prog::Inst* ip = prog->inst(id);
+ for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
+ int id = *i;
+ Prog::Inst* ip = prog->inst(id);
s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
- AddToQueue(q, ip->out());
- if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
- AddToQueue(q, ip->out1());
- }
- return s;
-}
-
+ AddToQueue(q, ip->out());
+ if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
+ AddToQueue(q, ip->out1());
+ }
+ return s;
+}
+
static std::string FlattenedProgToString(Prog* prog, int start) {
std::string s;
for (int id = start; id < prog->size(); id++) {
@@ -159,28 +159,28 @@ static std::string FlattenedProgToString(Prog* prog, int start) {
s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
else
s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str());
- }
+ }
return s;
}
-
+
std::string Prog::Dump() {
if (did_flatten_)
return FlattenedProgToString(this, start_);
- Workq q(size_);
- AddToQueue(&q, start_);
+ Workq q(size_);
+ AddToQueue(&q, start_);
return ProgToString(this, &q);
-}
-
+}
+
std::string Prog::DumpUnanchored() {
if (did_flatten_)
return FlattenedProgToString(this, start_unanchored_);
- Workq q(size_);
- AddToQueue(&q, start_unanchored_);
- return ProgToString(this, &q);
-}
-
+ Workq q(size_);
+ AddToQueue(&q, start_unanchored_);
+ return ProgToString(this, &q);
+}
+
std::string Prog::DumpByteMap() {
std::string map;
for (int c = 0; c < 256; c++) {
@@ -220,104 +220,104 @@ static bool IsMatch(Prog* prog, Prog::Inst* ip) {
}
}
-// Peep-hole optimizer.
-void Prog::Optimize() {
- Workq q(size_);
-
- // Eliminate nops. Most are taken out during compilation
- // but a few are hard to avoid.
- q.clear();
- AddToQueue(&q, start_);
- for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
- int id = *i;
-
- Inst* ip = inst(id);
- int j = ip->out();
- Inst* jp;
- while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
- j = jp->out();
- }
- ip->set_out(j);
- AddToQueue(&q, ip->out());
-
- if (ip->opcode() == kInstAlt) {
- j = ip->out1();
- while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
- j = jp->out();
- }
- ip->out1_ = j;
- AddToQueue(&q, ip->out1());
- }
- }
-
- // Insert kInstAltMatch instructions
- // Look for
- // ip: Alt -> j | k
- // j: ByteRange [00-FF] -> ip
- // k: Match
- // or the reverse (the above is the greedy one).
- // Rewrite Alt to AltMatch.
- q.clear();
- AddToQueue(&q, start_);
- for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
- int id = *i;
- Inst* ip = inst(id);
- AddToQueue(&q, ip->out());
- if (ip->opcode() == kInstAlt)
- AddToQueue(&q, ip->out1());
-
- if (ip->opcode() == kInstAlt) {
- Inst* j = inst(ip->out());
- Inst* k = inst(ip->out1());
- if (j->opcode() == kInstByteRange && j->out() == id &&
- j->lo() == 0x00 && j->hi() == 0xFF &&
- IsMatch(this, k)) {
- ip->set_opcode(kInstAltMatch);
- continue;
- }
- if (IsMatch(this, j) &&
- k->opcode() == kInstByteRange && k->out() == id &&
- k->lo() == 0x00 && k->hi() == 0xFF) {
- ip->set_opcode(kInstAltMatch);
- }
- }
- }
-}
-
+// Peep-hole optimizer.
+void Prog::Optimize() {
+ Workq q(size_);
+
+ // Eliminate nops. Most are taken out during compilation
+ // but a few are hard to avoid.
+ q.clear();
+ AddToQueue(&q, start_);
+ for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
+ int id = *i;
+
+ Inst* ip = inst(id);
+ int j = ip->out();
+ Inst* jp;
+ while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
+ j = jp->out();
+ }
+ ip->set_out(j);
+ AddToQueue(&q, ip->out());
+
+ if (ip->opcode() == kInstAlt) {
+ j = ip->out1();
+ while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
+ j = jp->out();
+ }
+ ip->out1_ = j;
+ AddToQueue(&q, ip->out1());
+ }
+ }
+
+ // Insert kInstAltMatch instructions
+ // Look for
+ // ip: Alt -> j | k
+ // j: ByteRange [00-FF] -> ip
+ // k: Match
+ // or the reverse (the above is the greedy one).
+ // Rewrite Alt to AltMatch.
+ q.clear();
+ AddToQueue(&q, start_);
+ for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
+ int id = *i;
+ Inst* ip = inst(id);
+ AddToQueue(&q, ip->out());
+ if (ip->opcode() == kInstAlt)
+ AddToQueue(&q, ip->out1());
+
+ if (ip->opcode() == kInstAlt) {
+ Inst* j = inst(ip->out());
+ Inst* k = inst(ip->out1());
+ if (j->opcode() == kInstByteRange && j->out() == id &&
+ j->lo() == 0x00 && j->hi() == 0xFF &&
+ IsMatch(this, k)) {
+ ip->set_opcode(kInstAltMatch);
+ continue;
+ }
+ if (IsMatch(this, j) &&
+ k->opcode() == kInstByteRange && k->out() == id &&
+ k->lo() == 0x00 && k->hi() == 0xFF) {
+ ip->set_opcode(kInstAltMatch);
+ }
+ }
+ }
+}
+
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
- int flags = 0;
-
- // ^ and \A
+ int flags = 0;
+
+ // ^ and \A
if (p == text.data())
- flags |= kEmptyBeginText | kEmptyBeginLine;
- else if (p[-1] == '\n')
- flags |= kEmptyBeginLine;
-
- // $ and \z
+ flags |= kEmptyBeginText | kEmptyBeginLine;
+ else if (p[-1] == '\n')
+ flags |= kEmptyBeginLine;
+
+ // $ and \z
if (p == text.data() + text.size())
- flags |= kEmptyEndText | kEmptyEndLine;
+ flags |= kEmptyEndText | kEmptyEndLine;
else if (p < text.data() + text.size() && p[0] == '\n')
- flags |= kEmptyEndLine;
-
- // \b and \B
+ flags |= kEmptyEndLine;
+
+ // \b and \B
if (p == text.data() && p == text.data() + text.size()) {
- // no word boundary here
+ // no word boundary here
} else if (p == text.data()) {
- if (IsWordChar(p[0]))
- flags |= kEmptyWordBoundary;
+ if (IsWordChar(p[0]))
+ flags |= kEmptyWordBoundary;
} else if (p == text.data() + text.size()) {
- if (IsWordChar(p[-1]))
- flags |= kEmptyWordBoundary;
- } else {
- if (IsWordChar(p[-1]) != IsWordChar(p[0]))
- flags |= kEmptyWordBoundary;
- }
- if (!(flags & kEmptyWordBoundary))
- flags |= kEmptyNonWordBoundary;
-
- return flags;
-}
-
+ if (IsWordChar(p[-1]))
+ flags |= kEmptyWordBoundary;
+ } else {
+ if (IsWordChar(p[-1]) != IsWordChar(p[0]))
+ flags |= kEmptyWordBoundary;
+ }
+ if (!(flags & kEmptyWordBoundary))
+ flags |= kEmptyNonWordBoundary;
+
+ return flags;
+}
+
// ByteMapBuilder implements a coloring algorithm.
//
// The first phase is a series of "mark and merge" batches: we mark one or more
@@ -375,8 +375,8 @@ void ByteMapBuilder::Mark(int lo, int hi) {
return;
ranges_.emplace_back(lo, hi);
-}
-
+}
+
void ByteMapBuilder::Merge() {
for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
it != ranges_.end();
@@ -443,12 +443,12 @@ int ByteMapBuilder::Recolor(int oldcolor) {
return newcolor;
}
-void Prog::ComputeByteMap() {
+void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for the program.
// Ranges of bytes that are treated indistinguishably
// will be mapped to a single byte class.
ByteMapBuilder builder;
-
+
// Don't repeat the work for ^ and $.
bool marked_line_boundaries = false;
// Don't repeat the work for \b and \B.
@@ -507,18 +507,18 @@ void Prog::ComputeByteMap() {
marked_word_boundaries = true;
}
}
- }
-
+ }
+
builder.Build(bytemap_, &bytemap_range_);
if (0) { // For debugging, use trivial bytemap.
LOG(ERROR) << "Using trivial bytemap.";
for (int i = 0; i < 256; i++)
bytemap_[i] = static_cast<uint8_t>(i);
- bytemap_range_ = 256;
- }
-}
-
+ bytemap_range_ = 256;
+ }
+}
+
// Prog::Flatten() implements a graph rewriting algorithm.
//
// The overall process is similar to epsilon removal, but retains some epsilon
@@ -1172,4 +1172,4 @@ const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
}
}
-} // namespace re2
+} // namespace re2
diff --git a/contrib/libs/re2/re2/prog.h b/contrib/libs/re2/re2/prog.h
index 2f35a918b6..4af012ab6f 100644
--- a/contrib/libs/re2/re2/prog.h
+++ b/contrib/libs/re2/re2/prog.h
@@ -1,150 +1,150 @@
-// Copyright 2007 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_PROG_H_
#define RE2_PROG_H_
-// Compiled representation of regular expressions.
-// See regexp.h for the Regexp class, which represents a regular
-// expression symbolically.
-
+// Compiled representation of regular expressions.
+// See regexp.h for the Regexp class, which represents a regular
+// expression symbolically.
+
#include <stdint.h>
#include <functional>
#include <mutex>
#include <string>
#include <vector>
#include <type_traits>
-
+
#include "util/util.h"
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/re2.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"
-
-namespace re2 {
-
-// Opcodes for Inst
-enum InstOp {
- kInstAlt = 0, // choose between out_ and out1_
- kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
- kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
- kInstCapture, // capturing parenthesis number cap_
- kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
- kInstMatch, // found a match!
- kInstNop, // no-op; occasionally unavoidable
- kInstFail, // never match; occasionally unavoidable
+
+namespace re2 {
+
+// Opcodes for Inst
+enum InstOp {
+ kInstAlt = 0, // choose between out_ and out1_
+ kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
+ kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
+ kInstCapture, // capturing parenthesis number cap_
+ kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
+ kInstMatch, // found a match!
+ kInstNop, // no-op; occasionally unavoidable
+ kInstFail, // never match; occasionally unavoidable
kNumInst,
-};
-
-// Bit flags for empty-width specials
-enum EmptyOp {
- kEmptyBeginLine = 1<<0, // ^ - beginning of line
- kEmptyEndLine = 1<<1, // $ - end of line
- kEmptyBeginText = 1<<2, // \A - beginning of text
- kEmptyEndText = 1<<3, // \z - end of text
- kEmptyWordBoundary = 1<<4, // \b - word boundary
- kEmptyNonWordBoundary = 1<<5, // \B - not \b
- kEmptyAllFlags = (1<<6)-1,
-};
-
+};
+
+// Bit flags for empty-width specials
+enum EmptyOp {
+ kEmptyBeginLine = 1<<0, // ^ - beginning of line
+ kEmptyEndLine = 1<<1, // $ - end of line
+ kEmptyBeginText = 1<<2, // \A - beginning of text
+ kEmptyEndText = 1<<3, // \z - end of text
+ kEmptyWordBoundary = 1<<4, // \b - word boundary
+ kEmptyNonWordBoundary = 1<<5, // \B - not \b
+ kEmptyAllFlags = (1<<6)-1,
+};
+
class DFA;
-class Regexp;
-
-// Compiled form of regexp program.
-class Prog {
- public:
- Prog();
- ~Prog();
-
- // Single instruction in regexp program.
- class Inst {
- public:
+class Regexp;
+
+// Compiled form of regexp program.
+class Prog {
+ public:
+ Prog();
+ ~Prog();
+
+ // Single instruction in regexp program.
+ class Inst {
+ public:
// See the assertion below for why this is so.
Inst() = default;
-
+
// Copyable.
Inst(const Inst&) = default;
Inst& operator=(const Inst&) = default;
- // Constructors per opcode
+ // Constructors per opcode
void InitAlt(uint32_t out, uint32_t out1);
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
void InitCapture(int cap, uint32_t out);
void InitEmptyWidth(EmptyOp empty, uint32_t out);
- void InitMatch(int id);
+ void InitMatch(int id);
void InitNop(uint32_t out);
- void InitFail();
-
- // Getters
+ void InitFail();
+
+ // Getters
int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
- InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
+ InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int last() { return (out_opcode_>>3)&1; }
int out() { return out_opcode_>>4; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
- int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
- int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
- int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
+ int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
+ int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
+ int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; }
int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; }
- int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
- EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
+ int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
+ EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
bool greedy(Prog* p) {
- DCHECK_EQ(opcode(), kInstAltMatch);
+ DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange ||
(p->inst(out())->opcode() == kInstNop &&
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
- }
-
- // Does this inst (an kInstByteRange) match c?
- inline bool Matches(int c) {
- DCHECK_EQ(opcode(), kInstByteRange);
+ }
+
+ // Does this inst (an kInstByteRange) match c?
+ inline bool Matches(int c) {
+ DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase() && 'A' <= c && c <= 'Z')
- c += 'a' - 'A';
- return lo_ <= c && c <= hi_;
- }
-
- // Returns string representation for debugging.
+ c += 'a' - 'A';
+ return lo_ <= c && c <= hi_;
+ }
+
+ // Returns string representation for debugging.
std::string Dump();
-
- // Maximum instruction id.
+
+ // Maximum instruction id.
// (Must fit in out_opcode_. PatchList/last steal another bit.)
- static const int kMaxInst = (1<<28) - 1;
-
- private:
- void set_opcode(InstOp opcode) {
+ static const int kMaxInst = (1<<28) - 1;
+
+ private:
+ void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
- }
-
+ }
+
void set_last() {
out_opcode_ = (out()<<4) | (1<<3) | opcode();
}
- void set_out(int out) {
+ void set_out(int out) {
out_opcode_ = (out<<4) | (last()<<3) | opcode();
- }
-
- void set_out_opcode(int out, InstOp opcode) {
+ }
+
+ void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<4) | (last()<<3) | opcode;
- }
-
+ }
+
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
union { // additional instruction arguments:
uint32_t out1_; // opcode == kInstAlt
// alternate next instruction
-
+
int32_t cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
-
+
int32_t match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
-
+
struct { // opcode == kInstByteRange
uint8_t lo_; // byte range is lo_-hi_ inclusive
uint8_t hi_; //
@@ -155,69 +155,69 @@ class Prog {
// means there are no remaining possibilities,
// which is most likely for character classes.
// foldcase: A-Z -> a-z before checking range.
- };
-
+ };
+
EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
- };
-
- friend class Compiler;
- friend struct PatchList;
- friend class Prog;
- };
-
+ };
+
+ friend class Compiler;
+ friend struct PatchList;
+ friend class Prog;
+ };
+
// Inst must be trivial so that we can freely clear it with memset(3).
// Arrays of Inst are initialised by copying the initial elements with
// memmove(3) and then clearing any remaining elements with memset(3).
static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
- // Whether to anchor the search.
- enum Anchor {
- kUnanchored, // match anywhere
- kAnchored, // match only starting at beginning of text
- };
-
- // Kind of match to look for (for anchor != kFullMatch)
- //
- // kLongestMatch mode finds the overall longest
- // match but still makes its submatch choices the way
- // Perl would, not in the way prescribed by POSIX.
- // The POSIX rules are much more expensive to implement,
- // and no one has needed them.
- //
- // kFullMatch is not strictly necessary -- we could use
- // kLongestMatch and then check the length of the match -- but
- // the matching code can run faster if it knows to consider only
- // full matches.
- enum MatchKind {
- kFirstMatch, // like Perl, PCRE
- kLongestMatch, // like egrep or POSIX
- kFullMatch, // match only entire text; implies anchor==kAnchored
- kManyMatch // for SearchDFA, records set of matches
- };
-
- Inst *inst(int id) { return &inst_[id]; }
- int start() { return start_; }
+ // Whether to anchor the search.
+ enum Anchor {
+ kUnanchored, // match anywhere
+ kAnchored, // match only starting at beginning of text
+ };
+
+ // Kind of match to look for (for anchor != kFullMatch)
+ //
+ // kLongestMatch mode finds the overall longest
+ // match but still makes its submatch choices the way
+ // Perl would, not in the way prescribed by POSIX.
+ // The POSIX rules are much more expensive to implement,
+ // and no one has needed them.
+ //
+ // kFullMatch is not strictly necessary -- we could use
+ // kLongestMatch and then check the length of the match -- but
+ // the matching code can run faster if it knows to consider only
+ // full matches.
+ enum MatchKind {
+ kFirstMatch, // like Perl, PCRE
+ kLongestMatch, // like egrep or POSIX
+ kFullMatch, // match only entire text; implies anchor==kAnchored
+ kManyMatch // for SearchDFA, records set of matches
+ };
+
+ Inst *inst(int id) { return &inst_[id]; }
+ int start() { return start_; }
void set_start(int start) { start_ = start; }
- int start_unanchored() { return start_unanchored_; }
- void set_start_unanchored(int start) { start_unanchored_ = start; }
+ int start_unanchored() { return start_unanchored_; }
+ void set_start_unanchored(int start) { start_unanchored_ = start; }
int size() { return size_; }
- bool reversed() { return reversed_; }
- void set_reversed(bool reversed) { reversed_ = reversed; }
+ bool reversed() { return reversed_; }
+ void set_reversed(bool reversed) { reversed_ = reversed; }
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
uint16_t* list_heads() { return list_heads_.data(); }
size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
int64_t dfa_mem() { return dfa_mem_; }
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
- bool anchor_start() { return anchor_start_; }
- void set_anchor_start(bool b) { anchor_start_ = b; }
- bool anchor_end() { return anchor_end_; }
- void set_anchor_end(bool b) { anchor_end_ = b; }
- int bytemap_range() { return bytemap_range_; }
+ bool anchor_start() { return anchor_start_; }
+ void set_anchor_start(bool b) { anchor_start_ = b; }
+ bool anchor_end() { return anchor_end_; }
+ void set_anchor_end(bool b) { anchor_end_ = b; }
+ int bytemap_range() { return bytemap_range_; }
const uint8_t* bytemap() { return bytemap_; }
bool can_prefix_accel() { return prefix_size_ != 0; }
-
+
// Accelerates to the first likely occurrence of the prefix.
// Returns a pointer to the first byte or NULL if not found.
const void* PrefixAccel(const void* data, size_t size) {
@@ -242,58 +242,58 @@ class Prog {
// prefix_back_ to return fewer false positives than memchr(3) alone.
const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
- // Returns string representation of program for debugging.
+ // Returns string representation of program for debugging.
std::string Dump();
std::string DumpUnanchored();
std::string DumpByteMap();
-
- // Returns the set of kEmpty flags that are in effect at
- // position p within context.
+
+ // Returns the set of kEmpty flags that are in effect at
+ // position p within context.
static uint32_t EmptyFlags(const StringPiece& context, const char* p);
-
- // Returns whether byte c is a word character: ASCII only.
- // Used by the implementation of \b and \B.
- // This is not right for Unicode, but:
- // - it's hard to get right in a byte-at-a-time matching world
- // (the DFA has only one-byte lookahead).
- // - even if the lookahead were possible, the Progs would be huge.
- // This crude approximation is the same one PCRE uses.
+
+ // Returns whether byte c is a word character: ASCII only.
+ // Used by the implementation of \b and \B.
+ // This is not right for Unicode, but:
+ // - it's hard to get right in a byte-at-a-time matching world
+ // (the DFA has only one-byte lookahead).
+ // - even if the lookahead were possible, the Progs would be huge.
+ // This crude approximation is the same one PCRE uses.
static bool IsWordChar(uint8_t c) {
- return ('A' <= c && c <= 'Z') ||
- ('a' <= c && c <= 'z') ||
- ('0' <= c && c <= '9') ||
- c == '_';
- }
-
- // Execution engines. They all search for the regexp (run the prog)
- // in text, which is in the larger context (used for ^ $ \b etc).
- // Anchor and kind control the kind of search.
- // Returns true if match found, false if not.
- // If match found, fills match[0..nmatch-1] with submatch info.
- // match[0] is overall match, match[1] is first set of parens, etc.
- // If a particular submatch is not matched during the regexp match,
- // it is set to NULL.
- //
- // Matching text == StringPiece(NULL, 0) is treated as any other empty
- // string, but note that on return, it will not be possible to distinguish
- // submatches that matched that empty string from submatches that didn't
- // match anything. Either way, match[i] == NULL.
-
- // Search using NFA: can find submatches but kind of slow.
- bool SearchNFA(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch);
-
- // Search using DFA: much faster than NFA but only finds
- // end of match and can use a lot more memory.
- // Returns whether a match was found.
- // If the DFA runs out of memory, sets *failed to true and returns false.
- // If matches != NULL and kind == kManyMatch and there is a match,
- // SearchDFA fills matches with the match IDs of the final matching state.
- bool SearchDFA(const StringPiece& text, const StringPiece& context,
+ return ('A' <= c && c <= 'Z') ||
+ ('a' <= c && c <= 'z') ||
+ ('0' <= c && c <= '9') ||
+ c == '_';
+ }
+
+ // Execution engines. They all search for the regexp (run the prog)
+ // in text, which is in the larger context (used for ^ $ \b etc).
+ // Anchor and kind control the kind of search.
+ // Returns true if match found, false if not.
+ // If match found, fills match[0..nmatch-1] with submatch info.
+ // match[0] is overall match, match[1] is first set of parens, etc.
+ // If a particular submatch is not matched during the regexp match,
+ // it is set to NULL.
+ //
+ // Matching text == StringPiece(NULL, 0) is treated as any other empty
+ // string, but note that on return, it will not be possible to distinguish
+ // submatches that matched that empty string from submatches that didn't
+ // match anything. Either way, match[i] == NULL.
+
+ // Search using NFA: can find submatches but kind of slow.
+ bool SearchNFA(const StringPiece& text, const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch);
+
+ // Search using DFA: much faster than NFA but only finds
+ // end of match and can use a lot more memory.
+ // Returns whether a match was found.
+ // If the DFA runs out of memory, sets *failed to true and returns false.
+ // If matches != NULL and kind == kManyMatch and there is a match,
+ // SearchDFA fills matches with the match IDs of the final matching state.
+ bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind, StringPiece* match0,
bool* failed, SparseSet* matches);
-
+
// The callback issued after building each DFA state with BuildEntireDFA().
// If next is null, then the memory budget has been exhausted and building
// will halt. Otherwise, the state has been built and next points to an array
@@ -304,71 +304,71 @@ class Prog {
using DFAStateCallback = std::function<void(const int* next, bool match)>;
// Build the entire DFA for the given match kind.
- // Usually the DFA is built out incrementally, as needed, which
+ // Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work.
// If cb is not empty, it receives one callback per state built.
// Returns the number of states built.
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
-
+
// Compute bytemap.
- void ComputeByteMap();
-
- // Run peep-hole optimizer on program.
- void Optimize();
-
- // One-pass NFA: only correct if IsOnePass() is true,
- // but much faster than NFA (competitive with PCRE)
- // for those expressions.
- bool IsOnePass();
- bool SearchOnePass(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch);
-
- // Bit-state backtracking. Fast on small cases but uses memory
+ void ComputeByteMap();
+
+ // Run peep-hole optimizer on program.
+ void Optimize();
+
+ // One-pass NFA: only correct if IsOnePass() is true,
+ // but much faster than NFA (competitive with PCRE)
+ // for those expressions.
+ bool IsOnePass();
+ bool SearchOnePass(const StringPiece& text, const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch);
+
+ // Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the list count and the text size.
bool CanBitState() { return list_heads_.data() != NULL; }
- bool SearchBitState(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch);
-
- static const int kMaxOnePassCapture = 5; // $0 through $4
-
- // Backtracking search: the gold standard against which the other
- // implementations are checked. FOR TESTING ONLY.
- // It allocates a ton of memory to avoid running forever.
- // It is also recursive, so can't use in production (will overflow stacks).
- // The name "Unsafe" here is supposed to be a flag that
- // you should not be using this function.
- bool UnsafeSearchBacktrack(const StringPiece& text,
- const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch);
-
- // Computes range for any strings matching regexp. The min and max can in
- // some cases be arbitrarily precise, so the caller gets to specify the
- // maximum desired length of string returned.
- //
- // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
- // string s that is an anchored match for this regexp satisfies
- // min <= s && s <= max.
- //
- // Note that PossibleMatchRange() will only consider the first copy of an
- // infinitely repeated element (i.e., any regexp element followed by a '*' or
- // '+' operator). Regexps with "{N}" constructions are not affected, as those
- // do not compile down to infinite repetitions.
- //
- // Returns true on success, false on error.
+ bool SearchBitState(const StringPiece& text, const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch);
+
+ static const int kMaxOnePassCapture = 5; // $0 through $4
+
+ // Backtracking search: the gold standard against which the other
+ // implementations are checked. FOR TESTING ONLY.
+ // It allocates a ton of memory to avoid running forever.
+ // It is also recursive, so can't use in production (will overflow stacks).
+ // The name "Unsafe" here is supposed to be a flag that
+ // you should not be using this function.
+ bool UnsafeSearchBacktrack(const StringPiece& text,
+ const StringPiece& context,
+ Anchor anchor, MatchKind kind,
+ StringPiece* match, int nmatch);
+
+ // Computes range for any strings matching regexp. The min and max can in
+ // some cases be arbitrarily precise, so the caller gets to specify the
+ // maximum desired length of string returned.
+ //
+ // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
+ // string s that is an anchored match for this regexp satisfies
+ // min <= s && s <= max.
+ //
+ // Note that PossibleMatchRange() will only consider the first copy of an
+ // infinitely repeated element (i.e., any regexp element followed by a '*' or
+ // '+' operator). Regexps with "{N}" constructions are not affected, as those
+ // do not compile down to infinite repetitions.
+ //
+ // Returns true on success, false on error.
bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
-
+
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout into the given sparse array.
void Fanout(SparseArray<int>* fanout);
- // Compiles a collection of regexps to Prog. Each regexp will have
+ // Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the output vector.
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
-
+
// Flattens the Prog from "tree" form to "list" form. This is an in-place
// operation in the sense that the old instructions are lost.
void Flatten();
@@ -403,22 +403,22 @@ class Prog {
// FOR TESTING ONLY.
static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
- private:
- friend class Compiler;
-
- DFA* GetDFA(MatchKind kind);
+ private:
+ friend class Compiler;
+
+ DFA* GetDFA(MatchKind kind);
void DeleteDFA(DFA* dfa);
-
- bool anchor_start_; // regexp has explicit start anchor
- bool anchor_end_; // regexp has explicit end anchor
- bool reversed_; // whether program runs backward over input
+
+ bool anchor_start_; // regexp has explicit start anchor
+ bool anchor_end_; // regexp has explicit end anchor
+ bool reversed_; // whether program runs backward over input
bool did_flatten_; // has Flatten been called?
- bool did_onepass_; // has IsOnePass been called?
-
- int start_; // entry point for program
- int start_unanchored_; // unanchored entry point for program
- int size_; // number of instructions
- int bytemap_range_; // bytemap_[x] < bytemap_range_
+ bool did_onepass_; // has IsOnePass been called?
+
+ int start_; // entry point for program
+ int start_unanchored_; // unanchored entry point for program
+ int size_; // number of instructions
+ int bytemap_range_; // bytemap_[x] < bytemap_range_
bool prefix_foldcase_; // whether prefix is case-insensitive
size_t prefix_size_; // size of prefix (0 if no prefix)
@@ -429,7 +429,7 @@ class Prog {
int prefix_back_; // last byte of prefix
};
};
-
+
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
@@ -438,20 +438,20 @@ class Prog {
PODArray<Inst> inst_; // pointer to instruction array
PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes
-
+
int64_t dfa_mem_; // Maximum memory for DFAs.
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
-
+
uint8_t bytemap_[256]; // map from input bytes to byte classes
-
+
std::once_flag dfa_first_once_;
std::once_flag dfa_longest_once_;
-
+
Prog(const Prog&) = delete;
Prog& operator=(const Prog&) = delete;
-};
-
+};
+
// std::string_view in MSVC has iterators that aren't just pointers and
// that don't allow comparisons between different objects - not even if
// those objects are views into the same string! Thus, we provide these
@@ -463,6 +463,6 @@ static inline const char* EndPtr(const StringPiece& s) {
return s.data() + s.size();
}
-} // namespace re2
-
+} // namespace re2
+
#endif // RE2_PROG_H_
diff --git a/contrib/libs/re2/re2/re2.cc b/contrib/libs/re2/re2/re2.cc
index c32090b4fc..47fb385e4e 100644
--- a/contrib/libs/re2/re2/re2.cc
+++ b/contrib/libs/re2/re2/re2.cc
@@ -1,14 +1,14 @@
-// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Regular expression interface RE2.
-//
-// Originally the PCRE C++ wrapper, but adapted to use
-// the new automata-based regular expression engines.
-
+// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression interface RE2.
+//
+// Originally the PCRE C++ wrapper, but adapted to use
+// the new automata-based regular expression engines.
+
#include "re2/re2.h"
-
+
#include <assert.h>
#include <ctype.h>
#include <errno.h>
@@ -22,7 +22,7 @@
#include <atomic>
#include <iterator>
#include <mutex>
-#include <string>
+#include <string>
#include <utility>
#include <vector>
@@ -30,18 +30,18 @@
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
-#include "re2/prog.h"
-#include "re2/regexp.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
#include "re2/sparse_array.h"
-
-namespace re2 {
-
-// Maximum number of args we can set
-static const int kMaxArgs = 16;
-static const int kVecSize = 1+kMaxArgs;
-
+
+namespace re2 {
+
+// Maximum number of args we can set
+static const int kMaxArgs = 16;
+static const int kVecSize = 1+kMaxArgs;
+
const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
-
+
RE2::Options::Options(RE2::CannedOptions opt)
: encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
posix_syntax_(opt == RE2::POSIX),
@@ -57,120 +57,120 @@ RE2::Options::Options(RE2::CannedOptions opt)
word_boundary_(false),
one_line_(false) {
}
-
+
// static empty objects for use as const references.
// To avoid global constructors, allocated in RE2::Init().
static const std::string* empty_string;
static const std::map<std::string, int>* empty_named_groups;
static const std::map<int, std::string>* empty_group_names;
-
-// Converts from Regexp error code to RE2 error code.
-// Maybe some day they will diverge. In any event, this
-// hides the existence of Regexp from RE2 users.
+
+// Converts from Regexp error code to RE2 error code.
+// Maybe some day they will diverge. In any event, this
+// hides the existence of Regexp from RE2 users.
static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
- switch (code) {
+ switch (code) {
case re2::kRegexpSuccess:
- return RE2::NoError;
+ return RE2::NoError;
case re2::kRegexpInternalError:
- return RE2::ErrorInternal;
+ return RE2::ErrorInternal;
case re2::kRegexpBadEscape:
- return RE2::ErrorBadEscape;
+ return RE2::ErrorBadEscape;
case re2::kRegexpBadCharClass:
- return RE2::ErrorBadCharClass;
+ return RE2::ErrorBadCharClass;
case re2::kRegexpBadCharRange:
- return RE2::ErrorBadCharRange;
+ return RE2::ErrorBadCharRange;
case re2::kRegexpMissingBracket:
- return RE2::ErrorMissingBracket;
+ return RE2::ErrorMissingBracket;
case re2::kRegexpMissingParen:
- return RE2::ErrorMissingParen;
+ return RE2::ErrorMissingParen;
case re2::kRegexpUnexpectedParen:
return RE2::ErrorUnexpectedParen;
case re2::kRegexpTrailingBackslash:
- return RE2::ErrorTrailingBackslash;
+ return RE2::ErrorTrailingBackslash;
case re2::kRegexpRepeatArgument:
- return RE2::ErrorRepeatArgument;
+ return RE2::ErrorRepeatArgument;
case re2::kRegexpRepeatSize:
- return RE2::ErrorRepeatSize;
+ return RE2::ErrorRepeatSize;
case re2::kRegexpRepeatOp:
- return RE2::ErrorRepeatOp;
+ return RE2::ErrorRepeatOp;
case re2::kRegexpBadPerlOp:
- return RE2::ErrorBadPerlOp;
+ return RE2::ErrorBadPerlOp;
case re2::kRegexpBadUTF8:
- return RE2::ErrorBadUTF8;
+ return RE2::ErrorBadUTF8;
case re2::kRegexpBadNamedCapture:
- return RE2::ErrorBadNamedCapture;
- }
- return RE2::ErrorInternal;
-}
-
+ return RE2::ErrorBadNamedCapture;
+ }
+ return RE2::ErrorInternal;
+}
+
static std::string trunc(const StringPiece& pattern) {
- if (pattern.size() < 100)
+ if (pattern.size() < 100)
return std::string(pattern);
return std::string(pattern.substr(0, 100)) + "...";
-}
-
-
-RE2::RE2(const char* pattern) {
- Init(pattern, DefaultOptions);
-}
-
+}
+
+
+RE2::RE2(const char* pattern) {
+ Init(pattern, DefaultOptions);
+}
+
RE2::RE2(const std::string& pattern) {
- Init(pattern, DefaultOptions);
-}
-
-RE2::RE2(const StringPiece& pattern) {
- Init(pattern, DefaultOptions);
-}
-
-RE2::RE2(const StringPiece& pattern, const Options& options) {
- Init(pattern, options);
-}
-
-int RE2::Options::ParseFlags() const {
- int flags = Regexp::ClassNL;
- switch (encoding()) {
- default:
+ Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const StringPiece& pattern) {
+ Init(pattern, DefaultOptions);
+}
+
+RE2::RE2(const StringPiece& pattern, const Options& options) {
+ Init(pattern, options);
+}
+
+int RE2::Options::ParseFlags() const {
+ int flags = Regexp::ClassNL;
+ switch (encoding()) {
+ default:
if (log_errors())
LOG(ERROR) << "Unknown encoding " << encoding();
- break;
- case RE2::Options::EncodingUTF8:
- break;
- case RE2::Options::EncodingLatin1:
- flags |= Regexp::Latin1;
- break;
- }
-
- if (!posix_syntax())
- flags |= Regexp::LikePerl;
-
- if (literal())
- flags |= Regexp::Literal;
-
- if (never_nl())
- flags |= Regexp::NeverNL;
-
+ break;
+ case RE2::Options::EncodingUTF8:
+ break;
+ case RE2::Options::EncodingLatin1:
+ flags |= Regexp::Latin1;
+ break;
+ }
+
+ if (!posix_syntax())
+ flags |= Regexp::LikePerl;
+
+ if (literal())
+ flags |= Regexp::Literal;
+
+ if (never_nl())
+ flags |= Regexp::NeverNL;
+
if (dot_nl())
flags |= Regexp::DotNL;
if (never_capture())
flags |= Regexp::NeverCapture;
- if (!case_sensitive())
- flags |= Regexp::FoldCase;
-
- if (perl_classes())
- flags |= Regexp::PerlClasses;
-
- if (word_boundary())
- flags |= Regexp::PerlB;
-
- if (one_line())
- flags |= Regexp::OneLine;
-
- return flags;
-}
-
-void RE2::Init(const StringPiece& pattern, const Options& options) {
+ if (!case_sensitive())
+ flags |= Regexp::FoldCase;
+
+ if (perl_classes())
+ flags |= Regexp::PerlClasses;
+
+ if (word_boundary())
+ flags |= Regexp::PerlB;
+
+ if (one_line())
+ flags |= Regexp::OneLine;
+
+ return flags;
+}
+
+void RE2::Init(const StringPiece& pattern, const Options& options) {
static std::once_flag empty_once;
std::call_once(empty_once, []() {
empty_string = new std::string;
@@ -179,70 +179,70 @@ void RE2::Init(const StringPiece& pattern, const Options& options) {
});
pattern_.assign(pattern.data(), pattern.size());
- options_.Copy(options);
+ options_.Copy(options);
entire_regexp_ = NULL;
error_ = empty_string;
error_code_ = NoError;
error_arg_.clear();
prefix_.clear();
prefix_foldcase_ = false;
- suffix_regexp_ = NULL;
- prog_ = NULL;
+ suffix_regexp_ = NULL;
+ prog_ = NULL;
num_captures_ = -1;
is_one_pass_ = false;
- rprog_ = NULL;
- named_groups_ = NULL;
- group_names_ = NULL;
-
- RegexpStatus status;
- entire_regexp_ = Regexp::Parse(
- pattern_,
- static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
- &status);
- if (entire_regexp_ == NULL) {
- if (options_.log_errors()) {
- LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
- << status.Text();
- }
+ rprog_ = NULL;
+ named_groups_ = NULL;
+ group_names_ = NULL;
+
+ RegexpStatus status;
+ entire_regexp_ = Regexp::Parse(
+ pattern_,
+ static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
+ &status);
+ if (entire_regexp_ == NULL) {
+ if (options_.log_errors()) {
+ LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
+ << status.Text();
+ }
error_ = new std::string(status.Text());
- error_code_ = RegexpErrorToRE2(status.code());
+ error_code_ = RegexpErrorToRE2(status.code());
error_arg_ = std::string(status.error_arg());
- return;
- }
-
+ return;
+ }
+
re2::Regexp* suffix;
- if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
- suffix_regexp_ = suffix;
- else
- suffix_regexp_ = entire_regexp_->Incref();
-
- // Two thirds of the memory goes to the forward Prog,
- // one third to the reverse prog, because the forward
- // Prog has two DFAs but the reverse prog has one.
- prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
- if (prog_ == NULL) {
- if (options_.log_errors())
- LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
+ if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
+ suffix_regexp_ = suffix;
+ else
+ suffix_regexp_ = entire_regexp_->Incref();
+
+ // Two thirds of the memory goes to the forward Prog,
+ // one third to the reverse prog, because the forward
+ // Prog has two DFAs but the reverse prog has one.
+ prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
+ if (prog_ == NULL) {
+ if (options_.log_errors())
+ LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
error_ = new std::string("pattern too large - compile failed");
- error_code_ = RE2::ErrorPatternTooLarge;
- return;
- }
-
+ error_code_ = RE2::ErrorPatternTooLarge;
+ return;
+ }
+
// We used to compute this lazily, but it's used during the
// typical control flow for a match call, so we now compute
// it eagerly, which avoids the overhead of std::once_flag.
num_captures_ = suffix_regexp_->NumCaptures();
- // Could delay this until the first match call that
- // cares about submatch information, but the one-pass
- // machine's memory gets cut from the DFA memory budget,
- // and that is harder to do if the DFA has already
- // been built.
- is_one_pass_ = prog_->IsOnePass();
-}
-
-// Returns rprog_, computing it if needed.
+ // Could delay this until the first match call that
+ // cares about submatch information, but the one-pass
+ // machine's memory gets cut from the DFA memory budget,
+ // and that is harder to do if the DFA has already
+ // been built.
+ is_one_pass_ = prog_->IsOnePass();
+}
+
+// Returns rprog_, computing it if needed.
re2::Prog* RE2::ReverseProg() const {
std::call_once(rprog_once_, [](const RE2* re) {
re->rprog_ =
@@ -255,32 +255,32 @@ re2::Prog* RE2::ReverseProg() const {
// is fine. More importantly, an RE2 object is supposed to be logically
// immutable: whatever ok() would have returned after Init() completed,
// it should continue to return that no matter what ReverseProg() does.
- }
+ }
}, this);
- return rprog_;
-}
-
-RE2::~RE2() {
- if (suffix_regexp_)
- suffix_regexp_->Decref();
- if (entire_regexp_)
- entire_regexp_->Decref();
- delete prog_;
- delete rprog_;
+ return rprog_;
+}
+
+RE2::~RE2() {
+ if (suffix_regexp_)
+ suffix_regexp_->Decref();
+ if (entire_regexp_)
+ entire_regexp_->Decref();
+ delete prog_;
+ delete rprog_;
if (error_ != empty_string)
- delete error_;
+ delete error_;
if (named_groups_ != NULL && named_groups_ != empty_named_groups)
- delete named_groups_;
+ delete named_groups_;
if (group_names_ != NULL && group_names_ != empty_group_names)
- delete group_names_;
-}
-
-int RE2::ProgramSize() const {
- if (prog_ == NULL)
- return -1;
- return prog_->size();
-}
-
+ delete group_names_;
+}
+
+int RE2::ProgramSize() const {
+ if (prog_ == NULL)
+ return -1;
+ return prog_->size();
+}
+
int RE2::ReverseProgramSize() const {
if (prog_ == NULL)
return -1;
@@ -346,7 +346,7 @@ int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
return Fanout(prog, histogram);
}
-// Returns named_groups_, computing it if needed.
+// Returns named_groups_, computing it if needed.
const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
std::call_once(named_groups_once_, [](const RE2* re) {
if (re->suffix_regexp_ != NULL)
@@ -354,10 +354,10 @@ const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
if (re->named_groups_ == NULL)
re->named_groups_ = empty_named_groups;
}, this);
- return *named_groups_;
-}
-
-// Returns group_names_, computing it if needed.
+ return *named_groups_;
+}
+
+// Returns group_names_, computing it if needed.
const std::map<int, std::string>& RE2::CapturingGroupNames() const {
std::call_once(group_names_once_, [](const RE2* re) {
if (re->suffix_regexp_ != NULL)
@@ -365,94 +365,94 @@ const std::map<int, std::string>& RE2::CapturingGroupNames() const {
if (re->group_names_ == NULL)
re->group_names_ = empty_group_names;
}, this);
- return *group_names_;
-}
-
-/***** Convenience interfaces *****/
-
-bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
- const Arg* const args[], int n) {
- return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
-}
-
-bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
- const Arg* const args[], int n) {
- return re.DoMatch(text, UNANCHORED, NULL, args, n);
-}
-
-bool RE2::ConsumeN(StringPiece* input, const RE2& re,
- const Arg* const args[], int n) {
+ return *group_names_;
+}
+
+/***** Convenience interfaces *****/
+
+bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
+ const Arg* const args[], int n) {
+ return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
+}
+
+bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
+ const Arg* const args[], int n) {
+ return re.DoMatch(text, UNANCHORED, NULL, args, n);
+}
+
+bool RE2::ConsumeN(StringPiece* input, const RE2& re,
+ const Arg* const args[], int n) {
size_t consumed;
- if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
- input->remove_prefix(consumed);
- return true;
- } else {
- return false;
- }
-}
-
-bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
- const Arg* const args[], int n) {
+ if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
+ input->remove_prefix(consumed);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
+ const Arg* const args[], int n) {
size_t consumed;
- if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
- input->remove_prefix(consumed);
- return true;
- } else {
- return false;
- }
-}
-
+ if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
+ input->remove_prefix(consumed);
+ return true;
+ } else {
+ return false;
+ }
+}
+
bool RE2::Replace(std::string* str,
const RE2& re,
const StringPiece& rewrite) {
- StringPiece vec[kVecSize];
- int nvec = 1 + MaxSubmatch(rewrite);
+ StringPiece vec[kVecSize];
+ int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
- return false;
+ return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
- return false;
-
+ return false;
+
std::string s;
- if (!re.Rewrite(&s, rewrite, vec, nvec))
- return false;
-
+ if (!re.Rewrite(&s, rewrite, vec, nvec))
+ return false;
+
assert(vec[0].data() >= str->data());
assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
- str->replace(vec[0].data() - str->data(), vec[0].size(), s);
- return true;
-}
-
+ str->replace(vec[0].data() - str->data(), vec[0].size(), s);
+ return true;
+}
+
int RE2::GlobalReplace(std::string* str,
const RE2& re,
const StringPiece& rewrite) {
- StringPiece vec[kVecSize];
- int nvec = 1 + MaxSubmatch(rewrite);
+ StringPiece vec[kVecSize];
+ int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
- return false;
+ return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
-
- const char* p = str->data();
- const char* ep = p + str->size();
- const char* lastend = NULL;
+
+ const char* p = str->data();
+ const char* ep = p + str->size();
+ const char* lastend = NULL;
std::string out;
- int count = 0;
+ int count = 0;
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
// Iterate just once when fuzzing. Otherwise, we easily get bogged down
// and coverage is unlikely to improve despite significant expense.
while (p == str->data()) {
#else
- while (p <= ep) {
+ while (p <= ep) {
#endif
if (!re.Match(*str, static_cast<size_t>(p - str->data()),
str->size(), UNANCHORED, vec, nvec))
- break;
+ break;
if (p < vec[0].data())
out.append(p, vec[0].data() - p);
if (vec[0].data() == lastend && vec[0].empty()) {
- // Disallow empty match at end of last match: skip ahead.
+ // Disallow empty match at end of last match: skip ahead.
//
// fullrune() takes int, not ptrdiff_t. However, it just looks
// at the leading byte and treats any length >= 4 the same.
@@ -476,155 +476,155 @@ int RE2::GlobalReplace(std::string* str,
}
// Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
// we fell through from above and the GIGO principle applies.
- if (p < ep)
- out.append(p, 1);
- p++;
- continue;
- }
- re.Rewrite(&out, rewrite, vec, nvec);
+ if (p < ep)
+ out.append(p, 1);
+ p++;
+ continue;
+ }
+ re.Rewrite(&out, rewrite, vec, nvec);
p = vec[0].data() + vec[0].size();
- lastend = p;
- count++;
- }
-
- if (count == 0)
- return 0;
-
- if (p < ep)
- out.append(p, ep - p);
+ lastend = p;
+ count++;
+ }
+
+ if (count == 0)
+ return 0;
+
+ if (p < ep)
+ out.append(p, ep - p);
using std::swap;
- swap(out, *str);
- return count;
-}
-
+ swap(out, *str);
+ return count;
+}
+
bool RE2::Extract(const StringPiece& text,
const RE2& re,
const StringPiece& rewrite,
std::string* out) {
- StringPiece vec[kVecSize];
- int nvec = 1 + MaxSubmatch(rewrite);
+ StringPiece vec[kVecSize];
+ int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
- return false;
+ return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
- return false;
-
- out->clear();
- return re.Rewrite(out, rewrite, vec, nvec);
-}
-
+ return false;
+
+ out->clear();
+ return re.Rewrite(out, rewrite, vec, nvec);
+}
+
std::string RE2::QuoteMeta(const StringPiece& unquoted) {
std::string result;
- result.reserve(unquoted.size() << 1);
-
- // Escape any ascii character not in [A-Za-z_0-9].
- //
- // Note that it's legal to escape a character even if it has no
- // special meaning in a regular expression -- so this function does
- // that. (This also makes it identical to the perl function of the
- // same name except for the null-character special case;
- // see `perldoc -f quotemeta`.)
+ result.reserve(unquoted.size() << 1);
+
+ // Escape any ascii character not in [A-Za-z_0-9].
+ //
+ // Note that it's legal to escape a character even if it has no
+ // special meaning in a regular expression -- so this function does
+ // that. (This also makes it identical to the perl function of the
+ // same name except for the null-character special case;
+ // see `perldoc -f quotemeta`.)
for (size_t ii = 0; ii < unquoted.size(); ++ii) {
- // Note that using 'isalnum' here raises the benchmark time from
- // 32ns to 58ns:
- if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
- (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
- (unquoted[ii] < '0' || unquoted[ii] > '9') &&
- unquoted[ii] != '_' &&
- // If this is the part of a UTF8 or Latin1 character, we need
- // to copy this byte without escaping. Experimentally this is
- // what works correctly with the regexp library.
- !(unquoted[ii] & 128)) {
- if (unquoted[ii] == '\0') { // Special handling for null chars.
- // Note that this special handling is not strictly required for RE2,
- // but this quoting is required for other regexp libraries such as
- // PCRE.
- // Can't use "\\0" since the next character might be a digit.
- result += "\\x00";
- continue;
- }
- result += '\\';
- }
- result += unquoted[ii];
- }
-
- return result;
-}
-
+ // Note that using 'isalnum' here raises the benchmark time from
+ // 32ns to 58ns:
+ if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
+ (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
+ (unquoted[ii] < '0' || unquoted[ii] > '9') &&
+ unquoted[ii] != '_' &&
+ // If this is the part of a UTF8 or Latin1 character, we need
+ // to copy this byte without escaping. Experimentally this is
+ // what works correctly with the regexp library.
+ !(unquoted[ii] & 128)) {
+ if (unquoted[ii] == '\0') { // Special handling for null chars.
+ // Note that this special handling is not strictly required for RE2,
+ // but this quoting is required for other regexp libraries such as
+ // PCRE.
+ // Can't use "\\0" since the next character might be a digit.
+ result += "\\x00";
+ continue;
+ }
+ result += '\\';
+ }
+ result += unquoted[ii];
+ }
+
+ return result;
+}
+
bool RE2::PossibleMatchRange(std::string* min, std::string* max,
int maxlen) const {
- if (prog_ == NULL)
- return false;
-
+ if (prog_ == NULL)
+ return false;
+
int n = static_cast<int>(prefix_.size());
- if (n > maxlen)
- n = maxlen;
-
- // Determine initial min max from prefix_ literal.
+ if (n > maxlen)
+ n = maxlen;
+
+ // Determine initial min max from prefix_ literal.
*min = prefix_.substr(0, n);
*max = prefix_.substr(0, n);
- if (prefix_foldcase_) {
+ if (prefix_foldcase_) {
// prefix is ASCII lowercase; change *min to uppercase.
- for (int i = 0; i < n; i++) {
+ for (int i = 0; i < n; i++) {
char& c = (*min)[i];
if ('a' <= c && c <= 'z')
c += 'A' - 'a';
- }
- }
-
- // Add to prefix min max using PossibleMatchRange on regexp.
+ }
+ }
+
+ // Add to prefix min max using PossibleMatchRange on regexp.
std::string dmin, dmax;
- maxlen -= n;
- if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
+ maxlen -= n;
+ if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
min->append(dmin);
max->append(dmax);
} else if (!max->empty()) {
- // prog_->PossibleMatchRange has failed us,
- // but we still have useful information from prefix_.
+ // prog_->PossibleMatchRange has failed us,
+ // but we still have useful information from prefix_.
// Round up *max to allow any possible suffix.
PrefixSuccessor(max);
- } else {
- // Nothing useful.
- *min = "";
- *max = "";
- return false;
- }
-
- return true;
-}
-
-// Avoid possible locale nonsense in standard strcasecmp.
-// The string a is known to be all lowercase.
+ } else {
+ // Nothing useful.
+ *min = "";
+ *max = "";
+ return false;
+ }
+
+ return true;
+}
+
+// Avoid possible locale nonsense in standard strcasecmp.
+// The string a is known to be all lowercase.
static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
const char* ae = a + len;
-
- for (; a < ae; a++, b++) {
+
+ for (; a < ae; a++, b++) {
uint8_t x = *a;
uint8_t y = *b;
- if ('A' <= y && y <= 'Z')
- y += 'a' - 'A';
- if (x != y)
- return x - y;
- }
- return 0;
-}
-
-
-/***** Actual matching and rewriting code *****/
-
-bool RE2::Match(const StringPiece& text,
+ if ('A' <= y && y <= 'Z')
+ y += 'a' - 'A';
+ if (x != y)
+ return x - y;
+ }
+ return 0;
+}
+
+
+/***** Actual matching and rewriting code *****/
+
+bool RE2::Match(const StringPiece& text,
size_t startpos,
size_t endpos,
- Anchor re_anchor,
- StringPiece* submatch,
- int nsubmatch) const {
+ Anchor re_anchor,
+ StringPiece* submatch,
+ int nsubmatch) const {
if (!ok()) {
- if (options_.log_errors())
- LOG(ERROR) << "Invalid RE2: " << *error_;
- return false;
- }
-
+ if (options_.log_errors())
+ LOG(ERROR) << "Invalid RE2: " << *error_;
+ return false;
+ }
+
if (startpos > endpos || endpos > text.size()) {
if (options_.log_errors())
LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
@@ -634,23 +634,23 @@ bool RE2::Match(const StringPiece& text,
return false;
}
- StringPiece subtext = text;
- subtext.remove_prefix(startpos);
+ StringPiece subtext = text;
+ subtext.remove_prefix(startpos);
subtext.remove_suffix(text.size() - endpos);
-
- // Use DFAs to find exact location of match, filter out non-matches.
-
- // Don't ask for the location if we won't use it.
- // SearchDFA can do extra optimizations in that case.
- StringPiece match;
- StringPiece* matchp = &match;
- if (nsubmatch == 0)
- matchp = NULL;
-
- int ncap = 1 + NumberOfCapturingGroups();
- if (ncap > nsubmatch)
- ncap = nsubmatch;
-
+
+ // Use DFAs to find exact location of match, filter out non-matches.
+
+ // Don't ask for the location if we won't use it.
+ // SearchDFA can do extra optimizations in that case.
+ StringPiece match;
+ StringPiece* matchp = &match;
+ if (nsubmatch == 0)
+ matchp = NULL;
+
+ int ncap = 1 + NumberOfCapturingGroups();
+ if (ncap > nsubmatch)
+ ncap = nsubmatch;
+
// If the regexp is anchored explicitly, must not be in middle of text.
if (prog_->anchor_start() && startpos != 0)
return false;
@@ -658,53 +658,53 @@ bool RE2::Match(const StringPiece& text,
return false;
// If the regexp is anchored explicitly, update re_anchor
- // so that we can potentially fall into a faster case below.
- if (prog_->anchor_start() && prog_->anchor_end())
- re_anchor = ANCHOR_BOTH;
- else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
- re_anchor = ANCHOR_START;
-
- // Check for the required prefix, if any.
+ // so that we can potentially fall into a faster case below.
+ if (prog_->anchor_start() && prog_->anchor_end())
+ re_anchor = ANCHOR_BOTH;
+ else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
+ re_anchor = ANCHOR_START;
+
+ // Check for the required prefix, if any.
size_t prefixlen = 0;
- if (!prefix_.empty()) {
+ if (!prefix_.empty()) {
if (startpos != 0)
return false;
- prefixlen = prefix_.size();
- if (prefixlen > subtext.size())
- return false;
- if (prefix_foldcase_) {
- if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
- return false;
- } else {
- if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
- return false;
- }
- subtext.remove_prefix(prefixlen);
- // If there is a required prefix, the anchor must be at least ANCHOR_START.
- if (re_anchor != ANCHOR_BOTH)
- re_anchor = ANCHOR_START;
- }
-
- Prog::Anchor anchor = Prog::kUnanchored;
- Prog::MatchKind kind = Prog::kFirstMatch;
- if (options_.longest_match())
- kind = Prog::kLongestMatch;
-
+ prefixlen = prefix_.size();
+ if (prefixlen > subtext.size())
+ return false;
+ if (prefix_foldcase_) {
+ if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
+ return false;
+ } else {
+ if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
+ return false;
+ }
+ subtext.remove_prefix(prefixlen);
+ // If there is a required prefix, the anchor must be at least ANCHOR_START.
+ if (re_anchor != ANCHOR_BOTH)
+ re_anchor = ANCHOR_START;
+ }
+
+ Prog::Anchor anchor = Prog::kUnanchored;
+ Prog::MatchKind kind = Prog::kFirstMatch;
+ if (options_.longest_match())
+ kind = Prog::kLongestMatch;
+
bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture;
bool can_bit_state = prog_->CanBitState();
size_t bit_state_text_max_size = prog_->bit_state_text_max_size();
-
+
#ifdef RE2_HAVE_THREAD_LOCAL
hooks::context = this;
#endif
- bool dfa_failed = false;
+ bool dfa_failed = false;
bool skipped_test = false;
- switch (re_anchor) {
- default:
+ switch (re_anchor) {
+ default:
LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor;
return false;
- case UNANCHORED: {
+ case UNANCHORED: {
if (prog_->anchor_end()) {
// This is a very special case: we don't need the forward DFA because
// we already know where the match must end! Instead, the reverse DFA
@@ -735,78 +735,78 @@ bool RE2::Match(const StringPiece& text,
break;
}
- if (!prog_->SearchDFA(subtext, text, anchor, kind,
- matchp, &dfa_failed, NULL)) {
- if (dfa_failed) {
+ if (!prog_->SearchDFA(subtext, text, anchor, kind,
+ matchp, &dfa_failed, NULL)) {
+ if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: "
<< "pattern length " << pattern_.size() << ", "
<< "program size " << prog_->size() << ", "
<< "list count " << prog_->list_count() << ", "
<< "bytemap range " << prog_->bytemap_range();
- // Fall back to NFA below.
- skipped_test = true;
- break;
- }
- return false;
- }
+ // Fall back to NFA below.
+ skipped_test = true;
+ break;
+ }
+ return false;
+ }
if (matchp == NULL) // Matched. Don't care where.
- return true;
+ return true;
// SearchDFA set match.end() but didn't know where the
// match started. Run the regexp backward from match.end()
- // to find the longest possible match -- that's where it started.
- Prog* prog = ReverseProg();
+ // to find the longest possible match -- that's where it started.
+ Prog* prog = ReverseProg();
if (prog == NULL) {
// Fall back to NFA below.
skipped_test = true;
break;
}
- if (!prog->SearchDFA(match, text, Prog::kAnchored,
- Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
- if (dfa_failed) {
+ if (!prog->SearchDFA(match, text, Prog::kAnchored,
+ Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
+ if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: "
<< "pattern length " << pattern_.size() << ", "
<< "program size " << prog->size() << ", "
<< "list count " << prog->list_count() << ", "
<< "bytemap range " << prog->bytemap_range();
- // Fall back to NFA below.
- skipped_test = true;
- break;
- }
+ // Fall back to NFA below.
+ skipped_test = true;
+ break;
+ }
if (options_.log_errors())
LOG(ERROR) << "SearchDFA inconsistency";
- return false;
- }
- break;
- }
-
- case ANCHOR_BOTH:
- case ANCHOR_START:
- if (re_anchor == ANCHOR_BOTH)
- kind = Prog::kFullMatch;
- anchor = Prog::kAnchored;
-
- // If only a small amount of text and need submatch
- // information anyway and we're going to use OnePass or BitState
- // to get it, we might as well not even bother with the DFA:
- // OnePass or BitState will be fast enough.
- // On tiny texts, OnePass outruns even the DFA, and
- // it doesn't have the shared state and occasional mutex that
- // the DFA does.
- if (can_one_pass && text.size() <= 4096 &&
+ return false;
+ }
+ break;
+ }
+
+ case ANCHOR_BOTH:
+ case ANCHOR_START:
+ if (re_anchor == ANCHOR_BOTH)
+ kind = Prog::kFullMatch;
+ anchor = Prog::kAnchored;
+
+ // If only a small amount of text and need submatch
+ // information anyway and we're going to use OnePass or BitState
+ // to get it, we might as well not even bother with the DFA:
+ // OnePass or BitState will be fast enough.
+ // On tiny texts, OnePass outruns even the DFA, and
+ // it doesn't have the shared state and occasional mutex that
+ // the DFA does.
+ if (can_one_pass && text.size() <= 4096 &&
(ncap > 1 || text.size() <= 16)) {
- skipped_test = true;
- break;
- }
+ skipped_test = true;
+ break;
+ }
if (can_bit_state && text.size() <= bit_state_text_max_size &&
ncap > 1) {
- skipped_test = true;
- break;
- }
- if (!prog_->SearchDFA(subtext, text, anchor, kind,
- &match, &dfa_failed, NULL)) {
- if (dfa_failed) {
+ skipped_test = true;
+ break;
+ }
+ if (!prog_->SearchDFA(subtext, text, anchor, kind,
+ &match, &dfa_failed, NULL)) {
+ if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: "
<< "pattern length " << pattern_.size() << ", "
@@ -814,169 +814,169 @@ bool RE2::Match(const StringPiece& text,
<< "list count " << prog_->list_count() << ", "
<< "bytemap range " << prog_->bytemap_range();
// Fall back to NFA below.
- skipped_test = true;
- break;
- }
- return false;
- }
- break;
- }
-
- if (!skipped_test && ncap <= 1) {
- // We know exactly where it matches. That's enough.
- if (ncap == 1)
- submatch[0] = match;
- } else {
- StringPiece subtext1;
- if (skipped_test) {
- // DFA ran out of memory or was skipped:
- // need to search in entire original text.
- subtext1 = subtext;
- } else {
- // DFA found the exact match location:
- // let NFA run an anchored, full match search
- // to find submatch locations.
- subtext1 = match;
- anchor = Prog::kAnchored;
- kind = Prog::kFullMatch;
- }
-
- if (can_one_pass && anchor != Prog::kUnanchored) {
- if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
+ skipped_test = true;
+ break;
+ }
+ return false;
+ }
+ break;
+ }
+
+ if (!skipped_test && ncap <= 1) {
+ // We know exactly where it matches. That's enough.
+ if (ncap == 1)
+ submatch[0] = match;
+ } else {
+ StringPiece subtext1;
+ if (skipped_test) {
+ // DFA ran out of memory or was skipped:
+ // need to search in entire original text.
+ subtext1 = subtext;
+ } else {
+ // DFA found the exact match location:
+ // let NFA run an anchored, full match search
+ // to find submatch locations.
+ subtext1 = match;
+ anchor = Prog::kAnchored;
+ kind = Prog::kFullMatch;
+ }
+
+ if (can_one_pass && anchor != Prog::kUnanchored) {
+ if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
if (!skipped_test && options_.log_errors())
- LOG(ERROR) << "SearchOnePass inconsistency";
- return false;
- }
+ LOG(ERROR) << "SearchOnePass inconsistency";
+ return false;
+ }
} else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) {
- if (!prog_->SearchBitState(subtext1, text, anchor,
- kind, submatch, ncap)) {
+ if (!prog_->SearchBitState(subtext1, text, anchor,
+ kind, submatch, ncap)) {
if (!skipped_test && options_.log_errors())
- LOG(ERROR) << "SearchBitState inconsistency";
- return false;
- }
- } else {
- if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
+ LOG(ERROR) << "SearchBitState inconsistency";
+ return false;
+ }
+ } else {
+ if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
if (!skipped_test && options_.log_errors())
- LOG(ERROR) << "SearchNFA inconsistency";
- return false;
- }
- }
- }
-
- // Adjust overall match for required prefix that we stripped off.
- if (prefixlen > 0 && nsubmatch > 0)
+ LOG(ERROR) << "SearchNFA inconsistency";
+ return false;
+ }
+ }
+ }
+
+ // Adjust overall match for required prefix that we stripped off.
+ if (prefixlen > 0 && nsubmatch > 0)
submatch[0] = StringPiece(submatch[0].data() - prefixlen,
- submatch[0].size() + prefixlen);
-
- // Zero submatches that don't exist in the regexp.
- for (int i = ncap; i < nsubmatch; i++)
+ submatch[0].size() + prefixlen);
+
+ // Zero submatches that don't exist in the regexp.
+ for (int i = ncap; i < nsubmatch; i++)
submatch[i] = StringPiece();
- return true;
-}
-
-// Internal matcher - like Match() but takes Args not StringPieces.
-bool RE2::DoMatch(const StringPiece& text,
+ return true;
+}
+
+// Internal matcher - like Match() but takes Args not StringPieces.
+bool RE2::DoMatch(const StringPiece& text,
Anchor re_anchor,
size_t* consumed,
- const Arg* const* args,
- int n) const {
- if (!ok()) {
- if (options_.log_errors())
- LOG(ERROR) << "Invalid RE2: " << *error_;
- return false;
- }
-
+ const Arg* const* args,
+ int n) const {
+ if (!ok()) {
+ if (options_.log_errors())
+ LOG(ERROR) << "Invalid RE2: " << *error_;
+ return false;
+ }
+
if (NumberOfCapturingGroups() < n) {
// RE has fewer capturing groups than number of Arg pointers passed in.
return false;
}
- // Count number of capture groups needed.
- int nvec;
- if (n == 0 && consumed == NULL)
- nvec = 0;
- else
- nvec = n+1;
-
- StringPiece* vec;
- StringPiece stkvec[kVecSize];
- StringPiece* heapvec = NULL;
-
+ // Count number of capture groups needed.
+ int nvec;
+ if (n == 0 && consumed == NULL)
+ nvec = 0;
+ else
+ nvec = n+1;
+
+ StringPiece* vec;
+ StringPiece stkvec[kVecSize];
+ StringPiece* heapvec = NULL;
+
if (nvec <= static_cast<int>(arraysize(stkvec))) {
- vec = stkvec;
- } else {
- vec = new StringPiece[nvec];
- heapvec = vec;
- }
-
+ vec = stkvec;
+ } else {
+ vec = new StringPiece[nvec];
+ heapvec = vec;
+ }
+
if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
- delete[] heapvec;
- return false;
- }
-
+ delete[] heapvec;
+ return false;
+ }
+
if (consumed != NULL)
*consumed = static_cast<size_t>(EndPtr(vec[0]) - BeginPtr(text));
-
- if (n == 0 || args == NULL) {
- // We are not interested in results
- delete[] heapvec;
- return true;
- }
-
- // If we got here, we must have matched the whole pattern.
- for (int i = 0; i < n; i++) {
- const StringPiece& s = vec[i+1];
- if (!args[i]->Parse(s.data(), s.size())) {
- // TODO: Should we indicate what the error was?
- delete[] heapvec;
- return false;
- }
- }
-
- delete[] heapvec;
- return true;
-}
-
-// Checks that the rewrite string is well-formed with respect to this
-// regular expression.
+
+ if (n == 0 || args == NULL) {
+ // We are not interested in results
+ delete[] heapvec;
+ return true;
+ }
+
+ // If we got here, we must have matched the whole pattern.
+ for (int i = 0; i < n; i++) {
+ const StringPiece& s = vec[i+1];
+ if (!args[i]->Parse(s.data(), s.size())) {
+ // TODO: Should we indicate what the error was?
+ delete[] heapvec;
+ return false;
+ }
+ }
+
+ delete[] heapvec;
+ return true;
+}
+
+// Checks that the rewrite string is well-formed with respect to this
+// regular expression.
bool RE2::CheckRewriteString(const StringPiece& rewrite,
std::string* error) const {
- int max_token = -1;
- for (const char *s = rewrite.data(), *end = s + rewrite.size();
- s < end; s++) {
- int c = *s;
- if (c != '\\') {
- continue;
- }
- if (++s == end) {
- *error = "Rewrite schema error: '\\' not allowed at end.";
- return false;
- }
- c = *s;
- if (c == '\\') {
- continue;
- }
+ int max_token = -1;
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
+ s < end; s++) {
+ int c = *s;
+ if (c != '\\') {
+ continue;
+ }
+ if (++s == end) {
+ *error = "Rewrite schema error: '\\' not allowed at end.";
+ return false;
+ }
+ c = *s;
+ if (c == '\\') {
+ continue;
+ }
if (!isdigit(c)) {
- *error = "Rewrite schema error: "
- "'\\' must be followed by a digit or '\\'.";
- return false;
- }
- int n = (c - '0');
- if (max_token < n) {
- max_token = n;
- }
- }
-
- if (max_token > NumberOfCapturingGroups()) {
+ *error = "Rewrite schema error: "
+ "'\\' must be followed by a digit or '\\'.";
+ return false;
+ }
+ int n = (c - '0');
+ if (max_token < n) {
+ max_token = n;
+ }
+ }
+
+ if (max_token > NumberOfCapturingGroups()) {
*error = StringPrintf(
"Rewrite schema requests %d matches, but the regexp only has %d "
"parenthesized subexpressions.",
max_token, NumberOfCapturingGroups());
- return false;
- }
- return true;
-}
-
+ return false;
+ }
+ return true;
+}
+
// Returns the maximum submatch needed for the rewrite to be done by Replace().
// E.g. if rewrite == "foo \\2,\\1", returns 2.
int RE2::MaxSubmatch(const StringPiece& rewrite) {
@@ -1033,32 +1033,32 @@ bool RE2::Rewrite(std::string* out,
return true;
}
-/***** Parsers for various types *****/
-
+/***** Parsers for various types *****/
+
namespace re2_internal {
template <>
bool Parse(const char* str, size_t n, void* dest) {
- // We fail if somebody asked us to store into a non-NULL void* pointer
- return (dest == NULL);
-}
-
+ // We fail if somebody asked us to store into a non-NULL void* pointer
+ return (dest == NULL);
+}
+
template <>
bool Parse(const char* str, size_t n, std::string* dest) {
- if (dest == NULL) return true;
+ if (dest == NULL) return true;
dest->assign(str, n);
- return true;
-}
-
+ return true;
+}
+
#if defined(ARCADIA_ROOT)
template <>
bool Parse(const char* str, size_t n, TString* dest) {
- if (dest == NULL) return true;
+ if (dest == NULL) return true;
dest->assign(str, n);
- return true;
-}
+ return true;
+}
#endif
-
+
template <>
bool Parse(const char* str, size_t n, StringPiece* dest) {
if (dest == NULL) return true;
@@ -1068,16 +1068,16 @@ bool Parse(const char* str, size_t n, StringPiece* dest) {
template <>
bool Parse(const char* str, size_t n, char* dest) {
- if (n != 1) return false;
- if (dest == NULL) return true;
+ if (n != 1) return false;
+ if (dest == NULL) return true;
*dest = str[0];
- return true;
-}
-
+ return true;
+}
+
template <>
bool Parse(const char* str, size_t n, signed char* dest) {
- if (n != 1) return false;
- if (dest == NULL) return true;
+ if (n != 1) return false;
+ if (dest == NULL) return true;
*dest = str[0];
return true;
}
@@ -1087,12 +1087,12 @@ bool Parse(const char* str, size_t n, unsigned char* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
*dest = str[0];
- return true;
-}
-
-// Largest number spec that we are willing to parse
-static const int kMaxNumberLength = 32;
-
+ return true;
+}
+
+// Largest number spec that we are willing to parse
+static const int kMaxNumberLength = 32;
+
// REQUIRES "buf" must have length at least nbuf.
// Copies "str" into "buf" and null-terminates.
// Overwrites *np with the new length.
@@ -1101,7 +1101,7 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
size_t n = *np;
if (n == 0) return "";
if (n > 0 && isspace(*str)) {
- // We are less forgiving than the strtoxxx() routines and do not
+ // We are less forgiving than the strtoxxx() routines and do not
// allow leading spaces. We do allow leading spaces for floats.
if (!accept_spaces) {
return "";
@@ -1110,8 +1110,8 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
n--;
str++;
}
- }
-
+ }
+
// Although buf has a fixed maximum size, we can still handle
// arbitrarily large integers correctly by omitting leading zeros.
// (Numbers that are still too long will be out of range.)
@@ -1125,7 +1125,7 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
neg = true;
n--;
str++;
- }
+ }
if (n >= 3 && str[0] == '0' && str[1] == '0') {
while (n >= 3 && str[2] == '0') {
@@ -1148,11 +1148,11 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
buf[n] = '\0';
*np = n;
return buf;
-}
-
+}
+
template <>
bool Parse(const char* str, size_t n, float* dest) {
- if (n == 0) return false;
+ if (n == 0) return false;
static const int kMaxLength = 200;
char buf[kMaxLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, true);
@@ -1185,127 +1185,127 @@ bool Parse(const char* str, size_t n, double* dest) {
template <>
bool Parse(const char* str, size_t n, long* dest, int radix) {
if (n == 0) return false;
- char buf[kMaxNumberLength+1];
+ char buf[kMaxNumberLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, false);
- char* end;
- errno = 0;
- long r = strtol(str, &end, radix);
- if (end != str + n) return false; // Leftover junk
- if (errno) return false;
- if (dest == NULL) return true;
+ char* end;
+ errno = 0;
+ long r = strtol(str, &end, radix);
+ if (end != str + n) return false; // Leftover junk
+ if (errno) return false;
+ if (dest == NULL) return true;
*dest = r;
- return true;
-}
-
+ return true;
+}
+
template <>
bool Parse(const char* str, size_t n, unsigned long* dest, int radix) {
- if (n == 0) return false;
- char buf[kMaxNumberLength+1];
+ if (n == 0) return false;
+ char buf[kMaxNumberLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, false);
- if (str[0] == '-') {
+ if (str[0] == '-') {
// strtoul() will silently accept negative numbers and parse
// them. This module is more strict and treats them as errors.
return false;
- }
-
- char* end;
- errno = 0;
- unsigned long r = strtoul(str, &end, radix);
- if (end != str + n) return false; // Leftover junk
- if (errno) return false;
- if (dest == NULL) return true;
+ }
+
+ char* end;
+ errno = 0;
+ unsigned long r = strtoul(str, &end, radix);
+ if (end != str + n) return false; // Leftover junk
+ if (errno) return false;
+ if (dest == NULL) return true;
*dest = r;
- return true;
-}
-
+ return true;
+}
+
template <>
bool Parse(const char* str, size_t n, short* dest, int radix) {
- long r;
+ long r;
if (!Parse(str, n, &r, radix)) return false; // Could not parse
if ((short)r != r) return false; // Out of range
- if (dest == NULL) return true;
+ if (dest == NULL) return true;
*dest = (short)r;
- return true;
-}
-
+ return true;
+}
+
template <>
bool Parse(const char* str, size_t n, unsigned short* dest, int radix) {
- unsigned long r;
+ unsigned long r;
if (!Parse(str, n, &r, radix)) return false; // Could not parse
if ((unsigned short)r != r) return false; // Out of range
- if (dest == NULL) return true;
+ if (dest == NULL) return true;
*dest = (unsigned short)r;
- return true;
-}
-
+ return true;
+}
+
template <>
bool Parse(const char* str, size_t n, int* dest, int radix) {
- long r;
+ long r;
if (!Parse(str, n, &r, radix)) return false; // Could not parse
if ((int)r != r) return false; // Out of range
- if (dest == NULL) return true;
+ if (dest == NULL) return true;
*dest = (int)r;
- return true;
-}
-
+ return true;
+}
+
template <>
bool Parse(const char* str, size_t n, unsigned int* dest, int radix) {
- unsigned long r;
+ unsigned long r;
if (!Parse(str, n, &r, radix)) return false; // Could not parse
if ((unsigned int)r != r) return false; // Out of range
- if (dest == NULL) return true;
+ if (dest == NULL) return true;
*dest = (unsigned int)r;
- return true;
-}
-
+ return true;
+}
+
template <>
bool Parse(const char* str, size_t n, long long* dest, int radix) {
- if (n == 0) return false;
- char buf[kMaxNumberLength+1];
+ if (n == 0) return false;
+ char buf[kMaxNumberLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, false);
- char* end;
- errno = 0;
+ char* end;
+ errno = 0;
long long r = strtoll(str, &end, radix);
- if (end != str + n) return false; // Leftover junk
- if (errno) return false;
- if (dest == NULL) return true;
+ if (end != str + n) return false; // Leftover junk
+ if (errno) return false;
+ if (dest == NULL) return true;
*dest = r;
- return true;
-}
-
+ return true;
+}
+
template <>
bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) {
- if (n == 0) return false;
- char buf[kMaxNumberLength+1];
+ if (n == 0) return false;
+ char buf[kMaxNumberLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, false);
- if (str[0] == '-') {
- // strtoull() will silently accept negative numbers and parse
- // them. This module is more strict and treats them as errors.
- return false;
- }
- char* end;
- errno = 0;
+ if (str[0] == '-') {
+ // strtoull() will silently accept negative numbers and parse
+ // them. This module is more strict and treats them as errors.
+ return false;
+ }
+ char* end;
+ errno = 0;
unsigned long long r = strtoull(str, &end, radix);
- if (end != str + n) return false; // Leftover junk
- if (errno) return false;
- if (dest == NULL) return true;
+ if (end != str + n) return false; // Leftover junk
+ if (errno) return false;
+ if (dest == NULL) return true;
*dest = r;
- return true;
-}
-
+ return true;
+}
+
} // namespace re2_internal
-
+
namespace hooks {
-
+
#ifdef RE2_HAVE_THREAD_LOCAL
thread_local const RE2* context = NULL;
#endif
-
+
template <typename T>
union Hook {
void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
T* Load() const { return cb_.load(std::memory_order_acquire); }
-
+
#if !defined(__clang__) && defined(_MSC_VER)
// Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
// this is a gross hack to make std::atomic<T*> constant-initialized on MSVC.
@@ -1313,10 +1313,10 @@ union Hook {
"std::atomic<T*> must be always lock-free");
T* cb_for_constinit_;
#endif
-
+
std::atomic<T*> cb_;
};
-
+
template <typename T>
static void DoNothing(const T&) {}
@@ -1332,4 +1332,4 @@ DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
} // namespace hooks
-} // namespace re2
+} // namespace re2
diff --git a/contrib/libs/re2/re2/regexp.cc b/contrib/libs/re2/re2/regexp.cc
index 949f9dbf72..ca1318b43d 100644
--- a/contrib/libs/re2/re2/regexp.cc
+++ b/contrib/libs/re2/re2/regexp.cc
@@ -1,11 +1,11 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Regular expression representation.
-// Tested by parse_test.cc
-
-#include "re2/regexp.h"
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression representation.
+// Tested by parse_test.cc
+
+#include "re2/regexp.h"
#include <stddef.h>
#include <stdint.h>
@@ -22,80 +22,80 @@
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/stringpiece.h"
-#include "re2/walker-inl.h"
-
-namespace re2 {
-
-// Constructor. Allocates vectors as appropriate for operator.
-Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Constructor. Allocates vectors as appropriate for operator.
+Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
: op_(static_cast<uint8_t>(op)),
- simple_(false),
+ simple_(false),
parse_flags_(static_cast<uint16_t>(parse_flags)),
- ref_(1),
- nsub_(0),
- down_(NULL) {
- subone_ = NULL;
- memset(the_union_, 0, sizeof the_union_);
-}
-
-// Destructor. Assumes already cleaned up children.
-// Private: use Decref() instead of delete to destroy Regexps.
-// Can't call Decref on the sub-Regexps here because
-// that could cause arbitrarily deep recursion, so
-// required Decref() to have handled them for us.
-Regexp::~Regexp() {
- if (nsub_ > 0)
- LOG(DFATAL) << "Regexp not destroyed.";
-
- switch (op_) {
- default:
- break;
- case kRegexpCapture:
- delete name_;
- break;
- case kRegexpLiteralString:
- delete[] runes_;
- break;
- case kRegexpCharClass:
+ ref_(1),
+ nsub_(0),
+ down_(NULL) {
+ subone_ = NULL;
+ memset(the_union_, 0, sizeof the_union_);
+}
+
+// Destructor. Assumes already cleaned up children.
+// Private: use Decref() instead of delete to destroy Regexps.
+// Can't call Decref on the sub-Regexps here because
+// that could cause arbitrarily deep recursion, so
+// required Decref() to have handled them for us.
+Regexp::~Regexp() {
+ if (nsub_ > 0)
+ LOG(DFATAL) << "Regexp not destroyed.";
+
+ switch (op_) {
+ default:
+ break;
+ case kRegexpCapture:
+ delete name_;
+ break;
+ case kRegexpLiteralString:
+ delete[] runes_;
+ break;
+ case kRegexpCharClass:
if (cc_)
cc_->Delete();
- delete ccb_;
- break;
- }
-}
-
-// If it's possible to destroy this regexp without recurring,
-// do so and return true. Else return false.
-bool Regexp::QuickDestroy() {
- if (nsub_ == 0) {
- delete this;
- return true;
- }
- return false;
-}
-
+ delete ccb_;
+ break;
+ }
+}
+
+// If it's possible to destroy this regexp without recurring,
+// do so and return true. Else return false.
+bool Regexp::QuickDestroy() {
+ if (nsub_ == 0) {
+ delete this;
+ return true;
+ }
+ return false;
+}
+
// Lazily allocated.
static Mutex* ref_mutex;
static std::map<Regexp*, int>* ref_map;
-
-int Regexp::Ref() {
- if (ref_ < kMaxRef)
- return ref_;
-
+
+int Regexp::Ref() {
+ if (ref_ < kMaxRef)
+ return ref_;
+
MutexLock l(ref_mutex);
return (*ref_map)[this];
-}
-
-// Increments reference count, returns object as convenience.
-Regexp* Regexp::Incref() {
- if (ref_ >= kMaxRef-1) {
+}
+
+// Increments reference count, returns object as convenience.
+Regexp* Regexp::Incref() {
+ if (ref_ >= kMaxRef-1) {
static std::once_flag ref_once;
std::call_once(ref_once, []() {
ref_mutex = new Mutex;
ref_map = new std::map<Regexp*, int>;
});
- // Store ref count in overflow map.
+ // Store ref count in overflow map.
MutexLock l(ref_mutex);
if (ref_ == kMaxRef) {
// already overflowed
@@ -104,97 +104,97 @@ Regexp* Regexp::Incref() {
// overflowing now
(*ref_map)[this] = kMaxRef;
ref_ = kMaxRef;
- }
- return this;
- }
-
- ref_++;
- return this;
-}
-
-// Decrements reference count and deletes this object if count reaches 0.
-void Regexp::Decref() {
- if (ref_ == kMaxRef) {
- // Ref count is stored in overflow map.
+ }
+ return this;
+ }
+
+ ref_++;
+ return this;
+}
+
+// Decrements reference count and deletes this object if count reaches 0.
+void Regexp::Decref() {
+ if (ref_ == kMaxRef) {
+ // Ref count is stored in overflow map.
MutexLock l(ref_mutex);
int r = (*ref_map)[this] - 1;
- if (r < kMaxRef) {
+ if (r < kMaxRef) {
ref_ = static_cast<uint16_t>(r);
ref_map->erase(this);
- } else {
+ } else {
(*ref_map)[this] = r;
- }
- return;
- }
- ref_--;
- if (ref_ == 0)
- Destroy();
-}
-
-// Deletes this object; ref count has count reached 0.
-void Regexp::Destroy() {
- if (QuickDestroy())
- return;
-
- // Handle recursive Destroy with explicit stack
- // to avoid arbitrarily deep recursion on process stack [sigh].
- down_ = NULL;
- Regexp* stack = this;
- while (stack != NULL) {
- Regexp* re = stack;
- stack = re->down_;
- if (re->ref_ != 0)
- LOG(DFATAL) << "Bad reference count " << re->ref_;
- if (re->nsub_ > 0) {
- Regexp** subs = re->sub();
- for (int i = 0; i < re->nsub_; i++) {
- Regexp* sub = subs[i];
- if (sub == NULL)
- continue;
- if (sub->ref_ == kMaxRef)
- sub->Decref();
- else
- --sub->ref_;
- if (sub->ref_ == 0 && !sub->QuickDestroy()) {
- sub->down_ = stack;
- stack = sub;
- }
- }
- if (re->nsub_ > 1)
- delete[] subs;
- re->nsub_ = 0;
- }
- delete re;
- }
-}
-
-void Regexp::AddRuneToString(Rune r) {
- DCHECK(op_ == kRegexpLiteralString);
- if (nrunes_ == 0) {
- // start with 8
- runes_ = new Rune[8];
- } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
- // double on powers of two
- Rune *old = runes_;
- runes_ = new Rune[nrunes_ * 2];
- for (int i = 0; i < nrunes_; i++)
- runes_[i] = old[i];
- delete[] old;
- }
-
- runes_[nrunes_++] = r;
-}
-
-Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
- Regexp* re = new Regexp(kRegexpHaveMatch, flags);
- re->match_id_ = match_id;
- return re;
-}
-
+ }
+ return;
+ }
+ ref_--;
+ if (ref_ == 0)
+ Destroy();
+}
+
+// Deletes this object; ref count has count reached 0.
+void Regexp::Destroy() {
+ if (QuickDestroy())
+ return;
+
+ // Handle recursive Destroy with explicit stack
+ // to avoid arbitrarily deep recursion on process stack [sigh].
+ down_ = NULL;
+ Regexp* stack = this;
+ while (stack != NULL) {
+ Regexp* re = stack;
+ stack = re->down_;
+ if (re->ref_ != 0)
+ LOG(DFATAL) << "Bad reference count " << re->ref_;
+ if (re->nsub_ > 0) {
+ Regexp** subs = re->sub();
+ for (int i = 0; i < re->nsub_; i++) {
+ Regexp* sub = subs[i];
+ if (sub == NULL)
+ continue;
+ if (sub->ref_ == kMaxRef)
+ sub->Decref();
+ else
+ --sub->ref_;
+ if (sub->ref_ == 0 && !sub->QuickDestroy()) {
+ sub->down_ = stack;
+ stack = sub;
+ }
+ }
+ if (re->nsub_ > 1)
+ delete[] subs;
+ re->nsub_ = 0;
+ }
+ delete re;
+ }
+}
+
+void Regexp::AddRuneToString(Rune r) {
+ DCHECK(op_ == kRegexpLiteralString);
+ if (nrunes_ == 0) {
+ // start with 8
+ runes_ = new Rune[8];
+ } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
+ // double on powers of two
+ Rune *old = runes_;
+ runes_ = new Rune[nrunes_ * 2];
+ for (int i = 0; i < nrunes_; i++)
+ runes_[i] = old[i];
+ delete[] old;
+ }
+
+ runes_[nrunes_++] = r;
+}
+
+Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
+ Regexp* re = new Regexp(kRegexpHaveMatch, flags);
+ re->match_id_ = match_id;
+ return re;
+}
+
Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
// Squash **, ++ and ??.
if (op == sub->op() && flags == sub->parse_flags())
- return sub;
+ return sub;
// Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
// op is Star/Plus/Quest, we just have to check that sub->op() is too.
@@ -215,28 +215,28 @@ Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
}
Regexp* re = new Regexp(op, flags);
- re->AllocSub(1);
- re->sub()[0] = sub;
- return re;
-}
-
+ re->AllocSub(1);
+ re->sub()[0] = sub;
+ return re;
+}
+
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpPlus, sub, flags);
}
-Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
+Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpStar, sub, flags);
-}
-
-Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
+}
+
+Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpQuest, sub, flags);
-}
-
-Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
- ParseFlags flags, bool can_factor) {
- if (nsub == 1)
- return sub[0];
-
+}
+
+Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
+ ParseFlags flags, bool can_factor) {
+ if (nsub == 1)
+ return sub[0];
+
if (nsub == 0) {
if (op == kRegexpAlternate)
return new Regexp(kRegexpNoMatch, flags);
@@ -245,416 +245,416 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
}
PODArray<Regexp*> subcopy;
- if (op == kRegexpAlternate && can_factor) {
- // Going to edit sub; make a copy so we don't step on caller.
+ if (op == kRegexpAlternate && can_factor) {
+ // Going to edit sub; make a copy so we don't step on caller.
subcopy = PODArray<Regexp*>(nsub);
memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
sub = subcopy.data();
- nsub = FactorAlternation(sub, nsub, flags);
- if (nsub == 1) {
- Regexp* re = sub[0];
- return re;
- }
- }
-
- if (nsub > kMaxNsub) {
- // Too many subexpressions to fit in a single Regexp.
- // Make a two-level tree. Two levels gets us to 65535^2.
- int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
- Regexp* re = new Regexp(op, flags);
- re->AllocSub(nbigsub);
- Regexp** subs = re->sub();
- for (int i = 0; i < nbigsub - 1; i++)
- subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
- subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
- nsub - (nbigsub-1)*kMaxNsub, flags,
- false);
- return re;
- }
-
- Regexp* re = new Regexp(op, flags);
- re->AllocSub(nsub);
- Regexp** subs = re->sub();
- for (int i = 0; i < nsub; i++)
- subs[i] = sub[i];
- return re;
-}
-
-Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
- return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
-}
-
-Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
- return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
-}
-
-Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
- return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
-}
-
-Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
- Regexp* re = new Regexp(kRegexpCapture, flags);
- re->AllocSub(1);
- re->sub()[0] = sub;
- re->cap_ = cap;
- return re;
-}
-
-Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
- Regexp* re = new Regexp(kRegexpRepeat, flags);
- re->AllocSub(1);
- re->sub()[0] = sub;
- re->min_ = min;
- re->max_ = max;
- return re;
-}
-
-Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
- Regexp* re = new Regexp(kRegexpLiteral, flags);
- re->rune_ = rune;
- return re;
-}
-
-Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
- if (nrunes <= 0)
- return new Regexp(kRegexpEmptyMatch, flags);
- if (nrunes == 1)
- return NewLiteral(runes[0], flags);
- Regexp* re = new Regexp(kRegexpLiteralString, flags);
- for (int i = 0; i < nrunes; i++)
- re->AddRuneToString(runes[i]);
- return re;
-}
-
-Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
- Regexp* re = new Regexp(kRegexpCharClass, flags);
- re->cc_ = cc;
- return re;
-}
-
-void Regexp::Swap(Regexp* that) {
+ nsub = FactorAlternation(sub, nsub, flags);
+ if (nsub == 1) {
+ Regexp* re = sub[0];
+ return re;
+ }
+ }
+
+ if (nsub > kMaxNsub) {
+ // Too many subexpressions to fit in a single Regexp.
+ // Make a two-level tree. Two levels gets us to 65535^2.
+ int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
+ Regexp* re = new Regexp(op, flags);
+ re->AllocSub(nbigsub);
+ Regexp** subs = re->sub();
+ for (int i = 0; i < nbigsub - 1; i++)
+ subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
+ subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
+ nsub - (nbigsub-1)*kMaxNsub, flags,
+ false);
+ return re;
+ }
+
+ Regexp* re = new Regexp(op, flags);
+ re->AllocSub(nsub);
+ Regexp** subs = re->sub();
+ for (int i = 0; i < nsub; i++)
+ subs[i] = sub[i];
+ return re;
+}
+
+Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
+}
+
+Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
+}
+
+Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
+ return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
+}
+
+Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
+ Regexp* re = new Regexp(kRegexpCapture, flags);
+ re->AllocSub(1);
+ re->sub()[0] = sub;
+ re->cap_ = cap;
+ return re;
+}
+
+Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
+ Regexp* re = new Regexp(kRegexpRepeat, flags);
+ re->AllocSub(1);
+ re->sub()[0] = sub;
+ re->min_ = min;
+ re->max_ = max;
+ return re;
+}
+
+Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
+ Regexp* re = new Regexp(kRegexpLiteral, flags);
+ re->rune_ = rune;
+ return re;
+}
+
+Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
+ if (nrunes <= 0)
+ return new Regexp(kRegexpEmptyMatch, flags);
+ if (nrunes == 1)
+ return NewLiteral(runes[0], flags);
+ Regexp* re = new Regexp(kRegexpLiteralString, flags);
+ for (int i = 0; i < nrunes; i++)
+ re->AddRuneToString(runes[i]);
+ return re;
+}
+
+Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
+ Regexp* re = new Regexp(kRegexpCharClass, flags);
+ re->cc_ = cc;
+ return re;
+}
+
+void Regexp::Swap(Regexp* that) {
// Regexp is not trivially copyable, so we cannot freely copy it with
// memmove(3), but swapping objects like so is safe for our purposes.
- char tmp[sizeof *this];
+ char tmp[sizeof *this];
void* vthis = reinterpret_cast<void*>(this);
void* vthat = reinterpret_cast<void*>(that);
memmove(tmp, vthis, sizeof *this);
memmove(vthis, vthat, sizeof *this);
memmove(vthat, tmp, sizeof *this);
-}
-
-// Tests equality of all top-level structure but not subregexps.
-static bool TopEqual(Regexp* a, Regexp* b) {
- if (a->op() != b->op())
- return false;
-
- switch (a->op()) {
- case kRegexpNoMatch:
- case kRegexpEmptyMatch:
- case kRegexpAnyChar:
- case kRegexpAnyByte:
- case kRegexpBeginLine:
- case kRegexpEndLine:
- case kRegexpWordBoundary:
- case kRegexpNoWordBoundary:
- case kRegexpBeginText:
- return true;
-
- case kRegexpEndText:
- // The parse flags remember whether it's \z or (?-m:$),
- // which matters when testing against PCRE.
- return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
-
- case kRegexpLiteral:
- return a->rune() == b->rune() &&
- ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
-
- case kRegexpLiteralString:
- return a->nrunes() == b->nrunes() &&
- ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
- memcmp(a->runes(), b->runes(),
- a->nrunes() * sizeof a->runes()[0]) == 0;
-
- case kRegexpAlternate:
- case kRegexpConcat:
- return a->nsub() == b->nsub();
-
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
-
- case kRegexpRepeat:
- return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
- a->min() == b->min() &&
- a->max() == b->max();
-
- case kRegexpCapture:
- return a->cap() == b->cap() && a->name() == b->name();
-
- case kRegexpHaveMatch:
- return a->match_id() == b->match_id();
-
- case kRegexpCharClass: {
- CharClass* acc = a->cc();
- CharClass* bcc = b->cc();
- return acc->size() == bcc->size() &&
- acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
- memcmp(acc->begin(), bcc->begin(),
- (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
- }
- }
-
- LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
- return 0;
-}
-
-bool Regexp::Equal(Regexp* a, Regexp* b) {
- if (a == NULL || b == NULL)
- return a == b;
-
- if (!TopEqual(a, b))
- return false;
-
- // Fast path:
- // return without allocating vector if there are no subregexps.
- switch (a->op()) {
- case kRegexpAlternate:
- case kRegexpConcat:
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- case kRegexpRepeat:
- case kRegexpCapture:
- break;
-
- default:
- return true;
- }
-
- // Committed to doing real work.
- // The stack (vector) has pairs of regexps waiting to
- // be compared. The regexps are only equal if
- // all the pairs end up being equal.
+}
+
+// Tests equality of all top-level structure but not subregexps.
+static bool TopEqual(Regexp* a, Regexp* b) {
+ if (a->op() != b->op())
+ return false;
+
+ switch (a->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpBeginText:
+ return true;
+
+ case kRegexpEndText:
+ // The parse flags remember whether it's \z or (?-m:$),
+ // which matters when testing against PCRE.
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
+
+ case kRegexpLiteral:
+ return a->rune() == b->rune() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
+
+ case kRegexpLiteralString:
+ return a->nrunes() == b->nrunes() &&
+ ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
+ memcmp(a->runes(), b->runes(),
+ a->nrunes() * sizeof a->runes()[0]) == 0;
+
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ return a->nsub() == b->nsub();
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
+
+ case kRegexpRepeat:
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
+ a->min() == b->min() &&
+ a->max() == b->max();
+
+ case kRegexpCapture:
+ return a->cap() == b->cap() && a->name() == b->name();
+
+ case kRegexpHaveMatch:
+ return a->match_id() == b->match_id();
+
+ case kRegexpCharClass: {
+ CharClass* acc = a->cc();
+ CharClass* bcc = b->cc();
+ return acc->size() == bcc->size() &&
+ acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
+ memcmp(acc->begin(), bcc->begin(),
+ (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
+ }
+ }
+
+ LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
+ return 0;
+}
+
+bool Regexp::Equal(Regexp* a, Regexp* b) {
+ if (a == NULL || b == NULL)
+ return a == b;
+
+ if (!TopEqual(a, b))
+ return false;
+
+ // Fast path:
+ // return without allocating vector if there are no subregexps.
+ switch (a->op()) {
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ break;
+
+ default:
+ return true;
+ }
+
+ // Committed to doing real work.
+ // The stack (vector) has pairs of regexps waiting to
+ // be compared. The regexps are only equal if
+ // all the pairs end up being equal.
std::vector<Regexp*> stk;
-
- for (;;) {
- // Invariant: TopEqual(a, b) == true.
- Regexp* a2;
- Regexp* b2;
- switch (a->op()) {
- default:
- break;
- case kRegexpAlternate:
- case kRegexpConcat:
- for (int i = 0; i < a->nsub(); i++) {
- a2 = a->sub()[i];
- b2 = b->sub()[i];
- if (!TopEqual(a2, b2))
- return false;
- stk.push_back(a2);
- stk.push_back(b2);
- }
- break;
-
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- case kRegexpRepeat:
- case kRegexpCapture:
- a2 = a->sub()[0];
- b2 = b->sub()[0];
- if (!TopEqual(a2, b2))
- return false;
- // Really:
- // stk.push_back(a2);
- // stk.push_back(b2);
- // break;
- // but faster to assign directly and loop.
- a = a2;
- b = b2;
- continue;
- }
-
+
+ for (;;) {
+ // Invariant: TopEqual(a, b) == true.
+ Regexp* a2;
+ Regexp* b2;
+ switch (a->op()) {
+ default:
+ break;
+ case kRegexpAlternate:
+ case kRegexpConcat:
+ for (int i = 0; i < a->nsub(); i++) {
+ a2 = a->sub()[i];
+ b2 = b->sub()[i];
+ if (!TopEqual(a2, b2))
+ return false;
+ stk.push_back(a2);
+ stk.push_back(b2);
+ }
+ break;
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ case kRegexpCapture:
+ a2 = a->sub()[0];
+ b2 = b->sub()[0];
+ if (!TopEqual(a2, b2))
+ return false;
+ // Really:
+ // stk.push_back(a2);
+ // stk.push_back(b2);
+ // break;
+ // but faster to assign directly and loop.
+ a = a2;
+ b = b2;
+ continue;
+ }
+
size_t n = stk.size();
- if (n == 0)
- break;
-
+ if (n == 0)
+ break;
+
DCHECK_GE(n, 2);
- a = stk[n-2];
- b = stk[n-1];
- stk.resize(n-2);
- }
-
- return true;
-}
-
-// Keep in sync with enum RegexpStatusCode in regexp.h
+ a = stk[n-2];
+ b = stk[n-1];
+ stk.resize(n-2);
+ }
+
+ return true;
+}
+
+// Keep in sync with enum RegexpStatusCode in regexp.h
static const char *kErrorStrings[] = {
- "no error",
- "unexpected error",
- "invalid escape sequence",
- "invalid character class",
- "invalid character class range",
- "missing ]",
- "missing )",
+ "no error",
+ "unexpected error",
+ "invalid escape sequence",
+ "invalid character class",
+ "invalid character class range",
+ "missing ]",
+ "missing )",
"unexpected )",
- "trailing \\",
- "no argument for repetition operator",
- "invalid repetition size",
- "bad repetition operator",
- "invalid perl operator",
- "invalid UTF-8",
- "invalid named capture group",
-};
-
+ "trailing \\",
+ "no argument for repetition operator",
+ "invalid repetition size",
+ "bad repetition operator",
+ "invalid perl operator",
+ "invalid UTF-8",
+ "invalid named capture group",
+};
+
std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
- if (code < 0 || code >= arraysize(kErrorStrings))
- code = kRegexpInternalError;
- return kErrorStrings[code];
-}
-
+ if (code < 0 || code >= arraysize(kErrorStrings))
+ code = kRegexpInternalError;
+ return kErrorStrings[code];
+}
+
std::string RegexpStatus::Text() const {
- if (error_arg_.empty())
- return CodeText(code_);
+ if (error_arg_.empty())
+ return CodeText(code_);
std::string s;
- s.append(CodeText(code_));
- s.append(": ");
- s.append(error_arg_.data(), error_arg_.size());
- return s;
-}
-
-void RegexpStatus::Copy(const RegexpStatus& status) {
- code_ = status.code_;
- error_arg_ = status.error_arg_;
-}
-
-typedef int Ignored; // Walker<void> doesn't exist
-
-// Walker subclass to count capturing parens in regexp.
-class NumCapturesWalker : public Regexp::Walker<Ignored> {
- public:
- NumCapturesWalker() : ncapture_(0) {}
- int ncapture() { return ncapture_; }
-
- virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
- if (re->op() == kRegexpCapture)
- ncapture_++;
- return ignored;
- }
-
- virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+ s.append(CodeText(code_));
+ s.append(": ");
+ s.append(error_arg_.data(), error_arg_.size());
+ return s;
+}
+
+void RegexpStatus::Copy(const RegexpStatus& status) {
+ code_ = status.code_;
+ error_arg_ = status.error_arg_;
+}
+
+typedef int Ignored; // Walker<void> doesn't exist
+
+// Walker subclass to count capturing parens in regexp.
+class NumCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+ NumCapturesWalker() : ncapture_(0) {}
+ int ncapture() { return ncapture_; }
+
+ virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+ if (re->op() == kRegexpCapture)
+ ncapture_++;
+ return ignored;
+ }
+
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
- LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
+ LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
#endif
- return ignored;
- }
-
- private:
- int ncapture_;
+ return ignored;
+ }
+
+ private:
+ int ncapture_;
NumCapturesWalker(const NumCapturesWalker&) = delete;
NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
-};
-
-int Regexp::NumCaptures() {
- NumCapturesWalker w;
- w.Walk(this, 0);
- return w.ncapture();
-}
-
-// Walker class to build map of named capture groups and their indices.
-class NamedCapturesWalker : public Regexp::Walker<Ignored> {
- public:
- NamedCapturesWalker() : map_(NULL) {}
- ~NamedCapturesWalker() { delete map_; }
-
+};
+
+int Regexp::NumCaptures() {
+ NumCapturesWalker w;
+ w.Walk(this, 0);
+ return w.ncapture();
+}
+
+// Walker class to build map of named capture groups and their indices.
+class NamedCapturesWalker : public Regexp::Walker<Ignored> {
+ public:
+ NamedCapturesWalker() : map_(NULL) {}
+ ~NamedCapturesWalker() { delete map_; }
+
std::map<std::string, int>* TakeMap() {
std::map<std::string, int>* m = map_;
- map_ = NULL;
- return m;
- }
-
+ map_ = NULL;
+ return m;
+ }
+
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
- if (re->op() == kRegexpCapture && re->name() != NULL) {
- // Allocate map once we find a name.
- if (map_ == NULL)
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
+ // Allocate map once we find a name.
+ if (map_ == NULL)
map_ = new std::map<std::string, int>;
-
- // Record first occurrence of each name.
- // (The rule is that if you have the same name
- // multiple times, only the leftmost one counts.)
+
+ // Record first occurrence of each name.
+ // (The rule is that if you have the same name
+ // multiple times, only the leftmost one counts.)
map_->insert({*re->name(), re->cap()});
- }
- return ignored;
- }
-
- virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+ }
+ return ignored;
+ }
+
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
- LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
+ LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
#endif
- return ignored;
- }
-
- private:
+ return ignored;
+ }
+
+ private:
std::map<std::string, int>* map_;
NamedCapturesWalker(const NamedCapturesWalker&) = delete;
NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
-};
-
+};
+
std::map<std::string, int>* Regexp::NamedCaptures() {
- NamedCapturesWalker w;
- w.Walk(this, 0);
- return w.TakeMap();
-}
-
-// Walker class to build map from capture group indices to their names.
-class CaptureNamesWalker : public Regexp::Walker<Ignored> {
- public:
- CaptureNamesWalker() : map_(NULL) {}
- ~CaptureNamesWalker() { delete map_; }
-
+ NamedCapturesWalker w;
+ w.Walk(this, 0);
+ return w.TakeMap();
+}
+
+// Walker class to build map from capture group indices to their names.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+ public:
+ CaptureNamesWalker() : map_(NULL) {}
+ ~CaptureNamesWalker() { delete map_; }
+
std::map<int, std::string>* TakeMap() {
std::map<int, std::string>* m = map_;
- map_ = NULL;
- return m;
- }
-
+ map_ = NULL;
+ return m;
+ }
+
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
- if (re->op() == kRegexpCapture && re->name() != NULL) {
- // Allocate map once we find a name.
- if (map_ == NULL)
+ if (re->op() == kRegexpCapture && re->name() != NULL) {
+ // Allocate map once we find a name.
+ if (map_ == NULL)
map_ = new std::map<int, std::string>;
-
- (*map_)[re->cap()] = *re->name();
- }
- return ignored;
- }
-
- virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+
+ (*map_)[re->cap()] = *re->name();
+ }
+ return ignored;
+ }
+
+ virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
- LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+ LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
#endif
- return ignored;
- }
-
- private:
+ return ignored;
+ }
+
+ private:
std::map<int, std::string>* map_;
CaptureNamesWalker(const CaptureNamesWalker&) = delete;
CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
-};
-
+};
+
std::map<int, std::string>* Regexp::CaptureNames() {
- CaptureNamesWalker w;
- w.Walk(this, 0);
- return w.TakeMap();
-}
-
+ CaptureNamesWalker w;
+ w.Walk(this, 0);
+ return w.TakeMap();
+}
+
void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
std::string* bytes) {
if (latin1) {
@@ -671,48 +671,48 @@ void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
}
}
-// Determines whether regexp matches must be anchored
-// with a fixed string prefix. If so, returns the prefix and
-// the regexp that remains after the prefix. The prefix might
-// be ASCII case-insensitive.
+// Determines whether regexp matches must be anchored
+// with a fixed string prefix. If so, returns the prefix and
+// the regexp that remains after the prefix. The prefix might
+// be ASCII case-insensitive.
bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix) {
prefix->clear();
*foldcase = false;
*suffix = NULL;
- // No need for a walker: the regexp must be of the form
- // 1. some number of ^ anchors
- // 2. a literal char or string
- // 3. the rest
- if (op_ != kRegexpConcat)
- return false;
- int i = 0;
+ // No need for a walker: the regexp must be of the form
+ // 1. some number of ^ anchors
+ // 2. a literal char or string
+ // 3. the rest
+ if (op_ != kRegexpConcat)
+ return false;
+ int i = 0;
while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
- i++;
+ i++;
if (i == 0 || i >= nsub_)
- return false;
+ return false;
Regexp* re = sub()[i];
if (re->op_ != kRegexpLiteral &&
re->op_ != kRegexpLiteralString)
return false;
i++;
- if (i < nsub_) {
- for (int j = i; j < nsub_; j++)
+ if (i < nsub_) {
+ for (int j = i; j < nsub_; j++)
sub()[j]->Incref();
*suffix = Concat(sub() + i, nsub_ - i, parse_flags());
- } else {
+ } else {
*suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
- }
+ }
bool latin1 = (re->parse_flags() & Latin1) != 0;
Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
ConvertRunesToBytes(latin1, runes, nrunes, prefix);
*foldcase = (re->parse_flags() & FoldCase) != 0;
- return true;
-}
-
+ return true;
+}
+
// Determines whether regexp matches must be unanchored
// with a fixed string prefix. If so, returns the prefix.
// The prefix might be ASCII case-insensitive.
@@ -741,246 +741,246 @@ bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
return true;
}
-// Character class builder is a balanced binary tree (STL set)
-// containing non-overlapping, non-abutting RuneRanges.
-// The less-than operator used in the tree treats two
-// ranges as equal if they overlap at all, so that
-// lookups for a particular Rune are possible.
-
-CharClassBuilder::CharClassBuilder() {
- nrunes_ = 0;
- upper_ = 0;
- lower_ = 0;
-}
-
-// Add lo-hi to the class; return whether class got bigger.
-bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
- if (hi < lo)
- return false;
-
- if (lo <= 'z' && hi >= 'A') {
- // Overlaps some alpha, maybe not all.
- // Update bitmaps telling which ASCII letters are in the set.
+// Character class builder is a balanced binary tree (STL set)
+// containing non-overlapping, non-abutting RuneRanges.
+// The less-than operator used in the tree treats two
+// ranges as equal if they overlap at all, so that
+// lookups for a particular Rune are possible.
+
+CharClassBuilder::CharClassBuilder() {
+ nrunes_ = 0;
+ upper_ = 0;
+ lower_ = 0;
+}
+
+// Add lo-hi to the class; return whether class got bigger.
+bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
+ if (hi < lo)
+ return false;
+
+ if (lo <= 'z' && hi >= 'A') {
+ // Overlaps some alpha, maybe not all.
+ // Update bitmaps telling which ASCII letters are in the set.
Rune lo1 = std::max<Rune>(lo, 'A');
Rune hi1 = std::min<Rune>(hi, 'Z');
- if (lo1 <= hi1)
- upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
-
+ if (lo1 <= hi1)
+ upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
+
lo1 = std::max<Rune>(lo, 'a');
hi1 = std::min<Rune>(hi, 'z');
- if (lo1 <= hi1)
- lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
- }
-
- { // Check whether lo, hi is already in the class.
- iterator it = ranges_.find(RuneRange(lo, lo));
- if (it != end() && it->lo <= lo && hi <= it->hi)
- return false;
- }
-
- // Look for a range abutting lo on the left.
- // If it exists, take it out and increase our range.
- if (lo > 0) {
- iterator it = ranges_.find(RuneRange(lo-1, lo-1));
- if (it != end()) {
- lo = it->lo;
- if (it->hi > hi)
- hi = it->hi;
- nrunes_ -= it->hi - it->lo + 1;
- ranges_.erase(it);
- }
- }
-
- // Look for a range abutting hi on the right.
- // If it exists, take it out and increase our range.
- if (hi < Runemax) {
- iterator it = ranges_.find(RuneRange(hi+1, hi+1));
- if (it != end()) {
- hi = it->hi;
- nrunes_ -= it->hi - it->lo + 1;
- ranges_.erase(it);
- }
- }
-
- // Look for ranges between lo and hi. Take them out.
- // This is only safe because the set has no overlapping ranges.
- // We've already removed any ranges abutting lo and hi, so
- // any that overlap [lo, hi] must be contained within it.
- for (;;) {
- iterator it = ranges_.find(RuneRange(lo, hi));
- if (it == end())
- break;
- nrunes_ -= it->hi - it->lo + 1;
- ranges_.erase(it);
- }
-
- // Finally, add [lo, hi].
- nrunes_ += hi - lo + 1;
- ranges_.insert(RuneRange(lo, hi));
- return true;
-}
-
-void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
- for (iterator it = cc->begin(); it != cc->end(); ++it)
- AddRange(it->lo, it->hi);
-}
-
-bool CharClassBuilder::Contains(Rune r) {
- return ranges_.find(RuneRange(r, r)) != end();
-}
-
-// Does the character class behave the same on A-Z as on a-z?
-bool CharClassBuilder::FoldsASCII() {
- return ((upper_ ^ lower_) & AlphaMask) == 0;
-}
-
-CharClassBuilder* CharClassBuilder::Copy() {
- CharClassBuilder* cc = new CharClassBuilder;
- for (iterator it = begin(); it != end(); ++it)
- cc->ranges_.insert(RuneRange(it->lo, it->hi));
- cc->upper_ = upper_;
- cc->lower_ = lower_;
- cc->nrunes_ = nrunes_;
- return cc;
-}
-
-
-
-void CharClassBuilder::RemoveAbove(Rune r) {
- if (r >= Runemax)
- return;
-
- if (r < 'z') {
- if (r < 'a')
- lower_ = 0;
- else
- lower_ &= AlphaMask >> ('z' - r);
- }
-
- if (r < 'Z') {
- if (r < 'A')
- upper_ = 0;
- else
- upper_ &= AlphaMask >> ('Z' - r);
- }
-
- for (;;) {
-
- iterator it = ranges_.find(RuneRange(r + 1, Runemax));
- if (it == end())
- break;
- RuneRange rr = *it;
- ranges_.erase(it);
- nrunes_ -= rr.hi - rr.lo + 1;
- if (rr.lo <= r) {
- rr.hi = r;
- ranges_.insert(rr);
- nrunes_ += rr.hi - rr.lo + 1;
- }
- }
-}
-
-void CharClassBuilder::Negate() {
- // Build up negation and then copy in.
- // Could edit ranges in place, but C++ won't let me.
+ if (lo1 <= hi1)
+ lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
+ }
+
+ { // Check whether lo, hi is already in the class.
+ iterator it = ranges_.find(RuneRange(lo, lo));
+ if (it != end() && it->lo <= lo && hi <= it->hi)
+ return false;
+ }
+
+ // Look for a range abutting lo on the left.
+ // If it exists, take it out and increase our range.
+ if (lo > 0) {
+ iterator it = ranges_.find(RuneRange(lo-1, lo-1));
+ if (it != end()) {
+ lo = it->lo;
+ if (it->hi > hi)
+ hi = it->hi;
+ nrunes_ -= it->hi - it->lo + 1;
+ ranges_.erase(it);
+ }
+ }
+
+ // Look for a range abutting hi on the right.
+ // If it exists, take it out and increase our range.
+ if (hi < Runemax) {
+ iterator it = ranges_.find(RuneRange(hi+1, hi+1));
+ if (it != end()) {
+ hi = it->hi;
+ nrunes_ -= it->hi - it->lo + 1;
+ ranges_.erase(it);
+ }
+ }
+
+ // Look for ranges between lo and hi. Take them out.
+ // This is only safe because the set has no overlapping ranges.
+ // We've already removed any ranges abutting lo and hi, so
+ // any that overlap [lo, hi] must be contained within it.
+ for (;;) {
+ iterator it = ranges_.find(RuneRange(lo, hi));
+ if (it == end())
+ break;
+ nrunes_ -= it->hi - it->lo + 1;
+ ranges_.erase(it);
+ }
+
+ // Finally, add [lo, hi].
+ nrunes_ += hi - lo + 1;
+ ranges_.insert(RuneRange(lo, hi));
+ return true;
+}
+
+void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
+ for (iterator it = cc->begin(); it != cc->end(); ++it)
+ AddRange(it->lo, it->hi);
+}
+
+bool CharClassBuilder::Contains(Rune r) {
+ return ranges_.find(RuneRange(r, r)) != end();
+}
+
+// Does the character class behave the same on A-Z as on a-z?
+bool CharClassBuilder::FoldsASCII() {
+ return ((upper_ ^ lower_) & AlphaMask) == 0;
+}
+
+CharClassBuilder* CharClassBuilder::Copy() {
+ CharClassBuilder* cc = new CharClassBuilder;
+ for (iterator it = begin(); it != end(); ++it)
+ cc->ranges_.insert(RuneRange(it->lo, it->hi));
+ cc->upper_ = upper_;
+ cc->lower_ = lower_;
+ cc->nrunes_ = nrunes_;
+ return cc;
+}
+
+
+
+void CharClassBuilder::RemoveAbove(Rune r) {
+ if (r >= Runemax)
+ return;
+
+ if (r < 'z') {
+ if (r < 'a')
+ lower_ = 0;
+ else
+ lower_ &= AlphaMask >> ('z' - r);
+ }
+
+ if (r < 'Z') {
+ if (r < 'A')
+ upper_ = 0;
+ else
+ upper_ &= AlphaMask >> ('Z' - r);
+ }
+
+ for (;;) {
+
+ iterator it = ranges_.find(RuneRange(r + 1, Runemax));
+ if (it == end())
+ break;
+ RuneRange rr = *it;
+ ranges_.erase(it);
+ nrunes_ -= rr.hi - rr.lo + 1;
+ if (rr.lo <= r) {
+ rr.hi = r;
+ ranges_.insert(rr);
+ nrunes_ += rr.hi - rr.lo + 1;
+ }
+ }
+}
+
+void CharClassBuilder::Negate() {
+ // Build up negation and then copy in.
+ // Could edit ranges in place, but C++ won't let me.
std::vector<RuneRange> v;
- v.reserve(ranges_.size() + 1);
-
- // In negation, first range begins at 0, unless
- // the current class begins at 0.
- iterator it = begin();
- if (it == end()) {
- v.push_back(RuneRange(0, Runemax));
- } else {
- int nextlo = 0;
- if (it->lo == 0) {
- nextlo = it->hi + 1;
- ++it;
- }
- for (; it != end(); ++it) {
- v.push_back(RuneRange(nextlo, it->lo - 1));
- nextlo = it->hi + 1;
- }
- if (nextlo <= Runemax)
- v.push_back(RuneRange(nextlo, Runemax));
- }
-
- ranges_.clear();
+ v.reserve(ranges_.size() + 1);
+
+ // In negation, first range begins at 0, unless
+ // the current class begins at 0.
+ iterator it = begin();
+ if (it == end()) {
+ v.push_back(RuneRange(0, Runemax));
+ } else {
+ int nextlo = 0;
+ if (it->lo == 0) {
+ nextlo = it->hi + 1;
+ ++it;
+ }
+ for (; it != end(); ++it) {
+ v.push_back(RuneRange(nextlo, it->lo - 1));
+ nextlo = it->hi + 1;
+ }
+ if (nextlo <= Runemax)
+ v.push_back(RuneRange(nextlo, Runemax));
+ }
+
+ ranges_.clear();
for (size_t i = 0; i < v.size(); i++)
- ranges_.insert(v[i]);
-
- upper_ = AlphaMask & ~upper_;
- lower_ = AlphaMask & ~lower_;
- nrunes_ = Runemax+1 - nrunes_;
-}
-
-// Character class is a sorted list of ranges.
-// The ranges are allocated in the same block as the header,
-// necessitating a special allocator and Delete method.
-
+ ranges_.insert(v[i]);
+
+ upper_ = AlphaMask & ~upper_;
+ lower_ = AlphaMask & ~lower_;
+ nrunes_ = Runemax+1 - nrunes_;
+}
+
+// Character class is a sorted list of ranges.
+// The ranges are allocated in the same block as the header,
+// necessitating a special allocator and Delete method.
+
CharClass* CharClass::New(size_t maxranges) {
- CharClass* cc;
+ CharClass* cc;
uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
- cc = reinterpret_cast<CharClass*>(data);
- cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
- cc->nranges_ = 0;
- cc->folds_ascii_ = false;
- cc->nrunes_ = 0;
- return cc;
-}
-
-void CharClass::Delete() {
+ cc = reinterpret_cast<CharClass*>(data);
+ cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
+ cc->nranges_ = 0;
+ cc->folds_ascii_ = false;
+ cc->nrunes_ = 0;
+ return cc;
+}
+
+void CharClass::Delete() {
uint8_t* data = reinterpret_cast<uint8_t*>(this);
- delete[] data;
-}
-
-CharClass* CharClass::Negate() {
+ delete[] data;
+}
+
+CharClass* CharClass::Negate() {
CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
- cc->folds_ascii_ = folds_ascii_;
- cc->nrunes_ = Runemax + 1 - nrunes_;
- int n = 0;
- int nextlo = 0;
- for (CharClass::iterator it = begin(); it != end(); ++it) {
- if (it->lo == nextlo) {
- nextlo = it->hi + 1;
- } else {
- cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
- nextlo = it->hi + 1;
- }
- }
- if (nextlo <= Runemax)
- cc->ranges_[n++] = RuneRange(nextlo, Runemax);
- cc->nranges_ = n;
- return cc;
-}
-
+ cc->folds_ascii_ = folds_ascii_;
+ cc->nrunes_ = Runemax + 1 - nrunes_;
+ int n = 0;
+ int nextlo = 0;
+ for (CharClass::iterator it = begin(); it != end(); ++it) {
+ if (it->lo == nextlo) {
+ nextlo = it->hi + 1;
+ } else {
+ cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
+ nextlo = it->hi + 1;
+ }
+ }
+ if (nextlo <= Runemax)
+ cc->ranges_[n++] = RuneRange(nextlo, Runemax);
+ cc->nranges_ = n;
+ return cc;
+}
+
bool CharClass::Contains(Rune r) const {
- RuneRange* rr = ranges_;
- int n = nranges_;
- while (n > 0) {
- int m = n/2;
- if (rr[m].hi < r) {
- rr += m+1;
- n -= m+1;
- } else if (r < rr[m].lo) {
- n = m;
- } else { // rr[m].lo <= r && r <= rr[m].hi
- return true;
- }
- }
- return false;
-}
-
-CharClass* CharClassBuilder::GetCharClass() {
+ RuneRange* rr = ranges_;
+ int n = nranges_;
+ while (n > 0) {
+ int m = n/2;
+ if (rr[m].hi < r) {
+ rr += m+1;
+ n -= m+1;
+ } else if (r < rr[m].lo) {
+ n = m;
+ } else { // rr[m].lo <= r && r <= rr[m].hi
+ return true;
+ }
+ }
+ return false;
+}
+
+CharClass* CharClassBuilder::GetCharClass() {
CharClass* cc = CharClass::New(ranges_.size());
- int n = 0;
- for (iterator it = begin(); it != end(); ++it)
- cc->ranges_[n++] = *it;
- cc->nranges_ = n;
+ int n = 0;
+ for (iterator it = begin(); it != end(); ++it)
+ cc->ranges_[n++] = *it;
+ cc->nranges_ = n;
DCHECK_LE(n, static_cast<int>(ranges_.size()));
- cc->nrunes_ = nrunes_;
- cc->folds_ascii_ = FoldsASCII();
- return cc;
-}
-
-} // namespace re2
+ cc->nrunes_ = nrunes_;
+ cc->folds_ascii_ = FoldsASCII();
+ return cc;
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/regexp.h b/contrib/libs/re2/re2/regexp.h
index 73dca2d64e..b6446f9fe5 100644
--- a/contrib/libs/re2/re2/regexp.h
+++ b/contrib/libs/re2/re2/regexp.h
@@ -1,283 +1,283 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_REGEXP_H_
#define RE2_REGEXP_H_
-// --- SPONSORED LINK --------------------------------------------------
-// If you want to use this library for regular expression matching,
-// you should use re2/re2.h, which provides a class RE2 that
-// mimics the PCRE interface provided by PCRE's C++ wrappers.
-// This header describes the low-level interface used to implement RE2
-// and may change in backwards-incompatible ways from time to time.
-// In contrast, RE2's interface will not.
-// ---------------------------------------------------------------------
-
-// Regular expression library: parsing, execution, and manipulation
-// of regular expressions.
-//
-// Any operation that traverses the Regexp structures should be written
-// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
-// regular expressions such as x++++++++++++++++++++... might cause recursive
-// traversals to overflow the stack.
-//
-// It is the caller's responsibility to provide appropriate mutual exclusion
-// around manipulation of the regexps. RE2 does this.
-//
-// PARSING
-//
-// Regexp::Parse parses regular expressions encoded in UTF-8.
-// The default syntax is POSIX extended regular expressions,
-// with the following changes:
-//
-// 1. Backreferences (optional in POSIX EREs) are not supported.
-// (Supporting them precludes the use of DFA-based
-// matching engines.)
-//
-// 2. Collating elements and collation classes are not supported.
-// (No one has needed or wanted them.)
-//
-// The exact syntax accepted can be modified by passing flags to
-// Regexp::Parse. In particular, many of the basic Perl additions
-// are available. The flags are documented below (search for LikePerl).
-//
-// If parsed with the flag Regexp::Latin1, both the regular expression
-// and the input to the matching routines are assumed to be encoded in
-// Latin-1, not UTF-8.
-//
-// EXECUTION
-//
-// Once Regexp has parsed a regular expression, it provides methods
-// to search text using that regular expression. These methods are
-// implemented via calling out to other regular expression libraries.
-// (Let's call them the sublibraries.)
-//
-// To call a sublibrary, Regexp does not simply prepare a
-// string version of the regular expression and hand it to the
-// sublibrary. Instead, Regexp prepares, from its own parsed form, the
-// corresponding internal representation used by the sublibrary.
-// This has the drawback of needing to know the internal representation
-// used by the sublibrary, but it has two important benefits:
-//
-// 1. The syntax and meaning of regular expressions is guaranteed
-// to be that used by Regexp's parser, not the syntax expected
-// by the sublibrary. Regexp might accept a restricted or
-// expanded syntax for regular expressions as compared with
-// the sublibrary. As long as Regexp can translate from its
-// internal form into the sublibrary's, clients need not know
-// exactly which sublibrary they are using.
-//
-// 2. The sublibrary parsers are bypassed. For whatever reason,
-// sublibrary regular expression parsers often have security
-// problems. For example, plan9grep's regular expression parser
-// has a buffer overflow in its handling of large character
-// classes, and PCRE's parser has had buffer overflow problems
-// in the past. Security-team requires sandboxing of sublibrary
-// regular expression parsers. Avoiding the sublibrary parsers
-// avoids the sandbox.
-//
-// The execution methods we use now are provided by the compiled form,
-// Prog, described in prog.h
-//
-// MANIPULATION
-//
-// Unlike other regular expression libraries, Regexp makes its parsed
-// form accessible to clients, so that client code can analyze the
-// parsed regular expressions.
-
+// --- SPONSORED LINK --------------------------------------------------
+// If you want to use this library for regular expression matching,
+// you should use re2/re2.h, which provides a class RE2 that
+// mimics the PCRE interface provided by PCRE's C++ wrappers.
+// This header describes the low-level interface used to implement RE2
+// and may change in backwards-incompatible ways from time to time.
+// In contrast, RE2's interface will not.
+// ---------------------------------------------------------------------
+
+// Regular expression library: parsing, execution, and manipulation
+// of regular expressions.
+//
+// Any operation that traverses the Regexp structures should be written
+// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
+// regular expressions such as x++++++++++++++++++++... might cause recursive
+// traversals to overflow the stack.
+//
+// It is the caller's responsibility to provide appropriate mutual exclusion
+// around manipulation of the regexps. RE2 does this.
+//
+// PARSING
+//
+// Regexp::Parse parses regular expressions encoded in UTF-8.
+// The default syntax is POSIX extended regular expressions,
+// with the following changes:
+//
+// 1. Backreferences (optional in POSIX EREs) are not supported.
+// (Supporting them precludes the use of DFA-based
+// matching engines.)
+//
+// 2. Collating elements and collation classes are not supported.
+// (No one has needed or wanted them.)
+//
+// The exact syntax accepted can be modified by passing flags to
+// Regexp::Parse. In particular, many of the basic Perl additions
+// are available. The flags are documented below (search for LikePerl).
+//
+// If parsed with the flag Regexp::Latin1, both the regular expression
+// and the input to the matching routines are assumed to be encoded in
+// Latin-1, not UTF-8.
+//
+// EXECUTION
+//
+// Once Regexp has parsed a regular expression, it provides methods
+// to search text using that regular expression. These methods are
+// implemented via calling out to other regular expression libraries.
+// (Let's call them the sublibraries.)
+//
+// To call a sublibrary, Regexp does not simply prepare a
+// string version of the regular expression and hand it to the
+// sublibrary. Instead, Regexp prepares, from its own parsed form, the
+// corresponding internal representation used by the sublibrary.
+// This has the drawback of needing to know the internal representation
+// used by the sublibrary, but it has two important benefits:
+//
+// 1. The syntax and meaning of regular expressions is guaranteed
+// to be that used by Regexp's parser, not the syntax expected
+// by the sublibrary. Regexp might accept a restricted or
+// expanded syntax for regular expressions as compared with
+// the sublibrary. As long as Regexp can translate from its
+// internal form into the sublibrary's, clients need not know
+// exactly which sublibrary they are using.
+//
+// 2. The sublibrary parsers are bypassed. For whatever reason,
+// sublibrary regular expression parsers often have security
+// problems. For example, plan9grep's regular expression parser
+// has a buffer overflow in its handling of large character
+// classes, and PCRE's parser has had buffer overflow problems
+// in the past. Security-team requires sandboxing of sublibrary
+// regular expression parsers. Avoiding the sublibrary parsers
+// avoids the sandbox.
+//
+// The execution methods we use now are provided by the compiled form,
+// Prog, described in prog.h
+//
+// MANIPULATION
+//
+// Unlike other regular expression libraries, Regexp makes its parsed
+// form accessible to clients, so that client code can analyze the
+// parsed regular expressions.
+
#include <stddef.h>
#include <stdint.h>
#include <map>
#include <set>
#include <string>
-
+
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
-
-namespace re2 {
-
-// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
-enum RegexpOp {
- // Matches no strings.
- kRegexpNoMatch = 1,
-
- // Matches empty string.
- kRegexpEmptyMatch,
-
- // Matches rune_.
- kRegexpLiteral,
-
- // Matches runes_.
- kRegexpLiteralString,
-
- // Matches concatenation of sub_[0..nsub-1].
- kRegexpConcat,
- // Matches union of sub_[0..nsub-1].
- kRegexpAlternate,
-
- // Matches sub_[0] zero or more times.
- kRegexpStar,
- // Matches sub_[0] one or more times.
- kRegexpPlus,
- // Matches sub_[0] zero or one times.
- kRegexpQuest,
-
- // Matches sub_[0] at least min_ times, at most max_ times.
- // max_ == -1 means no upper limit.
- kRegexpRepeat,
-
- // Parenthesized (capturing) subexpression. Index is cap_.
- // Optionally, capturing name is name_.
- kRegexpCapture,
-
- // Matches any character.
- kRegexpAnyChar,
-
- // Matches any byte [sic].
- kRegexpAnyByte,
-
- // Matches empty string at beginning of line.
- kRegexpBeginLine,
- // Matches empty string at end of line.
- kRegexpEndLine,
-
- // Matches word boundary "\b".
- kRegexpWordBoundary,
- // Matches not-a-word boundary "\B".
- kRegexpNoWordBoundary,
-
- // Matches empty string at beginning of text.
- kRegexpBeginText,
- // Matches empty string at end of text.
- kRegexpEndText,
-
- // Matches character class given by cc_.
- kRegexpCharClass,
-
- // Forces match of entire expression right now,
- // with match ID match_id_ (used by RE2::Set).
- kRegexpHaveMatch,
-
- kMaxRegexpOp = kRegexpHaveMatch,
-};
-
-// Keep in sync with string list in regexp.cc
-enum RegexpStatusCode {
- // No error
- kRegexpSuccess = 0,
-
- // Unexpected error
- kRegexpInternalError,
-
- // Parse errors
- kRegexpBadEscape, // bad escape sequence
- kRegexpBadCharClass, // bad character class
- kRegexpBadCharRange, // bad character class range
- kRegexpMissingBracket, // missing closing ]
- kRegexpMissingParen, // missing closing )
+
+namespace re2 {
+
+// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
+enum RegexpOp {
+ // Matches no strings.
+ kRegexpNoMatch = 1,
+
+ // Matches empty string.
+ kRegexpEmptyMatch,
+
+ // Matches rune_.
+ kRegexpLiteral,
+
+ // Matches runes_.
+ kRegexpLiteralString,
+
+ // Matches concatenation of sub_[0..nsub-1].
+ kRegexpConcat,
+ // Matches union of sub_[0..nsub-1].
+ kRegexpAlternate,
+
+ // Matches sub_[0] zero or more times.
+ kRegexpStar,
+ // Matches sub_[0] one or more times.
+ kRegexpPlus,
+ // Matches sub_[0] zero or one times.
+ kRegexpQuest,
+
+ // Matches sub_[0] at least min_ times, at most max_ times.
+ // max_ == -1 means no upper limit.
+ kRegexpRepeat,
+
+ // Parenthesized (capturing) subexpression. Index is cap_.
+ // Optionally, capturing name is name_.
+ kRegexpCapture,
+
+ // Matches any character.
+ kRegexpAnyChar,
+
+ // Matches any byte [sic].
+ kRegexpAnyByte,
+
+ // Matches empty string at beginning of line.
+ kRegexpBeginLine,
+ // Matches empty string at end of line.
+ kRegexpEndLine,
+
+ // Matches word boundary "\b".
+ kRegexpWordBoundary,
+ // Matches not-a-word boundary "\B".
+ kRegexpNoWordBoundary,
+
+ // Matches empty string at beginning of text.
+ kRegexpBeginText,
+ // Matches empty string at end of text.
+ kRegexpEndText,
+
+ // Matches character class given by cc_.
+ kRegexpCharClass,
+
+ // Forces match of entire expression right now,
+ // with match ID match_id_ (used by RE2::Set).
+ kRegexpHaveMatch,
+
+ kMaxRegexpOp = kRegexpHaveMatch,
+};
+
+// Keep in sync with string list in regexp.cc
+enum RegexpStatusCode {
+ // No error
+ kRegexpSuccess = 0,
+
+ // Unexpected error
+ kRegexpInternalError,
+
+ // Parse errors
+ kRegexpBadEscape, // bad escape sequence
+ kRegexpBadCharClass, // bad character class
+ kRegexpBadCharRange, // bad character class range
+ kRegexpMissingBracket, // missing closing ]
+ kRegexpMissingParen, // missing closing )
kRegexpUnexpectedParen, // unexpected closing )
- kRegexpTrailingBackslash, // at end of regexp
- kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
- kRegexpRepeatSize, // bad repetition argument
- kRegexpRepeatOp, // bad repetition operator
- kRegexpBadPerlOp, // bad perl operator
- kRegexpBadUTF8, // invalid UTF-8 in regexp
- kRegexpBadNamedCapture, // bad named capture
-};
-
-// Error status for certain operations.
-class RegexpStatus {
- public:
- RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
- ~RegexpStatus() { delete tmp_; }
-
+ kRegexpTrailingBackslash, // at end of regexp
+ kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
+ kRegexpRepeatSize, // bad repetition argument
+ kRegexpRepeatOp, // bad repetition operator
+ kRegexpBadPerlOp, // bad perl operator
+ kRegexpBadUTF8, // invalid UTF-8 in regexp
+ kRegexpBadNamedCapture, // bad named capture
+};
+
+// Error status for certain operations.
+class RegexpStatus {
+ public:
+ RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
+ ~RegexpStatus() { delete tmp_; }
+
void set_code(RegexpStatusCode code) { code_ = code; }
- void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
+ void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }
RegexpStatusCode code() const { return code_; }
- const StringPiece& error_arg() const { return error_arg_; }
- bool ok() const { return code() == kRegexpSuccess; }
-
- // Copies state from status.
- void Copy(const RegexpStatus& status);
-
- // Returns text equivalent of code, e.g.:
- // "Bad character class"
+ const StringPiece& error_arg() const { return error_arg_; }
+ bool ok() const { return code() == kRegexpSuccess; }
+
+ // Copies state from status.
+ void Copy(const RegexpStatus& status);
+
+ // Returns text equivalent of code, e.g.:
+ // "Bad character class"
static std::string CodeText(RegexpStatusCode code);
-
- // Returns text describing error, e.g.:
- // "Bad character class: [z-a]"
+
+ // Returns text describing error, e.g.:
+ // "Bad character class: [z-a]"
std::string Text() const;
-
- private:
+
+ private:
RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
std::string* tmp_; // Temporary storage, possibly where error_arg_ is.
-
+
RegexpStatus(const RegexpStatus&) = delete;
RegexpStatus& operator=(const RegexpStatus&) = delete;
-};
-
-// Compiled form; see prog.h
-class Prog;
-
-struct RuneRange {
- RuneRange() : lo(0), hi(0) { }
- RuneRange(int l, int h) : lo(l), hi(h) { }
- Rune lo;
- Rune hi;
-};
-
-// Less-than on RuneRanges treats a == b if they overlap at all.
-// This lets us look in a set to find the range covering a particular Rune.
-struct RuneRangeLess {
- bool operator()(const RuneRange& a, const RuneRange& b) const {
- return a.hi < b.lo;
- }
-};
-
-class CharClassBuilder;
-
-class CharClass {
- public:
- void Delete();
-
- typedef RuneRange* iterator;
- iterator begin() { return ranges_; }
- iterator end() { return ranges_ + nranges_; }
-
- int size() { return nrunes_; }
- bool empty() { return nrunes_ == 0; }
- bool full() { return nrunes_ == Runemax+1; }
- bool FoldsASCII() { return folds_ascii_; }
-
+};
+
+// Compiled form; see prog.h
+class Prog;
+
+struct RuneRange {
+ RuneRange() : lo(0), hi(0) { }
+ RuneRange(int l, int h) : lo(l), hi(h) { }
+ Rune lo;
+ Rune hi;
+};
+
+// Less-than on RuneRanges treats a == b if they overlap at all.
+// This lets us look in a set to find the range covering a particular Rune.
+struct RuneRangeLess {
+ bool operator()(const RuneRange& a, const RuneRange& b) const {
+ return a.hi < b.lo;
+ }
+};
+
+class CharClassBuilder;
+
+class CharClass {
+ public:
+ void Delete();
+
+ typedef RuneRange* iterator;
+ iterator begin() { return ranges_; }
+ iterator end() { return ranges_ + nranges_; }
+
+ int size() { return nrunes_; }
+ bool empty() { return nrunes_ == 0; }
+ bool full() { return nrunes_ == Runemax+1; }
+ bool FoldsASCII() { return folds_ascii_; }
+
bool Contains(Rune r) const;
- CharClass* Negate();
-
- private:
- CharClass(); // not implemented
- ~CharClass(); // not implemented
+ CharClass* Negate();
+
+ private:
+ CharClass(); // not implemented
+ ~CharClass(); // not implemented
static CharClass* New(size_t maxranges);
-
- friend class CharClassBuilder;
-
- bool folds_ascii_;
- int nrunes_;
- RuneRange *ranges_;
- int nranges_;
+
+ friend class CharClassBuilder;
+
+ bool folds_ascii_;
+ int nrunes_;
+ RuneRange *ranges_;
+ int nranges_;
CharClass(const CharClass&) = delete;
CharClass& operator=(const CharClass&) = delete;
-};
-
-class Regexp {
- public:
-
- // Flags for parsing. Can be ORed together.
- enum ParseFlags {
+};
+
+class Regexp {
+ public:
+
+ // Flags for parsing. Can be ORed together.
+ enum ParseFlags {
NoParseFlags = 0,
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
Literal = 1<<1, // Treat s as literal string instead of a regexp.
@@ -309,139 +309,139 @@ class Regexp {
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
// it explicitly.
NeverCapture = 1<<12, // Parse all parens as non-capturing.
-
- // As close to Perl as we can get.
+
+ // As close to Perl as we can get.
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
UnicodeGroups,
-
- // Internal use only.
+
+ // Internal use only.
WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
AllParseFlags = (1<<14)-1,
- };
-
- // Get. No set, Regexps are logically immutable once created.
- RegexpOp op() { return static_cast<RegexpOp>(op_); }
- int nsub() { return nsub_; }
+ };
+
+ // Get. No set, Regexps are logically immutable once created.
+ RegexpOp op() { return static_cast<RegexpOp>(op_); }
+ int nsub() { return nsub_; }
bool simple() { return simple_ != 0; }
ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
- int Ref(); // For testing.
-
- Regexp** sub() {
- if(nsub_ <= 1)
- return &subone_;
- else
- return submany_;
- }
-
- int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
- int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
- Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
- CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
- int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
+ int Ref(); // For testing.
+
+ Regexp** sub() {
+ if(nsub_ <= 1)
+ return &subone_;
+ else
+ return submany_;
+ }
+
+ int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
+ int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
+ Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
+ CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
+ int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
- Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
- int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
- int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
-
- // Increments reference count, returns object as convenience.
- Regexp* Incref();
-
- // Decrements reference count and deletes this object if count reaches 0.
- void Decref();
-
- // Parses string s to produce regular expression, returned.
- // Caller must release return value with re->Decref().
- // On failure, sets *status (if status != NULL) and returns NULL.
- static Regexp* Parse(const StringPiece& s, ParseFlags flags,
- RegexpStatus* status);
-
- // Returns a _new_ simplified version of the current regexp.
- // Does not edit the current regexp.
- // Caller must release return value with re->Decref().
- // Simplified means that counted repetition has been rewritten
- // into simpler terms and all Perl/POSIX features have been
- // removed. The result will capture exactly the same
- // subexpressions the original did, unless formatted with ToString.
- Regexp* Simplify();
+ Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
+ int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
+ int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
+
+ // Increments reference count, returns object as convenience.
+ Regexp* Incref();
+
+ // Decrements reference count and deletes this object if count reaches 0.
+ void Decref();
+
+ // Parses string s to produce regular expression, returned.
+ // Caller must release return value with re->Decref().
+ // On failure, sets *status (if status != NULL) and returns NULL.
+ static Regexp* Parse(const StringPiece& s, ParseFlags flags,
+ RegexpStatus* status);
+
+ // Returns a _new_ simplified version of the current regexp.
+ // Does not edit the current regexp.
+ // Caller must release return value with re->Decref().
+ // Simplified means that counted repetition has been rewritten
+ // into simpler terms and all Perl/POSIX features have been
+ // removed. The result will capture exactly the same
+ // subexpressions the original did, unless formatted with ToString.
+ Regexp* Simplify();
friend class CoalesceWalker;
- friend class SimplifyWalker;
-
- // Parses the regexp src and then simplifies it and sets *dst to the
- // string representation of the simplified form. Returns true on success.
- // Returns false and sets *status (if status != NULL) on parse error.
- static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+ friend class SimplifyWalker;
+
+ // Parses the regexp src and then simplifies it and sets *dst to the
+ // string representation of the simplified form. Returns true on success.
+ // Returns false and sets *status (if status != NULL) on parse error.
+ static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
std::string* dst, RegexpStatus* status);
-
- // Returns the number of capturing groups in the regexp.
- int NumCaptures();
- friend class NumCapturesWalker;
-
- // Returns a map from names to capturing group indices,
- // or NULL if the regexp contains no named capture groups.
- // The caller is responsible for deleting the map.
+
+ // Returns the number of capturing groups in the regexp.
+ int NumCaptures();
+ friend class NumCapturesWalker;
+
+ // Returns a map from names to capturing group indices,
+ // or NULL if the regexp contains no named capture groups.
+ // The caller is responsible for deleting the map.
std::map<std::string, int>* NamedCaptures();
-
- // Returns a map from capturing group indices to capturing group
- // names or NULL if the regexp contains no named capture groups. The
- // caller is responsible for deleting the map.
+
+ // Returns a map from capturing group indices to capturing group
+ // names or NULL if the regexp contains no named capture groups. The
+ // caller is responsible for deleting the map.
std::map<int, std::string>* CaptureNames();
-
- // Returns a string representation of the current regexp,
- // using as few parentheses as possible.
+
+ // Returns a string representation of the current regexp,
+ // using as few parentheses as possible.
std::string ToString();
-
- // Convenience functions. They consume the passed reference,
- // so in many cases you should use, e.g., Plus(re->Incref(), flags).
- // They do not consume allocated arrays like subs or runes.
- static Regexp* Plus(Regexp* sub, ParseFlags flags);
- static Regexp* Star(Regexp* sub, ParseFlags flags);
- static Regexp* Quest(Regexp* sub, ParseFlags flags);
- static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
- static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
- static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
- static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
- static Regexp* NewLiteral(Rune rune, ParseFlags flags);
- static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
- static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
- static Regexp* HaveMatch(int match_id, ParseFlags flags);
-
- // Like Alternate but does not factor out common prefixes.
- static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
-
- // Debugging function. Returns string format for regexp
- // that makes structure clear. Does NOT use regexp syntax.
+
+ // Convenience functions. They consume the passed reference,
+ // so in many cases you should use, e.g., Plus(re->Incref(), flags).
+ // They do not consume allocated arrays like subs or runes.
+ static Regexp* Plus(Regexp* sub, ParseFlags flags);
+ static Regexp* Star(Regexp* sub, ParseFlags flags);
+ static Regexp* Quest(Regexp* sub, ParseFlags flags);
+ static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
+ static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
+ static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
+ static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
+ static Regexp* NewLiteral(Rune rune, ParseFlags flags);
+ static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
+ static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
+ static Regexp* HaveMatch(int match_id, ParseFlags flags);
+
+ // Like Alternate but does not factor out common prefixes.
+ static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
+
+ // Debugging function. Returns string format for regexp
+ // that makes structure clear. Does NOT use regexp syntax.
std::string Dump();
-
- // Helper traversal class, defined fully in walker-inl.h.
- template<typename T> class Walker;
-
- // Compile to Prog. See prog.h
- // Reverse prog expects to be run over text backward.
- // Construction and execution of prog will
- // stay within approximately max_mem bytes of memory.
- // If max_mem <= 0, a reasonable default is used.
+
+ // Helper traversal class, defined fully in walker-inl.h.
+ template<typename T> class Walker;
+
+ // Compile to Prog. See prog.h
+ // Reverse prog expects to be run over text backward.
+ // Construction and execution of prog will
+ // stay within approximately max_mem bytes of memory.
+ // If max_mem <= 0, a reasonable default is used.
Prog* CompileToProg(int64_t max_mem);
Prog* CompileToReverseProg(int64_t max_mem);
-
- // Whether to expect this library to find exactly the same answer as PCRE
- // when running this regexp. Most regexps do mimic PCRE exactly, but a few
- // obscure cases behave differently. Technically this is more a property
- // of the Prog than the Regexp, but the computation is much easier to do
- // on the Regexp. See mimics_pcre.cc for the exact conditions.
- bool MimicsPCRE();
-
- // Benchmarking function.
- void NullWalk();
-
- // Whether every match of this regexp must be anchored and
- // begin with a non-empty fixed string (perhaps after ASCII
- // case-folding). If so, returns the prefix and the sub-regexp that
- // follows it.
+
+ // Whether to expect this library to find exactly the same answer as PCRE
+ // when running this regexp. Most regexps do mimic PCRE exactly, but a few
+ // obscure cases behave differently. Technically this is more a property
+ // of the Prog than the Regexp, but the computation is much easier to do
+ // on the Regexp. See mimics_pcre.cc for the exact conditions.
+ bool MimicsPCRE();
+
+ // Benchmarking function.
+ void NullWalk();
+
+ // Whether every match of this regexp must be anchored and
+ // begin with a non-empty fixed string (perhaps after ASCII
+ // case-folding). If so, returns the prefix and the sub-regexp that
+ // follows it.
// Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
// regardless of the return value.
bool RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix);
-
+
// Whether every match of this regexp must be unanchored and
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix.
@@ -453,213 +453,213 @@ class Regexp {
// FOR FUZZING ONLY.
static void FUZZING_ONLY_set_maximum_repeat_count(int i);
- private:
- // Constructor allocates vectors as appropriate for operator.
- explicit Regexp(RegexpOp op, ParseFlags parse_flags);
-
- // Use Decref() instead of delete to release Regexps.
- // This is private to catch deletes at compile time.
- ~Regexp();
- void Destroy();
- bool QuickDestroy();
-
- // Helpers for Parse. Listed here so they can edit Regexps.
- class ParseState;
-
- friend class ParseState;
- friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
- RegexpStatus* status);
-
- // Helper for testing [sic].
- friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
-
- // Computes whether Regexp is already simple.
- bool ComputeSimple();
-
+ private:
+ // Constructor allocates vectors as appropriate for operator.
+ explicit Regexp(RegexpOp op, ParseFlags parse_flags);
+
+ // Use Decref() instead of delete to release Regexps.
+ // This is private to catch deletes at compile time.
+ ~Regexp();
+ void Destroy();
+ bool QuickDestroy();
+
+ // Helpers for Parse. Listed here so they can edit Regexps.
+ class ParseState;
+
+ friend class ParseState;
+ friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
+ RegexpStatus* status);
+
+ // Helper for testing [sic].
+ friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
+
+ // Computes whether Regexp is already simple.
+ bool ComputeSimple();
+
// Constructor that generates a Star, Plus or Quest,
// squashing the pair if sub is also a Star, Plus or Quest.
static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
- // Constructor that generates a concatenation or alternation,
- // enforcing the limit on the number of subexpressions for
- // a particular Regexp.
- static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
- ParseFlags flags, bool can_factor);
-
- // Returns the leading string that re starts with.
- // The returned Rune* points into a piece of re,
- // so it must not be used after the caller calls re->Decref().
- static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
-
- // Removes the first n leading runes from the beginning of re.
- // Edits re in place.
- static void RemoveLeadingString(Regexp* re, int n);
-
- // Returns the leading regexp in re's top-level concatenation.
- // The returned Regexp* points at re or a sub-expression of re,
- // so it must not be used after the caller calls re->Decref().
- static Regexp* LeadingRegexp(Regexp* re);
-
- // Removes LeadingRegexp(re) from re and returns the remainder.
- // Might edit re in place.
- static Regexp* RemoveLeadingRegexp(Regexp* re);
-
- // Simplifies an alternation of literal strings by factoring out
- // common prefixes.
- static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
+ // Constructor that generates a concatenation or alternation,
+ // enforcing the limit on the number of subexpressions for
+ // a particular Regexp.
+ static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
+ ParseFlags flags, bool can_factor);
+
+ // Returns the leading string that re starts with.
+ // The returned Rune* points into a piece of re,
+ // so it must not be used after the caller calls re->Decref().
+ static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
+
+ // Removes the first n leading runes from the beginning of re.
+ // Edits re in place.
+ static void RemoveLeadingString(Regexp* re, int n);
+
+ // Returns the leading regexp in re's top-level concatenation.
+ // The returned Regexp* points at re or a sub-expression of re,
+ // so it must not be used after the caller calls re->Decref().
+ static Regexp* LeadingRegexp(Regexp* re);
+
+ // Removes LeadingRegexp(re) from re and returns the remainder.
+ // Might edit re in place.
+ static Regexp* RemoveLeadingRegexp(Regexp* re);
+
+ // Simplifies an alternation of literal strings by factoring out
+ // common prefixes.
+ static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
friend class FactorAlternationImpl;
-
- // Is a == b? Only efficient on regexps that have not been through
- // Simplify yet - the expansion of a kRegexpRepeat will make this
- // take a long time. Do not call on such regexps, hence private.
- static bool Equal(Regexp* a, Regexp* b);
-
- // Allocate space for n sub-regexps.
- void AllocSub(int n) {
+
+ // Is a == b? Only efficient on regexps that have not been through
+ // Simplify yet - the expansion of a kRegexpRepeat will make this
+ // take a long time. Do not call on such regexps, hence private.
+ static bool Equal(Regexp* a, Regexp* b);
+
+ // Allocate space for n sub-regexps.
+ void AllocSub(int n) {
DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
- if (n > 1)
- submany_ = new Regexp*[n];
+ if (n > 1)
+ submany_ = new Regexp*[n];
nsub_ = static_cast<uint16_t>(n);
- }
-
- // Add Rune to LiteralString
- void AddRuneToString(Rune r);
-
- // Swaps this with that, in place.
- void Swap(Regexp *that);
-
- // Operator. See description of operators above.
+ }
+
+ // Add Rune to LiteralString
+ void AddRuneToString(Rune r);
+
+ // Swaps this with that, in place.
+ void Swap(Regexp *that);
+
+ // Operator. See description of operators above.
// uint8_t instead of RegexpOp to control space usage.
uint8_t op_;
-
- // Is this regexp structure already simple
- // (has it been returned by Simplify)?
+
+ // Is this regexp structure already simple
+ // (has it been returned by Simplify)?
// uint8_t instead of bool to control space usage.
uint8_t simple_;
-
- // Flags saved from parsing and used during execution.
- // (Only FoldCase is used.)
+
+ // Flags saved from parsing and used during execution.
+ // (Only FoldCase is used.)
// uint16_t instead of ParseFlags to control space usage.
uint16_t parse_flags_;
-
- // Reference count. Exists so that SimplifyRegexp can build
- // regexp structures that are dags rather than trees to avoid
- // exponential blowup in space requirements.
+
+ // Reference count. Exists so that SimplifyRegexp can build
+ // regexp structures that are dags rather than trees to avoid
+ // exponential blowup in space requirements.
// uint16_t to control space usage.
- // The standard regexp routines will never generate a
+ // The standard regexp routines will never generate a
// ref greater than the maximum repeat count (kMaxRepeat),
- // but even so, Incref and Decref consult an overflow map
- // when ref_ reaches kMaxRef.
+ // but even so, Incref and Decref consult an overflow map
+ // when ref_ reaches kMaxRef.
uint16_t ref_;
static const uint16_t kMaxRef = 0xffff;
-
- // Subexpressions.
+
+ // Subexpressions.
// uint16_t to control space usage.
- // Concat and Alternate handle larger numbers of subexpressions
- // by building concatenation or alternation trees.
- // Other routines should call Concat or Alternate instead of
- // filling in sub() by hand.
+ // Concat and Alternate handle larger numbers of subexpressions
+ // by building concatenation or alternation trees.
+ // Other routines should call Concat or Alternate instead of
+ // filling in sub() by hand.
uint16_t nsub_;
static const uint16_t kMaxNsub = 0xffff;
- union {
- Regexp** submany_; // if nsub_ > 1
- Regexp* subone_; // if nsub_ == 1
- };
-
- // Extra space for parse and teardown stacks.
- Regexp* down_;
-
- // Arguments to operator. See description of operators above.
- union {
- struct { // Repeat
- int max_;
- int min_;
- };
- struct { // Capture
- int cap_;
+ union {
+ Regexp** submany_; // if nsub_ > 1
+ Regexp* subone_; // if nsub_ == 1
+ };
+
+ // Extra space for parse and teardown stacks.
+ Regexp* down_;
+
+ // Arguments to operator. See description of operators above.
+ union {
+ struct { // Repeat
+ int max_;
+ int min_;
+ };
+ struct { // Capture
+ int cap_;
std::string* name_;
- };
- struct { // LiteralString
- int nrunes_;
- Rune* runes_;
- };
- struct { // CharClass
- // These two could be in separate union members,
- // but it wouldn't save any space (there are other two-word structs)
- // and keeping them separate avoids confusion during parsing.
- CharClass* cc_;
- CharClassBuilder* ccb_;
- };
- Rune rune_; // Literal
- int match_id_; // HaveMatch
- void *the_union_[2]; // as big as any other element, for memset
- };
-
+ };
+ struct { // LiteralString
+ int nrunes_;
+ Rune* runes_;
+ };
+ struct { // CharClass
+ // These two could be in separate union members,
+ // but it wouldn't save any space (there are other two-word structs)
+ // and keeping them separate avoids confusion during parsing.
+ CharClass* cc_;
+ CharClassBuilder* ccb_;
+ };
+ Rune rune_; // Literal
+ int match_id_; // HaveMatch
+ void *the_union_[2]; // as big as any other element, for memset
+ };
+
Regexp(const Regexp&) = delete;
Regexp& operator=(const Regexp&) = delete;
-};
-
-// Character class set: contains non-overlapping, non-abutting RuneRanges.
+};
+
+// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
-
-class CharClassBuilder {
- public:
- CharClassBuilder();
-
- typedef RuneRangeSet::iterator iterator;
- iterator begin() { return ranges_.begin(); }
- iterator end() { return ranges_.end(); }
-
- int size() { return nrunes_; }
- bool empty() { return nrunes_ == 0; }
- bool full() { return nrunes_ == Runemax+1; }
-
- bool Contains(Rune r);
- bool FoldsASCII();
- bool AddRange(Rune lo, Rune hi); // returns whether class changed
- CharClassBuilder* Copy();
- void AddCharClass(CharClassBuilder* cc);
- void Negate();
- void RemoveAbove(Rune r);
- CharClass* GetCharClass();
- void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
-
- private:
+
+class CharClassBuilder {
+ public:
+ CharClassBuilder();
+
+ typedef RuneRangeSet::iterator iterator;
+ iterator begin() { return ranges_.begin(); }
+ iterator end() { return ranges_.end(); }
+
+ int size() { return nrunes_; }
+ bool empty() { return nrunes_ == 0; }
+ bool full() { return nrunes_ == Runemax+1; }
+
+ bool Contains(Rune r);
+ bool FoldsASCII();
+ bool AddRange(Rune lo, Rune hi); // returns whether class changed
+ CharClassBuilder* Copy();
+ void AddCharClass(CharClassBuilder* cc);
+ void Negate();
+ void RemoveAbove(Rune r);
+ CharClass* GetCharClass();
+ void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
+
+ private:
static const uint32_t AlphaMask = (1<<26) - 1;
uint32_t upper_; // bitmap of A-Z
uint32_t lower_; // bitmap of a-z
- int nrunes_;
- RuneRangeSet ranges_;
+ int nrunes_;
+ RuneRangeSet ranges_;
CharClassBuilder(const CharClassBuilder&) = delete;
CharClassBuilder& operator=(const CharClassBuilder&) = delete;
-};
-
+};
+
// Bitwise ops on ParseFlags produce ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
Regexp::ParseFlags b) {
return static_cast<Regexp::ParseFlags>(
static_cast<int>(a) | static_cast<int>(b));
-}
-
+}
+
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
Regexp::ParseFlags b) {
return static_cast<Regexp::ParseFlags>(
static_cast<int>(a) ^ static_cast<int>(b));
-}
-
+}
+
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
Regexp::ParseFlags b) {
return static_cast<Regexp::ParseFlags>(
static_cast<int>(a) & static_cast<int>(b));
-}
-
+}
+
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
// Attempting to produce a value out of enum's range has undefined behaviour.
return static_cast<Regexp::ParseFlags>(
~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
-}
-
-} // namespace re2
+}
+
+} // namespace re2
#endif // RE2_REGEXP_H_
diff --git a/contrib/libs/re2/re2/set.cc b/contrib/libs/re2/re2/set.cc
index df27ca5fd0..18705663a5 100644
--- a/contrib/libs/re2/re2/set.cc
+++ b/contrib/libs/re2/re2/set.cc
@@ -1,9 +1,9 @@
-// Copyright 2010 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2010 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#include "re2/set.h"
-
+
#include <stddef.h>
#include <algorithm>
#include <memory>
@@ -12,26 +12,26 @@
#include "util/util.h"
#include "util/logging.h"
#include "re2/pod_array.h"
-#include "re2/prog.h"
+#include "re2/prog.h"
#include "re2/re2.h"
-#include "re2/regexp.h"
+#include "re2/regexp.h"
#include "re2/stringpiece.h"
-
+
namespace re2 {
-
+
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
: options_(options),
anchor_(anchor),
compiled_(false),
size_(0) {
options_.set_never_capture(true); // might unblock some optimisations
-}
-
-RE2::Set::~Set() {
+}
+
+RE2::Set::~Set() {
for (size_t i = 0; i < elem_.size(); i++)
elem_[i].second->Decref();
-}
-
+}
+
RE2::Set::Set(Set&& other)
: options_(other.options_),
anchor_(other.anchor_),
@@ -53,52 +53,52 @@ RE2::Set& RE2::Set::operator=(Set&& other) {
}
int RE2::Set::Add(const StringPiece& pattern, std::string* error) {
- if (compiled_) {
+ if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
- return -1;
- }
-
- Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
- options_.ParseFlags());
- RegexpStatus status;
+ return -1;
+ }
+
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+ options_.ParseFlags());
+ RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
- if (re == NULL) {
- if (error != NULL)
- *error = status.Text();
- if (options_.log_errors())
- LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
- return -1;
- }
-
- // Concatenate with match index and push on vector.
+ if (re == NULL) {
+ if (error != NULL)
+ *error = status.Text();
+ if (options_.log_errors())
+ LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
+ return -1;
+ }
+
+ // Concatenate with match index and push on vector.
int n = static_cast<int>(elem_.size());
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
- if (re->op() == kRegexpConcat) {
- int nsub = re->nsub();
+ if (re->op() == kRegexpConcat) {
+ int nsub = re->nsub();
PODArray<re2::Regexp*> sub(nsub + 1);
- for (int i = 0; i < nsub; i++)
- sub[i] = re->sub()[i]->Incref();
- sub[nsub] = m;
- re->Decref();
+ for (int i = 0; i < nsub; i++)
+ sub[i] = re->sub()[i]->Incref();
+ sub[nsub] = m;
+ re->Decref();
re = re2::Regexp::Concat(sub.data(), nsub + 1, pf);
- } else {
+ } else {
re2::Regexp* sub[2];
- sub[0] = re;
- sub[1] = m;
+ sub[0] = re;
+ sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
- }
+ }
elem_.emplace_back(std::string(pattern), re);
- return n;
-}
-
-bool RE2::Set::Compile() {
- if (compiled_) {
+ return n;
+}
+
+bool RE2::Set::Compile() {
+ if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile() called more than once";
- return false;
- }
- compiled_ = true;
+ return false;
+ }
+ compiled_ = true;
size_ = static_cast<int>(elem_.size());
-
+
// Sort the elements by their patterns. This is good enough for now
// until we have a Regexp comparison function. (Maybe someday...)
std::sort(elem_.begin(), elem_.end(),
@@ -112,27 +112,27 @@ bool RE2::Set::Compile() {
elem_.clear();
elem_.shrink_to_fit();
- Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
- options_.ParseFlags());
+ Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+ options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf);
prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem()));
- re->Decref();
+ re->Decref();
return prog_ != nullptr;
}
-
+
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
return Match(text, v, NULL);
-}
-
+}
+
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
ErrorInfo* error_info) const {
- if (!compiled_) {
+ if (!compiled_) {
LOG(DFATAL) << "RE2::Set::Match() called before compiling";
if (error_info != NULL)
error_info->kind = kNotCompiled;
- return false;
- }
+ return false;
+ }
#ifdef RE2_HAVE_THREAD_LOCAL
hooks::context = NULL;
#endif
@@ -157,8 +157,8 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
if (ret == false) {
if (error_info != NULL)
error_info->kind = kNoError;
- return false;
- }
+ return false;
+ }
if (v != NULL) {
if (matches->empty()) {
LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
@@ -170,7 +170,7 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
}
if (error_info != NULL)
error_info->kind = kNoError;
- return true;
-}
+ return true;
+}
} // namespace re2
diff --git a/contrib/libs/re2/re2/simplify.cc b/contrib/libs/re2/re2/simplify.cc
index e80cbca3fa..663d5fcd45 100644
--- a/contrib/libs/re2/re2/simplify.cc
+++ b/contrib/libs/re2/re2/simplify.cc
@@ -1,104 +1,104 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Rewrite POSIX and other features in re
-// to use simple extended regular expression features.
-// Also sort and simplify character classes.
-
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Rewrite POSIX and other features in re
+// to use simple extended regular expression features.
+// Also sort and simplify character classes.
+
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/pod_array.h"
-#include "re2/regexp.h"
-#include "re2/walker-inl.h"
-
-namespace re2 {
-
-// Parses the regexp src and then simplifies it and sets *dst to the
-// string representation of the simplified form. Returns true on success.
-// Returns false and sets *error (if error != NULL) on error.
-bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Parses the regexp src and then simplifies it and sets *dst to the
+// string representation of the simplified form. Returns true on success.
+// Returns false and sets *error (if error != NULL) on error.
+bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
std::string* dst, RegexpStatus* status) {
- Regexp* re = Parse(src, flags, status);
- if (re == NULL)
- return false;
- Regexp* sre = re->Simplify();
- re->Decref();
- if (sre == NULL) {
- if (status) {
- status->set_code(kRegexpInternalError);
- status->set_error_arg(src);
- }
- return false;
- }
- *dst = sre->ToString();
- sre->Decref();
- return true;
-}
-
-// Assuming the simple_ flags on the children are accurate,
-// is this Regexp* simple?
-bool Regexp::ComputeSimple() {
- Regexp** subs;
- switch (op_) {
- case kRegexpNoMatch:
- case kRegexpEmptyMatch:
- case kRegexpLiteral:
- case kRegexpLiteralString:
- case kRegexpBeginLine:
- case kRegexpEndLine:
- case kRegexpBeginText:
- case kRegexpWordBoundary:
- case kRegexpNoWordBoundary:
- case kRegexpEndText:
- case kRegexpAnyChar:
- case kRegexpAnyByte:
- case kRegexpHaveMatch:
- return true;
- case kRegexpConcat:
- case kRegexpAlternate:
- // These are simple as long as the subpieces are simple.
- subs = sub();
- for (int i = 0; i < nsub_; i++)
+ Regexp* re = Parse(src, flags, status);
+ if (re == NULL)
+ return false;
+ Regexp* sre = re->Simplify();
+ re->Decref();
+ if (sre == NULL) {
+ if (status) {
+ status->set_code(kRegexpInternalError);
+ status->set_error_arg(src);
+ }
+ return false;
+ }
+ *dst = sre->ToString();
+ sre->Decref();
+ return true;
+}
+
+// Assuming the simple_ flags on the children are accurate,
+// is this Regexp* simple?
+bool Regexp::ComputeSimple() {
+ Regexp** subs;
+ switch (op_) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpLiteral:
+ case kRegexpLiteralString:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpBeginText:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpEndText:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpHaveMatch:
+ return true;
+ case kRegexpConcat:
+ case kRegexpAlternate:
+ // These are simple as long as the subpieces are simple.
+ subs = sub();
+ for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple())
- return false;
- return true;
- case kRegexpCharClass:
- // Simple as long as the char class is not empty, not full.
- if (ccb_ != NULL)
- return !ccb_->empty() && !ccb_->full();
- return !cc_->empty() && !cc_->full();
- case kRegexpCapture:
- subs = sub();
+ return false;
+ return true;
+ case kRegexpCharClass:
+ // Simple as long as the char class is not empty, not full.
+ if (ccb_ != NULL)
+ return !ccb_->empty() && !ccb_->full();
+ return !cc_->empty() && !cc_->full();
+ case kRegexpCapture:
+ subs = sub();
return subs[0]->simple();
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- subs = sub();
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ subs = sub();
if (!subs[0]->simple())
- return false;
- switch (subs[0]->op_) {
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- case kRegexpEmptyMatch:
- case kRegexpNoMatch:
- return false;
- default:
- break;
- }
- return true;
- case kRegexpRepeat:
- return false;
- }
- LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
- return false;
-}
-
-// Walker subclass used by Simplify.
+ return false;
+ switch (subs[0]->op_) {
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpEmptyMatch:
+ case kRegexpNoMatch:
+ return false;
+ default:
+ break;
+ }
+ return true;
+ case kRegexpRepeat:
+ return false;
+ }
+ LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
+ return false;
+}
+
+// Walker subclass used by Simplify.
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
// occurrences of that literal into repeats of that literal. It also works for
// char classes, any char and any byte.
@@ -130,51 +130,51 @@ class CoalesceWalker : public Regexp::Walker<Regexp*> {
};
// Walker subclass used by Simplify.
-// The simplify walk is purely post-recursive: given the simplified children,
-// PostVisit creates the simplified result.
-// The child_args are simplified Regexp*s.
-class SimplifyWalker : public Regexp::Walker<Regexp*> {
- public:
- SimplifyWalker() {}
- virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
+// The simplify walk is purely post-recursive: given the simplified children,
+// PostVisit creates the simplified result.
+// The child_args are simplified Regexp*s.
+class SimplifyWalker : public Regexp::Walker<Regexp*> {
+ public:
+ SimplifyWalker() {}
+ virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
- Regexp** child_args, int nchild_args);
- virtual Regexp* Copy(Regexp* re);
- virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
-
- private:
- // These functions are declared inside SimplifyWalker so that
- // they can edit the private fields of the Regexps they construct.
-
- // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
- // Caller must Decref return value when done with it.
- static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
-
- // Simplifies the expression re{min,max} in terms of *, +, and ?.
- // Returns a new regexp. Does not edit re. Does not consume reference to re.
- // Caller must Decref return value when done with it.
- static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
- Regexp::ParseFlags parse_flags);
-
- // Simplifies a character class by expanding any named classes
- // into rune ranges. Does not edit re. Does not consume ref to re.
- // Caller must Decref return value when done with it.
- static Regexp* SimplifyCharClass(Regexp* re);
-
+ Regexp** child_args, int nchild_args);
+ virtual Regexp* Copy(Regexp* re);
+ virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
+
+ private:
+ // These functions are declared inside SimplifyWalker so that
+ // they can edit the private fields of the Regexps they construct.
+
+ // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
+ // Caller must Decref return value when done with it.
+ static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
+
+ // Simplifies the expression re{min,max} in terms of *, +, and ?.
+ // Returns a new regexp. Does not edit re. Does not consume reference to re.
+ // Caller must Decref return value when done with it.
+ static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
+ Regexp::ParseFlags parse_flags);
+
+ // Simplifies a character class by expanding any named classes
+ // into rune ranges. Does not edit re. Does not consume ref to re.
+ // Caller must Decref return value when done with it.
+ static Regexp* SimplifyCharClass(Regexp* re);
+
SimplifyWalker(const SimplifyWalker&) = delete;
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
-};
-
-// Simplifies a regular expression, returning a new regexp.
-// The new regexp uses traditional Unix egrep features only,
-// plus the Perl (?:) non-capturing parentheses.
-// Otherwise, no POSIX or Perl additions. The new regexp
-// captures exactly the same subexpressions (with the same indices)
-// as the original.
-// Does not edit current object.
-// Caller must Decref() return value when done with it.
-
-Regexp* Regexp::Simplify() {
+};
+
+// Simplifies a regular expression, returning a new regexp.
+// The new regexp uses traditional Unix egrep features only,
+// plus the Perl (?:) non-capturing parentheses.
+// Otherwise, no POSIX or Perl additions. The new regexp
+// captures exactly the same subexpressions (with the same indices)
+// as the original.
+// Does not edit current object.
+// Caller must Decref() return value when done with it.
+
+Regexp* Regexp::Simplify() {
CoalesceWalker cw;
Regexp* cre = cw.Walk(this, NULL);
if (cre == NULL)
@@ -193,10 +193,10 @@ Regexp* Regexp::Simplify() {
return NULL;
}
return sre;
-}
-
-#define Simplify DontCallSimplify // Avoid accidental recursion
-
+}
+
+#define Simplify DontCallSimplify // Avoid accidental recursion
+
// Utility function for PostVisit implementations that compares re->sub() with
// child_args to determine whether any child_args changed. In the common case,
// where nothing changed, calls Decref() for all child_args and returns false,
@@ -441,225 +441,225 @@ void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
r2->Decref();
}
-Regexp* SimplifyWalker::Copy(Regexp* re) {
- return re->Incref();
-}
-
-Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
+Regexp* SimplifyWalker::Copy(Regexp* re) {
+ return re->Incref();
+}
+
+Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
- LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
+ LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
#endif
- return re->Incref();
-}
-
-Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
+ return re->Incref();
+}
+
+Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
if (re->simple()) {
- *stop = true;
- return re->Incref();
- }
- return NULL;
-}
-
-Regexp* SimplifyWalker::PostVisit(Regexp* re,
- Regexp* parent_arg,
- Regexp* pre_arg,
- Regexp** child_args,
- int nchild_args) {
- switch (re->op()) {
- case kRegexpNoMatch:
- case kRegexpEmptyMatch:
- case kRegexpLiteral:
- case kRegexpLiteralString:
- case kRegexpBeginLine:
- case kRegexpEndLine:
- case kRegexpBeginText:
- case kRegexpWordBoundary:
- case kRegexpNoWordBoundary:
- case kRegexpEndText:
- case kRegexpAnyChar:
- case kRegexpAnyByte:
- case kRegexpHaveMatch:
- // All these are always simple.
- re->simple_ = true;
- return re->Incref();
-
- case kRegexpConcat:
- case kRegexpAlternate: {
- // These are simple as long as the subpieces are simple.
+ *stop = true;
+ return re->Incref();
+ }
+ return NULL;
+}
+
+Regexp* SimplifyWalker::PostVisit(Regexp* re,
+ Regexp* parent_arg,
+ Regexp* pre_arg,
+ Regexp** child_args,
+ int nchild_args) {
+ switch (re->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpLiteral:
+ case kRegexpLiteralString:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpBeginText:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpEndText:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpHaveMatch:
+ // All these are always simple.
+ re->simple_ = true;
+ return re->Incref();
+
+ case kRegexpConcat:
+ case kRegexpAlternate: {
+ // These are simple as long as the subpieces are simple.
if (!ChildArgsChanged(re, child_args)) {
- re->simple_ = true;
- return re->Incref();
- }
- Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ re->simple_ = true;
+ return re->Incref();
+ }
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
- Regexp** nre_subs = nre->sub();
+ Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
- nre_subs[i] = child_args[i];
- nre->simple_ = true;
- return nre;
- }
-
- case kRegexpCapture: {
- Regexp* newsub = child_args[0];
- if (newsub == re->sub()[0]) {
- newsub->Decref();
- re->simple_ = true;
- return re->Incref();
- }
- Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
- nre->AllocSub(1);
- nre->sub()[0] = newsub;
+ nre_subs[i] = child_args[i];
+ nre->simple_ = true;
+ return nre;
+ }
+
+ case kRegexpCapture: {
+ Regexp* newsub = child_args[0];
+ if (newsub == re->sub()[0]) {
+ newsub->Decref();
+ re->simple_ = true;
+ return re->Incref();
+ }
+ Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
+ nre->AllocSub(1);
+ nre->sub()[0] = newsub;
nre->cap_ = re->cap();
- nre->simple_ = true;
- return nre;
- }
-
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest: {
- Regexp* newsub = child_args[0];
- // Special case: repeat the empty string as much as
- // you want, but it's still the empty string.
- if (newsub->op() == kRegexpEmptyMatch)
- return newsub;
-
- // These are simple as long as the subpiece is simple.
- if (newsub == re->sub()[0]) {
- newsub->Decref();
- re->simple_ = true;
- return re->Incref();
- }
-
- // These are also idempotent if flags are constant.
- if (re->op() == newsub->op() &&
- re->parse_flags() == newsub->parse_flags())
- return newsub;
-
- Regexp* nre = new Regexp(re->op(), re->parse_flags());
- nre->AllocSub(1);
- nre->sub()[0] = newsub;
- nre->simple_ = true;
- return nre;
- }
-
- case kRegexpRepeat: {
- Regexp* newsub = child_args[0];
- // Special case: repeat the empty string as much as
- // you want, but it's still the empty string.
- if (newsub->op() == kRegexpEmptyMatch)
- return newsub;
-
- Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
- re->parse_flags());
- newsub->Decref();
- nre->simple_ = true;
- return nre;
- }
-
- case kRegexpCharClass: {
- Regexp* nre = SimplifyCharClass(re);
- nre->simple_ = true;
- return nre;
- }
- }
-
- LOG(ERROR) << "Simplify case not handled: " << re->op();
- return re->Incref();
-}
-
-// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
-// Returns a new Regexp, handing the ref to the caller.
-Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
- Regexp::ParseFlags parse_flags) {
- Regexp* re = new Regexp(kRegexpConcat, parse_flags);
- re->AllocSub(2);
- Regexp** subs = re->sub();
- subs[0] = re1;
- subs[1] = re2;
- return re;
-}
-
-// Simplifies the expression re{min,max} in terms of *, +, and ?.
-// Returns a new regexp. Does not edit re. Does not consume reference to re.
-// Caller must Decref return value when done with it.
-// The result will *not* necessarily have the right capturing parens
-// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
-// but in the Regexp* representation, both (x) are marked as $1.
-Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
- Regexp::ParseFlags f) {
- // x{n,} means at least n matches of x.
- if (max == -1) {
- // Special case: x{0,} is x*
- if (min == 0)
- return Regexp::Star(re->Incref(), f);
-
- // Special case: x{1,} is x+
- if (min == 1)
- return Regexp::Plus(re->Incref(), f);
-
- // General case: x{4,} is xxxx+
+ nre->simple_ = true;
+ return nre;
+ }
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest: {
+ Regexp* newsub = child_args[0];
+ // Special case: repeat the empty string as much as
+ // you want, but it's still the empty string.
+ if (newsub->op() == kRegexpEmptyMatch)
+ return newsub;
+
+ // These are simple as long as the subpiece is simple.
+ if (newsub == re->sub()[0]) {
+ newsub->Decref();
+ re->simple_ = true;
+ return re->Incref();
+ }
+
+ // These are also idempotent if flags are constant.
+ if (re->op() == newsub->op() &&
+ re->parse_flags() == newsub->parse_flags())
+ return newsub;
+
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(1);
+ nre->sub()[0] = newsub;
+ nre->simple_ = true;
+ return nre;
+ }
+
+ case kRegexpRepeat: {
+ Regexp* newsub = child_args[0];
+ // Special case: repeat the empty string as much as
+ // you want, but it's still the empty string.
+ if (newsub->op() == kRegexpEmptyMatch)
+ return newsub;
+
+ Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
+ re->parse_flags());
+ newsub->Decref();
+ nre->simple_ = true;
+ return nre;
+ }
+
+ case kRegexpCharClass: {
+ Regexp* nre = SimplifyCharClass(re);
+ nre->simple_ = true;
+ return nre;
+ }
+ }
+
+ LOG(ERROR) << "Simplify case not handled: " << re->op();
+ return re->Incref();
+}
+
+// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
+// Returns a new Regexp, handing the ref to the caller.
+Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
+ Regexp::ParseFlags parse_flags) {
+ Regexp* re = new Regexp(kRegexpConcat, parse_flags);
+ re->AllocSub(2);
+ Regexp** subs = re->sub();
+ subs[0] = re1;
+ subs[1] = re2;
+ return re;
+}
+
+// Simplifies the expression re{min,max} in terms of *, +, and ?.
+// Returns a new regexp. Does not edit re. Does not consume reference to re.
+// Caller must Decref return value when done with it.
+// The result will *not* necessarily have the right capturing parens
+// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
+// but in the Regexp* representation, both (x) are marked as $1.
+Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
+ Regexp::ParseFlags f) {
+ // x{n,} means at least n matches of x.
+ if (max == -1) {
+ // Special case: x{0,} is x*
+ if (min == 0)
+ return Regexp::Star(re->Incref(), f);
+
+ // Special case: x{1,} is x+
+ if (min == 1)
+ return Regexp::Plus(re->Incref(), f);
+
+ // General case: x{4,} is xxxx+
PODArray<Regexp*> nre_subs(min);
- for (int i = 0; i < min-1; i++)
- nre_subs[i] = re->Incref();
- nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
+ for (int i = 0; i < min-1; i++)
+ nre_subs[i] = re->Incref();
+ nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
return Regexp::Concat(nre_subs.data(), min, f);
- }
-
- // Special case: (x){0} matches only empty string.
- if (min == 0 && max == 0)
- return new Regexp(kRegexpEmptyMatch, f);
-
- // Special case: x{1} is just x.
- if (min == 1 && max == 1)
- return re->Incref();
-
- // General case: x{n,m} means n copies of x and m copies of x?.
- // The machine will do less work if we nest the final m copies,
- // so that x{2,5} = xx(x(x(x)?)?)?
-
- // Build leading prefix: xx. Capturing only on the last one.
- Regexp* nre = NULL;
- if (min > 0) {
+ }
+
+ // Special case: (x){0} matches only empty string.
+ if (min == 0 && max == 0)
+ return new Regexp(kRegexpEmptyMatch, f);
+
+ // Special case: x{1} is just x.
+ if (min == 1 && max == 1)
+ return re->Incref();
+
+ // General case: x{n,m} means n copies of x and m copies of x?.
+ // The machine will do less work if we nest the final m copies,
+ // so that x{2,5} = xx(x(x(x)?)?)?
+
+ // Build leading prefix: xx. Capturing only on the last one.
+ Regexp* nre = NULL;
+ if (min > 0) {
PODArray<Regexp*> nre_subs(min);
- for (int i = 0; i < min; i++)
- nre_subs[i] = re->Incref();
+ for (int i = 0; i < min; i++)
+ nre_subs[i] = re->Incref();
nre = Regexp::Concat(nre_subs.data(), min, f);
- }
-
- // Build and attach suffix: (x(x(x)?)?)?
- if (max > min) {
- Regexp* suf = Regexp::Quest(re->Incref(), f);
- for (int i = min+1; i < max; i++)
- suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
- if (nre == NULL)
- nre = suf;
- else
- nre = Concat2(nre, suf, f);
- }
-
- if (nre == NULL) {
- // Some degenerate case, like min > max, or min < max < 0.
- // This shouldn't happen, because the parser rejects such regexps.
- LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
- return new Regexp(kRegexpNoMatch, f);
- }
-
- return nre;
-}
-
-// Simplifies a character class.
-// Caller must Decref return value when done with it.
-Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
- CharClass* cc = re->cc();
-
- // Special cases
- if (cc->empty())
- return new Regexp(kRegexpNoMatch, re->parse_flags());
- if (cc->full())
- return new Regexp(kRegexpAnyChar, re->parse_flags());
-
- return re->Incref();
-}
-
-} // namespace re2
+ }
+
+ // Build and attach suffix: (x(x(x)?)?)?
+ if (max > min) {
+ Regexp* suf = Regexp::Quest(re->Incref(), f);
+ for (int i = min+1; i < max; i++)
+ suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
+ if (nre == NULL)
+ nre = suf;
+ else
+ nre = Concat2(nre, suf, f);
+ }
+
+ if (nre == NULL) {
+ // Some degenerate case, like min > max, or min < max < 0.
+ // This shouldn't happen, because the parser rejects such regexps.
+ LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
+ return new Regexp(kRegexpNoMatch, f);
+ }
+
+ return nre;
+}
+
+// Simplifies a character class.
+// Caller must Decref return value when done with it.
+Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
+ CharClass* cc = re->cc();
+
+ // Special cases
+ if (cc->empty())
+ return new Regexp(kRegexpNoMatch, re->parse_flags());
+ if (cc->full())
+ return new Regexp(kRegexpAnyChar, re->parse_flags());
+
+ return re->Incref();
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/sparse_array.h b/contrib/libs/re2/re2/sparse_array.h
index 343b1ffdf2..09ffe086b7 100644
--- a/contrib/libs/re2/re2/sparse_array.h
+++ b/contrib/libs/re2/re2/sparse_array.h
@@ -1,68 +1,68 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_SPARSE_ARRAY_H_
#define RE2_SPARSE_ARRAY_H_
-// DESCRIPTION
+// DESCRIPTION
//
-// SparseArray<T>(m) is a map from integers in [0, m) to T values.
-// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
-// fast iteration through the elements in the array and fast clearing
-// of the array. The array has a concept of certain elements being
-// uninitialized (having no value).
+// SparseArray<T>(m) is a map from integers in [0, m) to T values.
+// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
+// fast iteration through the elements in the array and fast clearing
+// of the array. The array has a concept of certain elements being
+// uninitialized (having no value).
//
-// Insertion and deletion are constant time operations.
+// Insertion and deletion are constant time operations.
//
// Allocating the array is a constant time operation
-// when memory allocation is a constant time operation.
+// when memory allocation is a constant time operation.
+//
+// Clearing the array is a constant time operation (unusual!).
//
-// Clearing the array is a constant time operation (unusual!).
+// Iterating through the array is an O(n) operation, where n
+// is the number of items in the array (not O(m)).
//
-// Iterating through the array is an O(n) operation, where n
-// is the number of items in the array (not O(m)).
-//
// The array iterator visits entries in the order they were first
-// inserted into the array. It is safe to add items to the array while
-// using an iterator: the iterator will visit indices added to the array
-// during the iteration, but will not re-visit indices whose values
-// change after visiting. Thus SparseArray can be a convenient
-// implementation of a work queue.
+// inserted into the array. It is safe to add items to the array while
+// using an iterator: the iterator will visit indices added to the array
+// during the iteration, but will not re-visit indices whose values
+// change after visiting. Thus SparseArray can be a convenient
+// implementation of a work queue.
//
-// The SparseArray implementation is NOT thread-safe. It is up to the
-// caller to make sure only one thread is accessing the array. (Typically
-// these arrays are temporary values and used in situations where speed is
-// important.)
+// The SparseArray implementation is NOT thread-safe. It is up to the
+// caller to make sure only one thread is accessing the array. (Typically
+// these arrays are temporary values and used in situations where speed is
+// important.)
//
-// The SparseArray interface does not present all the usual STL bells and
-// whistles.
+// The SparseArray interface does not present all the usual STL bells and
+// whistles.
//
-// Implemented with reference to Briggs & Torczon, An Efficient
-// Representation for Sparse Sets, ACM Letters on Programming Languages
-// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
+// Implemented with reference to Briggs & Torczon, An Efficient
+// Representation for Sparse Sets, ACM Letters on Programming Languages
+// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
-// Briggs & Torczon popularized this technique, but it had been known
-// long before their paper. They point out that Aho, Hopcroft, and
-// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
-// 1986 Programming Pearls both hint at the technique in exercises to the
-// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
-// exercise 8).
+// Briggs & Torczon popularized this technique, but it had been known
+// long before their paper. They point out that Aho, Hopcroft, and
+// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
+// 1986 Programming Pearls both hint at the technique in exercises to the
+// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
+// exercise 8).
+//
+// Briggs & Torczon describe a sparse set implementation. I have
+// trivially generalized it to create a sparse array (actually the original
+// target of the AHU and Bentley exercises).
+
+// IMPLEMENTATION
//
-// Briggs & Torczon describe a sparse set implementation. I have
-// trivially generalized it to create a sparse array (actually the original
-// target of the AHU and Bentley exercises).
-
-// IMPLEMENTATION
-//
// SparseArray is an array dense_ and an array sparse_ of identical size.
// At any point, the number of elements in the sparse array is size_.
//
// The array dense_ contains the size_ elements in the sparse array (with
-// their indices),
-// in the order that the elements were first inserted. This array is dense:
-// the size_ pairs are dense_[0] through dense_[size_-1].
-//
+// their indices),
+// in the order that the elements were first inserted. This array is dense:
+// the size_ pairs are dense_[0] through dense_[size_-1].
+//
// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
// For indices present in the array, dense_[sparse_[i]].index_ == i.
// For indices not present in the array, sparse_ can contain any value at all,
@@ -75,19 +75,19 @@
// dense_[sparse_[i]].index_ == i.
// If both these properties hold, only then it is safe to refer to
// dense_[sparse_[i]].value_
-// as the value associated with index i.
-//
+// as the value associated with index i.
+//
// To insert a new entry, set sparse_[i] to size_,
-// initialize dense_[size_], and then increment size_.
-//
-// To make the sparse array as efficient as possible for non-primitive types,
-// elements may or may not be destroyed when they are deleted from the sparse
+// initialize dense_[size_], and then increment size_.
+//
+// To make the sparse array as efficient as possible for non-primitive types,
+// elements may or may not be destroyed when they are deleted from the sparse
// array through a call to resize(). They immediately become inaccessible, but
// they are only guaranteed to be destroyed when the SparseArray destructor is
// called.
//
// A moved-from SparseArray will be empty.
-
+
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
@@ -101,100 +101,100 @@
#include <algorithm>
#include <memory>
#include <utility>
-
+
#include "re2/pod_array.h"
-namespace re2 {
-
-template<typename Value>
-class SparseArray {
- public:
- SparseArray();
+namespace re2 {
+
+template<typename Value>
+class SparseArray {
+ public:
+ SparseArray();
explicit SparseArray(int max_size);
- ~SparseArray();
-
- // IndexValue pairs: exposed in SparseArray::iterator.
- class IndexValue;
-
+ ~SparseArray();
+
+ // IndexValue pairs: exposed in SparseArray::iterator.
+ class IndexValue;
+
typedef IndexValue* iterator;
typedef const IndexValue* const_iterator;
-
+
SparseArray(const SparseArray& src);
SparseArray(SparseArray&& src);
-
+
SparseArray& operator=(const SparseArray& src);
SparseArray& operator=(SparseArray&& src);
- // Return the number of entries in the array.
- int size() const {
- return size_;
- }
-
+ // Return the number of entries in the array.
+ int size() const {
+ return size_;
+ }
+
// Indicate whether the array is empty.
int empty() const {
return size_ == 0;
}
- // Iterate over the array.
- iterator begin() {
+ // Iterate over the array.
+ iterator begin() {
return dense_.data();
- }
- iterator end() {
+ }
+ iterator end() {
return dense_.data() + size_;
- }
-
- const_iterator begin() const {
+ }
+
+ const_iterator begin() const {
return dense_.data();
- }
- const_iterator end() const {
+ }
+ const_iterator end() const {
return dense_.data() + size_;
- }
-
- // Change the maximum size of the array.
- // Invalidates all iterators.
+ }
+
+ // Change the maximum size of the array.
+ // Invalidates all iterators.
void resize(int new_max_size);
-
- // Return the maximum size of the array.
- // Indices can be in the range [0, max_size).
- int max_size() const {
+
+ // Return the maximum size of the array.
+ // Indices can be in the range [0, max_size).
+ int max_size() const {
if (dense_.data() != NULL)
return dense_.size();
else
return 0;
- }
-
- // Clear the array.
- void clear() {
- size_ = 0;
- }
-
- // Check whether index i is in the array.
+ }
+
+ // Clear the array.
+ void clear() {
+ size_ = 0;
+ }
+
+ // Check whether index i is in the array.
bool has_index(int i) const;
-
- // Comparison function for sorting.
- // Can sort the sparse array so that future iterations
- // will visit indices in increasing order using
+
+ // Comparison function for sorting.
+ // Can sort the sparse array so that future iterations
+ // will visit indices in increasing order using
// std::sort(arr.begin(), arr.end(), arr.less);
- static bool less(const IndexValue& a, const IndexValue& b);
-
- public:
- // Set the value at index i to v.
+ static bool less(const IndexValue& a, const IndexValue& b);
+
+ public:
+ // Set the value at index i to v.
iterator set(int i, const Value& v) {
return SetInternal(true, i, v);
}
-
+
// Set the value at new index i to v.
// Fast but unsafe: only use if has_index(i) is false.
iterator set_new(int i, const Value& v) {
return SetInternal(false, i, v);
}
-
+
// Set the value at index i to v.
- // Fast but unsafe: only use if has_index(i) is true.
+ // Fast but unsafe: only use if has_index(i) is true.
iterator set_existing(int i, const Value& v) {
return SetExistingInternal(i, v);
}
-
+
// Get the value at index i.
// Fast but unsafe: only use if has_index(i) is true.
Value& get_existing(int i) {
@@ -205,8 +205,8 @@ class SparseArray {
assert(has_index(i));
return dense_[sparse_[i]].value_;
}
-
- private:
+
+ private:
iterator SetInternal(bool allow_existing, int i, const Value& v) {
DebugCheckInvariants();
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
@@ -234,18 +234,18 @@ class SparseArray {
return dense_.data() + sparse_[i];
}
- // Add the index i to the array.
- // Only use if has_index(i) is known to be false.
- // Since it doesn't set the value associated with i,
- // this function is private, only intended as a helper
- // for other methods.
+ // Add the index i to the array.
+ // Only use if has_index(i) is known to be false.
+ // Since it doesn't set the value associated with i,
+ // this function is private, only intended as a helper
+ // for other methods.
void create_index(int i);
-
- // In debug mode, verify that some invariant properties of the class
- // are being maintained. This is called at the end of the constructor
- // and at the beginning and end of all public non-const member functions.
+
+ // In debug mode, verify that some invariant properties of the class
+ // are being maintained. This is called at the end of the constructor
+ // and at the beginning and end of all public non-const member functions.
void DebugCheckInvariants() const;
-
+
// Initializes memory for elements [min, max).
void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
@@ -260,11 +260,11 @@ class SparseArray {
int size_ = 0;
PODArray<int> sparse_;
PODArray<IndexValue> dense_;
-};
-
-template<typename Value>
+};
+
+template<typename Value>
SparseArray<Value>::SparseArray() = default;
-
+
template<typename Value>
SparseArray<Value>::SparseArray(const SparseArray& src)
: size_(src.size_),
@@ -305,28 +305,28 @@ SparseArray<Value>& SparseArray<Value>::operator=(SparseArray&& src) {
return *this;
}
-// IndexValue pairs: exposed in SparseArray::iterator.
-template<typename Value>
-class SparseArray<Value>::IndexValue {
- public:
- int index() const { return index_; }
+// IndexValue pairs: exposed in SparseArray::iterator.
+template<typename Value>
+class SparseArray<Value>::IndexValue {
+ public:
+ int index() const { return index_; }
Value& value() { return value_; }
const Value& value() const { return value_; }
-
+
private:
friend class SparseArray;
int index_;
Value value_;
-};
-
-// Change the maximum size of the array.
-// Invalidates all iterators.
-template<typename Value>
+};
+
+// Change the maximum size of the array.
+// Invalidates all iterators.
+template<typename Value>
void SparseArray<Value>::resize(int new_max_size) {
- DebugCheckInvariants();
+ DebugCheckInvariants();
if (new_max_size > max_size()) {
const int old_max_size = max_size();
-
+
// Construct these first for exception safety.
PODArray<int> a(new_max_size);
PODArray<IndexValue> b(new_max_size);
@@ -338,55 +338,55 @@ void SparseArray<Value>::resize(int new_max_size) {
dense_ = std::move(b);
MaybeInitializeMemory(old_max_size, new_max_size);
- }
+ }
if (size_ > new_max_size)
size_ = new_max_size;
- DebugCheckInvariants();
-}
-
-// Check whether index i is in the array.
-template<typename Value>
-bool SparseArray<Value>::has_index(int i) const {
+ DebugCheckInvariants();
+}
+
+// Check whether index i is in the array.
+template<typename Value>
+bool SparseArray<Value>::has_index(int i) const {
assert(i >= 0);
assert(i < max_size());
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
- return false;
- }
+ return false;
+ }
// Unsigned comparison avoids checking sparse_[i] < 0.
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
dense_[sparse_[i]].index_ == i;
-}
-
-template<typename Value>
-void SparseArray<Value>::create_index(int i) {
+}
+
+template<typename Value>
+void SparseArray<Value>::create_index(int i) {
assert(!has_index(i));
assert(size_ < max_size());
sparse_[i] = size_;
- dense_[size_].index_ = i;
- size_++;
-}
-
+ dense_[size_].index_ = i;
+ size_++;
+}
+
template<typename Value> SparseArray<Value>::SparseArray(int max_size) :
sparse_(max_size), dense_(max_size) {
MaybeInitializeMemory(size_, max_size);
- DebugCheckInvariants();
-}
-
-template<typename Value> SparseArray<Value>::~SparseArray() {
- DebugCheckInvariants();
-}
-
-template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
+ DebugCheckInvariants();
+}
+
+template<typename Value> SparseArray<Value>::~SparseArray() {
+ DebugCheckInvariants();
+}
+
+template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
assert(0 <= size_);
assert(size_ <= max_size());
-}
-
-// Comparison function for sorting.
-template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
- const IndexValue& b) {
- return a.index_ < b.index_;
-}
-
-} // namespace re2
-
+}
+
+// Comparison function for sorting.
+template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
+ const IndexValue& b) {
+ return a.index_ < b.index_;
+}
+
+} // namespace re2
+
#endif // RE2_SPARSE_ARRAY_H_
diff --git a/contrib/libs/re2/re2/sparse_set.h b/contrib/libs/re2/re2/sparse_set.h
index 99b18051ef..06ed88d81b 100644
--- a/contrib/libs/re2/re2/sparse_set.h
+++ b/contrib/libs/re2/re2/sparse_set.h
@@ -1,52 +1,52 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_SPARSE_SET_H_
#define RE2_SPARSE_SET_H_
-// DESCRIPTION
+// DESCRIPTION
//
// SparseSet(m) is a set of integers in [0, m).
-// It requires sizeof(int)*m memory, but it provides
-// fast iteration through the elements in the set and fast clearing
-// of the set.
+// It requires sizeof(int)*m memory, but it provides
+// fast iteration through the elements in the set and fast clearing
+// of the set.
//
-// Insertion and deletion are constant time operations.
+// Insertion and deletion are constant time operations.
//
// Allocating the set is a constant time operation
-// when memory allocation is a constant time operation.
+// when memory allocation is a constant time operation.
+//
+// Clearing the set is a constant time operation (unusual!).
//
-// Clearing the set is a constant time operation (unusual!).
+// Iterating through the set is an O(n) operation, where n
+// is the number of items in the set (not O(m)).
//
-// Iterating through the set is an O(n) operation, where n
-// is the number of items in the set (not O(m)).
-//
// The set iterator visits entries in the order they were first
// inserted into the set. It is safe to add items to the set while
-// using an iterator: the iterator will visit indices added to the set
-// during the iteration, but will not re-visit indices whose values
-// change after visiting. Thus SparseSet can be a convenient
-// implementation of a work queue.
+// using an iterator: the iterator will visit indices added to the set
+// during the iteration, but will not re-visit indices whose values
+// change after visiting. Thus SparseSet can be a convenient
+// implementation of a work queue.
//
-// The SparseSet implementation is NOT thread-safe. It is up to the
-// caller to make sure only one thread is accessing the set. (Typically
-// these sets are temporary values and used in situations where speed is
-// important.)
+// The SparseSet implementation is NOT thread-safe. It is up to the
+// caller to make sure only one thread is accessing the set. (Typically
+// these sets are temporary values and used in situations where speed is
+// important.)
//
-// The SparseSet interface does not present all the usual STL bells and
-// whistles.
+// The SparseSet interface does not present all the usual STL bells and
+// whistles.
//
-// Implemented with reference to Briggs & Torczon, An Efficient
-// Representation for Sparse Sets, ACM Letters on Programming Languages
-// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
+// Implemented with reference to Briggs & Torczon, An Efficient
+// Representation for Sparse Sets, ACM Letters on Programming Languages
+// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
// This is a specialization of sparse array; see sparse_array.h.
-
-// IMPLEMENTATION
-//
+
+// IMPLEMENTATION
+//
// See sparse_array.h for implementation details.
-
+
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
@@ -60,31 +60,31 @@
#include <algorithm>
#include <memory>
#include <utility>
-
+
#include "re2/pod_array.h"
-namespace re2 {
-
+namespace re2 {
+
template<typename Value>
class SparseSetT {
- public:
+ public:
SparseSetT();
explicit SparseSetT(int max_size);
~SparseSetT();
-
+
typedef int* iterator;
typedef const int* const_iterator;
// Return the number of entries in the set.
int size() const {
return size_;
- }
-
+ }
+
// Indicate whether the set is empty.
int empty() const {
return size_ == 0;
- }
-
+ }
+
// Iterate over the set.
iterator begin() {
return dense_.data();
@@ -92,18 +92,18 @@ class SparseSetT {
iterator end() {
return dense_.data() + size_;
}
-
+
const_iterator begin() const {
return dense_.data();
}
const_iterator end() const {
return dense_.data() + size_;
}
-
+
// Change the maximum size of the set.
- // Invalidates all iterators.
+ // Invalidates all iterators.
void resize(int new_max_size);
-
+
// Return the maximum size of the set.
// Indices can be in the range [0, max_size).
int max_size() const {
@@ -111,16 +111,16 @@ class SparseSetT {
return dense_.size();
else
return 0;
- }
-
+ }
+
// Clear the set.
void clear() {
size_ = 0;
}
-
+
// Check whether index i is in the set.
bool contains(int i) const;
-
+
// Comparison function for sorting.
// Can sort the sparse set so that future iterations
// will visit indices in increasing order using
@@ -131,24 +131,24 @@ class SparseSetT {
// Insert index i into the set.
iterator insert(int i) {
return InsertInternal(true, i);
- }
-
+ }
+
// Insert index i into the set.
// Fast but unsafe: only use if contains(i) is false.
iterator insert_new(int i) {
return InsertInternal(false, i);
- }
-
+ }
+
private:
iterator InsertInternal(bool allow_existing, int i) {
DebugCheckInvariants();
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
assert(false && "illegal index");
- // Semantically, end() would be better here, but we already know
- // the user did something stupid, so begin() insulates them from
- // dereferencing an invalid pointer.
+ // Semantically, end() would be better here, but we already know
+ // the user did something stupid, so begin() insulates them from
+ // dereferencing an invalid pointer.
return begin();
- }
+ }
if (!allow_existing) {
assert(!contains(i));
create_index(i);
@@ -158,19 +158,19 @@ class SparseSetT {
}
DebugCheckInvariants();
return dense_.data() + sparse_[i];
- }
-
+ }
+
// Add the index i to the set.
// Only use if contains(i) is known to be false.
// This function is private, only intended as a helper
// for other methods.
void create_index(int i);
-
+
// In debug mode, verify that some invariant properties of the class
// are being maintained. This is called at the end of the constructor
// and at the beginning and end of all public non-const member functions.
void DebugCheckInvariants() const;
-
+
// Initializes memory for elements [min, max).
void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
@@ -185,8 +185,8 @@ class SparseSetT {
int size_ = 0;
PODArray<int> sparse_;
PODArray<int> dense_;
-};
-
+};
+
template<typename Value>
SparseSetT<Value>::SparseSetT() = default;
@@ -259,6 +259,6 @@ template<typename Value> bool SparseSetT<Value>::less(int a, int b) {
typedef SparseSetT<void> SparseSet;
-} // namespace re2
-
+} // namespace re2
+
#endif // RE2_SPARSE_SET_H_
diff --git a/contrib/libs/re2/re2/tostring.cc b/contrib/libs/re2/re2/tostring.cc
index a2b2a7ddaf..9c1c038ca6 100644
--- a/contrib/libs/re2/re2/tostring.cc
+++ b/contrib/libs/re2/re2/tostring.cc
@@ -1,10 +1,10 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Format a regular expression structure as a string.
-// Tested by parse_test.cc
-
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Format a regular expression structure as a string.
+// Tested by parse_test.cc
+
#include <string.h>
#include <string>
@@ -12,340 +12,340 @@
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
-#include "re2/regexp.h"
-#include "re2/walker-inl.h"
-
-namespace re2 {
-
-enum {
- PrecAtom,
- PrecUnary,
- PrecConcat,
- PrecAlternate,
- PrecEmpty,
- PrecParen,
- PrecToplevel,
-};
-
-// Helper function. See description below.
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+enum {
+ PrecAtom,
+ PrecUnary,
+ PrecConcat,
+ PrecAlternate,
+ PrecEmpty,
+ PrecParen,
+ PrecToplevel,
+};
+
+// Helper function. See description below.
static void AppendCCRange(std::string* t, Rune lo, Rune hi);
-
-// Walker to generate string in s_.
-// The arg pointers are actually integers giving the
-// context precedence.
-// The child_args are always NULL.
-class ToStringWalker : public Regexp::Walker<int> {
- public:
+
+// Walker to generate string in s_.
+// The arg pointers are actually integers giving the
+// context precedence.
+// The child_args are always NULL.
+class ToStringWalker : public Regexp::Walker<int> {
+ public:
explicit ToStringWalker(std::string* t) : t_(t) {}
-
- virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
- virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
- int* child_args, int nchild_args);
- virtual int ShortVisit(Regexp* re, int parent_arg) {
- return 0;
- }
-
- private:
+
+ virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
+ virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
+ int* child_args, int nchild_args);
+ virtual int ShortVisit(Regexp* re, int parent_arg) {
+ return 0;
+ }
+
+ private:
std::string* t_; // The string the walker appends to.
-
+
ToStringWalker(const ToStringWalker&) = delete;
ToStringWalker& operator=(const ToStringWalker&) = delete;
-};
-
+};
+
std::string Regexp::ToString() {
std::string t;
- ToStringWalker w(&t);
- w.WalkExponential(this, PrecToplevel, 100000);
- if (w.stopped_early())
- t += " [truncated]";
- return t;
-}
-
-#define ToString DontCallToString // Avoid accidental recursion.
-
-// Visits re before children are processed.
-// Appends ( if needed and passes new precedence to children.
-int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
- int prec = parent_arg;
- int nprec = PrecAtom;
-
- switch (re->op()) {
- case kRegexpNoMatch:
- case kRegexpEmptyMatch:
- case kRegexpLiteral:
- case kRegexpAnyChar:
- case kRegexpAnyByte:
- case kRegexpBeginLine:
- case kRegexpEndLine:
- case kRegexpBeginText:
- case kRegexpEndText:
- case kRegexpWordBoundary:
- case kRegexpNoWordBoundary:
- case kRegexpCharClass:
- case kRegexpHaveMatch:
- nprec = PrecAtom;
- break;
-
- case kRegexpConcat:
- case kRegexpLiteralString:
- if (prec < PrecConcat)
- t_->append("(?:");
- nprec = PrecConcat;
- break;
-
- case kRegexpAlternate:
- if (prec < PrecAlternate)
- t_->append("(?:");
- nprec = PrecAlternate;
- break;
-
- case kRegexpCapture:
- t_->append("(");
+ ToStringWalker w(&t);
+ w.WalkExponential(this, PrecToplevel, 100000);
+ if (w.stopped_early())
+ t += " [truncated]";
+ return t;
+}
+
+#define ToString DontCallToString // Avoid accidental recursion.
+
+// Visits re before children are processed.
+// Appends ( if needed and passes new precedence to children.
+int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
+ int prec = parent_arg;
+ int nprec = PrecAtom;
+
+ switch (re->op()) {
+ case kRegexpNoMatch:
+ case kRegexpEmptyMatch:
+ case kRegexpLiteral:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ case kRegexpBeginLine:
+ case kRegexpEndLine:
+ case kRegexpBeginText:
+ case kRegexpEndText:
+ case kRegexpWordBoundary:
+ case kRegexpNoWordBoundary:
+ case kRegexpCharClass:
+ case kRegexpHaveMatch:
+ nprec = PrecAtom;
+ break;
+
+ case kRegexpConcat:
+ case kRegexpLiteralString:
+ if (prec < PrecConcat)
+ t_->append("(?:");
+ nprec = PrecConcat;
+ break;
+
+ case kRegexpAlternate:
+ if (prec < PrecAlternate)
+ t_->append("(?:");
+ nprec = PrecAlternate;
+ break;
+
+ case kRegexpCapture:
+ t_->append("(");
if (re->cap() == 0)
LOG(DFATAL) << "kRegexpCapture cap() == 0";
- if (re->name()) {
- t_->append("?P<");
- t_->append(*re->name());
- t_->append(">");
- }
- nprec = PrecParen;
- break;
-
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- case kRegexpRepeat:
- if (prec < PrecUnary)
- t_->append("(?:");
- // The subprecedence here is PrecAtom instead of PrecUnary
- // because PCRE treats two unary ops in a row as a parse error.
- nprec = PrecAtom;
- break;
- }
-
- return nprec;
-}
-
+ if (re->name()) {
+ t_->append("?P<");
+ t_->append(*re->name());
+ t_->append(">");
+ }
+ nprec = PrecParen;
+ break;
+
+ case kRegexpStar:
+ case kRegexpPlus:
+ case kRegexpQuest:
+ case kRegexpRepeat:
+ if (prec < PrecUnary)
+ t_->append("(?:");
+ // The subprecedence here is PrecAtom instead of PrecUnary
+ // because PCRE treats two unary ops in a row as a parse error.
+ nprec = PrecAtom;
+ break;
+ }
+
+ return nprec;
+}
+
static void AppendLiteral(std::string *t, Rune r, bool foldcase) {
- if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
- t->append(1, '\\');
+ if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
+ t->append(1, '\\');
t->append(1, static_cast<char>(r));
- } else if (foldcase && 'a' <= r && r <= 'z') {
+ } else if (foldcase && 'a' <= r && r <= 'z') {
r -= 'a' - 'A';
- t->append(1, '[');
+ t->append(1, '[');
t->append(1, static_cast<char>(r));
t->append(1, static_cast<char>(r) + 'a' - 'A');
- t->append(1, ']');
- } else {
- AppendCCRange(t, r, r);
- }
-}
-
-// Visits re after children are processed.
-// For childless regexps, all the work is done here.
-// For regexps with children, append any unary suffixes or ).
-int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
- int* child_args, int nchild_args) {
- int prec = parent_arg;
- switch (re->op()) {
- case kRegexpNoMatch:
- // There's no simple symbol for "no match", but
- // [^0-Runemax] excludes everything.
- t_->append("[^\\x00-\\x{10ffff}]");
- break;
-
- case kRegexpEmptyMatch:
- // Append (?:) to make empty string visible,
- // unless this is already being parenthesized.
- if (prec < PrecEmpty)
- t_->append("(?:)");
- break;
-
- case kRegexpLiteral:
+ t->append(1, ']');
+ } else {
+ AppendCCRange(t, r, r);
+ }
+}
+
+// Visits re after children are processed.
+// For childless regexps, all the work is done here.
+// For regexps with children, append any unary suffixes or ).
+int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
+ int* child_args, int nchild_args) {
+ int prec = parent_arg;
+ switch (re->op()) {
+ case kRegexpNoMatch:
+ // There's no simple symbol for "no match", but
+ // [^0-Runemax] excludes everything.
+ t_->append("[^\\x00-\\x{10ffff}]");
+ break;
+
+ case kRegexpEmptyMatch:
+ // Append (?:) to make empty string visible,
+ // unless this is already being parenthesized.
+ if (prec < PrecEmpty)
+ t_->append("(?:)");
+ break;
+
+ case kRegexpLiteral:
AppendLiteral(t_, re->rune(),
(re->parse_flags() & Regexp::FoldCase) != 0);
- break;
-
- case kRegexpLiteralString:
- for (int i = 0; i < re->nrunes(); i++)
+ break;
+
+ case kRegexpLiteralString:
+ for (int i = 0; i < re->nrunes(); i++)
AppendLiteral(t_, re->runes()[i],
(re->parse_flags() & Regexp::FoldCase) != 0);
- if (prec < PrecConcat)
- t_->append(")");
- break;
-
- case kRegexpConcat:
- if (prec < PrecConcat)
- t_->append(")");
- break;
-
- case kRegexpAlternate:
- // Clumsy but workable: the children all appended |
- // at the end of their strings, so just remove the last one.
- if ((*t_)[t_->size()-1] == '|')
- t_->erase(t_->size()-1);
- else
- LOG(DFATAL) << "Bad final char: " << t_;
- if (prec < PrecAlternate)
- t_->append(")");
- break;
-
- case kRegexpStar:
- t_->append("*");
- if (re->parse_flags() & Regexp::NonGreedy)
- t_->append("?");
- if (prec < PrecUnary)
- t_->append(")");
- break;
-
- case kRegexpPlus:
- t_->append("+");
- if (re->parse_flags() & Regexp::NonGreedy)
- t_->append("?");
- if (prec < PrecUnary)
- t_->append(")");
- break;
-
- case kRegexpQuest:
- t_->append("?");
- if (re->parse_flags() & Regexp::NonGreedy)
- t_->append("?");
- if (prec < PrecUnary)
- t_->append(")");
- break;
-
- case kRegexpRepeat:
- if (re->max() == -1)
- t_->append(StringPrintf("{%d,}", re->min()));
- else if (re->min() == re->max())
- t_->append(StringPrintf("{%d}", re->min()));
- else
- t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
- if (re->parse_flags() & Regexp::NonGreedy)
- t_->append("?");
- if (prec < PrecUnary)
- t_->append(")");
- break;
-
- case kRegexpAnyChar:
- t_->append(".");
- break;
-
- case kRegexpAnyByte:
- t_->append("\\C");
- break;
-
- case kRegexpBeginLine:
- t_->append("^");
- break;
-
- case kRegexpEndLine:
- t_->append("$");
- break;
-
- case kRegexpBeginText:
- t_->append("(?-m:^)");
- break;
-
- case kRegexpEndText:
- if (re->parse_flags() & Regexp::WasDollar)
- t_->append("(?-m:$)");
- else
- t_->append("\\z");
- break;
-
- case kRegexpWordBoundary:
- t_->append("\\b");
- break;
-
- case kRegexpNoWordBoundary:
- t_->append("\\B");
- break;
-
- case kRegexpCharClass: {
- if (re->cc()->size() == 0) {
- t_->append("[^\\x00-\\x{10ffff}]");
- break;
- }
- t_->append("[");
- // Heuristic: show class as negated if it contains the
+ if (prec < PrecConcat)
+ t_->append(")");
+ break;
+
+ case kRegexpConcat:
+ if (prec < PrecConcat)
+ t_->append(")");
+ break;
+
+ case kRegexpAlternate:
+ // Clumsy but workable: the children all appended |
+ // at the end of their strings, so just remove the last one.
+ if ((*t_)[t_->size()-1] == '|')
+ t_->erase(t_->size()-1);
+ else
+ LOG(DFATAL) << "Bad final char: " << t_;
+ if (prec < PrecAlternate)
+ t_->append(")");
+ break;
+
+ case kRegexpStar:
+ t_->append("*");
+ if (re->parse_flags() & Regexp::NonGreedy)
+ t_->append("?");
+ if (prec < PrecUnary)
+ t_->append(")");
+ break;
+
+ case kRegexpPlus:
+ t_->append("+");
+ if (re->parse_flags() & Regexp::NonGreedy)
+ t_->append("?");
+ if (prec < PrecUnary)
+ t_->append(")");
+ break;
+
+ case kRegexpQuest:
+ t_->append("?");
+ if (re->parse_flags() & Regexp::NonGreedy)
+ t_->append("?");
+ if (prec < PrecUnary)
+ t_->append(")");
+ break;
+
+ case kRegexpRepeat:
+ if (re->max() == -1)
+ t_->append(StringPrintf("{%d,}", re->min()));
+ else if (re->min() == re->max())
+ t_->append(StringPrintf("{%d}", re->min()));
+ else
+ t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
+ if (re->parse_flags() & Regexp::NonGreedy)
+ t_->append("?");
+ if (prec < PrecUnary)
+ t_->append(")");
+ break;
+
+ case kRegexpAnyChar:
+ t_->append(".");
+ break;
+
+ case kRegexpAnyByte:
+ t_->append("\\C");
+ break;
+
+ case kRegexpBeginLine:
+ t_->append("^");
+ break;
+
+ case kRegexpEndLine:
+ t_->append("$");
+ break;
+
+ case kRegexpBeginText:
+ t_->append("(?-m:^)");
+ break;
+
+ case kRegexpEndText:
+ if (re->parse_flags() & Regexp::WasDollar)
+ t_->append("(?-m:$)");
+ else
+ t_->append("\\z");
+ break;
+
+ case kRegexpWordBoundary:
+ t_->append("\\b");
+ break;
+
+ case kRegexpNoWordBoundary:
+ t_->append("\\B");
+ break;
+
+ case kRegexpCharClass: {
+ if (re->cc()->size() == 0) {
+ t_->append("[^\\x00-\\x{10ffff}]");
+ break;
+ }
+ t_->append("[");
+ // Heuristic: show class as negated if it contains the
// non-character 0xFFFE and yet somehow isn't full.
- CharClass* cc = re->cc();
+ CharClass* cc = re->cc();
if (cc->Contains(0xFFFE) && !cc->full()) {
- cc = cc->Negate();
- t_->append("^");
- }
- for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
- AppendCCRange(t_, i->lo, i->hi);
- if (cc != re->cc())
- cc->Delete();
- t_->append("]");
- break;
- }
-
- case kRegexpCapture:
- t_->append(")");
- break;
-
- case kRegexpHaveMatch:
- // There's no syntax accepted by the parser to generate
- // this node (it is generated by RE2::Set) so make something
- // up that is readable but won't compile.
+ cc = cc->Negate();
+ t_->append("^");
+ }
+ for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
+ AppendCCRange(t_, i->lo, i->hi);
+ if (cc != re->cc())
+ cc->Delete();
+ t_->append("]");
+ break;
+ }
+
+ case kRegexpCapture:
+ t_->append(")");
+ break;
+
+ case kRegexpHaveMatch:
+ // There's no syntax accepted by the parser to generate
+ // this node (it is generated by RE2::Set) so make something
+ // up that is readable but won't compile.
t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id()));
- break;
- }
-
- // If the parent is an alternation, append the | for it.
- if (prec == PrecAlternate)
- t_->append("|");
-
- return 0;
-}
-
-// Appends a rune for use in a character class to the string t.
+ break;
+ }
+
+ // If the parent is an alternation, append the | for it.
+ if (prec == PrecAlternate)
+ t_->append("|");
+
+ return 0;
+}
+
+// Appends a rune for use in a character class to the string t.
static void AppendCCChar(std::string* t, Rune r) {
- if (0x20 <= r && r <= 0x7E) {
- if (strchr("[]^-\\", r))
- t->append("\\");
+ if (0x20 <= r && r <= 0x7E) {
+ if (strchr("[]^-\\", r))
+ t->append("\\");
t->append(1, static_cast<char>(r));
- return;
- }
- switch (r) {
- default:
- break;
-
- case '\r':
- t->append("\\r");
- return;
-
- case '\t':
- t->append("\\t");
- return;
-
- case '\n':
- t->append("\\n");
- return;
-
- case '\f':
- t->append("\\f");
- return;
- }
-
- if (r < 0x100) {
+ return;
+ }
+ switch (r) {
+ default:
+ break;
+
+ case '\r':
+ t->append("\\r");
+ return;
+
+ case '\t':
+ t->append("\\t");
+ return;
+
+ case '\n':
+ t->append("\\n");
+ return;
+
+ case '\f':
+ t->append("\\f");
+ return;
+ }
+
+ if (r < 0x100) {
*t += StringPrintf("\\x%02x", static_cast<int>(r));
- return;
- }
+ return;
+ }
*t += StringPrintf("\\x{%x}", static_cast<int>(r));
-}
-
+}
+
static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
- if (lo > hi)
- return;
- AppendCCChar(t, lo);
- if (lo < hi) {
- t->append("-");
- AppendCCChar(t, hi);
- }
-}
-
-} // namespace re2
+ if (lo > hi)
+ return;
+ AppendCCChar(t, lo);
+ if (lo < hi) {
+ t->append("-");
+ AppendCCChar(t, hi);
+ }
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/re2/unicode_casefold.h b/contrib/libs/re2/re2/unicode_casefold.h
index 70a597010f..8bdbb42fbc 100644
--- a/contrib/libs/re2/re2/unicode_casefold.h
+++ b/contrib/libs/re2/re2/unicode_casefold.h
@@ -1,78 +1,78 @@
-// Copyright 2008 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_UNICODE_CASEFOLD_H_
#define RE2_UNICODE_CASEFOLD_H_
-// Unicode case folding tables.
-
-// The Unicode case folding tables encode the mapping from one Unicode point
-// to the next largest Unicode point with equivalent folding. The largest
-// point wraps back to the first. For example, the tables map:
-//
-// 'A' -> 'a'
-// 'a' -> 'A'
-//
-// 'K' -> 'k'
-// 'k' -> 'K' (Kelvin symbol)
-// 'K' -> 'K'
-//
-// Like everything Unicode, these tables are big. If we represent the table
+// Unicode case folding tables.
+
+// The Unicode case folding tables encode the mapping from one Unicode point
+// to the next largest Unicode point with equivalent folding. The largest
+// point wraps back to the first. For example, the tables map:
+//
+// 'A' -> 'a'
+// 'a' -> 'A'
+//
+// 'K' -> 'k'
+// 'k' -> 'K' (Kelvin symbol)
+// 'K' -> 'K'
+//
+// Like everything Unicode, these tables are big. If we represent the table
// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
-// Most table entries look like the ones around them:
-// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
-// Instead of listing all the pairs explicitly, we make a list of ranges
-// and deltas, so that the table entries for 'A' through 'Z' can be represented
-// as a single entry { 'A', 'Z', +32 }.
-//
-// In addition to blocks that map to each other (A-Z mapping to a-z)
-// there are blocks of pairs that individually map to each other
-// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
-// For those, the special delta value EvenOdd marks even/odd pairs
-// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
-//
-// In this form, the table has 274 entries, about 3kB. If we were to split
-// the table into one for 16-bit codes and an overflow table for larger ones,
-// we could get it down to about 1.5kB, but that's not worth the complexity.
-//
-// The grouped form also allows for efficient fold range calculations
-// rather than looping one character at a time.
-
+// Most table entries look like the ones around them:
+// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
+// Instead of listing all the pairs explicitly, we make a list of ranges
+// and deltas, so that the table entries for 'A' through 'Z' can be represented
+// as a single entry { 'A', 'Z', +32 }.
+//
+// In addition to blocks that map to each other (A-Z mapping to a-z)
+// there are blocks of pairs that individually map to each other
+// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
+// For those, the special delta value EvenOdd marks even/odd pairs
+// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
+//
+// In this form, the table has 274 entries, about 3kB. If we were to split
+// the table into one for 16-bit codes and an overflow table for larger ones,
+// we could get it down to about 1.5kB, but that's not worth the complexity.
+//
+// The grouped form also allows for efficient fold range calculations
+// rather than looping one character at a time.
+
#include <stdint.h>
-
+
#include "util/util.h"
#include "util/utf.h"
-
-namespace re2 {
-
-enum {
- EvenOdd = 1,
+
+namespace re2 {
+
+enum {
+ EvenOdd = 1,
OddEven = -1,
EvenOddSkip = 1<<30,
OddEvenSkip,
-};
-
-struct CaseFold {
+};
+
+struct CaseFold {
Rune lo;
Rune hi;
int32_t delta;
-};
-
+};
+
extern const CaseFold unicode_casefold[];
extern const int num_unicode_casefold;
-
+
extern const CaseFold unicode_tolower[];
extern const int num_unicode_tolower;
-// Returns the CaseFold* in the tables that contains rune.
-// If rune is not in the tables, returns the first CaseFold* after rune.
-// If rune is larger than any value in the tables, returns NULL.
+// Returns the CaseFold* in the tables that contains rune.
+// If rune is not in the tables, returns the first CaseFold* after rune.
+// If rune is larger than any value in the tables, returns NULL.
extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
-
+
// Returns the result of applying the fold f to the rune r.
extern Rune ApplyFold(const CaseFold *f, Rune r);
-} // namespace re2
-
+} // namespace re2
+
#endif // RE2_UNICODE_CASEFOLD_H_
diff --git a/contrib/libs/re2/re2/unicode_groups.h b/contrib/libs/re2/re2/unicode_groups.h
index 17a5900080..75f55daa61 100644
--- a/contrib/libs/re2/re2/unicode_groups.h
+++ b/contrib/libs/re2/re2/unicode_groups.h
@@ -1,67 +1,67 @@
-// Copyright 2008 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_UNICODE_GROUPS_H_
#define RE2_UNICODE_GROUPS_H_
-// Unicode character groups.
-
-// The codes get split into ranges of 16-bit codes
-// and ranges of 32-bit codes. It would be simpler
-// to use only 32-bit ranges, but these tables are large
-// enough to warrant extra care.
-//
-// Using just 32-bit ranges gives 27 kB of data.
-// Adding 16-bit ranges gives 18 kB of data.
-// Adding an extra table of 16-bit singletons would reduce
-// to 16.5 kB of data but make the data harder to use;
-// we don't bother.
-
+// Unicode character groups.
+
+// The codes get split into ranges of 16-bit codes
+// and ranges of 32-bit codes. It would be simpler
+// to use only 32-bit ranges, but these tables are large
+// enough to warrant extra care.
+//
+// Using just 32-bit ranges gives 27 kB of data.
+// Adding 16-bit ranges gives 18 kB of data.
+// Adding an extra table of 16-bit singletons would reduce
+// to 16.5 kB of data but make the data harder to use;
+// we don't bother.
+
#include <stdint.h>
-
+
#include "util/util.h"
#include "util/utf.h"
-
-namespace re2 {
-
-struct URange16
-{
+
+namespace re2 {
+
+struct URange16
+{
uint16_t lo;
uint16_t hi;
-};
-
-struct URange32
-{
+};
+
+struct URange32
+{
Rune lo;
Rune hi;
-};
-
-struct UGroup
-{
- const char *name;
- int sign; // +1 for [abc], -1 for [^abc]
+};
+
+struct UGroup
+{
+ const char *name;
+ int sign; // +1 for [abc], -1 for [^abc]
const URange16 *r16;
- int nr16;
+ int nr16;
const URange32 *r32;
- int nr32;
-};
-
-// Named by property or script name (e.g., "Nd", "N", "Han").
-// Negated groups are not included.
+ int nr32;
+};
+
+// Named by property or script name (e.g., "Nd", "N", "Han").
+// Negated groups are not included.
extern const UGroup unicode_groups[];
extern const int num_unicode_groups;
-
-// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
-// Negated groups are included.
+
+// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
+// Negated groups are included.
extern const UGroup posix_groups[];
extern const int num_posix_groups;
-
-// Named by Perl name (e.g., "\\d", "\\D").
-// Negated groups are included.
+
+// Named by Perl name (e.g., "\\d", "\\D").
+// Negated groups are included.
extern const UGroup perl_groups[];
extern const int num_perl_groups;
-
-} // namespace re2
-
+
+} // namespace re2
+
#endif // RE2_UNICODE_GROUPS_H_
diff --git a/contrib/libs/re2/re2/walker-inl.h b/contrib/libs/re2/re2/walker-inl.h
index 336fa36290..4d064a0970 100644
--- a/contrib/libs/re2/re2/walker-inl.h
+++ b/contrib/libs/re2/re2/walker-inl.h
@@ -1,247 +1,247 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#ifndef RE2_WALKER_INL_H_
#define RE2_WALKER_INL_H_
-// Helper class for traversing Regexps without recursion.
-// Clients should declare their own subclasses that override
-// the PreVisit and PostVisit methods, which are called before
-// and after visiting the subexpressions.
-
-// Not quite the Visitor pattern, because (among other things)
-// the Visitor pattern is recursive.
-
+// Helper class for traversing Regexps without recursion.
+// Clients should declare their own subclasses that override
+// the PreVisit and PostVisit methods, which are called before
+// and after visiting the subexpressions.
+
+// Not quite the Visitor pattern, because (among other things)
+// the Visitor pattern is recursive.
+
#include <stack>
-
+
#include "util/logging.h"
-#include "re2/regexp.h"
-
-namespace re2 {
-
-template<typename T> struct WalkState;
-
-template<typename T> class Regexp::Walker {
- public:
- Walker();
- virtual ~Walker();
-
- // Virtual method called before visiting re's children.
- // PreVisit passes ownership of its return value to its caller.
- // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
- // and passed to the child PreVisits and PostVisits as parent_arg.
- // At the top-most Regexp, parent_arg is arg passed to walk.
- // If PreVisit sets *stop to true, the walk does not recurse
- // into the children. Instead it behaves as though the return
- // value from PreVisit is the return value from PostVisit.
- // The default PreVisit returns parent_arg.
- virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
-
- // Virtual method called after visiting re's children.
- // The pre_arg is the T that PreVisit returned.
- // The child_args is a vector of the T that the child PostVisits returned.
- // PostVisit takes ownership of pre_arg.
- // PostVisit takes ownership of the Ts
- // in *child_args, but not the vector itself.
- // PostVisit passes ownership of its return value
- // to its caller.
- // The default PostVisit simply returns pre_arg.
- virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
- T* child_args, int nchild_args);
-
- // Virtual method called to copy a T,
- // when Walk notices that more than one child is the same re.
- virtual T Copy(T arg);
-
- // Virtual method called to do a "quick visit" of the re,
- // but not its children. Only called once the visit budget
- // has been used up and we're trying to abort the walk
- // as quickly as possible. Should return a value that
- // makes sense for the parent PostVisits still to be run.
- // This function is (hopefully) only called by
- // WalkExponential, but must be implemented by all clients,
- // just in case.
- virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
-
- // Walks over a regular expression.
- // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
- // Returns the T returned by PostVisit on re.
- T Walk(Regexp* re, T top_arg);
-
- // Like Walk, but doesn't use Copy. This can lead to
- // exponential runtimes on cross-linked Regexps like the
- // ones generated by Simplify. To help limit this,
- // at most max_visits nodes will be visited and then
- // the walk will be cut off early.
- // If the walk *is* cut off early, ShortVisit(re)
- // will be called on regexps that cannot be fully
- // visited rather than calling PreVisit/PostVisit.
- T WalkExponential(Regexp* re, T top_arg, int max_visits);
-
- // Clears the stack. Should never be necessary, since
- // Walk always enters and exits with an empty stack.
- // Logs DFATAL if stack is not already clear.
- void Reset();
-
- // Returns whether walk was cut off.
- bool stopped_early() { return stopped_early_; }
-
- private:
- // Walk state for the entire traversal.
+#include "re2/regexp.h"
+
+namespace re2 {
+
+template<typename T> struct WalkState;
+
+template<typename T> class Regexp::Walker {
+ public:
+ Walker();
+ virtual ~Walker();
+
+ // Virtual method called before visiting re's children.
+ // PreVisit passes ownership of its return value to its caller.
+ // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
+ // and passed to the child PreVisits and PostVisits as parent_arg.
+ // At the top-most Regexp, parent_arg is arg passed to walk.
+ // If PreVisit sets *stop to true, the walk does not recurse
+ // into the children. Instead it behaves as though the return
+ // value from PreVisit is the return value from PostVisit.
+ // The default PreVisit returns parent_arg.
+ virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
+
+ // Virtual method called after visiting re's children.
+ // The pre_arg is the T that PreVisit returned.
+ // The child_args is a vector of the T that the child PostVisits returned.
+ // PostVisit takes ownership of pre_arg.
+ // PostVisit takes ownership of the Ts
+ // in *child_args, but not the vector itself.
+ // PostVisit passes ownership of its return value
+ // to its caller.
+ // The default PostVisit simply returns pre_arg.
+ virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
+ T* child_args, int nchild_args);
+
+ // Virtual method called to copy a T,
+ // when Walk notices that more than one child is the same re.
+ virtual T Copy(T arg);
+
+ // Virtual method called to do a "quick visit" of the re,
+ // but not its children. Only called once the visit budget
+ // has been used up and we're trying to abort the walk
+ // as quickly as possible. Should return a value that
+ // makes sense for the parent PostVisits still to be run.
+ // This function is (hopefully) only called by
+ // WalkExponential, but must be implemented by all clients,
+ // just in case.
+ virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
+
+ // Walks over a regular expression.
+ // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
+ // Returns the T returned by PostVisit on re.
+ T Walk(Regexp* re, T top_arg);
+
+ // Like Walk, but doesn't use Copy. This can lead to
+ // exponential runtimes on cross-linked Regexps like the
+ // ones generated by Simplify. To help limit this,
+ // at most max_visits nodes will be visited and then
+ // the walk will be cut off early.
+ // If the walk *is* cut off early, ShortVisit(re)
+ // will be called on regexps that cannot be fully
+ // visited rather than calling PreVisit/PostVisit.
+ T WalkExponential(Regexp* re, T top_arg, int max_visits);
+
+ // Clears the stack. Should never be necessary, since
+ // Walk always enters and exits with an empty stack.
+ // Logs DFATAL if stack is not already clear.
+ void Reset();
+
+ // Returns whether walk was cut off.
+ bool stopped_early() { return stopped_early_; }
+
+ private:
+ // Walk state for the entire traversal.
std::stack<WalkState<T>> stack_;
- bool stopped_early_;
- int max_visits_;
-
- T WalkInternal(Regexp* re, T top_arg, bool use_copy);
-
+ bool stopped_early_;
+ int max_visits_;
+
+ T WalkInternal(Regexp* re, T top_arg, bool use_copy);
+
Walker(const Walker&) = delete;
Walker& operator=(const Walker&) = delete;
-};
-
-template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
- T parent_arg,
- bool* stop) {
- return parent_arg;
-}
-
-template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
- T parent_arg,
- T pre_arg,
- T* child_args,
- int nchild_args) {
- return pre_arg;
-}
-
-template<typename T> T Regexp::Walker<T>::Copy(T arg) {
- return arg;
-}
-
-// State about a single level in the traversal.
-template<typename T> struct WalkState {
+};
+
+template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
+ T parent_arg,
+ bool* stop) {
+ return parent_arg;
+}
+
+template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
+ T parent_arg,
+ T pre_arg,
+ T* child_args,
+ int nchild_args) {
+ return pre_arg;
+}
+
+template<typename T> T Regexp::Walker<T>::Copy(T arg) {
+ return arg;
+}
+
+// State about a single level in the traversal.
+template<typename T> struct WalkState {
WalkState(Regexp* re, T parent)
- : re(re),
- n(-1),
- parent_arg(parent),
- child_args(NULL) { }
-
- Regexp* re; // The regexp
- int n; // The index of the next child to process; -1 means need to PreVisit
- T parent_arg; // Accumulated arguments.
- T pre_arg;
- T child_arg; // One-element buffer for child_args.
- T* child_args;
-};
-
-template<typename T> Regexp::Walker<T>::Walker() {
- stopped_early_ = false;
-}
-
-template<typename T> Regexp::Walker<T>::~Walker() {
- Reset();
-}
-
-// Clears the stack. Should never be necessary, since
-// Walk always enters and exits with an empty stack.
-// Logs DFATAL if stack is not already clear.
-template<typename T> void Regexp::Walker<T>::Reset() {
+ : re(re),
+ n(-1),
+ parent_arg(parent),
+ child_args(NULL) { }
+
+ Regexp* re; // The regexp
+ int n; // The index of the next child to process; -1 means need to PreVisit
+ T parent_arg; // Accumulated arguments.
+ T pre_arg;
+ T child_arg; // One-element buffer for child_args.
+ T* child_args;
+};
+
+template<typename T> Regexp::Walker<T>::Walker() {
+ stopped_early_ = false;
+}
+
+template<typename T> Regexp::Walker<T>::~Walker() {
+ Reset();
+}
+
+// Clears the stack. Should never be necessary, since
+// Walk always enters and exits with an empty stack.
+// Logs DFATAL if stack is not already clear.
+template<typename T> void Regexp::Walker<T>::Reset() {
if (!stack_.empty()) {
- LOG(DFATAL) << "Stack not empty.";
+ LOG(DFATAL) << "Stack not empty.";
while (!stack_.empty()) {
if (stack_.top().re->nsub_ > 1)
delete[] stack_.top().child_args;
stack_.pop();
- }
- }
-}
-
-template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
- bool use_copy) {
- Reset();
-
- if (re == NULL) {
- LOG(DFATAL) << "Walk NULL";
- return top_arg;
- }
-
+ }
+ }
+}
+
+template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
+ bool use_copy) {
+ Reset();
+
+ if (re == NULL) {
+ LOG(DFATAL) << "Walk NULL";
+ return top_arg;
+ }
+
stack_.push(WalkState<T>(re, top_arg));
-
- WalkState<T>* s;
- for (;;) {
- T t;
+
+ WalkState<T>* s;
+ for (;;) {
+ T t;
s = &stack_.top();
re = s->re;
- switch (s->n) {
- case -1: {
- if (--max_visits_ < 0) {
- stopped_early_ = true;
- t = ShortVisit(re, s->parent_arg);
- break;
- }
- bool stop = false;
- s->pre_arg = PreVisit(re, s->parent_arg, &stop);
- if (stop) {
- t = s->pre_arg;
- break;
- }
- s->n = 0;
- s->child_args = NULL;
- if (re->nsub_ == 1)
- s->child_args = &s->child_arg;
- else if (re->nsub_ > 1)
- s->child_args = new T[re->nsub_];
+ switch (s->n) {
+ case -1: {
+ if (--max_visits_ < 0) {
+ stopped_early_ = true;
+ t = ShortVisit(re, s->parent_arg);
+ break;
+ }
+ bool stop = false;
+ s->pre_arg = PreVisit(re, s->parent_arg, &stop);
+ if (stop) {
+ t = s->pre_arg;
+ break;
+ }
+ s->n = 0;
+ s->child_args = NULL;
+ if (re->nsub_ == 1)
+ s->child_args = &s->child_arg;
+ else if (re->nsub_ > 1)
+ s->child_args = new T[re->nsub_];
FALLTHROUGH_INTENDED;
- }
- default: {
- if (re->nsub_ > 0) {
- Regexp** sub = re->sub();
- if (s->n < re->nsub_) {
- if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
- s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
- s->n++;
- } else {
+ }
+ default: {
+ if (re->nsub_ > 0) {
+ Regexp** sub = re->sub();
+ if (s->n < re->nsub_) {
+ if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
+ s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
+ s->n++;
+ } else {
stack_.push(WalkState<T>(sub[s->n], s->pre_arg));
- }
- continue;
- }
- }
-
- t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
- if (re->nsub_ > 1)
- delete[] s->child_args;
- break;
- }
- }
-
+ }
+ continue;
+ }
+ }
+
+ t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
+ if (re->nsub_ > 1)
+ delete[] s->child_args;
+ break;
+ }
+ }
+
// We've finished stack_.top().
- // Update next guy down.
+ // Update next guy down.
stack_.pop();
if (stack_.empty())
- return t;
+ return t;
s = &stack_.top();
- if (s->child_args != NULL)
- s->child_args[s->n] = t;
- else
- s->child_arg = t;
- s->n++;
- }
-}
-
-template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
- // Without the exponential walking behavior,
- // this budget should be more than enough for any
- // regexp, and yet not enough to get us in trouble
- // as far as CPU time.
- max_visits_ = 1000000;
- return WalkInternal(re, top_arg, true);
-}
-
-template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
- int max_visits) {
- max_visits_ = max_visits;
- return WalkInternal(re, top_arg, false);
-}
-
-} // namespace re2
-
+ if (s->child_args != NULL)
+ s->child_args[s->n] = t;
+ else
+ s->child_arg = t;
+ s->n++;
+ }
+}
+
+template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
+ // Without the exponential walking behavior,
+ // this budget should be more than enough for any
+ // regexp, and yet not enough to get us in trouble
+ // as far as CPU time.
+ max_visits_ = 1000000;
+ return WalkInternal(re, top_arg, true);
+}
+
+template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
+ int max_visits) {
+ max_visits_ = max_visits;
+ return WalkInternal(re, top_arg, false);
+}
+
+} // namespace re2
+
#endif // RE2_WALKER_INL_H_
diff --git a/contrib/libs/re2/util/rune.cc b/contrib/libs/re2/util/rune.cc
index 824656f776..4f625ea380 100644
--- a/contrib/libs/re2/util/rune.cc
+++ b/contrib/libs/re2/util/rune.cc
@@ -1,260 +1,260 @@
-/*
- * The authors of this software are Rob Pike and Ken Thompson.
- * Copyright (c) 2002 by Lucent Technologies.
- * Permission to use, copy, modify, and distribute this software for any
- * purpose without fee is hereby granted, provided that this entire notice
- * is included in all copies of any software which is or includes a copy
- * or modification of this software and in all copies of the supporting
- * documentation for such software.
- * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
- * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
- * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
- */
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ * Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
-#include <stdarg.h>
-#include <string.h>
+#include <stdarg.h>
+#include <string.h>
-#include "util/utf.h"
-
-namespace re2 {
-
-enum
-{
- Bit1 = 7,
- Bitx = 6,
- Bit2 = 5,
- Bit3 = 4,
- Bit4 = 3,
- Bit5 = 2,
-
- T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
- Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
- T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
- T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
- T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
- T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
-
- Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
- Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
- Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
- Rune4 = (1<<(Bit4+3*Bitx))-1,
- /* 0001 1111 1111 1111 1111 1111 */
-
- Maskx = (1<<Bitx)-1, /* 0011 1111 */
- Testx = Maskx ^ 0xFF, /* 1100 0000 */
-
- Bad = Runeerror,
-};
-
-int
-chartorune(Rune *rune, const char *str)
-{
- int c, c1, c2, c3;
- long l;
-
- /*
- * one character sequence
- * 00000-0007F => T1
- */
- c = *(unsigned char*)str;
- if(c < Tx) {
- *rune = c;
- return 1;
- }
-
- /*
- * two character sequence
- * 0080-07FF => T2 Tx
- */
- c1 = *(unsigned char*)(str+1) ^ Tx;
- if(c1 & Testx)
- goto bad;
- if(c < T3) {
- if(c < T2)
- goto bad;
- l = ((c << Bitx) | c1) & Rune2;
- if(l <= Rune1)
- goto bad;
- *rune = l;
- return 2;
- }
-
- /*
- * three character sequence
- * 0800-FFFF => T3 Tx Tx
- */
- c2 = *(unsigned char*)(str+2) ^ Tx;
- if(c2 & Testx)
- goto bad;
- if(c < T4) {
- l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
- if(l <= Rune2)
- goto bad;
- *rune = l;
- return 3;
- }
-
- /*
- * four character sequence (21-bit value)
- * 10000-1FFFFF => T4 Tx Tx Tx
- */
- c3 = *(unsigned char*)(str+3) ^ Tx;
- if (c3 & Testx)
- goto bad;
- if (c < T5) {
- l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
- if (l <= Rune3)
- goto bad;
- *rune = l;
- return 4;
- }
-
- /*
- * Support for 5-byte or longer UTF-8 would go here, but
- * since we don't have that, we'll just fall through to bad.
- */
-
- /*
- * bad decoding
- */
-bad:
- *rune = Bad;
- return 1;
-}
-
-int
-runetochar(char *str, const Rune *rune)
-{
- /* Runes are signed, so convert to unsigned for range check. */
- unsigned long c;
-
- /*
- * one character sequence
- * 00000-0007F => 00-7F
- */
- c = *rune;
- if(c <= Rune1) {
+#include "util/utf.h"
+
+namespace re2 {
+
+enum
+{
+ Bit1 = 7,
+ Bitx = 6,
+ Bit2 = 5,
+ Bit3 = 4,
+ Bit4 = 3,
+ Bit5 = 2,
+
+ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
+ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
+ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
+ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
+ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
+
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1,
+ /* 0001 1111 1111 1111 1111 1111 */
+
+ Maskx = (1<<Bitx)-1, /* 0011 1111 */
+ Testx = Maskx ^ 0xFF, /* 1100 0000 */
+
+ Bad = Runeerror,
+};
+
+int
+chartorune(Rune *rune, const char *str)
+{
+ int c, c1, c2, c3;
+ long l;
+
+ /*
+ * one character sequence
+ * 00000-0007F => T1
+ */
+ c = *(unsigned char*)str;
+ if(c < Tx) {
+ *rune = c;
+ return 1;
+ }
+
+ /*
+ * two character sequence
+ * 0080-07FF => T2 Tx
+ */
+ c1 = *(unsigned char*)(str+1) ^ Tx;
+ if(c1 & Testx)
+ goto bad;
+ if(c < T3) {
+ if(c < T2)
+ goto bad;
+ l = ((c << Bitx) | c1) & Rune2;
+ if(l <= Rune1)
+ goto bad;
+ *rune = l;
+ return 2;
+ }
+
+ /*
+ * three character sequence
+ * 0800-FFFF => T3 Tx Tx
+ */
+ c2 = *(unsigned char*)(str+2) ^ Tx;
+ if(c2 & Testx)
+ goto bad;
+ if(c < T4) {
+ l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+ if(l <= Rune2)
+ goto bad;
+ *rune = l;
+ return 3;
+ }
+
+ /*
+ * four character sequence (21-bit value)
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ c3 = *(unsigned char*)(str+3) ^ Tx;
+ if (c3 & Testx)
+ goto bad;
+ if (c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if (l <= Rune3)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+
+ /*
+ * Support for 5-byte or longer UTF-8 would go here, but
+ * since we don't have that, we'll just fall through to bad.
+ */
+
+ /*
+ * bad decoding
+ */
+bad:
+ *rune = Bad;
+ return 1;
+}
+
+int
+runetochar(char *str, const Rune *rune)
+{
+ /* Runes are signed, so convert to unsigned for range check. */
+ unsigned long c;
+
+ /*
+ * one character sequence
+ * 00000-0007F => 00-7F
+ */
+ c = *rune;
+ if(c <= Rune1) {
str[0] = static_cast<char>(c);
- return 1;
- }
-
- /*
- * two character sequence
- * 0080-07FF => T2 Tx
- */
- if(c <= Rune2) {
+ return 1;
+ }
+
+ /*
+ * two character sequence
+ * 0080-07FF => T2 Tx
+ */
+ if(c <= Rune2) {
str[0] = T2 | static_cast<char>(c >> 1*Bitx);
- str[1] = Tx | (c & Maskx);
- return 2;
- }
-
- /*
- * If the Rune is out of range, convert it to the error rune.
- * Do this test here because the error rune encodes to three bytes.
- * Doing it earlier would duplicate work, since an out of range
- * Rune wouldn't have fit in one or two bytes.
- */
- if (c > Runemax)
- c = Runeerror;
-
- /*
- * three character sequence
- * 0800-FFFF => T3 Tx Tx
- */
- if (c <= Rune3) {
+ str[1] = Tx | (c & Maskx);
+ return 2;
+ }
+
+ /*
+ * If the Rune is out of range, convert it to the error rune.
+ * Do this test here because the error rune encodes to three bytes.
+ * Doing it earlier would duplicate work, since an out of range
+ * Rune wouldn't have fit in one or two bytes.
+ */
+ if (c > Runemax)
+ c = Runeerror;
+
+ /*
+ * three character sequence
+ * 0800-FFFF => T3 Tx Tx
+ */
+ if (c <= Rune3) {
str[0] = T3 | static_cast<char>(c >> 2*Bitx);
- str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
- return 3;
- }
-
- /*
- * four character sequence (21-bit value)
- * 10000-1FFFFF => T4 Tx Tx Tx
- */
+ return 3;
+ }
+
+ /*
+ * four character sequence (21-bit value)
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
str[0] = T4 | static_cast<char>(c >> 3*Bitx);
- str[1] = Tx | ((c >> 2*Bitx) & Maskx);
- str[2] = Tx | ((c >> 1*Bitx) & Maskx);
- str[3] = Tx | (c & Maskx);
- return 4;
-}
-
-int
-runelen(Rune rune)
-{
- char str[10];
-
- return runetochar(str, &rune);
-}
-
-int
-fullrune(const char *str, int n)
-{
- if (n > 0) {
- int c = *(unsigned char*)str;
- if (c < Tx)
- return 1;
- if (n > 1) {
- if (c < T3)
- return 1;
- if (n > 2) {
- if (c < T4 || n > 3)
- return 1;
- }
- }
- }
- return 0;
-}
-
-
-int
-utflen(const char *s)
-{
- int c;
- long n;
- Rune rune;
-
- n = 0;
- for(;;) {
- c = *(unsigned char*)s;
- if(c < Runeself) {
- if(c == 0)
- return n;
- s++;
- } else
- s += chartorune(&rune, s);
- n++;
- }
- return 0;
-}
-
-char*
-utfrune(const char *s, Rune c)
-{
- long c1;
- Rune r;
- int n;
-
- if(c < Runesync) /* not part of utf sequence */
- return strchr((char*)s, c);
-
- for(;;) {
- c1 = *(unsigned char*)s;
- if(c1 < Runeself) { /* one byte rune */
- if(c1 == 0)
- return 0;
- if(c1 == c)
- return (char*)s;
- s++;
- continue;
- }
- n = chartorune(&r, s);
- if(r == c)
- return (char*)s;
- s += n;
- }
- return 0;
-}
-
-} // namespace re2
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
+}
+
+int
+runelen(Rune rune)
+{
+ char str[10];
+
+ return runetochar(str, &rune);
+}
+
+int
+fullrune(const char *str, int n)
+{
+ if (n > 0) {
+ int c = *(unsigned char*)str;
+ if (c < Tx)
+ return 1;
+ if (n > 1) {
+ if (c < T3)
+ return 1;
+ if (n > 2) {
+ if (c < T4 || n > 3)
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+int
+utflen(const char *s)
+{
+ int c;
+ long n;
+ Rune rune;
+
+ n = 0;
+ for(;;) {
+ c = *(unsigned char*)s;
+ if(c < Runeself) {
+ if(c == 0)
+ return n;
+ s++;
+ } else
+ s += chartorune(&rune, s);
+ n++;
+ }
+ return 0;
+}
+
+char*
+utfrune(const char *s, Rune c)
+{
+ long c1;
+ Rune r;
+ int n;
+
+ if(c < Runesync) /* not part of utf sequence */
+ return strchr((char*)s, c);
+
+ for(;;) {
+ c1 = *(unsigned char*)s;
+ if(c1 < Runeself) { /* one byte rune */
+ if(c1 == 0)
+ return 0;
+ if(c1 == c)
+ return (char*)s;
+ s++;
+ continue;
+ }
+ n = chartorune(&r, s);
+ if(r == c)
+ return (char*)s;
+ s += n;
+ }
+ return 0;
+}
+
+} // namespace re2
diff --git a/contrib/libs/re2/util/strutil.cc b/contrib/libs/re2/util/strutil.cc
index f151ab1b80..fb7e6b1b0c 100644
--- a/contrib/libs/re2/util/strutil.cc
+++ b/contrib/libs/re2/util/strutil.cc
@@ -1,10 +1,10 @@
-// Copyright 1999-2005 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
+// Copyright 1999-2005 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
#include <stdarg.h>
#include <stdio.h>
-
+
#include "util/strutil.h"
#ifdef _WIN32
@@ -12,86 +12,86 @@
#define vsnprintf _vsnprintf
#endif
-namespace re2 {
-
-// ----------------------------------------------------------------------
-// CEscapeString()
-// Copies 'src' to 'dest', escaping dangerous characters using
-// C-style escape sequences. 'src' and 'dest' should not overlap.
-// Returns the number of bytes written to 'dest' (not including the \0)
+namespace re2 {
+
+// ----------------------------------------------------------------------
+// CEscapeString()
+// Copies 'src' to 'dest', escaping dangerous characters using
+// C-style escape sequences. 'src' and 'dest' should not overlap.
+// Returns the number of bytes written to 'dest' (not including the \0)
// or (size_t)-1 if there was insufficient space.
-// ----------------------------------------------------------------------
+// ----------------------------------------------------------------------
static size_t CEscapeString(const char* src, size_t src_len,
char* dest, size_t dest_len) {
- const char* src_end = src + src_len;
+ const char* src_end = src + src_len;
size_t used = 0;
-
- for (; src < src_end; src++) {
+
+ for (; src < src_end; src++) {
if (dest_len - used < 2) // space for two-character escape
return (size_t)-1;
-
- unsigned char c = *src;
- switch (c) {
- case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
- case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
- case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
- case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
- case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
- case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
- default:
- // Note that if we emit \xNN and the src character after that is a hex
- // digit then that digit must be escaped too to prevent it being
- // interpreted as part of the character code by C.
- if (c < ' ' || c > '~') {
+
+ unsigned char c = *src;
+ switch (c) {
+ case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
+ case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
+ case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
+ case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
+ case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
+ case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
+ default:
+ // Note that if we emit \xNN and the src character after that is a hex
+ // digit then that digit must be escaped too to prevent it being
+ // interpreted as part of the character code by C.
+ if (c < ' ' || c > '~') {
if (dest_len - used < 5) // space for four-character escape + \0
return (size_t)-1;
snprintf(dest + used, 5, "\\%03o", c);
- used += 4;
- } else {
- dest[used++] = c; break;
- }
- }
- }
-
- if (dest_len - used < 1) // make sure that there is room for \0
+ used += 4;
+ } else {
+ dest[used++] = c; break;
+ }
+ }
+ }
+
+ if (dest_len - used < 1) // make sure that there is room for \0
return (size_t)-1;
-
- dest[used] = '\0'; // doesn't count towards return value though
- return used;
-}
-
-// ----------------------------------------------------------------------
-// CEscape()
-// Copies 'src' to result, escaping dangerous characters using
-// C-style escape sequences. 'src' and 'dest' should not overlap.
-// ----------------------------------------------------------------------
+
+ dest[used] = '\0'; // doesn't count towards return value though
+ return used;
+}
+
+// ----------------------------------------------------------------------
+// CEscape()
+// Copies 'src' to result, escaping dangerous characters using
+// C-style escape sequences. 'src' and 'dest' should not overlap.
+// ----------------------------------------------------------------------
std::string CEscape(const StringPiece& src) {
const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion
char* dest = new char[dest_len];
const size_t used = CEscapeString(src.data(), src.size(),
dest, dest_len);
std::string s = std::string(dest, used);
- delete[] dest;
- return s;
-}
-
+ delete[] dest;
+ return s;
+}
+
void PrefixSuccessor(std::string* prefix) {
- // We can increment the last character in the string and be done
- // unless that character is 255, in which case we have to erase the
- // last character and increment the previous character, unless that
- // is 255, etc. If the string is empty or consists entirely of
- // 255's, we just return the empty string.
+ // We can increment the last character in the string and be done
+ // unless that character is 255, in which case we have to erase the
+ // last character and increment the previous character, unless that
+ // is 255, etc. If the string is empty or consists entirely of
+ // 255's, we just return the empty string.
while (!prefix->empty()) {
char& c = prefix->back();
if (c == '\xff') { // char literal avoids signed/unsigned.
prefix->pop_back();
- } else {
+ } else {
++c;
break;
- }
- }
-}
-
+ }
+ }
+}
+
static void StringAppendV(std::string* dst, const char* format, va_list ap) {
// First try with a small fixed size buffer
char space[1024];
@@ -146,4 +146,4 @@ std::string StringPrintf(const char* format, ...) {
return result;
}
-} // namespace re2
+} // namespace re2
diff --git a/contrib/libs/re2/util/utf.h b/contrib/libs/re2/util/utf.h
index f29404a561..85b4297239 100644
--- a/contrib/libs/re2/util/utf.h
+++ b/contrib/libs/re2/util/utf.h
@@ -1,44 +1,44 @@
-/*
- * The authors of this software are Rob Pike and Ken Thompson.
- * Copyright (c) 2002 by Lucent Technologies.
- * Permission to use, copy, modify, and distribute this software for any
- * purpose without fee is hereby granted, provided that this entire notice
- * is included in all copies of any software which is or includes a copy
- * or modification of this software and in all copies of the supporting
- * documentation for such software.
- * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
- * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
- * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
- *
- * This file and rune.cc have been converted to compile as C++ code
- * in name space re2.
- */
-
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ * Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ *
+ * This file and rune.cc have been converted to compile as C++ code
+ * in name space re2.
+ */
+
#ifndef UTIL_UTF_H_
#define UTIL_UTF_H_
#include <stdint.h>
-
-namespace re2 {
-
-typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
-
-enum
-{
- UTFmax = 4, /* maximum bytes per rune */
- Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
- Runeself = 0x80, /* rune and UTF sequences are the same (<) */
- Runeerror = 0xFFFD, /* decoding error in UTF */
- Runemax = 0x10FFFF, /* maximum rune value */
-};
-
-int runetochar(char* s, const Rune* r);
-int chartorune(Rune* r, const char* s);
-int fullrune(const char* s, int n);
-int utflen(const char* s);
-char* utfrune(const char*, Rune);
-
-} // namespace re2
-
+
+namespace re2 {
+
+typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
+
+enum
+{
+ UTFmax = 4, /* maximum bytes per rune */
+ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
+ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
+ Runeerror = 0xFFFD, /* decoding error in UTF */
+ Runemax = 0x10FFFF, /* maximum rune value */
+};
+
+int runetochar(char* s, const Rune* r);
+int chartorune(Rune* r, const char* s);
+int fullrune(const char* s, int n);
+int utflen(const char* s);
+char* utfrune(const char*, Rune);
+
+} // namespace re2
+
#endif // UTIL_UTF_H_
diff --git a/contrib/libs/re2/ya.make b/contrib/libs/re2/ya.make
index 0f49b2c6b5..8072de2eb2 100644
--- a/contrib/libs/re2/ya.make
+++ b/contrib/libs/re2/ya.make
@@ -1,11 +1,11 @@
# Generated by devtools/yamaker from nixpkgs 21.11.
-LIBRARY()
-
+LIBRARY()
+
OWNER(g:cpp-contrib)
VERSION(2022-02-01)
-
+
ORIGINAL_SOURCE(https://github.com/google/re2/archive/2022-02-01.tar.gz)
LICENSE(
@@ -19,7 +19,7 @@ ADDINCL(
GLOBAL contrib/libs/re2/include
contrib/libs/re2
)
-
+
NO_COMPILER_WARNINGS()
IF (WITH_VALGRIND)
@@ -28,7 +28,7 @@ IF (WITH_VALGRIND)
)
ENDIF()
-SRCS(
+SRCS(
re2/bitstate.cc
re2/compile.cc
re2/dfa.cc
@@ -51,9 +51,9 @@ SRCS(
re2/unicode_groups.cc
util/rune.cc
util/strutil.cc
-)
-
-END()
+)
+
+END()
RECURSE(
re2/testing