diff options
author | antonovvk <antonovvk@yandex-team.ru> | 2022-02-10 16:47:52 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:52 +0300 |
commit | 37de222addabbef336dcaaea5f7c7645a629fc6d (patch) | |
tree | c0748b5dcbade83af788c0abfa89c0383d6b779c /contrib/libs/re2 | |
parent | 37a63debdc21e372d99e1808cdd31aecf75018c3 (diff) | |
download | ydb-37de222addabbef336dcaaea5f7c7645a629fc6d.tar.gz |
Restoring authorship annotation for <antonovvk@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/re2')
31 files changed, 10666 insertions, 10666 deletions
diff --git a/contrib/libs/re2/re2/bitstate.cc b/contrib/libs/re2/re2/bitstate.cc index ab4e75f6e5..877e548234 100644 --- a/contrib/libs/re2/re2/bitstate.cc +++ b/contrib/libs/re2/re2/bitstate.cc @@ -1,22 +1,22 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Tested by search_test.cc, exhaustive_test.cc, tester.cc - -// Prog::SearchBitState is a regular expression search with submatch +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Tested by search_test.cc, exhaustive_test.cc, tester.cc + +// Prog::SearchBitState is a regular expression search with submatch // tracking for small regular expressions and texts. Similarly to // testing/backtrack.cc, it allocates a bitmap with (count of // lists) * (length of text) bits to make sure it never explores the // same (instruction list, character position) multiple times. This -// limits the search to run in time linear in the length of the text. -// -// Unlike testing/backtrack.cc, SearchBitState is not recursive -// on the text. -// -// SearchBitState is a fast replacement for the NFA code on small -// regexps and texts when SearchOnePass cannot be used. - +// limits the search to run in time linear in the length of the text. +// +// Unlike testing/backtrack.cc, SearchBitState is not recursive +// on the text. +// +// SearchBitState is a fast replacement for the NFA code on small +// regexps and texts when SearchOnePass cannot be used. + #include <stddef.h> #include <stdint.h> #include <string.h> @@ -25,84 +25,84 @@ #include "util/logging.h" #include "re2/pod_array.h" -#include "re2/prog.h" -#include "re2/regexp.h" - -namespace re2 { - -struct Job { - int id; +#include "re2/prog.h" +#include "re2/regexp.h" + +namespace re2 { + +struct Job { + int id; int rle; // run length encoding - const char* p; -}; - -class BitState { - public: - explicit BitState(Prog* prog); - - // The usual Search prototype. - // Can only call Search once per BitState. - bool Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch); - - private: - inline bool ShouldVisit(int id, const char* p); + const char* p; +}; + +class BitState { + public: + explicit BitState(Prog* prog); + + // The usual Search prototype. + // Can only call Search once per BitState. + bool Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch); + + private: + inline bool ShouldVisit(int id, const char* p); void Push(int id, const char* p); void GrowStack(); - bool TrySearch(int id, const char* p); - - // Search parameters - Prog* prog_; // program being run - StringPiece text_; // text being searched - StringPiece context_; // greater context of text being searched - bool anchored_; // whether search is anchored at text.begin() - bool longest_; // whether search wants leftmost-longest match - bool endmatch_; // whether match must end at text.end() + bool TrySearch(int id, const char* p); + + // Search parameters + Prog* prog_; // program being run + StringPiece text_; // text being searched + StringPiece context_; // greater context of text being searched + bool anchored_; // whether search is anchored at text.begin() + bool longest_; // whether search wants leftmost-longest match + bool endmatch_; // whether match must end at text.end() StringPiece* submatch_; // submatches to fill in - int nsubmatch_; // # of submatches to fill in - - // Search state + int nsubmatch_; // # of submatches to fill in + + // Search state static constexpr int kVisitedBits = 64; PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited PODArray<const char*> cap_; // capture registers PODArray<Job> job_; // stack of text positions to explore int njob_; // stack size - + BitState(const BitState&) = delete; BitState& operator=(const BitState&) = delete; -}; - -BitState::BitState(Prog* prog) - : prog_(prog), - anchored_(false), - longest_(false), - endmatch_(false), - submatch_(NULL), - nsubmatch_(0), +}; + +BitState::BitState(Prog* prog) + : prog_(prog), + anchored_(false), + longest_(false), + endmatch_(false), + submatch_(NULL), + nsubmatch_(0), njob_(0) { -} - +} + // Given id, which *must* be a list head, we can look up its list ID. // Then the question is: Should the search visit the (list ID, p) pair? -// If so, remember that it was visited so that the next time, -// we don't repeat the visit. -bool BitState::ShouldVisit(int id, const char* p) { +// If so, remember that it was visited so that the next time, +// we don't repeat the visit. +bool BitState::ShouldVisit(int id, const char* p) { int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) + static_cast<int>(p-text_.data()); if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1)))) - return false; + return false; visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1)); - return true; -} - -// Grow the stack. + return true; +} + +// Grow the stack. void BitState::GrowStack() { PODArray<Job> tmp(2*job_.size()); memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]); job_ = std::move(tmp); -} - +} + // Push (id, p) onto the stack, growing it if necessary. void BitState::Push(int id, const char* p) { if (njob_ >= job_.size()) { @@ -111,10 +111,10 @@ void BitState::Push(int id, const char* p) { LOG(DFATAL) << "GrowStack() failed: " << "njob_ = " << njob_ << ", " << "job_.size() = " << job_.size(); - return; + return; } - } - + } + // If id < 0, it's undoing a Capture, // so we mustn't interfere with that. if (id >= 0 && njob_ > 0) { @@ -126,30 +126,30 @@ void BitState::Push(int id, const char* p) { return; } } - + Job* top = &job_[njob_++]; top->id = id; top->rle = 0; top->p = p; -} - -// Try a search from instruction id0 in state p0. -// Return whether it succeeded. -bool BitState::TrySearch(int id0, const char* p0) { - bool matched = false; +} + +// Try a search from instruction id0 in state p0. +// Return whether it succeeded. +bool BitState::TrySearch(int id0, const char* p0) { + bool matched = false; const char* end = text_.data() + text_.size(); - njob_ = 0; + njob_ = 0; // Push() no longer checks ShouldVisit(), // so we must perform the check ourselves. if (ShouldVisit(id0, p0)) Push(id0, p0); - while (njob_ > 0) { - // Pop job off stack. - --njob_; - int id = job_[njob_].id; + while (njob_ > 0) { + // Pop job off stack. + --njob_; + int id = job_[njob_].id; int& rle = job_[njob_].rle; - const char* p = job_[njob_].p; - + const char* p = job_[njob_].p; + if (id < 0) { // Undo the Capture. cap_[prog_->inst(-id)->cap()] = p; @@ -161,16 +161,16 @@ bool BitState::TrySearch(int id0, const char* p0) { // Revivify job on stack. --rle; ++njob_; - } - + } + Loop: // Visit id, p. - Prog::Inst* ip = prog_->inst(id); - switch (ip->opcode()) { - default: + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + default: LOG(DFATAL) << "Unexpected opcode: " << ip->opcode(); - return false; - + return false; + case kInstFail: break; @@ -180,7 +180,7 @@ bool BitState::TrySearch(int id0, const char* p0) { id = ip->out1(); p = end; goto Loop; - } + } if (longest_) { // ip must be non-greedy... // out is the Match instruction. @@ -189,11 +189,11 @@ bool BitState::TrySearch(int id0, const char* p0) { goto Loop; } goto Next; - - case kInstByteRange: { - int c = -1; - if (p < end) - c = *p & 0xFF; + + case kInstByteRange: { + int c = -1; + if (p < end) + c = *p & 0xFF; if (!ip->Matches(c)) goto Next; @@ -202,9 +202,9 @@ bool BitState::TrySearch(int id0, const char* p0) { id = ip->out(); p++; goto CheckAndLoop; - } - - case kInstCapture: + } + + case kInstCapture: if (!ip->last()) Push(id+1, p); // try the next when we're done @@ -217,20 +217,20 @@ bool BitState::TrySearch(int id0, const char* p0) { id = ip->out(); goto CheckAndLoop; - case kInstEmptyWidth: - if (ip->empty() & ~Prog::EmptyFlags(context_, p)) + case kInstEmptyWidth: + if (ip->empty() & ~Prog::EmptyFlags(context_, p)) goto Next; if (!ip->last()) Push(id+1, p); // try the next when we're done - id = ip->out(); - goto CheckAndLoop; - - case kInstNop: + id = ip->out(); + goto CheckAndLoop; + + case kInstNop: if (!ip->last()) Push(id+1, p); // try the next when we're done - id = ip->out(); - + id = ip->out(); + CheckAndLoop: // Sanity check: id is the head of its list, which must // be the case if id-1 is the last of *its* list. :) @@ -239,37 +239,37 @@ bool BitState::TrySearch(int id0, const char* p0) { goto Loop; break; - case kInstMatch: { + case kInstMatch: { if (endmatch_ && p != end) goto Next; - - // We found a match. If the caller doesn't care - // where the match is, no point going further. - if (nsubmatch_ == 0) - return true; - - // Record best match so far. - // Only need to check end point, because this entire - // call is only considering one start position. - matched = true; - cap_[1] = p; - if (submatch_[0].data() == NULL || + + // We found a match. If the caller doesn't care + // where the match is, no point going further. + if (nsubmatch_ == 0) + return true; + + // Record best match so far. + // Only need to check end point, because this entire + // call is only considering one start position. + matched = true; + cap_[1] = p; + if (submatch_[0].data() == NULL || (longest_ && p > submatch_[0].data() + submatch_[0].size())) { - for (int i = 0; i < nsubmatch_; i++) + for (int i = 0; i < nsubmatch_; i++) submatch_[i] = StringPiece(cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i])); - } - - // If going for first match, we're done. - if (!longest_) - return true; - - // If we used the entire text, no longer match is possible. + } + + // If going for first match, we're done. + if (!longest_) + return true; + + // If we used the entire text, no longer match is possible. if (p == end) - return true; - - // Otherwise, continue on in hope of a longer match. + return true; + + // Otherwise, continue on in hope of a longer match. // Note the absence of the ShouldVisit() check here // due to execution remaining in the same list. Next: @@ -278,60 +278,60 @@ bool BitState::TrySearch(int id0, const char* p0) { goto Loop; } break; - } - } - } - return matched; -} - -// Search text (within context) for prog_. -bool BitState::Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch) { - // Search parameters. - text_ = text; - context_ = context; + } + } + } + return matched; +} + +// Search text (within context) for prog_. +bool BitState::Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch) { + // Search parameters. + text_ = text; + context_ = context; if (context_.data() == NULL) - context_ = text; + context_ = text; if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text)) - return false; + return false; if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text)) - return false; - anchored_ = anchored || prog_->anchor_start(); - longest_ = longest || prog_->anchor_end(); - endmatch_ = prog_->anchor_end(); - submatch_ = submatch; - nsubmatch_ = nsubmatch; - for (int i = 0; i < nsubmatch_; i++) + return false; + anchored_ = anchored || prog_->anchor_start(); + longest_ = longest || prog_->anchor_end(); + endmatch_ = prog_->anchor_end(); + submatch_ = submatch; + nsubmatch_ = nsubmatch; + for (int i = 0; i < nsubmatch_; i++) submatch_[i] = StringPiece(); - - // Allocate scratch space. + + // Allocate scratch space. int nvisited = prog_->list_count() * static_cast<int>(text.size()+1); nvisited = (nvisited + kVisitedBits-1) / kVisitedBits; visited_ = PODArray<uint64_t>(nvisited); memset(visited_.data(), 0, nvisited*sizeof visited_[0]); - + int ncap = 2*nsubmatch; if (ncap < 2) ncap = 2; cap_ = PODArray<const char*>(ncap); memset(cap_.data(), 0, ncap*sizeof cap_[0]); - + // When sizeof(Job) == 16, we start with a nice round 1KiB. :) job_ = PODArray<Job>(64); - - // Anchored search must start at text.begin(). - if (anchored_) { + + // Anchored search must start at text.begin(). + if (anchored_) { cap_[0] = text.data(); return TrySearch(prog_->start(), text.data()); - } - - // Unanchored search, starting from each possible text position. - // Notice that we have to try the empty string at the end of - // the text, so the loop condition is p <= text.end(), not p < text.end(). - // This looks like it's quadratic in the size of the text, - // but we are not clearing visited_ between calls to TrySearch, - // so no work is duplicated and it ends up still being linear. + } + + // Unanchored search, starting from each possible text position. + // Notice that we have to try the empty string at the end of + // the text, so the loop condition is p <= text.end(), not p < text.end(). + // This looks like it's quadratic in the size of the text, + // but we are not clearing visited_ between calls to TrySearch, + // so no work is duplicated and it ends up still being linear. const char* etext = text.data() + text.size(); for (const char* p = text.data(); p <= etext; p++) { // Try to use prefix accel (e.g. memchr) to skip ahead. @@ -341,45 +341,45 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, p = etext; } - cap_[0] = p; - if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. - return true; + cap_[0] = p; + if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. + return true; // Avoid invoking undefined behavior (arithmetic on a null pointer) // by simply not continuing the loop. if (p == NULL) break; - } - return false; -} - -// Bit-state search. -bool Prog::SearchBitState(const StringPiece& text, - const StringPiece& context, - Anchor anchor, - MatchKind kind, - StringPiece* match, - int nmatch) { - // If full match, we ask for an anchored longest match - // and then check that match[0] == text. - // So make sure match[0] exists. - StringPiece sp0; - if (kind == kFullMatch) { - anchor = kAnchored; - if (nmatch < 1) { - match = &sp0; - nmatch = 1; - } - } - - // Run the search. - BitState b(this); - bool anchored = anchor == kAnchored; - bool longest = kind != kFirstMatch; - if (!b.Search(text, context, anchored, longest, match, nmatch)) - return false; + } + return false; +} + +// Bit-state search. +bool Prog::SearchBitState(const StringPiece& text, + const StringPiece& context, + Anchor anchor, + MatchKind kind, + StringPiece* match, + int nmatch) { + // If full match, we ask for an anchored longest match + // and then check that match[0] == text. + // So make sure match[0] exists. + StringPiece sp0; + if (kind == kFullMatch) { + anchor = kAnchored; + if (nmatch < 1) { + match = &sp0; + nmatch = 1; + } + } + + // Run the search. + BitState b(this); + bool anchored = anchor == kAnchored; + bool longest = kind != kFirstMatch; + if (!b.Search(text, context, anchored, longest, match, nmatch)) + return false; if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) - return false; - return true; -} - -} // namespace re2 + return false; + return true; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/compile.cc b/contrib/libs/re2/re2/compile.cc index 36c902044b..61d801a630 100644 --- a/contrib/libs/re2/re2/compile.cc +++ b/contrib/libs/re2/re2/compile.cc @@ -1,13 +1,13 @@ -// Copyright 2007 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Compile regular expression to Prog. -// -// Prog and Inst are defined in prog.h. -// This file's external interface is just Regexp::CompileToProg. -// The Compiler class defined in this file is private. - +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Compile regular expression to Prog. +// +// Prog and Inst are defined in prog.h. +// This file's external interface is just Regexp::CompileToProg. +// The Compiler class defined in this file is private. + #include <stdint.h> #include <string.h> #include <unordered_map> @@ -16,32 +16,32 @@ #include "util/logging.h" #include "util/utf.h" #include "re2/pod_array.h" -#include "re2/prog.h" +#include "re2/prog.h" #include "re2/re2.h" -#include "re2/regexp.h" -#include "re2/walker-inl.h" - -namespace re2 { - -// List of pointers to Inst* that need to be filled in (patched). -// Because the Inst* haven't been filled in yet, -// we can use the Inst* word to hold the list's "next" pointer. -// It's kind of sleazy, but it works well in practice. -// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. -// -// Because the out and out1 fields in Inst are no longer pointers, +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// List of pointers to Inst* that need to be filled in (patched). +// Because the Inst* haven't been filled in yet, +// we can use the Inst* word to hold the list's "next" pointer. +// It's kind of sleazy, but it works well in practice. +// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. +// +// Because the out and out1 fields in Inst are no longer pointers, // we can't use pointers directly here either. Instead, head refers // to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1). // head == 0 represents the NULL list. This is okay because instruction #0 -// is always the fail instruction, which never appears on a list. -struct PatchList { - // Returns patch list containing just p. +// is always the fail instruction, which never appears on a list. +struct PatchList { + // Returns patch list containing just p. static PatchList Mk(uint32_t p) { return {p, p}; } - + // Patches all the entries on l to have value p. - // Caller must not ever use patch list again. + // Caller must not ever use patch list again. static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) { while (l.head != 0) { Prog::Inst* ip = &inst0[l.head>>1]; @@ -52,9 +52,9 @@ struct PatchList { l.head = ip->out(); ip->set_out(p); } - } - } - + } + } + // Appends two patch lists and returns result. static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { if (l1.head == 0) @@ -67,113 +67,113 @@ struct PatchList { else ip->set_out(l2.head); return {l1.head, l2.tail}; - } - + } + uint32_t head; uint32_t tail; // for constant-time append }; - + static const PatchList kNullPatchList = {0, 0}; - -// Compiled program fragment. -struct Frag { + +// Compiled program fragment. +struct Frag { uint32_t begin; - PatchList end; + PatchList end; bool nullable; - + Frag() : begin(0), end(kNullPatchList), nullable(false) {} Frag(uint32_t begin, PatchList end, bool nullable) : begin(begin), end(end), nullable(nullable) {} -}; - -// Input encodings. -enum Encoding { - kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) +}; + +// Input encodings. +enum Encoding { + kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) kEncodingLatin1, // Latin-1 (0-FF) -}; - -class Compiler : public Regexp::Walker<Frag> { - public: - explicit Compiler(); - ~Compiler(); - - // Compiles Regexp to a new Prog. - // Caller is responsible for deleting Prog when finished with it. - // If reversed is true, compiles for walking over the input - // string backward (reverses all concatenations). +}; + +class Compiler : public Regexp::Walker<Frag> { + public: + explicit Compiler(); + ~Compiler(); + + // Compiles Regexp to a new Prog. + // Caller is responsible for deleting Prog when finished with it. + // If reversed is true, compiles for walking over the input + // string backward (reverses all concatenations). static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem); - - // Compiles alternation of all the re to a new Prog. - // Each re has a match with an id equal to its index in the vector. + + // Compiles alternation of all the re to a new Prog. + // Each re has a match with an id equal to its index in the vector. static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); - - // Interface for Regexp::Walker, which helps traverse the Regexp. - // The walk is purely post-recursive: given the machines for the - // children, PostVisit combines them to create the machine for - // the current node. The child_args are Frags. - // The Compiler traverses the Regexp parse tree, visiting - // each node in depth-first order. It invokes PreVisit before - // visiting the node's children and PostVisit after visiting - // the children. - Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); - Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, - int nchild_args); - Frag ShortVisit(Regexp* re, Frag parent_arg); - Frag Copy(Frag arg); - - // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? - Frag Plus(Frag a, bool nongreedy); - Frag Star(Frag a, bool nongreedy); - Frag Quest(Frag a, bool nongreedy); - - // Given fragment a, returns (a) capturing as \n. - Frag Capture(Frag a, int n); - - // Given fragments a and b, returns ab; a|b - Frag Cat(Frag a, Frag b); - Frag Alt(Frag a, Frag b); - - // Returns a fragment that can't match anything. - Frag NoMatch(); - - // Returns a fragment that matches the empty string. + + // Interface for Regexp::Walker, which helps traverse the Regexp. + // The walk is purely post-recursive: given the machines for the + // children, PostVisit combines them to create the machine for + // the current node. The child_args are Frags. + // The Compiler traverses the Regexp parse tree, visiting + // each node in depth-first order. It invokes PreVisit before + // visiting the node's children and PostVisit after visiting + // the children. + Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); + Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, + int nchild_args); + Frag ShortVisit(Regexp* re, Frag parent_arg); + Frag Copy(Frag arg); + + // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? + Frag Plus(Frag a, bool nongreedy); + Frag Star(Frag a, bool nongreedy); + Frag Quest(Frag a, bool nongreedy); + + // Given fragment a, returns (a) capturing as \n. + Frag Capture(Frag a, int n); + + // Given fragments a and b, returns ab; a|b + Frag Cat(Frag a, Frag b); + Frag Alt(Frag a, Frag b); + + // Returns a fragment that can't match anything. + Frag NoMatch(); + + // Returns a fragment that matches the empty string. Frag Match(int32_t id); - - // Returns a no-op fragment. - Frag Nop(); - - // Returns a fragment matching the byte range lo-hi. - Frag ByteRange(int lo, int hi, bool foldcase); - - // Returns a fragment matching an empty-width special op. - Frag EmptyWidth(EmptyOp op); - - // Adds n instructions to the program. - // Returns the index of the first one. - // Returns -1 if no more instructions are available. - int AllocInst(int n); - - // Rune range compiler. - - // Begins a new alternation. - void BeginRange(); - - // Adds a fragment matching the rune range lo-hi. - void AddRuneRange(Rune lo, Rune hi, bool foldcase); - void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); - void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); - void Add_80_10ffff(); - - // New suffix that matches the byte range lo-hi, then goes to next. + + // Returns a no-op fragment. + Frag Nop(); + + // Returns a fragment matching the byte range lo-hi. + Frag ByteRange(int lo, int hi, bool foldcase); + + // Returns a fragment matching an empty-width special op. + Frag EmptyWidth(EmptyOp op); + + // Adds n instructions to the program. + // Returns the index of the first one. + // Returns -1 if no more instructions are available. + int AllocInst(int n); + + // Rune range compiler. + + // Begins a new alternation. + void BeginRange(); + + // Adds a fragment matching the rune range lo-hi. + void AddRuneRange(Rune lo, Rune hi, bool foldcase); + void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); + void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); + void Add_80_10ffff(); + + // New suffix that matches the byte range lo-hi, then goes to next. int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); - + // Returns true iff the suffix is cached. bool IsCachedRuneByteSuffix(int id); - // Adds a suffix to alternation. - void AddSuffix(int id); - + // Adds a suffix to alternation. + void AddSuffix(int id); + // Adds a suffix to the trie starting from the given root node. // Returns zero iff allocating an instruction fails. Otherwise, returns // the current root node, which might be different from what was given. @@ -187,62 +187,62 @@ class Compiler : public Regexp::Walker<Frag> { // Compares two ByteRanges and returns true iff they are equal. bool ByteRangeEqual(int id1, int id2); - // Returns the alternation of all the added suffixes. - Frag EndRange(); - - // Single rune. - Frag Literal(Rune r, bool foldcase); - + // Returns the alternation of all the added suffixes. + Frag EndRange(); + + // Single rune. + Frag Literal(Rune r, bool foldcase); + void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); Prog* Finish(Regexp* re); - - // Returns .* where dot = any byte - Frag DotStar(); - - private: - Prog* prog_; // Program being built. - bool failed_; // Did we give up compiling? - Encoding encoding_; // Input encoding - bool reversed_; // Should program run backward over text? - + + // Returns .* where dot = any byte + Frag DotStar(); + + private: + Prog* prog_; // Program being built. + bool failed_; // Did we give up compiling? + Encoding encoding_; // Input encoding + bool reversed_; // Should program run backward over text? + PODArray<Prog::Inst> inst_; int ninst_; // Number of instructions used. int max_ninst_; // Maximum number of instructions. - + int64_t max_mem_; // Total memory budget. - + std::unordered_map<uint64_t, int> rune_cache_; - Frag rune_range_; - - RE2::Anchor anchor_; // anchor mode for RE2::Set - + Frag rune_range_; + + RE2::Anchor anchor_; // anchor mode for RE2::Set + Compiler(const Compiler&) = delete; Compiler& operator=(const Compiler&) = delete; -}; - -Compiler::Compiler() { - prog_ = new Prog(); - failed_ = false; - encoding_ = kEncodingUTF8; - reversed_ = false; +}; + +Compiler::Compiler() { + prog_ = new Prog(); + failed_ = false; + encoding_ = kEncodingUTF8; + reversed_ = false; ninst_ = 0; max_ninst_ = 1; // make AllocInst for fail instruction okay - max_mem_ = 0; - int fail = AllocInst(1); - inst_[fail].InitFail(); + max_mem_ = 0; + int fail = AllocInst(1); + inst_[fail].InitFail(); max_ninst_ = 0; // Caller must change -} - -Compiler::~Compiler() { - delete prog_; -} - -int Compiler::AllocInst(int n) { +} + +Compiler::~Compiler() { + delete prog_; +} + +int Compiler::AllocInst(int n) { if (failed_ || ninst_ + n > max_ninst_) { - failed_ = true; - return -1; - } - + failed_ = true; + return -1; + } + if (ninst_ + n > inst_.size()) { int cap = inst_.size(); if (cap == 0) @@ -254,92 +254,92 @@ int Compiler::AllocInst(int n) { memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]); memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]); inst_ = std::move(inst); - } + } int id = ninst_; ninst_ += n; - return id; -} - -// These routines are somewhat hard to visualize in text -- -// see http://swtch.com/~rsc/regexp/regexp1.html for -// pictures explaining what is going on here. - -// Returns an unmatchable fragment. -Frag Compiler::NoMatch() { + return id; +} + +// These routines are somewhat hard to visualize in text -- +// see http://swtch.com/~rsc/regexp/regexp1.html for +// pictures explaining what is going on here. + +// Returns an unmatchable fragment. +Frag Compiler::NoMatch() { return Frag(); -} - -// Is a an unmatchable fragment? -static bool IsNoMatch(Frag a) { - return a.begin == 0; -} - -// Given fragments a and b, returns fragment for ab. -Frag Compiler::Cat(Frag a, Frag b) { - if (IsNoMatch(a) || IsNoMatch(b)) - return NoMatch(); - - // Elide no-op. - Prog::Inst* begin = &inst_[a.begin]; - if (begin->opcode() == kInstNop && +} + +// Is a an unmatchable fragment? +static bool IsNoMatch(Frag a) { + return a.begin == 0; +} + +// Given fragments a and b, returns fragment for ab. +Frag Compiler::Cat(Frag a, Frag b) { + if (IsNoMatch(a) || IsNoMatch(b)) + return NoMatch(); + + // Elide no-op. + Prog::Inst* begin = &inst_[a.begin]; + if (begin->opcode() == kInstNop && a.end.head == (a.begin << 1) && - begin->out() == 0) { + begin->out() == 0) { // in case refs to a somewhere PatchList::Patch(inst_.data(), a.end, b.begin); - return b; - } - - // To run backward over string, reverse all concatenations. - if (reversed_) { + return b; + } + + // To run backward over string, reverse all concatenations. + if (reversed_) { PatchList::Patch(inst_.data(), b.end, a.begin); return Frag(b.begin, a.end, b.nullable && a.nullable); - } - + } + PatchList::Patch(inst_.data(), a.end, b.begin); return Frag(a.begin, b.end, a.nullable && b.nullable); -} - -// Given fragments for a and b, returns fragment for a|b. -Frag Compiler::Alt(Frag a, Frag b) { - // Special case for convenience in loops. - if (IsNoMatch(a)) - return b; - if (IsNoMatch(b)) - return a; - - int id = AllocInst(1); - if (id < 0) - return NoMatch(); - - inst_[id].InitAlt(a.begin, b.begin); +} + +// Given fragments for a and b, returns fragment for a|b. +Frag Compiler::Alt(Frag a, Frag b) { + // Special case for convenience in loops. + if (IsNoMatch(a)) + return b; + if (IsNoMatch(b)) + return a; + + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + + inst_[id].InitAlt(a.begin, b.begin); return Frag(id, PatchList::Append(inst_.data(), a.end, b.end), a.nullable || b.nullable); -} - -// When capturing submatches in like-Perl mode, a kOpAlt Inst -// treats out_ as the first choice, out1_ as the second. -// -// For *, +, and ?, if out_ causes another repetition, -// then the operator is greedy. If out1_ is the repetition -// (and out_ moves forward), then the operator is non-greedy. - +} + +// When capturing submatches in like-Perl mode, a kOpAlt Inst +// treats out_ as the first choice, out1_ as the second. +// +// For *, +, and ?, if out_ causes another repetition, +// then the operator is greedy. If out1_ is the repetition +// (and out_ moves forward), then the operator is non-greedy. + // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) Frag Compiler::Plus(Frag a, bool nongreedy) { - int id = AllocInst(1); - if (id < 0) - return NoMatch(); + int id = AllocInst(1); + if (id < 0) + return NoMatch(); PatchList pl; - if (nongreedy) { + if (nongreedy) { inst_[id].InitAlt(0, a.begin); pl = PatchList::Mk(id << 1); - } else { + } else { inst_[id].InitAlt(a.begin, 0); pl = PatchList::Mk((id << 1) | 1); - } + } PatchList::Patch(inst_.data(), a.end, id); return Frag(a.begin, pl, a.nullable); -} - +} + // Given a fragment for a, returns a fragment for a* or a*? (if nongreedy) Frag Compiler::Star(Frag a, bool nongreedy) { // When the subexpression is nullable, one Alt isn't enough to guarantee @@ -361,112 +361,112 @@ Frag Compiler::Star(Frag a, bool nongreedy) { } PatchList::Patch(inst_.data(), a.end, id); return Frag(id, pl, true); -} - -// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) -Frag Compiler::Quest(Frag a, bool nongreedy) { +} + +// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) +Frag Compiler::Quest(Frag a, bool nongreedy) { if (IsNoMatch(a)) return Nop(); - int id = AllocInst(1); - if (id < 0) - return NoMatch(); - PatchList pl; - if (nongreedy) { - inst_[id].InitAlt(0, a.begin); - pl = PatchList::Mk(id << 1); - } else { - inst_[id].InitAlt(a.begin, 0); - pl = PatchList::Mk((id << 1) | 1); - } + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + PatchList pl; + if (nongreedy) { + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); + } else { + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); + } return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true); -} - -// Returns a fragment for the byte range lo-hi. -Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { - int id = AllocInst(1); - if (id < 0) - return NoMatch(); - inst_[id].InitByteRange(lo, hi, foldcase, 0); +} + +// Returns a fragment for the byte range lo-hi. +Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitByteRange(lo, hi, foldcase, 0); return Frag(id, PatchList::Mk(id << 1), false); -} - -// Returns a no-op fragment. Sometimes unavoidable. -Frag Compiler::Nop() { - int id = AllocInst(1); - if (id < 0) - return NoMatch(); - inst_[id].InitNop(0); +} + +// Returns a no-op fragment. Sometimes unavoidable. +Frag Compiler::Nop() { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitNop(0); return Frag(id, PatchList::Mk(id << 1), true); -} - -// Returns a fragment that signals a match. +} + +// Returns a fragment that signals a match. Frag Compiler::Match(int32_t match_id) { - int id = AllocInst(1); - if (id < 0) - return NoMatch(); - inst_[id].InitMatch(match_id); + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitMatch(match_id); return Frag(id, kNullPatchList, false); -} - -// Returns a fragment matching a particular empty-width op (like ^ or $) -Frag Compiler::EmptyWidth(EmptyOp empty) { - int id = AllocInst(1); - if (id < 0) - return NoMatch(); - inst_[id].InitEmptyWidth(empty, 0); +} + +// Returns a fragment matching a particular empty-width op (like ^ or $) +Frag Compiler::EmptyWidth(EmptyOp empty) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitEmptyWidth(empty, 0); return Frag(id, PatchList::Mk(id << 1), true); -} - -// Given a fragment a, returns a fragment with capturing parens around a. -Frag Compiler::Capture(Frag a, int n) { +} + +// Given a fragment a, returns a fragment with capturing parens around a. +Frag Compiler::Capture(Frag a, int n) { if (IsNoMatch(a)) return NoMatch(); - int id = AllocInst(2); - if (id < 0) - return NoMatch(); - inst_[id].InitCapture(2*n, a.begin); - inst_[id+1].InitCapture(2*n+1, 0); + int id = AllocInst(2); + if (id < 0) + return NoMatch(); + inst_[id].InitCapture(2*n, a.begin); + inst_[id+1].InitCapture(2*n+1, 0); PatchList::Patch(inst_.data(), a.end, id+1); - + return Frag(id, PatchList::Mk((id+1) << 1), a.nullable); -} - -// A Rune is a name for a Unicode code point. -// Returns maximum rune encoded by UTF-8 sequence of length len. -static int MaxRune(int len) { +} + +// A Rune is a name for a Unicode code point. +// Returns maximum rune encoded by UTF-8 sequence of length len. +static int MaxRune(int len) { int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) - if (len == 1) - b = 7; - else - b = 8-(len+1) + 6*(len-1); - return (1<<b) - 1; // maximum Rune for b bits. -} - -// The rune range compiler caches common suffix fragments, -// which are very common in UTF-8 (e.g., [80-bf]). -// The fragment suffixes are identified by their start -// instructions. NULL denotes the eventual end match. -// The Frag accumulates in rune_range_. Caching common -// suffixes reduces the UTF-8 "." from 32 to 24 instructions, -// and it reduces the corresponding one-pass NFA from 16 nodes to 8. - -void Compiler::BeginRange() { - rune_cache_.clear(); - rune_range_.begin = 0; + if (len == 1) + b = 7; + else + b = 8-(len+1) + 6*(len-1); + return (1<<b) - 1; // maximum Rune for b bits. +} + +// The rune range compiler caches common suffix fragments, +// which are very common in UTF-8 (e.g., [80-bf]). +// The fragment suffixes are identified by their start +// instructions. NULL denotes the eventual end match. +// The Frag accumulates in rune_range_. Caching common +// suffixes reduces the UTF-8 "." from 32 to 24 instructions, +// and it reduces the corresponding one-pass NFA from 16 nodes to 8. + +void Compiler::BeginRange() { + rune_cache_.clear(); + rune_range_.begin = 0; rune_range_.end = kNullPatchList; -} - +} + int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, - int next) { - Frag f = ByteRange(lo, hi, foldcase); - if (next != 0) { + int next) { + Frag f = ByteRange(lo, hi, foldcase); + if (next != 0) { PatchList::Patch(inst_.data(), f.end, next); - } else { + } else { rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end); - } - return f.begin; -} - + } + return f.begin; +} + static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase, int next) { return (uint64_t)next << 17 | @@ -474,18 +474,18 @@ static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase, (uint64_t)hi << 1 | (uint64_t)foldcase; } - + int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) { uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key); - if (it != rune_cache_.end()) - return it->second; - int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); - rune_cache_[key] = id; - return id; -} - + if (it != rune_cache_.end()) + return it->second; + int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); + rune_cache_[key] = id; + return id; +} + bool Compiler::IsCachedRuneByteSuffix(int id) { uint8_t lo = inst_[id].lo_; uint8_t hi = inst_[id].hi_; @@ -496,30 +496,30 @@ bool Compiler::IsCachedRuneByteSuffix(int id) { return rune_cache_.find(key) != rune_cache_.end(); } -void Compiler::AddSuffix(int id) { +void Compiler::AddSuffix(int id) { if (failed_) return; - if (rune_range_.begin == 0) { - rune_range_.begin = id; - return; - } - + if (rune_range_.begin == 0) { + rune_range_.begin = id; + return; + } + if (encoding_ == kEncodingUTF8) { // Build a trie in order to reduce fanout. rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id); return; } - int alt = AllocInst(1); - if (alt < 0) { - rune_range_.begin = 0; - return; - } - inst_[alt].InitAlt(rune_range_.begin, id); - rune_range_.begin = alt; -} - + int alt = AllocInst(1); + if (alt < 0) { + rune_range_.begin = 0; + return; + } + inst_[alt].InitAlt(rune_range_.begin, id); + rune_range_.begin = alt; +} + int Compiler::AddSuffixRecursive(int root, int id) { DCHECK(inst_[root].opcode() == kInstAlt || inst_[root].opcode() == kInstByteRange); @@ -616,38 +616,38 @@ Frag Compiler::FindByteRange(int root, int id) { return NoMatch(); } -Frag Compiler::EndRange() { - return rune_range_; -} - -// Converts rune range lo-hi into a fragment that recognizes -// the bytes that would make up those runes in the current -// encoding (Latin 1 or UTF-8). -// This lets the machine work byte-by-byte even when -// using multibyte encodings. - -void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { - switch (encoding_) { - default: - case kEncodingUTF8: - AddRuneRangeUTF8(lo, hi, foldcase); - break; - case kEncodingLatin1: - AddRuneRangeLatin1(lo, hi, foldcase); - break; - } -} - -void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { +Frag Compiler::EndRange() { + return rune_range_; +} + +// Converts rune range lo-hi into a fragment that recognizes +// the bytes that would make up those runes in the current +// encoding (Latin 1 or UTF-8). +// This lets the machine work byte-by-byte even when +// using multibyte encodings. + +void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { + switch (encoding_) { + default: + case kEncodingUTF8: + AddRuneRangeUTF8(lo, hi, foldcase); + break; + case kEncodingLatin1: + AddRuneRangeLatin1(lo, hi, foldcase); + break; + } +} + +void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { // Latin-1 is easy: runes *are* bytes. - if (lo > hi || lo > 0xFF) - return; - if (hi > 0xFF) - hi = 0xFF; + if (lo > hi || lo > 0xFF) + return; + if (hi > 0xFF) + hi = 0xFF; AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo), static_cast<uint8_t>(hi), foldcase, 0)); -} - +} + void Compiler::Add_80_10ffff() { // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by @@ -661,12 +661,12 @@ void Compiler::Add_80_10ffff() { id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); AddSuffix(id); - + id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); AddSuffix(id); - + id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); @@ -677,7 +677,7 @@ void Compiler::Add_80_10ffff() { int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0); id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1); AddSuffix(id); - + int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1); id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2); AddSuffix(id); @@ -685,60 +685,60 @@ void Compiler::Add_80_10ffff() { int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2); id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3); AddSuffix(id); - } -} - -void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { - if (lo > hi) - return; - + } +} + +void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { + if (lo > hi) + return; + // Pick off 80-10FFFF as a common special case. if (lo == 0x80 && hi == 0x10ffff) { - Add_80_10ffff(); - return; - } - - // Split range into same-length sized ranges. - for (int i = 1; i < UTFmax; i++) { - Rune max = MaxRune(i); - if (lo <= max && max < hi) { - AddRuneRangeUTF8(lo, max, foldcase); - AddRuneRangeUTF8(max+1, hi, foldcase); - return; - } - } - - // ASCII range is always a special case. - if (hi < Runeself) { + Add_80_10ffff(); + return; + } + + // Split range into same-length sized ranges. + for (int i = 1; i < UTFmax; i++) { + Rune max = MaxRune(i); + if (lo <= max && max < hi) { + AddRuneRangeUTF8(lo, max, foldcase); + AddRuneRangeUTF8(max+1, hi, foldcase); + return; + } + } + + // ASCII range is always a special case. + if (hi < Runeself) { AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo), static_cast<uint8_t>(hi), foldcase, 0)); - return; - } - - // Split range into sections that agree on leading bytes. - for (int i = 1; i < UTFmax; i++) { + return; + } + + // Split range into sections that agree on leading bytes. + for (int i = 1; i < UTFmax; i++) { uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence - if ((lo & ~m) != (hi & ~m)) { - if ((lo & m) != 0) { - AddRuneRangeUTF8(lo, lo|m, foldcase); - AddRuneRangeUTF8((lo|m)+1, hi, foldcase); - return; - } - if ((hi & m) != m) { - AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); - AddRuneRangeUTF8(hi&~m, hi, foldcase); - return; - } - } - } - - // Finally. Generate byte matching equivalent for lo-hi. + if ((lo & ~m) != (hi & ~m)) { + if ((lo & m) != 0) { + AddRuneRangeUTF8(lo, lo|m, foldcase); + AddRuneRangeUTF8((lo|m)+1, hi, foldcase); + return; + } + if ((hi & m) != m) { + AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); + AddRuneRangeUTF8(hi&~m, hi, foldcase); + return; + } + } + } + + // Finally. Generate byte matching equivalent for lo-hi. uint8_t ulo[UTFmax], uhi[UTFmax]; - int n = runetochar(reinterpret_cast<char*>(ulo), &lo); - int m = runetochar(reinterpret_cast<char*>(uhi), &hi); - (void)m; // USED(m) - DCHECK_EQ(n, m); - + int n = runetochar(reinterpret_cast<char*>(ulo), &lo); + int m = runetochar(reinterpret_cast<char*>(uhi), &hi); + (void)m; // USED(m) + DCHECK_EQ(n, m); + // The logic below encodes this thinking: // // 1. When we have built the whole suffix, we know that it cannot @@ -763,8 +763,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { // is more likely so; in reverse mode, a byte range is unlikely to // be part of a common suffix whereas a single byte is more likely // so. The same benefit versus cost argument applies here. - int id = 0; - if (reversed_) { + int id = 0; + if (reversed_) { for (int i = 0; i < n; i++) { // In reverse UTF-8 mode: cache the leading byte; don't cache the last // continuation byte; cache anything else iff it's a single byte (XX-XX). @@ -773,7 +773,7 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { else id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); } - } else { + } else { for (int i = n-1; i >= 0; i--) { // In forward UTF-8 mode: don't cache the leading byte; cache the last // continuation byte; cache anything else iff it's a byte range (XX-YY). @@ -782,206 +782,206 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { else id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); } - } - AddSuffix(id); -} - -// Should not be called. -Frag Compiler::Copy(Frag arg) { - // We're using WalkExponential; there should be no copying. - LOG(DFATAL) << "Compiler::Copy called!"; - failed_ = true; - return NoMatch(); -} - -// Visits a node quickly; called once WalkExponential has -// decided to cut this walk short. -Frag Compiler::ShortVisit(Regexp* re, Frag) { - failed_ = true; - return NoMatch(); -} - -// Called before traversing a node's children during the walk. -Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { - // Cut off walk if we've already failed. - if (failed_) - *stop = true; - + } + AddSuffix(id); +} + +// Should not be called. +Frag Compiler::Copy(Frag arg) { + // We're using WalkExponential; there should be no copying. + LOG(DFATAL) << "Compiler::Copy called!"; + failed_ = true; + return NoMatch(); +} + +// Visits a node quickly; called once WalkExponential has +// decided to cut this walk short. +Frag Compiler::ShortVisit(Regexp* re, Frag) { + failed_ = true; + return NoMatch(); +} + +// Called before traversing a node's children during the walk. +Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { + // Cut off walk if we've already failed. + if (failed_) + *stop = true; + return Frag(); // not used by caller -} - -Frag Compiler::Literal(Rune r, bool foldcase) { - switch (encoding_) { - default: +} + +Frag Compiler::Literal(Rune r, bool foldcase) { + switch (encoding_) { + default: return Frag(); - - case kEncodingLatin1: - return ByteRange(r, r, foldcase); - - case kEncodingUTF8: { - if (r < Runeself) // Make common case fast. - return ByteRange(r, r, foldcase); + + case kEncodingLatin1: + return ByteRange(r, r, foldcase); + + case kEncodingUTF8: { + if (r < Runeself) // Make common case fast. + return ByteRange(r, r, foldcase); uint8_t buf[UTFmax]; - int n = runetochar(reinterpret_cast<char*>(buf), &r); + int n = runetochar(reinterpret_cast<char*>(buf), &r); Frag f = ByteRange((uint8_t)buf[0], buf[0], false); - for (int i = 1; i < n; i++) + for (int i = 1; i < n; i++) f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false)); - return f; - } - } -} - -// Called after traversing the node's children during the walk. -// Given their frags, build and return the frag for this re. -Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, - int nchild_frags) { - // If a child failed, don't bother going forward, especially - // since the child_frags might contain Frags with NULLs in them. - if (failed_) - return NoMatch(); - - // Given the child fragments, return the fragment for this node. - switch (re->op()) { - case kRegexpRepeat: - // Should not see; code at bottom of function will print error - break; - - case kRegexpNoMatch: - return NoMatch(); - - case kRegexpEmptyMatch: - return Nop(); - - case kRegexpHaveMatch: { - Frag f = Match(re->match_id()); + return f; + } + } +} + +// Called after traversing the node's children during the walk. +// Given their frags, build and return the frag for this re. +Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, + int nchild_frags) { + // If a child failed, don't bother going forward, especially + // since the child_frags might contain Frags with NULLs in them. + if (failed_) + return NoMatch(); + + // Given the child fragments, return the fragment for this node. + switch (re->op()) { + case kRegexpRepeat: + // Should not see; code at bottom of function will print error + break; + + case kRegexpNoMatch: + return NoMatch(); + + case kRegexpEmptyMatch: + return Nop(); + + case kRegexpHaveMatch: { + Frag f = Match(re->match_id()); if (anchor_ == RE2::ANCHOR_BOTH) { // Append \z or else the subexpression will effectively be unanchored. // Complemented by the UNANCHORED case in CompileSet(). f = Cat(EmptyWidth(kEmptyEndText), f); } - return f; - } - - case kRegexpConcat: { - Frag f = child_frags[0]; - for (int i = 1; i < nchild_frags; i++) - f = Cat(f, child_frags[i]); - return f; - } - - case kRegexpAlternate: { - Frag f = child_frags[0]; - for (int i = 1; i < nchild_frags; i++) - f = Alt(f, child_frags[i]); - return f; - } - - case kRegexpStar: + return f; + } + + case kRegexpConcat: { + Frag f = child_frags[0]; + for (int i = 1; i < nchild_frags; i++) + f = Cat(f, child_frags[i]); + return f; + } + + case kRegexpAlternate: { + Frag f = child_frags[0]; + for (int i = 1; i < nchild_frags; i++) + f = Alt(f, child_frags[i]); + return f; + } + + case kRegexpStar: return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); - - case kRegexpPlus: + + case kRegexpPlus: return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); - - case kRegexpQuest: + + case kRegexpQuest: return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); - - case kRegexpLiteral: + + case kRegexpLiteral: return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); - - case kRegexpLiteralString: { - // Concatenation of literals. - if (re->nrunes() == 0) - return Nop(); - Frag f; - for (int i = 0; i < re->nrunes(); i++) { + + case kRegexpLiteralString: { + // Concatenation of literals. + if (re->nrunes() == 0) + return Nop(); + Frag f; + for (int i = 0; i < re->nrunes(); i++) { Frag f1 = Literal(re->runes()[i], (re->parse_flags()&Regexp::FoldCase) != 0); - if (i == 0) - f = f1; - else - f = Cat(f, f1); - } - return f; - } - - case kRegexpAnyChar: - BeginRange(); - AddRuneRange(0, Runemax, false); - return EndRange(); - - case kRegexpAnyByte: - return ByteRange(0x00, 0xFF, false); - - case kRegexpCharClass: { - CharClass* cc = re->cc(); - if (cc->empty()) { - // This can't happen. - LOG(DFATAL) << "No ranges in char class"; - failed_ = true; - return NoMatch(); - } - - // ASCII case-folding optimization: if the char class - // behaves the same on A-Z as it does on a-z, - // discard any ranges wholly contained in A-Z - // and mark the other ranges as foldascii. - // This reduces the size of a program for - // (?i)abc from 3 insts per letter to 1 per letter. - bool foldascii = cc->FoldsASCII(); - - // Character class is just a big OR of the different - // character ranges in the class. - BeginRange(); - for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { - // ASCII case-folding optimization (see above). - if (foldascii && 'A' <= i->lo && i->hi <= 'Z') - continue; - - // If this range contains all of A-Za-z or none of it, - // the fold flag is unnecessary; don't bother. - bool fold = foldascii; + if (i == 0) + f = f1; + else + f = Cat(f, f1); + } + return f; + } + + case kRegexpAnyChar: + BeginRange(); + AddRuneRange(0, Runemax, false); + return EndRange(); + + case kRegexpAnyByte: + return ByteRange(0x00, 0xFF, false); + + case kRegexpCharClass: { + CharClass* cc = re->cc(); + if (cc->empty()) { + // This can't happen. + LOG(DFATAL) << "No ranges in char class"; + failed_ = true; + return NoMatch(); + } + + // ASCII case-folding optimization: if the char class + // behaves the same on A-Z as it does on a-z, + // discard any ranges wholly contained in A-Z + // and mark the other ranges as foldascii. + // This reduces the size of a program for + // (?i)abc from 3 insts per letter to 1 per letter. + bool foldascii = cc->FoldsASCII(); + + // Character class is just a big OR of the different + // character ranges in the class. + BeginRange(); + for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { + // ASCII case-folding optimization (see above). + if (foldascii && 'A' <= i->lo && i->hi <= 'Z') + continue; + + // If this range contains all of A-Za-z or none of it, + // the fold flag is unnecessary; don't bother. + bool fold = foldascii; if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || ('Z' < i->lo && i->hi < 'a')) - fold = false; - - AddRuneRange(i->lo, i->hi, fold); - } - return EndRange(); - } - - case kRegexpCapture: - // If this is a non-capturing parenthesis -- (?:foo) -- - // just use the inner expression. - if (re->cap() < 0) - return child_frags[0]; - return Capture(child_frags[0], re->cap()); - - case kRegexpBeginLine: - return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); - - case kRegexpEndLine: - return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); - - case kRegexpBeginText: - return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); - - case kRegexpEndText: - return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); - - case kRegexpWordBoundary: - return EmptyWidth(kEmptyWordBoundary); - - case kRegexpNoWordBoundary: - return EmptyWidth(kEmptyNonWordBoundary); - } - LOG(DFATAL) << "Missing case in Compiler: " << re->op(); - failed_ = true; - return NoMatch(); -} - -// Is this regexp required to start at the beginning of the text? -// Only approximate; can return false for complicated regexps like (\Aa|\Ab), -// but handles (\A(a|b)). Could use the Walker to write a more exact one. + fold = false; + + AddRuneRange(i->lo, i->hi, fold); + } + return EndRange(); + } + + case kRegexpCapture: + // If this is a non-capturing parenthesis -- (?:foo) -- + // just use the inner expression. + if (re->cap() < 0) + return child_frags[0]; + return Capture(child_frags[0], re->cap()); + + case kRegexpBeginLine: + return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); + + case kRegexpEndLine: + return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); + + case kRegexpBeginText: + return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); + + case kRegexpEndText: + return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); + + case kRegexpWordBoundary: + return EmptyWidth(kEmptyWordBoundary); + + case kRegexpNoWordBoundary: + return EmptyWidth(kEmptyNonWordBoundary); + } + LOG(DFATAL) << "Missing case in Compiler: " << re->op(); + failed_ = true; + return NoMatch(); +} + +// Is this regexp required to start at the beginning of the text? +// Only approximate; can return false for complicated regexps like (\Aa|\Ab), +// but handles (\A(a|b)). Could use the Walker to write a more exact one. static bool IsAnchorStart(Regexp** pre, int depth) { Regexp* re = *pre; Regexp* sub; @@ -1005,7 +1005,7 @@ static bool IsAnchorStart(Regexp** pre, int depth) { *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); re->Decref(); return true; - } + } sub->Decref(); } break; @@ -1013,8 +1013,8 @@ static bool IsAnchorStart(Regexp** pre, int depth) { sub = re->sub()[0]->Incref(); if (IsAnchorStart(&sub, depth+1)) { *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); - re->Decref(); - return true; + re->Decref(); + return true; } sub->Decref(); break; @@ -1022,13 +1022,13 @@ static bool IsAnchorStart(Regexp** pre, int depth) { *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); re->Decref(); return true; - } + } return false; -} - -// Is this regexp required to start at the end of the text? -// Only approximate; can return false for complicated regexps like (a\z|b\z), -// but handles ((a|b)\z). Could use the Walker to write a more exact one. +} + +// Is this regexp required to start at the end of the text? +// Only approximate; can return false for complicated regexps like (a\z|b\z), +// but handles ((a|b)\z). Could use the Walker to write a more exact one. static bool IsAnchorEnd(Regexp** pre, int depth) { Regexp* re = *pre; Regexp* sub; @@ -1052,7 +1052,7 @@ static bool IsAnchorEnd(Regexp** pre, int depth) { *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); re->Decref(); return true; - } + } sub->Decref(); } break; @@ -1060,8 +1060,8 @@ static bool IsAnchorEnd(Regexp** pre, int depth) { sub = re->sub()[0]->Incref(); if (IsAnchorEnd(&sub, depth+1)) { *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); - re->Decref(); - return true; + re->Decref(); + return true; } sub->Decref(); break; @@ -1069,110 +1069,110 @@ static bool IsAnchorEnd(Regexp** pre, int depth) { *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); re->Decref(); return true; - } + } return false; -} - +} + void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, - RE2::Anchor anchor) { - if (flags & Regexp::Latin1) - encoding_ = kEncodingLatin1; - max_mem_ = max_mem; - if (max_mem <= 0) { + RE2::Anchor anchor) { + if (flags & Regexp::Latin1) + encoding_ = kEncodingLatin1; + max_mem_ = max_mem; + if (max_mem <= 0) { max_ninst_ = 100000; // more than enough } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) { - // No room for anything. + // No room for anything. max_ninst_ = 0; - } else { + } else { int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); - // Limit instruction count so that inst->id() fits nicely in an int. - // SparseArray also assumes that the indices (inst->id()) are ints. + // Limit instruction count so that inst->id() fits nicely in an int. + // SparseArray also assumes that the indices (inst->id()) are ints. // The call to WalkExponential uses 2*max_ninst_ below, - // and other places in the code use 2 or 3 * prog->size(). - // Limiting to 2^24 should avoid overflow in those places. - // (The point of allowing more than 32 bits of memory is to - // have plenty of room for the DFA states, not to use it up - // on the program.) - if (m >= 1<<24) - m = 1<<24; - // Inst imposes its own limit (currently bigger than 2^24 but be safe). - if (m > Prog::Inst::kMaxInst) - m = Prog::Inst::kMaxInst; + // and other places in the code use 2 or 3 * prog->size(). + // Limiting to 2^24 should avoid overflow in those places. + // (The point of allowing more than 32 bits of memory is to + // have plenty of room for the DFA states, not to use it up + // on the program.) + if (m >= 1<<24) + m = 1<<24; + // Inst imposes its own limit (currently bigger than 2^24 but be safe). + if (m > Prog::Inst::kMaxInst) + m = Prog::Inst::kMaxInst; max_ninst_ = static_cast<int>(m); - } - anchor_ = anchor; -} - -// Compiles re, returning program. -// Caller is responsible for deleting prog_. -// If reversed is true, compiles a program that expects -// to run over the input string backward (reverses all concatenations). -// The reversed flag is also recorded in the returned program. + } + anchor_ = anchor; +} + +// Compiles re, returning program. +// Caller is responsible for deleting prog_. +// If reversed is true, compiles a program that expects +// to run over the input string backward (reverses all concatenations). +// The reversed flag is also recorded in the returned program. Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { - Compiler c; + Compiler c; c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */); - c.reversed_ = reversed; - - // Simplify to remove things like counted repetitions - // and character classes like \d. - Regexp* sre = re->Simplify(); - if (sre == NULL) - return NULL; - - // Record whether prog is anchored, removing the anchors. - // (They get in the way of other optimizations.) + c.reversed_ = reversed; + + // Simplify to remove things like counted repetitions + // and character classes like \d. + Regexp* sre = re->Simplify(); + if (sre == NULL) + return NULL; + + // Record whether prog is anchored, removing the anchors. + // (They get in the way of other optimizations.) bool is_anchor_start = IsAnchorStart(&sre, 0); bool is_anchor_end = IsAnchorEnd(&sre, 0); - - // Generate fragment for entire regexp. + + // Generate fragment for entire regexp. Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); - sre->Decref(); - if (c.failed_) - return NULL; - - // Success! Finish by putting Match node at end, and record start. - // Turn off c.reversed_ (if it is set) to force the remaining concatenations - // to behave normally. - c.reversed_ = false; + sre->Decref(); + if (c.failed_) + return NULL; + + // Success! Finish by putting Match node at end, and record start. + // Turn off c.reversed_ (if it is set) to force the remaining concatenations + // to behave normally. + c.reversed_ = false; all = c.Cat(all, c.Match(0)); - + c.prog_->set_reversed(reversed); if (c.prog_->reversed()) { - c.prog_->set_anchor_start(is_anchor_end); - c.prog_->set_anchor_end(is_anchor_start); - } else { - c.prog_->set_anchor_start(is_anchor_start); - c.prog_->set_anchor_end(is_anchor_end); - } - + c.prog_->set_anchor_start(is_anchor_end); + c.prog_->set_anchor_end(is_anchor_start); + } else { + c.prog_->set_anchor_start(is_anchor_start); + c.prog_->set_anchor_end(is_anchor_end); + } + c.prog_->set_start(all.begin); if (!c.prog_->anchor_start()) { // Also create unanchored version, which starts with a .*? loop. all = c.Cat(c.DotStar(), all); - } + } c.prog_->set_start_unanchored(all.begin); - - // Hand ownership of prog_ to caller. + + // Hand ownership of prog_ to caller. return c.Finish(re); -} - +} + Prog* Compiler::Finish(Regexp* re) { - if (failed_) - return NULL; - - if (prog_->start() == 0 && prog_->start_unanchored() == 0) { - // No possible matches; keep Fail instruction only. + if (failed_) + return NULL; + + if (prog_->start() == 0 && prog_->start_unanchored() == 0) { + // No possible matches; keep Fail instruction only. ninst_ = 1; - } - + } + // Hand off the array to Prog. prog_->inst_ = std::move(inst_); prog_->size_ = ninst_; - + prog_->Optimize(); prog_->Flatten(); - prog_->ComputeByteMap(); - + prog_->ComputeByteMap(); + if (!prog_->reversed()) { std::string prefix; bool prefix_foldcase; @@ -1180,82 +1180,82 @@ Prog* Compiler::Finish(Regexp* re) { prog_->ConfigurePrefixAccel(prefix, prefix_foldcase); } - // Record remaining memory for DFA. - if (max_mem_ <= 0) { - prog_->set_dfa_mem(1<<20); - } else { + // Record remaining memory for DFA. + if (max_mem_ <= 0) { + prog_->set_dfa_mem(1<<20); + } else { int64_t m = max_mem_ - sizeof(Prog); m -= prog_->size_*sizeof(Prog::Inst); // account for inst_ if (prog_->CanBitState()) m -= prog_->size_*sizeof(uint16_t); // account for list_heads_ - if (m < 0) - m = 0; - prog_->set_dfa_mem(m); - } - - Prog* p = prog_; - prog_ = NULL; - return p; -} - -// Converts Regexp to Prog. + if (m < 0) + m = 0; + prog_->set_dfa_mem(m); + } + + Prog* p = prog_; + prog_ = NULL; + return p; +} + +// Converts Regexp to Prog. Prog* Regexp::CompileToProg(int64_t max_mem) { - return Compiler::Compile(this, false, max_mem); -} - + return Compiler::Compile(this, false, max_mem); +} + Prog* Regexp::CompileToReverseProg(int64_t max_mem) { - return Compiler::Compile(this, true, max_mem); -} - -Frag Compiler::DotStar() { - return Star(ByteRange(0x00, 0xff, false), true); -} - -// Compiles RE set to Prog. + return Compiler::Compile(this, true, max_mem); +} + +Frag Compiler::DotStar() { + return Star(ByteRange(0x00, 0xff, false), true); +} + +// Compiles RE set to Prog. Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { - Compiler c; + Compiler c; c.Setup(re->parse_flags(), max_mem, anchor); - + Regexp* sre = re->Simplify(); if (sre == NULL) return NULL; - + Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); sre->Decref(); - if (c.failed_) - return NULL; - + if (c.failed_) + return NULL; + c.prog_->set_anchor_start(true); c.prog_->set_anchor_end(true); - if (anchor == RE2::UNANCHORED) { + if (anchor == RE2::UNANCHORED) { // Prepend .* or else the expression will effectively be anchored. // Complemented by the ANCHOR_BOTH case in PostVisit(). - all = c.Cat(c.DotStar(), all); - } - c.prog_->set_start(all.begin); - c.prog_->set_start_unanchored(all.begin); - + all = c.Cat(c.DotStar(), all); + } + c.prog_->set_start(all.begin); + c.prog_->set_start_unanchored(all.begin); + Prog* prog = c.Finish(re); - if (prog == NULL) - return NULL; - - // Make sure DFA has enough memory to operate, - // since we're not going to fall back to the NFA. + if (prog == NULL) + return NULL; + + // Make sure DFA has enough memory to operate, + // since we're not going to fall back to the NFA. bool dfa_failed = false; - StringPiece sp = "hello, world"; - prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, + StringPiece sp = "hello, world"; + prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, NULL, &dfa_failed, NULL); if (dfa_failed) { - delete prog; - return NULL; - } - - return prog; -} - + delete prog; + return NULL; + } + + return prog; +} + Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { return Compiler::CompileSet(re, anchor, max_mem); -} - -} // namespace re2 +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/dfa.cc b/contrib/libs/re2/re2/dfa.cc index c02e5730cc..d47c7d50a7 100644 --- a/contrib/libs/re2/re2/dfa.cc +++ b/contrib/libs/re2/re2/dfa.cc @@ -1,26 +1,26 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// A DFA (deterministic finite automaton)-based regular expression search. -// -// The DFA search has two main parts: the construction of the automaton, -// which is represented by a graph of State structures, and the execution -// of the automaton over a given input string. -// -// The basic idea is that the State graph is constructed so that the -// execution can simply start with a state s, and then for each byte c in -// the input string, execute "s = s->next[c]", checking at each point whether -// the current s represents a matching state. -// -// The simple explanation just given does convey the essence of this code, -// but it omits the details of how the State graph gets constructed as well -// as some performance-driven optimizations to the execution of the automaton. -// All these details are explained in the comments for the code following -// the definition of class DFA. -// -// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent. - +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// A DFA (deterministic finite automaton)-based regular expression search. +// +// The DFA search has two main parts: the construction of the automaton, +// which is represented by a graph of State structures, and the execution +// of the automaton over a given input string. +// +// The basic idea is that the State graph is constructed so that the +// execution can simply start with a state s, and then for each byte c in +// the input string, execute "s = s->next[c]", checking at each point whether +// the current s represents a matching state. +// +// The simple explanation just given does convey the essence of this code, +// but it omits the details of how the State graph gets constructed as well +// as some performance-driven optimizations to the execution of the automaton. +// All these details are explained in the comments for the code following +// the definition of class DFA. +// +// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent. + #include <stddef.h> #include <stdint.h> #include <stdio.h> @@ -41,18 +41,18 @@ #include "util/mutex.h" #include "util/strutil.h" #include "re2/pod_array.h" -#include "re2/prog.h" +#include "re2/prog.h" #include "re2/re2.h" #include "re2/sparse_set.h" #include "re2/stringpiece.h" - + // Silence "zero-sized array in struct/union" warning for DFA::State::next_. #ifdef _MSC_VER #pragma warning(disable: 4200) #endif - -namespace re2 { - + +namespace re2 { + // Controls whether the DFA should bail out early if the NFA would be faster. static bool dfa_should_bail_when_slow = true; @@ -60,65 +60,65 @@ void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) { dfa_should_bail_when_slow = b; } -// Changing this to true compiles in prints that trace execution of the DFA. -// Generates a lot of output -- only useful for debugging. +// Changing this to true compiles in prints that trace execution of the DFA. +// Generates a lot of output -- only useful for debugging. static const bool ExtraDebug = false; - -// A DFA implementation of a regular expression program. -// Since this is entirely a forward declaration mandated by C++, -// some of the comments here are better understood after reading -// the comments in the sections that follow the DFA definition. -class DFA { - public: + +// A DFA implementation of a regular expression program. +// Since this is entirely a forward declaration mandated by C++, +// some of the comments here are better understood after reading +// the comments in the sections that follow the DFA definition. +class DFA { + public: DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem); - ~DFA(); - bool ok() const { return !init_failed_; } - Prog::MatchKind kind() { return kind_; } - - // Searches for the regular expression in text, which is considered - // as a subsection of context for the purposes of interpreting flags - // like ^ and $ and \A and \z. - // Returns whether a match was found. - // If a match is found, sets *ep to the end point of the best match in text. - // If "anchored", the match must begin at the start of text. - // If "want_earliest_match", the match that ends first is used, not - // necessarily the best one. - // If "run_forward" is true, the DFA runs from text.begin() to text.end(). - // If it is false, the DFA runs from text.end() to text.begin(), - // returning the leftmost end of the match instead of the rightmost one. - // If the DFA cannot complete the search (for example, if it is out of - // memory), it sets *failed and returns false. - bool Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool want_earliest_match, bool run_forward, + ~DFA(); + bool ok() const { return !init_failed_; } + Prog::MatchKind kind() { return kind_; } + + // Searches for the regular expression in text, which is considered + // as a subsection of context for the purposes of interpreting flags + // like ^ and $ and \A and \z. + // Returns whether a match was found. + // If a match is found, sets *ep to the end point of the best match in text. + // If "anchored", the match must begin at the start of text. + // If "want_earliest_match", the match that ends first is used, not + // necessarily the best one. + // If "run_forward" is true, the DFA runs from text.begin() to text.end(). + // If it is false, the DFA runs from text.end() to text.begin(), + // returning the leftmost end of the match instead of the rightmost one. + // If the DFA cannot complete the search (for example, if it is out of + // memory), it sets *failed and returns false. + bool Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool want_earliest_match, bool run_forward, bool* failed, const char** ep, SparseSet* matches); - + // Builds out all states for the entire DFA. // If cb is not empty, it receives one callback per state built. // Returns the number of states built. // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. int BuildAllStates(const Prog::DFAStateCallback& cb); - - // Computes min and max for matching strings. Won't return strings - // bigger than maxlen. + + // Computes min and max for matching strings. Won't return strings + // bigger than maxlen. bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); - - // These data structures are logically private, but C++ makes it too - // difficult to mark them as such. - class RWLocker; - class StateSaver; + + // These data structures are logically private, but C++ makes it too + // difficult to mark them as such. + class RWLocker; + class StateSaver; class Workq; - - // A single DFA state. The DFA is represented as a graph of these - // States, linked by the next_ pointers. If in state s and reading - // byte c, the next state should be s->next_[c]. - struct State { + + // A single DFA state. The DFA is represented as a graph of these + // States, linked by the next_ pointers. If in state s and reading + // byte c, the next state should be s->next_[c]. + struct State { inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; } - - int* inst_; // Instruction pointers in the state. - int ninst_; // # of inst_ pointers. + + int* inst_; // Instruction pointers in the state. + int ninst_; // # of inst_ pointers. uint32_t flag_; // Empty string bitfield flags in effect on the way - // into this state, along with kFlagMatch if this - // is a matching state. + // into this state, along with kFlagMatch if this + // is a matching state. // Work around the bug affecting flexible array members in GCC 6.x (for x >= 1). // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932) @@ -128,18 +128,18 @@ class DFA { std::atomic<State*> next_[]; // Outgoing arrows from State, #endif - // one per input byte class - }; - - enum { - kByteEndText = 256, // imaginary byte at end of text - + // one per input byte class + }; + + enum { + kByteEndText = 256, // imaginary byte at end of text + kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags kFlagMatch = 0x0100, // State.flag_: this is a matching state kFlagLastWord = 0x0200, // State.flag_: last byte was a word char - kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left - }; - + kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left + }; + struct StateHash { size_t operator()(const State* a) const { DCHECK(a != NULL); @@ -151,285 +151,285 @@ class DFA { } }; - struct StateEqual { - bool operator()(const State* a, const State* b) const { + struct StateEqual { + bool operator()(const State* a, const State* b) const { DCHECK(a != NULL); DCHECK(b != NULL); - if (a == b) - return true; + if (a == b) + return true; if (a->flag_ != b->flag_) - return false; - if (a->ninst_ != b->ninst_) - return false; - for (int i = 0; i < a->ninst_; i++) - if (a->inst_[i] != b->inst_[i]) - return false; + return false; + if (a->ninst_ != b->ninst_) + return false; + for (int i = 0; i < a->ninst_; i++) + if (a->inst_[i] != b->inst_[i]) + return false; return true; - } - }; - + } + }; + typedef std::unordered_set<State*, StateHash, StateEqual> StateSet; - - private: + + private: // Make it easier to swap in a scalable reader-writer mutex. using CacheMutex = Mutex; - enum { - // Indices into start_ for unanchored searches. - // Add kStartAnchored for anchored searches. - kStartBeginText = 0, // text at beginning of context - kStartBeginLine = 2, // text at beginning of line - kStartAfterWordChar = 4, // text follows a word character - kStartAfterNonWordChar = 6, // text follows non-word character - kMaxStart = 8, - - kStartAnchored = 1, - }; - - // Resets the DFA State cache, flushing all saved State* information. - // Releases and reacquires cache_mutex_ via cache_lock, so any - // State* existing before the call are not valid after the call. - // Use a StateSaver to preserve important states across the call. - // cache_mutex_.r <= L < mutex_ - // After: cache_mutex_.w <= L < mutex_ - void ResetCache(RWLocker* cache_lock); - - // Looks up and returns the State corresponding to a Workq. - // L >= mutex_ + enum { + // Indices into start_ for unanchored searches. + // Add kStartAnchored for anchored searches. + kStartBeginText = 0, // text at beginning of context + kStartBeginLine = 2, // text at beginning of line + kStartAfterWordChar = 4, // text follows a word character + kStartAfterNonWordChar = 6, // text follows non-word character + kMaxStart = 8, + + kStartAnchored = 1, + }; + + // Resets the DFA State cache, flushing all saved State* information. + // Releases and reacquires cache_mutex_ via cache_lock, so any + // State* existing before the call are not valid after the call. + // Use a StateSaver to preserve important states across the call. + // cache_mutex_.r <= L < mutex_ + // After: cache_mutex_.w <= L < mutex_ + void ResetCache(RWLocker* cache_lock); + + // Looks up and returns the State corresponding to a Workq. + // L >= mutex_ State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag); - - // Looks up and returns a State matching the inst, ninst, and flag. - // L >= mutex_ + + // Looks up and returns a State matching the inst, ninst, and flag. + // L >= mutex_ State* CachedState(int* inst, int ninst, uint32_t flag); - - // Clear the cache entirely. - // Must hold cache_mutex_.w or be in destructor. - void ClearCache(); - - // Converts a State into a Workq: the opposite of WorkqToCachedState. - // L >= mutex_ + + // Clear the cache entirely. + // Must hold cache_mutex_.w or be in destructor. + void ClearCache(); + + // Converts a State into a Workq: the opposite of WorkqToCachedState. + // L >= mutex_ void StateToWorkq(State* s, Workq* q); - - // Runs a State on a given byte, returning the next state. - State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_ - State* RunStateOnByte(State*, int); // L >= mutex_ - - // Runs a Workq on a given byte followed by a set of empty-string flags, - // producing a new Workq in nq. If a match instruction is encountered, - // sets *ismatch to true. - // L >= mutex_ - void RunWorkqOnByte(Workq* q, Workq* nq, + + // Runs a State on a given byte, returning the next state. + State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_ + State* RunStateOnByte(State*, int); // L >= mutex_ + + // Runs a Workq on a given byte followed by a set of empty-string flags, + // producing a new Workq in nq. If a match instruction is encountered, + // sets *ismatch to true. + // L >= mutex_ + void RunWorkqOnByte(Workq* q, Workq* nq, int c, uint32_t flag, bool* ismatch); - - // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. - // L >= mutex_ + + // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. + // L >= mutex_ void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag); - - // Adds the instruction id to the Workq, following empty arrows - // according to flag. - // L >= mutex_ + + // Adds the instruction id to the Workq, following empty arrows + // according to flag. + // L >= mutex_ void AddToQueue(Workq* q, int id, uint32_t flag); - - // For debugging, returns a text representation of State. + + // For debugging, returns a text representation of State. static std::string DumpState(State* state); - - // For debugging, returns a text representation of a Workq. + + // For debugging, returns a text representation of a Workq. static std::string DumpWorkq(Workq* q); - - // Search parameters - struct SearchParams { - SearchParams(const StringPiece& text, const StringPiece& context, - RWLocker* cache_lock) + + // Search parameters + struct SearchParams { + SearchParams(const StringPiece& text, const StringPiece& context, + RWLocker* cache_lock) : text(text), context(context), - anchored(false), + anchored(false), can_prefix_accel(false), - want_earliest_match(false), - run_forward(false), - start(NULL), - cache_lock(cache_lock), - failed(false), - ep(NULL), + want_earliest_match(false), + run_forward(false), + start(NULL), + cache_lock(cache_lock), + failed(false), + ep(NULL), matches(NULL) {} - - StringPiece text; - StringPiece context; - bool anchored; + + StringPiece text; + StringPiece context; + bool anchored; bool can_prefix_accel; - bool want_earliest_match; - bool run_forward; - State* start; + bool want_earliest_match; + bool run_forward; + State* start; RWLocker* cache_lock; - bool failed; // "out" parameter: whether search gave up - const char* ep; // "out" parameter: end pointer for match + bool failed; // "out" parameter: whether search gave up + const char* ep; // "out" parameter: end pointer for match SparseSet* matches; - - private: + + private: SearchParams(const SearchParams&) = delete; SearchParams& operator=(const SearchParams&) = delete; - }; - - // Before each search, the parameters to Search are analyzed by + }; + + // Before each search, the parameters to Search are analyzed by // AnalyzeSearch to determine the state in which to start. - struct StartInfo { + struct StartInfo { StartInfo() : start(NULL) {} std::atomic<State*> start; - }; - + }; + // Fills in params->start and params->can_prefix_accel using - // the other search parameters. Returns true on success, - // false on failure. - // cache_mutex_.r <= L < mutex_ - bool AnalyzeSearch(SearchParams* params); + // the other search parameters. Returns true on success, + // false on failure. + // cache_mutex_.r <= L < mutex_ + bool AnalyzeSearch(SearchParams* params); bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint32_t flags); - - // The generic search loop, inlined to create specialized versions. - // cache_mutex_.r <= L < mutex_ - // Might unlock and relock cache_mutex_ via params->cache_lock. + + // The generic search loop, inlined to create specialized versions. + // cache_mutex_.r <= L < mutex_ + // Might unlock and relock cache_mutex_ via params->cache_lock. template <bool can_prefix_accel, bool want_earliest_match, bool run_forward> inline bool InlinedSearchLoop(SearchParams* params); - - // The specialized versions of InlinedSearchLoop. The three letters - // at the ends of the name denote the true/false values used as the - // last three parameters of InlinedSearchLoop. - // cache_mutex_.r <= L < mutex_ - // Might unlock and relock cache_mutex_ via params->cache_lock. - bool SearchFFF(SearchParams* params); - bool SearchFFT(SearchParams* params); - bool SearchFTF(SearchParams* params); - bool SearchFTT(SearchParams* params); - bool SearchTFF(SearchParams* params); - bool SearchTFT(SearchParams* params); - bool SearchTTF(SearchParams* params); - bool SearchTTT(SearchParams* params); - - // The main search loop: calls an appropriate specialized version of - // InlinedSearchLoop. - // cache_mutex_.r <= L < mutex_ - // Might unlock and relock cache_mutex_ via params->cache_lock. - bool FastSearchLoop(SearchParams* params); - - - // Looks up bytes in bytemap_ but handles case c == kByteEndText too. - int ByteMap(int c) { - if (c == kByteEndText) - return prog_->bytemap_range(); - return prog_->bytemap()[c]; - } - - // Constant after initialization. - Prog* prog_; // The regular expression program to run. - Prog::MatchKind kind_; // The kind of DFA. - bool init_failed_; // initialization failed (out of memory) - - Mutex mutex_; // mutex_ >= cache_mutex_.r - - // Scratch areas, protected by mutex_. - Workq* q0_; // Two pre-allocated work queues. - Workq* q1_; + + // The specialized versions of InlinedSearchLoop. The three letters + // at the ends of the name denote the true/false values used as the + // last three parameters of InlinedSearchLoop. + // cache_mutex_.r <= L < mutex_ + // Might unlock and relock cache_mutex_ via params->cache_lock. + bool SearchFFF(SearchParams* params); + bool SearchFFT(SearchParams* params); + bool SearchFTF(SearchParams* params); + bool SearchFTT(SearchParams* params); + bool SearchTFF(SearchParams* params); + bool SearchTFT(SearchParams* params); + bool SearchTTF(SearchParams* params); + bool SearchTTT(SearchParams* params); + + // The main search loop: calls an appropriate specialized version of + // InlinedSearchLoop. + // cache_mutex_.r <= L < mutex_ + // Might unlock and relock cache_mutex_ via params->cache_lock. + bool FastSearchLoop(SearchParams* params); + + + // Looks up bytes in bytemap_ but handles case c == kByteEndText too. + int ByteMap(int c) { + if (c == kByteEndText) + return prog_->bytemap_range(); + return prog_->bytemap()[c]; + } + + // Constant after initialization. + Prog* prog_; // The regular expression program to run. + Prog::MatchKind kind_; // The kind of DFA. + bool init_failed_; // initialization failed (out of memory) + + Mutex mutex_; // mutex_ >= cache_mutex_.r + + // Scratch areas, protected by mutex_. + Workq* q0_; // Two pre-allocated work queues. + Workq* q1_; PODArray<int> stack_; // Pre-allocated stack for AddToQueue - - // State* cache. Many threads use and add to the cache simultaneously, - // holding cache_mutex_ for reading and mutex_ (above) when adding. - // If the cache fills and needs to be discarded, the discarding is done - // while holding cache_mutex_ for writing, to avoid interrupting other - // readers. Any State* pointers are only valid while cache_mutex_ - // is held. + + // State* cache. Many threads use and add to the cache simultaneously, + // holding cache_mutex_ for reading and mutex_ (above) when adding. + // If the cache fills and needs to be discarded, the discarding is done + // while holding cache_mutex_ for writing, to avoid interrupting other + // readers. Any State* pointers are only valid while cache_mutex_ + // is held. CacheMutex cache_mutex_; int64_t mem_budget_; // Total memory budget for all States. int64_t state_budget_; // Amount of memory remaining for new States. - StateSet state_cache_; // All States computed so far. - StartInfo start_[kMaxStart]; + StateSet state_cache_; // All States computed so far. + StartInfo start_[kMaxStart]; DFA(const DFA&) = delete; DFA& operator=(const DFA&) = delete; -}; - +}; + // Shorthand for casting to uint8_t*. static inline const uint8_t* BytePtr(const void* v) { return reinterpret_cast<const uint8_t*>(v); -} - -// Work queues - -// Marks separate thread groups of different priority -// in the work queue when in leftmost-longest matching mode. -#define Mark (-1) - +} + +// Work queues + +// Marks separate thread groups of different priority +// in the work queue when in leftmost-longest matching mode. +#define Mark (-1) + // Separates the match IDs from the instructions in inst_. // Used only for "many match" DFA states. #define MatchSep (-2) -// Internally, the DFA uses a sparse array of -// program instruction pointers as a work queue. -// In leftmost longest mode, marks separate sections -// of workq that started executing at different -// locations in the string (earlier locations first). -class DFA::Workq : public SparseSet { - public: - // Constructor: n is number of normal slots, maxmark number of mark slots. - Workq(int n, int maxmark) : - SparseSet(n+maxmark), - n_(n), - maxmark_(maxmark), - nextmark_(n), - last_was_mark_(true) { - } - - bool is_mark(int i) { return i >= n_; } - - int maxmark() { return maxmark_; } - - void clear() { - SparseSet::clear(); - nextmark_ = n_; - } - - void mark() { - if (last_was_mark_) - return; - last_was_mark_ = false; - SparseSet::insert_new(nextmark_++); - } - - int size() { - return n_ + maxmark_; - } - - void insert(int id) { - if (contains(id)) - return; - insert_new(id); - } - - void insert_new(int id) { - last_was_mark_ = false; - SparseSet::insert_new(id); - } - - private: - int n_; // size excluding marks - int maxmark_; // maximum number of marks - int nextmark_; // id of next mark - bool last_was_mark_; // last inserted was mark +// Internally, the DFA uses a sparse array of +// program instruction pointers as a work queue. +// In leftmost longest mode, marks separate sections +// of workq that started executing at different +// locations in the string (earlier locations first). +class DFA::Workq : public SparseSet { + public: + // Constructor: n is number of normal slots, maxmark number of mark slots. + Workq(int n, int maxmark) : + SparseSet(n+maxmark), + n_(n), + maxmark_(maxmark), + nextmark_(n), + last_was_mark_(true) { + } + + bool is_mark(int i) { return i >= n_; } + + int maxmark() { return maxmark_; } + + void clear() { + SparseSet::clear(); + nextmark_ = n_; + } + + void mark() { + if (last_was_mark_) + return; + last_was_mark_ = false; + SparseSet::insert_new(nextmark_++); + } + + int size() { + return n_ + maxmark_; + } + + void insert(int id) { + if (contains(id)) + return; + insert_new(id); + } + + void insert_new(int id) { + last_was_mark_ = false; + SparseSet::insert_new(id); + } + + private: + int n_; // size excluding marks + int maxmark_; // maximum number of marks + int nextmark_; // id of next mark + bool last_was_mark_; // last inserted was mark Workq(const Workq&) = delete; Workq& operator=(const Workq&) = delete; -}; - +}; + DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem) - : prog_(prog), - kind_(kind), - init_failed_(false), - q0_(NULL), - q1_(NULL), + : prog_(prog), + kind_(kind), + init_failed_(false), + q0_(NULL), + q1_(NULL), mem_budget_(max_mem) { if (ExtraDebug) fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str()); - int nmark = 0; + int nmark = 0; if (kind_ == Prog::kLongestMatch) nmark = prog_->size(); // See DFA::AddToQueue() for why this is so. @@ -437,266 +437,266 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem) prog_->inst_count(kInstEmptyWidth) + prog_->inst_count(kInstNop) + nmark + 1; // + 1 for start inst - + // Account for space needed for DFA, q0, q1, stack. - mem_budget_ -= sizeof(DFA); - mem_budget_ -= (prog_->size() + nmark) * - (sizeof(int)+sizeof(int)) * 2; // q0, q1 + mem_budget_ -= sizeof(DFA); + mem_budget_ -= (prog_->size() + nmark) * + (sizeof(int)+sizeof(int)) * 2; // q0, q1 mem_budget_ -= nstack * sizeof(int); // stack - if (mem_budget_ < 0) { - init_failed_ = true; - return; - } - - state_budget_ = mem_budget_; - - // Make sure there is a reasonable amount of working room left. - // At minimum, the search requires room for two states in order - // to limp along, restarting frequently. We'll get better performance - // if there is room for a larger number of states, say 20. + if (mem_budget_ < 0) { + init_failed_ = true; + return; + } + + state_budget_ = mem_budget_; + + // Make sure there is a reasonable amount of working room left. + // At minimum, the search requires room for two states in order + // to limp along, restarting frequently. We'll get better performance + // if there is room for a larger number of states, say 20. // Note that a state stores list heads only, so we use the program // list count for the upper bound, not the program size. int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic<State*>) + (prog_->list_count()+nmark)*sizeof(int); - if (state_budget_ < 20*one_state) { - init_failed_ = true; - return; - } - + if (state_budget_ < 20*one_state) { + init_failed_ = true; + return; + } + q0_ = new Workq(prog_->size(), nmark); q1_ = new Workq(prog_->size(), nmark); stack_ = PODArray<int>(nstack); -} - -DFA::~DFA() { - delete q0_; - delete q1_; - ClearCache(); -} - -// In the DFA state graph, s->next[c] == NULL means that the -// state has not yet been computed and needs to be. We need -// a different special value to signal that s->next[c] is a -// state that can never lead to a match (and thus the search -// can be called off). Hence DeadState. -#define DeadState reinterpret_cast<State*>(1) - -// Signals that the rest of the string matches no matter what it is. -#define FullMatchState reinterpret_cast<State*>(2) - -#define SpecialStateMax FullMatchState - -// Debugging printouts - -// For debugging, returns a string representation of the work queue. +} + +DFA::~DFA() { + delete q0_; + delete q1_; + ClearCache(); +} + +// In the DFA state graph, s->next[c] == NULL means that the +// state has not yet been computed and needs to be. We need +// a different special value to signal that s->next[c] is a +// state that can never lead to a match (and thus the search +// can be called off). Hence DeadState. +#define DeadState reinterpret_cast<State*>(1) + +// Signals that the rest of the string matches no matter what it is. +#define FullMatchState reinterpret_cast<State*>(2) + +#define SpecialStateMax FullMatchState + +// Debugging printouts + +// For debugging, returns a string representation of the work queue. std::string DFA::DumpWorkq(Workq* q) { std::string s; - const char* sep = ""; + const char* sep = ""; for (Workq::iterator it = q->begin(); it != q->end(); ++it) { - if (q->is_mark(*it)) { + if (q->is_mark(*it)) { s += "|"; - sep = ""; - } else { + sep = ""; + } else { s += StringPrintf("%s%d", sep, *it); - sep = ","; - } - } - return s; -} - -// For debugging, returns a string representation of the state. + sep = ","; + } + } + return s; +} + +// For debugging, returns a string representation of the state. std::string DFA::DumpState(State* state) { - if (state == NULL) - return "_"; - if (state == DeadState) - return "X"; - if (state == FullMatchState) - return "*"; + if (state == NULL) + return "_"; + if (state == DeadState) + return "X"; + if (state == FullMatchState) + return "*"; std::string s; - const char* sep = ""; + const char* sep = ""; s += StringPrintf("(%p)", state); - for (int i = 0; i < state->ninst_; i++) { - if (state->inst_[i] == Mark) { + for (int i = 0; i < state->ninst_; i++) { + if (state->inst_[i] == Mark) { s += "|"; - sep = ""; + sep = ""; } else if (state->inst_[i] == MatchSep) { s += "||"; sep = ""; - } else { + } else { s += StringPrintf("%s%d", sep, state->inst_[i]); - sep = ","; - } - } + sep = ","; + } + } s += StringPrintf(" flag=%#x", state->flag_); - return s; -} - -////////////////////////////////////////////////////////////////////// -// -// DFA state graph construction. -// -// The DFA state graph is a heavily-linked collection of State* structures. -// The state_cache_ is a set of all the State structures ever allocated, -// so that if the same state is reached by two different paths, -// the same State structure can be used. This reduces allocation -// requirements and also avoids duplication of effort across the two -// identical states. -// -// A State is defined by an ordered list of instruction ids and a flag word. -// -// The choice of an ordered list of instructions differs from a typical -// textbook DFA implementation, which would use an unordered set. -// Textbook descriptions, however, only care about whether -// the DFA matches, not where it matches in the text. To decide where the -// DFA matches, we need to mimic the behavior of the dominant backtracking -// implementations like PCRE, which try one possible regular expression -// execution, then another, then another, stopping when one of them succeeds. -// The DFA execution tries these many executions in parallel, representing -// each by an instruction id. These pointers are ordered in the State.inst_ -// list in the same order that the executions would happen in a backtracking -// search: if a match is found during execution of inst_[2], inst_[i] for i>=3 -// can be discarded. -// -// Textbooks also typically do not consider context-aware empty string operators -// like ^ or $. These are handled by the flag word, which specifies the set -// of empty-string operators that should be matched when executing at the -// current text position. These flag bits are defined in prog.h. -// The flag word also contains two DFA-specific bits: kFlagMatch if the state -// is a matching state (one that reached a kInstMatch in the program) -// and kFlagLastWord if the last processed byte was a word character, for the -// implementation of \B and \b. -// -// The flag word also contains, shifted up 16 bits, the bits looked for by -// any kInstEmptyWidth instructions in the state. These provide a useful -// summary indicating when new flags might be useful. -// -// The permanent representation of a State's instruction ids is just an array, -// but while a state is being analyzed, these instruction ids are represented -// as a Workq, which is an array that allows iteration in insertion order. - -// NOTE(rsc): The choice of State construction determines whether the DFA -// mimics backtracking implementations (so-called leftmost first matching) or -// traditional DFA implementations (so-called leftmost longest matching as -// prescribed by POSIX). This implementation chooses to mimic the -// backtracking implementations, because we want to replace PCRE. To get -// POSIX behavior, the states would need to be considered not as a simple -// ordered list of instruction ids, but as a list of unordered sets of instruction -// ids. A match by a state in one set would inhibit the running of sets -// farther down the list but not other instruction ids in the same set. Each -// set would correspond to matches beginning at a given point in the string. -// This is implemented by separating different sets with Mark pointers. - -// Looks in the State cache for a State matching q, flag. -// If one is found, returns it. If one is not found, allocates one, -// inserts it in the cache, and returns it. + return s; +} + +////////////////////////////////////////////////////////////////////// +// +// DFA state graph construction. +// +// The DFA state graph is a heavily-linked collection of State* structures. +// The state_cache_ is a set of all the State structures ever allocated, +// so that if the same state is reached by two different paths, +// the same State structure can be used. This reduces allocation +// requirements and also avoids duplication of effort across the two +// identical states. +// +// A State is defined by an ordered list of instruction ids and a flag word. +// +// The choice of an ordered list of instructions differs from a typical +// textbook DFA implementation, which would use an unordered set. +// Textbook descriptions, however, only care about whether +// the DFA matches, not where it matches in the text. To decide where the +// DFA matches, we need to mimic the behavior of the dominant backtracking +// implementations like PCRE, which try one possible regular expression +// execution, then another, then another, stopping when one of them succeeds. +// The DFA execution tries these many executions in parallel, representing +// each by an instruction id. These pointers are ordered in the State.inst_ +// list in the same order that the executions would happen in a backtracking +// search: if a match is found during execution of inst_[2], inst_[i] for i>=3 +// can be discarded. +// +// Textbooks also typically do not consider context-aware empty string operators +// like ^ or $. These are handled by the flag word, which specifies the set +// of empty-string operators that should be matched when executing at the +// current text position. These flag bits are defined in prog.h. +// The flag word also contains two DFA-specific bits: kFlagMatch if the state +// is a matching state (one that reached a kInstMatch in the program) +// and kFlagLastWord if the last processed byte was a word character, for the +// implementation of \B and \b. +// +// The flag word also contains, shifted up 16 bits, the bits looked for by +// any kInstEmptyWidth instructions in the state. These provide a useful +// summary indicating when new flags might be useful. +// +// The permanent representation of a State's instruction ids is just an array, +// but while a state is being analyzed, these instruction ids are represented +// as a Workq, which is an array that allows iteration in insertion order. + +// NOTE(rsc): The choice of State construction determines whether the DFA +// mimics backtracking implementations (so-called leftmost first matching) or +// traditional DFA implementations (so-called leftmost longest matching as +// prescribed by POSIX). This implementation chooses to mimic the +// backtracking implementations, because we want to replace PCRE. To get +// POSIX behavior, the states would need to be considered not as a simple +// ordered list of instruction ids, but as a list of unordered sets of instruction +// ids. A match by a state in one set would inhibit the running of sets +// farther down the list but not other instruction ids in the same set. Each +// set would correspond to matches beginning at a given point in the string. +// This is implemented by separating different sets with Mark pointers. + +// Looks in the State cache for a State matching q, flag. +// If one is found, returns it. If one is not found, allocates one, +// inserts it in the cache, and returns it. // If mq is not null, MatchSep and the match IDs in mq will be appended // to the State. DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { //mutex_.AssertHeld(); - - // Construct array of instruction ids for the new state. - // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: - // those are the only operators with any effect in - // RunWorkqOnEmptyString or RunWorkqOnByte. + + // Construct array of instruction ids for the new state. + // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: + // those are the only operators with any effect in + // RunWorkqOnEmptyString or RunWorkqOnByte. PODArray<int> inst(q->size()); - int n = 0; + int n = 0; uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions bool sawmatch = false; // whether queue contains guaranteed kInstMatch bool sawmark = false; // whether queue contains a Mark if (ExtraDebug) - fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag); - for (Workq::iterator it = q->begin(); it != q->end(); ++it) { - int id = *it; - if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id))) - break; - if (q->is_mark(id)) { - if (n > 0 && inst[n-1] != Mark) { - sawmark = true; - inst[n++] = Mark; - } - continue; - } - Prog::Inst* ip = prog_->inst(id); - switch (ip->opcode()) { - case kInstAltMatch: - // This state will continue to a match no matter what - // the rest of the input is. If it is the highest priority match - // being considered, return the special FullMatchState - // to indicate that it's all matches from here out. - if (kind_ != Prog::kManyMatch && - (kind_ != Prog::kFirstMatch || - (it == q->begin() && ip->greedy(prog_))) && + fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag); + for (Workq::iterator it = q->begin(); it != q->end(); ++it) { + int id = *it; + if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id))) + break; + if (q->is_mark(id)) { + if (n > 0 && inst[n-1] != Mark) { + sawmark = true; + inst[n++] = Mark; + } + continue; + } + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + case kInstAltMatch: + // This state will continue to a match no matter what + // the rest of the input is. If it is the highest priority match + // being considered, return the special FullMatchState + // to indicate that it's all matches from here out. + if (kind_ != Prog::kManyMatch && + (kind_ != Prog::kFirstMatch || + (it == q->begin() && ip->greedy(prog_))) && (kind_ != Prog::kLongestMatch || !sawmark) && (flag & kFlagMatch)) { if (ExtraDebug) - fprintf(stderr, " -> FullMatchState\n"); - return FullMatchState; - } + fprintf(stderr, " -> FullMatchState\n"); + return FullMatchState; + } FALLTHROUGH_INTENDED; default: // Record iff id is the head of its list, which must // be the case if id-1 is the last of *its* list. :) if (prog_->inst(id-1)->last()) inst[n++] = *it; - if (ip->opcode() == kInstEmptyWidth) - needflags |= ip->empty(); - if (ip->opcode() == kInstMatch && !prog_->anchor_end()) - sawmatch = true; - break; - } - } - DCHECK_LE(n, q->size()); - if (n > 0 && inst[n-1] == Mark) - n--; - - // If there are no empty-width instructions waiting to execute, - // then the extra flag bits will not be used, so there is no - // point in saving them. (Discarding them reduces the number - // of distinct states.) - if (needflags == 0) - flag &= kFlagMatch; - - // NOTE(rsc): The code above cannot do flag &= needflags, - // because if the right flags were present to pass the current - // kInstEmptyWidth instructions, new kInstEmptyWidth instructions - // might be reached that in turn need different flags. - // The only sure thing is that if there are no kInstEmptyWidth - // instructions at all, no flags will be needed. - // We could do the extra work to figure out the full set of - // possibly needed flags by exploring past the kInstEmptyWidth - // instructions, but the check above -- are any flags needed - // at all? -- handles the most common case. More fine-grained - // analysis can only be justified by measurements showing that - // too many redundant states are being allocated. - - // If there are no Insts in the list, it's a dead state, - // which is useful to signal with a special pointer so that - // the execution loop can stop early. This is only okay - // if the state is *not* a matching state. - if (n == 0 && flag == 0) { + if (ip->opcode() == kInstEmptyWidth) + needflags |= ip->empty(); + if (ip->opcode() == kInstMatch && !prog_->anchor_end()) + sawmatch = true; + break; + } + } + DCHECK_LE(n, q->size()); + if (n > 0 && inst[n-1] == Mark) + n--; + + // If there are no empty-width instructions waiting to execute, + // then the extra flag bits will not be used, so there is no + // point in saving them. (Discarding them reduces the number + // of distinct states.) + if (needflags == 0) + flag &= kFlagMatch; + + // NOTE(rsc): The code above cannot do flag &= needflags, + // because if the right flags were present to pass the current + // kInstEmptyWidth instructions, new kInstEmptyWidth instructions + // might be reached that in turn need different flags. + // The only sure thing is that if there are no kInstEmptyWidth + // instructions at all, no flags will be needed. + // We could do the extra work to figure out the full set of + // possibly needed flags by exploring past the kInstEmptyWidth + // instructions, but the check above -- are any flags needed + // at all? -- handles the most common case. More fine-grained + // analysis can only be justified by measurements showing that + // too many redundant states are being allocated. + + // If there are no Insts in the list, it's a dead state, + // which is useful to signal with a special pointer so that + // the execution loop can stop early. This is only okay + // if the state is *not* a matching state. + if (n == 0 && flag == 0) { if (ExtraDebug) - fprintf(stderr, " -> DeadState\n"); - return DeadState; - } - - // If we're in longest match mode, the state is a sequence of - // unordered state sets separated by Marks. Sort each set - // to canonicalize, to reduce the number of distinct sets stored. - if (kind_ == Prog::kLongestMatch) { + fprintf(stderr, " -> DeadState\n"); + return DeadState; + } + + // If we're in longest match mode, the state is a sequence of + // unordered state sets separated by Marks. Sort each set + // to canonicalize, to reduce the number of distinct sets stored. + if (kind_ == Prog::kLongestMatch) { int* ip = inst.data(); - int* ep = ip + n; - while (ip < ep) { - int* markp = ip; - while (markp < ep && *markp != Mark) - markp++; + int* ep = ip + n; + while (ip < ep) { + int* markp = ip; + while (markp < ep && *markp != Mark) + markp++; std::sort(ip, markp); - if (markp < ep) - markp++; - ip = markp; - } - } - + if (markp < ep) + markp++; + ip = markp; + } + } + // If we're in many match mode, canonicalize for similar reasons: // we have an unordered set of states (i.e. we don't have Marks) // and sorting will reduce the number of distinct sets stored. @@ -717,47 +717,47 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { } } - // Save the needed empty-width flags in the top bits for use later. - flag |= needflags << kFlagNeedShift; - + // Save the needed empty-width flags in the top bits for use later. + flag |= needflags << kFlagNeedShift; + State* state = CachedState(inst.data(), n, flag); - return state; -} - -// Looks in the State cache for a State matching inst, ninst, flag. -// If one is found, returns it. If one is not found, allocates one, -// inserts it in the cache, and returns it. + return state; +} + +// Looks in the State cache for a State matching inst, ninst, flag. +// If one is found, returns it. If one is not found, allocates one, +// inserts it in the cache, and returns it. DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) { //mutex_.AssertHeld(); - - // Look in the cache for a pre-existing state. + + // Look in the cache for a pre-existing state. // We have to initialise the struct like this because otherwise // MSVC will complain about the flexible array member. :( State state; state.inst_ = inst; state.ninst_ = ninst; state.flag_ = flag; - StateSet::iterator it = state_cache_.find(&state); - if (it != state_cache_.end()) { + StateSet::iterator it = state_cache_.find(&state); + if (it != state_cache_.end()) { if (ExtraDebug) - fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str()); - return *it; - } - - // Must have enough memory for new state. - // In addition to what we're going to allocate, + fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str()); + return *it; + } + + // Must have enough memory for new state. + // In addition to what we're going to allocate, // the state cache hash table seems to incur about 40 bytes per - // State*, empirically. + // State*, empirically. const int kStateCacheOverhead = 40; - int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot + int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) + ninst*sizeof(int); - if (mem_budget_ < mem + kStateCacheOverhead) { - mem_budget_ = -1; - return NULL; - } - mem_budget_ -= mem + kStateCacheOverhead; - + if (mem_budget_ < mem + kStateCacheOverhead) { + mem_budget_ = -1; + return NULL; + } + mem_budget_ -= mem + kStateCacheOverhead; + // Allocate new state along with room for next_ and inst_. char* space = std::allocator<char>().allocate(mem); State* s = new (space) State; @@ -767,19 +767,19 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) { for (int i = 0; i < nnext; i++) (void) new (s->next_ + i) std::atomic<State*>(NULL); s->inst_ = new (s->next_ + nnext) int[ninst]; - memmove(s->inst_, inst, ninst*sizeof s->inst_[0]); - s->ninst_ = ninst; - s->flag_ = flag; + memmove(s->inst_, inst, ninst*sizeof s->inst_[0]); + s->ninst_ = ninst; + s->flag_ = flag; if (ExtraDebug) - fprintf(stderr, " -> %s\n", DumpState(s).c_str()); - - // Put state in cache and return it. - state_cache_.insert(s); - return s; -} - -// Clear the cache. Must hold cache_mutex_.w or be in destructor. -void DFA::ClearCache() { + fprintf(stderr, " -> %s\n", DumpState(s).c_str()); + + // Put state in cache and return it. + state_cache_.insert(s); + return s; +} + +// Clear the cache. Must hold cache_mutex_.w or be in destructor. +void DFA::ClearCache() { StateSet::iterator begin = state_cache_.begin(); StateSet::iterator end = state_cache_.end(); while (begin != end) { @@ -793,15 +793,15 @@ void DFA::ClearCache() { ninst*sizeof(int); std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem); } - state_cache_.clear(); -} - -// Copies insts in state s to the work queue q. -void DFA::StateToWorkq(State* s, Workq* q) { - q->clear(); - for (int i = 0; i < s->ninst_; i++) { + state_cache_.clear(); +} + +// Copies insts in state s to the work queue q. +void DFA::StateToWorkq(State* s, Workq* q) { + q->clear(); + for (int i = 0; i < s->ninst_; i++) { if (s->inst_[i] == Mark) { - q->mark(); + q->mark(); } else if (s->inst_[i] == MatchSep) { // Nothing after this is an instruction! break; @@ -809,12 +809,12 @@ void DFA::StateToWorkq(State* s, Workq* q) { // Explore from the head of the list. AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask); } - } -} - + } +} + // Adds ip to the work queue, following empty arrows according to flag. void DFA::AddToQueue(Workq* q, int id, uint32_t flag) { - + // Use stack_ to hold our stack of instructions yet to process. // It was preallocated as follows: // one entry per Capture; @@ -825,66 +825,66 @@ void DFA::AddToQueue(Workq* q, int id, uint32_t flag) { // When using marks, we also added nmark == prog_->size(). // (Otherwise, nmark == 0.) int* stk = stack_.data(); - int nstk = 0; - - stk[nstk++] = id; - while (nstk > 0) { + int nstk = 0; + + stk[nstk++] = id; + while (nstk > 0) { DCHECK_LE(nstk, stack_.size()); - id = stk[--nstk]; - + id = stk[--nstk]; + Loop: - if (id == Mark) { - q->mark(); - continue; - } - - if (id == 0) - continue; - - // If ip is already on the queue, nothing to do. + if (id == Mark) { + q->mark(); + continue; + } + + if (id == 0) + continue; + + // If ip is already on the queue, nothing to do. // Otherwise add it. We don't actually keep all the // ones that get added, but adding all of them here - // increases the likelihood of q->contains(id), - // reducing the amount of duplicated work. - if (q->contains(id)) - continue; - q->insert_new(id); - - // Process instruction. - Prog::Inst* ip = prog_->inst(id); - switch (ip->opcode()) { + // increases the likelihood of q->contains(id), + // reducing the amount of duplicated work. + if (q->contains(id)) + continue; + q->insert_new(id); + + // Process instruction. + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { default: LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); - break; - - case kInstByteRange: // just save these on the queue - case kInstMatch: + break; + + case kInstByteRange: // just save these on the queue + case kInstMatch: if (ip->last()) break; id = id+1; goto Loop; - - case kInstCapture: // DFA treats captures as no-ops. - case kInstNop: + + case kInstCapture: // DFA treats captures as no-ops. + case kInstNop: if (!ip->last()) stk[nstk++] = id+1; - + // If this instruction is the [00-FF]* loop at the beginning of // a leftmost-longest unanchored search, separate with a Mark so // that future threads (which will start farther to the right in // the input string) are lower priority than current threads. if (ip->opcode() == kInstNop && q->maxmark() > 0 && - id == prog_->start_unanchored() && id != prog_->start()) - stk[nstk++] = Mark; + id == prog_->start_unanchored() && id != prog_->start()) + stk[nstk++] = Mark; id = ip->out(); goto Loop; - + case kInstAltMatch: DCHECK(!ip->last()); id = id+1; goto Loop; - case kInstEmptyWidth: + case kInstEmptyWidth: if (!ip->last()) stk[nstk++] = id+1; @@ -893,67 +893,67 @@ void DFA::AddToQueue(Workq* q, int id, uint32_t flag) { break; id = ip->out(); goto Loop; - } - } -} - -// Running of work queues. In the work queue, order matters: -// the queue is sorted in priority order. If instruction i comes before j, -// then the instructions that i produces during the run must come before -// the ones that j produces. In order to keep this invariant, all the -// work queue runners have to take an old queue to process and then -// also a new queue to fill in. It's not acceptable to add to the end of -// an existing queue, because new instructions will not end up in the -// correct position. - -// Runs the work queue, processing the empty strings indicated by flag. -// For example, flag == kEmptyBeginLine|kEmptyEndLine means to match -// both ^ and $. It is important that callers pass all flags at once: -// processing both ^ and $ is not the same as first processing only ^ -// and then processing only $. Doing the two-step sequence won't match -// ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior -// exhibited by existing implementations). + } + } +} + +// Running of work queues. In the work queue, order matters: +// the queue is sorted in priority order. If instruction i comes before j, +// then the instructions that i produces during the run must come before +// the ones that j produces. In order to keep this invariant, all the +// work queue runners have to take an old queue to process and then +// also a new queue to fill in. It's not acceptable to add to the end of +// an existing queue, because new instructions will not end up in the +// correct position. + +// Runs the work queue, processing the empty strings indicated by flag. +// For example, flag == kEmptyBeginLine|kEmptyEndLine means to match +// both ^ and $. It is important that callers pass all flags at once: +// processing both ^ and $ is not the same as first processing only ^ +// and then processing only $. Doing the two-step sequence won't match +// ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior +// exhibited by existing implementations). void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) { - newq->clear(); - for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { - if (oldq->is_mark(*i)) - AddToQueue(newq, Mark, flag); - else - AddToQueue(newq, *i, flag); - } -} - -// Runs the work queue, processing the single byte c followed by any empty -// strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine, -// means to match c$. Sets the bool *ismatch to true if the end of the -// regular expression program has been reached (the regexp has matched). -void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, + newq->clear(); + for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { + if (oldq->is_mark(*i)) + AddToQueue(newq, Mark, flag); + else + AddToQueue(newq, *i, flag); + } +} + +// Runs the work queue, processing the single byte c followed by any empty +// strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine, +// means to match c$. Sets the bool *ismatch to true if the end of the +// regular expression program has been reached (the regexp has matched). +void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, int c, uint32_t flag, bool* ismatch) { //mutex_.AssertHeld(); - - newq->clear(); - for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { - if (oldq->is_mark(*i)) { - if (*ismatch) - return; - newq->mark(); - continue; - } - int id = *i; - Prog::Inst* ip = prog_->inst(id); - switch (ip->opcode()) { + + newq->clear(); + for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { + if (oldq->is_mark(*i)) { + if (*ismatch) + return; + newq->mark(); + continue; + } + int id = *i; + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { default: LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); break; - case kInstFail: // never succeeds - case kInstCapture: // already followed - case kInstNop: // already followed - case kInstAltMatch: // already followed - case kInstEmptyWidth: // already followed - break; - - case kInstByteRange: // can follow if c is in range + case kInstFail: // never succeeds + case kInstCapture: // already followed + case kInstNop: // already followed + case kInstAltMatch: // already followed + case kInstEmptyWidth: // already followed + break; + + case kInstByteRange: // can follow if c is in range if (!ip->Matches(c)) break; AddToQueue(newq, ip->out(), flag); @@ -969,363 +969,363 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, ++ip; i += ip - ip0; } - break; - - case kInstMatch: + break; + + case kInstMatch: if (prog_->anchor_end() && c != kByteEndText && kind_ != Prog::kManyMatch) - break; - *ismatch = true; + break; + *ismatch = true; if (kind_ == Prog::kFirstMatch) { - // Can stop processing work queue since we found a match. - return; - } - break; - } - } - + // Can stop processing work queue since we found a match. + return; + } + break; + } + } + if (ExtraDebug) fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch); -} - -// Processes input byte c in state, returning new state. -// Caller does not hold mutex. -DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) { - // Keep only one RunStateOnByte going - // even if the DFA is being run by multiple threads. - MutexLock l(&mutex_); - return RunStateOnByte(state, c); -} - -// Processes input byte c in state, returning new state. -DFA::State* DFA::RunStateOnByte(State* state, int c) { +} + +// Processes input byte c in state, returning new state. +// Caller does not hold mutex. +DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) { + // Keep only one RunStateOnByte going + // even if the DFA is being run by multiple threads. + MutexLock l(&mutex_); + return RunStateOnByte(state, c); +} + +// Processes input byte c in state, returning new state. +DFA::State* DFA::RunStateOnByte(State* state, int c) { //mutex_.AssertHeld(); - if (state <= SpecialStateMax) { - if (state == FullMatchState) { - // It is convenient for routines like PossibleMatchRange - // if we implement RunStateOnByte for FullMatchState: - // once you get into this state you never get out, - // so it's pretty easy. - return FullMatchState; - } - if (state == DeadState) { - LOG(DFATAL) << "DeadState in RunStateOnByte"; - return NULL; - } - if (state == NULL) { - LOG(DFATAL) << "NULL state in RunStateOnByte"; - return NULL; - } - LOG(DFATAL) << "Unexpected special state in RunStateOnByte"; - return NULL; - } - - // If someone else already computed this, return it. + if (state <= SpecialStateMax) { + if (state == FullMatchState) { + // It is convenient for routines like PossibleMatchRange + // if we implement RunStateOnByte for FullMatchState: + // once you get into this state you never get out, + // so it's pretty easy. + return FullMatchState; + } + if (state == DeadState) { + LOG(DFATAL) << "DeadState in RunStateOnByte"; + return NULL; + } + if (state == NULL) { + LOG(DFATAL) << "NULL state in RunStateOnByte"; + return NULL; + } + LOG(DFATAL) << "Unexpected special state in RunStateOnByte"; + return NULL; + } + + // If someone else already computed this, return it. State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed); if (ns != NULL) return ns; - - // Convert state into Workq. - StateToWorkq(state, q0_); - - // Flags marking the kinds of empty-width things (^ $ etc) - // around this byte. Before the byte we have the flags recorded - // in the State structure itself. After the byte we have - // nothing yet (but that will change: read on). + + // Convert state into Workq. + StateToWorkq(state, q0_); + + // Flags marking the kinds of empty-width things (^ $ etc) + // around this byte. Before the byte we have the flags recorded + // in the State structure itself. After the byte we have + // nothing yet (but that will change: read on). uint32_t needflag = state->flag_ >> kFlagNeedShift; uint32_t beforeflag = state->flag_ & kFlagEmptyMask; uint32_t oldbeforeflag = beforeflag; uint32_t afterflag = 0; - - if (c == '\n') { - // Insert implicit $ and ^ around \n - beforeflag |= kEmptyEndLine; - afterflag |= kEmptyBeginLine; - } - - if (c == kByteEndText) { - // Insert implicit $ and \z before the fake "end text" byte. - beforeflag |= kEmptyEndLine | kEmptyEndText; - } - - // The state flag kFlagLastWord says whether the last - // byte processed was a word character. Use that info to - // insert empty-width (non-)word boundaries. + + if (c == '\n') { + // Insert implicit $ and ^ around \n + beforeflag |= kEmptyEndLine; + afterflag |= kEmptyBeginLine; + } + + if (c == kByteEndText) { + // Insert implicit $ and \z before the fake "end text" byte. + beforeflag |= kEmptyEndLine | kEmptyEndText; + } + + // The state flag kFlagLastWord says whether the last + // byte processed was a word character. Use that info to + // insert empty-width (non-)word boundaries. bool islastword = (state->flag_ & kFlagLastWord) != 0; bool isword = c != kByteEndText && Prog::IsWordChar(static_cast<uint8_t>(c)); - if (isword == islastword) - beforeflag |= kEmptyNonWordBoundary; - else - beforeflag |= kEmptyWordBoundary; - - // Okay, finally ready to run. - // Only useful to rerun on empty string if there are new, useful flags. - if (beforeflag & ~oldbeforeflag & needflag) { - RunWorkqOnEmptyString(q0_, q1_, beforeflag); + if (isword == islastword) + beforeflag |= kEmptyNonWordBoundary; + else + beforeflag |= kEmptyWordBoundary; + + // Okay, finally ready to run. + // Only useful to rerun on empty string if there are new, useful flags. + if (beforeflag & ~oldbeforeflag & needflag) { + RunWorkqOnEmptyString(q0_, q1_, beforeflag); using std::swap; - swap(q0_, q1_); - } - bool ismatch = false; + swap(q0_, q1_); + } + bool ismatch = false; RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch); using std::swap; swap(q0_, q1_); - - // Save afterflag along with ismatch and isword in new state. + + // Save afterflag along with ismatch and isword in new state. uint32_t flag = afterflag; - if (ismatch) - flag |= kFlagMatch; - if (isword) - flag |= kFlagLastWord; - + if (ismatch) + flag |= kFlagMatch; + if (isword) + flag |= kFlagLastWord; + if (ismatch && kind_ == Prog::kManyMatch) ns = WorkqToCachedState(q0_, q1_, flag); else ns = WorkqToCachedState(q0_, NULL, flag); - + // Flush ns before linking to it. - // Write barrier before updating state->next_ so that the - // main search loop can proceed without any locking, for speed. - // (Otherwise it would need one mutex operation per input byte.) + // Write barrier before updating state->next_ so that the + // main search loop can proceed without any locking, for speed. + // (Otherwise it would need one mutex operation per input byte.) state->next_[ByteMap(c)].store(ns, std::memory_order_release); - return ns; -} - - -////////////////////////////////////////////////////////////////////// -// DFA cache reset. - -// Reader-writer lock helper. -// -// The DFA uses a reader-writer mutex to protect the state graph itself. -// Traversing the state graph requires holding the mutex for reading, -// and discarding the state graph and starting over requires holding the -// lock for writing. If a search needs to expand the graph but is out -// of memory, it will need to drop its read lock and then acquire the -// write lock. Since it cannot then atomically downgrade from write lock -// to read lock, it runs the rest of the search holding the write lock. -// (This probably helps avoid repeated contention, but really the decision -// is forced by the Mutex interface.) It's a bit complicated to keep -// track of whether the lock is held for reading or writing and thread -// that through the search, so instead we encapsulate it in the RWLocker -// and pass that around. - -class DFA::RWLocker { - public: + return ns; +} + + +////////////////////////////////////////////////////////////////////// +// DFA cache reset. + +// Reader-writer lock helper. +// +// The DFA uses a reader-writer mutex to protect the state graph itself. +// Traversing the state graph requires holding the mutex for reading, +// and discarding the state graph and starting over requires holding the +// lock for writing. If a search needs to expand the graph but is out +// of memory, it will need to drop its read lock and then acquire the +// write lock. Since it cannot then atomically downgrade from write lock +// to read lock, it runs the rest of the search holding the write lock. +// (This probably helps avoid repeated contention, but really the decision +// is forced by the Mutex interface.) It's a bit complicated to keep +// track of whether the lock is held for reading or writing and thread +// that through the search, so instead we encapsulate it in the RWLocker +// and pass that around. + +class DFA::RWLocker { + public: explicit RWLocker(CacheMutex* mu); - ~RWLocker(); - - // If the lock is only held for reading right now, - // drop the read lock and re-acquire for writing. - // Subsequent calls to LockForWriting are no-ops. - // Notice that the lock is *released* temporarily. - void LockForWriting(); - - private: + ~RWLocker(); + + // If the lock is only held for reading right now, + // drop the read lock and re-acquire for writing. + // Subsequent calls to LockForWriting are no-ops. + // Notice that the lock is *released* temporarily. + void LockForWriting(); + + private: CacheMutex* mu_; - bool writing_; - + bool writing_; + RWLocker(const RWLocker&) = delete; RWLocker& operator=(const RWLocker&) = delete; -}; - +}; + DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) { - mu_->ReaderLock(); -} - + mu_->ReaderLock(); +} + // This function is marked as NO_THREAD_SAFETY_ANALYSIS because // the annotations don't support lock upgrade. -void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { - if (!writing_) { - mu_->ReaderUnlock(); +void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { + if (!writing_) { + mu_->ReaderUnlock(); mu_->WriterLock(); - writing_ = true; - } -} - -DFA::RWLocker::~RWLocker() { + writing_ = true; + } +} + +DFA::RWLocker::~RWLocker() { if (!writing_) mu_->ReaderUnlock(); else - mu_->WriterUnlock(); -} - - -// When the DFA's State cache fills, we discard all the states in the -// cache and start over. Many threads can be using and adding to the -// cache at the same time, so we synchronize using the cache_mutex_ -// to keep from stepping on other threads. Specifically, all the -// threads using the current cache hold cache_mutex_ for reading. -// When a thread decides to flush the cache, it drops cache_mutex_ -// and then re-acquires it for writing. That ensures there are no -// other threads accessing the cache anymore. The rest of the search -// runs holding cache_mutex_ for writing, avoiding any contention -// with or cache pollution caused by other threads. - -void DFA::ResetCache(RWLocker* cache_lock) { - // Re-acquire the cache_mutex_ for writing (exclusive use). - cache_lock->LockForWriting(); - + mu_->WriterUnlock(); +} + + +// When the DFA's State cache fills, we discard all the states in the +// cache and start over. Many threads can be using and adding to the +// cache at the same time, so we synchronize using the cache_mutex_ +// to keep from stepping on other threads. Specifically, all the +// threads using the current cache hold cache_mutex_ for reading. +// When a thread decides to flush the cache, it drops cache_mutex_ +// and then re-acquires it for writing. That ensures there are no +// other threads accessing the cache anymore. The rest of the search +// runs holding cache_mutex_ for writing, avoiding any contention +// with or cache pollution caused by other threads. + +void DFA::ResetCache(RWLocker* cache_lock) { + // Re-acquire the cache_mutex_ for writing (exclusive use). + cache_lock->LockForWriting(); + hooks::GetDFAStateCacheResetHook()({ state_budget_, state_cache_.size(), }); - // Clear the cache, reset the memory budget. + // Clear the cache, reset the memory budget. for (int i = 0; i < kMaxStart; i++) start_[i].start.store(NULL, std::memory_order_relaxed); - ClearCache(); - mem_budget_ = state_budget_; -} - -// Typically, a couple States do need to be preserved across a cache -// reset, like the State at the current point in the search. -// The StateSaver class helps keep States across cache resets. -// It makes a copy of the state's guts outside the cache (before the reset) -// and then can be asked, after the reset, to recreate the State -// in the new cache. For example, in a DFA method ("this" is a DFA): -// -// StateSaver saver(this, s); -// ResetCache(cache_lock); -// s = saver.Restore(); -// -// The saver should always have room in the cache to re-create the state, -// because resetting the cache locks out all other threads, and the cache -// is known to have room for at least a couple states (otherwise the DFA -// constructor fails). - -class DFA::StateSaver { - public: - explicit StateSaver(DFA* dfa, State* state); - ~StateSaver(); - - // Recreates and returns a state equivalent to the - // original state passed to the constructor. - // Returns NULL if the cache has filled, but - // since the DFA guarantees to have room in the cache - // for a couple states, should never return NULL - // if used right after ResetCache. - State* Restore(); - - private: - DFA* dfa_; // the DFA to use - int* inst_; // saved info from State - int ninst_; + ClearCache(); + mem_budget_ = state_budget_; +} + +// Typically, a couple States do need to be preserved across a cache +// reset, like the State at the current point in the search. +// The StateSaver class helps keep States across cache resets. +// It makes a copy of the state's guts outside the cache (before the reset) +// and then can be asked, after the reset, to recreate the State +// in the new cache. For example, in a DFA method ("this" is a DFA): +// +// StateSaver saver(this, s); +// ResetCache(cache_lock); +// s = saver.Restore(); +// +// The saver should always have room in the cache to re-create the state, +// because resetting the cache locks out all other threads, and the cache +// is known to have room for at least a couple states (otherwise the DFA +// constructor fails). + +class DFA::StateSaver { + public: + explicit StateSaver(DFA* dfa, State* state); + ~StateSaver(); + + // Recreates and returns a state equivalent to the + // original state passed to the constructor. + // Returns NULL if the cache has filled, but + // since the DFA guarantees to have room in the cache + // for a couple states, should never return NULL + // if used right after ResetCache. + State* Restore(); + + private: + DFA* dfa_; // the DFA to use + int* inst_; // saved info from State + int ninst_; uint32_t flag_; - bool is_special_; // whether original state was special - State* special_; // if is_special_, the original state - + bool is_special_; // whether original state was special + State* special_; // if is_special_, the original state + StateSaver(const StateSaver&) = delete; StateSaver& operator=(const StateSaver&) = delete; -}; - -DFA::StateSaver::StateSaver(DFA* dfa, State* state) { - dfa_ = dfa; - if (state <= SpecialStateMax) { - inst_ = NULL; - ninst_ = 0; - flag_ = 0; - is_special_ = true; - special_ = state; - return; - } - is_special_ = false; - special_ = NULL; - flag_ = state->flag_; - ninst_ = state->ninst_; - inst_ = new int[ninst_]; - memmove(inst_, state->inst_, ninst_*sizeof inst_[0]); -} - -DFA::StateSaver::~StateSaver() { - if (!is_special_) - delete[] inst_; -} - -DFA::State* DFA::StateSaver::Restore() { - if (is_special_) - return special_; - MutexLock l(&dfa_->mutex_); - State* s = dfa_->CachedState(inst_, ninst_, flag_); - if (s == NULL) - LOG(DFATAL) << "StateSaver failed to restore state."; - return s; -} - - -////////////////////////////////////////////////////////////////////// -// -// DFA execution. -// -// The basic search loop is easy: start in a state s and then for each -// byte c in the input, s = s->next[c]. -// -// This simple description omits a few efficiency-driven complications. -// -// First, the State graph is constructed incrementally: it is possible -// that s->next[c] is null, indicating that that state has not been -// fully explored. In this case, RunStateOnByte must be invoked to -// determine the next state, which is cached in s->next[c] to save -// future effort. An alternative reason for s->next[c] to be null is -// that the DFA has reached a so-called "dead state", in which any match -// is no longer possible. In this case RunStateOnByte will return NULL -// and the processing of the string can stop early. -// -// Second, a 256-element pointer array for s->next_ makes each State -// quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[] -// maps from bytes to "byte classes" and then next_ only needs to have -// as many pointers as there are byte classes. A byte class is simply a -// range of bytes that the regexp never distinguishes between. -// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1, -// 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit -// but in exchange we typically cut the size of a State (and thus our -// memory footprint) by about 5-10x. The comments still refer to -// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]]. -// -// Third, it is common for a DFA for an unanchored match to begin in a -// state in which only one particular byte value can take the DFA to a -// different state. That is, s->next[c] != s for only one c. In this -// situation, the DFA can do better than executing the simple loop. -// Instead, it can call memchr to search very quickly for the byte c. -// Whether the start state has this property is determined during a +}; + +DFA::StateSaver::StateSaver(DFA* dfa, State* state) { + dfa_ = dfa; + if (state <= SpecialStateMax) { + inst_ = NULL; + ninst_ = 0; + flag_ = 0; + is_special_ = true; + special_ = state; + return; + } + is_special_ = false; + special_ = NULL; + flag_ = state->flag_; + ninst_ = state->ninst_; + inst_ = new int[ninst_]; + memmove(inst_, state->inst_, ninst_*sizeof inst_[0]); +} + +DFA::StateSaver::~StateSaver() { + if (!is_special_) + delete[] inst_; +} + +DFA::State* DFA::StateSaver::Restore() { + if (is_special_) + return special_; + MutexLock l(&dfa_->mutex_); + State* s = dfa_->CachedState(inst_, ninst_, flag_); + if (s == NULL) + LOG(DFATAL) << "StateSaver failed to restore state."; + return s; +} + + +////////////////////////////////////////////////////////////////////// +// +// DFA execution. +// +// The basic search loop is easy: start in a state s and then for each +// byte c in the input, s = s->next[c]. +// +// This simple description omits a few efficiency-driven complications. +// +// First, the State graph is constructed incrementally: it is possible +// that s->next[c] is null, indicating that that state has not been +// fully explored. In this case, RunStateOnByte must be invoked to +// determine the next state, which is cached in s->next[c] to save +// future effort. An alternative reason for s->next[c] to be null is +// that the DFA has reached a so-called "dead state", in which any match +// is no longer possible. In this case RunStateOnByte will return NULL +// and the processing of the string can stop early. +// +// Second, a 256-element pointer array for s->next_ makes each State +// quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[] +// maps from bytes to "byte classes" and then next_ only needs to have +// as many pointers as there are byte classes. A byte class is simply a +// range of bytes that the regexp never distinguishes between. +// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1, +// 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit +// but in exchange we typically cut the size of a State (and thus our +// memory footprint) by about 5-10x. The comments still refer to +// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]]. +// +// Third, it is common for a DFA for an unanchored match to begin in a +// state in which only one particular byte value can take the DFA to a +// different state. That is, s->next[c] != s for only one c. In this +// situation, the DFA can do better than executing the simple loop. +// Instead, it can call memchr to search very quickly for the byte c. +// Whether the start state has this property is determined during a // pre-compilation pass and the "can_prefix_accel" argument is set. -// -// Fourth, the desired behavior is to search for the leftmost-best match -// (approximately, the same one that Perl would find), which is not -// necessarily the match ending earliest in the string. Each time a -// match is found, it must be noted, but the DFA must continue on in -// hope of finding a higher-priority match. In some cases, the caller only -// cares whether there is any match at all, not which one is found. -// The "want_earliest_match" flag causes the search to stop at the first -// match found. -// -// Fifth, one algorithm that uses the DFA needs it to run over the -// input string backward, beginning at the end and ending at the beginning. -// Passing false for the "run_forward" flag causes the DFA to run backward. -// -// The checks for these last three cases, which in a naive implementation -// would be performed once per input byte, slow the general loop enough -// to merit specialized versions of the search loop for each of the -// eight possible settings of the three booleans. Rather than write -// eight different functions, we write one general implementation and then -// inline it to create the specialized ones. -// -// Note that matches are delayed by one byte, to make it easier to -// accomodate match conditions depending on the next input byte (like $ and \b). -// When s->next[c]->IsMatch(), it means that there is a match ending just -// *before* byte c. - -// The generic search loop. Searches text for a match, returning -// the pointer to the end of the chosen match, or NULL if no match. -// The bools are equal to the same-named variables in params, but -// making them function arguments lets the inliner specialize -// this function to each combination (see two paragraphs above). +// +// Fourth, the desired behavior is to search for the leftmost-best match +// (approximately, the same one that Perl would find), which is not +// necessarily the match ending earliest in the string. Each time a +// match is found, it must be noted, but the DFA must continue on in +// hope of finding a higher-priority match. In some cases, the caller only +// cares whether there is any match at all, not which one is found. +// The "want_earliest_match" flag causes the search to stop at the first +// match found. +// +// Fifth, one algorithm that uses the DFA needs it to run over the +// input string backward, beginning at the end and ending at the beginning. +// Passing false for the "run_forward" flag causes the DFA to run backward. +// +// The checks for these last three cases, which in a naive implementation +// would be performed once per input byte, slow the general loop enough +// to merit specialized versions of the search loop for each of the +// eight possible settings of the three booleans. Rather than write +// eight different functions, we write one general implementation and then +// inline it to create the specialized ones. +// +// Note that matches are delayed by one byte, to make it easier to +// accomodate match conditions depending on the next input byte (like $ and \b). +// When s->next[c]->IsMatch(), it means that there is a match ending just +// *before* byte c. + +// The generic search loop. Searches text for a match, returning +// the pointer to the end of the chosen match, or NULL if no match. +// The bools are equal to the same-named variables in params, but +// making them function arguments lets the inliner specialize +// this function to each combination (see two paragraphs above). template <bool can_prefix_accel, bool want_earliest_match, bool run_forward> inline bool DFA::InlinedSearchLoop(SearchParams* params) { - State* start = params->start; + State* start = params->start; const uint8_t* bp = BytePtr(params->text.data()); // start of text const uint8_t* p = bp; // text scanning point const uint8_t* ep = BytePtr(params->text.data() + @@ -1333,20 +1333,20 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { const uint8_t* resetp = NULL; // p at last cache reset if (!run_forward) { using std::swap; - swap(p, ep); + swap(p, ep); } - + const uint8_t* bytemap = prog_->bytemap(); const uint8_t* lastmatch = NULL; // most recent matching position in text - bool matched = false; + bool matched = false; - State* s = start; + State* s = start; if (ExtraDebug) fprintf(stderr, "@stx: %s\n", DumpState(s).c_str()); - - if (s->IsMatch()) { - matched = true; - lastmatch = p; + + if (s->IsMatch()) { + matched = true; + lastmatch = p; if (ExtraDebug) fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str()); if (params->matches != NULL && kind_ == Prog::kManyMatch) { @@ -1357,13 +1357,13 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { params->matches->insert(id); } } - if (want_earliest_match) { - params->ep = reinterpret_cast<const char*>(lastmatch); - return true; - } - } - - while (p != ep) { + if (want_earliest_match) { + params->ep = reinterpret_cast<const char*>(lastmatch); + return true; + } + } + + while (p != ep) { if (ExtraDebug) fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); @@ -1375,95 +1375,95 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { if (p == NULL) { p = ep; break; - } - } - - int c; - if (run_forward) - c = *p++; - else - c = *--p; - - // Note that multiple threads might be consulting - // s->next_[bytemap[c]] simultaneously. - // RunStateOnByte takes care of the appropriate locking, - // including a memory barrier so that the unlocked access - // (sometimes known as "double-checked locking") is safe. - // The alternative would be either one DFA per thread - // or one mutex operation per input byte. - // - // ns == DeadState means the state is known to be dead - // (no more matches are possible). - // ns == NULL means the state has not yet been computed - // (need to call RunStateOnByteUnlocked). - // RunStateOnByte returns ns == NULL if it is out of memory. - // ns == FullMatchState means the rest of the string matches. - // - // Okay to use bytemap[] not ByteMap() here, because - // c is known to be an actual byte and not kByteEndText. - + } + } + + int c; + if (run_forward) + c = *p++; + else + c = *--p; + + // Note that multiple threads might be consulting + // s->next_[bytemap[c]] simultaneously. + // RunStateOnByte takes care of the appropriate locking, + // including a memory barrier so that the unlocked access + // (sometimes known as "double-checked locking") is safe. + // The alternative would be either one DFA per thread + // or one mutex operation per input byte. + // + // ns == DeadState means the state is known to be dead + // (no more matches are possible). + // ns == NULL means the state has not yet been computed + // (need to call RunStateOnByteUnlocked). + // RunStateOnByte returns ns == NULL if it is out of memory. + // ns == FullMatchState means the rest of the string matches. + // + // Okay to use bytemap[] not ByteMap() here, because + // c is known to be an actual byte and not kByteEndText. + State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire); - if (ns == NULL) { - ns = RunStateOnByteUnlocked(s, c); - if (ns == NULL) { - // After we reset the cache, we hold cache_mutex exclusively, - // so if resetp != NULL, it means we filled the DFA state - // cache with this search alone (without any other threads). - // Benchmarks show that doing a state computation on every - // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the - // same at about 2 MB/s. Unless we're processing an average - // of 10 bytes per state computation, fail so that RE2 can + if (ns == NULL) { + ns = RunStateOnByteUnlocked(s, c); + if (ns == NULL) { + // After we reset the cache, we hold cache_mutex exclusively, + // so if resetp != NULL, it means we filled the DFA state + // cache with this search alone (without any other threads). + // Benchmarks show that doing a state computation on every + // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the + // same at about 2 MB/s. Unless we're processing an average + // of 10 bytes per state computation, fail so that RE2 can // fall back to the NFA. However, RE2::Set cannot fall back, // so we just have to keep on keeping on in that case. if (dfa_should_bail_when_slow && resetp != NULL && static_cast<size_t>(p - resetp) < 10*state_cache_.size() && kind_ != Prog::kManyMatch) { - params->failed = true; - return false; - } - resetp = p; - - // Prepare to save start and s across the reset. - StateSaver save_start(this, start); - StateSaver save_s(this, s); - - // Discard all the States in the cache. - ResetCache(params->cache_lock); - - // Restore start and s so we can continue. - if ((start = save_start.Restore()) == NULL || - (s = save_s.Restore()) == NULL) { - // Restore already did LOG(DFATAL). - params->failed = true; - return false; - } - ns = RunStateOnByteUnlocked(s, c); - if (ns == NULL) { - LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache"; - params->failed = true; - return false; - } - } - } - if (ns <= SpecialStateMax) { - if (ns == DeadState) { - params->ep = reinterpret_cast<const char*>(lastmatch); - return matched; - } - // FullMatchState - params->ep = reinterpret_cast<const char*>(ep); - return true; - } - - s = ns; - if (s->IsMatch()) { - matched = true; - // The DFA notices the match one byte late, - // so adjust p before using it in the match. - if (run_forward) - lastmatch = p - 1; - else - lastmatch = p + 1; + params->failed = true; + return false; + } + resetp = p; + + // Prepare to save start and s across the reset. + StateSaver save_start(this, start); + StateSaver save_s(this, s); + + // Discard all the States in the cache. + ResetCache(params->cache_lock); + + // Restore start and s so we can continue. + if ((start = save_start.Restore()) == NULL || + (s = save_s.Restore()) == NULL) { + // Restore already did LOG(DFATAL). + params->failed = true; + return false; + } + ns = RunStateOnByteUnlocked(s, c); + if (ns == NULL) { + LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache"; + params->failed = true; + return false; + } + } + } + if (ns <= SpecialStateMax) { + if (ns == DeadState) { + params->ep = reinterpret_cast<const char*>(lastmatch); + return matched; + } + // FullMatchState + params->ep = reinterpret_cast<const char*>(ep); + return true; + } + + s = ns; + if (s->IsMatch()) { + matched = true; + // The DFA notices the match one byte late, + // so adjust p before using it in the match. + if (run_forward) + lastmatch = p - 1; + else + lastmatch = p + 1; if (ExtraDebug) fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str()); if (params->matches != NULL && kind_ == Prog::kManyMatch) { @@ -1474,63 +1474,63 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { params->matches->insert(id); } } - if (want_earliest_match) { - params->ep = reinterpret_cast<const char*>(lastmatch); - return true; - } - } - } - - // Process one more byte to see if it triggers a match. - // (Remember, matches are delayed one byte.) + if (want_earliest_match) { + params->ep = reinterpret_cast<const char*>(lastmatch); + return true; + } + } + } + + // Process one more byte to see if it triggers a match. + // (Remember, matches are delayed one byte.) if (ExtraDebug) fprintf(stderr, "@etx: %s\n", DumpState(s).c_str()); - int lastbyte; - if (run_forward) { + int lastbyte; + if (run_forward) { if (EndPtr(params->text) == EndPtr(params->context)) - lastbyte = kByteEndText; - else + lastbyte = kByteEndText; + else lastbyte = EndPtr(params->text)[0] & 0xFF; - } else { + } else { if (BeginPtr(params->text) == BeginPtr(params->context)) - lastbyte = kByteEndText; - else + lastbyte = kByteEndText; + else lastbyte = BeginPtr(params->text)[-1] & 0xFF; - } - + } + State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire); - if (ns == NULL) { - ns = RunStateOnByteUnlocked(s, lastbyte); - if (ns == NULL) { - StateSaver save_s(this, s); - ResetCache(params->cache_lock); - if ((s = save_s.Restore()) == NULL) { - params->failed = true; - return false; - } - ns = RunStateOnByteUnlocked(s, lastbyte); - if (ns == NULL) { - LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset"; - params->failed = true; - return false; - } - } - } + if (ns == NULL) { + ns = RunStateOnByteUnlocked(s, lastbyte); + if (ns == NULL) { + StateSaver save_s(this, s); + ResetCache(params->cache_lock); + if ((s = save_s.Restore()) == NULL) { + params->failed = true; + return false; + } + ns = RunStateOnByteUnlocked(s, lastbyte); + if (ns == NULL) { + LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset"; + params->failed = true; + return false; + } + } + } if (ns <= SpecialStateMax) { if (ns == DeadState) { params->ep = reinterpret_cast<const char*>(lastmatch); return matched; } // FullMatchState - params->ep = reinterpret_cast<const char*>(ep); - return true; - } + params->ep = reinterpret_cast<const char*>(ep); + return true; + } s = ns; if (s->IsMatch()) { - matched = true; - lastmatch = p; + matched = true; + lastmatch = p; if (ExtraDebug) fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str()); if (params->matches != NULL && kind_ == Prog::kManyMatch) { @@ -1541,146 +1541,146 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { params->matches->insert(id); } } - } - - params->ep = reinterpret_cast<const char*>(lastmatch); - return matched; -} - -// Inline specializations of the general loop. -bool DFA::SearchFFF(SearchParams* params) { + } + + params->ep = reinterpret_cast<const char*>(lastmatch); + return matched; +} + +// Inline specializations of the general loop. +bool DFA::SearchFFF(SearchParams* params) { return InlinedSearchLoop<false, false, false>(params); -} -bool DFA::SearchFFT(SearchParams* params) { +} +bool DFA::SearchFFT(SearchParams* params) { return InlinedSearchLoop<false, false, true>(params); -} -bool DFA::SearchFTF(SearchParams* params) { +} +bool DFA::SearchFTF(SearchParams* params) { return InlinedSearchLoop<false, true, false>(params); -} -bool DFA::SearchFTT(SearchParams* params) { +} +bool DFA::SearchFTT(SearchParams* params) { return InlinedSearchLoop<false, true, true>(params); -} -bool DFA::SearchTFF(SearchParams* params) { +} +bool DFA::SearchTFF(SearchParams* params) { return InlinedSearchLoop<true, false, false>(params); -} -bool DFA::SearchTFT(SearchParams* params) { +} +bool DFA::SearchTFT(SearchParams* params) { return InlinedSearchLoop<true, false, true>(params); -} -bool DFA::SearchTTF(SearchParams* params) { +} +bool DFA::SearchTTF(SearchParams* params) { return InlinedSearchLoop<true, true, false>(params); -} -bool DFA::SearchTTT(SearchParams* params) { +} +bool DFA::SearchTTT(SearchParams* params) { return InlinedSearchLoop<true, true, true>(params); -} - -// For performance, calls the appropriate specialized version -// of InlinedSearchLoop. -bool DFA::FastSearchLoop(SearchParams* params) { - // Because the methods are private, the Searches array - // cannot be declared at top level. - static bool (DFA::*Searches[])(SearchParams*) = { - &DFA::SearchFFF, - &DFA::SearchFFT, - &DFA::SearchFTF, - &DFA::SearchFTT, - &DFA::SearchTFF, - &DFA::SearchTFT, - &DFA::SearchTTF, - &DFA::SearchTTT, - }; - +} + +// For performance, calls the appropriate specialized version +// of InlinedSearchLoop. +bool DFA::FastSearchLoop(SearchParams* params) { + // Because the methods are private, the Searches array + // cannot be declared at top level. + static bool (DFA::*Searches[])(SearchParams*) = { + &DFA::SearchFFF, + &DFA::SearchFFT, + &DFA::SearchFTF, + &DFA::SearchFTT, + &DFA::SearchTFF, + &DFA::SearchTFT, + &DFA::SearchTTF, + &DFA::SearchTTT, + }; + int index = 4 * params->can_prefix_accel + - 2 * params->want_earliest_match + - 1 * params->run_forward; - return (this->*Searches[index])(params); -} - - -// The discussion of DFA execution above ignored the question of how -// to determine the initial state for the search loop. There are two -// factors that influence the choice of start state. -// -// The first factor is whether the search is anchored or not. -// The regexp program (Prog*) itself has -// two different entry points: one for anchored searches and one for -// unanchored searches. (The unanchored version starts with a leading ".*?" -// and then jumps to the anchored one.) -// -// The second factor is where text appears in the larger context, which -// determines which empty-string operators can be matched at the beginning -// of execution. If text is at the very beginning of context, \A and ^ match. -// Otherwise if text is at the beginning of a line, then ^ matches. -// Otherwise it matters whether the character before text is a word character -// or a non-word character. -// -// The two cases (unanchored vs not) and four cases (empty-string flags) -// combine to make the eight cases recorded in the DFA's begin_text_[2], -// begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached -// StartInfos. The start state for each is filled in the first time it -// is used for an actual search. - -// Examines text, context, and anchored to determine the right start -// state for the DFA search loop. Fills in params and returns true on success. -// Returns false on failure. -bool DFA::AnalyzeSearch(SearchParams* params) { - const StringPiece& text = params->text; - const StringPiece& context = params->context; - - // Sanity check: make sure that text lies within context. + 2 * params->want_earliest_match + + 1 * params->run_forward; + return (this->*Searches[index])(params); +} + + +// The discussion of DFA execution above ignored the question of how +// to determine the initial state for the search loop. There are two +// factors that influence the choice of start state. +// +// The first factor is whether the search is anchored or not. +// The regexp program (Prog*) itself has +// two different entry points: one for anchored searches and one for +// unanchored searches. (The unanchored version starts with a leading ".*?" +// and then jumps to the anchored one.) +// +// The second factor is where text appears in the larger context, which +// determines which empty-string operators can be matched at the beginning +// of execution. If text is at the very beginning of context, \A and ^ match. +// Otherwise if text is at the beginning of a line, then ^ matches. +// Otherwise it matters whether the character before text is a word character +// or a non-word character. +// +// The two cases (unanchored vs not) and four cases (empty-string flags) +// combine to make the eight cases recorded in the DFA's begin_text_[2], +// begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached +// StartInfos. The start state for each is filled in the first time it +// is used for an actual search. + +// Examines text, context, and anchored to determine the right start +// state for the DFA search loop. Fills in params and returns true on success. +// Returns false on failure. +bool DFA::AnalyzeSearch(SearchParams* params) { + const StringPiece& text = params->text; + const StringPiece& context = params->context; + + // Sanity check: make sure that text lies within context. if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { LOG(DFATAL) << "context does not contain text"; - params->start = DeadState; - return true; - } - - // Determine correct search type. - int start; + params->start = DeadState; + return true; + } + + // Determine correct search type. + int start; uint32_t flags; - if (params->run_forward) { + if (params->run_forward) { if (BeginPtr(text) == BeginPtr(context)) { - start = kStartBeginText; - flags = kEmptyBeginText|kEmptyBeginLine; + start = kStartBeginText; + flags = kEmptyBeginText|kEmptyBeginLine; } else if (BeginPtr(text)[-1] == '\n') { - start = kStartBeginLine; - flags = kEmptyBeginLine; + start = kStartBeginLine; + flags = kEmptyBeginLine; } else if (Prog::IsWordChar(BeginPtr(text)[-1] & 0xFF)) { - start = kStartAfterWordChar; - flags = kFlagLastWord; - } else { - start = kStartAfterNonWordChar; - flags = 0; - } - } else { + start = kStartAfterWordChar; + flags = kFlagLastWord; + } else { + start = kStartAfterNonWordChar; + flags = 0; + } + } else { if (EndPtr(text) == EndPtr(context)) { - start = kStartBeginText; - flags = kEmptyBeginText|kEmptyBeginLine; + start = kStartBeginText; + flags = kEmptyBeginText|kEmptyBeginLine; } else if (EndPtr(text)[0] == '\n') { - start = kStartBeginLine; - flags = kEmptyBeginLine; + start = kStartBeginLine; + flags = kEmptyBeginLine; } else if (Prog::IsWordChar(EndPtr(text)[0] & 0xFF)) { - start = kStartAfterWordChar; - flags = kFlagLastWord; - } else { - start = kStartAfterNonWordChar; - flags = 0; - } - } + start = kStartAfterWordChar; + flags = kFlagLastWord; + } else { + start = kStartAfterNonWordChar; + flags = 0; + } + } if (params->anchored) - start |= kStartAnchored; - StartInfo* info = &start_[start]; - - // Try once without cache_lock for writing. - // Try again after resetting the cache - // (ResetCache will relock cache_lock for writing). - if (!AnalyzeSearchHelper(params, info, flags)) { - ResetCache(params->cache_lock); - if (!AnalyzeSearchHelper(params, info, flags)) { - LOG(DFATAL) << "Failed to analyze start state."; - params->failed = true; - return false; - } - } - + start |= kStartAnchored; + StartInfo* info = &start_[start]; + + // Try once without cache_lock for writing. + // Try again after resetting the cache + // (ResetCache will relock cache_lock for writing). + if (!AnalyzeSearchHelper(params, info, flags)) { + ResetCache(params->cache_lock); + if (!AnalyzeSearchHelper(params, info, flags)) { + LOG(DFATAL) << "Failed to analyze start state."; + params->failed = true; + return false; + } + } + params->start = info->start.load(std::memory_order_acquire); // Even if we could prefix accel, we cannot do so when anchored and, @@ -1695,99 +1695,99 @@ bool DFA::AnalyzeSearch(SearchParams* params) { if (ExtraDebug) fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", - params->anchored, params->run_forward, flags, + params->anchored, params->run_forward, flags, DumpState(params->start).c_str(), params->can_prefix_accel); - - return true; -} - -// Fills in info if needed. Returns true on success, false on failure. -bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, + + return true; +} + +// Fills in info if needed. Returns true on success, false on failure. +bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint32_t flags) { // Quick check. State* start = info->start.load(std::memory_order_acquire); if (start != NULL) - return true; - - MutexLock l(&mutex_); + return true; + + MutexLock l(&mutex_); start = info->start.load(std::memory_order_relaxed); if (start != NULL) - return true; - - q0_->clear(); - AddToQueue(q0_, - params->anchored ? prog_->start() : prog_->start_unanchored(), - flags); + return true; + + q0_->clear(); + AddToQueue(q0_, + params->anchored ? prog_->start() : prog_->start_unanchored(), + flags); start = WorkqToCachedState(q0_, NULL, flags); if (start == NULL) - return false; - + return false; + // Synchronize with "quick check" above. info->start.store(start, std::memory_order_release); - return true; -} - -// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop. -bool DFA::Search(const StringPiece& text, - const StringPiece& context, - bool anchored, - bool want_earliest_match, - bool run_forward, - bool* failed, - const char** epp, + return true; +} + +// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop. +bool DFA::Search(const StringPiece& text, + const StringPiece& context, + bool anchored, + bool want_earliest_match, + bool run_forward, + bool* failed, + const char** epp, SparseSet* matches) { - *epp = NULL; - if (!ok()) { - *failed = true; - return false; - } - *failed = false; - + *epp = NULL; + if (!ok()) { + *failed = true; + return false; + } + *failed = false; + if (ExtraDebug) { - fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); - fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", + fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); + fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_); - } - - RWLocker l(&cache_mutex_); - SearchParams params(text, context, &l); - params.anchored = anchored; - params.want_earliest_match = want_earliest_match; - params.run_forward = run_forward; - params.matches = matches; - - if (!AnalyzeSearch(¶ms)) { - *failed = true; - return false; - } - if (params.start == DeadState) + } + + RWLocker l(&cache_mutex_); + SearchParams params(text, context, &l); + params.anchored = anchored; + params.want_earliest_match = want_earliest_match; + params.run_forward = run_forward; + params.matches = matches; + + if (!AnalyzeSearch(¶ms)) { + *failed = true; + return false; + } + if (params.start == DeadState) return false; - if (params.start == FullMatchState) { - if (run_forward == want_earliest_match) + if (params.start == FullMatchState) { + if (run_forward == want_earliest_match) *epp = text.data(); - else + else *epp = text.data() + text.size(); - return true; - } + return true; + } if (ExtraDebug) - fprintf(stderr, "start %s\n", DumpState(params.start).c_str()); - bool ret = FastSearchLoop(¶ms); - if (params.failed) { - *failed = true; - return false; - } - *epp = params.ep; - return ret; -} - -DFA* Prog::GetDFA(MatchKind kind) { - // For a forward DFA, half the memory goes to each DFA. + fprintf(stderr, "start %s\n", DumpState(params.start).c_str()); + bool ret = FastSearchLoop(¶ms); + if (params.failed) { + *failed = true; + return false; + } + *epp = params.ep; + return ret; +} + +DFA* Prog::GetDFA(MatchKind kind) { + // For a forward DFA, half the memory goes to each DFA. // However, if it is a "many match" DFA, then there is // no counterpart with which the memory must be shared. // - // For a reverse DFA, all the memory goes to the - // "longest match" DFA, because RE2 never does reverse - // "first match" searches. + // For a reverse DFA, all the memory goes to the + // "longest match" DFA, because RE2 never does reverse + // "first match" searches. if (kind == kFirstMatch) { std::call_once(dfa_first_once_, [](Prog* prog) { prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2); @@ -1806,55 +1806,55 @@ DFA* Prog::GetDFA(MatchKind kind) { prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_); }, this); return dfa_longest_; - } + } } - + void Prog::DeleteDFA(DFA* dfa) { delete dfa; -} - -// Executes the regexp program to search in text, -// which itself is inside the larger context. (As a convenience, -// passing a NULL context is equivalent to passing text.) -// Returns true if a match is found, false if not. -// If a match is found, fills in match0->end() to point at the end of the match -// and sets match0->begin() to text.begin(), since the DFA can't track -// where the match actually began. -// -// This is the only external interface (class DFA only exists in this file). -// -bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, +} + +// Executes the regexp program to search in text, +// which itself is inside the larger context. (As a convenience, +// passing a NULL context is equivalent to passing text.) +// Returns true if a match is found, false if not. +// If a match is found, fills in match0->end() to point at the end of the match +// and sets match0->begin() to text.begin(), since the DFA can't track +// where the match actually began. +// +// This is the only external interface (class DFA only exists in this file). +// +bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, Anchor anchor, MatchKind kind, StringPiece* match0, bool* failed, SparseSet* matches) { - *failed = false; - - StringPiece context = const_context; + *failed = false; + + StringPiece context = const_context; if (context.data() == NULL) - context = text; + context = text; bool caret = anchor_start(); - bool dollar = anchor_end(); - if (reversed_) { + bool dollar = anchor_end(); + if (reversed_) { using std::swap; swap(caret, dollar); - } + } if (caret && BeginPtr(context) != BeginPtr(text)) - return false; + return false; if (dollar && EndPtr(context) != EndPtr(text)) - return false; - - // Handle full match by running an anchored longest match - // and then checking if it covers all of text. - bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; - bool endmatch = false; - if (kind == kManyMatch) { + return false; + + // Handle full match by running an anchored longest match + // and then checking if it covers all of text. + bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; + bool endmatch = false; + if (kind == kManyMatch) { // This is split out in order to avoid clobbering kind. - } else if (kind == kFullMatch || anchor_end()) { - endmatch = true; - kind = kLongestMatch; - } - - // If the caller doesn't care where the match is (just whether one exists), - // then we can stop at the very first match we find, the so-called + } else if (kind == kFullMatch || anchor_end()) { + endmatch = true; + kind = kLongestMatch; + } + + // If the caller doesn't care where the match is (just whether one exists), + // then we can stop at the very first match we find, the so-called // "earliest match". bool want_earliest_match = false; if (kind == kManyMatch) { @@ -1864,62 +1864,62 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, } } else if (match0 == NULL && !endmatch) { want_earliest_match = true; - kind = kLongestMatch; - } - - DFA* dfa = GetDFA(kind); - const char* ep; - bool matched = dfa->Search(text, context, anchored, + kind = kLongestMatch; + } + + DFA* dfa = GetDFA(kind); + const char* ep; + bool matched = dfa->Search(text, context, anchored, want_earliest_match, !reversed_, - failed, &ep, matches); + failed, &ep, matches); if (*failed) { hooks::GetDFASearchFailureHook()({ // Nothing yet... }); - return false; + return false; } - if (!matched) - return false; + if (!matched) + return false; if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size())) - return false; - - // If caller cares, record the boundary of the match. - // We only know where it ends, so use the boundary of text - // as the beginning. - if (match0) { - if (reversed_) + return false; + + // If caller cares, record the boundary of the match. + // We only know where it ends, so use the boundary of text + // as the beginning. + if (match0) { + if (reversed_) *match0 = StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep)); - else + else *match0 = StringPiece(text.data(), static_cast<size_t>(ep - text.data())); - } - return true; -} - -// Build out all states in DFA. Returns number of states. + } + return true; +} + +// Build out all states in DFA. Returns number of states. int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { - if (!ok()) - return 0; - - // Pick out start state for unanchored search - // at beginning of text. - RWLocker l(&cache_mutex_); + if (!ok()) + return 0; + + // Pick out start state for unanchored search + // at beginning of text. + RWLocker l(&cache_mutex_); SearchParams params(StringPiece(), StringPiece(), &l); - params.anchored = false; + params.anchored = false; if (!AnalyzeSearch(¶ms) || params.start == NULL || params.start == DeadState) - return 0; - - // Add start state to work queue. + return 0; + + // Add start state to work queue. // Note that any State* that we handle here must point into the cache, // so we can simply depend on pointer-as-a-number hashing and equality. std::unordered_map<State*, int> m; std::deque<State*> q; m.emplace(params.start, static_cast<int>(m.size())); - q.push_back(params.start); - + q.push_back(params.start); + // Compute the input bytes needed to cover all of the next pointers. int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot std::vector<int> input(nnext); @@ -1934,13 +1934,13 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { // Scratch space for the output. std::vector<int> output(nnext); - // Flood to expand every state. + // Flood to expand every state. bool oom = false; while (!q.empty()) { State* s = q.front(); q.pop_front(); for (int c : input) { - State* ns = RunStateOnByteUnlocked(s, c); + State* ns = RunStateOnByteUnlocked(s, c); if (ns == NULL) { oom = true; break; @@ -1951,168 +1951,168 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { } if (m.find(ns) == m.end()) { m.emplace(ns, static_cast<int>(m.size())); - q.push_back(ns); - } + q.push_back(ns); + } output[ByteMap(c)] = m[ns]; - } + } if (cb) cb(oom ? NULL : output.data(), s == FullMatchState || s->IsMatch()); if (oom) break; - } - + } + return static_cast<int>(m.size()); -} - -// Build out all states in DFA for kind. Returns number of states. +} + +// Build out all states in DFA for kind. Returns number of states. int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) { return GetDFA(kind)->BuildAllStates(cb); -} - -// Computes min and max for matching string. -// Won't return strings bigger than maxlen. +} + +// Computes min and max for matching string. +// Won't return strings bigger than maxlen. bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { - if (!ok()) - return false; - - // NOTE: if future users of PossibleMatchRange want more precision when - // presented with infinitely repeated elements, consider making this a - // parameter to PossibleMatchRange. - static int kMaxEltRepetitions = 0; - - // Keep track of the number of times we've visited states previously. We only - // revisit a given state if it's part of a repeated group, so if the value - // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set - // |*max| to |PrefixSuccessor(*max)|. - // - // Also note that previously_visited_states[UnseenStatePtr] will, in the STL - // tradition, implicitly insert a '0' value at first use. We take advantage - // of that property below. + if (!ok()) + return false; + + // NOTE: if future users of PossibleMatchRange want more precision when + // presented with infinitely repeated elements, consider making this a + // parameter to PossibleMatchRange. + static int kMaxEltRepetitions = 0; + + // Keep track of the number of times we've visited states previously. We only + // revisit a given state if it's part of a repeated group, so if the value + // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set + // |*max| to |PrefixSuccessor(*max)|. + // + // Also note that previously_visited_states[UnseenStatePtr] will, in the STL + // tradition, implicitly insert a '0' value at first use. We take advantage + // of that property below. std::unordered_map<State*, int> previously_visited_states; - - // Pick out start state for anchored search at beginning of text. - RWLocker l(&cache_mutex_); + + // Pick out start state for anchored search at beginning of text. + RWLocker l(&cache_mutex_); SearchParams params(StringPiece(), StringPiece(), &l); - params.anchored = true; - if (!AnalyzeSearch(¶ms)) - return false; - if (params.start == DeadState) { // No matching strings - *min = ""; - *max = ""; - return true; - } - if (params.start == FullMatchState) // Every string matches: no max - return false; - - // The DFA is essentially a big graph rooted at params.start, - // and paths in the graph correspond to accepted strings. - // Each node in the graph has potentially 256+1 arrows - // coming out, one for each byte plus the magic end of - // text character kByteEndText. - - // To find the smallest possible prefix of an accepted - // string, we just walk the graph preferring to follow - // arrows with the lowest bytes possible. To find the - // largest possible prefix, we follow the largest bytes - // possible. - - // The test for whether there is an arrow from s on byte j is - // ns = RunStateOnByteUnlocked(s, j); - // if (ns == NULL) - // return false; - // if (ns != DeadState && ns->ninst > 0) - // The RunStateOnByteUnlocked call asks the DFA to build out the graph. - // It returns NULL only if the DFA has run out of memory, - // in which case we can't be sure of anything. - // The second check sees whether there was graph built - // and whether it is interesting graph. Nodes might have - // ns->ninst == 0 if they exist only to represent the fact - // that a match was found on the previous byte. - - // Build minimum prefix. - State* s = params.start; - min->clear(); + params.anchored = true; + if (!AnalyzeSearch(¶ms)) + return false; + if (params.start == DeadState) { // No matching strings + *min = ""; + *max = ""; + return true; + } + if (params.start == FullMatchState) // Every string matches: no max + return false; + + // The DFA is essentially a big graph rooted at params.start, + // and paths in the graph correspond to accepted strings. + // Each node in the graph has potentially 256+1 arrows + // coming out, one for each byte plus the magic end of + // text character kByteEndText. + + // To find the smallest possible prefix of an accepted + // string, we just walk the graph preferring to follow + // arrows with the lowest bytes possible. To find the + // largest possible prefix, we follow the largest bytes + // possible. + + // The test for whether there is an arrow from s on byte j is + // ns = RunStateOnByteUnlocked(s, j); + // if (ns == NULL) + // return false; + // if (ns != DeadState && ns->ninst > 0) + // The RunStateOnByteUnlocked call asks the DFA to build out the graph. + // It returns NULL only if the DFA has run out of memory, + // in which case we can't be sure of anything. + // The second check sees whether there was graph built + // and whether it is interesting graph. Nodes might have + // ns->ninst == 0 if they exist only to represent the fact + // that a match was found on the previous byte. + + // Build minimum prefix. + State* s = params.start; + min->clear(); MutexLock lock(&mutex_); - for (int i = 0; i < maxlen; i++) { + for (int i = 0; i < maxlen; i++) { if (previously_visited_states[s] > kMaxEltRepetitions) - break; - previously_visited_states[s]++; - - // Stop if min is a match. + break; + previously_visited_states[s]++; + + // Stop if min is a match. State* ns = RunStateOnByte(s, kByteEndText); - if (ns == NULL) // DFA out of memory - return false; - if (ns != DeadState && (ns == FullMatchState || ns->IsMatch())) - break; - - // Try to extend the string with low bytes. - bool extended = false; - for (int j = 0; j < 256; j++) { + if (ns == NULL) // DFA out of memory + return false; + if (ns != DeadState && (ns == FullMatchState || ns->IsMatch())) + break; + + // Try to extend the string with low bytes. + bool extended = false; + for (int j = 0; j < 256; j++) { ns = RunStateOnByte(s, j); - if (ns == NULL) // DFA out of memory - return false; - if (ns == FullMatchState || - (ns > SpecialStateMax && ns->ninst_ > 0)) { - extended = true; + if (ns == NULL) // DFA out of memory + return false; + if (ns == FullMatchState || + (ns > SpecialStateMax && ns->ninst_ > 0)) { + extended = true; min->append(1, static_cast<char>(j)); - s = ns; - break; - } - } - if (!extended) - break; - } - - // Build maximum prefix. - previously_visited_states.clear(); - s = params.start; - max->clear(); - for (int i = 0; i < maxlen; i++) { + s = ns; + break; + } + } + if (!extended) + break; + } + + // Build maximum prefix. + previously_visited_states.clear(); + s = params.start; + max->clear(); + for (int i = 0; i < maxlen; i++) { if (previously_visited_states[s] > kMaxEltRepetitions) - break; - previously_visited_states[s] += 1; - - // Try to extend the string with high bytes. - bool extended = false; - for (int j = 255; j >= 0; j--) { + break; + previously_visited_states[s] += 1; + + // Try to extend the string with high bytes. + bool extended = false; + for (int j = 255; j >= 0; j--) { State* ns = RunStateOnByte(s, j); - if (ns == NULL) - return false; - if (ns == FullMatchState || - (ns > SpecialStateMax && ns->ninst_ > 0)) { - extended = true; + if (ns == NULL) + return false; + if (ns == FullMatchState || + (ns > SpecialStateMax && ns->ninst_ > 0)) { + extended = true; max->append(1, static_cast<char>(j)); - s = ns; - break; - } - } - if (!extended) { - // Done, no need for PrefixSuccessor. - return true; - } - } - - // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b + s = ns; + break; + } + } + if (!extended) { + // Done, no need for PrefixSuccessor. + return true; + } + } + + // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b PrefixSuccessor(max); - - // If there are no bytes left, we have no way to say "there is no maximum - // string". We could make the interface more complicated and be able to - // return "there is no maximum but here is a minimum", but that seems like - // overkill -- the most common no-max case is all possible strings, so not - // telling the caller that the empty string is the minimum match isn't a - // great loss. - if (max->empty()) - return false; - - return true; -} - -// PossibleMatchRange for a Prog. + + // If there are no bytes left, we have no way to say "there is no maximum + // string". We could make the interface more complicated and be able to + // return "there is no maximum but here is a minimum", but that seems like + // overkill -- the most common no-max case is all possible strings, so not + // telling the caller that the empty string is the minimum match isn't a + // great loss. + if (max->empty()) + return false; + + return true; +} + +// PossibleMatchRange for a Prog. bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { // Have to use dfa_longest_ to get all strings for full matches. // For example, (a|aa) never matches aa in first-match mode. return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen); -} - -} // namespace re2 +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/filtered_re2.cc b/contrib/libs/re2/re2/filtered_re2.cc index 3de2ec8124..5df97456e2 100644 --- a/contrib/libs/re2/re2/filtered_re2.cc +++ b/contrib/libs/re2/re2/filtered_re2.cc @@ -1,8 +1,8 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "re2/filtered_re2.h" +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re2/filtered_re2.h" #include <stddef.h> #include <string> @@ -10,26 +10,26 @@ #include "util/util.h" #include "util/logging.h" -#include "re2/prefilter.h" -#include "re2/prefilter_tree.h" - -namespace re2 { - -FilteredRE2::FilteredRE2() - : compiled_(false), - prefilter_tree_(new PrefilterTree()) { -} - +#include "re2/prefilter.h" +#include "re2/prefilter_tree.h" + +namespace re2 { + +FilteredRE2::FilteredRE2() + : compiled_(false), + prefilter_tree_(new PrefilterTree()) { +} + FilteredRE2::FilteredRE2(int min_atom_len) : compiled_(false), prefilter_tree_(new PrefilterTree(min_atom_len)) { } -FilteredRE2::~FilteredRE2() { +FilteredRE2::~FilteredRE2() { for (size_t i = 0; i < re2_vec_.size(); i++) - delete re2_vec_[i]; -} - + delete re2_vec_[i]; +} + FilteredRE2::FilteredRE2(FilteredRE2&& other) : re2_vec_(std::move(other.re2_vec_)), compiled_(other.compiled_), @@ -46,79 +46,79 @@ FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { return *this; } -RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, - const RE2::Options& options, int* id) { - RE2* re = new RE2(pattern, options); - RE2::ErrorCode code = re->error_code(); - - if (!re->ok()) { +RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, + const RE2::Options& options, int* id) { + RE2* re = new RE2(pattern, options); + RE2::ErrorCode code = re->error_code(); + + if (!re->ok()) { if (options.log_errors()) { LOG(ERROR) << "Couldn't compile regular expression, skipping: " << pattern << " due to error " << re->error(); } - delete re; - } else { + delete re; + } else { *id = static_cast<int>(re2_vec_.size()); - re2_vec_.push_back(re); - } - - return code; -} - + re2_vec_.push_back(re); + } + + return code; +} + void FilteredRE2::Compile(std::vector<std::string>* atoms) { if (compiled_) { LOG(ERROR) << "Compile called already."; - return; - } - + return; + } + if (re2_vec_.empty()) { LOG(ERROR) << "Compile called before Add."; return; } for (size_t i = 0; i < re2_vec_.size(); i++) { - Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]); - prefilter_tree_->Add(prefilter); - } - atoms->clear(); - prefilter_tree_->Compile(atoms); - compiled_ = true; -} - -int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { + Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]); + prefilter_tree_->Add(prefilter); + } + atoms->clear(); + prefilter_tree_->Compile(atoms); + compiled_ = true; +} + +int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { for (size_t i = 0; i < re2_vec_.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[i])) + if (RE2::PartialMatch(text, *re2_vec_[i])) return static_cast<int>(i); - return -1; -} - -int FilteredRE2::FirstMatch(const StringPiece& text, + return -1; +} + +int FilteredRE2::FirstMatch(const StringPiece& text, const std::vector<int>& atoms) const { - if (!compiled_) { + if (!compiled_) { LOG(DFATAL) << "FirstMatch called before Compile."; - return -1; - } + return -1; + } std::vector<int> regexps; - prefilter_tree_->RegexpsGivenStrings(atoms, ®exps); + prefilter_tree_->RegexpsGivenStrings(atoms, ®exps); for (size_t i = 0; i < regexps.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) - return regexps[i]; - return -1; -} - -bool FilteredRE2::AllMatches( - const StringPiece& text, + if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) + return regexps[i]; + return -1; +} + +bool FilteredRE2::AllMatches( + const StringPiece& text, const std::vector<int>& atoms, std::vector<int>* matching_regexps) const { - matching_regexps->clear(); + matching_regexps->clear(); std::vector<int> regexps; - prefilter_tree_->RegexpsGivenStrings(atoms, ®exps); + prefilter_tree_->RegexpsGivenStrings(atoms, ®exps); for (size_t i = 0; i < regexps.size(); i++) - if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) - matching_regexps->push_back(regexps[i]); - return !matching_regexps->empty(); -} - + if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) + matching_regexps->push_back(regexps[i]); + return !matching_regexps->empty(); +} + void FilteredRE2::AllPotentials( const std::vector<int>& atoms, std::vector<int>* potential_regexps) const { @@ -127,11 +127,11 @@ void FilteredRE2::AllPotentials( void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms, std::vector<int>* passed_regexps) { - prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps); -} - -void FilteredRE2::PrintPrefilter(int regexpid) { - prefilter_tree_->PrintPrefilter(regexpid); -} - + prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps); +} + +void FilteredRE2::PrintPrefilter(int regexpid) { + prefilter_tree_->PrintPrefilter(regexpid); +} + } // namespace re2 diff --git a/contrib/libs/re2/re2/filtered_re2.h b/contrib/libs/re2/re2/filtered_re2.h index c436b2eca2..dd618c70e8 100644 --- a/contrib/libs/re2/re2/filtered_re2.h +++ b/contrib/libs/re2/re2/filtered_re2.h @@ -1,17 +1,17 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_FILTERED_RE2_H_ #define RE2_FILTERED_RE2_H_ -// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. -// It provides a prefilter mechanism that helps in cutting down the -// number of regexps that need to be actually searched. -// -// By design, it does not include a string matching engine. This is to +// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. +// It provides a prefilter mechanism that helps in cutting down the +// number of regexps that need to be actually searched. +// +// By design, it does not include a string matching engine. This is to // allow the user of the class to use their favorite string matching -// engine. The overall flow is: Add all the regexps using Add, then +// engine. The overall flow is: Add all the regexps using Add, then // Compile the FilteredRE2. Compile returns strings that need to be // matched. Note that the returned strings are lowercased and distinct. // For applying regexps to a search text, the caller does the string @@ -20,23 +20,23 @@ // on a lowercased version of the search text. Then call FirstMatch // or AllMatches with a vector of indices of strings that were found // in the text to get the actual regexp matches. - + #include <memory> #include <string> #include <vector> - + #include "re2/re2.h" - -namespace re2 { - -class PrefilterTree; - -class FilteredRE2 { - public: - FilteredRE2(); + +namespace re2 { + +class PrefilterTree; + +class FilteredRE2 { + public: + FilteredRE2(); explicit FilteredRE2(int min_atom_len); - ~FilteredRE2(); - + ~FilteredRE2(); + // Not copyable. FilteredRE2(const FilteredRE2&) = delete; FilteredRE2& operator=(const FilteredRE2&) = delete; @@ -44,39 +44,39 @@ class FilteredRE2 { FilteredRE2(FilteredRE2&& other); FilteredRE2& operator=(FilteredRE2&& other); - // Uses RE2 constructor to create a RE2 object (re). Returns - // re->error_code(). If error_code is other than NoError, then re is - // deleted and not added to re2_vec_. - RE2::ErrorCode Add(const StringPiece& pattern, - const RE2::Options& options, + // Uses RE2 constructor to create a RE2 object (re). Returns + // re->error_code(). If error_code is other than NoError, then re is + // deleted and not added to re2_vec_. + RE2::ErrorCode Add(const StringPiece& pattern, + const RE2::Options& options, int* id); - - // Prepares the regexps added by Add for filtering. Returns a set - // of strings that the caller should check for in candidate texts. + + // Prepares the regexps added by Add for filtering. Returns a set + // of strings that the caller should check for in candidate texts. // The returned strings are lowercased and distinct. When doing // string matching, it should be performed in a case-insensitive // way or the search text should be lowercased first. Call after - // all Add calls are done. + // all Add calls are done. void Compile(std::vector<std::string>* strings_to_match); - - // Returns the index of the first matching regexp. - // Returns -1 on no match. Can be called prior to Compile. - // Does not do any filtering: simply tries to Match the - // regexps in a loop. - int SlowFirstMatch(const StringPiece& text) const; - - // Returns the index of the first matching regexp. - // Returns -1 on no match. Compile has to be called before - // calling this. - int FirstMatch(const StringPiece& text, + + // Returns the index of the first matching regexp. + // Returns -1 on no match. Can be called prior to Compile. + // Does not do any filtering: simply tries to Match the + // regexps in a loop. + int SlowFirstMatch(const StringPiece& text) const; + + // Returns the index of the first matching regexp. + // Returns -1 on no match. Compile has to be called before + // calling this. + int FirstMatch(const StringPiece& text, const std::vector<int>& atoms) const; - - // Returns the indices of all matching regexps, after first clearing - // matched_regexps. - bool AllMatches(const StringPiece& text, + + // Returns the indices of all matching regexps, after first clearing + // matched_regexps. + bool AllMatches(const StringPiece& text, const std::vector<int>& atoms, std::vector<int>* matching_regexps) const; - + // Returns the indices of all potentially matching regexps after first // clearing potential_regexps. // A regexp is potentially matching if it passes the filter. @@ -85,30 +85,30 @@ class FilteredRE2 { void AllPotentials(const std::vector<int>& atoms, std::vector<int>* potential_regexps) const; - // The number of regexps added. + // The number of regexps added. int NumRegexps() const { return static_cast<int>(re2_vec_.size()); } - + // Get the individual RE2 objects. const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } - private: - // Print prefilter. - void PrintPrefilter(int regexpid); - - // Useful for testing and debugging. + private: + // Print prefilter. + void PrintPrefilter(int regexpid); + + // Useful for testing and debugging. void RegexpsGivenStrings(const std::vector<int>& matched_atoms, std::vector<int>* passed_regexps); - - // All the regexps in the FilteredRE2. + + // All the regexps in the FilteredRE2. std::vector<RE2*> re2_vec_; - - // Has the FilteredRE2 been compiled using Compile() - bool compiled_; - - // An AND-OR tree of string atoms used for filtering regexps. + + // Has the FilteredRE2 been compiled using Compile() + bool compiled_; + + // An AND-OR tree of string atoms used for filtering regexps. std::unique_ptr<PrefilterTree> prefilter_tree_; -}; - -} // namespace re2 - -#endif // RE2_FILTERED_RE2_H_ +}; + +} // namespace re2 + +#endif // RE2_FILTERED_RE2_H_ diff --git a/contrib/libs/re2/re2/mimics_pcre.cc b/contrib/libs/re2/re2/mimics_pcre.cc index 7be60e4212..b1d6a51228 100644 --- a/contrib/libs/re2/re2/mimics_pcre.cc +++ b/contrib/libs/re2/re2/mimics_pcre.cc @@ -1,44 +1,44 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Determine whether this library should match PCRE exactly -// for a particular Regexp. (If so, the testing framework can -// check that it does.) -// -// This library matches PCRE except in these cases: -// * the regexp contains a repetition of an empty string, -// like (a*)* or (a*)+. In this case, PCRE will treat -// the repetition sequence as ending with an empty string, -// while this library does not. -// * Perl and PCRE differ on whether \v matches \n. -// For historical reasons, this library implements the Perl behavior. -// * Perl and PCRE allow $ in one-line mode to match either the very -// end of the text or just before a \n at the end of the text. -// This library requires it to match only the end of the text. -// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to -// match the end of the text if the last character is a \n. -// This library does allow it. -// -// Regexp::MimicsPCRE checks for any of these conditions. - +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Determine whether this library should match PCRE exactly +// for a particular Regexp. (If so, the testing framework can +// check that it does.) +// +// This library matches PCRE except in these cases: +// * the regexp contains a repetition of an empty string, +// like (a*)* or (a*)+. In this case, PCRE will treat +// the repetition sequence as ending with an empty string, +// while this library does not. +// * Perl and PCRE differ on whether \v matches \n. +// For historical reasons, this library implements the Perl behavior. +// * Perl and PCRE allow $ in one-line mode to match either the very +// end of the text or just before a \n at the end of the text. +// This library requires it to match only the end of the text. +// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to +// match the end of the text if the last character is a \n. +// This library does allow it. +// +// Regexp::MimicsPCRE checks for any of these conditions. + #include "util/util.h" #include "util/logging.h" -#include "re2/regexp.h" -#include "re2/walker-inl.h" - -namespace re2 { - -// Returns whether re might match an empty string. -static bool CanBeEmptyString(Regexp *re); - -// Walker class to compute whether library handles a regexp -// exactly as PCRE would. See comment at top for conditions. - -class PCREWalker : public Regexp::Walker<bool> { - public: - PCREWalker() {} - +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// Returns whether re might match an empty string. +static bool CanBeEmptyString(Regexp *re); + +// Walker class to compute whether library handles a regexp +// exactly as PCRE would. See comment at top for conditions. + +class PCREWalker : public Regexp::Walker<bool> { + public: + PCREWalker() {} + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args, int nchild_args); @@ -47,151 +47,151 @@ class PCREWalker : public Regexp::Walker<bool> { #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "PCREWalker::ShortVisit called"; #endif - return a; - } + return a; + } private: PCREWalker(const PCREWalker&) = delete; PCREWalker& operator=(const PCREWalker&) = delete; -}; - -// Called after visiting each of re's children and accumulating -// the return values in child_args. So child_args contains whether -// this library mimics PCRE for those subexpressions. -bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args) { - // If children failed, so do we. - for (int i = 0; i < nchild_args; i++) - if (!child_args[i]) - return false; - - // Otherwise look for other reasons to fail. - switch (re->op()) { - // Look for repeated empty string. - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - if (CanBeEmptyString(re->sub()[0])) - return false; - break; - case kRegexpRepeat: - if (re->max() == -1 && CanBeEmptyString(re->sub()[0])) - return false; - break; - - // Look for \v - case kRegexpLiteral: - if (re->rune() == '\v') - return false; - break; - - // Look for $ in single-line mode. - case kRegexpEndText: - case kRegexpEmptyMatch: - if (re->parse_flags() & Regexp::WasDollar) - return false; - break; - - // Look for ^ in multi-line mode. - case kRegexpBeginLine: - // No condition: in single-line mode ^ becomes kRegexpBeginText. - return false; - - default: - break; - } - - // Not proven guilty. - return true; -} - -// Returns whether this regexp's behavior will mimic PCRE's exactly. -bool Regexp::MimicsPCRE() { - PCREWalker w; - return w.Walk(this, true); -} - - -// Walker class to compute whether a Regexp can match an empty string. -// It is okay to overestimate. For example, \b\B cannot match an empty -// string, because \b and \B are mutually exclusive, but this isn't -// that smart and will say it can. Spurious empty strings -// will reduce the number of regexps we sanity check against PCRE, -// but they won't break anything. - -class EmptyStringWalker : public Regexp::Walker<bool> { - public: +}; + +// Called after visiting each of re's children and accumulating +// the return values in child_args. So child_args contains whether +// this library mimics PCRE for those subexpressions. +bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args) { + // If children failed, so do we. + for (int i = 0; i < nchild_args; i++) + if (!child_args[i]) + return false; + + // Otherwise look for other reasons to fail. + switch (re->op()) { + // Look for repeated empty string. + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + if (CanBeEmptyString(re->sub()[0])) + return false; + break; + case kRegexpRepeat: + if (re->max() == -1 && CanBeEmptyString(re->sub()[0])) + return false; + break; + + // Look for \v + case kRegexpLiteral: + if (re->rune() == '\v') + return false; + break; + + // Look for $ in single-line mode. + case kRegexpEndText: + case kRegexpEmptyMatch: + if (re->parse_flags() & Regexp::WasDollar) + return false; + break; + + // Look for ^ in multi-line mode. + case kRegexpBeginLine: + // No condition: in single-line mode ^ becomes kRegexpBeginText. + return false; + + default: + break; + } + + // Not proven guilty. + return true; +} + +// Returns whether this regexp's behavior will mimic PCRE's exactly. +bool Regexp::MimicsPCRE() { + PCREWalker w; + return w.Walk(this, true); +} + + +// Walker class to compute whether a Regexp can match an empty string. +// It is okay to overestimate. For example, \b\B cannot match an empty +// string, because \b and \B are mutually exclusive, but this isn't +// that smart and will say it can. Spurious empty strings +// will reduce the number of regexps we sanity check against PCRE, +// but they won't break anything. + +class EmptyStringWalker : public Regexp::Walker<bool> { + public: EmptyStringWalker() {} - + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args, int nchild_args); virtual bool ShortVisit(Regexp* re, bool a) { // Should never be called: we use Walk(), not WalkExponential(). #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; + LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; #endif - return a; - } - - private: + return a; + } + + private: EmptyStringWalker(const EmptyStringWalker&) = delete; EmptyStringWalker& operator=(const EmptyStringWalker&) = delete; -}; - -// Called after visiting re's children. child_args contains the return -// value from each of the children's PostVisits (i.e., whether each child -// can match an empty string). Returns whether this clause can match an -// empty string. -bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args) { - switch (re->op()) { - case kRegexpNoMatch: // never empty - case kRegexpLiteral: - case kRegexpAnyChar: - case kRegexpAnyByte: - case kRegexpCharClass: - case kRegexpLiteralString: - return false; - - case kRegexpEmptyMatch: // always empty - case kRegexpBeginLine: // always empty, when they match - case kRegexpEndLine: - case kRegexpNoWordBoundary: - case kRegexpWordBoundary: - case kRegexpBeginText: - case kRegexpEndText: - case kRegexpStar: // can always be empty - case kRegexpQuest: - case kRegexpHaveMatch: - return true; - - case kRegexpConcat: // can be empty if all children can - for (int i = 0; i < nchild_args; i++) - if (!child_args[i]) - return false; - return true; - - case kRegexpAlternate: // can be empty if any child can - for (int i = 0; i < nchild_args; i++) - if (child_args[i]) - return true; - return false; - - case kRegexpPlus: // can be empty if the child can - case kRegexpCapture: - return child_args[0]; - - case kRegexpRepeat: // can be empty if child can or is x{0} - return child_args[0] || re->min() == 0; - } - return false; -} - -// Returns whether re can match an empty string. -static bool CanBeEmptyString(Regexp* re) { - EmptyStringWalker w; - return w.Walk(re, true); -} - -} // namespace re2 +}; + +// Called after visiting re's children. child_args contains the return +// value from each of the children's PostVisits (i.e., whether each child +// can match an empty string). Returns whether this clause can match an +// empty string. +bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args) { + switch (re->op()) { + case kRegexpNoMatch: // never empty + case kRegexpLiteral: + case kRegexpAnyChar: + case kRegexpAnyByte: + case kRegexpCharClass: + case kRegexpLiteralString: + return false; + + case kRegexpEmptyMatch: // always empty + case kRegexpBeginLine: // always empty, when they match + case kRegexpEndLine: + case kRegexpNoWordBoundary: + case kRegexpWordBoundary: + case kRegexpBeginText: + case kRegexpEndText: + case kRegexpStar: // can always be empty + case kRegexpQuest: + case kRegexpHaveMatch: + return true; + + case kRegexpConcat: // can be empty if all children can + for (int i = 0; i < nchild_args; i++) + if (!child_args[i]) + return false; + return true; + + case kRegexpAlternate: // can be empty if any child can + for (int i = 0; i < nchild_args; i++) + if (child_args[i]) + return true; + return false; + + case kRegexpPlus: // can be empty if the child can + case kRegexpCapture: + return child_args[0]; + + case kRegexpRepeat: // can be empty if child can or is x{0} + return child_args[0] || re->min() == 0; + } + return false; +} + +// Returns whether re can match an empty string. +static bool CanBeEmptyString(Regexp* re) { + EmptyStringWalker w; + return w.Walk(re, true); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/nfa.cc b/contrib/libs/re2/re2/nfa.cc index 3c0ed1f60e..c7339f8ffd 100644 --- a/contrib/libs/re2/re2/nfa.cc +++ b/contrib/libs/re2/re2/nfa.cc @@ -1,29 +1,29 @@ -// Copyright 2006-2007 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Tested by search_test.cc. -// -// Prog::SearchNFA, an NFA search. -// This is an actual NFA like the theorists talk about, -// not the pseudo-NFA found in backtracking regexp implementations. -// -// IMPLEMENTATION -// -// This algorithm is a variant of one that appeared in Rob Pike's sam editor, -// which is a variant of the one described in Thompson's 1968 CACM paper. -// See http://swtch.com/~rsc/regexp/ for various history. The main feature -// over the DFA implementation is that it tracks submatch boundaries. -// -// When the choice of submatch boundaries is ambiguous, this particular -// implementation makes the same choices that traditional backtracking -// implementations (in particular, Perl and PCRE) do. -// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential -// time in the length of the input. -// -// Like Thompson's original machine and like the DFA implementation, this -// implementation notices a match only once it is one byte past it. - +// Copyright 2006-2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Tested by search_test.cc. +// +// Prog::SearchNFA, an NFA search. +// This is an actual NFA like the theorists talk about, +// not the pseudo-NFA found in backtracking regexp implementations. +// +// IMPLEMENTATION +// +// This algorithm is a variant of one that appeared in Rob Pike's sam editor, +// which is a variant of the one described in Thompson's 1968 CACM paper. +// See http://swtch.com/~rsc/regexp/ for various history. The main feature +// over the DFA implementation is that it tracks submatch boundaries. +// +// When the choice of submatch boundaries is ambiguous, this particular +// implementation makes the same choices that traditional backtracking +// implementations (in particular, Perl and PCRE) do. +// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential +// time in the length of the input. +// +// Like Thompson's original machine and like the DFA implementation, this +// implementation notices a match only once it is one byte past it. + #include <stdio.h> #include <string.h> #include <algorithm> @@ -35,68 +35,68 @@ #include "util/logging.h" #include "util/strutil.h" #include "re2/pod_array.h" -#include "re2/prog.h" -#include "re2/regexp.h" +#include "re2/prog.h" +#include "re2/regexp.h" #include "re2/sparse_array.h" #include "re2/sparse_set.h" - -namespace re2 { - + +namespace re2 { + static const bool ExtraDebug = false; -class NFA { - public: - NFA(Prog* prog); - ~NFA(); - - // Searches for a matching string. - // * If anchored is true, only considers matches starting at offset. - // Otherwise finds lefmost match at or after offset. - // * If longest is true, returns the longest match starting - // at the chosen start point. Otherwise returns the so-called - // left-biased match, the one traditional backtracking engines - // (like Perl and PCRE) find. - // Records submatch boundaries in submatch[1..nsubmatch-1]. - // Submatch[0] is the entire match. When there is a choice in - // which text matches each subexpression, the submatch boundaries - // are chosen to match what a backtracking implementation would choose. - bool Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch); - - private: - struct Thread { - union { +class NFA { + public: + NFA(Prog* prog); + ~NFA(); + + // Searches for a matching string. + // * If anchored is true, only considers matches starting at offset. + // Otherwise finds lefmost match at or after offset. + // * If longest is true, returns the longest match starting + // at the chosen start point. Otherwise returns the so-called + // left-biased match, the one traditional backtracking engines + // (like Perl and PCRE) find. + // Records submatch boundaries in submatch[1..nsubmatch-1]. + // Submatch[0] is the entire match. When there is a choice in + // which text matches each subexpression, the submatch boundaries + // are chosen to match what a backtracking implementation would choose. + bool Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch); + + private: + struct Thread { + union { int ref; - Thread* next; // when on free list - }; - const char** capture; - }; - - // State for explicit stack in AddToThreadq. - struct AddState { + Thread* next; // when on free list + }; + const char** capture; + }; + + // State for explicit stack in AddToThreadq. + struct AddState { int id; // Inst to process Thread* t; // if not null, set t0 = t before processing id - }; - - // Threadq is a list of threads. The list is sorted by the order - // in which Perl would explore that particular state -- the earlier - // choices appear earlier in the list. - typedef SparseArray<Thread*> Threadq; - - inline Thread* AllocThread(); + }; + + // Threadq is a list of threads. The list is sorted by the order + // in which Perl would explore that particular state -- the earlier + // choices appear earlier in the list. + typedef SparseArray<Thread*> Threadq; + + inline Thread* AllocThread(); inline Thread* Incref(Thread* t); inline void Decref(Thread* t); - + // Follows all empty arrows from id0 and enqueues all the states reached. // Enqueues only the ByteRange instructions that match byte c. // context is used (with p) for evaluating empty-width specials. // p is the current input position, and t0 is the current thread. void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, const char* p, Thread* t0); - - // Run runq on byte c, appending new states to nextq. - // Updates matched_ and match_ as new, better matches are found. + + // Run runq on byte c, appending new states to nextq. + // Updates matched_ and match_ as new, better matches are found. // context is used (with p) for evaluating empty-width specials. // p is the position of byte c in the input string for AddToThreadq; // p-1 will be used when processing Match instructions. @@ -104,14 +104,14 @@ class NFA { // If there is a shortcut to the end, returns that shortcut. int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, const char* p); - - // Returns text version of capture information, for debugging. + + // Returns text version of capture information, for debugging. std::string FormatCapture(const char** capture); - + void CopyCapture(const char** dst, const char** src) { memmove(dst, src, ncapture_*sizeof src[0]); } - + Prog* prog_; // underlying program int start_; // start instruction in program int ncapture_; // number of submatches to track @@ -125,53 +125,53 @@ class NFA { Thread* freelist_; // thread freelist const char** match_; // best match so far bool matched_; // any match so far? - + NFA(const NFA&) = delete; NFA& operator=(const NFA&) = delete; -}; - -NFA::NFA(Prog* prog) { - prog_ = prog; +}; + +NFA::NFA(Prog* prog) { + prog_ = prog; start_ = prog_->start(); - ncapture_ = 0; - longest_ = false; - endmatch_ = false; - btext_ = NULL; - etext_ = NULL; - q0_.resize(prog_->size()); - q1_.resize(prog_->size()); + ncapture_ = 0; + longest_ = false; + endmatch_ = false; + btext_ = NULL; + etext_ = NULL; + q0_.resize(prog_->size()); + q1_.resize(prog_->size()); // See NFA::AddToThreadq() for why this is so. int nstack = 2*prog_->inst_count(kInstCapture) + prog_->inst_count(kInstEmptyWidth) + prog_->inst_count(kInstNop) + 1; // + 1 for start inst stack_ = PODArray<AddState>(nstack); freelist_ = NULL; - match_ = NULL; - matched_ = false; -} - -NFA::~NFA() { - delete[] match_; + match_ = NULL; + matched_ = false; +} + +NFA::~NFA() { + delete[] match_; for (const Thread& t : arena_) delete[] t.capture; -} - -NFA::Thread* NFA::AllocThread() { +} + +NFA::Thread* NFA::AllocThread() { Thread* t = freelist_; if (t != NULL) { freelist_ = t->next; t->ref = 1; // We don't need to touch t->capture because // the caller will immediately overwrite it. - return t; - } + return t; + } arena_.emplace_back(); t = &arena_.back(); t->ref = 1; t->capture = new const char*[ncapture_]; - return t; -} - + return t; +} + NFA::Thread* NFA::Incref(Thread* t) { DCHECK(t != NULL); t->ref++; @@ -194,9 +194,9 @@ void NFA::Decref(Thread* t) { // p is the current input position, and t0 is the current thread. void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, const char* p, Thread* t0) { - if (id0 == 0) - return; - + if (id0 == 0) + return; + // Use stack_ to hold our stack of instructions yet to process. // It was preallocated as follows: // two entries per Capture; @@ -206,12 +206,12 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, // perform. (Each instruction can be processed at most once.) AddState* stk = stack_.data(); int nstk = 0; - + stk[nstk++] = {id0, NULL}; - while (nstk > 0) { + while (nstk > 0) { DCHECK_LE(nstk, stack_.size()); AddState a = stk[--nstk]; - + Loop: if (a.t != NULL) { // t0 was a thread that we allocated and copied in order to @@ -220,76 +220,76 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, t0 = a.t; } - int id = a.id; - if (id == 0) - continue; - if (q->has_index(id)) { + int id = a.id; + if (id == 0) + continue; + if (q->has_index(id)) { if (ExtraDebug) fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str()); - continue; - } - - // Create entry in q no matter what. We might fill it in below, - // or we might not. Even if not, it is necessary to have it, + continue; + } + + // Create entry in q no matter what. We might fill it in below, + // or we might not. Even if not, it is necessary to have it, // so that we don't revisit id0 during the recursion. - q->set_new(id, NULL); + q->set_new(id, NULL); Thread** tp = &q->get_existing(id); - int j; - Thread* t; - Prog::Inst* ip = prog_->inst(id); - switch (ip->opcode()) { - default: - LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; - break; - - case kInstFail: - break; - - case kInstAltMatch: - // Save state; will pick up at next byte. + int j; + Thread* t; + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; + break; + + case kInstFail: + break; + + case kInstAltMatch: + // Save state; will pick up at next byte. t = Incref(t0); - *tp = t; - + *tp = t; + DCHECK(!ip->last()); a = {id+1, NULL}; goto Loop; - - case kInstNop: + + case kInstNop: if (!ip->last()) stk[nstk++] = {id+1, NULL}; - // Continue on. + // Continue on. a = {ip->out(), NULL}; goto Loop; - - case kInstCapture: + + case kInstCapture: if (!ip->last()) stk[nstk++] = {id+1, NULL}; - if ((j=ip->cap()) < ncapture_) { + if ((j=ip->cap()) < ncapture_) { // Push a dummy whose only job is to restore t0 - // once we finish exploring this possibility. + // once we finish exploring this possibility. stk[nstk++] = {0, t0}; - - // Record capture. + + // Record capture. t = AllocThread(); CopyCapture(t->capture, t0->capture); t->capture[j] = p; t0 = t; - } + } a = {ip->out(), NULL}; goto Loop; - + case kInstByteRange: if (!ip->Matches(c)) goto Next; - // Save state; will pick up at next byte. + // Save state; will pick up at next byte. t = Incref(t0); - *tp = t; + *tp = t; if (ExtraDebug) fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str()); - + if (ip->hint() == 0) break; a = {id+ip->hint(), NULL}; @@ -308,61 +308,61 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, a = {id+1, NULL}; goto Loop; - case kInstEmptyWidth: + case kInstEmptyWidth: if (!ip->last()) stk[nstk++] = {id+1, NULL}; - // Continue on if we have all the right flag bits. + // Continue on if we have all the right flag bits. if (ip->empty() & ~Prog::EmptyFlags(context, p)) - break; + break; a = {ip->out(), NULL}; goto Loop; - } - } -} - -// Run runq on byte c, appending new states to nextq. + } + } +} + +// Run runq on byte c, appending new states to nextq. // Updates matched_ and match_ as new, better matches are found. // context is used (with p) for evaluating empty-width specials. // p is the position of byte c in the input string for AddToThreadq; // p-1 will be used when processing Match instructions. -// Frees all the threads on runq. -// If there is a shortcut to the end, returns that shortcut. +// Frees all the threads on runq. +// If there is a shortcut to the end, returns that shortcut. int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, const char* p) { - nextq->clear(); - - for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + nextq->clear(); + + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { Thread* t = i->value(); - if (t == NULL) - continue; - - if (longest_) { - // Can skip any threads started after our current best match. - if (matched_ && match_[0] < t->capture[0]) { + if (t == NULL) + continue; + + if (longest_) { + // Can skip any threads started after our current best match. + if (matched_ && match_[0] < t->capture[0]) { Decref(t); - continue; - } - } - + continue; + } + } + int id = i->index(); - Prog::Inst* ip = prog_->inst(id); - - switch (ip->opcode()) { - default: - // Should only see the values handled below. - LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; - break; - - case kInstByteRange: + Prog::Inst* ip = prog_->inst(id); + + switch (ip->opcode()) { + default: + // Should only see the values handled below. + LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; + break; + + case kInstByteRange: AddToThreadq(nextq, ip->out(), c, context, p, t); - break; - - case kInstAltMatch: - if (i != runq->begin()) - break; - // The match is ours if we want it. - if (ip->greedy(prog_) || longest_) { + break; + + case kInstAltMatch: + if (i != runq->begin()) + break; + // The match is ours if we want it. + if (ip->greedy(prog_) || longest_) { CopyCapture(match_, t->capture); matched_ = true; @@ -371,13 +371,13 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, if (i->value() != NULL) Decref(i->value()); } - runq->clear(); - if (ip->greedy(prog_)) - return ip->out1(); - return ip->out(); - } - break; - + runq->clear(); + if (ip->greedy(prog_)) + return ip->out1(); + return ip->out(); + } + break; + case kInstMatch: { // Avoid invoking undefined behavior (arithmetic on a null pointer) // by storing p instead of p-1. (What would the latter even mean?!) @@ -386,127 +386,127 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, CopyCapture(match_, t->capture); match_[1] = p; matched_ = true; - break; + break; } - + if (endmatch_ && p-1 != etext_) break; - if (longest_) { - // Leftmost-longest mode: save this match only if - // it is either farther to the left or at the same - // point but longer than an existing match. - if (!matched_ || t->capture[0] < match_[0] || + if (longest_) { + // Leftmost-longest mode: save this match only if + // it is either farther to the left or at the same + // point but longer than an existing match. + if (!matched_ || t->capture[0] < match_[0] || (t->capture[0] == match_[0] && p-1 > match_[1])) { CopyCapture(match_, t->capture); match_[1] = p-1; matched_ = true; } - } else { - // Leftmost-biased mode: this match is by definition - // better than what we've already found (see next line). + } else { + // Leftmost-biased mode: this match is by definition + // better than what we've already found (see next line). CopyCapture(match_, t->capture); match_[1] = p-1; matched_ = true; - - // Cut off the threads that can only find matches - // worse than the one we just found: don't run the - // rest of the current Threadq. + + // Cut off the threads that can only find matches + // worse than the one we just found: don't run the + // rest of the current Threadq. Decref(t); for (++i; i != runq->end(); ++i) { if (i->value() != NULL) Decref(i->value()); } - runq->clear(); - return 0; - } - break; + runq->clear(); + return 0; + } + break; } - } + } Decref(t); - } - runq->clear(); - return 0; -} - + } + runq->clear(); + return 0; +} + std::string NFA::FormatCapture(const char** capture) { std::string s; - for (int i = 0; i < ncapture_; i+=2) { - if (capture[i] == NULL) + for (int i = 0; i < ncapture_; i+=2) { + if (capture[i] == NULL) s += "(?,?)"; - else if (capture[i+1] == NULL) + else if (capture[i+1] == NULL) s += StringPrintf("(%td,?)", capture[i] - btext_); - else + else s += StringPrintf("(%td,%td)", capture[i] - btext_, capture[i+1] - btext_); - } - return s; -} - -bool NFA::Search(const StringPiece& text, const StringPiece& const_context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch) { - if (start_ == 0) - return false; - - StringPiece context = const_context; + } + return s; +} + +bool NFA::Search(const StringPiece& text, const StringPiece& const_context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch) { + if (start_ == 0) + return false; + + StringPiece context = const_context; if (context.data() == NULL) - context = text; - + context = text; + // Sanity check: make sure that text lies within context. if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { LOG(DFATAL) << "context does not contain text"; - return false; - } - + return false; + } + if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text)) - return false; + return false; if (prog_->anchor_end() && EndPtr(context) != EndPtr(text)) - return false; - anchored |= prog_->anchor_start(); - if (prog_->anchor_end()) { - longest = true; - endmatch_ = true; - } - - if (nsubmatch < 0) { - LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; - return false; - } - - // Save search parameters. - ncapture_ = 2*nsubmatch; - longest_ = longest; - - if (nsubmatch == 0) { - // We need to maintain match[0], both to distinguish the - // longest match (if longest is true) and also to tell - // whether we've seen any matches at all. - ncapture_ = 2; - } - - match_ = new const char*[ncapture_]; + return false; + anchored |= prog_->anchor_start(); + if (prog_->anchor_end()) { + longest = true; + endmatch_ = true; + } + + if (nsubmatch < 0) { + LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; + return false; + } + + // Save search parameters. + ncapture_ = 2*nsubmatch; + longest_ = longest; + + if (nsubmatch == 0) { + // We need to maintain match[0], both to distinguish the + // longest match (if longest is true) and also to tell + // whether we've seen any matches at all. + ncapture_ = 2; + } + + match_ = new const char*[ncapture_]; memset(match_, 0, ncapture_*sizeof match_[0]); - matched_ = false; - - // For debugging prints. + matched_ = false; + + // For debugging prints. btext_ = context.data(); // For convenience. etext_ = text.data() + text.size(); - + if (ExtraDebug) - fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", + fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", std::string(text).c_str(), std::string(context).c_str(), anchored, longest); - - // Set up search. - Threadq* runq = &q0_; - Threadq* nextq = &q1_; - runq->clear(); - nextq->clear(); - - // Loop over the text, stepping the machine. + + // Set up search. + Threadq* runq = &q0_; + Threadq* nextq = &q1_; + runq->clear(); + nextq->clear(); + + // Loop over the text, stepping the machine. for (const char* p = text.data();; p++) { if (ExtraDebug) { int c = 0; @@ -518,58 +518,58 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, c = p[0] & 0xFF; fprintf(stderr, "%c:", c); - for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { Thread* t = i->value(); - if (t == NULL) - continue; + if (t == NULL) + continue; fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str()); - } - fprintf(stderr, "\n"); - } - + } + fprintf(stderr, "\n"); + } + // This is a no-op the first time around the loop because runq is empty. int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p); - DCHECK_EQ(runq->size(), 0); + DCHECK_EQ(runq->size(), 0); using std::swap; - swap(nextq, runq); - nextq->clear(); - if (id != 0) { - // We're done: full match ahead. + swap(nextq, runq); + nextq->clear(); + if (id != 0) { + // We're done: full match ahead. p = etext_; - for (;;) { - Prog::Inst* ip = prog_->inst(id); - switch (ip->opcode()) { - default: - LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(); - break; - - case kInstCapture: + for (;;) { + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(); + break; + + case kInstCapture: if (ip->cap() < ncapture_) match_[ip->cap()] = p; - id = ip->out(); - continue; - - case kInstNop: - id = ip->out(); - continue; - - case kInstMatch: - match_[1] = p; - matched_ = true; - break; - } - break; - } - break; - } - + id = ip->out(); + continue; + + case kInstNop: + id = ip->out(); + continue; + + case kInstMatch: + match_[1] = p; + matched_ = true; + break; + } + break; + } + break; + } + if (p > etext_) - break; - - // Start a new thread if there have not been any matches. - // (No point in starting a new thread if there have been - // matches, since it would be to the right of the match - // we already found.) + break; + + // Start a new thread if there have not been any matches. + // (No point in starting a new thread if there have been + // matches, since it would be to the right of the match + // we already found.) if (!matched_ && (!anchored || p == text.data())) { // Try to use prefix accel (e.g. memchr) to skip ahead. // The search must be unanchored and there must be zero @@ -579,23 +579,23 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p)); if (p == NULL) p = etext_; - } - + } + Thread* t = AllocThread(); CopyCapture(t->capture, match_); t->capture[0] = p; AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p, t); Decref(t); - } - - // If all the threads have died, stop early. - if (runq->size() == 0) { + } + + // If all the threads have died, stop early. + if (runq->size() == 0) { if (ExtraDebug) - fprintf(stderr, "dead\n"); - break; - } - + fprintf(stderr, "dead\n"); + break; + } + // Avoid invoking undefined behavior (arithmetic on a null pointer) // by simply not continuing the loop. // This complements the special case in NFA::Step(). @@ -607,15 +607,15 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, nextq->clear(); break; } - } - + } + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { if (i->value() != NULL) Decref(i->value()); } - - if (matched_) { - for (int i = 0; i < nsubmatch; i++) + + if (matched_) { + for (int i = 0; i < nsubmatch; i++) submatch[i] = StringPiece(match_[2 * i], static_cast<size_t>(match_[2 * i + 1] - match_[2 * i])); @@ -623,34 +623,34 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, fprintf(stderr, "match (%td,%td)\n", match_[0] - btext_, match_[1] - btext_); - return true; - } - return false; -} - -bool -Prog::SearchNFA(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch) { + return true; + } + return false; +} + +bool +Prog::SearchNFA(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch) { if (ExtraDebug) - Dump(); - - NFA nfa(this); - StringPiece sp; - if (kind == kFullMatch) { - anchor = kAnchored; - if (nmatch == 0) { - match = &sp; - nmatch = 1; - } - } - if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) - return false; + Dump(); + + NFA nfa(this); + StringPiece sp; + if (kind == kFullMatch) { + anchor = kAnchored; + if (nmatch == 0) { + match = &sp; + nmatch = 1; + } + } + if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) + return false; if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) - return false; - return true; -} - + return false; + return true; +} + // For each instruction i in the program reachable from the start, compute the // number of instructions reachable from i by following only empty transitions // and record that count as fanout[i]. @@ -710,4 +710,4 @@ void Prog::Fanout(SparseArray<int>* fanout) { } } -} // namespace re2 +} // namespace re2 diff --git a/contrib/libs/re2/re2/onepass.cc b/contrib/libs/re2/re2/onepass.cc index ff53b54e59..263974654d 100644 --- a/contrib/libs/re2/re2/onepass.cc +++ b/contrib/libs/re2/re2/onepass.cc @@ -1,59 +1,59 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Tested by search_test.cc. -// -// Prog::SearchOnePass is an efficient implementation of -// regular expression search with submatch tracking for -// what I call "one-pass regular expressions". (An alternate -// name might be "backtracking-free regular expressions".) -// -// One-pass regular expressions have the property that -// at each input byte during an anchored match, there may be -// multiple alternatives but only one can proceed for any -// given input byte. -// -// For example, the regexp /x*yx*/ is one-pass: you read -// x's until a y, then you read the y, then you keep reading x's. -// At no point do you have to guess what to do or back up -// and try a different guess. -// -// On the other hand, /x*x/ is not one-pass: when you're -// looking at an input "x", it's not clear whether you should -// use it to extend the x* or as the final x. -// -// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not. -// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not. -// -// A simple intuition for identifying one-pass regular expressions -// is that it's always immediately obvious when a repetition ends. -// It must also be immediately obvious which branch of an | to take: -// -// /x(y|z)/ is one-pass, but /(xy|xz)/ is not. -// -// The NFA-based search in nfa.cc does some bookkeeping to -// avoid the need for backtracking and its associated exponential blowup. -// But if we have a one-pass regular expression, there is no -// possibility of backtracking, so there is no need for the -// extra bookkeeping. Hence, this code. -// -// On a one-pass regular expression, the NFA code in nfa.cc -// runs at about 1/20 of the backtracking-based PCRE speed. -// In contrast, the code in this file runs at about the same -// speed as PCRE. -// -// One-pass regular expressions get used a lot when RE is -// used for parsing simple strings, so it pays off to -// notice them and handle them efficiently. -// -// See also Anne Brüggemann-Klein and Derick Wood, -// "One-unambiguous regular languages", Information and Computation 142(2). - +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Tested by search_test.cc. +// +// Prog::SearchOnePass is an efficient implementation of +// regular expression search with submatch tracking for +// what I call "one-pass regular expressions". (An alternate +// name might be "backtracking-free regular expressions".) +// +// One-pass regular expressions have the property that +// at each input byte during an anchored match, there may be +// multiple alternatives but only one can proceed for any +// given input byte. +// +// For example, the regexp /x*yx*/ is one-pass: you read +// x's until a y, then you read the y, then you keep reading x's. +// At no point do you have to guess what to do or back up +// and try a different guess. +// +// On the other hand, /x*x/ is not one-pass: when you're +// looking at an input "x", it's not clear whether you should +// use it to extend the x* or as the final x. +// +// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not. +// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not. +// +// A simple intuition for identifying one-pass regular expressions +// is that it's always immediately obvious when a repetition ends. +// It must also be immediately obvious which branch of an | to take: +// +// /x(y|z)/ is one-pass, but /(xy|xz)/ is not. +// +// The NFA-based search in nfa.cc does some bookkeeping to +// avoid the need for backtracking and its associated exponential blowup. +// But if we have a one-pass regular expression, there is no +// possibility of backtracking, so there is no need for the +// extra bookkeeping. Hence, this code. +// +// On a one-pass regular expression, the NFA code in nfa.cc +// runs at about 1/20 of the backtracking-based PCRE speed. +// In contrast, the code in this file runs at about the same +// speed as PCRE. +// +// One-pass regular expressions get used a lot when RE is +// used for parsing simple strings, so it pays off to +// notice them and handle them efficiently. +// +// See also Anne Brüggemann-Klein and Derick Wood, +// "One-unambiguous regular languages", Information and Computation 142(2). + #include <stdint.h> -#include <string.h> +#include <string.h> #include <algorithm> -#include <map> +#include <map> #include <string> #include <vector> @@ -62,188 +62,188 @@ #include "util/strutil.h" #include "util/utf.h" #include "re2/pod_array.h" -#include "re2/prog.h" +#include "re2/prog.h" #include "re2/sparse_set.h" #include "re2/stringpiece.h" - + // Silence "zero-sized array in struct/union" warning for OneState::action. #ifdef _MSC_VER #pragma warning(disable: 4200) #endif -namespace re2 { - +namespace re2 { + static const bool ExtraDebug = false; - -// The key insight behind this implementation is that the -// non-determinism in an NFA for a one-pass regular expression -// is contained. To explain what that means, first a -// refresher about what regular expression programs look like -// and how the usual NFA execution runs. -// -// In a regular expression program, only the kInstByteRange -// instruction processes an input byte c and moves on to the -// next byte in the string (it does so if c is in the given range). -// The kInstByteRange instructions correspond to literal characters -// and character classes in the regular expression. -// -// The kInstAlt instructions are used as wiring to connect the -// kInstByteRange instructions together in interesting ways when -// implementing | + and *. -// The kInstAlt instruction forks execution, like a goto that -// jumps to ip->out() and ip->out1() in parallel. Each of the -// resulting computation paths is called a thread. -// -// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture -- -// are interesting in their own right but like kInstAlt they don't -// advance the input pointer. Only kInstByteRange does. -// -// The automaton execution in nfa.cc runs all the possible -// threads of execution in lock-step over the input. To process -// a particular byte, each thread gets run until it either dies -// or finds a kInstByteRange instruction matching the byte. -// If the latter happens, the thread stops just past the -// kInstByteRange instruction (at ip->out()) and waits for -// the other threads to finish processing the input byte. -// Then, once all the threads have processed that input byte, -// the whole process repeats. The kInstAlt state instruction -// might create new threads during input processing, but no -// matter what, all the threads stop after a kInstByteRange -// and wait for the other threads to "catch up". -// Running in lock step like this ensures that the NFA reads -// the input string only once. -// -// Each thread maintains its own set of capture registers -// (the string positions at which it executed the kInstCapture -// instructions corresponding to capturing parentheses in the -// regular expression). Repeated copying of the capture registers -// is the main performance bottleneck in the NFA implementation. -// -// A regular expression program is "one-pass" if, no matter what -// the input string, there is only one thread that makes it -// past a kInstByteRange instruction at each input byte. This means -// that there is in some sense only one active thread throughout -// the execution. Other threads might be created during the -// processing of an input byte, but they are ephemeral: only one -// thread is left to start processing the next input byte. -// This is what I meant above when I said the non-determinism -// was "contained". -// -// To execute a one-pass regular expression program, we can build -// a DFA (no non-determinism) that has at most as many states as -// the NFA (compare this to the possibly exponential number of states -// in the general case). Each state records, for each possible -// input byte, the next state along with the conditions required -// before entering that state -- empty-width flags that must be true -// and capture operations that must be performed. It also records -// whether a set of conditions required to finish a match at that -// point in the input rather than process the next byte. - -// A state in the one-pass NFA - just an array of actions indexed -// by the bytemap_[] of the next input byte. (The bytemap -// maps next input bytes into equivalence classes, to reduce -// the memory footprint.) -struct OneState { + +// The key insight behind this implementation is that the +// non-determinism in an NFA for a one-pass regular expression +// is contained. To explain what that means, first a +// refresher about what regular expression programs look like +// and how the usual NFA execution runs. +// +// In a regular expression program, only the kInstByteRange +// instruction processes an input byte c and moves on to the +// next byte in the string (it does so if c is in the given range). +// The kInstByteRange instructions correspond to literal characters +// and character classes in the regular expression. +// +// The kInstAlt instructions are used as wiring to connect the +// kInstByteRange instructions together in interesting ways when +// implementing | + and *. +// The kInstAlt instruction forks execution, like a goto that +// jumps to ip->out() and ip->out1() in parallel. Each of the +// resulting computation paths is called a thread. +// +// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture -- +// are interesting in their own right but like kInstAlt they don't +// advance the input pointer. Only kInstByteRange does. +// +// The automaton execution in nfa.cc runs all the possible +// threads of execution in lock-step over the input. To process +// a particular byte, each thread gets run until it either dies +// or finds a kInstByteRange instruction matching the byte. +// If the latter happens, the thread stops just past the +// kInstByteRange instruction (at ip->out()) and waits for +// the other threads to finish processing the input byte. +// Then, once all the threads have processed that input byte, +// the whole process repeats. The kInstAlt state instruction +// might create new threads during input processing, but no +// matter what, all the threads stop after a kInstByteRange +// and wait for the other threads to "catch up". +// Running in lock step like this ensures that the NFA reads +// the input string only once. +// +// Each thread maintains its own set of capture registers +// (the string positions at which it executed the kInstCapture +// instructions corresponding to capturing parentheses in the +// regular expression). Repeated copying of the capture registers +// is the main performance bottleneck in the NFA implementation. +// +// A regular expression program is "one-pass" if, no matter what +// the input string, there is only one thread that makes it +// past a kInstByteRange instruction at each input byte. This means +// that there is in some sense only one active thread throughout +// the execution. Other threads might be created during the +// processing of an input byte, but they are ephemeral: only one +// thread is left to start processing the next input byte. +// This is what I meant above when I said the non-determinism +// was "contained". +// +// To execute a one-pass regular expression program, we can build +// a DFA (no non-determinism) that has at most as many states as +// the NFA (compare this to the possibly exponential number of states +// in the general case). Each state records, for each possible +// input byte, the next state along with the conditions required +// before entering that state -- empty-width flags that must be true +// and capture operations that must be performed. It also records +// whether a set of conditions required to finish a match at that +// point in the input rather than process the next byte. + +// A state in the one-pass NFA - just an array of actions indexed +// by the bytemap_[] of the next input byte. (The bytemap +// maps next input bytes into equivalence classes, to reduce +// the memory footprint.) +struct OneState { uint32_t matchcond; // conditions to match right now. uint32_t action[]; -}; - +}; + // The uint32_t conditions in the action are a combination of -// condition and capture bits and the next state. The bottom 16 bits -// are the condition and capture bits, and the top 16 are the index of -// the next state. -// -// Bits 0-5 are the empty-width flags from prog.h. -// Bit 6 is kMatchWins, which means the match takes -// priority over moving to next in a first-match search. -// The remaining bits mark capture registers that should -// be set to the current input position. The capture bits -// start at index 2, since the search loop can take care of -// cap[0], cap[1] (the overall match position). -// That means we can handle up to 5 capturing parens: $1 through $4, plus $0. -// No input position can satisfy both kEmptyWordBoundary -// and kEmptyNonWordBoundary, so we can use that as a sentinel -// instead of needing an extra bit. - +// condition and capture bits and the next state. The bottom 16 bits +// are the condition and capture bits, and the top 16 are the index of +// the next state. +// +// Bits 0-5 are the empty-width flags from prog.h. +// Bit 6 is kMatchWins, which means the match takes +// priority over moving to next in a first-match search. +// The remaining bits mark capture registers that should +// be set to the current input position. The capture bits +// start at index 2, since the search loop can take care of +// cap[0], cap[1] (the overall match position). +// That means we can handle up to 5 capturing parens: $1 through $4, plus $0. +// No input position can satisfy both kEmptyWordBoundary +// and kEmptyNonWordBoundary, so we can use that as a sentinel +// instead of needing an extra bit. + static const int kIndexShift = 16; // number of bits below index static const int kEmptyShift = 6; // number of empty flags in prog.h -static const int kRealCapShift = kEmptyShift + 1; -static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; - -// Parameters used to skip over cap[0], cap[1]. -static const int kCapShift = kRealCapShift - 2; -static const int kMaxCap = kRealMaxCap + 2; - +static const int kRealCapShift = kEmptyShift + 1; +static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; + +// Parameters used to skip over cap[0], cap[1]. +static const int kCapShift = kRealCapShift - 2; +static const int kMaxCap = kRealMaxCap + 2; + static const uint32_t kMatchWins = 1 << kEmptyShift; static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift; - + static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary; - -// Check, at compile time, that prog.h agrees with math above. -// This function is never called. -void OnePass_Checks() { + +// Check, at compile time, that prog.h agrees with math above. +// This function is never called. +void OnePass_Checks() { static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags, "kEmptyShift disagrees with kEmptyAllFlags"); - // kMaxCap counts pointers, kMaxOnePassCapture counts pairs. + // kMaxCap counts pointers, kMaxOnePassCapture counts pairs. static_assert(kMaxCap == Prog::kMaxOnePassCapture*2, "kMaxCap disagrees with kMaxOnePassCapture"); -} - +} + static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) { uint32_t satisfied = Prog::EmptyFlags(context, p); - if (cond & kEmptyAllFlags & ~satisfied) - return false; - return true; -} - -// Apply the capture bits in cond, saving p to the appropriate -// locations in cap[]. + if (cond & kEmptyAllFlags & ~satisfied) + return false; + return true; +} + +// Apply the capture bits in cond, saving p to the appropriate +// locations in cap[]. static void ApplyCaptures(uint32_t cond, const char* p, - const char** cap, int ncap) { - for (int i = 2; i < ncap; i++) - if (cond & (1 << kCapShift << i)) - cap[i] = p; -} - + const char** cap, int ncap) { + for (int i = 2; i < ncap; i++) + if (cond & (1 << kCapShift << i)) + cap[i] = p; +} + // Computes the OneState* for the given nodeindex. static inline OneState* IndexToNode(uint8_t* nodes, int statesize, - int nodeindex) { + int nodeindex) { return reinterpret_cast<OneState*>(nodes + statesize*nodeindex); -} - -bool Prog::SearchOnePass(const StringPiece& text, - const StringPiece& const_context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch) { - if (anchor != kAnchored && kind != kFullMatch) { - LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches."; - return false; - } - - // Make sure we have at least cap[1], - // because we use it to tell if we matched. - int ncap = 2*nmatch; - if (ncap < 2) - ncap = 2; - - const char* cap[kMaxCap]; - for (int i = 0; i < ncap; i++) - cap[i] = NULL; - - const char* matchcap[kMaxCap]; - for (int i = 0; i < ncap; i++) - matchcap[i] = NULL; - - StringPiece context = const_context; +} + +bool Prog::SearchOnePass(const StringPiece& text, + const StringPiece& const_context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch) { + if (anchor != kAnchored && kind != kFullMatch) { + LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches."; + return false; + } + + // Make sure we have at least cap[1], + // because we use it to tell if we matched. + int ncap = 2*nmatch; + if (ncap < 2) + ncap = 2; + + const char* cap[kMaxCap]; + for (int i = 0; i < ncap; i++) + cap[i] = NULL; + + const char* matchcap[kMaxCap]; + for (int i = 0; i < ncap; i++) + matchcap[i] = NULL; + + StringPiece context = const_context; if (context.data() == NULL) - context = text; + context = text; if (anchor_start() && BeginPtr(context) != BeginPtr(text)) - return false; + return false; if (anchor_end() && EndPtr(context) != EndPtr(text)) - return false; - if (anchor_end()) - kind = kFullMatch; - + return false; + if (anchor_end()) + kind = kFullMatch; + uint8_t* nodes = onepass_nodes_.data(); int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); // start() is always mapped to the zeroth OneState. @@ -251,231 +251,231 @@ bool Prog::SearchOnePass(const StringPiece& text, uint8_t* bytemap = bytemap_; const char* bp = text.data(); const char* ep = text.data() + text.size(); - const char* p; - bool matched = false; - matchcap[0] = bp; - cap[0] = bp; + const char* p; + bool matched = false; + matchcap[0] = bp; + cap[0] = bp; uint32_t nextmatchcond = state->matchcond; - for (p = bp; p < ep; p++) { - int c = bytemap[*p & 0xFF]; + for (p = bp; p < ep; p++) { + int c = bytemap[*p & 0xFF]; uint32_t matchcond = nextmatchcond; uint32_t cond = state->action[c]; - - // Determine whether we can reach act->next. - // If so, advance state and nextmatchcond. - if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { + + // Determine whether we can reach act->next. + // If so, advance state and nextmatchcond. + if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { uint32_t nextindex = cond >> kIndexShift; - state = IndexToNode(nodes, statesize, nextindex); - nextmatchcond = state->matchcond; - } else { - state = NULL; - nextmatchcond = kImpossible; - } - - // This code section is carefully tuned. - // The goto sequence is about 10% faster than the - // obvious rewrite as a large if statement in the - // ASCIIMatchRE2 and DotMatchRE2 benchmarks. - - // Saving the match capture registers is expensive. - // Is this intermediate match worth thinking about? - - // Not if we want a full match. - if (kind == kFullMatch) - goto skipmatch; - - // Not if it's impossible. - if (matchcond == kImpossible) - goto skipmatch; - - // Not if the possible match is beaten by the certain - // match at the next byte. When this test is useless - // (e.g., HTTPPartialMatchRE2) it slows the loop by - // about 10%, but when it avoids work (e.g., DotMatchRE2), - // it cuts the loop execution by about 45%. - if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0) - goto skipmatch; - - // Finally, the match conditions must be satisfied. - if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) { - for (int i = 2; i < 2*nmatch; i++) - matchcap[i] = cap[i]; - if (nmatch > 1 && (matchcond & kCapMask)) - ApplyCaptures(matchcond, p, matchcap, ncap); - matchcap[1] = p; - matched = true; - - // If we're in longest match mode, we have to keep - // going and see if we find a longer match. - // In first match mode, we can stop if the match - // takes priority over the next state for this input byte. - // That bit is per-input byte and thus in cond, not matchcond. - if (kind == kFirstMatch && (cond & kMatchWins)) - goto done; - } - - skipmatch: - if (state == NULL) - goto done; - if ((cond & kCapMask) && nmatch > 1) - ApplyCaptures(cond, p, cap, ncap); - } - - // Look for match at end of input. - { + state = IndexToNode(nodes, statesize, nextindex); + nextmatchcond = state->matchcond; + } else { + state = NULL; + nextmatchcond = kImpossible; + } + + // This code section is carefully tuned. + // The goto sequence is about 10% faster than the + // obvious rewrite as a large if statement in the + // ASCIIMatchRE2 and DotMatchRE2 benchmarks. + + // Saving the match capture registers is expensive. + // Is this intermediate match worth thinking about? + + // Not if we want a full match. + if (kind == kFullMatch) + goto skipmatch; + + // Not if it's impossible. + if (matchcond == kImpossible) + goto skipmatch; + + // Not if the possible match is beaten by the certain + // match at the next byte. When this test is useless + // (e.g., HTTPPartialMatchRE2) it slows the loop by + // about 10%, but when it avoids work (e.g., DotMatchRE2), + // it cuts the loop execution by about 45%. + if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0) + goto skipmatch; + + // Finally, the match conditions must be satisfied. + if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) { + for (int i = 2; i < 2*nmatch; i++) + matchcap[i] = cap[i]; + if (nmatch > 1 && (matchcond & kCapMask)) + ApplyCaptures(matchcond, p, matchcap, ncap); + matchcap[1] = p; + matched = true; + + // If we're in longest match mode, we have to keep + // going and see if we find a longer match. + // In first match mode, we can stop if the match + // takes priority over the next state for this input byte. + // That bit is per-input byte and thus in cond, not matchcond. + if (kind == kFirstMatch && (cond & kMatchWins)) + goto done; + } + + skipmatch: + if (state == NULL) + goto done; + if ((cond & kCapMask) && nmatch > 1) + ApplyCaptures(cond, p, cap, ncap); + } + + // Look for match at end of input. + { uint32_t matchcond = state->matchcond; - if (matchcond != kImpossible && - ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { - if (nmatch > 1 && (matchcond & kCapMask)) - ApplyCaptures(matchcond, p, cap, ncap); - for (int i = 2; i < ncap; i++) - matchcap[i] = cap[i]; - matchcap[1] = p; - matched = true; - } - } - -done: - if (!matched) - return false; - for (int i = 0; i < nmatch; i++) + if (matchcond != kImpossible && + ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { + if (nmatch > 1 && (matchcond & kCapMask)) + ApplyCaptures(matchcond, p, cap, ncap); + for (int i = 2; i < ncap; i++) + matchcap[i] = cap[i]; + matchcap[1] = p; + matched = true; + } + } + +done: + if (!matched) + return false; + for (int i = 0; i < nmatch; i++) match[i] = StringPiece(matchcap[2 * i], static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i])); - return true; -} - - -// Analysis to determine whether a given regexp program is one-pass. - -// If ip is not on workq, adds ip to work queue and returns true. -// If ip is already on work queue, does nothing and returns false. -// If ip is NULL, does nothing and returns true (pretends to add it). -typedef SparseSet Instq; -static bool AddQ(Instq *q, int id) { - if (id == 0) - return true; - if (q->contains(id)) - return false; - q->insert(id); - return true; -} - -struct InstCond { - int id; + return true; +} + + +// Analysis to determine whether a given regexp program is one-pass. + +// If ip is not on workq, adds ip to work queue and returns true. +// If ip is already on work queue, does nothing and returns false. +// If ip is NULL, does nothing and returns true (pretends to add it). +typedef SparseSet Instq; +static bool AddQ(Instq *q, int id) { + if (id == 0) + return true; + if (q->contains(id)) + return false; + q->insert(id); + return true; +} + +struct InstCond { + int id; uint32_t cond; -}; - -// Returns whether this is a one-pass program; that is, -// returns whether it is safe to use SearchOnePass on this program. -// These conditions must be true for any instruction ip: -// -// (1) for any other Inst nip, there is at most one input-free -// path from ip to nip. -// (2) there is at most one kInstByte instruction reachable from -// ip that matches any particular byte c. -// (3) there is at most one input-free path from ip to a kInstMatch -// instruction. -// -// This is actually just a conservative approximation: it might -// return false when the answer is true, when kInstEmptyWidth -// instructions are involved. -// Constructs and saves corresponding one-pass NFA on success. -bool Prog::IsOnePass() { - if (did_onepass_) +}; + +// Returns whether this is a one-pass program; that is, +// returns whether it is safe to use SearchOnePass on this program. +// These conditions must be true for any instruction ip: +// +// (1) for any other Inst nip, there is at most one input-free +// path from ip to nip. +// (2) there is at most one kInstByte instruction reachable from +// ip that matches any particular byte c. +// (3) there is at most one input-free path from ip to a kInstMatch +// instruction. +// +// This is actually just a conservative approximation: it might +// return false when the answer is true, when kInstEmptyWidth +// instructions are involved. +// Constructs and saves corresponding one-pass NFA on success. +bool Prog::IsOnePass() { + if (did_onepass_) return onepass_nodes_.data() != NULL; - did_onepass_ = true; - - if (start() == 0) // no match - return false; - - // Steal memory for the one-pass NFA from the overall DFA budget. - // Willing to use at most 1/4 of the DFA budget (heuristic). - // Limit max node count to 65000 as a conservative estimate to - // avoid overflowing 16-bit node index in encoding. + did_onepass_ = true; + + if (start() == 0) // no match + return false; + + // Steal memory for the one-pass NFA from the overall DFA budget. + // Willing to use at most 1/4 of the DFA budget (heuristic). + // Limit max node count to 65000 as a conservative estimate to + // avoid overflowing 16-bit node index in encoding. int maxnodes = 2 + inst_count(kInstByteRange); int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); - if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) - return false; - - // Flood the graph starting at the start state, and check - // that in each reachable state, each possible byte leads - // to a unique next state. + if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) + return false; + + // Flood the graph starting at the start state, and check + // that in each reachable state, each possible byte leads + // to a unique next state. int stacksize = inst_count(kInstCapture) + inst_count(kInstEmptyWidth) + inst_count(kInstNop) + 1; // + 1 for start inst PODArray<InstCond> stack(stacksize); - int size = this->size(); + int size = this->size(); PODArray<int> nodebyid(size); // indexed by ip memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]); - + // Originally, nodes was a uint8_t[maxnodes*statesize], but that was // unnecessarily optimistic: why allocate a large amount of memory // upfront for a large program when it is unlikely to be one-pass? std::vector<uint8_t> nodes; - - Instq tovisit(size), workq(size); - AddQ(&tovisit, start()); - nodebyid[start()] = 0; - int nalloc = 1; + + Instq tovisit(size), workq(size); + AddQ(&tovisit, start()); + nodebyid[start()] = 0; + int nalloc = 1; nodes.insert(nodes.end(), statesize, 0); - for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { - int id = *it; - int nodeindex = nodebyid[id]; + for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { + int id = *it; + int nodeindex = nodebyid[id]; OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); - - // Flood graph using manual stack, filling in actions as found. - // Default is none. - for (int b = 0; b < bytemap_range_; b++) - node->action[b] = kImpossible; - node->matchcond = kImpossible; - - workq.clear(); - bool matched = false; - int nstack = 0; - stack[nstack].id = id; - stack[nstack++].cond = 0; - while (nstack > 0) { - int id = stack[--nstack].id; + + // Flood graph using manual stack, filling in actions as found. + // Default is none. + for (int b = 0; b < bytemap_range_; b++) + node->action[b] = kImpossible; + node->matchcond = kImpossible; + + workq.clear(); + bool matched = false; + int nstack = 0; + stack[nstack].id = id; + stack[nstack++].cond = 0; + while (nstack > 0) { + int id = stack[--nstack].id; uint32_t cond = stack[nstack].cond; Loop: - Prog::Inst* ip = inst(id); - switch (ip->opcode()) { + Prog::Inst* ip = inst(id); + switch (ip->opcode()) { default: LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); break; - case kInstAltMatch: - // TODO(rsc): Ignoring kInstAltMatch optimization. - // Should implement it in this engine, but it's subtle. + case kInstAltMatch: + // TODO(rsc): Ignoring kInstAltMatch optimization. + // Should implement it in this engine, but it's subtle. DCHECK(!ip->last()); - // If already on work queue, (1) is violated: bail out. + // If already on work queue, (1) is violated: bail out. if (!AddQ(&workq, id+1)) - goto fail; + goto fail; id = id+1; goto Loop; - - case kInstByteRange: { - int nextindex = nodebyid[ip->out()]; - if (nextindex == -1) { - if (nalloc >= maxnodes) { + + case kInstByteRange: { + int nextindex = nodebyid[ip->out()]; + if (nextindex == -1) { + if (nalloc >= maxnodes) { if (ExtraDebug) LOG(ERROR) << StringPrintf( "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes); - goto fail; - } - nextindex = nalloc; + goto fail; + } + nextindex = nalloc; AddQ(&tovisit, ip->out()); nodebyid[ip->out()] = nalloc; - nalloc++; + nalloc++; nodes.insert(nodes.end(), statesize, 0); // Update node because it might have been invalidated. node = IndexToNode(nodes.data(), statesize, nodeindex); - } - for (int c = ip->lo(); c <= ip->hi(); c++) { - int b = bytemap_[c]; + } + for (int c = ip->lo(); c <= ip->hi(); c++) { + int b = bytemap_[c]; // Skip any bytes immediately after c that are also in b. while (c < 256-1 && bytemap_[c+1] == b) c++; @@ -483,20 +483,20 @@ bool Prog::IsOnePass() { uint32_t newact = (nextindex << kIndexShift) | cond; if (matched) newact |= kMatchWins; - if ((act & kImpossible) == kImpossible) { - node->action[b] = newact; - } else if (act != newact) { + if ((act & kImpossible) == kImpossible) { + node->action[b] = newact; + } else if (act != newact) { if (ExtraDebug) LOG(ERROR) << StringPrintf( "Not OnePass: conflict on byte %#x at state %d", c, *it); - goto fail; - } - } - if (ip->foldcase()) { + goto fail; + } + } + if (ip->foldcase()) { Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a'; Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a'; - for (int c = lo; c <= hi; c++) { - int b = bytemap_[c]; + for (int c = lo; c <= hi; c++) { + int b = bytemap_[c]; // Skip any bytes immediately after c that are also in b. while (c < 256-1 && bytemap_[c+1] == b) c++; @@ -504,16 +504,16 @@ bool Prog::IsOnePass() { uint32_t newact = (nextindex << kIndexShift) | cond; if (matched) newact |= kMatchWins; - if ((act & kImpossible) == kImpossible) { - node->action[b] = newact; - } else if (act != newact) { + if ((act & kImpossible) == kImpossible) { + node->action[b] = newact; + } else if (act != newact) { if (ExtraDebug) LOG(ERROR) << StringPrintf( "Not OnePass: conflict on byte %#x at state %d", c, *it); - goto fail; - } - } - } + goto fail; + } + } + } if (ip->last()) break; @@ -522,9 +522,9 @@ bool Prog::IsOnePass() { goto fail; id = id+1; goto Loop; - } - - case kInstCapture: + } + + case kInstCapture: case kInstEmptyWidth: case kInstNop: if (!ip->last()) { @@ -536,37 +536,37 @@ bool Prog::IsOnePass() { } if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap) - cond |= (1 << kCapShift) << ip->cap(); + cond |= (1 << kCapShift) << ip->cap(); if (ip->opcode() == kInstEmptyWidth) cond |= ip->empty(); - - // kInstCapture and kInstNop always proceed to ip->out(). - // kInstEmptyWidth only sometimes proceeds to ip->out(), - // but as a conservative approximation we assume it always does. - // We could be a little more precise by looking at what c - // is, but that seems like overkill. - - // If already on work queue, (1) is violated: bail out. - if (!AddQ(&workq, ip->out())) { + + // kInstCapture and kInstNop always proceed to ip->out(). + // kInstEmptyWidth only sometimes proceeds to ip->out(), + // but as a conservative approximation we assume it always does. + // We could be a little more precise by looking at what c + // is, but that seems like overkill. + + // If already on work queue, (1) is violated: bail out. + if (!AddQ(&workq, ip->out())) { if (ExtraDebug) LOG(ERROR) << StringPrintf( "Not OnePass: multiple paths %d -> %d", *it, ip->out()); - goto fail; - } + goto fail; + } id = ip->out(); goto Loop; - - case kInstMatch: - if (matched) { - // (3) is violated + + case kInstMatch: + if (matched) { + // (3) is violated if (ExtraDebug) LOG(ERROR) << StringPrintf( "Not OnePass: multiple matches from %d", *it); - goto fail; - } - matched = true; - node->matchcond = cond; - + goto fail; + } + matched = true; + node->matchcond = cond; + if (ip->last()) break; // If already on work queue, (1) is violated: bail out. @@ -575,49 +575,49 @@ bool Prog::IsOnePass() { id = id+1; goto Loop; - case kInstFail: - break; - } - } - } - + case kInstFail: + break; + } + } + } + if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR). LOG(ERROR) << "bytemap:\n" << DumpByteMap(); LOG(ERROR) << "prog:\n" << Dump(); std::map<int, int> idmap; - for (int i = 0; i < size; i++) - if (nodebyid[i] != -1) - idmap[nodebyid[i]] = i; - + for (int i = 0; i < size; i++) + if (nodebyid[i] != -1) + idmap[nodebyid[i]] = i; + std::string dump; - for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { - int id = *it; - int nodeindex = nodebyid[id]; - if (nodeindex == -1) + for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { + int id = *it; + int nodeindex = nodebyid[id]; + if (nodeindex == -1) continue; OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); dump += StringPrintf("node %d id=%d: matchcond=%#x\n", nodeindex, id, node->matchcond); - for (int i = 0; i < bytemap_range_; i++) { - if ((node->action[i] & kImpossible) == kImpossible) - continue; + for (int i = 0; i < bytemap_range_; i++) { + if ((node->action[i] & kImpossible) == kImpossible) + continue; dump += StringPrintf(" %d cond %#x -> %d id=%d\n", i, node->action[i] & 0xFFFF, node->action[i] >> kIndexShift, idmap[node->action[i] >> kIndexShift]); - } - } + } + } LOG(ERROR) << "nodes:\n" << dump; - } - - dfa_mem_ -= nalloc*statesize; + } + + dfa_mem_ -= nalloc*statesize; onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize); memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize); - return true; - -fail: - return false; -} - -} // namespace re2 + return true; + +fail: + return false; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/parse.cc b/contrib/libs/re2/re2/parse.cc index ed7c34db16..85f16f060b 100644 --- a/contrib/libs/re2/re2/parse.cc +++ b/contrib/libs/re2/re2/parse.cc @@ -1,21 +1,21 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Regular expression parser. - -// The parser is a simple precedence-based parser with a -// manual stack. The parsing work is done by the methods -// of the ParseState class. The Regexp::Parse function is -// essentially just a lexer that calls the ParseState method -// for each token. - -// The parser recognizes POSIX extended regular expressions -// excluding backreferences, collating elements, and collating -// classes. It also allows the empty string as a regular expression -// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. -// See regexp.h for rationale. - +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression parser. + +// The parser is a simple precedence-based parser with a +// manual stack. The parsing work is done by the methods +// of the ParseState class. The Regexp::Parse function is +// essentially just a lexer that calls the ParseState method +// for each token. + +// The parser recognizes POSIX extended regular expressions +// excluding backreferences, collating elements, and collating +// classes. It also allows the empty string as a regular expression +// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. +// See regexp.h for rationale. + #include <ctype.h> #include <stddef.h> #include <stdint.h> @@ -30,20 +30,20 @@ #include "util/strutil.h" #include "util/utf.h" #include "re2/pod_array.h" -#include "re2/regexp.h" +#include "re2/regexp.h" #include "re2/stringpiece.h" -#include "re2/unicode_casefold.h" -#include "re2/unicode_groups.h" +#include "re2/unicode_casefold.h" +#include "re2/unicode_groups.h" #include "re2/walker-inl.h" - + #if defined(RE2_USE_ICU) #include "unicode/uniset.h" #include "unicode/unistr.h" #include "unicode/utypes.h" #endif -namespace re2 { - +namespace re2 { + // Controls the maximum repeat count permitted by the parser. static int maximum_repeat_count = 1000; @@ -51,437 +51,437 @@ void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) { maximum_repeat_count = i; } -// Regular expression parse state. -// The list of parsed regexps so far is maintained as a vector of -// Regexp pointers called the stack. Left parenthesis and vertical -// bar markers are also placed on the stack, as Regexps with -// non-standard opcodes. -// Scanning a left parenthesis causes the parser to push a left parenthesis -// marker on the stack. -// Scanning a vertical bar causes the parser to pop the stack until it finds a -// vertical bar or left parenthesis marker (not popping the marker), -// concatenate all the popped results, and push them back on -// the stack (DoConcatenation). -// Scanning a right parenthesis causes the parser to act as though it -// has seen a vertical bar, which then leaves the top of the stack in the -// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar. -// The parser pops all this off the stack and creates an alternation of the -// regexps (DoAlternation). - -class Regexp::ParseState { - public: - ParseState(ParseFlags flags, const StringPiece& whole_regexp, - RegexpStatus* status); - ~ParseState(); - - ParseFlags flags() { return flags_; } - int rune_max() { return rune_max_; } - - // Parse methods. All public methods return a bool saying - // whether parsing should continue. If a method returns - // false, it has set fields in *status_, and the parser - // should return NULL. - - // Pushes the given regular expression onto the stack. - // Could check for too much memory used here. - bool PushRegexp(Regexp* re); - - // Pushes the literal rune r onto the stack. - bool PushLiteral(Rune r); - - // Pushes a regexp with the given op (and no args) onto the stack. - bool PushSimpleOp(RegexpOp op); - - // Pushes a ^ onto the stack. +// Regular expression parse state. +// The list of parsed regexps so far is maintained as a vector of +// Regexp pointers called the stack. Left parenthesis and vertical +// bar markers are also placed on the stack, as Regexps with +// non-standard opcodes. +// Scanning a left parenthesis causes the parser to push a left parenthesis +// marker on the stack. +// Scanning a vertical bar causes the parser to pop the stack until it finds a +// vertical bar or left parenthesis marker (not popping the marker), +// concatenate all the popped results, and push them back on +// the stack (DoConcatenation). +// Scanning a right parenthesis causes the parser to act as though it +// has seen a vertical bar, which then leaves the top of the stack in the +// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar. +// The parser pops all this off the stack and creates an alternation of the +// regexps (DoAlternation). + +class Regexp::ParseState { + public: + ParseState(ParseFlags flags, const StringPiece& whole_regexp, + RegexpStatus* status); + ~ParseState(); + + ParseFlags flags() { return flags_; } + int rune_max() { return rune_max_; } + + // Parse methods. All public methods return a bool saying + // whether parsing should continue. If a method returns + // false, it has set fields in *status_, and the parser + // should return NULL. + + // Pushes the given regular expression onto the stack. + // Could check for too much memory used here. + bool PushRegexp(Regexp* re); + + // Pushes the literal rune r onto the stack. + bool PushLiteral(Rune r); + + // Pushes a regexp with the given op (and no args) onto the stack. + bool PushSimpleOp(RegexpOp op); + + // Pushes a ^ onto the stack. bool PushCaret(); - - // Pushes a \b (word == true) or \B (word == false) onto the stack. - bool PushWordBoundary(bool word); - - // Pushes a $ onto the stack. - bool PushDollar(); - - // Pushes a . onto the stack - bool PushDot(); - - // Pushes a repeat operator regexp onto the stack. - // A valid argument for the operator must already be on the stack. - // s is the name of the operator, for use in error messages. - bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy); - - // Pushes a repetition regexp onto the stack. - // A valid argument for the operator must already be on the stack. - bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy); - - // Checks whether a particular regexp op is a marker. - bool IsMarker(RegexpOp op); - - // Processes a left parenthesis in the input. - // Pushes a marker onto the stack. - bool DoLeftParen(const StringPiece& name); - bool DoLeftParenNoCapture(); - - // Processes a vertical bar in the input. - bool DoVerticalBar(); - - // Processes a right parenthesis in the input. - bool DoRightParen(); - - // Processes the end of input, returning the final regexp. - Regexp* DoFinish(); - - // Finishes the regexp if necessary, preparing it for use - // in a more complicated expression. - // If it is a CharClassBuilder, converts into a CharClass. - Regexp* FinishRegexp(Regexp*); - - // These routines don't manipulate the parse stack - // directly, but they do need to look at flags_. - // ParseCharClass also manipulates the internals of Regexp - // while creating *out_re. - - // Parse a character class into *out_re. - // Removes parsed text from s. - bool ParseCharClass(StringPiece* s, Regexp** out_re, - RegexpStatus* status); - - // Parse a character class character into *rp. - // Removes parsed text from s. - bool ParseCCCharacter(StringPiece* s, Rune *rp, - const StringPiece& whole_class, - RegexpStatus* status); - - // Parse a character class range into rr. - // Removes parsed text from s. - bool ParseCCRange(StringPiece* s, RuneRange* rr, - const StringPiece& whole_class, - RegexpStatus* status); - - // Parse a Perl flag set or non-capturing group from s. - bool ParsePerlFlags(StringPiece* s); - - - // Finishes the current concatenation, - // collapsing it into a single regexp on the stack. - void DoConcatenation(); - - // Finishes the current alternation, - // collapsing it to a single regexp on the stack. - void DoAlternation(); - - // Generalized DoAlternation/DoConcatenation. - void DoCollapse(RegexpOp op); - - // Maybe concatenate Literals into LiteralString. - bool MaybeConcatString(int r, ParseFlags flags); - -private: - ParseFlags flags_; - StringPiece whole_regexp_; - RegexpStatus* status_; - Regexp* stacktop_; - int ncap_; // number of capturing parens seen - int rune_max_; // maximum char value for this encoding - + + // Pushes a \b (word == true) or \B (word == false) onto the stack. + bool PushWordBoundary(bool word); + + // Pushes a $ onto the stack. + bool PushDollar(); + + // Pushes a . onto the stack + bool PushDot(); + + // Pushes a repeat operator regexp onto the stack. + // A valid argument for the operator must already be on the stack. + // s is the name of the operator, for use in error messages. + bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy); + + // Pushes a repetition regexp onto the stack. + // A valid argument for the operator must already be on the stack. + bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy); + + // Checks whether a particular regexp op is a marker. + bool IsMarker(RegexpOp op); + + // Processes a left parenthesis in the input. + // Pushes a marker onto the stack. + bool DoLeftParen(const StringPiece& name); + bool DoLeftParenNoCapture(); + + // Processes a vertical bar in the input. + bool DoVerticalBar(); + + // Processes a right parenthesis in the input. + bool DoRightParen(); + + // Processes the end of input, returning the final regexp. + Regexp* DoFinish(); + + // Finishes the regexp if necessary, preparing it for use + // in a more complicated expression. + // If it is a CharClassBuilder, converts into a CharClass. + Regexp* FinishRegexp(Regexp*); + + // These routines don't manipulate the parse stack + // directly, but they do need to look at flags_. + // ParseCharClass also manipulates the internals of Regexp + // while creating *out_re. + + // Parse a character class into *out_re. + // Removes parsed text from s. + bool ParseCharClass(StringPiece* s, Regexp** out_re, + RegexpStatus* status); + + // Parse a character class character into *rp. + // Removes parsed text from s. + bool ParseCCCharacter(StringPiece* s, Rune *rp, + const StringPiece& whole_class, + RegexpStatus* status); + + // Parse a character class range into rr. + // Removes parsed text from s. + bool ParseCCRange(StringPiece* s, RuneRange* rr, + const StringPiece& whole_class, + RegexpStatus* status); + + // Parse a Perl flag set or non-capturing group from s. + bool ParsePerlFlags(StringPiece* s); + + + // Finishes the current concatenation, + // collapsing it into a single regexp on the stack. + void DoConcatenation(); + + // Finishes the current alternation, + // collapsing it to a single regexp on the stack. + void DoAlternation(); + + // Generalized DoAlternation/DoConcatenation. + void DoCollapse(RegexpOp op); + + // Maybe concatenate Literals into LiteralString. + bool MaybeConcatString(int r, ParseFlags flags); + +private: + ParseFlags flags_; + StringPiece whole_regexp_; + RegexpStatus* status_; + Regexp* stacktop_; + int ncap_; // number of capturing parens seen + int rune_max_; // maximum char value for this encoding + ParseState(const ParseState&) = delete; ParseState& operator=(const ParseState&) = delete; -}; - -// Pseudo-operators - only on parse stack. -const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); -const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); - -Regexp::ParseState::ParseState(ParseFlags flags, - const StringPiece& whole_regexp, - RegexpStatus* status) - : flags_(flags), whole_regexp_(whole_regexp), - status_(status), stacktop_(NULL), ncap_(0) { - if (flags_ & Latin1) - rune_max_ = 0xFF; - else - rune_max_ = Runemax; -} - -// Cleans up by freeing all the regexps on the stack. -Regexp::ParseState::~ParseState() { - Regexp* next; - for (Regexp* re = stacktop_; re != NULL; re = next) { - next = re->down_; - re->down_ = NULL; - if (re->op() == kLeftParen) - delete re->name_; - re->Decref(); - } -} - -// Finishes the regexp if necessary, preparing it for use in -// a more complex expression. -// If it is a CharClassBuilder, converts into a CharClass. -Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { - if (re == NULL) - return NULL; - re->down_ = NULL; - - if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { - CharClassBuilder* ccb = re->ccb_; - re->ccb_ = NULL; - re->cc_ = ccb->GetCharClass(); - delete ccb; - } - - return re; -} - -// Pushes the given regular expression onto the stack. -// Could check for too much memory used here. -bool Regexp::ParseState::PushRegexp(Regexp* re) { - MaybeConcatString(-1, NoParseFlags); - - // Special case: a character class of one character is just - // a literal. This is a common idiom for escaping - // single characters (e.g., [.] instead of \.), and some - // analysis does better with fewer character classes. - // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. +}; + +// Pseudo-operators - only on parse stack. +const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); +const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); + +Regexp::ParseState::ParseState(ParseFlags flags, + const StringPiece& whole_regexp, + RegexpStatus* status) + : flags_(flags), whole_regexp_(whole_regexp), + status_(status), stacktop_(NULL), ncap_(0) { + if (flags_ & Latin1) + rune_max_ = 0xFF; + else + rune_max_ = Runemax; +} + +// Cleans up by freeing all the regexps on the stack. +Regexp::ParseState::~ParseState() { + Regexp* next; + for (Regexp* re = stacktop_; re != NULL; re = next) { + next = re->down_; + re->down_ = NULL; + if (re->op() == kLeftParen) + delete re->name_; + re->Decref(); + } +} + +// Finishes the regexp if necessary, preparing it for use in +// a more complex expression. +// If it is a CharClassBuilder, converts into a CharClass. +Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { + if (re == NULL) + return NULL; + re->down_ = NULL; + + if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { + CharClassBuilder* ccb = re->ccb_; + re->ccb_ = NULL; + re->cc_ = ccb->GetCharClass(); + delete ccb; + } + + return re; +} + +// Pushes the given regular expression onto the stack. +// Could check for too much memory used here. +bool Regexp::ParseState::PushRegexp(Regexp* re) { + MaybeConcatString(-1, NoParseFlags); + + // Special case: a character class of one character is just + // a literal. This is a common idiom for escaping + // single characters (e.g., [.] instead of \.), and some + // analysis does better with fewer character classes. + // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { re->ccb_->RemoveAbove(rune_max_); - if (re->ccb_->size() == 1) { - Rune r = re->ccb_->begin()->lo; - re->Decref(); - re = new Regexp(kRegexpLiteral, flags_); - re->rune_ = r; - } else if (re->ccb_->size() == 2) { - Rune r = re->ccb_->begin()->lo; - if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { - re->Decref(); - re = new Regexp(kRegexpLiteral, flags_ | FoldCase); - re->rune_ = r + 'a' - 'A'; - } - } - } - - if (!IsMarker(re->op())) - re->simple_ = re->ComputeSimple(); - re->down_ = stacktop_; - stacktop_ = re; - return true; -} - -// Searches the case folding tables and returns the CaseFold* that contains r. -// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. -// If there isn't one, returns NULL. + if (re->ccb_->size() == 1) { + Rune r = re->ccb_->begin()->lo; + re->Decref(); + re = new Regexp(kRegexpLiteral, flags_); + re->rune_ = r; + } else if (re->ccb_->size() == 2) { + Rune r = re->ccb_->begin()->lo; + if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { + re->Decref(); + re = new Regexp(kRegexpLiteral, flags_ | FoldCase); + re->rune_ = r + 'a' - 'A'; + } + } + } + + if (!IsMarker(re->op())) + re->simple_ = re->ComputeSimple(); + re->down_ = stacktop_; + stacktop_ = re; + return true; +} + +// Searches the case folding tables and returns the CaseFold* that contains r. +// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. +// If there isn't one, returns NULL. const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) { const CaseFold* ef = f + n; - - // Binary search for entry containing r. - while (n > 0) { + + // Binary search for entry containing r. + while (n > 0) { int m = n/2; - if (f[m].lo <= r && r <= f[m].hi) - return &f[m]; - if (r < f[m].lo) { - n = m; - } else { - f += m+1; - n -= m+1; - } - } - - // There is no entry that contains r, but f points + if (f[m].lo <= r && r <= f[m].hi) + return &f[m]; + if (r < f[m].lo) { + n = m; + } else { + f += m+1; + n -= m+1; + } + } + + // There is no entry that contains r, but f points // where it would have been. Unless f points at - // the end of the array, it points at the next entry - // after r. + // the end of the array, it points at the next entry + // after r. if (f < ef) - return f; - - // No entry contains r; no entry contains runes > r. - return NULL; -} - -// Returns the result of applying the fold f to the rune r. + return f; + + // No entry contains r; no entry contains runes > r. + return NULL; +} + +// Returns the result of applying the fold f to the rune r. Rune ApplyFold(const CaseFold *f, Rune r) { - switch (f->delta) { - default: - return r + f->delta; - + switch (f->delta) { + default: + return r + f->delta; + case EvenOddSkip: // even <-> odd but only applies to every other if ((r - f->lo) % 2) return r; FALLTHROUGH_INTENDED; - case EvenOdd: // even <-> odd - if (r%2 == 0) - return r + 1; - return r - 1; - + case EvenOdd: // even <-> odd + if (r%2 == 0) + return r + 1; + return r - 1; + case OddEvenSkip: // odd <-> even but only applies to every other if ((r - f->lo) % 2) return r; FALLTHROUGH_INTENDED; - case OddEven: // odd <-> even - if (r%2 == 1) - return r + 1; - return r - 1; - } -} - -// Returns the next Rune in r's folding cycle (see unicode_casefold.h). -// Examples: -// CycleFoldRune('A') = 'a' -// CycleFoldRune('a') = 'A' -// -// CycleFoldRune('K') = 'k' -// CycleFoldRune('k') = 0x212A (Kelvin) -// CycleFoldRune(0x212A) = 'K' -// -// CycleFoldRune('?') = '?' -Rune CycleFoldRune(Rune r) { + case OddEven: // odd <-> even + if (r%2 == 1) + return r + 1; + return r - 1; + } +} + +// Returns the next Rune in r's folding cycle (see unicode_casefold.h). +// Examples: +// CycleFoldRune('A') = 'a' +// CycleFoldRune('a') = 'A' +// +// CycleFoldRune('K') = 'k' +// CycleFoldRune('k') = 0x212A (Kelvin) +// CycleFoldRune(0x212A) = 'K' +// +// CycleFoldRune('?') = '?' +Rune CycleFoldRune(Rune r) { const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); - if (f == NULL || r < f->lo) - return r; - return ApplyFold(f, r); -} - -// Add lo-hi to the class, along with their fold-equivalent characters. -// If lo-hi is already in the class, assume that the fold-equivalent -// chars are there too, so there's no work to do. -static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { - // AddFoldedRange calls itself recursively for each rune in the fold cycle. - // Most folding cycles are small: there aren't any bigger than four in the - // current Unicode tables. make_unicode_casefold.py checks that - // the cycles are not too long, and we double-check here using depth. - if (depth > 10) { - LOG(DFATAL) << "AddFoldedRange recurses too much."; - return; - } - - if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done - return; - - while (lo <= hi) { + if (f == NULL || r < f->lo) + return r; + return ApplyFold(f, r); +} + +// Add lo-hi to the class, along with their fold-equivalent characters. +// If lo-hi is already in the class, assume that the fold-equivalent +// chars are there too, so there's no work to do. +static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { + // AddFoldedRange calls itself recursively for each rune in the fold cycle. + // Most folding cycles are small: there aren't any bigger than four in the + // current Unicode tables. make_unicode_casefold.py checks that + // the cycles are not too long, and we double-check here using depth. + if (depth > 10) { + LOG(DFATAL) << "AddFoldedRange recurses too much."; + return; + } + + if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done + return; + + while (lo <= hi) { const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); - if (f == NULL) // lo has no fold, nor does anything above lo - break; - if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo - lo = f->lo; - continue; - } - - // Add in the result of folding the range lo - f->hi - // and that range's fold, recursively. - Rune lo1 = lo; + if (f == NULL) // lo has no fold, nor does anything above lo + break; + if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo + lo = f->lo; + continue; + } + + // Add in the result of folding the range lo - f->hi + // and that range's fold, recursively. + Rune lo1 = lo; Rune hi1 = std::min<Rune>(hi, f->hi); - switch (f->delta) { - default: - lo1 += f->delta; - hi1 += f->delta; - break; - case EvenOdd: - if (lo1%2 == 1) - lo1--; - if (hi1%2 == 0) - hi1++; - break; - case OddEven: - if (lo1%2 == 0) - lo1--; - if (hi1%2 == 1) - hi1++; - break; - } - AddFoldedRange(cc, lo1, hi1, depth+1); - - // Pick up where this fold left off. - lo = f->hi + 1; - } -} - -// Pushes the literal rune r onto the stack. -bool Regexp::ParseState::PushLiteral(Rune r) { - // Do case folding if needed. - if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { - Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); - re->ccb_ = new CharClassBuilder; - Rune r1 = r; - do { - if (!(flags_ & NeverNL) || r != '\n') { - re->ccb_->AddRange(r, r); - } - r = CycleFoldRune(r); - } while (r != r1); - return PushRegexp(re); - } - - // Exclude newline if applicable. - if ((flags_ & NeverNL) && r == '\n') - return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); - - // No fancy stuff worked. Ordinary literal. - if (MaybeConcatString(r, flags_)) - return true; - - Regexp* re = new Regexp(kRegexpLiteral, flags_); - re->rune_ = r; - return PushRegexp(re); -} - -// Pushes a ^ onto the stack. + switch (f->delta) { + default: + lo1 += f->delta; + hi1 += f->delta; + break; + case EvenOdd: + if (lo1%2 == 1) + lo1--; + if (hi1%2 == 0) + hi1++; + break; + case OddEven: + if (lo1%2 == 0) + lo1--; + if (hi1%2 == 1) + hi1++; + break; + } + AddFoldedRange(cc, lo1, hi1, depth+1); + + // Pick up where this fold left off. + lo = f->hi + 1; + } +} + +// Pushes the literal rune r onto the stack. +bool Regexp::ParseState::PushLiteral(Rune r) { + // Do case folding if needed. + if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + Rune r1 = r; + do { + if (!(flags_ & NeverNL) || r != '\n') { + re->ccb_->AddRange(r, r); + } + r = CycleFoldRune(r); + } while (r != r1); + return PushRegexp(re); + } + + // Exclude newline if applicable. + if ((flags_ & NeverNL) && r == '\n') + return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); + + // No fancy stuff worked. Ordinary literal. + if (MaybeConcatString(r, flags_)) + return true; + + Regexp* re = new Regexp(kRegexpLiteral, flags_); + re->rune_ = r; + return PushRegexp(re); +} + +// Pushes a ^ onto the stack. bool Regexp::ParseState::PushCaret() { - if (flags_ & OneLine) { - return PushSimpleOp(kRegexpBeginText); - } - return PushSimpleOp(kRegexpBeginLine); -} - -// Pushes a \b or \B onto the stack. -bool Regexp::ParseState::PushWordBoundary(bool word) { - if (word) - return PushSimpleOp(kRegexpWordBoundary); - return PushSimpleOp(kRegexpNoWordBoundary); -} - -// Pushes a $ onto the stack. -bool Regexp::ParseState::PushDollar() { - if (flags_ & OneLine) { - // Clumsy marker so that MimicsPCRE() can tell whether - // this kRegexpEndText was a $ and not a \z. - Regexp::ParseFlags oflags = flags_; - flags_ = flags_ | WasDollar; - bool ret = PushSimpleOp(kRegexpEndText); - flags_ = oflags; - return ret; - } - return PushSimpleOp(kRegexpEndLine); -} - -// Pushes a . onto the stack. -bool Regexp::ParseState::PushDot() { - if ((flags_ & DotNL) && !(flags_ & NeverNL)) - return PushSimpleOp(kRegexpAnyChar); - // Rewrite . into [^\n] - Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); - re->ccb_ = new CharClassBuilder; - re->ccb_->AddRange(0, '\n' - 1); - re->ccb_->AddRange('\n' + 1, rune_max_); - return PushRegexp(re); -} - -// Pushes a regexp with the given op (and no args) onto the stack. -bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { - Regexp* re = new Regexp(op, flags_); - return PushRegexp(re); -} - -// Pushes a repeat operator regexp onto the stack. -// A valid argument for the operator must already be on the stack. -// The char c is the name of the operator, for use in error messages. -bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, - bool nongreedy) { - if (stacktop_ == NULL || IsMarker(stacktop_->op())) { - status_->set_code(kRegexpRepeatArgument); - status_->set_error_arg(s); - return false; - } - Regexp::ParseFlags fl = flags_; - if (nongreedy) - fl = fl ^ NonGreedy; + if (flags_ & OneLine) { + return PushSimpleOp(kRegexpBeginText); + } + return PushSimpleOp(kRegexpBeginLine); +} + +// Pushes a \b or \B onto the stack. +bool Regexp::ParseState::PushWordBoundary(bool word) { + if (word) + return PushSimpleOp(kRegexpWordBoundary); + return PushSimpleOp(kRegexpNoWordBoundary); +} + +// Pushes a $ onto the stack. +bool Regexp::ParseState::PushDollar() { + if (flags_ & OneLine) { + // Clumsy marker so that MimicsPCRE() can tell whether + // this kRegexpEndText was a $ and not a \z. + Regexp::ParseFlags oflags = flags_; + flags_ = flags_ | WasDollar; + bool ret = PushSimpleOp(kRegexpEndText); + flags_ = oflags; + return ret; + } + return PushSimpleOp(kRegexpEndLine); +} + +// Pushes a . onto the stack. +bool Regexp::ParseState::PushDot() { + if ((flags_ & DotNL) && !(flags_ & NeverNL)) + return PushSimpleOp(kRegexpAnyChar); + // Rewrite . into [^\n] + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + re->ccb_->AddRange(0, '\n' - 1); + re->ccb_->AddRange('\n' + 1, rune_max_); + return PushRegexp(re); +} + +// Pushes a regexp with the given op (and no args) onto the stack. +bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { + Regexp* re = new Regexp(op, flags_); + return PushRegexp(re); +} + +// Pushes a repeat operator regexp onto the stack. +// A valid argument for the operator must already be on the stack. +// The char c is the name of the operator, for use in error messages. +bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, + bool nongreedy) { + if (stacktop_ == NULL || IsMarker(stacktop_->op())) { + status_->set_code(kRegexpRepeatArgument); + status_->set_error_arg(s); + return false; + } + Regexp::ParseFlags fl = flags_; + if (nongreedy) + fl = fl ^ NonGreedy; // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but // they're mostly for use during simplification, not during parsing. @@ -499,15 +499,15 @@ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, return true; } - Regexp* re = new Regexp(op, fl); - re->AllocSub(1); - re->down_ = stacktop_->down_; - re->sub()[0] = FinishRegexp(stacktop_); - re->simple_ = re->ComputeSimple(); - stacktop_ = re; - return true; -} - + Regexp* re = new Regexp(op, fl); + re->AllocSub(1); + re->down_ = stacktop_->down_; + re->sub()[0] = FinishRegexp(stacktop_); + re->simple_ = re->ComputeSimple(); + stacktop_ = re; + return true; +} + // RepetitionWalker reports whether the repetition regexp is valid. // Valid means that the combination of the top-level repetition // and any inner repetitions does not exceed n copies of the @@ -563,34 +563,34 @@ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { return 0; } -// Pushes a repetition regexp onto the stack. -// A valid argument for the operator must already be on the stack. -bool Regexp::ParseState::PushRepetition(int min, int max, - const StringPiece& s, - bool nongreedy) { +// Pushes a repetition regexp onto the stack. +// A valid argument for the operator must already be on the stack. +bool Regexp::ParseState::PushRepetition(int min, int max, + const StringPiece& s, + bool nongreedy) { if ((max != -1 && max < min) || min > maximum_repeat_count || max > maximum_repeat_count) { - status_->set_code(kRegexpRepeatSize); - status_->set_error_arg(s); - return false; - } - if (stacktop_ == NULL || IsMarker(stacktop_->op())) { - status_->set_code(kRegexpRepeatArgument); - status_->set_error_arg(s); - return false; - } - Regexp::ParseFlags fl = flags_; - if (nongreedy) - fl = fl ^ NonGreedy; - Regexp* re = new Regexp(kRegexpRepeat, fl); - re->min_ = min; - re->max_ = max; - re->AllocSub(1); - re->down_ = stacktop_->down_; - re->sub()[0] = FinishRegexp(stacktop_); - re->simple_ = re->ComputeSimple(); - stacktop_ = re; + status_->set_code(kRegexpRepeatSize); + status_->set_error_arg(s); + return false; + } + if (stacktop_ == NULL || IsMarker(stacktop_->op())) { + status_->set_code(kRegexpRepeatArgument); + status_->set_error_arg(s); + return false; + } + Regexp::ParseFlags fl = flags_; + if (nongreedy) + fl = fl ^ NonGreedy; + Regexp* re = new Regexp(kRegexpRepeat, fl); + re->min_ = min; + re->max_ = max; + re->AllocSub(1); + re->down_ = stacktop_->down_; + re->sub()[0] = FinishRegexp(stacktop_); + re->simple_ = re->ComputeSimple(); + stacktop_ = re; if (min >= 2 || max >= 2) { RepetitionWalker w; if (w.Walk(stacktop_, maximum_repeat_count) == 0) { @@ -599,47 +599,47 @@ bool Regexp::ParseState::PushRepetition(int min, int max, return false; } } - return true; -} - -// Checks whether a particular regexp op is a marker. -bool Regexp::ParseState::IsMarker(RegexpOp op) { - return op >= kLeftParen; -} - -// Processes a left parenthesis in the input. -// Pushes a marker onto the stack. -bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { - Regexp* re = new Regexp(kLeftParen, flags_); - re->cap_ = ++ncap_; - if (name.data() != NULL) + return true; +} + +// Checks whether a particular regexp op is a marker. +bool Regexp::ParseState::IsMarker(RegexpOp op) { + return op >= kLeftParen; +} + +// Processes a left parenthesis in the input. +// Pushes a marker onto the stack. +bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { + Regexp* re = new Regexp(kLeftParen, flags_); + re->cap_ = ++ncap_; + if (name.data() != NULL) re->name_ = new std::string(name); - return PushRegexp(re); -} - -// Pushes a non-capturing marker onto the stack. -bool Regexp::ParseState::DoLeftParenNoCapture() { - Regexp* re = new Regexp(kLeftParen, flags_); - re->cap_ = -1; - return PushRegexp(re); -} - -// Processes a vertical bar in the input. -bool Regexp::ParseState::DoVerticalBar() { - MaybeConcatString(-1, NoParseFlags); - DoConcatenation(); - - // Below the vertical bar is a list to alternate. - // Above the vertical bar is a list to concatenate. - // We just did the concatenation, so either swap - // the result below the vertical bar or push a new - // vertical bar on the stack. - Regexp* r1; - Regexp* r2; - if ((r1 = stacktop_) != NULL && + return PushRegexp(re); +} + +// Pushes a non-capturing marker onto the stack. +bool Regexp::ParseState::DoLeftParenNoCapture() { + Regexp* re = new Regexp(kLeftParen, flags_); + re->cap_ = -1; + return PushRegexp(re); +} + +// Processes a vertical bar in the input. +bool Regexp::ParseState::DoVerticalBar() { + MaybeConcatString(-1, NoParseFlags); + DoConcatenation(); + + // Below the vertical bar is a list to alternate. + // Above the vertical bar is a list to concatenate. + // We just did the concatenation, so either swap + // the result below the vertical bar or push a new + // vertical bar on the stack. + Regexp* r1; + Regexp* r2; + if ((r1 = stacktop_) != NULL && (r2 = r1->down_) != NULL && - r2->op() == kVerticalBar) { - Regexp* r3; + r2->op() == kVerticalBar) { + Regexp* r3; if ((r3 = r2->down_) != NULL && (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) { // AnyChar is above or below the vertical bar. Let it subsume @@ -652,7 +652,7 @@ bool Regexp::ParseState::DoVerticalBar() { stacktop_ = r2; r1->Decref(); return true; - } + } if (r1->op() == kRegexpAnyChar && (r3->op() == kRegexpLiteral || r3->op() == kRegexpCharClass || @@ -664,212 +664,212 @@ bool Regexp::ParseState::DoVerticalBar() { r3->Decref(); return true; } - } - // Swap r1 below vertical bar (r2). - r1->down_ = r2->down_; - r2->down_ = r1; - stacktop_ = r2; - return true; - } - return PushSimpleOp(kVerticalBar); -} - -// Processes a right parenthesis in the input. -bool Regexp::ParseState::DoRightParen() { - // Finish the current concatenation and alternation. - DoAlternation(); - - // The stack should be: LeftParen regexp - // Remove the LeftParen, leaving the regexp, - // parenthesized. - Regexp* r1; - Regexp* r2; - if ((r1 = stacktop_) == NULL || - (r2 = r1->down_) == NULL || - r2->op() != kLeftParen) { + } + // Swap r1 below vertical bar (r2). + r1->down_ = r2->down_; + r2->down_ = r1; + stacktop_ = r2; + return true; + } + return PushSimpleOp(kVerticalBar); +} + +// Processes a right parenthesis in the input. +bool Regexp::ParseState::DoRightParen() { + // Finish the current concatenation and alternation. + DoAlternation(); + + // The stack should be: LeftParen regexp + // Remove the LeftParen, leaving the regexp, + // parenthesized. + Regexp* r1; + Regexp* r2; + if ((r1 = stacktop_) == NULL || + (r2 = r1->down_) == NULL || + r2->op() != kLeftParen) { status_->set_code(kRegexpUnexpectedParen); - status_->set_error_arg(whole_regexp_); - return false; - } - - // Pop off r1, r2. Will Decref or reuse below. - stacktop_ = r2->down_; - - // Restore flags from when paren opened. - Regexp* re = r2; - flags_ = re->parse_flags(); - - // Rewrite LeftParen as capture if needed. - if (re->cap_ > 0) { - re->op_ = kRegexpCapture; - // re->cap_ is already set - re->AllocSub(1); - re->sub()[0] = FinishRegexp(r1); - re->simple_ = re->ComputeSimple(); - } else { - re->Decref(); - re = r1; - } - return PushRegexp(re); -} - -// Processes the end of input, returning the final regexp. -Regexp* Regexp::ParseState::DoFinish() { - DoAlternation(); - Regexp* re = stacktop_; - if (re != NULL && re->down_ != NULL) { - status_->set_code(kRegexpMissingParen); - status_->set_error_arg(whole_regexp_); - return NULL; - } - stacktop_ = NULL; - return FinishRegexp(re); -} - -// Returns the leading regexp that re starts with. -// The returned Regexp* points into a piece of re, -// so it must not be used after the caller calls re->Decref(). -Regexp* Regexp::LeadingRegexp(Regexp* re) { - if (re->op() == kRegexpEmptyMatch) - return NULL; - if (re->op() == kRegexpConcat && re->nsub() >= 2) { - Regexp** sub = re->sub(); - if (sub[0]->op() == kRegexpEmptyMatch) - return NULL; - return sub[0]; - } - return re; -} - -// Removes LeadingRegexp(re) from re and returns what's left. -// Consumes the reference to re and may edit it in place. -// If caller wants to hold on to LeadingRegexp(re), -// must have already Incref'ed it. -Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { - if (re->op() == kRegexpEmptyMatch) - return re; - if (re->op() == kRegexpConcat && re->nsub() >= 2) { - Regexp** sub = re->sub(); - if (sub[0]->op() == kRegexpEmptyMatch) - return re; - sub[0]->Decref(); - sub[0] = NULL; - if (re->nsub() == 2) { - // Collapse concatenation to single regexp. - Regexp* nre = sub[1]; - sub[1] = NULL; - re->Decref(); - return nre; - } - // 3 or more -> 2 or more. - re->nsub_--; - memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); - return re; - } - Regexp::ParseFlags pf = re->parse_flags(); - re->Decref(); - return new Regexp(kRegexpEmptyMatch, pf); -} - -// Returns the leading string that re starts with. -// The returned Rune* points into a piece of re, -// so it must not be used after the caller calls re->Decref(). -Rune* Regexp::LeadingString(Regexp* re, int *nrune, - Regexp::ParseFlags *flags) { - while (re->op() == kRegexpConcat && re->nsub() > 0) - re = re->sub()[0]; - - *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); - - if (re->op() == kRegexpLiteral) { - *nrune = 1; - return &re->rune_; - } - - if (re->op() == kRegexpLiteralString) { - *nrune = re->nrunes_; - return re->runes_; - } - - *nrune = 0; - return NULL; -} - -// Removes the first n leading runes from the beginning of re. -// Edits re in place. -void Regexp::RemoveLeadingString(Regexp* re, int n) { - // Chase down concats to find first string. - // For regexps generated by parser, nested concats are - // flattened except when doing so would overflow the 16-bit - // limit on the size of a concatenation, so we should never - // see more than two here. - Regexp* stk[4]; + status_->set_error_arg(whole_regexp_); + return false; + } + + // Pop off r1, r2. Will Decref or reuse below. + stacktop_ = r2->down_; + + // Restore flags from when paren opened. + Regexp* re = r2; + flags_ = re->parse_flags(); + + // Rewrite LeftParen as capture if needed. + if (re->cap_ > 0) { + re->op_ = kRegexpCapture; + // re->cap_ is already set + re->AllocSub(1); + re->sub()[0] = FinishRegexp(r1); + re->simple_ = re->ComputeSimple(); + } else { + re->Decref(); + re = r1; + } + return PushRegexp(re); +} + +// Processes the end of input, returning the final regexp. +Regexp* Regexp::ParseState::DoFinish() { + DoAlternation(); + Regexp* re = stacktop_; + if (re != NULL && re->down_ != NULL) { + status_->set_code(kRegexpMissingParen); + status_->set_error_arg(whole_regexp_); + return NULL; + } + stacktop_ = NULL; + return FinishRegexp(re); +} + +// Returns the leading regexp that re starts with. +// The returned Regexp* points into a piece of re, +// so it must not be used after the caller calls re->Decref(). +Regexp* Regexp::LeadingRegexp(Regexp* re) { + if (re->op() == kRegexpEmptyMatch) + return NULL; + if (re->op() == kRegexpConcat && re->nsub() >= 2) { + Regexp** sub = re->sub(); + if (sub[0]->op() == kRegexpEmptyMatch) + return NULL; + return sub[0]; + } + return re; +} + +// Removes LeadingRegexp(re) from re and returns what's left. +// Consumes the reference to re and may edit it in place. +// If caller wants to hold on to LeadingRegexp(re), +// must have already Incref'ed it. +Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { + if (re->op() == kRegexpEmptyMatch) + return re; + if (re->op() == kRegexpConcat && re->nsub() >= 2) { + Regexp** sub = re->sub(); + if (sub[0]->op() == kRegexpEmptyMatch) + return re; + sub[0]->Decref(); + sub[0] = NULL; + if (re->nsub() == 2) { + // Collapse concatenation to single regexp. + Regexp* nre = sub[1]; + sub[1] = NULL; + re->Decref(); + return nre; + } + // 3 or more -> 2 or more. + re->nsub_--; + memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); + return re; + } + Regexp::ParseFlags pf = re->parse_flags(); + re->Decref(); + return new Regexp(kRegexpEmptyMatch, pf); +} + +// Returns the leading string that re starts with. +// The returned Rune* points into a piece of re, +// so it must not be used after the caller calls re->Decref(). +Rune* Regexp::LeadingString(Regexp* re, int *nrune, + Regexp::ParseFlags *flags) { + while (re->op() == kRegexpConcat && re->nsub() > 0) + re = re->sub()[0]; + + *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); + + if (re->op() == kRegexpLiteral) { + *nrune = 1; + return &re->rune_; + } + + if (re->op() == kRegexpLiteralString) { + *nrune = re->nrunes_; + return re->runes_; + } + + *nrune = 0; + return NULL; +} + +// Removes the first n leading runes from the beginning of re. +// Edits re in place. +void Regexp::RemoveLeadingString(Regexp* re, int n) { + // Chase down concats to find first string. + // For regexps generated by parser, nested concats are + // flattened except when doing so would overflow the 16-bit + // limit on the size of a concatenation, so we should never + // see more than two here. + Regexp* stk[4]; size_t d = 0; - while (re->op() == kRegexpConcat) { - if (d < arraysize(stk)) - stk[d++] = re; - re = re->sub()[0]; - } - - // Remove leading string from re. - if (re->op() == kRegexpLiteral) { - re->rune_ = 0; - re->op_ = kRegexpEmptyMatch; - } else if (re->op() == kRegexpLiteralString) { - if (n >= re->nrunes_) { - delete[] re->runes_; - re->runes_ = NULL; - re->nrunes_ = 0; - re->op_ = kRegexpEmptyMatch; - } else if (n == re->nrunes_ - 1) { - Rune rune = re->runes_[re->nrunes_ - 1]; - delete[] re->runes_; - re->runes_ = NULL; - re->nrunes_ = 0; - re->rune_ = rune; - re->op_ = kRegexpLiteral; - } else { - re->nrunes_ -= n; - memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]); - } - } - - // If re is now empty, concatenations might simplify too. + while (re->op() == kRegexpConcat) { + if (d < arraysize(stk)) + stk[d++] = re; + re = re->sub()[0]; + } + + // Remove leading string from re. + if (re->op() == kRegexpLiteral) { + re->rune_ = 0; + re->op_ = kRegexpEmptyMatch; + } else if (re->op() == kRegexpLiteralString) { + if (n >= re->nrunes_) { + delete[] re->runes_; + re->runes_ = NULL; + re->nrunes_ = 0; + re->op_ = kRegexpEmptyMatch; + } else if (n == re->nrunes_ - 1) { + Rune rune = re->runes_[re->nrunes_ - 1]; + delete[] re->runes_; + re->runes_ = NULL; + re->nrunes_ = 0; + re->rune_ = rune; + re->op_ = kRegexpLiteral; + } else { + re->nrunes_ -= n; + memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]); + } + } + + // If re is now empty, concatenations might simplify too. while (d > 0) { re = stk[--d]; - Regexp** sub = re->sub(); - if (sub[0]->op() == kRegexpEmptyMatch) { - sub[0]->Decref(); - sub[0] = NULL; - // Delete first element of concat. - switch (re->nsub()) { - case 0: - case 1: - // Impossible. - LOG(DFATAL) << "Concat of " << re->nsub(); - re->submany_ = NULL; - re->op_ = kRegexpEmptyMatch; - break; - - case 2: { - // Replace re with sub[1]. - Regexp* old = sub[1]; - sub[1] = NULL; - re->Swap(old); - old->Decref(); - break; - } - - default: - // Slide down. - re->nsub_--; - memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); - break; - } - } - } -} - + Regexp** sub = re->sub(); + if (sub[0]->op() == kRegexpEmptyMatch) { + sub[0]->Decref(); + sub[0] = NULL; + // Delete first element of concat. + switch (re->nsub()) { + case 0: + case 1: + // Impossible. + LOG(DFATAL) << "Concat of " << re->nsub(); + re->submany_ = NULL; + re->op_ = kRegexpEmptyMatch; + break; + + case 2: { + // Replace re with sub[1]. + Regexp* old = sub[1]; + sub[1] = NULL; + re->Swap(old); + old->Decref(); + break; + } + + default: + // Slide down. + re->nsub_--; + memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); + break; + } + } + } +} + // In the context of factoring alternations, a Splice is: a factored prefix or // merged character class computed by one iteration of one round of factoring; // the span of subexpressions of the alternation to be "spliced" (i.e. removed @@ -921,28 +921,28 @@ class FactorAlternationImpl { std::vector<Splice>* splices); }; -// Factors common prefixes from alternation. -// For example, -// ABC|ABD|AEF|BCX|BCY -// simplifies to -// A(B(C|D)|EF)|BC(X|Y) +// Factors common prefixes from alternation. +// For example, +// ABC|ABD|AEF|BCX|BCY +// simplifies to +// A(B(C|D)|EF)|BC(X|Y) // and thence to -// A(B[CD]|EF)|BC[XY] -// -// Rewrites sub to contain simplified list to alternate and returns -// the new length of sub. Adjusts reference counts accordingly -// (incoming sub[i] decremented, outgoing sub[i] incremented). +// A(B[CD]|EF)|BC[XY] +// +// Rewrites sub to contain simplified list to alternate and returns +// the new length of sub. Adjusts reference counts accordingly +// (incoming sub[i] decremented, outgoing sub[i] incremented). int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { std::vector<Frame> stk; stk.emplace_back(sub, nsub); - + for (;;) { auto& sub = stk.back().sub; auto& nsub = stk.back().nsub; auto& round = stk.back().round; auto& splices = stk.back().splices; auto& spliceidx = stk.back().spliceidx; - + if (splices.empty()) { // Advance to the next round of factoring. Note that this covers // the initialised state: when splices is empty and round is 0. @@ -990,7 +990,7 @@ int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { // Advance to the next round of factoring. round++; } - + switch (round) { case 1: FactorAlternationImpl::Round1(sub, nsub, flags, &splices); @@ -1018,7 +1018,7 @@ int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { LOG(DFATAL) << "unknown round: " << round; break; } - + // Set spliceidx depending on whether we have Splices to factor. if (splices.empty() || round == 3) { spliceidx = static_cast<int>(splices.size()); @@ -1027,59 +1027,59 @@ int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { } } } - + void FactorAlternationImpl::Round1(Regexp** sub, int nsub, Regexp::ParseFlags flags, std::vector<Splice>* splices) { - // Round 1: Factor out common literal prefixes. + // Round 1: Factor out common literal prefixes. int start = 0; Rune* rune = NULL; - int nrune = 0; - Regexp::ParseFlags runeflags = Regexp::NoParseFlags; + int nrune = 0; + Regexp::ParseFlags runeflags = Regexp::NoParseFlags; for (int i = 0; i <= nsub; i++) { // Invariant: sub[start:i] consists of regexps that all // begin with rune[0:nrune]. - Rune* rune_i = NULL; - int nrune_i = 0; - Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; + Rune* rune_i = NULL; + int nrune_i = 0; + Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; if (i < nsub) { rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i); - if (runeflags_i == runeflags) { - int same = 0; - while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) - same++; - if (same > 0) { - // Matches at least one rune in current range. Keep going around. - nrune = same; - continue; - } - } - } - - // Found end of a run with common leading literal string: + if (runeflags_i == runeflags) { + int same = 0; + while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) + same++; + if (same > 0) { + // Matches at least one rune in current range. Keep going around. + nrune = same; + continue; + } + } + } + + // Found end of a run with common leading literal string: // sub[start:i] all begin with rune[0:nrune], // but sub[i] does not even begin with rune[0]. - if (i == start) { - // Nothing to do - first iteration. - } else if (i == start+1) { - // Just one: don't bother factoring. - } else { + if (i == start) { + // Nothing to do - first iteration. + } else if (i == start+1) { + // Just one: don't bother factoring. + } else { Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags); - for (int j = start; j < i; j++) + for (int j = start; j < i; j++) Regexp::RemoveLeadingString(sub[j], nrune); splices->emplace_back(prefix, sub + start, i - start); - } - + } + // Prepare for next iteration (if there is one). if (i < nsub) { - start = i; - rune = rune_i; - nrune = nrune_i; - runeflags = runeflags_i; - } - } + start = i; + rune = rune_i; + nrune = nrune_i; + runeflags = runeflags_i; + } + } } - + void FactorAlternationImpl::Round2(Regexp** sub, int nsub, Regexp::ParseFlags flags, std::vector<Splice>* splices) { @@ -1092,11 +1092,11 @@ void FactorAlternationImpl::Round2(Regexp** sub, int nsub, // distinct paths through the automaton, which affects // correctness in some cases. int start = 0; - Regexp* first = NULL; + Regexp* first = NULL; for (int i = 0; i <= nsub; i++) { // Invariant: sub[start:i] consists of regexps that all // begin with first. - Regexp* first_i = NULL; + Regexp* first_i = NULL; if (i < nsub) { first_i = Regexp::LeadingRegexp(sub[i]); if (first != NULL && @@ -1119,31 +1119,31 @@ void FactorAlternationImpl::Round2(Regexp** sub, int nsub, first->sub()[0]->op() == kRegexpAnyChar || first->sub()[0]->op() == kRegexpAnyByte))) && Regexp::Equal(first, first_i)) - continue; - } - - // Found end of a run with common leading regexp: + continue; + } + + // Found end of a run with common leading regexp: // sub[start:i] all begin with first, // but sub[i] does not. - if (i == start) { - // Nothing to do - first iteration. - } else if (i == start+1) { - // Just one: don't bother factoring. - } else { + if (i == start) { + // Nothing to do - first iteration. + } else if (i == start+1) { + // Just one: don't bother factoring. + } else { Regexp* prefix = first->Incref(); - for (int j = start; j < i; j++) + for (int j = start; j < i; j++) sub[j] = Regexp::RemoveLeadingRegexp(sub[j]); splices->emplace_back(prefix, sub + start, i - start); - } - + } + // Prepare for next iteration (if there is one). if (i < nsub) { - start = i; - first = first_i; - } - } + start = i; + first = first_i; + } + } } - + void FactorAlternationImpl::Round3(Regexp** sub, int nsub, Regexp::ParseFlags flags, std::vector<Splice>* splices) { @@ -1163,234 +1163,234 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub, first_i->op() == kRegexpCharClass)) continue; } - + // Found end of a run of Literal/CharClass: // sub[start:i] all are either one or the other, // but sub[i] is not. - if (i == start) { + if (i == start) { // Nothing to do - first iteration. - } else if (i == start+1) { + } else if (i == start+1) { // Just one: don't bother factoring. - } else { - CharClassBuilder ccb; - for (int j = start; j < i; j++) { - Regexp* re = sub[j]; - if (re->op() == kRegexpCharClass) { - CharClass* cc = re->cc(); - for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) - ccb.AddRange(it->lo, it->hi); - } else if (re->op() == kRegexpLiteral) { - ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); - } else { - LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " " - << re->ToString(); - } - re->Decref(); - } + } else { + CharClassBuilder ccb; + for (int j = start; j < i; j++) { + Regexp* re = sub[j]; + if (re->op() == kRegexpCharClass) { + CharClass* cc = re->cc(); + for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) + ccb.AddRange(it->lo, it->hi); + } else if (re->op() == kRegexpLiteral) { + ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); + } else { + LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " " + << re->ToString(); + } + re->Decref(); + } Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); splices->emplace_back(re, sub + start, i - start); - } - + } + // Prepare for next iteration (if there is one). if (i < nsub) { start = i; first = first_i; - } - } -} - -// Collapse the regexps on top of the stack, down to the -// first marker, into a new op node (op == kRegexpAlternate -// or op == kRegexpConcat). -void Regexp::ParseState::DoCollapse(RegexpOp op) { - // Scan backward to marker, counting children of composite. - int n = 0; - Regexp* next = NULL; - Regexp* sub; - for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { - next = sub->down_; - if (sub->op_ == op) - n += sub->nsub_; - else - n++; - } - - // If there's just one child, leave it alone. - // (Concat of one thing is that one thing; alternate of one thing is same.) - if (stacktop_ != NULL && stacktop_->down_ == next) - return; - - // Construct op (alternation or concatenation), flattening op of op. + } + } +} + +// Collapse the regexps on top of the stack, down to the +// first marker, into a new op node (op == kRegexpAlternate +// or op == kRegexpConcat). +void Regexp::ParseState::DoCollapse(RegexpOp op) { + // Scan backward to marker, counting children of composite. + int n = 0; + Regexp* next = NULL; + Regexp* sub; + for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { + next = sub->down_; + if (sub->op_ == op) + n += sub->nsub_; + else + n++; + } + + // If there's just one child, leave it alone. + // (Concat of one thing is that one thing; alternate of one thing is same.) + if (stacktop_ != NULL && stacktop_->down_ == next) + return; + + // Construct op (alternation or concatenation), flattening op of op. PODArray<Regexp*> subs(n); - next = NULL; - int i = n; - for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { - next = sub->down_; - if (sub->op_ == op) { - Regexp** sub_subs = sub->sub(); - for (int k = sub->nsub_ - 1; k >= 0; k--) - subs[--i] = sub_subs[k]->Incref(); - sub->Decref(); - } else { - subs[--i] = FinishRegexp(sub); - } - } - + next = NULL; + int i = n; + for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { + next = sub->down_; + if (sub->op_ == op) { + Regexp** sub_subs = sub->sub(); + for (int k = sub->nsub_ - 1; k >= 0; k--) + subs[--i] = sub_subs[k]->Incref(); + sub->Decref(); + } else { + subs[--i] = FinishRegexp(sub); + } + } + Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true); - re->simple_ = re->ComputeSimple(); - re->down_ = next; - stacktop_ = re; -} - -// Finishes the current concatenation, -// collapsing it into a single regexp on the stack. -void Regexp::ParseState::DoConcatenation() { - Regexp* r1 = stacktop_; - if (r1 == NULL || IsMarker(r1->op())) { - // empty concatenation is special case - Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); - PushRegexp(re); - } - DoCollapse(kRegexpConcat); -} - -// Finishes the current alternation, -// collapsing it to a single regexp on the stack. -void Regexp::ParseState::DoAlternation() { - DoVerticalBar(); - // Now stack top is kVerticalBar. - Regexp* r1 = stacktop_; - stacktop_ = r1->down_; - r1->Decref(); - DoCollapse(kRegexpAlternate); -} - -// Incremental conversion of concatenated literals into strings. -// If top two elements on stack are both literal or string, -// collapse into single string. -// Don't walk down the stack -- the parser calls this frequently -// enough that below the bottom two is known to be collapsed. -// Only called when another regexp is about to be pushed -// on the stack, so that the topmost literal is not being considered. -// (Otherwise ab* would turn into (ab)*.) -// If r >= 0, consider pushing a literal r on the stack. -// Return whether that happened. -bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { - Regexp* re1; - Regexp* re2; - if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) - return false; - - if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) - return false; - if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) - return false; - if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) - return false; - - if (re2->op_ == kRegexpLiteral) { - // convert into string - Rune rune = re2->rune_; - re2->op_ = kRegexpLiteralString; - re2->nrunes_ = 0; - re2->runes_ = NULL; - re2->AddRuneToString(rune); - } - - // push re1 into re2. - if (re1->op_ == kRegexpLiteral) { - re2->AddRuneToString(re1->rune_); - } else { - for (int i = 0; i < re1->nrunes_; i++) - re2->AddRuneToString(re1->runes_[i]); - re1->nrunes_ = 0; - delete[] re1->runes_; - re1->runes_ = NULL; - } - - // reuse re1 if possible - if (r >= 0) { - re1->op_ = kRegexpLiteral; - re1->rune_ = r; + re->simple_ = re->ComputeSimple(); + re->down_ = next; + stacktop_ = re; +} + +// Finishes the current concatenation, +// collapsing it into a single regexp on the stack. +void Regexp::ParseState::DoConcatenation() { + Regexp* r1 = stacktop_; + if (r1 == NULL || IsMarker(r1->op())) { + // empty concatenation is special case + Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); + PushRegexp(re); + } + DoCollapse(kRegexpConcat); +} + +// Finishes the current alternation, +// collapsing it to a single regexp on the stack. +void Regexp::ParseState::DoAlternation() { + DoVerticalBar(); + // Now stack top is kVerticalBar. + Regexp* r1 = stacktop_; + stacktop_ = r1->down_; + r1->Decref(); + DoCollapse(kRegexpAlternate); +} + +// Incremental conversion of concatenated literals into strings. +// If top two elements on stack are both literal or string, +// collapse into single string. +// Don't walk down the stack -- the parser calls this frequently +// enough that below the bottom two is known to be collapsed. +// Only called when another regexp is about to be pushed +// on the stack, so that the topmost literal is not being considered. +// (Otherwise ab* would turn into (ab)*.) +// If r >= 0, consider pushing a literal r on the stack. +// Return whether that happened. +bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { + Regexp* re1; + Regexp* re2; + if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) + return false; + + if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) + return false; + if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) + return false; + if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) + return false; + + if (re2->op_ == kRegexpLiteral) { + // convert into string + Rune rune = re2->rune_; + re2->op_ = kRegexpLiteralString; + re2->nrunes_ = 0; + re2->runes_ = NULL; + re2->AddRuneToString(rune); + } + + // push re1 into re2. + if (re1->op_ == kRegexpLiteral) { + re2->AddRuneToString(re1->rune_); + } else { + for (int i = 0; i < re1->nrunes_; i++) + re2->AddRuneToString(re1->runes_[i]); + re1->nrunes_ = 0; + delete[] re1->runes_; + re1->runes_ = NULL; + } + + // reuse re1 if possible + if (r >= 0) { + re1->op_ = kRegexpLiteral; + re1->rune_ = r; re1->parse_flags_ = static_cast<uint16_t>(flags); - return true; - } - - stacktop_ = re2; - re1->Decref(); - return false; -} - -// Lexing routines. - + return true; + } + + stacktop_ = re2; + re1->Decref(); + return false; +} + +// Lexing routines. + // Parses a decimal integer, storing it in *np. -// Sets *s to span the remainder of the string. -static bool ParseInteger(StringPiece* s, int* np) { +// Sets *s to span the remainder of the string. +static bool ParseInteger(StringPiece* s, int* np) { if (s->empty() || !isdigit((*s)[0] & 0xFF)) - return false; - // Disallow leading zeros. + return false; + // Disallow leading zeros. if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) - return false; - int n = 0; - int c; + return false; + int n = 0; + int c; while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) { - // Avoid overflow. - if (n >= 100000000) - return false; - n = n*10 + c - '0'; - s->remove_prefix(1); // digit - } - *np = n; - return true; -} - -// Parses a repetition suffix like {1,2} or {2} or {2,}. -// Sets *s to span the remainder of the string on success. -// Sets *lo and *hi to the given range. -// In the case of {2,}, the high number is unbounded; -// sets *hi to -1 to signify this. -// {,2} is NOT a valid suffix. -// The Maybe in the name signifies that the regexp parse -// doesn't fail even if ParseRepetition does, so the StringPiece -// s must NOT be edited unless MaybeParseRepetition returns true. -static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { - StringPiece s = *sp; + // Avoid overflow. + if (n >= 100000000) + return false; + n = n*10 + c - '0'; + s->remove_prefix(1); // digit + } + *np = n; + return true; +} + +// Parses a repetition suffix like {1,2} or {2} or {2,}. +// Sets *s to span the remainder of the string on success. +// Sets *lo and *hi to the given range. +// In the case of {2,}, the high number is unbounded; +// sets *hi to -1 to signify this. +// {,2} is NOT a valid suffix. +// The Maybe in the name signifies that the regexp parse +// doesn't fail even if ParseRepetition does, so the StringPiece +// s must NOT be edited unless MaybeParseRepetition returns true. +static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { + StringPiece s = *sp; if (s.empty() || s[0] != '{') - return false; - s.remove_prefix(1); // '{' - if (!ParseInteger(&s, lo)) - return false; + return false; + s.remove_prefix(1); // '{' + if (!ParseInteger(&s, lo)) + return false; if (s.empty()) - return false; - if (s[0] == ',') { - s.remove_prefix(1); // ',' + return false; + if (s[0] == ',') { + s.remove_prefix(1); // ',' if (s.empty()) - return false; - if (s[0] == '}') { - // {2,} means at least 2 - *hi = -1; - } else { - // {2,4} means 2, 3, or 4. - if (!ParseInteger(&s, hi)) - return false; - } - } else { - // {2} means exactly two - *hi = *lo; - } + return false; + if (s[0] == '}') { + // {2,} means at least 2 + *hi = -1; + } else { + // {2,4} means 2, 3, or 4. + if (!ParseInteger(&s, hi)) + return false; + } + } else { + // {2} means exactly two + *hi = *lo; + } if (s.empty() || s[0] != '}') - return false; - s.remove_prefix(1); // '}' - *sp = s; - return true; -} - -// Removes the next Rune from the StringPiece and stores it in *r. -// Returns number of bytes removed from sp. -// Behaves as though there is a terminating NUL at the end of sp. -// Argument order is backwards from usual Google style -// but consistent with chartorune. -static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { + return false; + s.remove_prefix(1); // '}' + *sp = s; + return true; +} + +// Removes the next Rune from the StringPiece and stores it in *r. +// Returns number of bytes removed from sp. +// Behaves as though there is a terminating NUL at the end of sp. +// Argument order is backwards from usual Google style +// but consistent with chartorune. +static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { // fullrune() takes int, not size_t. However, it just looks // at the leading byte and treats any length >= 4 the same. if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) { @@ -1403,278 +1403,278 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { n = 1; *r = Runeerror; } - if (!(n == 1 && *r == Runeerror)) { // no decoding error - sp->remove_prefix(n); - return n; - } - } - + if (!(n == 1 && *r == Runeerror)) { // no decoding error + sp->remove_prefix(n); + return n; + } + } + if (status != NULL) { status->set_code(kRegexpBadUTF8); status->set_error_arg(StringPiece()); } - return -1; -} - + return -1; +} + // Returns whether name is valid UTF-8. // If not, sets status to kRegexpBadUTF8. -static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { - StringPiece t = s; - Rune r; +static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { + StringPiece t = s; + Rune r; while (!t.empty()) { - if (StringPieceToRune(&r, &t, status) < 0) - return false; - } - return true; -} - -// Is c a hex digit? -static int IsHex(int c) { - return ('0' <= c && c <= '9') || - ('A' <= c && c <= 'F') || - ('a' <= c && c <= 'f'); -} - -// Convert hex digit to value. -static int UnHex(int c) { - if ('0' <= c && c <= '9') - return c - '0'; - if ('A' <= c && c <= 'F') - return c - 'A' + 10; - if ('a' <= c && c <= 'f') - return c - 'a' + 10; - LOG(DFATAL) << "Bad hex digit " << c; - return 0; -} - -// Parse an escape sequence (e.g., \n, \{). -// Sets *s to span the remainder of the string. -// Sets *rp to the named character. -static bool ParseEscape(StringPiece* s, Rune* rp, - RegexpStatus* status, int rune_max) { + if (StringPieceToRune(&r, &t, status) < 0) + return false; + } + return true; +} + +// Is c a hex digit? +static int IsHex(int c) { + return ('0' <= c && c <= '9') || + ('A' <= c && c <= 'F') || + ('a' <= c && c <= 'f'); +} + +// Convert hex digit to value. +static int UnHex(int c) { + if ('0' <= c && c <= '9') + return c - '0'; + if ('A' <= c && c <= 'F') + return c - 'A' + 10; + if ('a' <= c && c <= 'f') + return c - 'a' + 10; + LOG(DFATAL) << "Bad hex digit " << c; + return 0; +} + +// Parse an escape sequence (e.g., \n, \{). +// Sets *s to span the remainder of the string. +// Sets *rp to the named character. +static bool ParseEscape(StringPiece* s, Rune* rp, + RegexpStatus* status, int rune_max) { const char* begin = s->data(); if (s->empty() || (*s)[0] != '\\') { - // Should not happen - caller always checks. - status->set_code(kRegexpInternalError); + // Should not happen - caller always checks. + status->set_code(kRegexpInternalError); status->set_error_arg(StringPiece()); - return false; - } + return false; + } if (s->size() == 1) { - status->set_code(kRegexpTrailingBackslash); + status->set_code(kRegexpTrailingBackslash); status->set_error_arg(StringPiece()); - return false; - } - Rune c, c1; - s->remove_prefix(1); // backslash - if (StringPieceToRune(&c, s, status) < 0) - return false; - int code; - switch (c) { - default: + return false; + } + Rune c, c1; + s->remove_prefix(1); // backslash + if (StringPieceToRune(&c, s, status) < 0) + return false; + int code; + switch (c) { + default: if (c < Runeself && !isalpha(c) && !isdigit(c)) { - // Escaped non-word characters are always themselves. - // PCRE is not quite so rigorous: it accepts things like - // \q, but we don't. We once rejected \_, but too many - // programs and people insist on using it, so allow \_. - *rp = c; - return true; - } - goto BadEscape; - - // Octal escapes. - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - // Single non-zero octal digit is a backreference; not supported. + // Escaped non-word characters are always themselves. + // PCRE is not quite so rigorous: it accepts things like + // \q, but we don't. We once rejected \_, but too many + // programs and people insist on using it, so allow \_. + *rp = c; + return true; + } + goto BadEscape; + + // Octal escapes. + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + // Single non-zero octal digit is a backreference; not supported. if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7') - goto BadEscape; + goto BadEscape; FALLTHROUGH_INTENDED; - case '0': - // consume up to three octal digits; already have one. - code = c - '0'; + case '0': + // consume up to three octal digits; already have one. + code = c - '0'; if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') { - code = code * 8 + c - '0'; - s->remove_prefix(1); // digit + code = code * 8 + c - '0'; + s->remove_prefix(1); // digit if (!s->empty()) { - c = (*s)[0]; - if ('0' <= c && c <= '7') { - code = code * 8 + c - '0'; - s->remove_prefix(1); // digit - } - } - } + c = (*s)[0]; + if ('0' <= c && c <= '7') { + code = code * 8 + c - '0'; + s->remove_prefix(1); // digit + } + } + } if (code > rune_max) goto BadEscape; - *rp = code; - return true; - - // Hexadecimal escapes - case 'x': + *rp = code; + return true; + + // Hexadecimal escapes + case 'x': if (s->empty()) - goto BadEscape; - if (StringPieceToRune(&c, s, status) < 0) - return false; - if (c == '{') { - // Any number of digits in braces. - // Update n as we consume the string, so that - // the whole thing gets shown in the error message. - // Perl accepts any text at all; it ignores all text - // after the first non-hex digit. We require only hex digits, - // and at least one. - if (StringPieceToRune(&c, s, status) < 0) - return false; - int nhex = 0; - code = 0; - while (IsHex(c)) { - nhex++; - code = code * 16 + UnHex(c); - if (code > rune_max) - goto BadEscape; + goto BadEscape; + if (StringPieceToRune(&c, s, status) < 0) + return false; + if (c == '{') { + // Any number of digits in braces. + // Update n as we consume the string, so that + // the whole thing gets shown in the error message. + // Perl accepts any text at all; it ignores all text + // after the first non-hex digit. We require only hex digits, + // and at least one. + if (StringPieceToRune(&c, s, status) < 0) + return false; + int nhex = 0; + code = 0; + while (IsHex(c)) { + nhex++; + code = code * 16 + UnHex(c); + if (code > rune_max) + goto BadEscape; if (s->empty()) - goto BadEscape; - if (StringPieceToRune(&c, s, status) < 0) - return false; - } - if (c != '}' || nhex == 0) - goto BadEscape; - *rp = code; - return true; - } - // Easy case: two hex digits. + goto BadEscape; + if (StringPieceToRune(&c, s, status) < 0) + return false; + } + if (c != '}' || nhex == 0) + goto BadEscape; + *rp = code; + return true; + } + // Easy case: two hex digits. if (s->empty()) - goto BadEscape; - if (StringPieceToRune(&c1, s, status) < 0) - return false; - if (!IsHex(c) || !IsHex(c1)) - goto BadEscape; - *rp = UnHex(c) * 16 + UnHex(c1); - return true; - - // C escapes. - case 'n': - *rp = '\n'; - return true; - case 'r': - *rp = '\r'; - return true; - case 't': - *rp = '\t'; - return true; - - // Less common C escapes. - case 'a': - *rp = '\a'; - return true; - case 'f': - *rp = '\f'; - return true; - case 'v': - *rp = '\v'; - return true; - - // This code is disabled to avoid misparsing - // the Perl word-boundary \b as a backspace - // when in POSIX regexp mode. Surprisingly, - // in Perl, \b means word-boundary but [\b] - // means backspace. We don't support that: - // if you want a backspace embed a literal + goto BadEscape; + if (StringPieceToRune(&c1, s, status) < 0) + return false; + if (!IsHex(c) || !IsHex(c1)) + goto BadEscape; + *rp = UnHex(c) * 16 + UnHex(c1); + return true; + + // C escapes. + case 'n': + *rp = '\n'; + return true; + case 'r': + *rp = '\r'; + return true; + case 't': + *rp = '\t'; + return true; + + // Less common C escapes. + case 'a': + *rp = '\a'; + return true; + case 'f': + *rp = '\f'; + return true; + case 'v': + *rp = '\v'; + return true; + + // This code is disabled to avoid misparsing + // the Perl word-boundary \b as a backspace + // when in POSIX regexp mode. Surprisingly, + // in Perl, \b means word-boundary but [\b] + // means backspace. We don't support that: + // if you want a backspace embed a literal // backspace character or use \x08. - // - // case 'b': - // *rp = '\b'; - // return true; - } - - LOG(DFATAL) << "Not reached in ParseEscape."; - -BadEscape: - // Unrecognized escape sequence. - status->set_code(kRegexpBadEscape); + // + // case 'b': + // *rp = '\b'; + // return true; + } + + LOG(DFATAL) << "Not reached in ParseEscape."; + +BadEscape: + // Unrecognized escape sequence. + status->set_code(kRegexpBadEscape); status->set_error_arg( StringPiece(begin, static_cast<size_t>(s->data() - begin))); - return false; -} - -// Add a range to the character class, but exclude newline if asked. -// Also handle case folding. -void CharClassBuilder::AddRangeFlags( - Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { - - // Take out \n if the flags say so. - bool cutnl = !(parse_flags & Regexp::ClassNL) || - (parse_flags & Regexp::NeverNL); - if (cutnl && lo <= '\n' && '\n' <= hi) { - if (lo < '\n') - AddRangeFlags(lo, '\n' - 1, parse_flags); - if (hi > '\n') - AddRangeFlags('\n' + 1, hi, parse_flags); - return; - } - - // If folding case, add fold-equivalent characters too. - if (parse_flags & Regexp::FoldCase) - AddFoldedRange(this, lo, hi, 0); - else - AddRange(lo, hi); -} - -// Look for a group with the given name. + return false; +} + +// Add a range to the character class, but exclude newline if asked. +// Also handle case folding. +void CharClassBuilder::AddRangeFlags( + Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { + + // Take out \n if the flags say so. + bool cutnl = !(parse_flags & Regexp::ClassNL) || + (parse_flags & Regexp::NeverNL); + if (cutnl && lo <= '\n' && '\n' <= hi) { + if (lo < '\n') + AddRangeFlags(lo, '\n' - 1, parse_flags); + if (hi > '\n') + AddRangeFlags('\n' + 1, hi, parse_flags); + return; + } + + // If folding case, add fold-equivalent characters too. + if (parse_flags & Regexp::FoldCase) + AddFoldedRange(this, lo, hi, 0); + else + AddRange(lo, hi); +} + +// Look for a group with the given name. static const UGroup* LookupGroup(const StringPiece& name, const UGroup *groups, int ngroups) { - // Simple name lookup. - for (int i = 0; i < ngroups; i++) - if (StringPiece(groups[i].name) == name) - return &groups[i]; - return NULL; -} - -// Look for a POSIX group with the given name (e.g., "[:^alpha:]") + // Simple name lookup. + for (int i = 0; i < ngroups; i++) + if (StringPiece(groups[i].name) == name) + return &groups[i]; + return NULL; +} + +// Look for a POSIX group with the given name (e.g., "[:^alpha:]") static const UGroup* LookupPosixGroup(const StringPiece& name) { - return LookupGroup(name, posix_groups, num_posix_groups); -} - + return LookupGroup(name, posix_groups, num_posix_groups); +} + static const UGroup* LookupPerlGroup(const StringPiece& name) { - return LookupGroup(name, perl_groups, num_perl_groups); -} - + return LookupGroup(name, perl_groups, num_perl_groups); +} + #if !defined(RE2_USE_ICU) // Fake UGroup containing all Runes static URange16 any16[] = { { 0, 65535 } }; static URange32 any32[] = { { 65536, Runemax } }; static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; -// Look for a Unicode group with the given name (e.g., "Han") +// Look for a Unicode group with the given name (e.g., "Han") static const UGroup* LookupUnicodeGroup(const StringPiece& name) { - // Special case: "Any" means any. - if (name == StringPiece("Any")) - return &anygroup; - return LookupGroup(name, unicode_groups, num_unicode_groups); -} + // Special case: "Any" means any. + if (name == StringPiece("Any")) + return &anygroup; + return LookupGroup(name, unicode_groups, num_unicode_groups); +} #endif - -// Add a UGroup or its negation to the character class. + +// Add a UGroup or its negation to the character class. static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, Regexp::ParseFlags parse_flags) { - if (sign == +1) { - for (int i = 0; i < g->nr16; i++) { - cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags); - } - for (int i = 0; i < g->nr32; i++) { - cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags); - } - } else { - if (parse_flags & Regexp::FoldCase) { - // Normally adding a case-folded group means - // adding all the extra fold-equivalent runes too. - // But if we're adding the negation of the group, - // we have to exclude all the runes that are fold-equivalent - // to what's already missing. Too hard, so do in two steps. - CharClassBuilder ccb1; - AddUGroup(&ccb1, g, +1, parse_flags); + if (sign == +1) { + for (int i = 0; i < g->nr16; i++) { + cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags); + } + for (int i = 0; i < g->nr32; i++) { + cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags); + } + } else { + if (parse_flags & Regexp::FoldCase) { + // Normally adding a case-folded group means + // adding all the extra fold-equivalent runes too. + // But if we're adding the negation of the group, + // we have to exclude all the runes that are fold-equivalent + // to what's already missing. Too hard, so do in two steps. + CharClassBuilder ccb1; + AddUGroup(&ccb1, g, +1, parse_flags); // If the flags say to take out \n, put it in, so that negating will take it out. // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags. bool cutnl = !(parse_flags & Regexp::ClassNL) || @@ -1682,115 +1682,115 @@ static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, if (cutnl) { ccb1.AddRange('\n', '\n'); } - ccb1.Negate(); - cc->AddCharClass(&ccb1); - return; - } - int next = 0; - for (int i = 0; i < g->nr16; i++) { - if (next < g->r16[i].lo) - cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); - next = g->r16[i].hi + 1; - } - for (int i = 0; i < g->nr32; i++) { - if (next < g->r32[i].lo) - cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); - next = g->r32[i].hi + 1; - } - if (next <= Runemax) - cc->AddRangeFlags(next, Runemax, parse_flags); - } -} - -// Maybe parse a Perl character class escape sequence. -// Only recognizes the Perl character classes (\d \s \w \D \S \W), -// not the Perl empty-string classes (\b \B \A \Z \z). -// On success, sets *s to span the remainder of the string -// and returns the corresponding UGroup. -// The StringPiece must *NOT* be edited unless the call succeeds. + ccb1.Negate(); + cc->AddCharClass(&ccb1); + return; + } + int next = 0; + for (int i = 0; i < g->nr16; i++) { + if (next < g->r16[i].lo) + cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); + next = g->r16[i].hi + 1; + } + for (int i = 0; i < g->nr32; i++) { + if (next < g->r32[i].lo) + cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); + next = g->r32[i].hi + 1; + } + if (next <= Runemax) + cc->AddRangeFlags(next, Runemax, parse_flags); + } +} + +// Maybe parse a Perl character class escape sequence. +// Only recognizes the Perl character classes (\d \s \w \D \S \W), +// not the Perl empty-string classes (\b \B \A \Z \z). +// On success, sets *s to span the remainder of the string +// and returns the corresponding UGroup. +// The StringPiece must *NOT* be edited unless the call succeeds. const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { - if (!(parse_flags & Regexp::PerlClasses)) - return NULL; - if (s->size() < 2 || (*s)[0] != '\\') - return NULL; - // Could use StringPieceToRune, but there aren't - // any non-ASCII Perl group names. + if (!(parse_flags & Regexp::PerlClasses)) + return NULL; + if (s->size() < 2 || (*s)[0] != '\\') + return NULL; + // Could use StringPieceToRune, but there aren't + // any non-ASCII Perl group names. StringPiece name(s->data(), 2); const UGroup *g = LookupPerlGroup(name); - if (g == NULL) - return NULL; - s->remove_prefix(name.size()); - return g; -} - -enum ParseStatus { - kParseOk, // Did some parsing. - kParseError, // Found an error. - kParseNothing, // Decided not to parse. -}; - -// Maybe parses a Unicode character group like \p{Han} or \P{Han} -// (the latter is a negated group). -ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, - CharClassBuilder *cc, - RegexpStatus* status) { - // Decide whether to parse. - if (!(parse_flags & Regexp::UnicodeGroups)) - return kParseNothing; - if (s->size() < 2 || (*s)[0] != '\\') - return kParseNothing; - Rune c = (*s)[1]; - if (c != 'p' && c != 'P') - return kParseNothing; - - // Committed to parse. Results: - int sign = +1; // -1 = negated char class - if (c == 'P') + if (g == NULL) + return NULL; + s->remove_prefix(name.size()); + return g; +} + +enum ParseStatus { + kParseOk, // Did some parsing. + kParseError, // Found an error. + kParseNothing, // Decided not to parse. +}; + +// Maybe parses a Unicode character group like \p{Han} or \P{Han} +// (the latter is a negated group). +ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, + CharClassBuilder *cc, + RegexpStatus* status) { + // Decide whether to parse. + if (!(parse_flags & Regexp::UnicodeGroups)) + return kParseNothing; + if (s->size() < 2 || (*s)[0] != '\\') + return kParseNothing; + Rune c = (*s)[1]; + if (c != 'p' && c != 'P') + return kParseNothing; + + // Committed to parse. Results: + int sign = +1; // -1 = negated char class + if (c == 'P') sign = -sign; - StringPiece seq = *s; // \p{Han} or \pL - StringPiece name; // Han or L - s->remove_prefix(2); // '\\', 'p' - - if (!StringPieceToRune(&c, s, status)) - return kParseError; - if (c != '{') { - // Name is the bit of string we just skipped over for c. + StringPiece seq = *s; // \p{Han} or \pL + StringPiece name; // Han or L + s->remove_prefix(2); // '\\', 'p' + + if (!StringPieceToRune(&c, s, status)) + return kParseError; + if (c != '{') { + // Name is the bit of string we just skipped over for c. const char* p = seq.data() + 2; name = StringPiece(p, static_cast<size_t>(s->data() - p)); - } else { - // Name is in braces. Look for closing } + } else { + // Name is in braces. Look for closing } size_t end = s->find('}', 0); if (end == StringPiece::npos) { - if (!IsValidUTF8(seq, status)) - return kParseError; - status->set_code(kRegexpBadCharRange); - status->set_error_arg(seq); - return kParseError; - } + if (!IsValidUTF8(seq, status)) + return kParseError; + status->set_code(kRegexpBadCharRange); + status->set_error_arg(seq); + return kParseError; + } name = StringPiece(s->data(), end); // without '}' - s->remove_prefix(end + 1); // with '}' - if (!IsValidUTF8(name, status)) - return kParseError; - } - - // Chop seq where s now begins. + s->remove_prefix(end + 1); // with '}' + if (!IsValidUTF8(name, status)) + return kParseError; + } + + // Chop seq where s now begins. seq = StringPiece(seq.data(), static_cast<size_t>(s->data() - seq.data())); - + if (!name.empty() && name[0] == '^') { - sign = -sign; - name.remove_prefix(1); // '^' - } + sign = -sign; + name.remove_prefix(1); // '^' + } #if !defined(RE2_USE_ICU) // Look up the group in the RE2 Unicode data. const UGroup *g = LookupUnicodeGroup(name); - if (g == NULL) { - status->set_code(kRegexpBadCharRange); - status->set_error_arg(seq); - return kParseError; - } - - AddUGroup(cc, g, sign, parse_flags); + if (g == NULL) { + status->set_code(kRegexpBadCharRange); + status->set_error_arg(seq); + return kParseError; + } + + AddUGroup(cc, g, sign, parse_flags); #else // Look up the group in the ICU Unicode data. Because ICU provides full // Unicode properties support, this could be more than a lookup by name. @@ -1815,210 +1815,210 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, AddUGroup(cc, &g, sign, parse_flags); #endif - return kParseOk; -} - -// Parses a character class name like [:alnum:]. -// Sets *s to span the remainder of the string. -// Adds the ranges corresponding to the class to ranges. -static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, - CharClassBuilder *cc, - RegexpStatus* status) { - // Check begins with [: - const char* p = s->data(); - const char* ep = s->data() + s->size(); - if (ep - p < 2 || p[0] != '[' || p[1] != ':') - return kParseNothing; - - // Look for closing :]. - const char* q; - for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) - ; - - // If no closing :], then ignore. - if (q > ep-2) - return kParseNothing; - - // Got it. Check that it's valid. - q += 2; + return kParseOk; +} + +// Parses a character class name like [:alnum:]. +// Sets *s to span the remainder of the string. +// Adds the ranges corresponding to the class to ranges. +static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, + CharClassBuilder *cc, + RegexpStatus* status) { + // Check begins with [: + const char* p = s->data(); + const char* ep = s->data() + s->size(); + if (ep - p < 2 || p[0] != '[' || p[1] != ':') + return kParseNothing; + + // Look for closing :]. + const char* q; + for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) + ; + + // If no closing :], then ignore. + if (q > ep-2) + return kParseNothing; + + // Got it. Check that it's valid. + q += 2; StringPiece name(p, static_cast<size_t>(q - p)); - + const UGroup *g = LookupPosixGroup(name); - if (g == NULL) { - status->set_code(kRegexpBadCharRange); - status->set_error_arg(name); - return kParseError; - } - - s->remove_prefix(name.size()); - AddUGroup(cc, g, g->sign, parse_flags); - return kParseOk; -} - -// Parses a character inside a character class. -// There are fewer special characters here than in the rest of the regexp. -// Sets *s to span the remainder of the string. -// Sets *rp to the character. -bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, - const StringPiece& whole_class, - RegexpStatus* status) { + if (g == NULL) { + status->set_code(kRegexpBadCharRange); + status->set_error_arg(name); + return kParseError; + } + + s->remove_prefix(name.size()); + AddUGroup(cc, g, g->sign, parse_flags); + return kParseOk; +} + +// Parses a character inside a character class. +// There are fewer special characters here than in the rest of the regexp. +// Sets *s to span the remainder of the string. +// Sets *rp to the character. +bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, + const StringPiece& whole_class, + RegexpStatus* status) { if (s->empty()) { - status->set_code(kRegexpMissingBracket); - status->set_error_arg(whole_class); - return false; - } - - // Allow regular escape sequences even though - // many need not be escaped in this context. + status->set_code(kRegexpMissingBracket); + status->set_error_arg(whole_class); + return false; + } + + // Allow regular escape sequences even though + // many need not be escaped in this context. if ((*s)[0] == '\\') - return ParseEscape(s, rp, status, rune_max_); - - // Otherwise take the next rune. - return StringPieceToRune(rp, s, status) >= 0; -} - -// Parses a character class character, or, if the character -// is followed by a hyphen, parses a character class range. -// For single characters, rr->lo == rr->hi. -// Sets *s to span the remainder of the string. -// Sets *rp to the character. -bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, - const StringPiece& whole_class, - RegexpStatus* status) { - StringPiece os = *s; - if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) - return false; - // [a-] means (a|-), so check for final ]. - if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') { - s->remove_prefix(1); // '-' - if (!ParseCCCharacter(s, &rr->hi, whole_class, status)) - return false; - if (rr->hi < rr->lo) { - status->set_code(kRegexpBadCharRange); + return ParseEscape(s, rp, status, rune_max_); + + // Otherwise take the next rune. + return StringPieceToRune(rp, s, status) >= 0; +} + +// Parses a character class character, or, if the character +// is followed by a hyphen, parses a character class range. +// For single characters, rr->lo == rr->hi. +// Sets *s to span the remainder of the string. +// Sets *rp to the character. +bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, + const StringPiece& whole_class, + RegexpStatus* status) { + StringPiece os = *s; + if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) + return false; + // [a-] means (a|-), so check for final ]. + if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') { + s->remove_prefix(1); // '-' + if (!ParseCCCharacter(s, &rr->hi, whole_class, status)) + return false; + if (rr->hi < rr->lo) { + status->set_code(kRegexpBadCharRange); status->set_error_arg( StringPiece(os.data(), static_cast<size_t>(s->data() - os.data()))); - return false; - } - } else { - rr->hi = rr->lo; - } - return true; -} - -// Parses a possibly-negated character class expression like [^abx-z[:digit:]]. -// Sets *s to span the remainder of the string. -// Sets *out_re to the regexp for the class. -bool Regexp::ParseState::ParseCharClass(StringPiece* s, - Regexp** out_re, - RegexpStatus* status) { - StringPiece whole_class = *s; + return false; + } + } else { + rr->hi = rr->lo; + } + return true; +} + +// Parses a possibly-negated character class expression like [^abx-z[:digit:]]. +// Sets *s to span the remainder of the string. +// Sets *out_re to the regexp for the class. +bool Regexp::ParseState::ParseCharClass(StringPiece* s, + Regexp** out_re, + RegexpStatus* status) { + StringPiece whole_class = *s; if (s->empty() || (*s)[0] != '[') { - // Caller checked this. - status->set_code(kRegexpInternalError); + // Caller checked this. + status->set_code(kRegexpInternalError); status->set_error_arg(StringPiece()); - return false; - } - bool negated = false; - Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); - re->ccb_ = new CharClassBuilder; - s->remove_prefix(1); // '[' + return false; + } + bool negated = false; + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + s->remove_prefix(1); // '[' if (!s->empty() && (*s)[0] == '^') { - s->remove_prefix(1); // '^' - negated = true; - if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { - // If NL can't match implicitly, then pretend - // negated classes include a leading \n. - re->ccb_->AddRange('\n', '\n'); - } - } - bool first = true; // ] is okay as first char in class + s->remove_prefix(1); // '^' + negated = true; + if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { + // If NL can't match implicitly, then pretend + // negated classes include a leading \n. + re->ccb_->AddRange('\n', '\n'); + } + } + bool first = true; // ] is okay as first char in class while (!s->empty() && ((*s)[0] != ']' || first)) { - // - is only okay unescaped as first or last in class. - // Except that Perl allows - anywhere. - if ((*s)[0] == '-' && !first && !(flags_&PerlX) && - (s->size() == 1 || (*s)[1] != ']')) { - StringPiece t = *s; - t.remove_prefix(1); // '-' - Rune r; - int n = StringPieceToRune(&r, &t, status); - if (n < 0) { - re->Decref(); - return false; - } - status->set_code(kRegexpBadCharRange); - status->set_error_arg(StringPiece(s->data(), 1+n)); - re->Decref(); - return false; - } - first = false; - - // Look for [:alnum:] etc. - if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { - switch (ParseCCName(s, flags_, re->ccb_, status)) { - case kParseOk: - continue; - case kParseError: - re->Decref(); - return false; - case kParseNothing: - break; - } - } - - // Look for Unicode character group like \p{Han} - if (s->size() > 2 && - (*s)[0] == '\\' && - ((*s)[1] == 'p' || (*s)[1] == 'P')) { - switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) { - case kParseOk: - continue; - case kParseError: - re->Decref(); - return false; - case kParseNothing: - break; - } - } - - // Look for Perl character class symbols (extension). + // - is only okay unescaped as first or last in class. + // Except that Perl allows - anywhere. + if ((*s)[0] == '-' && !first && !(flags_&PerlX) && + (s->size() == 1 || (*s)[1] != ']')) { + StringPiece t = *s; + t.remove_prefix(1); // '-' + Rune r; + int n = StringPieceToRune(&r, &t, status); + if (n < 0) { + re->Decref(); + return false; + } + status->set_code(kRegexpBadCharRange); + status->set_error_arg(StringPiece(s->data(), 1+n)); + re->Decref(); + return false; + } + first = false; + + // Look for [:alnum:] etc. + if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { + switch (ParseCCName(s, flags_, re->ccb_, status)) { + case kParseOk: + continue; + case kParseError: + re->Decref(); + return false; + case kParseNothing: + break; + } + } + + // Look for Unicode character group like \p{Han} + if (s->size() > 2 && + (*s)[0] == '\\' && + ((*s)[1] == 'p' || (*s)[1] == 'P')) { + switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) { + case kParseOk: + continue; + case kParseError: + re->Decref(); + return false; + case kParseNothing: + break; + } + } + + // Look for Perl character class symbols (extension). const UGroup *g = MaybeParsePerlCCEscape(s, flags_); - if (g != NULL) { - AddUGroup(re->ccb_, g, g->sign, flags_); - continue; - } - - // Otherwise assume single character or simple range. - RuneRange rr; - if (!ParseCCRange(s, &rr, whole_class, status)) { - re->Decref(); - return false; - } - // AddRangeFlags is usually called in response to a class like - // \p{Foo} or [[:foo:]]; for those, it filters \n out unless - // Regexp::ClassNL is set. In an explicit range or singleton - // like we just parsed, we do not filter \n out, so set ClassNL - // in the flags. - re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); - } + if (g != NULL) { + AddUGroup(re->ccb_, g, g->sign, flags_); + continue; + } + + // Otherwise assume single character or simple range. + RuneRange rr; + if (!ParseCCRange(s, &rr, whole_class, status)) { + re->Decref(); + return false; + } + // AddRangeFlags is usually called in response to a class like + // \p{Foo} or [[:foo:]]; for those, it filters \n out unless + // Regexp::ClassNL is set. In an explicit range or singleton + // like we just parsed, we do not filter \n out, so set ClassNL + // in the flags. + re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); + } if (s->empty()) { - status->set_code(kRegexpMissingBracket); - status->set_error_arg(whole_class); - re->Decref(); - return false; - } - s->remove_prefix(1); // ']' - - if (negated) - re->ccb_->Negate(); - - *out_re = re; - return true; -} - + status->set_code(kRegexpMissingBracket); + status->set_error_arg(whole_class); + re->Decref(); + return false; + } + s->remove_prefix(1); // ']' + + if (negated) + re->ccb_->Negate(); + + *out_re = re; + return true; +} + // Returns whether name is a valid capture name. -static bool IsValidCaptureName(const StringPiece& name) { +static bool IsValidCaptureName(const StringPiece& name) { if (name.empty()) - return false; + return false; // Historically, we effectively used [0-9A-Za-z_]+ to validate; that // followed Python 2 except for not restricting the first character. @@ -2043,230 +2043,230 @@ static bool IsValidCaptureName(const StringPiece& name) { if (StringPieceToRune(&r, &t, NULL) < 0) return false; if (cc->Contains(r)) - continue; - return false; - } - return true; -} - -// Parses a Perl flag setting or non-capturing group or both, -// like (?i) or (?: or (?i:. Removes from s, updates parse state. -// The caller must check that s begins with "(?". -// Returns true on success. If the Perl flag is not -// well-formed or not supported, sets status_ and returns false. -bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { - StringPiece t = *s; - - // Caller is supposed to check this. - if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { - LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; - status_->set_code(kRegexpInternalError); - return false; - } - - t.remove_prefix(2); // "(?" - - // Check for named captures, first introduced in Python's regexp library. - // As usual, there are three slightly different syntaxes: - // - // (?P<name>expr) the original, introduced by Python - // (?<name>expr) the .NET alteration, adopted by Perl 5.10 - // (?'name'expr) another .NET alteration, adopted by Perl 5.10 - // - // Perl 5.10 gave in and implemented the Python version too, - // but they claim that the last two are the preferred forms. - // PCRE and languages based on it (specifically, PHP and Ruby) - // support all three as well. EcmaScript 4 uses only the Python form. - // - // In both the open source world (via Code Search) and the - // Google source tree, (?P<expr>name) is the dominant form, - // so that's the one we implement. One is enough. - if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { - // Pull out name. + continue; + return false; + } + return true; +} + +// Parses a Perl flag setting or non-capturing group or both, +// like (?i) or (?: or (?i:. Removes from s, updates parse state. +// The caller must check that s begins with "(?". +// Returns true on success. If the Perl flag is not +// well-formed or not supported, sets status_ and returns false. +bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { + StringPiece t = *s; + + // Caller is supposed to check this. + if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { + LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; + status_->set_code(kRegexpInternalError); + return false; + } + + t.remove_prefix(2); // "(?" + + // Check for named captures, first introduced in Python's regexp library. + // As usual, there are three slightly different syntaxes: + // + // (?P<name>expr) the original, introduced by Python + // (?<name>expr) the .NET alteration, adopted by Perl 5.10 + // (?'name'expr) another .NET alteration, adopted by Perl 5.10 + // + // Perl 5.10 gave in and implemented the Python version too, + // but they claim that the last two are the preferred forms. + // PCRE and languages based on it (specifically, PHP and Ruby) + // support all three as well. EcmaScript 4 uses only the Python form. + // + // In both the open source world (via Code Search) and the + // Google source tree, (?P<expr>name) is the dominant form, + // so that's the one we implement. One is enough. + if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { + // Pull out name. size_t end = t.find('>', 2); if (end == StringPiece::npos) { - if (!IsValidUTF8(*s, status_)) - return false; - status_->set_code(kRegexpBadNamedCapture); - status_->set_error_arg(*s); - return false; - } - - // t is "P<name>...", t[end] == '>' + if (!IsValidUTF8(*s, status_)) + return false; + status_->set_code(kRegexpBadNamedCapture); + status_->set_error_arg(*s); + return false; + } + + // t is "P<name>...", t[end] == '>' StringPiece capture(t.data()-2, end+3); // "(?P<name>" StringPiece name(t.data()+2, end-2); // "name" - if (!IsValidUTF8(name, status_)) - return false; - if (!IsValidCaptureName(name)) { - status_->set_code(kRegexpBadNamedCapture); - status_->set_error_arg(capture); - return false; - } - - if (!DoLeftParen(name)) { - // DoLeftParen's failure set status_. - return false; - } - + if (!IsValidUTF8(name, status_)) + return false; + if (!IsValidCaptureName(name)) { + status_->set_code(kRegexpBadNamedCapture); + status_->set_error_arg(capture); + return false; + } + + if (!DoLeftParen(name)) { + // DoLeftParen's failure set status_. + return false; + } + s->remove_prefix( static_cast<size_t>(capture.data() + capture.size() - s->data())); - return true; - } - - bool negated = false; - bool sawflags = false; - int nflags = flags_; - Rune c; - for (bool done = false; !done; ) { + return true; + } + + bool negated = false; + bool sawflags = false; + int nflags = flags_; + Rune c; + for (bool done = false; !done; ) { if (t.empty()) - goto BadPerlOp; - if (StringPieceToRune(&c, &t, status_) < 0) - return false; - switch (c) { - default: - goto BadPerlOp; - - // Parse flags. - case 'i': - sawflags = true; - if (negated) - nflags &= ~FoldCase; - else - nflags |= FoldCase; - break; - - case 'm': // opposite of our OneLine - sawflags = true; - if (negated) - nflags |= OneLine; - else - nflags &= ~OneLine; - break; - - case 's': - sawflags = true; - if (negated) - nflags &= ~DotNL; - else - nflags |= DotNL; - break; - - case 'U': - sawflags = true; - if (negated) - nflags &= ~NonGreedy; - else - nflags |= NonGreedy; - break; - - // Negation - case '-': - if (negated) - goto BadPerlOp; - negated = true; - sawflags = false; - break; - - // Open new group. - case ':': - if (!DoLeftParenNoCapture()) { - // DoLeftParenNoCapture's failure set status_. - return false; - } - done = true; - break; - - // Finish flags. - case ')': - done = true; - break; - } - } - - if (negated && !sawflags) - goto BadPerlOp; - - flags_ = static_cast<Regexp::ParseFlags>(nflags); - *s = t; - return true; - -BadPerlOp: - status_->set_code(kRegexpBadPerlOp); + goto BadPerlOp; + if (StringPieceToRune(&c, &t, status_) < 0) + return false; + switch (c) { + default: + goto BadPerlOp; + + // Parse flags. + case 'i': + sawflags = true; + if (negated) + nflags &= ~FoldCase; + else + nflags |= FoldCase; + break; + + case 'm': // opposite of our OneLine + sawflags = true; + if (negated) + nflags |= OneLine; + else + nflags &= ~OneLine; + break; + + case 's': + sawflags = true; + if (negated) + nflags &= ~DotNL; + else + nflags |= DotNL; + break; + + case 'U': + sawflags = true; + if (negated) + nflags &= ~NonGreedy; + else + nflags |= NonGreedy; + break; + + // Negation + case '-': + if (negated) + goto BadPerlOp; + negated = true; + sawflags = false; + break; + + // Open new group. + case ':': + if (!DoLeftParenNoCapture()) { + // DoLeftParenNoCapture's failure set status_. + return false; + } + done = true; + break; + + // Finish flags. + case ')': + done = true; + break; + } + } + + if (negated && !sawflags) + goto BadPerlOp; + + flags_ = static_cast<Regexp::ParseFlags>(nflags); + *s = t; + return true; + +BadPerlOp: + status_->set_code(kRegexpBadPerlOp); status_->set_error_arg( StringPiece(s->data(), static_cast<size_t>(t.data() - s->data()))); - return false; -} - -// Converts latin1 (assumed to be encoded as Latin1 bytes) -// into UTF8 encoding in string. -// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is -// deprecated and because it rejects code points 0x80-0x9F. + return false; +} + +// Converts latin1 (assumed to be encoded as Latin1 bytes) +// into UTF8 encoding in string. +// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is +// deprecated and because it rejects code points 0x80-0x9F. void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) { - char buf[UTFmax]; - - utf->clear(); + char buf[UTFmax]; + + utf->clear(); for (size_t i = 0; i < latin1.size(); i++) { - Rune r = latin1[i] & 0xFF; - int n = runetochar(buf, &r); - utf->append(buf, n); - } -} - -// Parses the regular expression given by s, -// returning the corresponding Regexp tree. -// The caller must Decref the return value when done with it. -// Returns NULL on error. -Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, - RegexpStatus* status) { - // Make status non-NULL (easier on everyone else). - RegexpStatus xstatus; - if (status == NULL) - status = &xstatus; - - ParseState ps(global_flags, s, status); - StringPiece t = s; - - // Convert regexp to UTF-8 (easier on the rest of the parser). - if (global_flags & Latin1) { + Rune r = latin1[i] & 0xFF; + int n = runetochar(buf, &r); + utf->append(buf, n); + } +} + +// Parses the regular expression given by s, +// returning the corresponding Regexp tree. +// The caller must Decref the return value when done with it. +// Returns NULL on error. +Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, + RegexpStatus* status) { + // Make status non-NULL (easier on everyone else). + RegexpStatus xstatus; + if (status == NULL) + status = &xstatus; + + ParseState ps(global_flags, s, status); + StringPiece t = s; + + // Convert regexp to UTF-8 (easier on the rest of the parser). + if (global_flags & Latin1) { std::string* tmp = new std::string; - ConvertLatin1ToUTF8(t, tmp); - status->set_tmp(tmp); - t = *tmp; - } - - if (global_flags & Literal) { - // Special parse loop for literal string. + ConvertLatin1ToUTF8(t, tmp); + status->set_tmp(tmp); + t = *tmp; + } + + if (global_flags & Literal) { + // Special parse loop for literal string. while (!t.empty()) { - Rune r; - if (StringPieceToRune(&r, &t, status) < 0) - return NULL; - if (!ps.PushLiteral(r)) - return NULL; - } - return ps.DoFinish(); - } - + Rune r; + if (StringPieceToRune(&r, &t, status) < 0) + return NULL; + if (!ps.PushLiteral(r)) + return NULL; + } + return ps.DoFinish(); + } + StringPiece lastunary = StringPiece(); while (!t.empty()) { StringPiece isunary = StringPiece(); - switch (t[0]) { - default: { - Rune r; - if (StringPieceToRune(&r, &t, status) < 0) - return NULL; - if (!ps.PushLiteral(r)) - return NULL; - break; - } - - case '(': - // "(?" introduces Perl escape. - if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) { - // Flag changes and non-capturing groups. - if (!ps.ParsePerlFlags(&t)) - return NULL; - break; - } + switch (t[0]) { + default: { + Rune r; + if (StringPieceToRune(&r, &t, status) < 0) + return NULL; + if (!ps.PushLiteral(r)) + return NULL; + break; + } + + case '(': + // "(?" introduces Perl escape. + if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) { + // Flag changes and non-capturing groups. + if (!ps.ParsePerlFlags(&t)) + return NULL; + break; + } if (ps.flags() & NeverCapture) { if (!ps.DoLeftParenNoCapture()) return NULL; @@ -2274,210 +2274,210 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (!ps.DoLeftParen(StringPiece())) return NULL; } - t.remove_prefix(1); // '(' - break; - - case '|': - if (!ps.DoVerticalBar()) - return NULL; - t.remove_prefix(1); // '|' - break; - - case ')': - if (!ps.DoRightParen()) - return NULL; - t.remove_prefix(1); // ')' - break; - - case '^': // Beginning of line. + t.remove_prefix(1); // '(' + break; + + case '|': + if (!ps.DoVerticalBar()) + return NULL; + t.remove_prefix(1); // '|' + break; + + case ')': + if (!ps.DoRightParen()) + return NULL; + t.remove_prefix(1); // ')' + break; + + case '^': // Beginning of line. if (!ps.PushCaret()) - return NULL; - t.remove_prefix(1); // '^' - break; - - case '$': // End of line. - if (!ps.PushDollar()) - return NULL; - t.remove_prefix(1); // '$' - break; - - case '.': // Any character (possibly except newline). - if (!ps.PushDot()) - return NULL; - t.remove_prefix(1); // '.' - break; - - case '[': { // Character class. - Regexp* re; - if (!ps.ParseCharClass(&t, &re, status)) - return NULL; - if (!ps.PushRegexp(re)) - return NULL; - break; - } - - case '*': { // Zero or more. - RegexpOp op; - op = kRegexpStar; - goto Rep; - case '+': // One or more. - op = kRegexpPlus; - goto Rep; - case '?': // Zero or one. - op = kRegexpQuest; - goto Rep; - Rep: - StringPiece opstr = t; - bool nongreedy = false; - t.remove_prefix(1); // '*' or '+' or '?' - if (ps.flags() & PerlX) { + return NULL; + t.remove_prefix(1); // '^' + break; + + case '$': // End of line. + if (!ps.PushDollar()) + return NULL; + t.remove_prefix(1); // '$' + break; + + case '.': // Any character (possibly except newline). + if (!ps.PushDot()) + return NULL; + t.remove_prefix(1); // '.' + break; + + case '[': { // Character class. + Regexp* re; + if (!ps.ParseCharClass(&t, &re, status)) + return NULL; + if (!ps.PushRegexp(re)) + return NULL; + break; + } + + case '*': { // Zero or more. + RegexpOp op; + op = kRegexpStar; + goto Rep; + case '+': // One or more. + op = kRegexpPlus; + goto Rep; + case '?': // Zero or one. + op = kRegexpQuest; + goto Rep; + Rep: + StringPiece opstr = t; + bool nongreedy = false; + t.remove_prefix(1); // '*' or '+' or '?' + if (ps.flags() & PerlX) { if (!t.empty() && t[0] == '?') { - nongreedy = true; - t.remove_prefix(1); // '?' - } + nongreedy = true; + t.remove_prefix(1); // '?' + } if (!lastunary.empty()) { - // In Perl it is not allowed to stack repetition operators: - // a** is a syntax error, not a double-star. - // (and a++ means something else entirely, which we don't support!) - status->set_code(kRegexpRepeatOp); + // In Perl it is not allowed to stack repetition operators: + // a** is a syntax error, not a double-star. + // (and a++ means something else entirely, which we don't support!) + status->set_code(kRegexpRepeatOp); status->set_error_arg(StringPiece( lastunary.data(), static_cast<size_t>(t.data() - lastunary.data()))); - return NULL; - } - } + return NULL; + } + } opstr = StringPiece(opstr.data(), static_cast<size_t>(t.data() - opstr.data())); - if (!ps.PushRepeatOp(op, opstr, nongreedy)) - return NULL; - isunary = opstr; - break; - } - - case '{': { // Counted repetition. - int lo, hi; - StringPiece opstr = t; - if (!MaybeParseRepetition(&t, &lo, &hi)) { - // Treat like a literal. - if (!ps.PushLiteral('{')) - return NULL; - t.remove_prefix(1); // '{' - break; - } - bool nongreedy = false; - if (ps.flags() & PerlX) { + if (!ps.PushRepeatOp(op, opstr, nongreedy)) + return NULL; + isunary = opstr; + break; + } + + case '{': { // Counted repetition. + int lo, hi; + StringPiece opstr = t; + if (!MaybeParseRepetition(&t, &lo, &hi)) { + // Treat like a literal. + if (!ps.PushLiteral('{')) + return NULL; + t.remove_prefix(1); // '{' + break; + } + bool nongreedy = false; + if (ps.flags() & PerlX) { if (!t.empty() && t[0] == '?') { - nongreedy = true; - t.remove_prefix(1); // '?' - } + nongreedy = true; + t.remove_prefix(1); // '?' + } if (!lastunary.empty()) { - // Not allowed to stack repetition operators. - status->set_code(kRegexpRepeatOp); + // Not allowed to stack repetition operators. + status->set_code(kRegexpRepeatOp); status->set_error_arg(StringPiece( lastunary.data(), static_cast<size_t>(t.data() - lastunary.data()))); - return NULL; - } - } + return NULL; + } + } opstr = StringPiece(opstr.data(), static_cast<size_t>(t.data() - opstr.data())); - if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) - return NULL; - isunary = opstr; - break; - } - - case '\\': { // Escaped character or Perl sequence. - // \b and \B: word boundary or not - if ((ps.flags() & Regexp::PerlB) && - t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) { - if (!ps.PushWordBoundary(t[1] == 'b')) - return NULL; - t.remove_prefix(2); // '\\', 'b' - break; - } - - if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) { - if (t[1] == 'A') { - if (!ps.PushSimpleOp(kRegexpBeginText)) - return NULL; - t.remove_prefix(2); // '\\', 'A' - break; - } - if (t[1] == 'z') { - if (!ps.PushSimpleOp(kRegexpEndText)) - return NULL; - t.remove_prefix(2); // '\\', 'z' - break; - } - // Do not recognize \Z, because this library can't - // implement the exact Perl/PCRE semantics. - // (This library treats "(?-m)$" as \z, even though - // in Perl and PCRE it is equivalent to \Z.) - - if (t[1] == 'C') { // \C: any byte [sic] - if (!ps.PushSimpleOp(kRegexpAnyByte)) - return NULL; - t.remove_prefix(2); // '\\', 'C' - break; - } - - if (t[1] == 'Q') { // \Q ... \E: the ... is always literals - t.remove_prefix(2); // '\\', 'Q' + if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) + return NULL; + isunary = opstr; + break; + } + + case '\\': { // Escaped character or Perl sequence. + // \b and \B: word boundary or not + if ((ps.flags() & Regexp::PerlB) && + t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) { + if (!ps.PushWordBoundary(t[1] == 'b')) + return NULL; + t.remove_prefix(2); // '\\', 'b' + break; + } + + if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) { + if (t[1] == 'A') { + if (!ps.PushSimpleOp(kRegexpBeginText)) + return NULL; + t.remove_prefix(2); // '\\', 'A' + break; + } + if (t[1] == 'z') { + if (!ps.PushSimpleOp(kRegexpEndText)) + return NULL; + t.remove_prefix(2); // '\\', 'z' + break; + } + // Do not recognize \Z, because this library can't + // implement the exact Perl/PCRE semantics. + // (This library treats "(?-m)$" as \z, even though + // in Perl and PCRE it is equivalent to \Z.) + + if (t[1] == 'C') { // \C: any byte [sic] + if (!ps.PushSimpleOp(kRegexpAnyByte)) + return NULL; + t.remove_prefix(2); // '\\', 'C' + break; + } + + if (t[1] == 'Q') { // \Q ... \E: the ... is always literals + t.remove_prefix(2); // '\\', 'Q' while (!t.empty()) { - if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { - t.remove_prefix(2); // '\\', 'E' - break; - } - Rune r; - if (StringPieceToRune(&r, &t, status) < 0) - return NULL; - if (!ps.PushLiteral(r)) - return NULL; - } - break; - } - } - - if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) { - Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); - re->ccb_ = new CharClassBuilder; - switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { - case kParseOk: - if (!ps.PushRegexp(re)) - return NULL; - goto Break2; - case kParseError: - re->Decref(); - return NULL; - case kParseNothing: - re->Decref(); - break; - } - } - + if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { + t.remove_prefix(2); // '\\', 'E' + break; + } + Rune r; + if (StringPieceToRune(&r, &t, status) < 0) + return NULL; + if (!ps.PushLiteral(r)) + return NULL; + } + break; + } + } + + if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) { + Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); + re->ccb_ = new CharClassBuilder; + switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { + case kParseOk: + if (!ps.PushRegexp(re)) + return NULL; + goto Break2; + case kParseError: + re->Decref(); + return NULL; + case kParseNothing: + re->Decref(); + break; + } + } + const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); - if (g != NULL) { - Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); - re->ccb_ = new CharClassBuilder; - AddUGroup(re->ccb_, g, g->sign, ps.flags()); - if (!ps.PushRegexp(re)) - return NULL; - break; - } - - Rune r; - if (!ParseEscape(&t, &r, status, ps.rune_max())) - return NULL; - if (!ps.PushLiteral(r)) - return NULL; - break; - } - } - Break2: - lastunary = isunary; - } - return ps.DoFinish(); -} - -} // namespace re2 + if (g != NULL) { + Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); + re->ccb_ = new CharClassBuilder; + AddUGroup(re->ccb_, g, g->sign, ps.flags()); + if (!ps.PushRegexp(re)) + return NULL; + break; + } + + Rune r; + if (!ParseEscape(&t, &r, status, ps.rune_max())) + return NULL; + if (!ps.PushLiteral(r)) + return NULL; + break; + } + } + Break2: + lastunary = isunary; + } + return ps.DoFinish(); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/perl_groups.cc b/contrib/libs/re2/re2/perl_groups.cc index c8f4dbde5e..4687444581 100644 --- a/contrib/libs/re2/re2/perl_groups.cc +++ b/contrib/libs/re2/re2/perl_groups.cc @@ -1,24 +1,24 @@ -// GENERATED BY make_perl_groups.pl; DO NOT EDIT. -// make_perl_groups.pl >perl_groups.cc - -#include "re2/unicode_groups.h" - -namespace re2 { - +// GENERATED BY make_perl_groups.pl; DO NOT EDIT. +// make_perl_groups.pl >perl_groups.cc + +#include "re2/unicode_groups.h" + +namespace re2 { + static const URange16 code1[] = { /* \d */ - { 0x30, 0x39 }, -}; + { 0x30, 0x39 }, +}; static const URange16 code2[] = { /* \s */ - { 0x9, 0xa }, - { 0xc, 0xd }, - { 0x20, 0x20 }, -}; + { 0x9, 0xa }, + { 0xc, 0xd }, + { 0x20, 0x20 }, +}; static const URange16 code3[] = { /* \w */ - { 0x30, 0x39 }, - { 0x41, 0x5a }, - { 0x5f, 0x5f }, - { 0x61, 0x7a }, -}; + { 0x30, 0x39 }, + { 0x41, 0x5a }, + { 0x5f, 0x5f }, + { 0x61, 0x7a }, +}; const UGroup perl_groups[] = { { "\\d", +1, code1, 1, 0, 0 }, { "\\D", -1, code1, 1, 0, 0 }, @@ -26,64 +26,64 @@ const UGroup perl_groups[] = { { "\\S", -1, code2, 3, 0, 0 }, { "\\w", +1, code3, 4, 0, 0 }, { "\\W", -1, code3, 4, 0, 0 }, -}; +}; const int num_perl_groups = 6; static const URange16 code4[] = { /* [:alnum:] */ - { 0x30, 0x39 }, - { 0x41, 0x5a }, - { 0x61, 0x7a }, -}; + { 0x30, 0x39 }, + { 0x41, 0x5a }, + { 0x61, 0x7a }, +}; static const URange16 code5[] = { /* [:alpha:] */ - { 0x41, 0x5a }, - { 0x61, 0x7a }, -}; + { 0x41, 0x5a }, + { 0x61, 0x7a }, +}; static const URange16 code6[] = { /* [:ascii:] */ - { 0x0, 0x7f }, -}; + { 0x0, 0x7f }, +}; static const URange16 code7[] = { /* [:blank:] */ - { 0x9, 0x9 }, - { 0x20, 0x20 }, -}; + { 0x9, 0x9 }, + { 0x20, 0x20 }, +}; static const URange16 code8[] = { /* [:cntrl:] */ - { 0x0, 0x1f }, - { 0x7f, 0x7f }, -}; + { 0x0, 0x1f }, + { 0x7f, 0x7f }, +}; static const URange16 code9[] = { /* [:digit:] */ - { 0x30, 0x39 }, -}; + { 0x30, 0x39 }, +}; static const URange16 code10[] = { /* [:graph:] */ - { 0x21, 0x7e }, -}; + { 0x21, 0x7e }, +}; static const URange16 code11[] = { /* [:lower:] */ - { 0x61, 0x7a }, -}; + { 0x61, 0x7a }, +}; static const URange16 code12[] = { /* [:print:] */ - { 0x20, 0x7e }, -}; + { 0x20, 0x7e }, +}; static const URange16 code13[] = { /* [:punct:] */ - { 0x21, 0x2f }, - { 0x3a, 0x40 }, - { 0x5b, 0x60 }, - { 0x7b, 0x7e }, -}; + { 0x21, 0x2f }, + { 0x3a, 0x40 }, + { 0x5b, 0x60 }, + { 0x7b, 0x7e }, +}; static const URange16 code14[] = { /* [:space:] */ - { 0x9, 0xd }, - { 0x20, 0x20 }, -}; + { 0x9, 0xd }, + { 0x20, 0x20 }, +}; static const URange16 code15[] = { /* [:upper:] */ - { 0x41, 0x5a }, -}; + { 0x41, 0x5a }, +}; static const URange16 code16[] = { /* [:word:] */ - { 0x30, 0x39 }, - { 0x41, 0x5a }, - { 0x5f, 0x5f }, - { 0x61, 0x7a }, -}; + { 0x30, 0x39 }, + { 0x41, 0x5a }, + { 0x5f, 0x5f }, + { 0x61, 0x7a }, +}; static const URange16 code17[] = { /* [:xdigit:] */ - { 0x30, 0x39 }, - { 0x41, 0x46 }, - { 0x61, 0x66 }, -}; + { 0x30, 0x39 }, + { 0x41, 0x46 }, + { 0x61, 0x66 }, +}; const UGroup posix_groups[] = { { "[:alnum:]", +1, code4, 3, 0, 0 }, { "[:^alnum:]", -1, code4, 3, 0, 0 }, @@ -113,7 +113,7 @@ const UGroup posix_groups[] = { { "[:^word:]", -1, code16, 4, 0, 0 }, { "[:xdigit:]", +1, code17, 3, 0, 0 }, { "[:^xdigit:]", -1, code17, 3, 0, 0 }, -}; +}; const int num_posix_groups = 28; - -} // namespace re2 + +} // namespace re2 diff --git a/contrib/libs/re2/re2/prefilter.cc b/contrib/libs/re2/re2/prefilter.cc index 6a9a670381..a47b3120fb 100644 --- a/contrib/libs/re2/re2/prefilter.cc +++ b/contrib/libs/re2/re2/prefilter.cc @@ -1,8 +1,8 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "re2/prefilter.h" +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re2/prefilter.h" #include <stddef.h> #include <stdint.h> @@ -15,163 +15,163 @@ #include "util/utf.h" #include "re2/re2.h" #include "re2/unicode_casefold.h" -#include "re2/walker-inl.h" - -namespace re2 { - +#include "re2/walker-inl.h" + +namespace re2 { + static const bool ExtraDebug = false; - + typedef std::set<std::string>::iterator SSIter; typedef std::set<std::string>::const_iterator ConstSSIter; - -// Initializes a Prefilter, allocating subs_ as necessary. -Prefilter::Prefilter(Op op) { - op_ = op; - subs_ = NULL; - if (op_ == AND || op_ == OR) + +// Initializes a Prefilter, allocating subs_ as necessary. +Prefilter::Prefilter(Op op) { + op_ = op; + subs_ = NULL; + if (op_ == AND || op_ == OR) subs_ = new std::vector<Prefilter*>; -} - -// Destroys a Prefilter. -Prefilter::~Prefilter() { - if (subs_) { +} + +// Destroys a Prefilter. +Prefilter::~Prefilter() { + if (subs_) { for (size_t i = 0; i < subs_->size(); i++) - delete (*subs_)[i]; - delete subs_; - subs_ = NULL; - } -} - -// Simplify if the node is an empty Or or And. -Prefilter* Prefilter::Simplify() { - if (op_ != AND && op_ != OR) { - return this; - } - - // Nothing left in the AND/OR. + delete (*subs_)[i]; + delete subs_; + subs_ = NULL; + } +} + +// Simplify if the node is an empty Or or And. +Prefilter* Prefilter::Simplify() { + if (op_ != AND && op_ != OR) { + return this; + } + + // Nothing left in the AND/OR. if (subs_->empty()) { - if (op_ == AND) - op_ = ALL; // AND of nothing is true - else - op_ = NONE; // OR of nothing is false - - return this; - } - - // Just one subnode: throw away wrapper. - if (subs_->size() == 1) { - Prefilter* a = (*subs_)[0]; - subs_->clear(); - delete this; - return a->Simplify(); - } - - return this; -} - -// Combines two Prefilters together to create an "op" (AND or OR). -// The passed Prefilters will be part of the returned Prefilter or deleted. -// Does lots of work to avoid creating unnecessarily complicated structures. -Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) { - // If a, b can be rewritten as op, do so. - a = a->Simplify(); - b = b->Simplify(); - - // Canonicalize: a->op <= b->op. - if (a->op() > b->op()) { - Prefilter* t = a; - a = b; - b = t; - } - - // Trivial cases. - // ALL AND b = b - // NONE OR b = b - // ALL OR b = ALL - // NONE AND b = NONE - // Don't need to look at b, because of canonicalization above. - // ALL and NONE are smallest opcodes. - if (a->op() == ALL || a->op() == NONE) { - if ((a->op() == ALL && op == AND) || - (a->op() == NONE && op == OR)) { - delete a; - return b; - } else { - delete b; - return a; - } - } - - // If a and b match op, merge their contents. - if (a->op() == op && b->op() == op) { + if (op_ == AND) + op_ = ALL; // AND of nothing is true + else + op_ = NONE; // OR of nothing is false + + return this; + } + + // Just one subnode: throw away wrapper. + if (subs_->size() == 1) { + Prefilter* a = (*subs_)[0]; + subs_->clear(); + delete this; + return a->Simplify(); + } + + return this; +} + +// Combines two Prefilters together to create an "op" (AND or OR). +// The passed Prefilters will be part of the returned Prefilter or deleted. +// Does lots of work to avoid creating unnecessarily complicated structures. +Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) { + // If a, b can be rewritten as op, do so. + a = a->Simplify(); + b = b->Simplify(); + + // Canonicalize: a->op <= b->op. + if (a->op() > b->op()) { + Prefilter* t = a; + a = b; + b = t; + } + + // Trivial cases. + // ALL AND b = b + // NONE OR b = b + // ALL OR b = ALL + // NONE AND b = NONE + // Don't need to look at b, because of canonicalization above. + // ALL and NONE are smallest opcodes. + if (a->op() == ALL || a->op() == NONE) { + if ((a->op() == ALL && op == AND) || + (a->op() == NONE && op == OR)) { + delete a; + return b; + } else { + delete b; + return a; + } + } + + // If a and b match op, merge their contents. + if (a->op() == op && b->op() == op) { for (size_t i = 0; i < b->subs()->size(); i++) { - Prefilter* bb = (*b->subs())[i]; - a->subs()->push_back(bb); - } - b->subs()->clear(); - delete b; - return a; - } - - // If a already has the same op as the op that is under construction - // add in b (similarly if b already has the same op, add in a). - if (b->op() == op) { - Prefilter* t = a; - a = b; - b = t; - } - if (a->op() == op) { - a->subs()->push_back(b); - return a; - } - - // Otherwise just return the op. - Prefilter* c = new Prefilter(op); - c->subs()->push_back(a); - c->subs()->push_back(b); - return c; -} - -Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) { - return AndOr(AND, a, b); -} - -Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { - return AndOr(OR, a, b); -} - + Prefilter* bb = (*b->subs())[i]; + a->subs()->push_back(bb); + } + b->subs()->clear(); + delete b; + return a; + } + + // If a already has the same op as the op that is under construction + // add in b (similarly if b already has the same op, add in a). + if (b->op() == op) { + Prefilter* t = a; + a = b; + b = t; + } + if (a->op() == op) { + a->subs()->push_back(b); + return a; + } + + // Otherwise just return the op. + Prefilter* c = new Prefilter(op); + c->subs()->push_back(a); + c->subs()->push_back(b); + return c; +} + +Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) { + return AndOr(AND, a, b); +} + +Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { + return AndOr(OR, a, b); +} + static void SimplifyStringSet(std::set<std::string>* ss) { - // Now make sure that the strings aren't redundant. For example, if - // we know "ab" is a required string, then it doesn't help at all to - // know that "abc" is also a required string, so delete "abc". This - // is because, when we are performing a string search to filter + // Now make sure that the strings aren't redundant. For example, if + // we know "ab" is a required string, then it doesn't help at all to + // know that "abc" is also a required string, so delete "abc". This + // is because, when we are performing a string search to filter // regexps, matching "ab" will already allow this regexp to be a // candidate for match, so further matching "abc" is redundant. // Note that we must ignore "" because find() would find it at the // start of everything and thus we would end up erasing everything. - for (SSIter i = ss->begin(); i != ss->end(); ++i) { + for (SSIter i = ss->begin(); i != ss->end(); ++i) { if (i->empty()) continue; - SSIter j = i; - ++j; - while (j != ss->end()) { + SSIter j = i; + ++j; + while (j != ss->end()) { if (j->find(*i) != std::string::npos) { j = ss->erase(j); continue; } - ++j; - } - } -} - + ++j; + } + } +} + Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) { Prefilter* or_prefilter = new Prefilter(NONE); - SimplifyStringSet(ss); + SimplifyStringSet(ss); for (SSIter i = ss->begin(); i != ss->end(); ++i) or_prefilter = Or(or_prefilter, FromString(*i)); - return or_prefilter; -} - + return or_prefilter; +} + static Rune ToLowerRune(Rune r) { if (r < Runeself) { if ('A' <= r && r <= 'Z') @@ -192,221 +192,221 @@ static Rune ToLowerRuneLatin1(Rune r) { } Prefilter* Prefilter::FromString(const std::string& str) { - Prefilter* m = new Prefilter(Prefilter::ATOM); + Prefilter* m = new Prefilter(Prefilter::ATOM); m->atom_ = str; - return m; -} - -// Information about a regexp used during computation of Prefilter. -// Can be thought of as information about the set of strings matching -// the given regular expression. -class Prefilter::Info { - public: - Info(); - ~Info(); - - // More constructors. They delete their Info* arguments. - static Info* Alt(Info* a, Info* b); - static Info* Concat(Info* a, Info* b); - static Info* And(Info* a, Info* b); - static Info* Star(Info* a); - static Info* Plus(Info* a); - static Info* Quest(Info* a); - static Info* EmptyString(); - static Info* NoMatch(); + return m; +} + +// Information about a regexp used during computation of Prefilter. +// Can be thought of as information about the set of strings matching +// the given regular expression. +class Prefilter::Info { + public: + Info(); + ~Info(); + + // More constructors. They delete their Info* arguments. + static Info* Alt(Info* a, Info* b); + static Info* Concat(Info* a, Info* b); + static Info* And(Info* a, Info* b); + static Info* Star(Info* a); + static Info* Plus(Info* a); + static Info* Quest(Info* a); + static Info* EmptyString(); + static Info* NoMatch(); static Info* AnyCharOrAnyByte(); static Info* CClass(CharClass* cc, bool latin1); - static Info* Literal(Rune r); + static Info* Literal(Rune r); static Info* LiteralLatin1(Rune r); - static Info* AnyMatch(); - - // Format Info as a string. + static Info* AnyMatch(); + + // Format Info as a string. std::string ToString(); - - // Caller takes ownership of the Prefilter. - Prefilter* TakeMatch(); - + + // Caller takes ownership of the Prefilter. + Prefilter* TakeMatch(); + std::set<std::string>& exact() { return exact_; } - - bool is_exact() const { return is_exact_; } - - class Walker; - - private: + + bool is_exact() const { return is_exact_; } + + class Walker; + + private: std::set<std::string> exact_; - - // When is_exact_ is true, the strings that match - // are placed in exact_. When it is no longer an exact - // set of strings that match this RE, then is_exact_ - // is false and the match_ contains the required match - // criteria. - bool is_exact_; - - // Accumulated Prefilter query that any - // match for this regexp is guaranteed to match. - Prefilter* match_; -}; - - -Prefilter::Info::Info() - : is_exact_(false), - match_(NULL) { -} - -Prefilter::Info::~Info() { - delete match_; -} - -Prefilter* Prefilter::Info::TakeMatch() { - if (is_exact_) { - match_ = Prefilter::OrStrings(&exact_); - is_exact_ = false; - } - Prefilter* m = match_; - match_ = NULL; - return m; -} - -// Format a Info in string form. + + // When is_exact_ is true, the strings that match + // are placed in exact_. When it is no longer an exact + // set of strings that match this RE, then is_exact_ + // is false and the match_ contains the required match + // criteria. + bool is_exact_; + + // Accumulated Prefilter query that any + // match for this regexp is guaranteed to match. + Prefilter* match_; +}; + + +Prefilter::Info::Info() + : is_exact_(false), + match_(NULL) { +} + +Prefilter::Info::~Info() { + delete match_; +} + +Prefilter* Prefilter::Info::TakeMatch() { + if (is_exact_) { + match_ = Prefilter::OrStrings(&exact_); + is_exact_ = false; + } + Prefilter* m = match_; + match_ = NULL; + return m; +} + +// Format a Info in string form. std::string Prefilter::Info::ToString() { - if (is_exact_) { - int n = 0; + if (is_exact_) { + int n = 0; std::string s; for (SSIter i = exact_.begin(); i != exact_.end(); ++i) { - if (n++ > 0) - s += ","; - s += *i; - } - return s; - } - - if (match_) - return match_->DebugString(); - - return ""; -} - -// Add the strings from src to dst. + if (n++ > 0) + s += ","; + s += *i; + } + return s; + } + + if (match_) + return match_->DebugString(); + + return ""; +} + +// Add the strings from src to dst. static void CopyIn(const std::set<std::string>& src, std::set<std::string>* dst) { - for (ConstSSIter i = src.begin(); i != src.end(); ++i) - dst->insert(*i); -} - -// Add the cross-product of a and b to dst. -// (For each string i in a and j in b, add i+j.) + for (ConstSSIter i = src.begin(); i != src.end(); ++i) + dst->insert(*i); +} + +// Add the cross-product of a and b to dst. +// (For each string i in a and j in b, add i+j.) static void CrossProduct(const std::set<std::string>& a, const std::set<std::string>& b, std::set<std::string>* dst) { - for (ConstSSIter i = a.begin(); i != a.end(); ++i) - for (ConstSSIter j = b.begin(); j != b.end(); ++j) - dst->insert(*i + *j); -} - -// Concats a and b. Requires that both are exact sets. -// Forms an exact set that is a crossproduct of a and b. -Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) { - if (a == NULL) - return b; - DCHECK(a->is_exact_); - DCHECK(b && b->is_exact_); - Info *ab = new Info(); - - CrossProduct(a->exact_, b->exact_, &ab->exact_); - ab->is_exact_ = true; - - delete a; - delete b; - return ab; -} - -// Constructs an inexact Info for ab given a and b. -// Used only when a or b is not exact or when the -// exact cross product is likely to be too big. -Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) { - if (a == NULL) - return b; - if (b == NULL) - return a; - - Info *ab = new Info(); - - ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch()); - ab->is_exact_ = false; - delete a; - delete b; - return ab; -} - -// Constructs Info for a|b given a and b. -Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) { - Info *ab = new Info(); - - if (a->is_exact_ && b->is_exact_) { - CopyIn(a->exact_, &ab->exact_); - CopyIn(b->exact_, &ab->exact_); - ab->is_exact_ = true; - } else { - // Either a or b has is_exact_ = false. If the other - // one has is_exact_ = true, we move it to match_ and - // then create a OR of a,b. The resulting Info has - // is_exact_ = false. - ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch()); - ab->is_exact_ = false; - } - - delete a; - delete b; - return ab; -} - -// Constructs Info for a? given a. -Prefilter::Info* Prefilter::Info::Quest(Info *a) { - Info *ab = new Info(); - - ab->is_exact_ = false; - ab->match_ = new Prefilter(ALL); - delete a; - return ab; -} - -// Constructs Info for a* given a. -// Same as a? -- not much to do. -Prefilter::Info* Prefilter::Info::Star(Info *a) { - return Quest(a); -} - -// Constructs Info for a+ given a. If a was exact set, it isn't -// anymore. -Prefilter::Info* Prefilter::Info::Plus(Info *a) { - Info *ab = new Info(); - - ab->match_ = a->TakeMatch(); - ab->is_exact_ = false; - - delete a; - return ab; -} - + for (ConstSSIter i = a.begin(); i != a.end(); ++i) + for (ConstSSIter j = b.begin(); j != b.end(); ++j) + dst->insert(*i + *j); +} + +// Concats a and b. Requires that both are exact sets. +// Forms an exact set that is a crossproduct of a and b. +Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) { + if (a == NULL) + return b; + DCHECK(a->is_exact_); + DCHECK(b && b->is_exact_); + Info *ab = new Info(); + + CrossProduct(a->exact_, b->exact_, &ab->exact_); + ab->is_exact_ = true; + + delete a; + delete b; + return ab; +} + +// Constructs an inexact Info for ab given a and b. +// Used only when a or b is not exact or when the +// exact cross product is likely to be too big. +Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) { + if (a == NULL) + return b; + if (b == NULL) + return a; + + Info *ab = new Info(); + + ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch()); + ab->is_exact_ = false; + delete a; + delete b; + return ab; +} + +// Constructs Info for a|b given a and b. +Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) { + Info *ab = new Info(); + + if (a->is_exact_ && b->is_exact_) { + CopyIn(a->exact_, &ab->exact_); + CopyIn(b->exact_, &ab->exact_); + ab->is_exact_ = true; + } else { + // Either a or b has is_exact_ = false. If the other + // one has is_exact_ = true, we move it to match_ and + // then create a OR of a,b. The resulting Info has + // is_exact_ = false. + ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch()); + ab->is_exact_ = false; + } + + delete a; + delete b; + return ab; +} + +// Constructs Info for a? given a. +Prefilter::Info* Prefilter::Info::Quest(Info *a) { + Info *ab = new Info(); + + ab->is_exact_ = false; + ab->match_ = new Prefilter(ALL); + delete a; + return ab; +} + +// Constructs Info for a* given a. +// Same as a? -- not much to do. +Prefilter::Info* Prefilter::Info::Star(Info *a) { + return Quest(a); +} + +// Constructs Info for a+ given a. If a was exact set, it isn't +// anymore. +Prefilter::Info* Prefilter::Info::Plus(Info *a) { + Info *ab = new Info(); + + ab->match_ = a->TakeMatch(); + ab->is_exact_ = false; + + delete a; + return ab; +} + static std::string RuneToString(Rune r) { - char buf[UTFmax]; - int n = runetochar(buf, &r); + char buf[UTFmax]; + int n = runetochar(buf, &r); return std::string(buf, n); -} - +} + static std::string RuneToStringLatin1(Rune r) { char c = r & 0xff; return std::string(&c, 1); } -// Constructs Info for literal rune. -Prefilter::Info* Prefilter::Info::Literal(Rune r) { - Info* info = new Info(); +// Constructs Info for literal rune. +Prefilter::Info* Prefilter::Info::Literal(Rune r) { + Info* info = new Info(); info->exact_.insert(RuneToString(ToLowerRune(r))); - info->is_exact_ = true; - return info; -} - + info->is_exact_ = true; + return info; +} + // Constructs Info for literal rune for Latin1 encoded string. Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) { Info* info = new Info(); @@ -417,52 +417,52 @@ Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) { // Constructs Info for dot (any character) or \C (any byte). Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() { - Prefilter::Info* info = new Prefilter::Info(); - info->match_ = new Prefilter(ALL); - return info; -} - -// Constructs Prefilter::Info for no possible match. -Prefilter::Info* Prefilter::Info::NoMatch() { - Prefilter::Info* info = new Prefilter::Info(); - info->match_ = new Prefilter(NONE); - return info; -} - -// Constructs Prefilter::Info for any possible match. -// This Prefilter::Info is valid for any regular expression, -// since it makes no assertions whatsoever about the -// strings being matched. -Prefilter::Info* Prefilter::Info::AnyMatch() { - Prefilter::Info *info = new Prefilter::Info(); - info->match_ = new Prefilter(ALL); - return info; -} - -// Constructs Prefilter::Info for just the empty string. -Prefilter::Info* Prefilter::Info::EmptyString() { - Prefilter::Info* info = new Prefilter::Info(); - info->is_exact_ = true; - info->exact_.insert(""); - return info; -} - -// Constructs Prefilter::Info for a character class. -typedef CharClass::iterator CCIter; + Prefilter::Info* info = new Prefilter::Info(); + info->match_ = new Prefilter(ALL); + return info; +} + +// Constructs Prefilter::Info for no possible match. +Prefilter::Info* Prefilter::Info::NoMatch() { + Prefilter::Info* info = new Prefilter::Info(); + info->match_ = new Prefilter(NONE); + return info; +} + +// Constructs Prefilter::Info for any possible match. +// This Prefilter::Info is valid for any regular expression, +// since it makes no assertions whatsoever about the +// strings being matched. +Prefilter::Info* Prefilter::Info::AnyMatch() { + Prefilter::Info *info = new Prefilter::Info(); + info->match_ = new Prefilter(ALL); + return info; +} + +// Constructs Prefilter::Info for just the empty string. +Prefilter::Info* Prefilter::Info::EmptyString() { + Prefilter::Info* info = new Prefilter::Info(); + info->is_exact_ = true; + info->exact_.insert(""); + return info; +} + +// Constructs Prefilter::Info for a character class. +typedef CharClass::iterator CCIter; Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, bool latin1) { if (ExtraDebug) { LOG(ERROR) << "CharClassInfo:"; - for (CCIter i = cc->begin(); i != cc->end(); ++i) + for (CCIter i = cc->begin(); i != cc->end(); ++i) LOG(ERROR) << " " << i->lo << "-" << i->hi; - } - - // If the class is too large, it's okay to overestimate. - if (cc->size() > 10) + } + + // If the class is too large, it's okay to overestimate. + if (cc->size() > 10) return AnyCharOrAnyByte(); - - Prefilter::Info *a = new Prefilter::Info(); - for (CCIter i = cc->begin(); i != cc->end(); ++i) + + Prefilter::Info *a = new Prefilter::Info(); + for (CCIter i = cc->begin(); i != cc->end(); ++i) for (Rune r = i->lo; r <= i->hi; r++) { if (latin1) { a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r))); @@ -470,101 +470,101 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, a->exact_.insert(RuneToString(ToLowerRune(r))); } } - - a->is_exact_ = true; - + + a->is_exact_ = true; + if (ExtraDebug) LOG(ERROR) << " = " << a->ToString(); - - return a; -} - -class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> { - public: + + return a; +} + +class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> { + public: Walker(bool latin1) : latin1_(latin1) {} - - virtual Info* PostVisit( - Regexp* re, Info* parent_arg, - Info* pre_arg, - Info** child_args, int nchild_args); - - virtual Info* ShortVisit( - Regexp* re, - Info* parent_arg); - + + virtual Info* PostVisit( + Regexp* re, Info* parent_arg, + Info* pre_arg, + Info** child_args, int nchild_args); + + virtual Info* ShortVisit( + Regexp* re, + Info* parent_arg); + bool latin1() { return latin1_; } - private: + private: bool latin1_; Walker(const Walker&) = delete; Walker& operator=(const Walker&) = delete; -}; - -Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { +}; + +Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { if (ExtraDebug) LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString(); bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0; Prefilter::Info::Walker w(latin1); - Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); - - if (w.stopped_early()) { - delete info; - return NULL; - } - - return info; -} - -Prefilter::Info* Prefilter::Info::Walker::ShortVisit( - Regexp* re, Prefilter::Info* parent_arg) { - return AnyMatch(); -} - -// Constructs the Prefilter::Info for the given regular expression. -// Assumes re is simplified. -Prefilter::Info* Prefilter::Info::Walker::PostVisit( - Regexp* re, Prefilter::Info* parent_arg, - Prefilter::Info* pre_arg, Prefilter::Info** child_args, - int nchild_args) { - Prefilter::Info *info; - switch (re->op()) { - default: - case kRegexpRepeat: - LOG(DFATAL) << "Bad regexp op " << re->op(); - info = EmptyString(); - break; - - case kRegexpNoMatch: - info = NoMatch(); - break; - - // These ops match the empty string: - case kRegexpEmptyMatch: // anywhere - case kRegexpBeginLine: // at beginning of line - case kRegexpEndLine: // at end of line - case kRegexpBeginText: // at beginning of text - case kRegexpEndText: // at end of text - case kRegexpWordBoundary: // at word boundary - case kRegexpNoWordBoundary: // not at word boundary - info = EmptyString(); - break; - - case kRegexpLiteral: + Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); + + if (w.stopped_early()) { + delete info; + return NULL; + } + + return info; +} + +Prefilter::Info* Prefilter::Info::Walker::ShortVisit( + Regexp* re, Prefilter::Info* parent_arg) { + return AnyMatch(); +} + +// Constructs the Prefilter::Info for the given regular expression. +// Assumes re is simplified. +Prefilter::Info* Prefilter::Info::Walker::PostVisit( + Regexp* re, Prefilter::Info* parent_arg, + Prefilter::Info* pre_arg, Prefilter::Info** child_args, + int nchild_args) { + Prefilter::Info *info; + switch (re->op()) { + default: + case kRegexpRepeat: + LOG(DFATAL) << "Bad regexp op " << re->op(); + info = EmptyString(); + break; + + case kRegexpNoMatch: + info = NoMatch(); + break; + + // These ops match the empty string: + case kRegexpEmptyMatch: // anywhere + case kRegexpBeginLine: // at beginning of line + case kRegexpEndLine: // at end of line + case kRegexpBeginText: // at beginning of text + case kRegexpEndText: // at end of text + case kRegexpWordBoundary: // at word boundary + case kRegexpNoWordBoundary: // not at word boundary + info = EmptyString(); + break; + + case kRegexpLiteral: if (latin1()) { info = LiteralLatin1(re->rune()); } else { info = Literal(re->rune()); } - break; - - case kRegexpLiteralString: - if (re->nrunes() == 0) { - info = NoMatch(); - break; - } + break; + + case kRegexpLiteralString: + if (re->nrunes() == 0) { + info = NoMatch(); + break; + } if (latin1()) { info = LiteralLatin1(re->runes()[0]); for (int i = 1; i < re->nrunes(); i++) { @@ -576,136 +576,136 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( info = Concat(info, Literal(re->runes()[i])); } } - break; - - case kRegexpConcat: { - // Accumulate in info. - // Exact is concat of recent contiguous exact nodes. - info = NULL; - Info* exact = NULL; - for (int i = 0; i < nchild_args; i++) { - Info* ci = child_args[i]; // child info - if (!ci->is_exact() || - (exact && ci->exact().size() * exact->exact().size() > 16)) { - // Exact run is over. - info = And(info, exact); - exact = NULL; - // Add this child's info. - info = And(info, ci); - } else { - // Append to exact run. - exact = Concat(exact, ci); - } - } - info = And(info, exact); - } - break; - - case kRegexpAlternate: - info = child_args[0]; - for (int i = 1; i < nchild_args; i++) - info = Alt(info, child_args[i]); - break; - - case kRegexpStar: - info = Star(child_args[0]); - break; - - case kRegexpQuest: - info = Quest(child_args[0]); - break; - - case kRegexpPlus: - info = Plus(child_args[0]); - break; - - case kRegexpAnyChar: + break; + + case kRegexpConcat: { + // Accumulate in info. + // Exact is concat of recent contiguous exact nodes. + info = NULL; + Info* exact = NULL; + for (int i = 0; i < nchild_args; i++) { + Info* ci = child_args[i]; // child info + if (!ci->is_exact() || + (exact && ci->exact().size() * exact->exact().size() > 16)) { + // Exact run is over. + info = And(info, exact); + exact = NULL; + // Add this child's info. + info = And(info, ci); + } else { + // Append to exact run. + exact = Concat(exact, ci); + } + } + info = And(info, exact); + } + break; + + case kRegexpAlternate: + info = child_args[0]; + for (int i = 1; i < nchild_args; i++) + info = Alt(info, child_args[i]); + break; + + case kRegexpStar: + info = Star(child_args[0]); + break; + + case kRegexpQuest: + info = Quest(child_args[0]); + break; + + case kRegexpPlus: + info = Plus(child_args[0]); + break; + + case kRegexpAnyChar: case kRegexpAnyByte: - // Claim nothing, except that it's not empty. + // Claim nothing, except that it's not empty. info = AnyCharOrAnyByte(); - break; - - case kRegexpCharClass: + break; + + case kRegexpCharClass: info = CClass(re->cc(), latin1()); - break; - - case kRegexpCapture: - // These don't affect the set of matching strings. - info = child_args[0]; - break; - } - + break; + + case kRegexpCapture: + // These don't affect the set of matching strings. + info = child_args[0]; + break; + } + if (ExtraDebug) LOG(ERROR) << "BuildInfo " << re->ToString() << ": " << (info ? info->ToString() : ""); - - return info; -} - - -Prefilter* Prefilter::FromRegexp(Regexp* re) { - if (re == NULL) - return NULL; - - Regexp* simple = re->Simplify(); + + return info; +} + + +Prefilter* Prefilter::FromRegexp(Regexp* re) { + if (re == NULL) + return NULL; + + Regexp* simple = re->Simplify(); if (simple == NULL) return NULL; - + Prefilter::Info* info = BuildInfo(simple); - simple->Decref(); - if (info == NULL) - return NULL; - - Prefilter* m = info->TakeMatch(); - delete info; - return m; -} - + simple->Decref(); + if (info == NULL) + return NULL; + + Prefilter* m = info->TakeMatch(); + delete info; + return m; +} + std::string Prefilter::DebugString() const { - switch (op_) { - default: - LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; - return StringPrintf("op%d", op_); - case NONE: - return "*no-matches*"; - case ATOM: - return atom_; - case ALL: - return ""; - case AND: { + switch (op_) { + default: + LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; + return StringPrintf("op%d", op_); + case NONE: + return "*no-matches*"; + case ATOM: + return atom_; + case ALL: + return ""; + case AND: { std::string s = ""; for (size_t i = 0; i < subs_->size(); i++) { - if (i > 0) - s += " "; + if (i > 0) + s += " "; Prefilter* sub = (*subs_)[i]; s += sub ? sub->DebugString() : "<nil>"; - } - return s; - } - case OR: { + } + return s; + } + case OR: { std::string s = "("; for (size_t i = 0; i < subs_->size(); i++) { - if (i > 0) - s += "|"; + if (i > 0) + s += "|"; Prefilter* sub = (*subs_)[i]; s += sub ? sub->DebugString() : "<nil>"; - } - s += ")"; - return s; - } - } -} - -Prefilter* Prefilter::FromRE2(const RE2* re2) { - if (re2 == NULL) - return NULL; - - Regexp* regexp = re2->Regexp(); - if (regexp == NULL) - return NULL; - - return FromRegexp(regexp); -} - - -} // namespace re2 + } + s += ")"; + return s; + } + } +} + +Prefilter* Prefilter::FromRE2(const RE2* re2) { + if (re2 == NULL) + return NULL; + + Regexp* regexp = re2->Regexp(); + if (regexp == NULL) + return NULL; + + return FromRegexp(regexp); +} + + +} // namespace re2 diff --git a/contrib/libs/re2/re2/prefilter.h b/contrib/libs/re2/re2/prefilter.h index 1ce0b63c76..4fedeb4a7c 100644 --- a/contrib/libs/re2/re2/prefilter.h +++ b/contrib/libs/re2/re2/prefilter.h @@ -1,108 +1,108 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_PREFILTER_H_ #define RE2_PREFILTER_H_ -// Prefilter is the class used to extract string guards from regexps. -// Rather than using Prefilter class directly, use FilteredRE2. -// See filtered_re2.h - +// Prefilter is the class used to extract string guards from regexps. +// Rather than using Prefilter class directly, use FilteredRE2. +// See filtered_re2.h + #include <set> #include <string> #include <vector> - + #include "util/util.h" #include "util/logging.h" - -namespace re2 { - -class RE2; - -class Regexp; - -class Prefilter { - // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h - public: - enum Op { - ALL = 0, // Everything matches - NONE, // Nothing matches - ATOM, // The string atom() must match - AND, // All in subs() must match - OR, // One of subs() must match - }; - - explicit Prefilter(Op op); - ~Prefilter(); - - Op op() { return op_; } + +namespace re2 { + +class RE2; + +class Regexp; + +class Prefilter { + // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h + public: + enum Op { + ALL = 0, // Everything matches + NONE, // Nothing matches + ATOM, // The string atom() must match + AND, // All in subs() must match + OR, // One of subs() must match + }; + + explicit Prefilter(Op op); + ~Prefilter(); + + Op op() { return op_; } const std::string& atom() const { return atom_; } - void set_unique_id(int id) { unique_id_ = id; } - int unique_id() const { return unique_id_; } - - // The children of the Prefilter node. + void set_unique_id(int id) { unique_id_ = id; } + int unique_id() const { return unique_id_; } + + // The children of the Prefilter node. std::vector<Prefilter*>* subs() { DCHECK(op_ == AND || op_ == OR); - return subs_; - } - - // Set the children vector. Prefilter takes ownership of subs and - // subs_ will be deleted when Prefilter is deleted. + return subs_; + } + + // Set the children vector. Prefilter takes ownership of subs and + // subs_ will be deleted when Prefilter is deleted. void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; } - - // Given a RE2, return a Prefilter. The caller takes ownership of - // the Prefilter and should deallocate it. Returns NULL if Prefilter - // cannot be formed. - static Prefilter* FromRE2(const RE2* re2); - - // Returns a readable debug string of the prefilter. + + // Given a RE2, return a Prefilter. The caller takes ownership of + // the Prefilter and should deallocate it. Returns NULL if Prefilter + // cannot be formed. + static Prefilter* FromRE2(const RE2* re2); + + // Returns a readable debug string of the prefilter. std::string DebugString() const; - - private: - class Info; - - // Combines two prefilters together to create an AND. The passed - // Prefilters will be part of the returned Prefilter or deleted. - static Prefilter* And(Prefilter* a, Prefilter* b); - - // Combines two prefilters together to create an OR. The passed - // Prefilters will be part of the returned Prefilter or deleted. - static Prefilter* Or(Prefilter* a, Prefilter* b); - - // Generalized And/Or - static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b); - - static Prefilter* FromRegexp(Regexp* a); - + + private: + class Info; + + // Combines two prefilters together to create an AND. The passed + // Prefilters will be part of the returned Prefilter or deleted. + static Prefilter* And(Prefilter* a, Prefilter* b); + + // Combines two prefilters together to create an OR. The passed + // Prefilters will be part of the returned Prefilter or deleted. + static Prefilter* Or(Prefilter* a, Prefilter* b); + + // Generalized And/Or + static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b); + + static Prefilter* FromRegexp(Regexp* a); + static Prefilter* FromString(const std::string& str); - + static Prefilter* OrStrings(std::set<std::string>* ss); - - static Info* BuildInfo(Regexp* re); - - Prefilter* Simplify(); - - // Kind of Prefilter. - Op op_; - - // Sub-matches for AND or OR Prefilter. + + static Info* BuildInfo(Regexp* re); + + Prefilter* Simplify(); + + // Kind of Prefilter. + Op op_; + + // Sub-matches for AND or OR Prefilter. std::vector<Prefilter*>* subs_; - - // Actual string to match in leaf node. + + // Actual string to match in leaf node. std::string atom_; - - // If different prefilters have the same string atom, or if they are - // structurally the same (e.g., OR of same atom strings) they are - // considered the same unique nodes. This is the id for each unique - // node. This field is populated with a unique id for every node, - // and -1 for duplicate nodes. - int unique_id_; - + + // If different prefilters have the same string atom, or if they are + // structurally the same (e.g., OR of same atom strings) they are + // considered the same unique nodes. This is the id for each unique + // node. This field is populated with a unique id for every node, + // and -1 for duplicate nodes. + int unique_id_; + Prefilter(const Prefilter&) = delete; Prefilter& operator=(const Prefilter&) = delete; -}; - -} // namespace re2 - -#endif // RE2_PREFILTER_H_ +}; + +} // namespace re2 + +#endif // RE2_PREFILTER_H_ diff --git a/contrib/libs/re2/re2/prefilter_tree.cc b/contrib/libs/re2/re2/prefilter_tree.cc index 1d24198590..fdf4e083c9 100644 --- a/contrib/libs/re2/re2/prefilter_tree.cc +++ b/contrib/libs/re2/re2/prefilter_tree.cc @@ -1,9 +1,9 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #include "re2/prefilter_tree.h" - + #include <stddef.h> #include <algorithm> #include <map> @@ -16,118 +16,118 @@ #include "util/util.h" #include "util/logging.h" #include "util/strutil.h" -#include "re2/prefilter.h" +#include "re2/prefilter.h" #include "re2/re2.h" - -namespace re2 { - + +namespace re2 { + static const bool ExtraDebug = false; -PrefilterTree::PrefilterTree() +PrefilterTree::PrefilterTree() : compiled_(false), min_atom_len_(3) { -} - +} + PrefilterTree::PrefilterTree(int min_atom_len) : compiled_(false), min_atom_len_(min_atom_len) { } -PrefilterTree::~PrefilterTree() { +PrefilterTree::~PrefilterTree() { for (size_t i = 0; i < prefilter_vec_.size(); i++) - delete prefilter_vec_[i]; - + delete prefilter_vec_[i]; + for (size_t i = 0; i < entries_.size(); i++) - delete entries_[i].parents; -} - + delete entries_[i].parents; +} + void PrefilterTree::Add(Prefilter* prefilter) { - if (compiled_) { + if (compiled_) { LOG(DFATAL) << "Add called after Compile."; - return; - } + return; + } if (prefilter != NULL && !KeepNode(prefilter)) { delete prefilter; prefilter = NULL; - } - + } + prefilter_vec_.push_back(prefilter); -} - +} + void PrefilterTree::Compile(std::vector<std::string>* atom_vec) { - if (compiled_) { + if (compiled_) { LOG(DFATAL) << "Compile called already."; - return; - } - + return; + } + // Some legacy users of PrefilterTree call Compile() before // adding any regexps and expect Compile() to have no effect. - if (prefilter_vec_.empty()) - return; - - compiled_ = true; - + if (prefilter_vec_.empty()) + return; + + compiled_ = true; + // TODO(junyer): Use std::unordered_set<Prefilter*> instead? NodeMap nodes; AssignUniqueIds(&nodes, atom_vec); - - // Identify nodes that are too common among prefilters and are - // triggering too many parents. Then get rid of them if possible. - // Note that getting rid of a prefilter node simply means they are - // no longer necessary for their parent to trigger; that is, we do - // not miss out on any regexps triggering by getting rid of a - // prefilter node. + + // Identify nodes that are too common among prefilters and are + // triggering too many parents. Then get rid of them if possible. + // Note that getting rid of a prefilter node simply means they are + // no longer necessary for their parent to trigger; that is, we do + // not miss out on any regexps triggering by getting rid of a + // prefilter node. for (size_t i = 0; i < entries_.size(); i++) { StdIntMap* parents = entries_[i].parents; - if (parents->size() > 8) { - // This one triggers too many things. If all the parents are AND - // nodes and have other things guarding them, then get rid of - // this trigger. TODO(vsri): Adjust the threshold appropriately, - // make it a function of total number of nodes? - bool have_other_guard = true; + if (parents->size() > 8) { + // This one triggers too many things. If all the parents are AND + // nodes and have other things guarding them, then get rid of + // this trigger. TODO(vsri): Adjust the threshold appropriately, + // make it a function of total number of nodes? + bool have_other_guard = true; for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) { - have_other_guard = have_other_guard && + have_other_guard = have_other_guard && (entries_[it->first].propagate_up_at_count > 1); } - - if (have_other_guard) { + + if (have_other_guard) { for (StdIntMap::iterator it = parents->begin(); - it != parents->end(); ++it) + it != parents->end(); ++it) entries_[it->first].propagate_up_at_count -= 1; - - parents->clear(); // Forget the parents - } - } - } - + + parents->clear(); // Forget the parents + } + } + } + if (ExtraDebug) PrintDebugInfo(&nodes); -} - +} + Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { std::string node_string = NodeString(node); NodeMap::iterator iter = nodes->find(node_string); if (iter == nodes->end()) - return NULL; - return (*iter).second; -} - + return NULL; + return (*iter).second; +} + std::string PrefilterTree::NodeString(Prefilter* node) const { - // Adding the operation disambiguates AND/OR/atom nodes. + // Adding the operation disambiguates AND/OR/atom nodes. std::string s = StringPrintf("%d", node->op()) + ":"; - if (node->op() == Prefilter::ATOM) { - s += node->atom(); - } else { + if (node->op() == Prefilter::ATOM) { + s += node->atom(); + } else { for (size_t i = 0; i < node->subs()->size(); i++) { - if (i > 0) - s += ','; + if (i > 0) + s += ','; s += StringPrintf("%d", (*node->subs())[i]->unique_id()); - } - } - return s; -} - + } + } + return s; +} + bool PrefilterTree::KeepNode(Prefilter* node) const { if (node == NULL) return false; @@ -167,137 +167,137 @@ bool PrefilterTree::KeepNode(Prefilter* node) const { void PrefilterTree::AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec) { - atom_vec->clear(); - - // Build vector of all filter nodes, sorted topologically - // from top to bottom in v. + atom_vec->clear(); + + // Build vector of all filter nodes, sorted topologically + // from top to bottom in v. std::vector<Prefilter*> v; - - // Add the top level nodes of each regexp prefilter. + + // Add the top level nodes of each regexp prefilter. for (size_t i = 0; i < prefilter_vec_.size(); i++) { - Prefilter* f = prefilter_vec_[i]; - if (f == NULL) + Prefilter* f = prefilter_vec_[i]; + if (f == NULL) unfiltered_.push_back(static_cast<int>(i)); - - // We push NULL also on to v, so that we maintain the - // mapping of index==regexpid for level=0 prefilter nodes. - v.push_back(f); - } - - // Now add all the descendant nodes. + + // We push NULL also on to v, so that we maintain the + // mapping of index==regexpid for level=0 prefilter nodes. + v.push_back(f); + } + + // Now add all the descendant nodes. for (size_t i = 0; i < v.size(); i++) { - Prefilter* f = v[i]; - if (f == NULL) - continue; - if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { + Prefilter* f = v[i]; + if (f == NULL) + continue; + if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { const std::vector<Prefilter*>& subs = *f->subs(); for (size_t j = 0; j < subs.size(); j++) - v.push_back(subs[j]); - } - } - - // Identify unique nodes. - int unique_id = 0; + v.push_back(subs[j]); + } + } + + // Identify unique nodes. + int unique_id = 0; for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { - Prefilter *node = v[i]; - if (node == NULL) - continue; - node->set_unique_id(-1); + Prefilter *node = v[i]; + if (node == NULL) + continue; + node->set_unique_id(-1); Prefilter* canonical = CanonicalNode(nodes, node); - if (canonical == NULL) { - // Any further nodes that have the same node string - // will find this node as the canonical node. + if (canonical == NULL) { + // Any further nodes that have the same node string + // will find this node as the canonical node. nodes->emplace(NodeString(node), node); - if (node->op() == Prefilter::ATOM) { - atom_vec->push_back(node->atom()); - atom_index_to_id_.push_back(unique_id); - } - node->set_unique_id(unique_id++); - } else { - node->set_unique_id(canonical->unique_id()); - } - } + if (node->op() == Prefilter::ATOM) { + atom_vec->push_back(node->atom()); + atom_index_to_id_.push_back(unique_id); + } + node->set_unique_id(unique_id++); + } else { + node->set_unique_id(canonical->unique_id()); + } + } entries_.resize(nodes->size()); - + // Create parent StdIntMap for the entries. for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { - Prefilter* prefilter = v[i]; - if (prefilter == NULL) - continue; - + Prefilter* prefilter = v[i]; + if (prefilter == NULL) + continue; + if (CanonicalNode(nodes, prefilter) != prefilter) - continue; - - Entry* entry = &entries_[prefilter->unique_id()]; + continue; + + Entry* entry = &entries_[prefilter->unique_id()]; entry->parents = new StdIntMap(); - } - - // Fill the entries. + } + + // Fill the entries. for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { - Prefilter* prefilter = v[i]; - if (prefilter == NULL) - continue; - + Prefilter* prefilter = v[i]; + if (prefilter == NULL) + continue; + if (CanonicalNode(nodes, prefilter) != prefilter) - continue; - - Entry* entry = &entries_[prefilter->unique_id()]; - - switch (prefilter->op()) { - default: - case Prefilter::ALL: - LOG(DFATAL) << "Unexpected op: " << prefilter->op(); - return; - - case Prefilter::ATOM: - entry->propagate_up_at_count = 1; - break; - - case Prefilter::OR: - case Prefilter::AND: { + continue; + + Entry* entry = &entries_[prefilter->unique_id()]; + + switch (prefilter->op()) { + default: + case Prefilter::ALL: + LOG(DFATAL) << "Unexpected op: " << prefilter->op(); + return; + + case Prefilter::ATOM: + entry->propagate_up_at_count = 1; + break; + + case Prefilter::OR: + case Prefilter::AND: { std::set<int> uniq_child; for (size_t j = 0; j < prefilter->subs()->size(); j++) { - Prefilter* child = (*prefilter->subs())[j]; + Prefilter* child = (*prefilter->subs())[j]; Prefilter* canonical = CanonicalNode(nodes, child); - if (canonical == NULL) { - LOG(DFATAL) << "Null canonical node"; - return; - } - int child_id = canonical->unique_id(); + if (canonical == NULL) { + LOG(DFATAL) << "Null canonical node"; + return; + } + int child_id = canonical->unique_id(); uniq_child.insert(child_id); - // To the child, we want to add to parent indices. - Entry* child_entry = &entries_[child_id]; + // To the child, we want to add to parent indices. + Entry* child_entry = &entries_[child_id]; if (child_entry->parents->find(prefilter->unique_id()) == child_entry->parents->end()) { (*child_entry->parents)[prefilter->unique_id()] = 1; } - } + } entry->propagate_up_at_count = prefilter->op() == Prefilter::AND ? static_cast<int>(uniq_child.size()) : 1; - - break; - } - } - } - - // For top level nodes, populate regexp id. + + break; + } + } + } + + // For top level nodes, populate regexp id. for (size_t i = 0; i < prefilter_vec_.size(); i++) { - if (prefilter_vec_[i] == NULL) - continue; + if (prefilter_vec_[i] == NULL) + continue; int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id(); - DCHECK_LE(0, id); - Entry* entry = &entries_[id]; + DCHECK_LE(0, id); + Entry* entry = &entries_[id]; entry->regexps.push_back(static_cast<int>(i)); - } -} - -// Functions for triggering during search. -void PrefilterTree::RegexpsGivenStrings( + } +} + +// Functions for triggering during search. +void PrefilterTree::RegexpsGivenStrings( const std::vector<int>& matched_atoms, std::vector<int>* regexps) const { - regexps->clear(); - if (!compiled_) { + regexps->clear(); + if (!compiled_) { // Some legacy users of PrefilterTree call Compile() before // adding any regexps and expect Compile() to have no effect. // This kludge is a counterpart to that kludge. @@ -307,7 +307,7 @@ void PrefilterTree::RegexpsGivenStrings( LOG(ERROR) << "RegexpsGivenStrings called before Compile."; for (size_t i = 0; i < prefilter_vec_.size(); i++) regexps->push_back(static_cast<int>(i)); - } else { + } else { IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); std::vector<int> matched_atom_ids; for (size_t j = 0; j < matched_atoms.size(); j++) @@ -317,57 +317,57 @@ void PrefilterTree::RegexpsGivenStrings( it != regexps_map.end(); ++it) regexps->push_back(it->index()); - + regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); - } + } std::sort(regexps->begin(), regexps->end()); -} - +} + void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids, - IntMap* regexps) const { + IntMap* regexps) const { IntMap count(static_cast<int>(entries_.size())); IntMap work(static_cast<int>(entries_.size())); for (size_t i = 0; i < atom_ids.size(); i++) - work.set(atom_ids[i], 1); - for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { - const Entry& entry = entries_[it->index()]; - // Record regexps triggered. + work.set(atom_ids[i], 1); + for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { + const Entry& entry = entries_[it->index()]; + // Record regexps triggered. for (size_t i = 0; i < entry.regexps.size(); i++) - regexps->set(entry.regexps[i], 1); - int c; - // Pass trigger up to parents. + regexps->set(entry.regexps[i], 1); + int c; + // Pass trigger up to parents. for (StdIntMap::iterator it = entry.parents->begin(); - it != entry.parents->end(); - ++it) { + it != entry.parents->end(); + ++it) { int j = it->first; - const Entry& parent = entries_[j]; - // Delay until all the children have succeeded. - if (parent.propagate_up_at_count > 1) { - if (count.has_index(j)) { - c = count.get_existing(j) + 1; - count.set_existing(j, c); - } else { - c = 1; - count.set_new(j, c); - } - if (c < parent.propagate_up_at_count) - continue; - } - // Trigger the parent. - work.set(j, 1); - } - } -} - -// Debugging help. -void PrefilterTree::PrintPrefilter(int regexpid) { + const Entry& parent = entries_[j]; + // Delay until all the children have succeeded. + if (parent.propagate_up_at_count > 1) { + if (count.has_index(j)) { + c = count.get_existing(j) + 1; + count.set_existing(j, c); + } else { + c = 1; + count.set_new(j, c); + } + if (c < parent.propagate_up_at_count) + continue; + } + // Trigger the parent. + work.set(j, 1); + } + } +} + +// Debugging help. +void PrefilterTree::PrintPrefilter(int regexpid) { LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]); -} - +} + void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size(); LOG(ERROR) << "#Unique Nodes: " << entries_.size(); - + for (size_t i = 0; i < entries_.size(); i++) { StdIntMap* parents = entries_[i].parents; const std::vector<int>& regexps = entries_[i].regexps; @@ -375,33 +375,33 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { << " N: " << parents->size() << " R: " << regexps.size(); for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) LOG(ERROR) << it->first; - } + } LOG(ERROR) << "Map:"; for (NodeMap::const_iterator iter = nodes->begin(); iter != nodes->end(); ++iter) LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() << " Str: " << (*iter).first; -} - +} + std::string PrefilterTree::DebugNodeString(Prefilter* node) const { std::string node_string = ""; - if (node->op() == Prefilter::ATOM) { - DCHECK(!node->atom().empty()); - node_string += node->atom(); - } else { - // Adding the operation disambiguates AND and OR nodes. - node_string += node->op() == Prefilter::AND ? "AND" : "OR"; - node_string += "("; + if (node->op() == Prefilter::ATOM) { + DCHECK(!node->atom().empty()); + node_string += node->atom(); + } else { + // Adding the operation disambiguates AND and OR nodes. + node_string += node->op() == Prefilter::AND ? "AND" : "OR"; + node_string += "("; for (size_t i = 0; i < node->subs()->size(); i++) { - if (i > 0) - node_string += ','; + if (i > 0) + node_string += ','; node_string += StringPrintf("%d", (*node->subs())[i]->unique_id()); - node_string += ":"; - node_string += DebugNodeString((*node->subs())[i]); - } - node_string += ")"; - } - return node_string; -} - -} // namespace re2 + node_string += ":"; + node_string += DebugNodeString((*node->subs())[i]); + } + node_string += ")"; + } + return node_string; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/prefilter_tree.h b/contrib/libs/re2/re2/prefilter_tree.h index 2d30fbd717..5d73074d97 100644 --- a/contrib/libs/re2/re2/prefilter_tree.h +++ b/contrib/libs/re2/re2/prefilter_tree.h @@ -1,21 +1,21 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_PREFILTER_TREE_H_ #define RE2_PREFILTER_TREE_H_ -// The PrefilterTree class is used to form an AND-OR tree of strings -// that would trigger each regexp. The 'prefilter' of each regexp is +// The PrefilterTree class is used to form an AND-OR tree of strings +// that would trigger each regexp. The 'prefilter' of each regexp is // added to PrefilterTree, and then PrefilterTree is used to find all -// the unique strings across the prefilters. During search, by using -// matches from a string matching engine, PrefilterTree deduces the -// set of regexps that are to be triggered. The 'string matching -// engine' itself is outside of this class, and the caller can use any -// favorite engine. PrefilterTree provides a set of strings (called -// atoms) that the user of this class should use to do the string -// matching. - +// the unique strings across the prefilters. During search, by using +// matches from a string matching engine, PrefilterTree deduces the +// set of regexps that are to be triggered. The 'string matching +// engine' itself is outside of this class, and the caller can use any +// favorite engine. PrefilterTree provides a set of strings (called +// atoms) that the user of this class should use to do the string +// matching. + #include <map> #include <string> #include <vector> @@ -23,117 +23,117 @@ #include "util/util.h" #include "re2/prefilter.h" #include "re2/sparse_array.h" - -namespace re2 { - -class PrefilterTree { - public: - PrefilterTree(); + +namespace re2 { + +class PrefilterTree { + public: + PrefilterTree(); explicit PrefilterTree(int min_atom_len); - ~PrefilterTree(); - - // Adds the prefilter for the next regexp. Note that we assume that - // Add called sequentially for all regexps. All Add calls - // must precede Compile. - void Add(Prefilter* prefilter); - - // The Compile returns a vector of string in atom_vec. - // Call this after all the prefilters are added through Add. - // No calls to Add after Compile are allowed. - // The caller should use the returned set of strings to do string matching. - // Each time a string matches, the corresponding index then has to be - // and passed to RegexpsGivenStrings below. + ~PrefilterTree(); + + // Adds the prefilter for the next regexp. Note that we assume that + // Add called sequentially for all regexps. All Add calls + // must precede Compile. + void Add(Prefilter* prefilter); + + // The Compile returns a vector of string in atom_vec. + // Call this after all the prefilters are added through Add. + // No calls to Add after Compile are allowed. + // The caller should use the returned set of strings to do string matching. + // Each time a string matches, the corresponding index then has to be + // and passed to RegexpsGivenStrings below. void Compile(std::vector<std::string>* atom_vec); - - // Given the indices of the atoms that matched, returns the indexes - // of regexps that should be searched. The matched_atoms should - // contain all the ids of string atoms that were found to match the - // content. The caller can use any string match engine to perform - // this function. This function is thread safe. + + // Given the indices of the atoms that matched, returns the indexes + // of regexps that should be searched. The matched_atoms should + // contain all the ids of string atoms that were found to match the + // content. The caller can use any string match engine to perform + // this function. This function is thread safe. void RegexpsGivenStrings(const std::vector<int>& matched_atoms, std::vector<int>* regexps) const; - - // Print debug prefilter. Also prints unique ids associated with - // nodes of the prefilter of the regexp. - void PrintPrefilter(int regexpid); - + + // Print debug prefilter. Also prints unique ids associated with + // nodes of the prefilter of the regexp. + void PrintPrefilter(int regexpid); + private: typedef SparseArray<int> IntMap; typedef std::map<int, int> StdIntMap; typedef std::map<std::string, Prefilter*> NodeMap; - - // Each unique node has a corresponding Entry that helps in - // passing the matching trigger information along the tree. - struct Entry { - public: - // How many children should match before this node triggers the - // parent. For an atom and an OR node, this is 1 and for an AND - // node, it is the number of unique children. - int propagate_up_at_count; - - // When this node is ready to trigger the parent, what are the indices - // of the parent nodes to trigger. The reason there may be more than - // one is because of sharing. For example (abc | def) and (xyz | def) - // are two different nodes, but they share the atom 'def'. So when - // 'def' matches, it triggers two parents, corresponding to the two - // different OR nodes. + + // Each unique node has a corresponding Entry that helps in + // passing the matching trigger information along the tree. + struct Entry { + public: + // How many children should match before this node triggers the + // parent. For an atom and an OR node, this is 1 and for an AND + // node, it is the number of unique children. + int propagate_up_at_count; + + // When this node is ready to trigger the parent, what are the indices + // of the parent nodes to trigger. The reason there may be more than + // one is because of sharing. For example (abc | def) and (xyz | def) + // are two different nodes, but they share the atom 'def'. So when + // 'def' matches, it triggers two parents, corresponding to the two + // different OR nodes. StdIntMap* parents; - - // When this node is ready to trigger the parent, what are the - // regexps that are triggered. + + // When this node is ready to trigger the parent, what are the + // regexps that are triggered. std::vector<int> regexps; - }; - + }; + // Returns true if the prefilter node should be kept. bool KeepNode(Prefilter* node) const; - // This function assigns unique ids to various parts of the - // prefilter, by looking at if these nodes are already in the - // PrefilterTree. + // This function assigns unique ids to various parts of the + // prefilter, by looking at if these nodes are already in the + // PrefilterTree. void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec); - - // Given the matching atoms, find the regexps to be triggered. + + // Given the matching atoms, find the regexps to be triggered. void PropagateMatch(const std::vector<int>& atom_ids, - IntMap* regexps) const; - - // Returns the prefilter node that has the same NodeString as this - // node. For the canonical node, returns node. + IntMap* regexps) const; + + // Returns the prefilter node that has the same NodeString as this + // node. For the canonical node, returns node. Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); - - // A string that uniquely identifies the node. Assumes that the - // children of node has already been assigned unique ids. + + // A string that uniquely identifies the node. Assumes that the + // children of node has already been assigned unique ids. std::string NodeString(Prefilter* node) const; - - // Recursively constructs a readable prefilter string. + + // Recursively constructs a readable prefilter string. std::string DebugNodeString(Prefilter* node) const; - - // Used for debugging. + + // Used for debugging. void PrintDebugInfo(NodeMap* nodes); - - // These are all the nodes formed by Compile. Essentially, there is - // one node for each unique atom and each unique AND/OR node. + + // These are all the nodes formed by Compile. Essentially, there is + // one node for each unique atom and each unique AND/OR node. std::vector<Entry> entries_; - - // indices of regexps that always pass through the filter (since we - // found no required literals in these regexps). + + // indices of regexps that always pass through the filter (since we + // found no required literals in these regexps). std::vector<int> unfiltered_; - - // vector of Prefilter for all regexps. + + // vector of Prefilter for all regexps. std::vector<Prefilter*> prefilter_vec_; - - // Atom index in returned strings to entry id mapping. + + // Atom index in returned strings to entry id mapping. std::vector<int> atom_index_to_id_; - - // Has the prefilter tree been compiled. - bool compiled_; - + + // Has the prefilter tree been compiled. + bool compiled_; + // Strings less than this length are not stored as atoms. const int min_atom_len_; PrefilterTree(const PrefilterTree&) = delete; PrefilterTree& operator=(const PrefilterTree&) = delete; -}; - +}; + } // namespace - -#endif // RE2_PREFILTER_TREE_H_ + +#endif // RE2_PREFILTER_TREE_H_ diff --git a/contrib/libs/re2/re2/prog.cc b/contrib/libs/re2/re2/prog.cc index 3756c67f66..a700d35de3 100644 --- a/contrib/libs/re2/re2/prog.cc +++ b/contrib/libs/re2/re2/prog.cc @@ -1,12 +1,12 @@ -// Copyright 2007 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Compiled regular expression representation. -// Tested by compile_test.cc - -#include "re2/prog.h" - +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Compiled regular expression representation. +// Tested by compile_test.cc + +#include "re2/prog.h" + #if defined(__AVX2__) #include <immintrin.h> #ifdef _MSC_VER @@ -25,132 +25,132 @@ #include "re2/bitmap256.h" #include "re2/stringpiece.h" -namespace re2 { - -// Constructors per Inst opcode - +namespace re2 { + +// Constructors per Inst opcode + void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) { - DCHECK_EQ(out_opcode_, 0); - set_out_opcode(out, kInstAlt); - out1_ = out1; -} - + DCHECK_EQ(out_opcode_, 0); + set_out_opcode(out, kInstAlt); + out1_ = out1; +} + void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) { - DCHECK_EQ(out_opcode_, 0); - set_out_opcode(out, kInstByteRange); - lo_ = lo & 0xFF; - hi_ = hi & 0xFF; + DCHECK_EQ(out_opcode_, 0); + set_out_opcode(out, kInstByteRange); + lo_ = lo & 0xFF; + hi_ = hi & 0xFF; hint_foldcase_ = foldcase&1; -} - +} + void Prog::Inst::InitCapture(int cap, uint32_t out) { - DCHECK_EQ(out_opcode_, 0); - set_out_opcode(out, kInstCapture); - cap_ = cap; -} - + DCHECK_EQ(out_opcode_, 0); + set_out_opcode(out, kInstCapture); + cap_ = cap; +} + void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) { - DCHECK_EQ(out_opcode_, 0); - set_out_opcode(out, kInstEmptyWidth); - empty_ = empty; -} - + DCHECK_EQ(out_opcode_, 0); + set_out_opcode(out, kInstEmptyWidth); + empty_ = empty; +} + void Prog::Inst::InitMatch(int32_t id) { - DCHECK_EQ(out_opcode_, 0); - set_opcode(kInstMatch); - match_id_ = id; -} - + DCHECK_EQ(out_opcode_, 0); + set_opcode(kInstMatch); + match_id_ = id; +} + void Prog::Inst::InitNop(uint32_t out) { - DCHECK_EQ(out_opcode_, 0); - set_opcode(kInstNop); -} - -void Prog::Inst::InitFail() { - DCHECK_EQ(out_opcode_, 0); - set_opcode(kInstFail); -} - + DCHECK_EQ(out_opcode_, 0); + set_opcode(kInstNop); +} + +void Prog::Inst::InitFail() { + DCHECK_EQ(out_opcode_, 0); + set_opcode(kInstFail); +} + std::string Prog::Inst::Dump() { - switch (opcode()) { - default: - return StringPrintf("opcode %d", static_cast<int>(opcode())); - - case kInstAlt: - return StringPrintf("alt -> %d | %d", out(), out1_); - - case kInstAltMatch: - return StringPrintf("altmatch -> %d | %d", out(), out1_); - - case kInstByteRange: + switch (opcode()) { + default: + return StringPrintf("opcode %d", static_cast<int>(opcode())); + + case kInstAlt: + return StringPrintf("alt -> %d | %d", out(), out1_); + + case kInstAltMatch: + return StringPrintf("altmatch -> %d | %d", out(), out1_); + + case kInstByteRange: return StringPrintf("byte%s [%02x-%02x] %d -> %d", foldcase() ? "/i" : "", lo_, hi_, hint(), out()); - - case kInstCapture: - return StringPrintf("capture %d -> %d", cap_, out()); - - case kInstEmptyWidth: - return StringPrintf("emptywidth %#x -> %d", - static_cast<int>(empty_), out()); - - case kInstMatch: - return StringPrintf("match! %d", match_id()); - - case kInstNop: - return StringPrintf("nop -> %d", out()); - - case kInstFail: - return StringPrintf("fail"); - } -} - -Prog::Prog() - : anchor_start_(false), - anchor_end_(false), - reversed_(false), + + case kInstCapture: + return StringPrintf("capture %d -> %d", cap_, out()); + + case kInstEmptyWidth: + return StringPrintf("emptywidth %#x -> %d", + static_cast<int>(empty_), out()); + + case kInstMatch: + return StringPrintf("match! %d", match_id()); + + case kInstNop: + return StringPrintf("nop -> %d", out()); + + case kInstFail: + return StringPrintf("fail"); + } +} + +Prog::Prog() + : anchor_start_(false), + anchor_end_(false), + reversed_(false), did_flatten_(false), - did_onepass_(false), - start_(0), - start_unanchored_(0), - size_(0), - bytemap_range_(0), + did_onepass_(false), + start_(0), + start_unanchored_(0), + size_(0), + bytemap_range_(0), prefix_foldcase_(false), prefix_size_(0), list_count_(0), bit_state_text_max_size_(0), dfa_mem_(0), - dfa_first_(NULL), + dfa_first_(NULL), dfa_longest_(NULL) { -} - -Prog::~Prog() { +} + +Prog::~Prog() { DeleteDFA(dfa_longest_); DeleteDFA(dfa_first_); if (prefix_foldcase_) delete[] prefix_dfa_; -} - -typedef SparseSet Workq; - -static inline void AddToQueue(Workq* q, int id) { - if (id != 0) - q->insert(id); -} - +} + +typedef SparseSet Workq; + +static inline void AddToQueue(Workq* q, int id) { + if (id != 0) + q->insert(id); +} + static std::string ProgToString(Prog* prog, Workq* q) { std::string s; - for (Workq::iterator i = q->begin(); i != q->end(); ++i) { - int id = *i; - Prog::Inst* ip = prog->inst(id); + for (Workq::iterator i = q->begin(); i != q->end(); ++i) { + int id = *i; + Prog::Inst* ip = prog->inst(id); s += StringPrintf("%d. %s\n", id, ip->Dump().c_str()); - AddToQueue(q, ip->out()); - if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) - AddToQueue(q, ip->out1()); - } - return s; -} - + AddToQueue(q, ip->out()); + if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) + AddToQueue(q, ip->out1()); + } + return s; +} + static std::string FlattenedProgToString(Prog* prog, int start) { std::string s; for (int id = start; id < prog->size(); id++) { @@ -159,28 +159,28 @@ static std::string FlattenedProgToString(Prog* prog, int start) { s += StringPrintf("%d. %s\n", id, ip->Dump().c_str()); else s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str()); - } + } return s; } - + std::string Prog::Dump() { if (did_flatten_) return FlattenedProgToString(this, start_); - Workq q(size_); - AddToQueue(&q, start_); + Workq q(size_); + AddToQueue(&q, start_); return ProgToString(this, &q); -} - +} + std::string Prog::DumpUnanchored() { if (did_flatten_) return FlattenedProgToString(this, start_unanchored_); - Workq q(size_); - AddToQueue(&q, start_unanchored_); - return ProgToString(this, &q); -} - + Workq q(size_); + AddToQueue(&q, start_unanchored_); + return ProgToString(this, &q); +} + std::string Prog::DumpByteMap() { std::string map; for (int c = 0; c < 256; c++) { @@ -220,104 +220,104 @@ static bool IsMatch(Prog* prog, Prog::Inst* ip) { } } -// Peep-hole optimizer. -void Prog::Optimize() { - Workq q(size_); - - // Eliminate nops. Most are taken out during compilation - // but a few are hard to avoid. - q.clear(); - AddToQueue(&q, start_); - for (Workq::iterator i = q.begin(); i != q.end(); ++i) { - int id = *i; - - Inst* ip = inst(id); - int j = ip->out(); - Inst* jp; - while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { - j = jp->out(); - } - ip->set_out(j); - AddToQueue(&q, ip->out()); - - if (ip->opcode() == kInstAlt) { - j = ip->out1(); - while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { - j = jp->out(); - } - ip->out1_ = j; - AddToQueue(&q, ip->out1()); - } - } - - // Insert kInstAltMatch instructions - // Look for - // ip: Alt -> j | k - // j: ByteRange [00-FF] -> ip - // k: Match - // or the reverse (the above is the greedy one). - // Rewrite Alt to AltMatch. - q.clear(); - AddToQueue(&q, start_); - for (Workq::iterator i = q.begin(); i != q.end(); ++i) { - int id = *i; - Inst* ip = inst(id); - AddToQueue(&q, ip->out()); - if (ip->opcode() == kInstAlt) - AddToQueue(&q, ip->out1()); - - if (ip->opcode() == kInstAlt) { - Inst* j = inst(ip->out()); - Inst* k = inst(ip->out1()); - if (j->opcode() == kInstByteRange && j->out() == id && - j->lo() == 0x00 && j->hi() == 0xFF && - IsMatch(this, k)) { - ip->set_opcode(kInstAltMatch); - continue; - } - if (IsMatch(this, j) && - k->opcode() == kInstByteRange && k->out() == id && - k->lo() == 0x00 && k->hi() == 0xFF) { - ip->set_opcode(kInstAltMatch); - } - } - } -} - +// Peep-hole optimizer. +void Prog::Optimize() { + Workq q(size_); + + // Eliminate nops. Most are taken out during compilation + // but a few are hard to avoid. + q.clear(); + AddToQueue(&q, start_); + for (Workq::iterator i = q.begin(); i != q.end(); ++i) { + int id = *i; + + Inst* ip = inst(id); + int j = ip->out(); + Inst* jp; + while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { + j = jp->out(); + } + ip->set_out(j); + AddToQueue(&q, ip->out()); + + if (ip->opcode() == kInstAlt) { + j = ip->out1(); + while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { + j = jp->out(); + } + ip->out1_ = j; + AddToQueue(&q, ip->out1()); + } + } + + // Insert kInstAltMatch instructions + // Look for + // ip: Alt -> j | k + // j: ByteRange [00-FF] -> ip + // k: Match + // or the reverse (the above is the greedy one). + // Rewrite Alt to AltMatch. + q.clear(); + AddToQueue(&q, start_); + for (Workq::iterator i = q.begin(); i != q.end(); ++i) { + int id = *i; + Inst* ip = inst(id); + AddToQueue(&q, ip->out()); + if (ip->opcode() == kInstAlt) + AddToQueue(&q, ip->out1()); + + if (ip->opcode() == kInstAlt) { + Inst* j = inst(ip->out()); + Inst* k = inst(ip->out1()); + if (j->opcode() == kInstByteRange && j->out() == id && + j->lo() == 0x00 && j->hi() == 0xFF && + IsMatch(this, k)) { + ip->set_opcode(kInstAltMatch); + continue; + } + if (IsMatch(this, j) && + k->opcode() == kInstByteRange && k->out() == id && + k->lo() == 0x00 && k->hi() == 0xFF) { + ip->set_opcode(kInstAltMatch); + } + } + } +} + uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { - int flags = 0; - - // ^ and \A + int flags = 0; + + // ^ and \A if (p == text.data()) - flags |= kEmptyBeginText | kEmptyBeginLine; - else if (p[-1] == '\n') - flags |= kEmptyBeginLine; - - // $ and \z + flags |= kEmptyBeginText | kEmptyBeginLine; + else if (p[-1] == '\n') + flags |= kEmptyBeginLine; + + // $ and \z if (p == text.data() + text.size()) - flags |= kEmptyEndText | kEmptyEndLine; + flags |= kEmptyEndText | kEmptyEndLine; else if (p < text.data() + text.size() && p[0] == '\n') - flags |= kEmptyEndLine; - - // \b and \B + flags |= kEmptyEndLine; + + // \b and \B if (p == text.data() && p == text.data() + text.size()) { - // no word boundary here + // no word boundary here } else if (p == text.data()) { - if (IsWordChar(p[0])) - flags |= kEmptyWordBoundary; + if (IsWordChar(p[0])) + flags |= kEmptyWordBoundary; } else if (p == text.data() + text.size()) { - if (IsWordChar(p[-1])) - flags |= kEmptyWordBoundary; - } else { - if (IsWordChar(p[-1]) != IsWordChar(p[0])) - flags |= kEmptyWordBoundary; - } - if (!(flags & kEmptyWordBoundary)) - flags |= kEmptyNonWordBoundary; - - return flags; -} - + if (IsWordChar(p[-1])) + flags |= kEmptyWordBoundary; + } else { + if (IsWordChar(p[-1]) != IsWordChar(p[0])) + flags |= kEmptyWordBoundary; + } + if (!(flags & kEmptyWordBoundary)) + flags |= kEmptyNonWordBoundary; + + return flags; +} + // ByteMapBuilder implements a coloring algorithm. // // The first phase is a series of "mark and merge" batches: we mark one or more @@ -375,8 +375,8 @@ void ByteMapBuilder::Mark(int lo, int hi) { return; ranges_.emplace_back(lo, hi); -} - +} + void ByteMapBuilder::Merge() { for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin(); it != ranges_.end(); @@ -443,12 +443,12 @@ int ByteMapBuilder::Recolor(int oldcolor) { return newcolor; } -void Prog::ComputeByteMap() { +void Prog::ComputeByteMap() { // Fill in bytemap with byte classes for the program. // Ranges of bytes that are treated indistinguishably // will be mapped to a single byte class. ByteMapBuilder builder; - + // Don't repeat the work for ^ and $. bool marked_line_boundaries = false; // Don't repeat the work for \b and \B. @@ -507,18 +507,18 @@ void Prog::ComputeByteMap() { marked_word_boundaries = true; } } - } - + } + builder.Build(bytemap_, &bytemap_range_); if (0) { // For debugging, use trivial bytemap. LOG(ERROR) << "Using trivial bytemap."; for (int i = 0; i < 256; i++) bytemap_[i] = static_cast<uint8_t>(i); - bytemap_range_ = 256; - } -} - + bytemap_range_ = 256; + } +} + // Prog::Flatten() implements a graph rewriting algorithm. // // The overall process is similar to epsilon removal, but retains some epsilon @@ -1172,4 +1172,4 @@ const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { } } -} // namespace re2 +} // namespace re2 diff --git a/contrib/libs/re2/re2/prog.h b/contrib/libs/re2/re2/prog.h index 2f35a918b6..4af012ab6f 100644 --- a/contrib/libs/re2/re2/prog.h +++ b/contrib/libs/re2/re2/prog.h @@ -1,150 +1,150 @@ -// Copyright 2007 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_PROG_H_ #define RE2_PROG_H_ -// Compiled representation of regular expressions. -// See regexp.h for the Regexp class, which represents a regular -// expression symbolically. - +// Compiled representation of regular expressions. +// See regexp.h for the Regexp class, which represents a regular +// expression symbolically. + #include <stdint.h> #include <functional> #include <mutex> #include <string> #include <vector> #include <type_traits> - + #include "util/util.h" #include "util/logging.h" #include "re2/pod_array.h" #include "re2/re2.h" #include "re2/sparse_array.h" #include "re2/sparse_set.h" - -namespace re2 { - -// Opcodes for Inst -enum InstOp { - kInstAlt = 0, // choose between out_ and out1_ - kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. - kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] - kInstCapture, // capturing parenthesis number cap_ - kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ - kInstMatch, // found a match! - kInstNop, // no-op; occasionally unavoidable - kInstFail, // never match; occasionally unavoidable + +namespace re2 { + +// Opcodes for Inst +enum InstOp { + kInstAlt = 0, // choose between out_ and out1_ + kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. + kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] + kInstCapture, // capturing parenthesis number cap_ + kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ + kInstMatch, // found a match! + kInstNop, // no-op; occasionally unavoidable + kInstFail, // never match; occasionally unavoidable kNumInst, -}; - -// Bit flags for empty-width specials -enum EmptyOp { - kEmptyBeginLine = 1<<0, // ^ - beginning of line - kEmptyEndLine = 1<<1, // $ - end of line - kEmptyBeginText = 1<<2, // \A - beginning of text - kEmptyEndText = 1<<3, // \z - end of text - kEmptyWordBoundary = 1<<4, // \b - word boundary - kEmptyNonWordBoundary = 1<<5, // \B - not \b - kEmptyAllFlags = (1<<6)-1, -}; - +}; + +// Bit flags for empty-width specials +enum EmptyOp { + kEmptyBeginLine = 1<<0, // ^ - beginning of line + kEmptyEndLine = 1<<1, // $ - end of line + kEmptyBeginText = 1<<2, // \A - beginning of text + kEmptyEndText = 1<<3, // \z - end of text + kEmptyWordBoundary = 1<<4, // \b - word boundary + kEmptyNonWordBoundary = 1<<5, // \B - not \b + kEmptyAllFlags = (1<<6)-1, +}; + class DFA; -class Regexp; - -// Compiled form of regexp program. -class Prog { - public: - Prog(); - ~Prog(); - - // Single instruction in regexp program. - class Inst { - public: +class Regexp; + +// Compiled form of regexp program. +class Prog { + public: + Prog(); + ~Prog(); + + // Single instruction in regexp program. + class Inst { + public: // See the assertion below for why this is so. Inst() = default; - + // Copyable. Inst(const Inst&) = default; Inst& operator=(const Inst&) = default; - // Constructors per opcode + // Constructors per opcode void InitAlt(uint32_t out, uint32_t out1); void InitByteRange(int lo, int hi, int foldcase, uint32_t out); void InitCapture(int cap, uint32_t out); void InitEmptyWidth(EmptyOp empty, uint32_t out); - void InitMatch(int id); + void InitMatch(int id); void InitNop(uint32_t out); - void InitFail(); - - // Getters + void InitFail(); + + // Getters int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); } - InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } + InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } int last() { return (out_opcode_>>3)&1; } int out() { return out_opcode_>>4; } int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } - int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } - int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } - int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } + int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } + int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } + int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; } int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; } - int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } - EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } + int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } + EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } bool greedy(Prog* p) { - DCHECK_EQ(opcode(), kInstAltMatch); + DCHECK_EQ(opcode(), kInstAltMatch); return p->inst(out())->opcode() == kInstByteRange || (p->inst(out())->opcode() == kInstNop && p->inst(p->inst(out())->out())->opcode() == kInstByteRange); - } - - // Does this inst (an kInstByteRange) match c? - inline bool Matches(int c) { - DCHECK_EQ(opcode(), kInstByteRange); + } + + // Does this inst (an kInstByteRange) match c? + inline bool Matches(int c) { + DCHECK_EQ(opcode(), kInstByteRange); if (foldcase() && 'A' <= c && c <= 'Z') - c += 'a' - 'A'; - return lo_ <= c && c <= hi_; - } - - // Returns string representation for debugging. + c += 'a' - 'A'; + return lo_ <= c && c <= hi_; + } + + // Returns string representation for debugging. std::string Dump(); - - // Maximum instruction id. + + // Maximum instruction id. // (Must fit in out_opcode_. PatchList/last steal another bit.) - static const int kMaxInst = (1<<28) - 1; - - private: - void set_opcode(InstOp opcode) { + static const int kMaxInst = (1<<28) - 1; + + private: + void set_opcode(InstOp opcode) { out_opcode_ = (out()<<4) | (last()<<3) | opcode; - } - + } + void set_last() { out_opcode_ = (out()<<4) | (1<<3) | opcode(); } - void set_out(int out) { + void set_out(int out) { out_opcode_ = (out<<4) | (last()<<3) | opcode(); - } - - void set_out_opcode(int out, InstOp opcode) { + } + + void set_out_opcode(int out, InstOp opcode) { out_opcode_ = (out<<4) | (last()<<3) | opcode; - } - + } + uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode union { // additional instruction arguments: uint32_t out1_; // opcode == kInstAlt // alternate next instruction - + int32_t cap_; // opcode == kInstCapture // Index of capture register (holds text // position recorded by capturing parentheses). // For \n (the submatch for the nth parentheses), // the left parenthesis captures into register 2*n // and the right one captures into register 2*n+1. - + int32_t match_id_; // opcode == kInstMatch // Match ID to identify this match (for re2::Set). - + struct { // opcode == kInstByteRange uint8_t lo_; // byte range is lo_-hi_ inclusive uint8_t hi_; // @@ -155,69 +155,69 @@ class Prog { // means there are no remaining possibilities, // which is most likely for character classes. // foldcase: A-Z -> a-z before checking range. - }; - + }; + EmptyOp empty_; // opcode == kInstEmptyWidth // empty_ is bitwise OR of kEmpty* flags above. - }; - - friend class Compiler; - friend struct PatchList; - friend class Prog; - }; - + }; + + friend class Compiler; + friend struct PatchList; + friend class Prog; + }; + // Inst must be trivial so that we can freely clear it with memset(3). // Arrays of Inst are initialised by copying the initial elements with // memmove(3) and then clearing any remaining elements with memset(3). static_assert(std::is_trivial<Inst>::value, "Inst must be trivial"); - // Whether to anchor the search. - enum Anchor { - kUnanchored, // match anywhere - kAnchored, // match only starting at beginning of text - }; - - // Kind of match to look for (for anchor != kFullMatch) - // - // kLongestMatch mode finds the overall longest - // match but still makes its submatch choices the way - // Perl would, not in the way prescribed by POSIX. - // The POSIX rules are much more expensive to implement, - // and no one has needed them. - // - // kFullMatch is not strictly necessary -- we could use - // kLongestMatch and then check the length of the match -- but - // the matching code can run faster if it knows to consider only - // full matches. - enum MatchKind { - kFirstMatch, // like Perl, PCRE - kLongestMatch, // like egrep or POSIX - kFullMatch, // match only entire text; implies anchor==kAnchored - kManyMatch // for SearchDFA, records set of matches - }; - - Inst *inst(int id) { return &inst_[id]; } - int start() { return start_; } + // Whether to anchor the search. + enum Anchor { + kUnanchored, // match anywhere + kAnchored, // match only starting at beginning of text + }; + + // Kind of match to look for (for anchor != kFullMatch) + // + // kLongestMatch mode finds the overall longest + // match but still makes its submatch choices the way + // Perl would, not in the way prescribed by POSIX. + // The POSIX rules are much more expensive to implement, + // and no one has needed them. + // + // kFullMatch is not strictly necessary -- we could use + // kLongestMatch and then check the length of the match -- but + // the matching code can run faster if it knows to consider only + // full matches. + enum MatchKind { + kFirstMatch, // like Perl, PCRE + kLongestMatch, // like egrep or POSIX + kFullMatch, // match only entire text; implies anchor==kAnchored + kManyMatch // for SearchDFA, records set of matches + }; + + Inst *inst(int id) { return &inst_[id]; } + int start() { return start_; } void set_start(int start) { start_ = start; } - int start_unanchored() { return start_unanchored_; } - void set_start_unanchored(int start) { start_unanchored_ = start; } + int start_unanchored() { return start_unanchored_; } + void set_start_unanchored(int start) { start_unanchored_ = start; } int size() { return size_; } - bool reversed() { return reversed_; } - void set_reversed(bool reversed) { reversed_ = reversed; } + bool reversed() { return reversed_; } + void set_reversed(bool reversed) { reversed_ = reversed; } int list_count() { return list_count_; } int inst_count(InstOp op) { return inst_count_[op]; } uint16_t* list_heads() { return list_heads_.data(); } size_t bit_state_text_max_size() { return bit_state_text_max_size_; } int64_t dfa_mem() { return dfa_mem_; } void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } - bool anchor_start() { return anchor_start_; } - void set_anchor_start(bool b) { anchor_start_ = b; } - bool anchor_end() { return anchor_end_; } - void set_anchor_end(bool b) { anchor_end_ = b; } - int bytemap_range() { return bytemap_range_; } + bool anchor_start() { return anchor_start_; } + void set_anchor_start(bool b) { anchor_start_ = b; } + bool anchor_end() { return anchor_end_; } + void set_anchor_end(bool b) { anchor_end_ = b; } + int bytemap_range() { return bytemap_range_; } const uint8_t* bytemap() { return bytemap_; } bool can_prefix_accel() { return prefix_size_ != 0; } - + // Accelerates to the first likely occurrence of the prefix. // Returns a pointer to the first byte or NULL if not found. const void* PrefixAccel(const void* data, size_t size) { @@ -242,58 +242,58 @@ class Prog { // prefix_back_ to return fewer false positives than memchr(3) alone. const void* PrefixAccel_FrontAndBack(const void* data, size_t size); - // Returns string representation of program for debugging. + // Returns string representation of program for debugging. std::string Dump(); std::string DumpUnanchored(); std::string DumpByteMap(); - - // Returns the set of kEmpty flags that are in effect at - // position p within context. + + // Returns the set of kEmpty flags that are in effect at + // position p within context. static uint32_t EmptyFlags(const StringPiece& context, const char* p); - - // Returns whether byte c is a word character: ASCII only. - // Used by the implementation of \b and \B. - // This is not right for Unicode, but: - // - it's hard to get right in a byte-at-a-time matching world - // (the DFA has only one-byte lookahead). - // - even if the lookahead were possible, the Progs would be huge. - // This crude approximation is the same one PCRE uses. + + // Returns whether byte c is a word character: ASCII only. + // Used by the implementation of \b and \B. + // This is not right for Unicode, but: + // - it's hard to get right in a byte-at-a-time matching world + // (the DFA has only one-byte lookahead). + // - even if the lookahead were possible, the Progs would be huge. + // This crude approximation is the same one PCRE uses. static bool IsWordChar(uint8_t c) { - return ('A' <= c && c <= 'Z') || - ('a' <= c && c <= 'z') || - ('0' <= c && c <= '9') || - c == '_'; - } - - // Execution engines. They all search for the regexp (run the prog) - // in text, which is in the larger context (used for ^ $ \b etc). - // Anchor and kind control the kind of search. - // Returns true if match found, false if not. - // If match found, fills match[0..nmatch-1] with submatch info. - // match[0] is overall match, match[1] is first set of parens, etc. - // If a particular submatch is not matched during the regexp match, - // it is set to NULL. - // - // Matching text == StringPiece(NULL, 0) is treated as any other empty - // string, but note that on return, it will not be possible to distinguish - // submatches that matched that empty string from submatches that didn't - // match anything. Either way, match[i] == NULL. - - // Search using NFA: can find submatches but kind of slow. - bool SearchNFA(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch); - - // Search using DFA: much faster than NFA but only finds - // end of match and can use a lot more memory. - // Returns whether a match was found. - // If the DFA runs out of memory, sets *failed to true and returns false. - // If matches != NULL and kind == kManyMatch and there is a match, - // SearchDFA fills matches with the match IDs of the final matching state. - bool SearchDFA(const StringPiece& text, const StringPiece& context, + return ('A' <= c && c <= 'Z') || + ('a' <= c && c <= 'z') || + ('0' <= c && c <= '9') || + c == '_'; + } + + // Execution engines. They all search for the regexp (run the prog) + // in text, which is in the larger context (used for ^ $ \b etc). + // Anchor and kind control the kind of search. + // Returns true if match found, false if not. + // If match found, fills match[0..nmatch-1] with submatch info. + // match[0] is overall match, match[1] is first set of parens, etc. + // If a particular submatch is not matched during the regexp match, + // it is set to NULL. + // + // Matching text == StringPiece(NULL, 0) is treated as any other empty + // string, but note that on return, it will not be possible to distinguish + // submatches that matched that empty string from submatches that didn't + // match anything. Either way, match[i] == NULL. + + // Search using NFA: can find submatches but kind of slow. + bool SearchNFA(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Search using DFA: much faster than NFA but only finds + // end of match and can use a lot more memory. + // Returns whether a match was found. + // If the DFA runs out of memory, sets *failed to true and returns false. + // If matches != NULL and kind == kManyMatch and there is a match, + // SearchDFA fills matches with the match IDs of the final matching state. + bool SearchDFA(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, StringPiece* match0, bool* failed, SparseSet* matches); - + // The callback issued after building each DFA state with BuildEntireDFA(). // If next is null, then the memory budget has been exhausted and building // will halt. Otherwise, the state has been built and next points to an array @@ -304,71 +304,71 @@ class Prog { using DFAStateCallback = std::function<void(const int* next, bool match)>; // Build the entire DFA for the given match kind. - // Usually the DFA is built out incrementally, as needed, which + // Usually the DFA is built out incrementally, as needed, which // avoids lots of unnecessary work. // If cb is not empty, it receives one callback per state built. // Returns the number of states built. // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); - + // Compute bytemap. - void ComputeByteMap(); - - // Run peep-hole optimizer on program. - void Optimize(); - - // One-pass NFA: only correct if IsOnePass() is true, - // but much faster than NFA (competitive with PCRE) - // for those expressions. - bool IsOnePass(); - bool SearchOnePass(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch); - - // Bit-state backtracking. Fast on small cases but uses memory + void ComputeByteMap(); + + // Run peep-hole optimizer on program. + void Optimize(); + + // One-pass NFA: only correct if IsOnePass() is true, + // but much faster than NFA (competitive with PCRE) + // for those expressions. + bool IsOnePass(); + bool SearchOnePass(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Bit-state backtracking. Fast on small cases but uses memory // proportional to the product of the list count and the text size. bool CanBitState() { return list_heads_.data() != NULL; } - bool SearchBitState(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch); - - static const int kMaxOnePassCapture = 5; // $0 through $4 - - // Backtracking search: the gold standard against which the other - // implementations are checked. FOR TESTING ONLY. - // It allocates a ton of memory to avoid running forever. - // It is also recursive, so can't use in production (will overflow stacks). - // The name "Unsafe" here is supposed to be a flag that - // you should not be using this function. - bool UnsafeSearchBacktrack(const StringPiece& text, - const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch); - - // Computes range for any strings matching regexp. The min and max can in - // some cases be arbitrarily precise, so the caller gets to specify the - // maximum desired length of string returned. - // - // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any - // string s that is an anchored match for this regexp satisfies - // min <= s && s <= max. - // - // Note that PossibleMatchRange() will only consider the first copy of an - // infinitely repeated element (i.e., any regexp element followed by a '*' or - // '+' operator). Regexps with "{N}" constructions are not affected, as those - // do not compile down to infinite repetitions. - // - // Returns true on success, false on error. + bool SearchBitState(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + static const int kMaxOnePassCapture = 5; // $0 through $4 + + // Backtracking search: the gold standard against which the other + // implementations are checked. FOR TESTING ONLY. + // It allocates a ton of memory to avoid running forever. + // It is also recursive, so can't use in production (will overflow stacks). + // The name "Unsafe" here is supposed to be a flag that + // you should not be using this function. + bool UnsafeSearchBacktrack(const StringPiece& text, + const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Computes range for any strings matching regexp. The min and max can in + // some cases be arbitrarily precise, so the caller gets to specify the + // maximum desired length of string returned. + // + // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any + // string s that is an anchored match for this regexp satisfies + // min <= s && s <= max. + // + // Note that PossibleMatchRange() will only consider the first copy of an + // infinitely repeated element (i.e., any regexp element followed by a '*' or + // '+' operator). Regexps with "{N}" constructions are not affected, as those + // do not compile down to infinite repetitions. + // + // Returns true on success, false on error. bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); - + // EXPERIMENTAL! SUBJECT TO CHANGE! // Outputs the program fanout into the given sparse array. void Fanout(SparseArray<int>* fanout); - // Compiles a collection of regexps to Prog. Each regexp will have + // Compiles a collection of regexps to Prog. Each regexp will have // its own Match instruction recording the index in the output vector. static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); - + // Flattens the Prog from "tree" form to "list" form. This is an in-place // operation in the sense that the old instructions are lost. void Flatten(); @@ -403,22 +403,22 @@ class Prog { // FOR TESTING ONLY. static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b); - private: - friend class Compiler; - - DFA* GetDFA(MatchKind kind); + private: + friend class Compiler; + + DFA* GetDFA(MatchKind kind); void DeleteDFA(DFA* dfa); - - bool anchor_start_; // regexp has explicit start anchor - bool anchor_end_; // regexp has explicit end anchor - bool reversed_; // whether program runs backward over input + + bool anchor_start_; // regexp has explicit start anchor + bool anchor_end_; // regexp has explicit end anchor + bool reversed_; // whether program runs backward over input bool did_flatten_; // has Flatten been called? - bool did_onepass_; // has IsOnePass been called? - - int start_; // entry point for program - int start_unanchored_; // unanchored entry point for program - int size_; // number of instructions - int bytemap_range_; // bytemap_[x] < bytemap_range_ + bool did_onepass_; // has IsOnePass been called? + + int start_; // entry point for program + int start_unanchored_; // unanchored entry point for program + int size_; // number of instructions + int bytemap_range_; // bytemap_[x] < bytemap_range_ bool prefix_foldcase_; // whether prefix is case-insensitive size_t prefix_size_; // size of prefix (0 if no prefix) @@ -429,7 +429,7 @@ class Prog { int prefix_back_; // last byte of prefix }; }; - + int list_count_; // count of lists (see above) int inst_count_[kNumInst]; // count of instructions by opcode PODArray<uint16_t> list_heads_; // sparse array enumerating list heads @@ -438,20 +438,20 @@ class Prog { PODArray<Inst> inst_; // pointer to instruction array PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes - + int64_t dfa_mem_; // Maximum memory for DFAs. DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch - + uint8_t bytemap_[256]; // map from input bytes to byte classes - + std::once_flag dfa_first_once_; std::once_flag dfa_longest_once_; - + Prog(const Prog&) = delete; Prog& operator=(const Prog&) = delete; -}; - +}; + // std::string_view in MSVC has iterators that aren't just pointers and // that don't allow comparisons between different objects - not even if // those objects are views into the same string! Thus, we provide these @@ -463,6 +463,6 @@ static inline const char* EndPtr(const StringPiece& s) { return s.data() + s.size(); } -} // namespace re2 - +} // namespace re2 + #endif // RE2_PROG_H_ diff --git a/contrib/libs/re2/re2/re2.cc b/contrib/libs/re2/re2/re2.cc index c32090b4fc..47fb385e4e 100644 --- a/contrib/libs/re2/re2/re2.cc +++ b/contrib/libs/re2/re2/re2.cc @@ -1,14 +1,14 @@ -// Copyright 2003-2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Regular expression interface RE2. -// -// Originally the PCRE C++ wrapper, but adapted to use -// the new automata-based regular expression engines. - +// Copyright 2003-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression interface RE2. +// +// Originally the PCRE C++ wrapper, but adapted to use +// the new automata-based regular expression engines. + #include "re2/re2.h" - + #include <assert.h> #include <ctype.h> #include <errno.h> @@ -22,7 +22,7 @@ #include <atomic> #include <iterator> #include <mutex> -#include <string> +#include <string> #include <utility> #include <vector> @@ -30,18 +30,18 @@ #include "util/logging.h" #include "util/strutil.h" #include "util/utf.h" -#include "re2/prog.h" -#include "re2/regexp.h" +#include "re2/prog.h" +#include "re2/regexp.h" #include "re2/sparse_array.h" - -namespace re2 { - -// Maximum number of args we can set -static const int kMaxArgs = 16; -static const int kVecSize = 1+kMaxArgs; - + +namespace re2 { + +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = 1+kMaxArgs; + const int RE2::Options::kDefaultMaxMem; // initialized in re2.h - + RE2::Options::Options(RE2::CannedOptions opt) : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), posix_syntax_(opt == RE2::POSIX), @@ -57,120 +57,120 @@ RE2::Options::Options(RE2::CannedOptions opt) word_boundary_(false), one_line_(false) { } - + // static empty objects for use as const references. // To avoid global constructors, allocated in RE2::Init(). static const std::string* empty_string; static const std::map<std::string, int>* empty_named_groups; static const std::map<int, std::string>* empty_group_names; - -// Converts from Regexp error code to RE2 error code. -// Maybe some day they will diverge. In any event, this -// hides the existence of Regexp from RE2 users. + +// Converts from Regexp error code to RE2 error code. +// Maybe some day they will diverge. In any event, this +// hides the existence of Regexp from RE2 users. static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { - switch (code) { + switch (code) { case re2::kRegexpSuccess: - return RE2::NoError; + return RE2::NoError; case re2::kRegexpInternalError: - return RE2::ErrorInternal; + return RE2::ErrorInternal; case re2::kRegexpBadEscape: - return RE2::ErrorBadEscape; + return RE2::ErrorBadEscape; case re2::kRegexpBadCharClass: - return RE2::ErrorBadCharClass; + return RE2::ErrorBadCharClass; case re2::kRegexpBadCharRange: - return RE2::ErrorBadCharRange; + return RE2::ErrorBadCharRange; case re2::kRegexpMissingBracket: - return RE2::ErrorMissingBracket; + return RE2::ErrorMissingBracket; case re2::kRegexpMissingParen: - return RE2::ErrorMissingParen; + return RE2::ErrorMissingParen; case re2::kRegexpUnexpectedParen: return RE2::ErrorUnexpectedParen; case re2::kRegexpTrailingBackslash: - return RE2::ErrorTrailingBackslash; + return RE2::ErrorTrailingBackslash; case re2::kRegexpRepeatArgument: - return RE2::ErrorRepeatArgument; + return RE2::ErrorRepeatArgument; case re2::kRegexpRepeatSize: - return RE2::ErrorRepeatSize; + return RE2::ErrorRepeatSize; case re2::kRegexpRepeatOp: - return RE2::ErrorRepeatOp; + return RE2::ErrorRepeatOp; case re2::kRegexpBadPerlOp: - return RE2::ErrorBadPerlOp; + return RE2::ErrorBadPerlOp; case re2::kRegexpBadUTF8: - return RE2::ErrorBadUTF8; + return RE2::ErrorBadUTF8; case re2::kRegexpBadNamedCapture: - return RE2::ErrorBadNamedCapture; - } - return RE2::ErrorInternal; -} - + return RE2::ErrorBadNamedCapture; + } + return RE2::ErrorInternal; +} + static std::string trunc(const StringPiece& pattern) { - if (pattern.size() < 100) + if (pattern.size() < 100) return std::string(pattern); return std::string(pattern.substr(0, 100)) + "..."; -} - - -RE2::RE2(const char* pattern) { - Init(pattern, DefaultOptions); -} - +} + + +RE2::RE2(const char* pattern) { + Init(pattern, DefaultOptions); +} + RE2::RE2(const std::string& pattern) { - Init(pattern, DefaultOptions); -} - -RE2::RE2(const StringPiece& pattern) { - Init(pattern, DefaultOptions); -} - -RE2::RE2(const StringPiece& pattern, const Options& options) { - Init(pattern, options); -} - -int RE2::Options::ParseFlags() const { - int flags = Regexp::ClassNL; - switch (encoding()) { - default: + Init(pattern, DefaultOptions); +} + +RE2::RE2(const StringPiece& pattern) { + Init(pattern, DefaultOptions); +} + +RE2::RE2(const StringPiece& pattern, const Options& options) { + Init(pattern, options); +} + +int RE2::Options::ParseFlags() const { + int flags = Regexp::ClassNL; + switch (encoding()) { + default: if (log_errors()) LOG(ERROR) << "Unknown encoding " << encoding(); - break; - case RE2::Options::EncodingUTF8: - break; - case RE2::Options::EncodingLatin1: - flags |= Regexp::Latin1; - break; - } - - if (!posix_syntax()) - flags |= Regexp::LikePerl; - - if (literal()) - flags |= Regexp::Literal; - - if (never_nl()) - flags |= Regexp::NeverNL; - + break; + case RE2::Options::EncodingUTF8: + break; + case RE2::Options::EncodingLatin1: + flags |= Regexp::Latin1; + break; + } + + if (!posix_syntax()) + flags |= Regexp::LikePerl; + + if (literal()) + flags |= Regexp::Literal; + + if (never_nl()) + flags |= Regexp::NeverNL; + if (dot_nl()) flags |= Regexp::DotNL; if (never_capture()) flags |= Regexp::NeverCapture; - if (!case_sensitive()) - flags |= Regexp::FoldCase; - - if (perl_classes()) - flags |= Regexp::PerlClasses; - - if (word_boundary()) - flags |= Regexp::PerlB; - - if (one_line()) - flags |= Regexp::OneLine; - - return flags; -} - -void RE2::Init(const StringPiece& pattern, const Options& options) { + if (!case_sensitive()) + flags |= Regexp::FoldCase; + + if (perl_classes()) + flags |= Regexp::PerlClasses; + + if (word_boundary()) + flags |= Regexp::PerlB; + + if (one_line()) + flags |= Regexp::OneLine; + + return flags; +} + +void RE2::Init(const StringPiece& pattern, const Options& options) { static std::once_flag empty_once; std::call_once(empty_once, []() { empty_string = new std::string; @@ -179,70 +179,70 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { }); pattern_.assign(pattern.data(), pattern.size()); - options_.Copy(options); + options_.Copy(options); entire_regexp_ = NULL; error_ = empty_string; error_code_ = NoError; error_arg_.clear(); prefix_.clear(); prefix_foldcase_ = false; - suffix_regexp_ = NULL; - prog_ = NULL; + suffix_regexp_ = NULL; + prog_ = NULL; num_captures_ = -1; is_one_pass_ = false; - rprog_ = NULL; - named_groups_ = NULL; - group_names_ = NULL; - - RegexpStatus status; - entire_regexp_ = Regexp::Parse( - pattern_, - static_cast<Regexp::ParseFlags>(options_.ParseFlags()), - &status); - if (entire_regexp_ == NULL) { - if (options_.log_errors()) { - LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " - << status.Text(); - } + rprog_ = NULL; + named_groups_ = NULL; + group_names_ = NULL; + + RegexpStatus status; + entire_regexp_ = Regexp::Parse( + pattern_, + static_cast<Regexp::ParseFlags>(options_.ParseFlags()), + &status); + if (entire_regexp_ == NULL) { + if (options_.log_errors()) { + LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " + << status.Text(); + } error_ = new std::string(status.Text()); - error_code_ = RegexpErrorToRE2(status.code()); + error_code_ = RegexpErrorToRE2(status.code()); error_arg_ = std::string(status.error_arg()); - return; - } - + return; + } + re2::Regexp* suffix; - if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) - suffix_regexp_ = suffix; - else - suffix_regexp_ = entire_regexp_->Incref(); - - // Two thirds of the memory goes to the forward Prog, - // one third to the reverse prog, because the forward - // Prog has two DFAs but the reverse prog has one. - prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); - if (prog_ == NULL) { - if (options_.log_errors()) - LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; + if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) + suffix_regexp_ = suffix; + else + suffix_regexp_ = entire_regexp_->Incref(); + + // Two thirds of the memory goes to the forward Prog, + // one third to the reverse prog, because the forward + // Prog has two DFAs but the reverse prog has one. + prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); + if (prog_ == NULL) { + if (options_.log_errors()) + LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; error_ = new std::string("pattern too large - compile failed"); - error_code_ = RE2::ErrorPatternTooLarge; - return; - } - + error_code_ = RE2::ErrorPatternTooLarge; + return; + } + // We used to compute this lazily, but it's used during the // typical control flow for a match call, so we now compute // it eagerly, which avoids the overhead of std::once_flag. num_captures_ = suffix_regexp_->NumCaptures(); - // Could delay this until the first match call that - // cares about submatch information, but the one-pass - // machine's memory gets cut from the DFA memory budget, - // and that is harder to do if the DFA has already - // been built. - is_one_pass_ = prog_->IsOnePass(); -} - -// Returns rprog_, computing it if needed. + // Could delay this until the first match call that + // cares about submatch information, but the one-pass + // machine's memory gets cut from the DFA memory budget, + // and that is harder to do if the DFA has already + // been built. + is_one_pass_ = prog_->IsOnePass(); +} + +// Returns rprog_, computing it if needed. re2::Prog* RE2::ReverseProg() const { std::call_once(rprog_once_, [](const RE2* re) { re->rprog_ = @@ -255,32 +255,32 @@ re2::Prog* RE2::ReverseProg() const { // is fine. More importantly, an RE2 object is supposed to be logically // immutable: whatever ok() would have returned after Init() completed, // it should continue to return that no matter what ReverseProg() does. - } + } }, this); - return rprog_; -} - -RE2::~RE2() { - if (suffix_regexp_) - suffix_regexp_->Decref(); - if (entire_regexp_) - entire_regexp_->Decref(); - delete prog_; - delete rprog_; + return rprog_; +} + +RE2::~RE2() { + if (suffix_regexp_) + suffix_regexp_->Decref(); + if (entire_regexp_) + entire_regexp_->Decref(); + delete prog_; + delete rprog_; if (error_ != empty_string) - delete error_; + delete error_; if (named_groups_ != NULL && named_groups_ != empty_named_groups) - delete named_groups_; + delete named_groups_; if (group_names_ != NULL && group_names_ != empty_group_names) - delete group_names_; -} - -int RE2::ProgramSize() const { - if (prog_ == NULL) - return -1; - return prog_->size(); -} - + delete group_names_; +} + +int RE2::ProgramSize() const { + if (prog_ == NULL) + return -1; + return prog_->size(); +} + int RE2::ReverseProgramSize() const { if (prog_ == NULL) return -1; @@ -346,7 +346,7 @@ int RE2::ReverseProgramFanout(std::vector<int>* histogram) const { return Fanout(prog, histogram); } -// Returns named_groups_, computing it if needed. +// Returns named_groups_, computing it if needed. const std::map<std::string, int>& RE2::NamedCapturingGroups() const { std::call_once(named_groups_once_, [](const RE2* re) { if (re->suffix_regexp_ != NULL) @@ -354,10 +354,10 @@ const std::map<std::string, int>& RE2::NamedCapturingGroups() const { if (re->named_groups_ == NULL) re->named_groups_ = empty_named_groups; }, this); - return *named_groups_; -} - -// Returns group_names_, computing it if needed. + return *named_groups_; +} + +// Returns group_names_, computing it if needed. const std::map<int, std::string>& RE2::CapturingGroupNames() const { std::call_once(group_names_once_, [](const RE2* re) { if (re->suffix_regexp_ != NULL) @@ -365,94 +365,94 @@ const std::map<int, std::string>& RE2::CapturingGroupNames() const { if (re->group_names_ == NULL) re->group_names_ = empty_group_names; }, this); - return *group_names_; -} - -/***** Convenience interfaces *****/ - -bool RE2::FullMatchN(const StringPiece& text, const RE2& re, - const Arg* const args[], int n) { - return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); -} - -bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, - const Arg* const args[], int n) { - return re.DoMatch(text, UNANCHORED, NULL, args, n); -} - -bool RE2::ConsumeN(StringPiece* input, const RE2& re, - const Arg* const args[], int n) { + return *group_names_; +} + +/***** Convenience interfaces *****/ + +bool RE2::FullMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n) { + return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); +} + +bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n) { + return re.DoMatch(text, UNANCHORED, NULL, args, n); +} + +bool RE2::ConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n) { size_t consumed; - if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { - input->remove_prefix(consumed); - return true; - } else { - return false; - } -} - -bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, - const Arg* const args[], int n) { + if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n) { size_t consumed; - if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { - input->remove_prefix(consumed); - return true; - } else { - return false; - } -} - + if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + bool RE2::Replace(std::string* str, const RE2& re, const StringPiece& rewrite) { - StringPiece vec[kVecSize]; - int nvec = 1 + MaxSubmatch(rewrite); + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); if (nvec > 1 + re.NumberOfCapturingGroups()) - return false; + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) - return false; - + return false; + std::string s; - if (!re.Rewrite(&s, rewrite, vec, nvec)) - return false; - + if (!re.Rewrite(&s, rewrite, vec, nvec)) + return false; + assert(vec[0].data() >= str->data()); assert(vec[0].data() + vec[0].size() <= str->data() + str->size()); - str->replace(vec[0].data() - str->data(), vec[0].size(), s); - return true; -} - + str->replace(vec[0].data() - str->data(), vec[0].size(), s); + return true; +} + int RE2::GlobalReplace(std::string* str, const RE2& re, const StringPiece& rewrite) { - StringPiece vec[kVecSize]; - int nvec = 1 + MaxSubmatch(rewrite); + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); if (nvec > 1 + re.NumberOfCapturingGroups()) - return false; + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; - - const char* p = str->data(); - const char* ep = p + str->size(); - const char* lastend = NULL; + + const char* p = str->data(); + const char* ep = p + str->size(); + const char* lastend = NULL; std::string out; - int count = 0; + int count = 0; #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION // Iterate just once when fuzzing. Otherwise, we easily get bogged down // and coverage is unlikely to improve despite significant expense. while (p == str->data()) { #else - while (p <= ep) { + while (p <= ep) { #endif if (!re.Match(*str, static_cast<size_t>(p - str->data()), str->size(), UNANCHORED, vec, nvec)) - break; + break; if (p < vec[0].data()) out.append(p, vec[0].data() - p); if (vec[0].data() == lastend && vec[0].empty()) { - // Disallow empty match at end of last match: skip ahead. + // Disallow empty match at end of last match: skip ahead. // // fullrune() takes int, not ptrdiff_t. However, it just looks // at the leading byte and treats any length >= 4 the same. @@ -476,155 +476,155 @@ int RE2::GlobalReplace(std::string* str, } // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode, // we fell through from above and the GIGO principle applies. - if (p < ep) - out.append(p, 1); - p++; - continue; - } - re.Rewrite(&out, rewrite, vec, nvec); + if (p < ep) + out.append(p, 1); + p++; + continue; + } + re.Rewrite(&out, rewrite, vec, nvec); p = vec[0].data() + vec[0].size(); - lastend = p; - count++; - } - - if (count == 0) - return 0; - - if (p < ep) - out.append(p, ep - p); + lastend = p; + count++; + } + + if (count == 0) + return 0; + + if (p < ep) + out.append(p, ep - p); using std::swap; - swap(out, *str); - return count; -} - + swap(out, *str); + return count; +} + bool RE2::Extract(const StringPiece& text, const RE2& re, const StringPiece& rewrite, std::string* out) { - StringPiece vec[kVecSize]; - int nvec = 1 + MaxSubmatch(rewrite); + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); if (nvec > 1 + re.NumberOfCapturingGroups()) - return false; + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) - return false; - - out->clear(); - return re.Rewrite(out, rewrite, vec, nvec); -} - + return false; + + out->clear(); + return re.Rewrite(out, rewrite, vec, nvec); +} + std::string RE2::QuoteMeta(const StringPiece& unquoted) { std::string result; - result.reserve(unquoted.size() << 1); - - // Escape any ascii character not in [A-Za-z_0-9]. - // - // Note that it's legal to escape a character even if it has no - // special meaning in a regular expression -- so this function does - // that. (This also makes it identical to the perl function of the - // same name except for the null-character special case; - // see `perldoc -f quotemeta`.) + result.reserve(unquoted.size() << 1); + + // Escape any ascii character not in [A-Za-z_0-9]. + // + // Note that it's legal to escape a character even if it has no + // special meaning in a regular expression -- so this function does + // that. (This also makes it identical to the perl function of the + // same name except for the null-character special case; + // see `perldoc -f quotemeta`.) for (size_t ii = 0; ii < unquoted.size(); ++ii) { - // Note that using 'isalnum' here raises the benchmark time from - // 32ns to 58ns: - if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && - (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && - (unquoted[ii] < '0' || unquoted[ii] > '9') && - unquoted[ii] != '_' && - // If this is the part of a UTF8 or Latin1 character, we need - // to copy this byte without escaping. Experimentally this is - // what works correctly with the regexp library. - !(unquoted[ii] & 128)) { - if (unquoted[ii] == '\0') { // Special handling for null chars. - // Note that this special handling is not strictly required for RE2, - // but this quoting is required for other regexp libraries such as - // PCRE. - // Can't use "\\0" since the next character might be a digit. - result += "\\x00"; - continue; - } - result += '\\'; - } - result += unquoted[ii]; - } - - return result; -} - + // Note that using 'isalnum' here raises the benchmark time from + // 32ns to 58ns: + if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { // Special handling for null chars. + // Note that this special handling is not strictly required for RE2, + // but this quoting is required for other regexp libraries such as + // PCRE. + // Can't use "\\0" since the next character might be a digit. + result += "\\x00"; + continue; + } + result += '\\'; + } + result += unquoted[ii]; + } + + return result; +} + bool RE2::PossibleMatchRange(std::string* min, std::string* max, int maxlen) const { - if (prog_ == NULL) - return false; - + if (prog_ == NULL) + return false; + int n = static_cast<int>(prefix_.size()); - if (n > maxlen) - n = maxlen; - - // Determine initial min max from prefix_ literal. + if (n > maxlen) + n = maxlen; + + // Determine initial min max from prefix_ literal. *min = prefix_.substr(0, n); *max = prefix_.substr(0, n); - if (prefix_foldcase_) { + if (prefix_foldcase_) { // prefix is ASCII lowercase; change *min to uppercase. - for (int i = 0; i < n; i++) { + for (int i = 0; i < n; i++) { char& c = (*min)[i]; if ('a' <= c && c <= 'z') c += 'A' - 'a'; - } - } - - // Add to prefix min max using PossibleMatchRange on regexp. + } + } + + // Add to prefix min max using PossibleMatchRange on regexp. std::string dmin, dmax; - maxlen -= n; - if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { + maxlen -= n; + if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { min->append(dmin); max->append(dmax); } else if (!max->empty()) { - // prog_->PossibleMatchRange has failed us, - // but we still have useful information from prefix_. + // prog_->PossibleMatchRange has failed us, + // but we still have useful information from prefix_. // Round up *max to allow any possible suffix. PrefixSuccessor(max); - } else { - // Nothing useful. - *min = ""; - *max = ""; - return false; - } - - return true; -} - -// Avoid possible locale nonsense in standard strcasecmp. -// The string a is known to be all lowercase. + } else { + // Nothing useful. + *min = ""; + *max = ""; + return false; + } + + return true; +} + +// Avoid possible locale nonsense in standard strcasecmp. +// The string a is known to be all lowercase. static int ascii_strcasecmp(const char* a, const char* b, size_t len) { const char* ae = a + len; - - for (; a < ae; a++, b++) { + + for (; a < ae; a++, b++) { uint8_t x = *a; uint8_t y = *b; - if ('A' <= y && y <= 'Z') - y += 'a' - 'A'; - if (x != y) - return x - y; - } - return 0; -} - - -/***** Actual matching and rewriting code *****/ - -bool RE2::Match(const StringPiece& text, + if ('A' <= y && y <= 'Z') + y += 'a' - 'A'; + if (x != y) + return x - y; + } + return 0; +} + + +/***** Actual matching and rewriting code *****/ + +bool RE2::Match(const StringPiece& text, size_t startpos, size_t endpos, - Anchor re_anchor, - StringPiece* submatch, - int nsubmatch) const { + Anchor re_anchor, + StringPiece* submatch, + int nsubmatch) const { if (!ok()) { - if (options_.log_errors()) - LOG(ERROR) << "Invalid RE2: " << *error_; - return false; - } - + if (options_.log_errors()) + LOG(ERROR) << "Invalid RE2: " << *error_; + return false; + } + if (startpos > endpos || endpos > text.size()) { if (options_.log_errors()) LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" @@ -634,23 +634,23 @@ bool RE2::Match(const StringPiece& text, return false; } - StringPiece subtext = text; - subtext.remove_prefix(startpos); + StringPiece subtext = text; + subtext.remove_prefix(startpos); subtext.remove_suffix(text.size() - endpos); - - // Use DFAs to find exact location of match, filter out non-matches. - - // Don't ask for the location if we won't use it. - // SearchDFA can do extra optimizations in that case. - StringPiece match; - StringPiece* matchp = &match; - if (nsubmatch == 0) - matchp = NULL; - - int ncap = 1 + NumberOfCapturingGroups(); - if (ncap > nsubmatch) - ncap = nsubmatch; - + + // Use DFAs to find exact location of match, filter out non-matches. + + // Don't ask for the location if we won't use it. + // SearchDFA can do extra optimizations in that case. + StringPiece match; + StringPiece* matchp = &match; + if (nsubmatch == 0) + matchp = NULL; + + int ncap = 1 + NumberOfCapturingGroups(); + if (ncap > nsubmatch) + ncap = nsubmatch; + // If the regexp is anchored explicitly, must not be in middle of text. if (prog_->anchor_start() && startpos != 0) return false; @@ -658,53 +658,53 @@ bool RE2::Match(const StringPiece& text, return false; // If the regexp is anchored explicitly, update re_anchor - // so that we can potentially fall into a faster case below. - if (prog_->anchor_start() && prog_->anchor_end()) - re_anchor = ANCHOR_BOTH; - else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) - re_anchor = ANCHOR_START; - - // Check for the required prefix, if any. + // so that we can potentially fall into a faster case below. + if (prog_->anchor_start() && prog_->anchor_end()) + re_anchor = ANCHOR_BOTH; + else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) + re_anchor = ANCHOR_START; + + // Check for the required prefix, if any. size_t prefixlen = 0; - if (!prefix_.empty()) { + if (!prefix_.empty()) { if (startpos != 0) return false; - prefixlen = prefix_.size(); - if (prefixlen > subtext.size()) - return false; - if (prefix_foldcase_) { - if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) - return false; - } else { - if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) - return false; - } - subtext.remove_prefix(prefixlen); - // If there is a required prefix, the anchor must be at least ANCHOR_START. - if (re_anchor != ANCHOR_BOTH) - re_anchor = ANCHOR_START; - } - - Prog::Anchor anchor = Prog::kUnanchored; - Prog::MatchKind kind = Prog::kFirstMatch; - if (options_.longest_match()) - kind = Prog::kLongestMatch; - + prefixlen = prefix_.size(); + if (prefixlen > subtext.size()) + return false; + if (prefix_foldcase_) { + if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) + return false; + } else { + if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) + return false; + } + subtext.remove_prefix(prefixlen); + // If there is a required prefix, the anchor must be at least ANCHOR_START. + if (re_anchor != ANCHOR_BOTH) + re_anchor = ANCHOR_START; + } + + Prog::Anchor anchor = Prog::kUnanchored; + Prog::MatchKind kind = Prog::kFirstMatch; + if (options_.longest_match()) + kind = Prog::kLongestMatch; + bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; bool can_bit_state = prog_->CanBitState(); size_t bit_state_text_max_size = prog_->bit_state_text_max_size(); - + #ifdef RE2_HAVE_THREAD_LOCAL hooks::context = this; #endif - bool dfa_failed = false; + bool dfa_failed = false; bool skipped_test = false; - switch (re_anchor) { - default: + switch (re_anchor) { + default: LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor; return false; - case UNANCHORED: { + case UNANCHORED: { if (prog_->anchor_end()) { // This is a very special case: we don't need the forward DFA because // we already know where the match must end! Instead, the reverse DFA @@ -735,78 +735,78 @@ bool RE2::Match(const StringPiece& text, break; } - if (!prog_->SearchDFA(subtext, text, anchor, kind, - matchp, &dfa_failed, NULL)) { - if (dfa_failed) { + if (!prog_->SearchDFA(subtext, text, anchor, kind, + matchp, &dfa_failed, NULL)) { + if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " << "pattern length " << pattern_.size() << ", " << "program size " << prog_->size() << ", " << "list count " << prog_->list_count() << ", " << "bytemap range " << prog_->bytemap_range(); - // Fall back to NFA below. - skipped_test = true; - break; - } - return false; - } + // Fall back to NFA below. + skipped_test = true; + break; + } + return false; + } if (matchp == NULL) // Matched. Don't care where. - return true; + return true; // SearchDFA set match.end() but didn't know where the // match started. Run the regexp backward from match.end() - // to find the longest possible match -- that's where it started. - Prog* prog = ReverseProg(); + // to find the longest possible match -- that's where it started. + Prog* prog = ReverseProg(); if (prog == NULL) { // Fall back to NFA below. skipped_test = true; break; } - if (!prog->SearchDFA(match, text, Prog::kAnchored, - Prog::kLongestMatch, &match, &dfa_failed, NULL)) { - if (dfa_failed) { + if (!prog->SearchDFA(match, text, Prog::kAnchored, + Prog::kLongestMatch, &match, &dfa_failed, NULL)) { + if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " << "pattern length " << pattern_.size() << ", " << "program size " << prog->size() << ", " << "list count " << prog->list_count() << ", " << "bytemap range " << prog->bytemap_range(); - // Fall back to NFA below. - skipped_test = true; - break; - } + // Fall back to NFA below. + skipped_test = true; + break; + } if (options_.log_errors()) LOG(ERROR) << "SearchDFA inconsistency"; - return false; - } - break; - } - - case ANCHOR_BOTH: - case ANCHOR_START: - if (re_anchor == ANCHOR_BOTH) - kind = Prog::kFullMatch; - anchor = Prog::kAnchored; - - // If only a small amount of text and need submatch - // information anyway and we're going to use OnePass or BitState - // to get it, we might as well not even bother with the DFA: - // OnePass or BitState will be fast enough. - // On tiny texts, OnePass outruns even the DFA, and - // it doesn't have the shared state and occasional mutex that - // the DFA does. - if (can_one_pass && text.size() <= 4096 && + return false; + } + break; + } + + case ANCHOR_BOTH: + case ANCHOR_START: + if (re_anchor == ANCHOR_BOTH) + kind = Prog::kFullMatch; + anchor = Prog::kAnchored; + + // If only a small amount of text and need submatch + // information anyway and we're going to use OnePass or BitState + // to get it, we might as well not even bother with the DFA: + // OnePass or BitState will be fast enough. + // On tiny texts, OnePass outruns even the DFA, and + // it doesn't have the shared state and occasional mutex that + // the DFA does. + if (can_one_pass && text.size() <= 4096 && (ncap > 1 || text.size() <= 16)) { - skipped_test = true; - break; - } + skipped_test = true; + break; + } if (can_bit_state && text.size() <= bit_state_text_max_size && ncap > 1) { - skipped_test = true; - break; - } - if (!prog_->SearchDFA(subtext, text, anchor, kind, - &match, &dfa_failed, NULL)) { - if (dfa_failed) { + skipped_test = true; + break; + } + if (!prog_->SearchDFA(subtext, text, anchor, kind, + &match, &dfa_failed, NULL)) { + if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " << "pattern length " << pattern_.size() << ", " @@ -814,169 +814,169 @@ bool RE2::Match(const StringPiece& text, << "list count " << prog_->list_count() << ", " << "bytemap range " << prog_->bytemap_range(); // Fall back to NFA below. - skipped_test = true; - break; - } - return false; - } - break; - } - - if (!skipped_test && ncap <= 1) { - // We know exactly where it matches. That's enough. - if (ncap == 1) - submatch[0] = match; - } else { - StringPiece subtext1; - if (skipped_test) { - // DFA ran out of memory or was skipped: - // need to search in entire original text. - subtext1 = subtext; - } else { - // DFA found the exact match location: - // let NFA run an anchored, full match search - // to find submatch locations. - subtext1 = match; - anchor = Prog::kAnchored; - kind = Prog::kFullMatch; - } - - if (can_one_pass && anchor != Prog::kUnanchored) { - if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { + skipped_test = true; + break; + } + return false; + } + break; + } + + if (!skipped_test && ncap <= 1) { + // We know exactly where it matches. That's enough. + if (ncap == 1) + submatch[0] = match; + } else { + StringPiece subtext1; + if (skipped_test) { + // DFA ran out of memory or was skipped: + // need to search in entire original text. + subtext1 = subtext; + } else { + // DFA found the exact match location: + // let NFA run an anchored, full match search + // to find submatch locations. + subtext1 = match; + anchor = Prog::kAnchored; + kind = Prog::kFullMatch; + } + + if (can_one_pass && anchor != Prog::kUnanchored) { + if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { if (!skipped_test && options_.log_errors()) - LOG(ERROR) << "SearchOnePass inconsistency"; - return false; - } + LOG(ERROR) << "SearchOnePass inconsistency"; + return false; + } } else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) { - if (!prog_->SearchBitState(subtext1, text, anchor, - kind, submatch, ncap)) { + if (!prog_->SearchBitState(subtext1, text, anchor, + kind, submatch, ncap)) { if (!skipped_test && options_.log_errors()) - LOG(ERROR) << "SearchBitState inconsistency"; - return false; - } - } else { - if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { + LOG(ERROR) << "SearchBitState inconsistency"; + return false; + } + } else { + if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { if (!skipped_test && options_.log_errors()) - LOG(ERROR) << "SearchNFA inconsistency"; - return false; - } - } - } - - // Adjust overall match for required prefix that we stripped off. - if (prefixlen > 0 && nsubmatch > 0) + LOG(ERROR) << "SearchNFA inconsistency"; + return false; + } + } + } + + // Adjust overall match for required prefix that we stripped off. + if (prefixlen > 0 && nsubmatch > 0) submatch[0] = StringPiece(submatch[0].data() - prefixlen, - submatch[0].size() + prefixlen); - - // Zero submatches that don't exist in the regexp. - for (int i = ncap; i < nsubmatch; i++) + submatch[0].size() + prefixlen); + + // Zero submatches that don't exist in the regexp. + for (int i = ncap; i < nsubmatch; i++) submatch[i] = StringPiece(); - return true; -} - -// Internal matcher - like Match() but takes Args not StringPieces. -bool RE2::DoMatch(const StringPiece& text, + return true; +} + +// Internal matcher - like Match() but takes Args not StringPieces. +bool RE2::DoMatch(const StringPiece& text, Anchor re_anchor, size_t* consumed, - const Arg* const* args, - int n) const { - if (!ok()) { - if (options_.log_errors()) - LOG(ERROR) << "Invalid RE2: " << *error_; - return false; - } - + const Arg* const* args, + int n) const { + if (!ok()) { + if (options_.log_errors()) + LOG(ERROR) << "Invalid RE2: " << *error_; + return false; + } + if (NumberOfCapturingGroups() < n) { // RE has fewer capturing groups than number of Arg pointers passed in. return false; } - // Count number of capture groups needed. - int nvec; - if (n == 0 && consumed == NULL) - nvec = 0; - else - nvec = n+1; - - StringPiece* vec; - StringPiece stkvec[kVecSize]; - StringPiece* heapvec = NULL; - + // Count number of capture groups needed. + int nvec; + if (n == 0 && consumed == NULL) + nvec = 0; + else + nvec = n+1; + + StringPiece* vec; + StringPiece stkvec[kVecSize]; + StringPiece* heapvec = NULL; + if (nvec <= static_cast<int>(arraysize(stkvec))) { - vec = stkvec; - } else { - vec = new StringPiece[nvec]; - heapvec = vec; - } - + vec = stkvec; + } else { + vec = new StringPiece[nvec]; + heapvec = vec; + } + if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { - delete[] heapvec; - return false; - } - + delete[] heapvec; + return false; + } + if (consumed != NULL) *consumed = static_cast<size_t>(EndPtr(vec[0]) - BeginPtr(text)); - - if (n == 0 || args == NULL) { - // We are not interested in results - delete[] heapvec; - return true; - } - - // If we got here, we must have matched the whole pattern. - for (int i = 0; i < n; i++) { - const StringPiece& s = vec[i+1]; - if (!args[i]->Parse(s.data(), s.size())) { - // TODO: Should we indicate what the error was? - delete[] heapvec; - return false; - } - } - - delete[] heapvec; - return true; -} - -// Checks that the rewrite string is well-formed with respect to this -// regular expression. + + if (n == 0 || args == NULL) { + // We are not interested in results + delete[] heapvec; + return true; + } + + // If we got here, we must have matched the whole pattern. + for (int i = 0; i < n; i++) { + const StringPiece& s = vec[i+1]; + if (!args[i]->Parse(s.data(), s.size())) { + // TODO: Should we indicate what the error was? + delete[] heapvec; + return false; + } + } + + delete[] heapvec; + return true; +} + +// Checks that the rewrite string is well-formed with respect to this +// regular expression. bool RE2::CheckRewriteString(const StringPiece& rewrite, std::string* error) const { - int max_token = -1; - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - int c = *s; - if (c != '\\') { - continue; - } - if (++s == end) { - *error = "Rewrite schema error: '\\' not allowed at end."; - return false; - } - c = *s; - if (c == '\\') { - continue; - } + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c != '\\') { + continue; + } + if (++s == end) { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') { + continue; + } if (!isdigit(c)) { - *error = "Rewrite schema error: " - "'\\' must be followed by a digit or '\\'."; - return false; - } - int n = (c - '0'); - if (max_token < n) { - max_token = n; - } - } - - if (max_token > NumberOfCapturingGroups()) { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) { + max_token = n; + } + } + + if (max_token > NumberOfCapturingGroups()) { *error = StringPrintf( "Rewrite schema requests %d matches, but the regexp only has %d " "parenthesized subexpressions.", max_token, NumberOfCapturingGroups()); - return false; - } - return true; -} - + return false; + } + return true; +} + // Returns the maximum submatch needed for the rewrite to be done by Replace(). // E.g. if rewrite == "foo \\2,\\1", returns 2. int RE2::MaxSubmatch(const StringPiece& rewrite) { @@ -1033,32 +1033,32 @@ bool RE2::Rewrite(std::string* out, return true; } -/***** Parsers for various types *****/ - +/***** Parsers for various types *****/ + namespace re2_internal { template <> bool Parse(const char* str, size_t n, void* dest) { - // We fail if somebody asked us to store into a non-NULL void* pointer - return (dest == NULL); -} - + // We fail if somebody asked us to store into a non-NULL void* pointer + return (dest == NULL); +} + template <> bool Parse(const char* str, size_t n, std::string* dest) { - if (dest == NULL) return true; + if (dest == NULL) return true; dest->assign(str, n); - return true; -} - + return true; +} + #if defined(ARCADIA_ROOT) template <> bool Parse(const char* str, size_t n, TString* dest) { - if (dest == NULL) return true; + if (dest == NULL) return true; dest->assign(str, n); - return true; -} + return true; +} #endif - + template <> bool Parse(const char* str, size_t n, StringPiece* dest) { if (dest == NULL) return true; @@ -1068,16 +1068,16 @@ bool Parse(const char* str, size_t n, StringPiece* dest) { template <> bool Parse(const char* str, size_t n, char* dest) { - if (n != 1) return false; - if (dest == NULL) return true; + if (n != 1) return false; + if (dest == NULL) return true; *dest = str[0]; - return true; -} - + return true; +} + template <> bool Parse(const char* str, size_t n, signed char* dest) { - if (n != 1) return false; - if (dest == NULL) return true; + if (n != 1) return false; + if (dest == NULL) return true; *dest = str[0]; return true; } @@ -1087,12 +1087,12 @@ bool Parse(const char* str, size_t n, unsigned char* dest) { if (n != 1) return false; if (dest == NULL) return true; *dest = str[0]; - return true; -} - -// Largest number spec that we are willing to parse -static const int kMaxNumberLength = 32; - + return true; +} + +// Largest number spec that we are willing to parse +static const int kMaxNumberLength = 32; + // REQUIRES "buf" must have length at least nbuf. // Copies "str" into "buf" and null-terminates. // Overwrites *np with the new length. @@ -1101,7 +1101,7 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, size_t n = *np; if (n == 0) return ""; if (n > 0 && isspace(*str)) { - // We are less forgiving than the strtoxxx() routines and do not + // We are less forgiving than the strtoxxx() routines and do not // allow leading spaces. We do allow leading spaces for floats. if (!accept_spaces) { return ""; @@ -1110,8 +1110,8 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, n--; str++; } - } - + } + // Although buf has a fixed maximum size, we can still handle // arbitrarily large integers correctly by omitting leading zeros. // (Numbers that are still too long will be out of range.) @@ -1125,7 +1125,7 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, neg = true; n--; str++; - } + } if (n >= 3 && str[0] == '0' && str[1] == '0') { while (n >= 3 && str[2] == '0') { @@ -1148,11 +1148,11 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, buf[n] = '\0'; *np = n; return buf; -} - +} + template <> bool Parse(const char* str, size_t n, float* dest) { - if (n == 0) return false; + if (n == 0) return false; static const int kMaxLength = 200; char buf[kMaxLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, true); @@ -1185,127 +1185,127 @@ bool Parse(const char* str, size_t n, double* dest) { template <> bool Parse(const char* str, size_t n, long* dest, int radix) { if (n == 0) return false; - char buf[kMaxNumberLength+1]; + char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); - char* end; - errno = 0; - long r = strtol(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; + char* end; + errno = 0; + long r = strtol(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; *dest = r; - return true; -} - + return true; +} + template <> bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; + if (n == 0) return false; + char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); - if (str[0] == '-') { + if (str[0] == '-') { // strtoul() will silently accept negative numbers and parse // them. This module is more strict and treats them as errors. return false; - } - - char* end; - errno = 0; - unsigned long r = strtoul(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; + } + + char* end; + errno = 0; + unsigned long r = strtoul(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; *dest = r; - return true; -} - + return true; +} + template <> bool Parse(const char* str, size_t n, short* dest, int radix) { - long r; + long r; if (!Parse(str, n, &r, radix)) return false; // Could not parse if ((short)r != r) return false; // Out of range - if (dest == NULL) return true; + if (dest == NULL) return true; *dest = (short)r; - return true; -} - + return true; +} + template <> bool Parse(const char* str, size_t n, unsigned short* dest, int radix) { - unsigned long r; + unsigned long r; if (!Parse(str, n, &r, radix)) return false; // Could not parse if ((unsigned short)r != r) return false; // Out of range - if (dest == NULL) return true; + if (dest == NULL) return true; *dest = (unsigned short)r; - return true; -} - + return true; +} + template <> bool Parse(const char* str, size_t n, int* dest, int radix) { - long r; + long r; if (!Parse(str, n, &r, radix)) return false; // Could not parse if ((int)r != r) return false; // Out of range - if (dest == NULL) return true; + if (dest == NULL) return true; *dest = (int)r; - return true; -} - + return true; +} + template <> bool Parse(const char* str, size_t n, unsigned int* dest, int radix) { - unsigned long r; + unsigned long r; if (!Parse(str, n, &r, radix)) return false; // Could not parse if ((unsigned int)r != r) return false; // Out of range - if (dest == NULL) return true; + if (dest == NULL) return true; *dest = (unsigned int)r; - return true; -} - + return true; +} + template <> bool Parse(const char* str, size_t n, long long* dest, int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; + if (n == 0) return false; + char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); - char* end; - errno = 0; + char* end; + errno = 0; long long r = strtoll(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; *dest = r; - return true; -} - + return true; +} + template <> bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; + if (n == 0) return false; + char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); - if (str[0] == '-') { - // strtoull() will silently accept negative numbers and parse - // them. This module is more strict and treats them as errors. - return false; - } - char* end; - errno = 0; + if (str[0] == '-') { + // strtoull() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + char* end; + errno = 0; unsigned long long r = strtoull(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; *dest = r; - return true; -} - + return true; +} + } // namespace re2_internal - + namespace hooks { - + #ifdef RE2_HAVE_THREAD_LOCAL thread_local const RE2* context = NULL; #endif - + template <typename T> union Hook { void Store(T* cb) { cb_.store(cb, std::memory_order_release); } T* Load() const { return cb_.load(std::memory_order_acquire); } - + #if !defined(__clang__) && defined(_MSC_VER) // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent, // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC. @@ -1313,10 +1313,10 @@ union Hook { "std::atomic<T*> must be always lock-free"); T* cb_for_constinit_; #endif - + std::atomic<T*> cb_; }; - + template <typename T> static void DoNothing(const T&) {} @@ -1332,4 +1332,4 @@ DEFINE_HOOK(DFASearchFailure, dfa_search_failure) } // namespace hooks -} // namespace re2 +} // namespace re2 diff --git a/contrib/libs/re2/re2/regexp.cc b/contrib/libs/re2/re2/regexp.cc index 949f9dbf72..ca1318b43d 100644 --- a/contrib/libs/re2/re2/regexp.cc +++ b/contrib/libs/re2/re2/regexp.cc @@ -1,11 +1,11 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Regular expression representation. -// Tested by parse_test.cc - -#include "re2/regexp.h" +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression representation. +// Tested by parse_test.cc + +#include "re2/regexp.h" #include <stddef.h> #include <stdint.h> @@ -22,80 +22,80 @@ #include "util/utf.h" #include "re2/pod_array.h" #include "re2/stringpiece.h" -#include "re2/walker-inl.h" - -namespace re2 { - -// Constructor. Allocates vectors as appropriate for operator. -Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) +#include "re2/walker-inl.h" + +namespace re2 { + +// Constructor. Allocates vectors as appropriate for operator. +Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) : op_(static_cast<uint8_t>(op)), - simple_(false), + simple_(false), parse_flags_(static_cast<uint16_t>(parse_flags)), - ref_(1), - nsub_(0), - down_(NULL) { - subone_ = NULL; - memset(the_union_, 0, sizeof the_union_); -} - -// Destructor. Assumes already cleaned up children. -// Private: use Decref() instead of delete to destroy Regexps. -// Can't call Decref on the sub-Regexps here because -// that could cause arbitrarily deep recursion, so -// required Decref() to have handled them for us. -Regexp::~Regexp() { - if (nsub_ > 0) - LOG(DFATAL) << "Regexp not destroyed."; - - switch (op_) { - default: - break; - case kRegexpCapture: - delete name_; - break; - case kRegexpLiteralString: - delete[] runes_; - break; - case kRegexpCharClass: + ref_(1), + nsub_(0), + down_(NULL) { + subone_ = NULL; + memset(the_union_, 0, sizeof the_union_); +} + +// Destructor. Assumes already cleaned up children. +// Private: use Decref() instead of delete to destroy Regexps. +// Can't call Decref on the sub-Regexps here because +// that could cause arbitrarily deep recursion, so +// required Decref() to have handled them for us. +Regexp::~Regexp() { + if (nsub_ > 0) + LOG(DFATAL) << "Regexp not destroyed."; + + switch (op_) { + default: + break; + case kRegexpCapture: + delete name_; + break; + case kRegexpLiteralString: + delete[] runes_; + break; + case kRegexpCharClass: if (cc_) cc_->Delete(); - delete ccb_; - break; - } -} - -// If it's possible to destroy this regexp without recurring, -// do so and return true. Else return false. -bool Regexp::QuickDestroy() { - if (nsub_ == 0) { - delete this; - return true; - } - return false; -} - + delete ccb_; + break; + } +} + +// If it's possible to destroy this regexp without recurring, +// do so and return true. Else return false. +bool Regexp::QuickDestroy() { + if (nsub_ == 0) { + delete this; + return true; + } + return false; +} + // Lazily allocated. static Mutex* ref_mutex; static std::map<Regexp*, int>* ref_map; - -int Regexp::Ref() { - if (ref_ < kMaxRef) - return ref_; - + +int Regexp::Ref() { + if (ref_ < kMaxRef) + return ref_; + MutexLock l(ref_mutex); return (*ref_map)[this]; -} - -// Increments reference count, returns object as convenience. -Regexp* Regexp::Incref() { - if (ref_ >= kMaxRef-1) { +} + +// Increments reference count, returns object as convenience. +Regexp* Regexp::Incref() { + if (ref_ >= kMaxRef-1) { static std::once_flag ref_once; std::call_once(ref_once, []() { ref_mutex = new Mutex; ref_map = new std::map<Regexp*, int>; }); - // Store ref count in overflow map. + // Store ref count in overflow map. MutexLock l(ref_mutex); if (ref_ == kMaxRef) { // already overflowed @@ -104,97 +104,97 @@ Regexp* Regexp::Incref() { // overflowing now (*ref_map)[this] = kMaxRef; ref_ = kMaxRef; - } - return this; - } - - ref_++; - return this; -} - -// Decrements reference count and deletes this object if count reaches 0. -void Regexp::Decref() { - if (ref_ == kMaxRef) { - // Ref count is stored in overflow map. + } + return this; + } + + ref_++; + return this; +} + +// Decrements reference count and deletes this object if count reaches 0. +void Regexp::Decref() { + if (ref_ == kMaxRef) { + // Ref count is stored in overflow map. MutexLock l(ref_mutex); int r = (*ref_map)[this] - 1; - if (r < kMaxRef) { + if (r < kMaxRef) { ref_ = static_cast<uint16_t>(r); ref_map->erase(this); - } else { + } else { (*ref_map)[this] = r; - } - return; - } - ref_--; - if (ref_ == 0) - Destroy(); -} - -// Deletes this object; ref count has count reached 0. -void Regexp::Destroy() { - if (QuickDestroy()) - return; - - // Handle recursive Destroy with explicit stack - // to avoid arbitrarily deep recursion on process stack [sigh]. - down_ = NULL; - Regexp* stack = this; - while (stack != NULL) { - Regexp* re = stack; - stack = re->down_; - if (re->ref_ != 0) - LOG(DFATAL) << "Bad reference count " << re->ref_; - if (re->nsub_ > 0) { - Regexp** subs = re->sub(); - for (int i = 0; i < re->nsub_; i++) { - Regexp* sub = subs[i]; - if (sub == NULL) - continue; - if (sub->ref_ == kMaxRef) - sub->Decref(); - else - --sub->ref_; - if (sub->ref_ == 0 && !sub->QuickDestroy()) { - sub->down_ = stack; - stack = sub; - } - } - if (re->nsub_ > 1) - delete[] subs; - re->nsub_ = 0; - } - delete re; - } -} - -void Regexp::AddRuneToString(Rune r) { - DCHECK(op_ == kRegexpLiteralString); - if (nrunes_ == 0) { - // start with 8 - runes_ = new Rune[8]; - } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { - // double on powers of two - Rune *old = runes_; - runes_ = new Rune[nrunes_ * 2]; - for (int i = 0; i < nrunes_; i++) - runes_[i] = old[i]; - delete[] old; - } - - runes_[nrunes_++] = r; -} - -Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { - Regexp* re = new Regexp(kRegexpHaveMatch, flags); - re->match_id_ = match_id; - return re; -} - + } + return; + } + ref_--; + if (ref_ == 0) + Destroy(); +} + +// Deletes this object; ref count has count reached 0. +void Regexp::Destroy() { + if (QuickDestroy()) + return; + + // Handle recursive Destroy with explicit stack + // to avoid arbitrarily deep recursion on process stack [sigh]. + down_ = NULL; + Regexp* stack = this; + while (stack != NULL) { + Regexp* re = stack; + stack = re->down_; + if (re->ref_ != 0) + LOG(DFATAL) << "Bad reference count " << re->ref_; + if (re->nsub_ > 0) { + Regexp** subs = re->sub(); + for (int i = 0; i < re->nsub_; i++) { + Regexp* sub = subs[i]; + if (sub == NULL) + continue; + if (sub->ref_ == kMaxRef) + sub->Decref(); + else + --sub->ref_; + if (sub->ref_ == 0 && !sub->QuickDestroy()) { + sub->down_ = stack; + stack = sub; + } + } + if (re->nsub_ > 1) + delete[] subs; + re->nsub_ = 0; + } + delete re; + } +} + +void Regexp::AddRuneToString(Rune r) { + DCHECK(op_ == kRegexpLiteralString); + if (nrunes_ == 0) { + // start with 8 + runes_ = new Rune[8]; + } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { + // double on powers of two + Rune *old = runes_; + runes_ = new Rune[nrunes_ * 2]; + for (int i = 0; i < nrunes_; i++) + runes_[i] = old[i]; + delete[] old; + } + + runes_[nrunes_++] = r; +} + +Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { + Regexp* re = new Regexp(kRegexpHaveMatch, flags); + re->match_id_ = match_id; + return re; +} + Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { // Squash **, ++ and ??. if (op == sub->op() && flags == sub->parse_flags()) - return sub; + return sub; // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because // op is Star/Plus/Quest, we just have to check that sub->op() is too. @@ -215,28 +215,28 @@ Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { } Regexp* re = new Regexp(op, flags); - re->AllocSub(1); - re->sub()[0] = sub; - return re; -} - + re->AllocSub(1); + re->sub()[0] = sub; + return re; +} + Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { return StarPlusOrQuest(kRegexpPlus, sub, flags); } -Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { +Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { return StarPlusOrQuest(kRegexpStar, sub, flags); -} - -Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { +} + +Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { return StarPlusOrQuest(kRegexpQuest, sub, flags); -} - -Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, - ParseFlags flags, bool can_factor) { - if (nsub == 1) - return sub[0]; - +} + +Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, + ParseFlags flags, bool can_factor) { + if (nsub == 1) + return sub[0]; + if (nsub == 0) { if (op == kRegexpAlternate) return new Regexp(kRegexpNoMatch, flags); @@ -245,416 +245,416 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, } PODArray<Regexp*> subcopy; - if (op == kRegexpAlternate && can_factor) { - // Going to edit sub; make a copy so we don't step on caller. + if (op == kRegexpAlternate && can_factor) { + // Going to edit sub; make a copy so we don't step on caller. subcopy = PODArray<Regexp*>(nsub); memmove(subcopy.data(), sub, nsub * sizeof sub[0]); sub = subcopy.data(); - nsub = FactorAlternation(sub, nsub, flags); - if (nsub == 1) { - Regexp* re = sub[0]; - return re; - } - } - - if (nsub > kMaxNsub) { - // Too many subexpressions to fit in a single Regexp. - // Make a two-level tree. Two levels gets us to 65535^2. - int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; - Regexp* re = new Regexp(op, flags); - re->AllocSub(nbigsub); - Regexp** subs = re->sub(); - for (int i = 0; i < nbigsub - 1; i++) - subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); - subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, - nsub - (nbigsub-1)*kMaxNsub, flags, - false); - return re; - } - - Regexp* re = new Regexp(op, flags); - re->AllocSub(nsub); - Regexp** subs = re->sub(); - for (int i = 0; i < nsub; i++) - subs[i] = sub[i]; - return re; -} - -Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { - return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); -} - -Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { - return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); -} - -Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { - return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); -} - -Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { - Regexp* re = new Regexp(kRegexpCapture, flags); - re->AllocSub(1); - re->sub()[0] = sub; - re->cap_ = cap; - return re; -} - -Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { - Regexp* re = new Regexp(kRegexpRepeat, flags); - re->AllocSub(1); - re->sub()[0] = sub; - re->min_ = min; - re->max_ = max; - return re; -} - -Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { - Regexp* re = new Regexp(kRegexpLiteral, flags); - re->rune_ = rune; - return re; -} - -Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { - if (nrunes <= 0) - return new Regexp(kRegexpEmptyMatch, flags); - if (nrunes == 1) - return NewLiteral(runes[0], flags); - Regexp* re = new Regexp(kRegexpLiteralString, flags); - for (int i = 0; i < nrunes; i++) - re->AddRuneToString(runes[i]); - return re; -} - -Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { - Regexp* re = new Regexp(kRegexpCharClass, flags); - re->cc_ = cc; - return re; -} - -void Regexp::Swap(Regexp* that) { + nsub = FactorAlternation(sub, nsub, flags); + if (nsub == 1) { + Regexp* re = sub[0]; + return re; + } + } + + if (nsub > kMaxNsub) { + // Too many subexpressions to fit in a single Regexp. + // Make a two-level tree. Two levels gets us to 65535^2. + int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; + Regexp* re = new Regexp(op, flags); + re->AllocSub(nbigsub); + Regexp** subs = re->sub(); + for (int i = 0; i < nbigsub - 1; i++) + subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); + subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, + nsub - (nbigsub-1)*kMaxNsub, flags, + false); + return re; + } + + Regexp* re = new Regexp(op, flags); + re->AllocSub(nsub); + Regexp** subs = re->sub(); + for (int i = 0; i < nsub; i++) + subs[i] = sub[i]; + return re; +} + +Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { + return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); +} + +Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { + return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); +} + +Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { + return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); +} + +Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { + Regexp* re = new Regexp(kRegexpCapture, flags); + re->AllocSub(1); + re->sub()[0] = sub; + re->cap_ = cap; + return re; +} + +Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { + Regexp* re = new Regexp(kRegexpRepeat, flags); + re->AllocSub(1); + re->sub()[0] = sub; + re->min_ = min; + re->max_ = max; + return re; +} + +Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { + Regexp* re = new Regexp(kRegexpLiteral, flags); + re->rune_ = rune; + return re; +} + +Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { + if (nrunes <= 0) + return new Regexp(kRegexpEmptyMatch, flags); + if (nrunes == 1) + return NewLiteral(runes[0], flags); + Regexp* re = new Regexp(kRegexpLiteralString, flags); + for (int i = 0; i < nrunes; i++) + re->AddRuneToString(runes[i]); + return re; +} + +Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { + Regexp* re = new Regexp(kRegexpCharClass, flags); + re->cc_ = cc; + return re; +} + +void Regexp::Swap(Regexp* that) { // Regexp is not trivially copyable, so we cannot freely copy it with // memmove(3), but swapping objects like so is safe for our purposes. - char tmp[sizeof *this]; + char tmp[sizeof *this]; void* vthis = reinterpret_cast<void*>(this); void* vthat = reinterpret_cast<void*>(that); memmove(tmp, vthis, sizeof *this); memmove(vthis, vthat, sizeof *this); memmove(vthat, tmp, sizeof *this); -} - -// Tests equality of all top-level structure but not subregexps. -static bool TopEqual(Regexp* a, Regexp* b) { - if (a->op() != b->op()) - return false; - - switch (a->op()) { - case kRegexpNoMatch: - case kRegexpEmptyMatch: - case kRegexpAnyChar: - case kRegexpAnyByte: - case kRegexpBeginLine: - case kRegexpEndLine: - case kRegexpWordBoundary: - case kRegexpNoWordBoundary: - case kRegexpBeginText: - return true; - - case kRegexpEndText: - // The parse flags remember whether it's \z or (?-m:$), - // which matters when testing against PCRE. - return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; - - case kRegexpLiteral: - return a->rune() == b->rune() && - ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; - - case kRegexpLiteralString: - return a->nrunes() == b->nrunes() && - ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && - memcmp(a->runes(), b->runes(), - a->nrunes() * sizeof a->runes()[0]) == 0; - - case kRegexpAlternate: - case kRegexpConcat: - return a->nsub() == b->nsub(); - - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; - - case kRegexpRepeat: - return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && - a->min() == b->min() && - a->max() == b->max(); - - case kRegexpCapture: - return a->cap() == b->cap() && a->name() == b->name(); - - case kRegexpHaveMatch: - return a->match_id() == b->match_id(); - - case kRegexpCharClass: { - CharClass* acc = a->cc(); - CharClass* bcc = b->cc(); - return acc->size() == bcc->size() && - acc->end() - acc->begin() == bcc->end() - bcc->begin() && - memcmp(acc->begin(), bcc->begin(), - (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; - } - } - - LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); - return 0; -} - -bool Regexp::Equal(Regexp* a, Regexp* b) { - if (a == NULL || b == NULL) - return a == b; - - if (!TopEqual(a, b)) - return false; - - // Fast path: - // return without allocating vector if there are no subregexps. - switch (a->op()) { - case kRegexpAlternate: - case kRegexpConcat: - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - case kRegexpRepeat: - case kRegexpCapture: - break; - - default: - return true; - } - - // Committed to doing real work. - // The stack (vector) has pairs of regexps waiting to - // be compared. The regexps are only equal if - // all the pairs end up being equal. +} + +// Tests equality of all top-level structure but not subregexps. +static bool TopEqual(Regexp* a, Regexp* b) { + if (a->op() != b->op()) + return false; + + switch (a->op()) { + case kRegexpNoMatch: + case kRegexpEmptyMatch: + case kRegexpAnyChar: + case kRegexpAnyByte: + case kRegexpBeginLine: + case kRegexpEndLine: + case kRegexpWordBoundary: + case kRegexpNoWordBoundary: + case kRegexpBeginText: + return true; + + case kRegexpEndText: + // The parse flags remember whether it's \z or (?-m:$), + // which matters when testing against PCRE. + return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; + + case kRegexpLiteral: + return a->rune() == b->rune() && + ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; + + case kRegexpLiteralString: + return a->nrunes() == b->nrunes() && + ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && + memcmp(a->runes(), b->runes(), + a->nrunes() * sizeof a->runes()[0]) == 0; + + case kRegexpAlternate: + case kRegexpConcat: + return a->nsub() == b->nsub(); + + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; + + case kRegexpRepeat: + return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && + a->min() == b->min() && + a->max() == b->max(); + + case kRegexpCapture: + return a->cap() == b->cap() && a->name() == b->name(); + + case kRegexpHaveMatch: + return a->match_id() == b->match_id(); + + case kRegexpCharClass: { + CharClass* acc = a->cc(); + CharClass* bcc = b->cc(); + return acc->size() == bcc->size() && + acc->end() - acc->begin() == bcc->end() - bcc->begin() && + memcmp(acc->begin(), bcc->begin(), + (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; + } + } + + LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); + return 0; +} + +bool Regexp::Equal(Regexp* a, Regexp* b) { + if (a == NULL || b == NULL) + return a == b; + + if (!TopEqual(a, b)) + return false; + + // Fast path: + // return without allocating vector if there are no subregexps. + switch (a->op()) { + case kRegexpAlternate: + case kRegexpConcat: + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + case kRegexpCapture: + break; + + default: + return true; + } + + // Committed to doing real work. + // The stack (vector) has pairs of regexps waiting to + // be compared. The regexps are only equal if + // all the pairs end up being equal. std::vector<Regexp*> stk; - - for (;;) { - // Invariant: TopEqual(a, b) == true. - Regexp* a2; - Regexp* b2; - switch (a->op()) { - default: - break; - case kRegexpAlternate: - case kRegexpConcat: - for (int i = 0; i < a->nsub(); i++) { - a2 = a->sub()[i]; - b2 = b->sub()[i]; - if (!TopEqual(a2, b2)) - return false; - stk.push_back(a2); - stk.push_back(b2); - } - break; - - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - case kRegexpRepeat: - case kRegexpCapture: - a2 = a->sub()[0]; - b2 = b->sub()[0]; - if (!TopEqual(a2, b2)) - return false; - // Really: - // stk.push_back(a2); - // stk.push_back(b2); - // break; - // but faster to assign directly and loop. - a = a2; - b = b2; - continue; - } - + + for (;;) { + // Invariant: TopEqual(a, b) == true. + Regexp* a2; + Regexp* b2; + switch (a->op()) { + default: + break; + case kRegexpAlternate: + case kRegexpConcat: + for (int i = 0; i < a->nsub(); i++) { + a2 = a->sub()[i]; + b2 = b->sub()[i]; + if (!TopEqual(a2, b2)) + return false; + stk.push_back(a2); + stk.push_back(b2); + } + break; + + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + case kRegexpCapture: + a2 = a->sub()[0]; + b2 = b->sub()[0]; + if (!TopEqual(a2, b2)) + return false; + // Really: + // stk.push_back(a2); + // stk.push_back(b2); + // break; + // but faster to assign directly and loop. + a = a2; + b = b2; + continue; + } + size_t n = stk.size(); - if (n == 0) - break; - + if (n == 0) + break; + DCHECK_GE(n, 2); - a = stk[n-2]; - b = stk[n-1]; - stk.resize(n-2); - } - - return true; -} - -// Keep in sync with enum RegexpStatusCode in regexp.h + a = stk[n-2]; + b = stk[n-1]; + stk.resize(n-2); + } + + return true; +} + +// Keep in sync with enum RegexpStatusCode in regexp.h static const char *kErrorStrings[] = { - "no error", - "unexpected error", - "invalid escape sequence", - "invalid character class", - "invalid character class range", - "missing ]", - "missing )", + "no error", + "unexpected error", + "invalid escape sequence", + "invalid character class", + "invalid character class range", + "missing ]", + "missing )", "unexpected )", - "trailing \\", - "no argument for repetition operator", - "invalid repetition size", - "bad repetition operator", - "invalid perl operator", - "invalid UTF-8", - "invalid named capture group", -}; - + "trailing \\", + "no argument for repetition operator", + "invalid repetition size", + "bad repetition operator", + "invalid perl operator", + "invalid UTF-8", + "invalid named capture group", +}; + std::string RegexpStatus::CodeText(enum RegexpStatusCode code) { - if (code < 0 || code >= arraysize(kErrorStrings)) - code = kRegexpInternalError; - return kErrorStrings[code]; -} - + if (code < 0 || code >= arraysize(kErrorStrings)) + code = kRegexpInternalError; + return kErrorStrings[code]; +} + std::string RegexpStatus::Text() const { - if (error_arg_.empty()) - return CodeText(code_); + if (error_arg_.empty()) + return CodeText(code_); std::string s; - s.append(CodeText(code_)); - s.append(": "); - s.append(error_arg_.data(), error_arg_.size()); - return s; -} - -void RegexpStatus::Copy(const RegexpStatus& status) { - code_ = status.code_; - error_arg_ = status.error_arg_; -} - -typedef int Ignored; // Walker<void> doesn't exist - -// Walker subclass to count capturing parens in regexp. -class NumCapturesWalker : public Regexp::Walker<Ignored> { - public: - NumCapturesWalker() : ncapture_(0) {} - int ncapture() { return ncapture_; } - - virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { - if (re->op() == kRegexpCapture) - ncapture_++; - return ignored; - } - - virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { + s.append(CodeText(code_)); + s.append(": "); + s.append(error_arg_.data(), error_arg_.size()); + return s; +} + +void RegexpStatus::Copy(const RegexpStatus& status) { + code_ = status.code_; + error_arg_ = status.error_arg_; +} + +typedef int Ignored; // Walker<void> doesn't exist + +// Walker subclass to count capturing parens in regexp. +class NumCapturesWalker : public Regexp::Walker<Ignored> { + public: + NumCapturesWalker() : ncapture_(0) {} + int ncapture() { return ncapture_; } + + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + if (re->op() == kRegexpCapture) + ncapture_++; + return ignored; + } + + virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { // Should never be called: we use Walk(), not WalkExponential(). #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; + LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; #endif - return ignored; - } - - private: - int ncapture_; + return ignored; + } + + private: + int ncapture_; NumCapturesWalker(const NumCapturesWalker&) = delete; NumCapturesWalker& operator=(const NumCapturesWalker&) = delete; -}; - -int Regexp::NumCaptures() { - NumCapturesWalker w; - w.Walk(this, 0); - return w.ncapture(); -} - -// Walker class to build map of named capture groups and their indices. -class NamedCapturesWalker : public Regexp::Walker<Ignored> { - public: - NamedCapturesWalker() : map_(NULL) {} - ~NamedCapturesWalker() { delete map_; } - +}; + +int Regexp::NumCaptures() { + NumCapturesWalker w; + w.Walk(this, 0); + return w.ncapture(); +} + +// Walker class to build map of named capture groups and their indices. +class NamedCapturesWalker : public Regexp::Walker<Ignored> { + public: + NamedCapturesWalker() : map_(NULL) {} + ~NamedCapturesWalker() { delete map_; } + std::map<std::string, int>* TakeMap() { std::map<std::string, int>* m = map_; - map_ = NULL; - return m; - } - + map_ = NULL; + return m; + } + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { - if (re->op() == kRegexpCapture && re->name() != NULL) { - // Allocate map once we find a name. - if (map_ == NULL) + if (re->op() == kRegexpCapture && re->name() != NULL) { + // Allocate map once we find a name. + if (map_ == NULL) map_ = new std::map<std::string, int>; - - // Record first occurrence of each name. - // (The rule is that if you have the same name - // multiple times, only the leftmost one counts.) + + // Record first occurrence of each name. + // (The rule is that if you have the same name + // multiple times, only the leftmost one counts.) map_->insert({*re->name(), re->cap()}); - } - return ignored; - } - - virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { + } + return ignored; + } + + virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { // Should never be called: we use Walk(), not WalkExponential(). #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; + LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; #endif - return ignored; - } - - private: + return ignored; + } + + private: std::map<std::string, int>* map_; NamedCapturesWalker(const NamedCapturesWalker&) = delete; NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; -}; - +}; + std::map<std::string, int>* Regexp::NamedCaptures() { - NamedCapturesWalker w; - w.Walk(this, 0); - return w.TakeMap(); -} - -// Walker class to build map from capture group indices to their names. -class CaptureNamesWalker : public Regexp::Walker<Ignored> { - public: - CaptureNamesWalker() : map_(NULL) {} - ~CaptureNamesWalker() { delete map_; } - + NamedCapturesWalker w; + w.Walk(this, 0); + return w.TakeMap(); +} + +// Walker class to build map from capture group indices to their names. +class CaptureNamesWalker : public Regexp::Walker<Ignored> { + public: + CaptureNamesWalker() : map_(NULL) {} + ~CaptureNamesWalker() { delete map_; } + std::map<int, std::string>* TakeMap() { std::map<int, std::string>* m = map_; - map_ = NULL; - return m; - } - + map_ = NULL; + return m; + } + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { - if (re->op() == kRegexpCapture && re->name() != NULL) { - // Allocate map once we find a name. - if (map_ == NULL) + if (re->op() == kRegexpCapture && re->name() != NULL) { + // Allocate map once we find a name. + if (map_ == NULL) map_ = new std::map<int, std::string>; - - (*map_)[re->cap()] = *re->name(); - } - return ignored; - } - - virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { + + (*map_)[re->cap()] = *re->name(); + } + return ignored; + } + + virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { // Should never be called: we use Walk(), not WalkExponential(). #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; + LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; #endif - return ignored; - } - - private: + return ignored; + } + + private: std::map<int, std::string>* map_; CaptureNamesWalker(const CaptureNamesWalker&) = delete; CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; -}; - +}; + std::map<int, std::string>* Regexp::CaptureNames() { - CaptureNamesWalker w; - w.Walk(this, 0); - return w.TakeMap(); -} - + CaptureNamesWalker w; + w.Walk(this, 0); + return w.TakeMap(); +} + void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, std::string* bytes) { if (latin1) { @@ -671,48 +671,48 @@ void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, } } -// Determines whether regexp matches must be anchored -// with a fixed string prefix. If so, returns the prefix and -// the regexp that remains after the prefix. The prefix might -// be ASCII case-insensitive. +// Determines whether regexp matches must be anchored +// with a fixed string prefix. If so, returns the prefix and +// the regexp that remains after the prefix. The prefix might +// be ASCII case-insensitive. bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix) { prefix->clear(); *foldcase = false; *suffix = NULL; - // No need for a walker: the regexp must be of the form - // 1. some number of ^ anchors - // 2. a literal char or string - // 3. the rest - if (op_ != kRegexpConcat) - return false; - int i = 0; + // No need for a walker: the regexp must be of the form + // 1. some number of ^ anchors + // 2. a literal char or string + // 3. the rest + if (op_ != kRegexpConcat) + return false; + int i = 0; while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) - i++; + i++; if (i == 0 || i >= nsub_) - return false; + return false; Regexp* re = sub()[i]; if (re->op_ != kRegexpLiteral && re->op_ != kRegexpLiteralString) return false; i++; - if (i < nsub_) { - for (int j = i; j < nsub_; j++) + if (i < nsub_) { + for (int j = i; j < nsub_; j++) sub()[j]->Incref(); *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); - } else { + } else { *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); - } + } bool latin1 = (re->parse_flags() & Latin1) != 0; Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; ConvertRunesToBytes(latin1, runes, nrunes, prefix); *foldcase = (re->parse_flags() & FoldCase) != 0; - return true; -} - + return true; +} + // Determines whether regexp matches must be unanchored // with a fixed string prefix. If so, returns the prefix. // The prefix might be ASCII case-insensitive. @@ -741,246 +741,246 @@ bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { return true; } -// Character class builder is a balanced binary tree (STL set) -// containing non-overlapping, non-abutting RuneRanges. -// The less-than operator used in the tree treats two -// ranges as equal if they overlap at all, so that -// lookups for a particular Rune are possible. - -CharClassBuilder::CharClassBuilder() { - nrunes_ = 0; - upper_ = 0; - lower_ = 0; -} - -// Add lo-hi to the class; return whether class got bigger. -bool CharClassBuilder::AddRange(Rune lo, Rune hi) { - if (hi < lo) - return false; - - if (lo <= 'z' && hi >= 'A') { - // Overlaps some alpha, maybe not all. - // Update bitmaps telling which ASCII letters are in the set. +// Character class builder is a balanced binary tree (STL set) +// containing non-overlapping, non-abutting RuneRanges. +// The less-than operator used in the tree treats two +// ranges as equal if they overlap at all, so that +// lookups for a particular Rune are possible. + +CharClassBuilder::CharClassBuilder() { + nrunes_ = 0; + upper_ = 0; + lower_ = 0; +} + +// Add lo-hi to the class; return whether class got bigger. +bool CharClassBuilder::AddRange(Rune lo, Rune hi) { + if (hi < lo) + return false; + + if (lo <= 'z' && hi >= 'A') { + // Overlaps some alpha, maybe not all. + // Update bitmaps telling which ASCII letters are in the set. Rune lo1 = std::max<Rune>(lo, 'A'); Rune hi1 = std::min<Rune>(hi, 'Z'); - if (lo1 <= hi1) - upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); - + if (lo1 <= hi1) + upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); + lo1 = std::max<Rune>(lo, 'a'); hi1 = std::min<Rune>(hi, 'z'); - if (lo1 <= hi1) - lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); - } - - { // Check whether lo, hi is already in the class. - iterator it = ranges_.find(RuneRange(lo, lo)); - if (it != end() && it->lo <= lo && hi <= it->hi) - return false; - } - - // Look for a range abutting lo on the left. - // If it exists, take it out and increase our range. - if (lo > 0) { - iterator it = ranges_.find(RuneRange(lo-1, lo-1)); - if (it != end()) { - lo = it->lo; - if (it->hi > hi) - hi = it->hi; - nrunes_ -= it->hi - it->lo + 1; - ranges_.erase(it); - } - } - - // Look for a range abutting hi on the right. - // If it exists, take it out and increase our range. - if (hi < Runemax) { - iterator it = ranges_.find(RuneRange(hi+1, hi+1)); - if (it != end()) { - hi = it->hi; - nrunes_ -= it->hi - it->lo + 1; - ranges_.erase(it); - } - } - - // Look for ranges between lo and hi. Take them out. - // This is only safe because the set has no overlapping ranges. - // We've already removed any ranges abutting lo and hi, so - // any that overlap [lo, hi] must be contained within it. - for (;;) { - iterator it = ranges_.find(RuneRange(lo, hi)); - if (it == end()) - break; - nrunes_ -= it->hi - it->lo + 1; - ranges_.erase(it); - } - - // Finally, add [lo, hi]. - nrunes_ += hi - lo + 1; - ranges_.insert(RuneRange(lo, hi)); - return true; -} - -void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { - for (iterator it = cc->begin(); it != cc->end(); ++it) - AddRange(it->lo, it->hi); -} - -bool CharClassBuilder::Contains(Rune r) { - return ranges_.find(RuneRange(r, r)) != end(); -} - -// Does the character class behave the same on A-Z as on a-z? -bool CharClassBuilder::FoldsASCII() { - return ((upper_ ^ lower_) & AlphaMask) == 0; -} - -CharClassBuilder* CharClassBuilder::Copy() { - CharClassBuilder* cc = new CharClassBuilder; - for (iterator it = begin(); it != end(); ++it) - cc->ranges_.insert(RuneRange(it->lo, it->hi)); - cc->upper_ = upper_; - cc->lower_ = lower_; - cc->nrunes_ = nrunes_; - return cc; -} - - - -void CharClassBuilder::RemoveAbove(Rune r) { - if (r >= Runemax) - return; - - if (r < 'z') { - if (r < 'a') - lower_ = 0; - else - lower_ &= AlphaMask >> ('z' - r); - } - - if (r < 'Z') { - if (r < 'A') - upper_ = 0; - else - upper_ &= AlphaMask >> ('Z' - r); - } - - for (;;) { - - iterator it = ranges_.find(RuneRange(r + 1, Runemax)); - if (it == end()) - break; - RuneRange rr = *it; - ranges_.erase(it); - nrunes_ -= rr.hi - rr.lo + 1; - if (rr.lo <= r) { - rr.hi = r; - ranges_.insert(rr); - nrunes_ += rr.hi - rr.lo + 1; - } - } -} - -void CharClassBuilder::Negate() { - // Build up negation and then copy in. - // Could edit ranges in place, but C++ won't let me. + if (lo1 <= hi1) + lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); + } + + { // Check whether lo, hi is already in the class. + iterator it = ranges_.find(RuneRange(lo, lo)); + if (it != end() && it->lo <= lo && hi <= it->hi) + return false; + } + + // Look for a range abutting lo on the left. + // If it exists, take it out and increase our range. + if (lo > 0) { + iterator it = ranges_.find(RuneRange(lo-1, lo-1)); + if (it != end()) { + lo = it->lo; + if (it->hi > hi) + hi = it->hi; + nrunes_ -= it->hi - it->lo + 1; + ranges_.erase(it); + } + } + + // Look for a range abutting hi on the right. + // If it exists, take it out and increase our range. + if (hi < Runemax) { + iterator it = ranges_.find(RuneRange(hi+1, hi+1)); + if (it != end()) { + hi = it->hi; + nrunes_ -= it->hi - it->lo + 1; + ranges_.erase(it); + } + } + + // Look for ranges between lo and hi. Take them out. + // This is only safe because the set has no overlapping ranges. + // We've already removed any ranges abutting lo and hi, so + // any that overlap [lo, hi] must be contained within it. + for (;;) { + iterator it = ranges_.find(RuneRange(lo, hi)); + if (it == end()) + break; + nrunes_ -= it->hi - it->lo + 1; + ranges_.erase(it); + } + + // Finally, add [lo, hi]. + nrunes_ += hi - lo + 1; + ranges_.insert(RuneRange(lo, hi)); + return true; +} + +void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { + for (iterator it = cc->begin(); it != cc->end(); ++it) + AddRange(it->lo, it->hi); +} + +bool CharClassBuilder::Contains(Rune r) { + return ranges_.find(RuneRange(r, r)) != end(); +} + +// Does the character class behave the same on A-Z as on a-z? +bool CharClassBuilder::FoldsASCII() { + return ((upper_ ^ lower_) & AlphaMask) == 0; +} + +CharClassBuilder* CharClassBuilder::Copy() { + CharClassBuilder* cc = new CharClassBuilder; + for (iterator it = begin(); it != end(); ++it) + cc->ranges_.insert(RuneRange(it->lo, it->hi)); + cc->upper_ = upper_; + cc->lower_ = lower_; + cc->nrunes_ = nrunes_; + return cc; +} + + + +void CharClassBuilder::RemoveAbove(Rune r) { + if (r >= Runemax) + return; + + if (r < 'z') { + if (r < 'a') + lower_ = 0; + else + lower_ &= AlphaMask >> ('z' - r); + } + + if (r < 'Z') { + if (r < 'A') + upper_ = 0; + else + upper_ &= AlphaMask >> ('Z' - r); + } + + for (;;) { + + iterator it = ranges_.find(RuneRange(r + 1, Runemax)); + if (it == end()) + break; + RuneRange rr = *it; + ranges_.erase(it); + nrunes_ -= rr.hi - rr.lo + 1; + if (rr.lo <= r) { + rr.hi = r; + ranges_.insert(rr); + nrunes_ += rr.hi - rr.lo + 1; + } + } +} + +void CharClassBuilder::Negate() { + // Build up negation and then copy in. + // Could edit ranges in place, but C++ won't let me. std::vector<RuneRange> v; - v.reserve(ranges_.size() + 1); - - // In negation, first range begins at 0, unless - // the current class begins at 0. - iterator it = begin(); - if (it == end()) { - v.push_back(RuneRange(0, Runemax)); - } else { - int nextlo = 0; - if (it->lo == 0) { - nextlo = it->hi + 1; - ++it; - } - for (; it != end(); ++it) { - v.push_back(RuneRange(nextlo, it->lo - 1)); - nextlo = it->hi + 1; - } - if (nextlo <= Runemax) - v.push_back(RuneRange(nextlo, Runemax)); - } - - ranges_.clear(); + v.reserve(ranges_.size() + 1); + + // In negation, first range begins at 0, unless + // the current class begins at 0. + iterator it = begin(); + if (it == end()) { + v.push_back(RuneRange(0, Runemax)); + } else { + int nextlo = 0; + if (it->lo == 0) { + nextlo = it->hi + 1; + ++it; + } + for (; it != end(); ++it) { + v.push_back(RuneRange(nextlo, it->lo - 1)); + nextlo = it->hi + 1; + } + if (nextlo <= Runemax) + v.push_back(RuneRange(nextlo, Runemax)); + } + + ranges_.clear(); for (size_t i = 0; i < v.size(); i++) - ranges_.insert(v[i]); - - upper_ = AlphaMask & ~upper_; - lower_ = AlphaMask & ~lower_; - nrunes_ = Runemax+1 - nrunes_; -} - -// Character class is a sorted list of ranges. -// The ranges are allocated in the same block as the header, -// necessitating a special allocator and Delete method. - + ranges_.insert(v[i]); + + upper_ = AlphaMask & ~upper_; + lower_ = AlphaMask & ~lower_; + nrunes_ = Runemax+1 - nrunes_; +} + +// Character class is a sorted list of ranges. +// The ranges are allocated in the same block as the header, +// necessitating a special allocator and Delete method. + CharClass* CharClass::New(size_t maxranges) { - CharClass* cc; + CharClass* cc; uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; - cc = reinterpret_cast<CharClass*>(data); - cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); - cc->nranges_ = 0; - cc->folds_ascii_ = false; - cc->nrunes_ = 0; - return cc; -} - -void CharClass::Delete() { + cc = reinterpret_cast<CharClass*>(data); + cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); + cc->nranges_ = 0; + cc->folds_ascii_ = false; + cc->nrunes_ = 0; + return cc; +} + +void CharClass::Delete() { uint8_t* data = reinterpret_cast<uint8_t*>(this); - delete[] data; -} - -CharClass* CharClass::Negate() { + delete[] data; +} + +CharClass* CharClass::Negate() { CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1)); - cc->folds_ascii_ = folds_ascii_; - cc->nrunes_ = Runemax + 1 - nrunes_; - int n = 0; - int nextlo = 0; - for (CharClass::iterator it = begin(); it != end(); ++it) { - if (it->lo == nextlo) { - nextlo = it->hi + 1; - } else { - cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); - nextlo = it->hi + 1; - } - } - if (nextlo <= Runemax) - cc->ranges_[n++] = RuneRange(nextlo, Runemax); - cc->nranges_ = n; - return cc; -} - + cc->folds_ascii_ = folds_ascii_; + cc->nrunes_ = Runemax + 1 - nrunes_; + int n = 0; + int nextlo = 0; + for (CharClass::iterator it = begin(); it != end(); ++it) { + if (it->lo == nextlo) { + nextlo = it->hi + 1; + } else { + cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); + nextlo = it->hi + 1; + } + } + if (nextlo <= Runemax) + cc->ranges_[n++] = RuneRange(nextlo, Runemax); + cc->nranges_ = n; + return cc; +} + bool CharClass::Contains(Rune r) const { - RuneRange* rr = ranges_; - int n = nranges_; - while (n > 0) { - int m = n/2; - if (rr[m].hi < r) { - rr += m+1; - n -= m+1; - } else if (r < rr[m].lo) { - n = m; - } else { // rr[m].lo <= r && r <= rr[m].hi - return true; - } - } - return false; -} - -CharClass* CharClassBuilder::GetCharClass() { + RuneRange* rr = ranges_; + int n = nranges_; + while (n > 0) { + int m = n/2; + if (rr[m].hi < r) { + rr += m+1; + n -= m+1; + } else if (r < rr[m].lo) { + n = m; + } else { // rr[m].lo <= r && r <= rr[m].hi + return true; + } + } + return false; +} + +CharClass* CharClassBuilder::GetCharClass() { CharClass* cc = CharClass::New(ranges_.size()); - int n = 0; - for (iterator it = begin(); it != end(); ++it) - cc->ranges_[n++] = *it; - cc->nranges_ = n; + int n = 0; + for (iterator it = begin(); it != end(); ++it) + cc->ranges_[n++] = *it; + cc->nranges_ = n; DCHECK_LE(n, static_cast<int>(ranges_.size())); - cc->nrunes_ = nrunes_; - cc->folds_ascii_ = FoldsASCII(); - return cc; -} - -} // namespace re2 + cc->nrunes_ = nrunes_; + cc->folds_ascii_ = FoldsASCII(); + return cc; +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/regexp.h b/contrib/libs/re2/re2/regexp.h index 73dca2d64e..b6446f9fe5 100644 --- a/contrib/libs/re2/re2/regexp.h +++ b/contrib/libs/re2/re2/regexp.h @@ -1,283 +1,283 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_REGEXP_H_ #define RE2_REGEXP_H_ -// --- SPONSORED LINK -------------------------------------------------- -// If you want to use this library for regular expression matching, -// you should use re2/re2.h, which provides a class RE2 that -// mimics the PCRE interface provided by PCRE's C++ wrappers. -// This header describes the low-level interface used to implement RE2 -// and may change in backwards-incompatible ways from time to time. -// In contrast, RE2's interface will not. -// --------------------------------------------------------------------- - -// Regular expression library: parsing, execution, and manipulation -// of regular expressions. -// -// Any operation that traverses the Regexp structures should be written -// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested -// regular expressions such as x++++++++++++++++++++... might cause recursive -// traversals to overflow the stack. -// -// It is the caller's responsibility to provide appropriate mutual exclusion -// around manipulation of the regexps. RE2 does this. -// -// PARSING -// -// Regexp::Parse parses regular expressions encoded in UTF-8. -// The default syntax is POSIX extended regular expressions, -// with the following changes: -// -// 1. Backreferences (optional in POSIX EREs) are not supported. -// (Supporting them precludes the use of DFA-based -// matching engines.) -// -// 2. Collating elements and collation classes are not supported. -// (No one has needed or wanted them.) -// -// The exact syntax accepted can be modified by passing flags to -// Regexp::Parse. In particular, many of the basic Perl additions -// are available. The flags are documented below (search for LikePerl). -// -// If parsed with the flag Regexp::Latin1, both the regular expression -// and the input to the matching routines are assumed to be encoded in -// Latin-1, not UTF-8. -// -// EXECUTION -// -// Once Regexp has parsed a regular expression, it provides methods -// to search text using that regular expression. These methods are -// implemented via calling out to other regular expression libraries. -// (Let's call them the sublibraries.) -// -// To call a sublibrary, Regexp does not simply prepare a -// string version of the regular expression and hand it to the -// sublibrary. Instead, Regexp prepares, from its own parsed form, the -// corresponding internal representation used by the sublibrary. -// This has the drawback of needing to know the internal representation -// used by the sublibrary, but it has two important benefits: -// -// 1. The syntax and meaning of regular expressions is guaranteed -// to be that used by Regexp's parser, not the syntax expected -// by the sublibrary. Regexp might accept a restricted or -// expanded syntax for regular expressions as compared with -// the sublibrary. As long as Regexp can translate from its -// internal form into the sublibrary's, clients need not know -// exactly which sublibrary they are using. -// -// 2. The sublibrary parsers are bypassed. For whatever reason, -// sublibrary regular expression parsers often have security -// problems. For example, plan9grep's regular expression parser -// has a buffer overflow in its handling of large character -// classes, and PCRE's parser has had buffer overflow problems -// in the past. Security-team requires sandboxing of sublibrary -// regular expression parsers. Avoiding the sublibrary parsers -// avoids the sandbox. -// -// The execution methods we use now are provided by the compiled form, -// Prog, described in prog.h -// -// MANIPULATION -// -// Unlike other regular expression libraries, Regexp makes its parsed -// form accessible to clients, so that client code can analyze the -// parsed regular expressions. - +// --- SPONSORED LINK -------------------------------------------------- +// If you want to use this library for regular expression matching, +// you should use re2/re2.h, which provides a class RE2 that +// mimics the PCRE interface provided by PCRE's C++ wrappers. +// This header describes the low-level interface used to implement RE2 +// and may change in backwards-incompatible ways from time to time. +// In contrast, RE2's interface will not. +// --------------------------------------------------------------------- + +// Regular expression library: parsing, execution, and manipulation +// of regular expressions. +// +// Any operation that traverses the Regexp structures should be written +// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested +// regular expressions such as x++++++++++++++++++++... might cause recursive +// traversals to overflow the stack. +// +// It is the caller's responsibility to provide appropriate mutual exclusion +// around manipulation of the regexps. RE2 does this. +// +// PARSING +// +// Regexp::Parse parses regular expressions encoded in UTF-8. +// The default syntax is POSIX extended regular expressions, +// with the following changes: +// +// 1. Backreferences (optional in POSIX EREs) are not supported. +// (Supporting them precludes the use of DFA-based +// matching engines.) +// +// 2. Collating elements and collation classes are not supported. +// (No one has needed or wanted them.) +// +// The exact syntax accepted can be modified by passing flags to +// Regexp::Parse. In particular, many of the basic Perl additions +// are available. The flags are documented below (search for LikePerl). +// +// If parsed with the flag Regexp::Latin1, both the regular expression +// and the input to the matching routines are assumed to be encoded in +// Latin-1, not UTF-8. +// +// EXECUTION +// +// Once Regexp has parsed a regular expression, it provides methods +// to search text using that regular expression. These methods are +// implemented via calling out to other regular expression libraries. +// (Let's call them the sublibraries.) +// +// To call a sublibrary, Regexp does not simply prepare a +// string version of the regular expression and hand it to the +// sublibrary. Instead, Regexp prepares, from its own parsed form, the +// corresponding internal representation used by the sublibrary. +// This has the drawback of needing to know the internal representation +// used by the sublibrary, but it has two important benefits: +// +// 1. The syntax and meaning of regular expressions is guaranteed +// to be that used by Regexp's parser, not the syntax expected +// by the sublibrary. Regexp might accept a restricted or +// expanded syntax for regular expressions as compared with +// the sublibrary. As long as Regexp can translate from its +// internal form into the sublibrary's, clients need not know +// exactly which sublibrary they are using. +// +// 2. The sublibrary parsers are bypassed. For whatever reason, +// sublibrary regular expression parsers often have security +// problems. For example, plan9grep's regular expression parser +// has a buffer overflow in its handling of large character +// classes, and PCRE's parser has had buffer overflow problems +// in the past. Security-team requires sandboxing of sublibrary +// regular expression parsers. Avoiding the sublibrary parsers +// avoids the sandbox. +// +// The execution methods we use now are provided by the compiled form, +// Prog, described in prog.h +// +// MANIPULATION +// +// Unlike other regular expression libraries, Regexp makes its parsed +// form accessible to clients, so that client code can analyze the +// parsed regular expressions. + #include <stddef.h> #include <stdint.h> #include <map> #include <set> #include <string> - + #include "util/util.h" #include "util/logging.h" #include "util/utf.h" #include "re2/stringpiece.h" - -namespace re2 { - -// Keep in sync with string list kOpcodeNames[] in testing/dump.cc -enum RegexpOp { - // Matches no strings. - kRegexpNoMatch = 1, - - // Matches empty string. - kRegexpEmptyMatch, - - // Matches rune_. - kRegexpLiteral, - - // Matches runes_. - kRegexpLiteralString, - - // Matches concatenation of sub_[0..nsub-1]. - kRegexpConcat, - // Matches union of sub_[0..nsub-1]. - kRegexpAlternate, - - // Matches sub_[0] zero or more times. - kRegexpStar, - // Matches sub_[0] one or more times. - kRegexpPlus, - // Matches sub_[0] zero or one times. - kRegexpQuest, - - // Matches sub_[0] at least min_ times, at most max_ times. - // max_ == -1 means no upper limit. - kRegexpRepeat, - - // Parenthesized (capturing) subexpression. Index is cap_. - // Optionally, capturing name is name_. - kRegexpCapture, - - // Matches any character. - kRegexpAnyChar, - - // Matches any byte [sic]. - kRegexpAnyByte, - - // Matches empty string at beginning of line. - kRegexpBeginLine, - // Matches empty string at end of line. - kRegexpEndLine, - - // Matches word boundary "\b". - kRegexpWordBoundary, - // Matches not-a-word boundary "\B". - kRegexpNoWordBoundary, - - // Matches empty string at beginning of text. - kRegexpBeginText, - // Matches empty string at end of text. - kRegexpEndText, - - // Matches character class given by cc_. - kRegexpCharClass, - - // Forces match of entire expression right now, - // with match ID match_id_ (used by RE2::Set). - kRegexpHaveMatch, - - kMaxRegexpOp = kRegexpHaveMatch, -}; - -// Keep in sync with string list in regexp.cc -enum RegexpStatusCode { - // No error - kRegexpSuccess = 0, - - // Unexpected error - kRegexpInternalError, - - // Parse errors - kRegexpBadEscape, // bad escape sequence - kRegexpBadCharClass, // bad character class - kRegexpBadCharRange, // bad character class range - kRegexpMissingBracket, // missing closing ] - kRegexpMissingParen, // missing closing ) + +namespace re2 { + +// Keep in sync with string list kOpcodeNames[] in testing/dump.cc +enum RegexpOp { + // Matches no strings. + kRegexpNoMatch = 1, + + // Matches empty string. + kRegexpEmptyMatch, + + // Matches rune_. + kRegexpLiteral, + + // Matches runes_. + kRegexpLiteralString, + + // Matches concatenation of sub_[0..nsub-1]. + kRegexpConcat, + // Matches union of sub_[0..nsub-1]. + kRegexpAlternate, + + // Matches sub_[0] zero or more times. + kRegexpStar, + // Matches sub_[0] one or more times. + kRegexpPlus, + // Matches sub_[0] zero or one times. + kRegexpQuest, + + // Matches sub_[0] at least min_ times, at most max_ times. + // max_ == -1 means no upper limit. + kRegexpRepeat, + + // Parenthesized (capturing) subexpression. Index is cap_. + // Optionally, capturing name is name_. + kRegexpCapture, + + // Matches any character. + kRegexpAnyChar, + + // Matches any byte [sic]. + kRegexpAnyByte, + + // Matches empty string at beginning of line. + kRegexpBeginLine, + // Matches empty string at end of line. + kRegexpEndLine, + + // Matches word boundary "\b". + kRegexpWordBoundary, + // Matches not-a-word boundary "\B". + kRegexpNoWordBoundary, + + // Matches empty string at beginning of text. + kRegexpBeginText, + // Matches empty string at end of text. + kRegexpEndText, + + // Matches character class given by cc_. + kRegexpCharClass, + + // Forces match of entire expression right now, + // with match ID match_id_ (used by RE2::Set). + kRegexpHaveMatch, + + kMaxRegexpOp = kRegexpHaveMatch, +}; + +// Keep in sync with string list in regexp.cc +enum RegexpStatusCode { + // No error + kRegexpSuccess = 0, + + // Unexpected error + kRegexpInternalError, + + // Parse errors + kRegexpBadEscape, // bad escape sequence + kRegexpBadCharClass, // bad character class + kRegexpBadCharRange, // bad character class range + kRegexpMissingBracket, // missing closing ] + kRegexpMissingParen, // missing closing ) kRegexpUnexpectedParen, // unexpected closing ) - kRegexpTrailingBackslash, // at end of regexp - kRegexpRepeatArgument, // repeat argument missing, e.g. "*" - kRegexpRepeatSize, // bad repetition argument - kRegexpRepeatOp, // bad repetition operator - kRegexpBadPerlOp, // bad perl operator - kRegexpBadUTF8, // invalid UTF-8 in regexp - kRegexpBadNamedCapture, // bad named capture -}; - -// Error status for certain operations. -class RegexpStatus { - public: - RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} - ~RegexpStatus() { delete tmp_; } - + kRegexpTrailingBackslash, // at end of regexp + kRegexpRepeatArgument, // repeat argument missing, e.g. "*" + kRegexpRepeatSize, // bad repetition argument + kRegexpRepeatOp, // bad repetition operator + kRegexpBadPerlOp, // bad perl operator + kRegexpBadUTF8, // invalid UTF-8 in regexp + kRegexpBadNamedCapture, // bad named capture +}; + +// Error status for certain operations. +class RegexpStatus { + public: + RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} + ~RegexpStatus() { delete tmp_; } + void set_code(RegexpStatusCode code) { code_ = code; } - void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } + void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; } RegexpStatusCode code() const { return code_; } - const StringPiece& error_arg() const { return error_arg_; } - bool ok() const { return code() == kRegexpSuccess; } - - // Copies state from status. - void Copy(const RegexpStatus& status); - - // Returns text equivalent of code, e.g.: - // "Bad character class" + const StringPiece& error_arg() const { return error_arg_; } + bool ok() const { return code() == kRegexpSuccess; } + + // Copies state from status. + void Copy(const RegexpStatus& status); + + // Returns text equivalent of code, e.g.: + // "Bad character class" static std::string CodeText(RegexpStatusCode code); - - // Returns text describing error, e.g.: - // "Bad character class: [z-a]" + + // Returns text describing error, e.g.: + // "Bad character class: [z-a]" std::string Text() const; - - private: + + private: RegexpStatusCode code_; // Kind of error StringPiece error_arg_; // Piece of regexp containing syntax error. std::string* tmp_; // Temporary storage, possibly where error_arg_ is. - + RegexpStatus(const RegexpStatus&) = delete; RegexpStatus& operator=(const RegexpStatus&) = delete; -}; - -// Compiled form; see prog.h -class Prog; - -struct RuneRange { - RuneRange() : lo(0), hi(0) { } - RuneRange(int l, int h) : lo(l), hi(h) { } - Rune lo; - Rune hi; -}; - -// Less-than on RuneRanges treats a == b if they overlap at all. -// This lets us look in a set to find the range covering a particular Rune. -struct RuneRangeLess { - bool operator()(const RuneRange& a, const RuneRange& b) const { - return a.hi < b.lo; - } -}; - -class CharClassBuilder; - -class CharClass { - public: - void Delete(); - - typedef RuneRange* iterator; - iterator begin() { return ranges_; } - iterator end() { return ranges_ + nranges_; } - - int size() { return nrunes_; } - bool empty() { return nrunes_ == 0; } - bool full() { return nrunes_ == Runemax+1; } - bool FoldsASCII() { return folds_ascii_; } - +}; + +// Compiled form; see prog.h +class Prog; + +struct RuneRange { + RuneRange() : lo(0), hi(0) { } + RuneRange(int l, int h) : lo(l), hi(h) { } + Rune lo; + Rune hi; +}; + +// Less-than on RuneRanges treats a == b if they overlap at all. +// This lets us look in a set to find the range covering a particular Rune. +struct RuneRangeLess { + bool operator()(const RuneRange& a, const RuneRange& b) const { + return a.hi < b.lo; + } +}; + +class CharClassBuilder; + +class CharClass { + public: + void Delete(); + + typedef RuneRange* iterator; + iterator begin() { return ranges_; } + iterator end() { return ranges_ + nranges_; } + + int size() { return nrunes_; } + bool empty() { return nrunes_ == 0; } + bool full() { return nrunes_ == Runemax+1; } + bool FoldsASCII() { return folds_ascii_; } + bool Contains(Rune r) const; - CharClass* Negate(); - - private: - CharClass(); // not implemented - ~CharClass(); // not implemented + CharClass* Negate(); + + private: + CharClass(); // not implemented + ~CharClass(); // not implemented static CharClass* New(size_t maxranges); - - friend class CharClassBuilder; - - bool folds_ascii_; - int nrunes_; - RuneRange *ranges_; - int nranges_; + + friend class CharClassBuilder; + + bool folds_ascii_; + int nrunes_; + RuneRange *ranges_; + int nranges_; CharClass(const CharClass&) = delete; CharClass& operator=(const CharClass&) = delete; -}; - -class Regexp { - public: - - // Flags for parsing. Can be ORed together. - enum ParseFlags { +}; + +class Regexp { + public: + + // Flags for parsing. Can be ORed together. + enum ParseFlags { NoParseFlags = 0, FoldCase = 1<<0, // Fold case during matching (case-insensitive). Literal = 1<<1, // Treat s as literal string instead of a regexp. @@ -309,139 +309,139 @@ class Regexp { NeverNL = 1<<11, // Never match NL, even if the regexp mentions // it explicitly. NeverCapture = 1<<12, // Parse all parens as non-capturing. - - // As close to Perl as we can get. + + // As close to Perl as we can get. LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | UnicodeGroups, - - // Internal use only. + + // Internal use only. WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text AllParseFlags = (1<<14)-1, - }; - - // Get. No set, Regexps are logically immutable once created. - RegexpOp op() { return static_cast<RegexpOp>(op_); } - int nsub() { return nsub_; } + }; + + // Get. No set, Regexps are logically immutable once created. + RegexpOp op() { return static_cast<RegexpOp>(op_); } + int nsub() { return nsub_; } bool simple() { return simple_ != 0; } ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); } - int Ref(); // For testing. - - Regexp** sub() { - if(nsub_ <= 1) - return &subone_; - else - return submany_; - } - - int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; } - int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; } - Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; } - CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; } - int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; } + int Ref(); // For testing. + + Regexp** sub() { + if(nsub_ <= 1) + return &subone_; + else + return submany_; + } + + int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; } + int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; } + Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; } + CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; } + int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; } const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } - Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; } - int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; } - int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; } - - // Increments reference count, returns object as convenience. - Regexp* Incref(); - - // Decrements reference count and deletes this object if count reaches 0. - void Decref(); - - // Parses string s to produce regular expression, returned. - // Caller must release return value with re->Decref(). - // On failure, sets *status (if status != NULL) and returns NULL. - static Regexp* Parse(const StringPiece& s, ParseFlags flags, - RegexpStatus* status); - - // Returns a _new_ simplified version of the current regexp. - // Does not edit the current regexp. - // Caller must release return value with re->Decref(). - // Simplified means that counted repetition has been rewritten - // into simpler terms and all Perl/POSIX features have been - // removed. The result will capture exactly the same - // subexpressions the original did, unless formatted with ToString. - Regexp* Simplify(); + Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; } + int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; } + int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; } + + // Increments reference count, returns object as convenience. + Regexp* Incref(); + + // Decrements reference count and deletes this object if count reaches 0. + void Decref(); + + // Parses string s to produce regular expression, returned. + // Caller must release return value with re->Decref(). + // On failure, sets *status (if status != NULL) and returns NULL. + static Regexp* Parse(const StringPiece& s, ParseFlags flags, + RegexpStatus* status); + + // Returns a _new_ simplified version of the current regexp. + // Does not edit the current regexp. + // Caller must release return value with re->Decref(). + // Simplified means that counted repetition has been rewritten + // into simpler terms and all Perl/POSIX features have been + // removed. The result will capture exactly the same + // subexpressions the original did, unless formatted with ToString. + Regexp* Simplify(); friend class CoalesceWalker; - friend class SimplifyWalker; - - // Parses the regexp src and then simplifies it and sets *dst to the - // string representation of the simplified form. Returns true on success. - // Returns false and sets *status (if status != NULL) on parse error. - static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, + friend class SimplifyWalker; + + // Parses the regexp src and then simplifies it and sets *dst to the + // string representation of the simplified form. Returns true on success. + // Returns false and sets *status (if status != NULL) on parse error. + static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, std::string* dst, RegexpStatus* status); - - // Returns the number of capturing groups in the regexp. - int NumCaptures(); - friend class NumCapturesWalker; - - // Returns a map from names to capturing group indices, - // or NULL if the regexp contains no named capture groups. - // The caller is responsible for deleting the map. + + // Returns the number of capturing groups in the regexp. + int NumCaptures(); + friend class NumCapturesWalker; + + // Returns a map from names to capturing group indices, + // or NULL if the regexp contains no named capture groups. + // The caller is responsible for deleting the map. std::map<std::string, int>* NamedCaptures(); - - // Returns a map from capturing group indices to capturing group - // names or NULL if the regexp contains no named capture groups. The - // caller is responsible for deleting the map. + + // Returns a map from capturing group indices to capturing group + // names or NULL if the regexp contains no named capture groups. The + // caller is responsible for deleting the map. std::map<int, std::string>* CaptureNames(); - - // Returns a string representation of the current regexp, - // using as few parentheses as possible. + + // Returns a string representation of the current regexp, + // using as few parentheses as possible. std::string ToString(); - - // Convenience functions. They consume the passed reference, - // so in many cases you should use, e.g., Plus(re->Incref(), flags). - // They do not consume allocated arrays like subs or runes. - static Regexp* Plus(Regexp* sub, ParseFlags flags); - static Regexp* Star(Regexp* sub, ParseFlags flags); - static Regexp* Quest(Regexp* sub, ParseFlags flags); - static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags); - static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags); - static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap); - static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max); - static Regexp* NewLiteral(Rune rune, ParseFlags flags); - static Regexp* NewCharClass(CharClass* cc, ParseFlags flags); - static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags); - static Regexp* HaveMatch(int match_id, ParseFlags flags); - - // Like Alternate but does not factor out common prefixes. - static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags); - - // Debugging function. Returns string format for regexp - // that makes structure clear. Does NOT use regexp syntax. + + // Convenience functions. They consume the passed reference, + // so in many cases you should use, e.g., Plus(re->Incref(), flags). + // They do not consume allocated arrays like subs or runes. + static Regexp* Plus(Regexp* sub, ParseFlags flags); + static Regexp* Star(Regexp* sub, ParseFlags flags); + static Regexp* Quest(Regexp* sub, ParseFlags flags); + static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags); + static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags); + static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap); + static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max); + static Regexp* NewLiteral(Rune rune, ParseFlags flags); + static Regexp* NewCharClass(CharClass* cc, ParseFlags flags); + static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags); + static Regexp* HaveMatch(int match_id, ParseFlags flags); + + // Like Alternate but does not factor out common prefixes. + static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags); + + // Debugging function. Returns string format for regexp + // that makes structure clear. Does NOT use regexp syntax. std::string Dump(); - - // Helper traversal class, defined fully in walker-inl.h. - template<typename T> class Walker; - - // Compile to Prog. See prog.h - // Reverse prog expects to be run over text backward. - // Construction and execution of prog will - // stay within approximately max_mem bytes of memory. - // If max_mem <= 0, a reasonable default is used. + + // Helper traversal class, defined fully in walker-inl.h. + template<typename T> class Walker; + + // Compile to Prog. See prog.h + // Reverse prog expects to be run over text backward. + // Construction and execution of prog will + // stay within approximately max_mem bytes of memory. + // If max_mem <= 0, a reasonable default is used. Prog* CompileToProg(int64_t max_mem); Prog* CompileToReverseProg(int64_t max_mem); - - // Whether to expect this library to find exactly the same answer as PCRE - // when running this regexp. Most regexps do mimic PCRE exactly, but a few - // obscure cases behave differently. Technically this is more a property - // of the Prog than the Regexp, but the computation is much easier to do - // on the Regexp. See mimics_pcre.cc for the exact conditions. - bool MimicsPCRE(); - - // Benchmarking function. - void NullWalk(); - - // Whether every match of this regexp must be anchored and - // begin with a non-empty fixed string (perhaps after ASCII - // case-folding). If so, returns the prefix and the sub-regexp that - // follows it. + + // Whether to expect this library to find exactly the same answer as PCRE + // when running this regexp. Most regexps do mimic PCRE exactly, but a few + // obscure cases behave differently. Technically this is more a property + // of the Prog than the Regexp, but the computation is much easier to do + // on the Regexp. See mimics_pcre.cc for the exact conditions. + bool MimicsPCRE(); + + // Benchmarking function. + void NullWalk(); + + // Whether every match of this regexp must be anchored and + // begin with a non-empty fixed string (perhaps after ASCII + // case-folding). If so, returns the prefix and the sub-regexp that + // follows it. // Callers should expect *prefix, *foldcase and *suffix to be "zeroed" // regardless of the return value. bool RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix); - + // Whether every match of this regexp must be unanchored and // begin with a non-empty fixed string (perhaps after ASCII // case-folding). If so, returns the prefix. @@ -453,213 +453,213 @@ class Regexp { // FOR FUZZING ONLY. static void FUZZING_ONLY_set_maximum_repeat_count(int i); - private: - // Constructor allocates vectors as appropriate for operator. - explicit Regexp(RegexpOp op, ParseFlags parse_flags); - - // Use Decref() instead of delete to release Regexps. - // This is private to catch deletes at compile time. - ~Regexp(); - void Destroy(); - bool QuickDestroy(); - - // Helpers for Parse. Listed here so they can edit Regexps. - class ParseState; - - friend class ParseState; - friend bool ParseCharClass(StringPiece* s, Regexp** out_re, - RegexpStatus* status); - - // Helper for testing [sic]. - friend bool RegexpEqualTestingOnly(Regexp*, Regexp*); - - // Computes whether Regexp is already simple. - bool ComputeSimple(); - + private: + // Constructor allocates vectors as appropriate for operator. + explicit Regexp(RegexpOp op, ParseFlags parse_flags); + + // Use Decref() instead of delete to release Regexps. + // This is private to catch deletes at compile time. + ~Regexp(); + void Destroy(); + bool QuickDestroy(); + + // Helpers for Parse. Listed here so they can edit Regexps. + class ParseState; + + friend class ParseState; + friend bool ParseCharClass(StringPiece* s, Regexp** out_re, + RegexpStatus* status); + + // Helper for testing [sic]. + friend bool RegexpEqualTestingOnly(Regexp*, Regexp*); + + // Computes whether Regexp is already simple. + bool ComputeSimple(); + // Constructor that generates a Star, Plus or Quest, // squashing the pair if sub is also a Star, Plus or Quest. static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags); - // Constructor that generates a concatenation or alternation, - // enforcing the limit on the number of subexpressions for - // a particular Regexp. - static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, - ParseFlags flags, bool can_factor); - - // Returns the leading string that re starts with. - // The returned Rune* points into a piece of re, - // so it must not be used after the caller calls re->Decref(). - static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags); - - // Removes the first n leading runes from the beginning of re. - // Edits re in place. - static void RemoveLeadingString(Regexp* re, int n); - - // Returns the leading regexp in re's top-level concatenation. - // The returned Regexp* points at re or a sub-expression of re, - // so it must not be used after the caller calls re->Decref(). - static Regexp* LeadingRegexp(Regexp* re); - - // Removes LeadingRegexp(re) from re and returns the remainder. - // Might edit re in place. - static Regexp* RemoveLeadingRegexp(Regexp* re); - - // Simplifies an alternation of literal strings by factoring out - // common prefixes. - static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); + // Constructor that generates a concatenation or alternation, + // enforcing the limit on the number of subexpressions for + // a particular Regexp. + static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, + ParseFlags flags, bool can_factor); + + // Returns the leading string that re starts with. + // The returned Rune* points into a piece of re, + // so it must not be used after the caller calls re->Decref(). + static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags); + + // Removes the first n leading runes from the beginning of re. + // Edits re in place. + static void RemoveLeadingString(Regexp* re, int n); + + // Returns the leading regexp in re's top-level concatenation. + // The returned Regexp* points at re or a sub-expression of re, + // so it must not be used after the caller calls re->Decref(). + static Regexp* LeadingRegexp(Regexp* re); + + // Removes LeadingRegexp(re) from re and returns the remainder. + // Might edit re in place. + static Regexp* RemoveLeadingRegexp(Regexp* re); + + // Simplifies an alternation of literal strings by factoring out + // common prefixes. + static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); friend class FactorAlternationImpl; - - // Is a == b? Only efficient on regexps that have not been through - // Simplify yet - the expansion of a kRegexpRepeat will make this - // take a long time. Do not call on such regexps, hence private. - static bool Equal(Regexp* a, Regexp* b); - - // Allocate space for n sub-regexps. - void AllocSub(int n) { + + // Is a == b? Only efficient on regexps that have not been through + // Simplify yet - the expansion of a kRegexpRepeat will make this + // take a long time. Do not call on such regexps, hence private. + static bool Equal(Regexp* a, Regexp* b); + + // Allocate space for n sub-regexps. + void AllocSub(int n) { DCHECK(n >= 0 && static_cast<uint16_t>(n) == n); - if (n > 1) - submany_ = new Regexp*[n]; + if (n > 1) + submany_ = new Regexp*[n]; nsub_ = static_cast<uint16_t>(n); - } - - // Add Rune to LiteralString - void AddRuneToString(Rune r); - - // Swaps this with that, in place. - void Swap(Regexp *that); - - // Operator. See description of operators above. + } + + // Add Rune to LiteralString + void AddRuneToString(Rune r); + + // Swaps this with that, in place. + void Swap(Regexp *that); + + // Operator. See description of operators above. // uint8_t instead of RegexpOp to control space usage. uint8_t op_; - - // Is this regexp structure already simple - // (has it been returned by Simplify)? + + // Is this regexp structure already simple + // (has it been returned by Simplify)? // uint8_t instead of bool to control space usage. uint8_t simple_; - - // Flags saved from parsing and used during execution. - // (Only FoldCase is used.) + + // Flags saved from parsing and used during execution. + // (Only FoldCase is used.) // uint16_t instead of ParseFlags to control space usage. uint16_t parse_flags_; - - // Reference count. Exists so that SimplifyRegexp can build - // regexp structures that are dags rather than trees to avoid - // exponential blowup in space requirements. + + // Reference count. Exists so that SimplifyRegexp can build + // regexp structures that are dags rather than trees to avoid + // exponential blowup in space requirements. // uint16_t to control space usage. - // The standard regexp routines will never generate a + // The standard regexp routines will never generate a // ref greater than the maximum repeat count (kMaxRepeat), - // but even so, Incref and Decref consult an overflow map - // when ref_ reaches kMaxRef. + // but even so, Incref and Decref consult an overflow map + // when ref_ reaches kMaxRef. uint16_t ref_; static const uint16_t kMaxRef = 0xffff; - - // Subexpressions. + + // Subexpressions. // uint16_t to control space usage. - // Concat and Alternate handle larger numbers of subexpressions - // by building concatenation or alternation trees. - // Other routines should call Concat or Alternate instead of - // filling in sub() by hand. + // Concat and Alternate handle larger numbers of subexpressions + // by building concatenation or alternation trees. + // Other routines should call Concat or Alternate instead of + // filling in sub() by hand. uint16_t nsub_; static const uint16_t kMaxNsub = 0xffff; - union { - Regexp** submany_; // if nsub_ > 1 - Regexp* subone_; // if nsub_ == 1 - }; - - // Extra space for parse and teardown stacks. - Regexp* down_; - - // Arguments to operator. See description of operators above. - union { - struct { // Repeat - int max_; - int min_; - }; - struct { // Capture - int cap_; + union { + Regexp** submany_; // if nsub_ > 1 + Regexp* subone_; // if nsub_ == 1 + }; + + // Extra space for parse and teardown stacks. + Regexp* down_; + + // Arguments to operator. See description of operators above. + union { + struct { // Repeat + int max_; + int min_; + }; + struct { // Capture + int cap_; std::string* name_; - }; - struct { // LiteralString - int nrunes_; - Rune* runes_; - }; - struct { // CharClass - // These two could be in separate union members, - // but it wouldn't save any space (there are other two-word structs) - // and keeping them separate avoids confusion during parsing. - CharClass* cc_; - CharClassBuilder* ccb_; - }; - Rune rune_; // Literal - int match_id_; // HaveMatch - void *the_union_[2]; // as big as any other element, for memset - }; - + }; + struct { // LiteralString + int nrunes_; + Rune* runes_; + }; + struct { // CharClass + // These two could be in separate union members, + // but it wouldn't save any space (there are other two-word structs) + // and keeping them separate avoids confusion during parsing. + CharClass* cc_; + CharClassBuilder* ccb_; + }; + Rune rune_; // Literal + int match_id_; // HaveMatch + void *the_union_[2]; // as big as any other element, for memset + }; + Regexp(const Regexp&) = delete; Regexp& operator=(const Regexp&) = delete; -}; - -// Character class set: contains non-overlapping, non-abutting RuneRanges. +}; + +// Character class set: contains non-overlapping, non-abutting RuneRanges. typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet; - -class CharClassBuilder { - public: - CharClassBuilder(); - - typedef RuneRangeSet::iterator iterator; - iterator begin() { return ranges_.begin(); } - iterator end() { return ranges_.end(); } - - int size() { return nrunes_; } - bool empty() { return nrunes_ == 0; } - bool full() { return nrunes_ == Runemax+1; } - - bool Contains(Rune r); - bool FoldsASCII(); - bool AddRange(Rune lo, Rune hi); // returns whether class changed - CharClassBuilder* Copy(); - void AddCharClass(CharClassBuilder* cc); - void Negate(); - void RemoveAbove(Rune r); - CharClass* GetCharClass(); - void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); - - private: + +class CharClassBuilder { + public: + CharClassBuilder(); + + typedef RuneRangeSet::iterator iterator; + iterator begin() { return ranges_.begin(); } + iterator end() { return ranges_.end(); } + + int size() { return nrunes_; } + bool empty() { return nrunes_ == 0; } + bool full() { return nrunes_ == Runemax+1; } + + bool Contains(Rune r); + bool FoldsASCII(); + bool AddRange(Rune lo, Rune hi); // returns whether class changed + CharClassBuilder* Copy(); + void AddCharClass(CharClassBuilder* cc); + void Negate(); + void RemoveAbove(Rune r); + CharClass* GetCharClass(); + void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); + + private: static const uint32_t AlphaMask = (1<<26) - 1; uint32_t upper_; // bitmap of A-Z uint32_t lower_; // bitmap of a-z - int nrunes_; - RuneRangeSet ranges_; + int nrunes_; + RuneRangeSet ranges_; CharClassBuilder(const CharClassBuilder&) = delete; CharClassBuilder& operator=(const CharClassBuilder&) = delete; -}; - +}; + // Bitwise ops on ParseFlags produce ParseFlags. inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b) { return static_cast<Regexp::ParseFlags>( static_cast<int>(a) | static_cast<int>(b)); -} - +} + inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b) { return static_cast<Regexp::ParseFlags>( static_cast<int>(a) ^ static_cast<int>(b)); -} - +} + inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b) { return static_cast<Regexp::ParseFlags>( static_cast<int>(a) & static_cast<int>(b)); -} - +} + inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) { // Attempting to produce a value out of enum's range has undefined behaviour. return static_cast<Regexp::ParseFlags>( ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags)); -} - -} // namespace re2 +} + +} // namespace re2 #endif // RE2_REGEXP_H_ diff --git a/contrib/libs/re2/re2/set.cc b/contrib/libs/re2/re2/set.cc index df27ca5fd0..18705663a5 100644 --- a/contrib/libs/re2/re2/set.cc +++ b/contrib/libs/re2/re2/set.cc @@ -1,9 +1,9 @@ -// Copyright 2010 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #include "re2/set.h" - + #include <stddef.h> #include <algorithm> #include <memory> @@ -12,26 +12,26 @@ #include "util/util.h" #include "util/logging.h" #include "re2/pod_array.h" -#include "re2/prog.h" +#include "re2/prog.h" #include "re2/re2.h" -#include "re2/regexp.h" +#include "re2/regexp.h" #include "re2/stringpiece.h" - + namespace re2 { - + RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) : options_(options), anchor_(anchor), compiled_(false), size_(0) { options_.set_never_capture(true); // might unblock some optimisations -} - -RE2::Set::~Set() { +} + +RE2::Set::~Set() { for (size_t i = 0; i < elem_.size(); i++) elem_[i].second->Decref(); -} - +} + RE2::Set::Set(Set&& other) : options_(other.options_), anchor_(other.anchor_), @@ -53,52 +53,52 @@ RE2::Set& RE2::Set::operator=(Set&& other) { } int RE2::Set::Add(const StringPiece& pattern, std::string* error) { - if (compiled_) { + if (compiled_) { LOG(DFATAL) << "RE2::Set::Add() called after compiling"; - return -1; - } - - Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( - options_.ParseFlags()); - RegexpStatus status; + return -1; + } + + Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( + options_.ParseFlags()); + RegexpStatus status; re2::Regexp* re = Regexp::Parse(pattern, pf, &status); - if (re == NULL) { - if (error != NULL) - *error = status.Text(); - if (options_.log_errors()) - LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text(); - return -1; - } - - // Concatenate with match index and push on vector. + if (re == NULL) { + if (error != NULL) + *error = status.Text(); + if (options_.log_errors()) + LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text(); + return -1; + } + + // Concatenate with match index and push on vector. int n = static_cast<int>(elem_.size()); re2::Regexp* m = re2::Regexp::HaveMatch(n, pf); - if (re->op() == kRegexpConcat) { - int nsub = re->nsub(); + if (re->op() == kRegexpConcat) { + int nsub = re->nsub(); PODArray<re2::Regexp*> sub(nsub + 1); - for (int i = 0; i < nsub; i++) - sub[i] = re->sub()[i]->Incref(); - sub[nsub] = m; - re->Decref(); + for (int i = 0; i < nsub; i++) + sub[i] = re->sub()[i]->Incref(); + sub[nsub] = m; + re->Decref(); re = re2::Regexp::Concat(sub.data(), nsub + 1, pf); - } else { + } else { re2::Regexp* sub[2]; - sub[0] = re; - sub[1] = m; + sub[0] = re; + sub[1] = m; re = re2::Regexp::Concat(sub, 2, pf); - } + } elem_.emplace_back(std::string(pattern), re); - return n; -} - -bool RE2::Set::Compile() { - if (compiled_) { + return n; +} + +bool RE2::Set::Compile() { + if (compiled_) { LOG(DFATAL) << "RE2::Set::Compile() called more than once"; - return false; - } - compiled_ = true; + return false; + } + compiled_ = true; size_ = static_cast<int>(elem_.size()); - + // Sort the elements by their patterns. This is good enough for now // until we have a Regexp comparison function. (Maybe someday...) std::sort(elem_.begin(), elem_.end(), @@ -112,27 +112,27 @@ bool RE2::Set::Compile() { elem_.clear(); elem_.shrink_to_fit(); - Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( - options_.ParseFlags()); + Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( + options_.ParseFlags()); re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf); prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem())); - re->Decref(); + re->Decref(); return prog_ != nullptr; } - + bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const { return Match(text, v, NULL); -} - +} + bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, ErrorInfo* error_info) const { - if (!compiled_) { + if (!compiled_) { LOG(DFATAL) << "RE2::Set::Match() called before compiling"; if (error_info != NULL) error_info->kind = kNotCompiled; - return false; - } + return false; + } #ifdef RE2_HAVE_THREAD_LOCAL hooks::context = NULL; #endif @@ -157,8 +157,8 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, if (ret == false) { if (error_info != NULL) error_info->kind = kNoError; - return false; - } + return false; + } if (v != NULL) { if (matches->empty()) { LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; @@ -170,7 +170,7 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, } if (error_info != NULL) error_info->kind = kNoError; - return true; -} + return true; +} } // namespace re2 diff --git a/contrib/libs/re2/re2/simplify.cc b/contrib/libs/re2/re2/simplify.cc index e80cbca3fa..663d5fcd45 100644 --- a/contrib/libs/re2/re2/simplify.cc +++ b/contrib/libs/re2/re2/simplify.cc @@ -1,104 +1,104 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Rewrite POSIX and other features in re -// to use simple extended regular expression features. -// Also sort and simplify character classes. - +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Rewrite POSIX and other features in re +// to use simple extended regular expression features. +// Also sort and simplify character classes. + #include <string> #include "util/util.h" #include "util/logging.h" #include "util/utf.h" #include "re2/pod_array.h" -#include "re2/regexp.h" -#include "re2/walker-inl.h" - -namespace re2 { - -// Parses the regexp src and then simplifies it and sets *dst to the -// string representation of the simplified form. Returns true on success. -// Returns false and sets *error (if error != NULL) on error. -bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// Parses the regexp src and then simplifies it and sets *dst to the +// string representation of the simplified form. Returns true on success. +// Returns false and sets *error (if error != NULL) on error. +bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, std::string* dst, RegexpStatus* status) { - Regexp* re = Parse(src, flags, status); - if (re == NULL) - return false; - Regexp* sre = re->Simplify(); - re->Decref(); - if (sre == NULL) { - if (status) { - status->set_code(kRegexpInternalError); - status->set_error_arg(src); - } - return false; - } - *dst = sre->ToString(); - sre->Decref(); - return true; -} - -// Assuming the simple_ flags on the children are accurate, -// is this Regexp* simple? -bool Regexp::ComputeSimple() { - Regexp** subs; - switch (op_) { - case kRegexpNoMatch: - case kRegexpEmptyMatch: - case kRegexpLiteral: - case kRegexpLiteralString: - case kRegexpBeginLine: - case kRegexpEndLine: - case kRegexpBeginText: - case kRegexpWordBoundary: - case kRegexpNoWordBoundary: - case kRegexpEndText: - case kRegexpAnyChar: - case kRegexpAnyByte: - case kRegexpHaveMatch: - return true; - case kRegexpConcat: - case kRegexpAlternate: - // These are simple as long as the subpieces are simple. - subs = sub(); - for (int i = 0; i < nsub_; i++) + Regexp* re = Parse(src, flags, status); + if (re == NULL) + return false; + Regexp* sre = re->Simplify(); + re->Decref(); + if (sre == NULL) { + if (status) { + status->set_code(kRegexpInternalError); + status->set_error_arg(src); + } + return false; + } + *dst = sre->ToString(); + sre->Decref(); + return true; +} + +// Assuming the simple_ flags on the children are accurate, +// is this Regexp* simple? +bool Regexp::ComputeSimple() { + Regexp** subs; + switch (op_) { + case kRegexpNoMatch: + case kRegexpEmptyMatch: + case kRegexpLiteral: + case kRegexpLiteralString: + case kRegexpBeginLine: + case kRegexpEndLine: + case kRegexpBeginText: + case kRegexpWordBoundary: + case kRegexpNoWordBoundary: + case kRegexpEndText: + case kRegexpAnyChar: + case kRegexpAnyByte: + case kRegexpHaveMatch: + return true; + case kRegexpConcat: + case kRegexpAlternate: + // These are simple as long as the subpieces are simple. + subs = sub(); + for (int i = 0; i < nsub_; i++) if (!subs[i]->simple()) - return false; - return true; - case kRegexpCharClass: - // Simple as long as the char class is not empty, not full. - if (ccb_ != NULL) - return !ccb_->empty() && !ccb_->full(); - return !cc_->empty() && !cc_->full(); - case kRegexpCapture: - subs = sub(); + return false; + return true; + case kRegexpCharClass: + // Simple as long as the char class is not empty, not full. + if (ccb_ != NULL) + return !ccb_->empty() && !ccb_->full(); + return !cc_->empty() && !cc_->full(); + case kRegexpCapture: + subs = sub(); return subs[0]->simple(); - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - subs = sub(); + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + subs = sub(); if (!subs[0]->simple()) - return false; - switch (subs[0]->op_) { - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - case kRegexpEmptyMatch: - case kRegexpNoMatch: - return false; - default: - break; - } - return true; - case kRegexpRepeat: - return false; - } - LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; - return false; -} - -// Walker subclass used by Simplify. + return false; + switch (subs[0]->op_) { + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpEmptyMatch: + case kRegexpNoMatch: + return false; + default: + break; + } + return true; + case kRegexpRepeat: + return false; + } + LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; + return false; +} + +// Walker subclass used by Simplify. // Coalesces runs of star/plus/quest/repeat of the same literal along with any // occurrences of that literal into repeats of that literal. It also works for // char classes, any char and any byte. @@ -130,51 +130,51 @@ class CoalesceWalker : public Regexp::Walker<Regexp*> { }; // Walker subclass used by Simplify. -// The simplify walk is purely post-recursive: given the simplified children, -// PostVisit creates the simplified result. -// The child_args are simplified Regexp*s. -class SimplifyWalker : public Regexp::Walker<Regexp*> { - public: - SimplifyWalker() {} - virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); +// The simplify walk is purely post-recursive: given the simplified children, +// PostVisit creates the simplified result. +// The child_args are simplified Regexp*s. +class SimplifyWalker : public Regexp::Walker<Regexp*> { + public: + SimplifyWalker() {} + virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, - Regexp** child_args, int nchild_args); - virtual Regexp* Copy(Regexp* re); - virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); - - private: - // These functions are declared inside SimplifyWalker so that - // they can edit the private fields of the Regexps they construct. - - // Creates a concatenation of two Regexp, consuming refs to re1 and re2. - // Caller must Decref return value when done with it. - static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); - - // Simplifies the expression re{min,max} in terms of *, +, and ?. - // Returns a new regexp. Does not edit re. Does not consume reference to re. - // Caller must Decref return value when done with it. - static Regexp* SimplifyRepeat(Regexp* re, int min, int max, - Regexp::ParseFlags parse_flags); - - // Simplifies a character class by expanding any named classes - // into rune ranges. Does not edit re. Does not consume ref to re. - // Caller must Decref return value when done with it. - static Regexp* SimplifyCharClass(Regexp* re); - + Regexp** child_args, int nchild_args); + virtual Regexp* Copy(Regexp* re); + virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); + + private: + // These functions are declared inside SimplifyWalker so that + // they can edit the private fields of the Regexps they construct. + + // Creates a concatenation of two Regexp, consuming refs to re1 and re2. + // Caller must Decref return value when done with it. + static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); + + // Simplifies the expression re{min,max} in terms of *, +, and ?. + // Returns a new regexp. Does not edit re. Does not consume reference to re. + // Caller must Decref return value when done with it. + static Regexp* SimplifyRepeat(Regexp* re, int min, int max, + Regexp::ParseFlags parse_flags); + + // Simplifies a character class by expanding any named classes + // into rune ranges. Does not edit re. Does not consume ref to re. + // Caller must Decref return value when done with it. + static Regexp* SimplifyCharClass(Regexp* re); + SimplifyWalker(const SimplifyWalker&) = delete; SimplifyWalker& operator=(const SimplifyWalker&) = delete; -}; - -// Simplifies a regular expression, returning a new regexp. -// The new regexp uses traditional Unix egrep features only, -// plus the Perl (?:) non-capturing parentheses. -// Otherwise, no POSIX or Perl additions. The new regexp -// captures exactly the same subexpressions (with the same indices) -// as the original. -// Does not edit current object. -// Caller must Decref() return value when done with it. - -Regexp* Regexp::Simplify() { +}; + +// Simplifies a regular expression, returning a new regexp. +// The new regexp uses traditional Unix egrep features only, +// plus the Perl (?:) non-capturing parentheses. +// Otherwise, no POSIX or Perl additions. The new regexp +// captures exactly the same subexpressions (with the same indices) +// as the original. +// Does not edit current object. +// Caller must Decref() return value when done with it. + +Regexp* Regexp::Simplify() { CoalesceWalker cw; Regexp* cre = cw.Walk(this, NULL); if (cre == NULL) @@ -193,10 +193,10 @@ Regexp* Regexp::Simplify() { return NULL; } return sre; -} - -#define Simplify DontCallSimplify // Avoid accidental recursion - +} + +#define Simplify DontCallSimplify // Avoid accidental recursion + // Utility function for PostVisit implementations that compares re->sub() with // child_args to determine whether any child_args changed. In the common case, // where nothing changed, calls Decref() for all child_args and returns false, @@ -441,225 +441,225 @@ void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { r2->Decref(); } -Regexp* SimplifyWalker::Copy(Regexp* re) { - return re->Incref(); -} - -Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { +Regexp* SimplifyWalker::Copy(Regexp* re) { + return re->Incref(); +} + +Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { // Should never be called: we use Walk(), not WalkExponential(). #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; + LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; #endif - return re->Incref(); -} - -Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { + return re->Incref(); +} + +Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { if (re->simple()) { - *stop = true; - return re->Incref(); - } - return NULL; -} - -Regexp* SimplifyWalker::PostVisit(Regexp* re, - Regexp* parent_arg, - Regexp* pre_arg, - Regexp** child_args, - int nchild_args) { - switch (re->op()) { - case kRegexpNoMatch: - case kRegexpEmptyMatch: - case kRegexpLiteral: - case kRegexpLiteralString: - case kRegexpBeginLine: - case kRegexpEndLine: - case kRegexpBeginText: - case kRegexpWordBoundary: - case kRegexpNoWordBoundary: - case kRegexpEndText: - case kRegexpAnyChar: - case kRegexpAnyByte: - case kRegexpHaveMatch: - // All these are always simple. - re->simple_ = true; - return re->Incref(); - - case kRegexpConcat: - case kRegexpAlternate: { - // These are simple as long as the subpieces are simple. + *stop = true; + return re->Incref(); + } + return NULL; +} + +Regexp* SimplifyWalker::PostVisit(Regexp* re, + Regexp* parent_arg, + Regexp* pre_arg, + Regexp** child_args, + int nchild_args) { + switch (re->op()) { + case kRegexpNoMatch: + case kRegexpEmptyMatch: + case kRegexpLiteral: + case kRegexpLiteralString: + case kRegexpBeginLine: + case kRegexpEndLine: + case kRegexpBeginText: + case kRegexpWordBoundary: + case kRegexpNoWordBoundary: + case kRegexpEndText: + case kRegexpAnyChar: + case kRegexpAnyByte: + case kRegexpHaveMatch: + // All these are always simple. + re->simple_ = true; + return re->Incref(); + + case kRegexpConcat: + case kRegexpAlternate: { + // These are simple as long as the subpieces are simple. if (!ChildArgsChanged(re, child_args)) { - re->simple_ = true; - return re->Incref(); - } - Regexp* nre = new Regexp(re->op(), re->parse_flags()); + re->simple_ = true; + return re->Incref(); + } + Regexp* nre = new Regexp(re->op(), re->parse_flags()); nre->AllocSub(re->nsub()); - Regexp** nre_subs = nre->sub(); + Regexp** nre_subs = nre->sub(); for (int i = 0; i < re->nsub(); i++) - nre_subs[i] = child_args[i]; - nre->simple_ = true; - return nre; - } - - case kRegexpCapture: { - Regexp* newsub = child_args[0]; - if (newsub == re->sub()[0]) { - newsub->Decref(); - re->simple_ = true; - return re->Incref(); - } - Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); - nre->AllocSub(1); - nre->sub()[0] = newsub; + nre_subs[i] = child_args[i]; + nre->simple_ = true; + return nre; + } + + case kRegexpCapture: { + Regexp* newsub = child_args[0]; + if (newsub == re->sub()[0]) { + newsub->Decref(); + re->simple_ = true; + return re->Incref(); + } + Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); + nre->AllocSub(1); + nre->sub()[0] = newsub; nre->cap_ = re->cap(); - nre->simple_ = true; - return nre; - } - - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: { - Regexp* newsub = child_args[0]; - // Special case: repeat the empty string as much as - // you want, but it's still the empty string. - if (newsub->op() == kRegexpEmptyMatch) - return newsub; - - // These are simple as long as the subpiece is simple. - if (newsub == re->sub()[0]) { - newsub->Decref(); - re->simple_ = true; - return re->Incref(); - } - - // These are also idempotent if flags are constant. - if (re->op() == newsub->op() && - re->parse_flags() == newsub->parse_flags()) - return newsub; - - Regexp* nre = new Regexp(re->op(), re->parse_flags()); - nre->AllocSub(1); - nre->sub()[0] = newsub; - nre->simple_ = true; - return nre; - } - - case kRegexpRepeat: { - Regexp* newsub = child_args[0]; - // Special case: repeat the empty string as much as - // you want, but it's still the empty string. - if (newsub->op() == kRegexpEmptyMatch) - return newsub; - - Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, - re->parse_flags()); - newsub->Decref(); - nre->simple_ = true; - return nre; - } - - case kRegexpCharClass: { - Regexp* nre = SimplifyCharClass(re); - nre->simple_ = true; - return nre; - } - } - - LOG(ERROR) << "Simplify case not handled: " << re->op(); - return re->Incref(); -} - -// Creates a concatenation of two Regexp, consuming refs to re1 and re2. -// Returns a new Regexp, handing the ref to the caller. -Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, - Regexp::ParseFlags parse_flags) { - Regexp* re = new Regexp(kRegexpConcat, parse_flags); - re->AllocSub(2); - Regexp** subs = re->sub(); - subs[0] = re1; - subs[1] = re2; - return re; -} - -// Simplifies the expression re{min,max} in terms of *, +, and ?. -// Returns a new regexp. Does not edit re. Does not consume reference to re. -// Caller must Decref return value when done with it. -// The result will *not* necessarily have the right capturing parens -// if you call ToString() and re-parse it: (x){2} becomes (x)(x), -// but in the Regexp* representation, both (x) are marked as $1. -Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, - Regexp::ParseFlags f) { - // x{n,} means at least n matches of x. - if (max == -1) { - // Special case: x{0,} is x* - if (min == 0) - return Regexp::Star(re->Incref(), f); - - // Special case: x{1,} is x+ - if (min == 1) - return Regexp::Plus(re->Incref(), f); - - // General case: x{4,} is xxxx+ + nre->simple_ = true; + return nre; + } + + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: { + Regexp* newsub = child_args[0]; + // Special case: repeat the empty string as much as + // you want, but it's still the empty string. + if (newsub->op() == kRegexpEmptyMatch) + return newsub; + + // These are simple as long as the subpiece is simple. + if (newsub == re->sub()[0]) { + newsub->Decref(); + re->simple_ = true; + return re->Incref(); + } + + // These are also idempotent if flags are constant. + if (re->op() == newsub->op() && + re->parse_flags() == newsub->parse_flags()) + return newsub; + + Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(1); + nre->sub()[0] = newsub; + nre->simple_ = true; + return nre; + } + + case kRegexpRepeat: { + Regexp* newsub = child_args[0]; + // Special case: repeat the empty string as much as + // you want, but it's still the empty string. + if (newsub->op() == kRegexpEmptyMatch) + return newsub; + + Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, + re->parse_flags()); + newsub->Decref(); + nre->simple_ = true; + return nre; + } + + case kRegexpCharClass: { + Regexp* nre = SimplifyCharClass(re); + nre->simple_ = true; + return nre; + } + } + + LOG(ERROR) << "Simplify case not handled: " << re->op(); + return re->Incref(); +} + +// Creates a concatenation of two Regexp, consuming refs to re1 and re2. +// Returns a new Regexp, handing the ref to the caller. +Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, + Regexp::ParseFlags parse_flags) { + Regexp* re = new Regexp(kRegexpConcat, parse_flags); + re->AllocSub(2); + Regexp** subs = re->sub(); + subs[0] = re1; + subs[1] = re2; + return re; +} + +// Simplifies the expression re{min,max} in terms of *, +, and ?. +// Returns a new regexp. Does not edit re. Does not consume reference to re. +// Caller must Decref return value when done with it. +// The result will *not* necessarily have the right capturing parens +// if you call ToString() and re-parse it: (x){2} becomes (x)(x), +// but in the Regexp* representation, both (x) are marked as $1. +Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, + Regexp::ParseFlags f) { + // x{n,} means at least n matches of x. + if (max == -1) { + // Special case: x{0,} is x* + if (min == 0) + return Regexp::Star(re->Incref(), f); + + // Special case: x{1,} is x+ + if (min == 1) + return Regexp::Plus(re->Incref(), f); + + // General case: x{4,} is xxxx+ PODArray<Regexp*> nre_subs(min); - for (int i = 0; i < min-1; i++) - nre_subs[i] = re->Incref(); - nre_subs[min-1] = Regexp::Plus(re->Incref(), f); + for (int i = 0; i < min-1; i++) + nre_subs[i] = re->Incref(); + nre_subs[min-1] = Regexp::Plus(re->Incref(), f); return Regexp::Concat(nre_subs.data(), min, f); - } - - // Special case: (x){0} matches only empty string. - if (min == 0 && max == 0) - return new Regexp(kRegexpEmptyMatch, f); - - // Special case: x{1} is just x. - if (min == 1 && max == 1) - return re->Incref(); - - // General case: x{n,m} means n copies of x and m copies of x?. - // The machine will do less work if we nest the final m copies, - // so that x{2,5} = xx(x(x(x)?)?)? - - // Build leading prefix: xx. Capturing only on the last one. - Regexp* nre = NULL; - if (min > 0) { + } + + // Special case: (x){0} matches only empty string. + if (min == 0 && max == 0) + return new Regexp(kRegexpEmptyMatch, f); + + // Special case: x{1} is just x. + if (min == 1 && max == 1) + return re->Incref(); + + // General case: x{n,m} means n copies of x and m copies of x?. + // The machine will do less work if we nest the final m copies, + // so that x{2,5} = xx(x(x(x)?)?)? + + // Build leading prefix: xx. Capturing only on the last one. + Regexp* nre = NULL; + if (min > 0) { PODArray<Regexp*> nre_subs(min); - for (int i = 0; i < min; i++) - nre_subs[i] = re->Incref(); + for (int i = 0; i < min; i++) + nre_subs[i] = re->Incref(); nre = Regexp::Concat(nre_subs.data(), min, f); - } - - // Build and attach suffix: (x(x(x)?)?)? - if (max > min) { - Regexp* suf = Regexp::Quest(re->Incref(), f); - for (int i = min+1; i < max; i++) - suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); - if (nre == NULL) - nre = suf; - else - nre = Concat2(nre, suf, f); - } - - if (nre == NULL) { - // Some degenerate case, like min > max, or min < max < 0. - // This shouldn't happen, because the parser rejects such regexps. - LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; - return new Regexp(kRegexpNoMatch, f); - } - - return nre; -} - -// Simplifies a character class. -// Caller must Decref return value when done with it. -Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { - CharClass* cc = re->cc(); - - // Special cases - if (cc->empty()) - return new Regexp(kRegexpNoMatch, re->parse_flags()); - if (cc->full()) - return new Regexp(kRegexpAnyChar, re->parse_flags()); - - return re->Incref(); -} - -} // namespace re2 + } + + // Build and attach suffix: (x(x(x)?)?)? + if (max > min) { + Regexp* suf = Regexp::Quest(re->Incref(), f); + for (int i = min+1; i < max; i++) + suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); + if (nre == NULL) + nre = suf; + else + nre = Concat2(nre, suf, f); + } + + if (nre == NULL) { + // Some degenerate case, like min > max, or min < max < 0. + // This shouldn't happen, because the parser rejects such regexps. + LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; + return new Regexp(kRegexpNoMatch, f); + } + + return nre; +} + +// Simplifies a character class. +// Caller must Decref return value when done with it. +Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { + CharClass* cc = re->cc(); + + // Special cases + if (cc->empty()) + return new Regexp(kRegexpNoMatch, re->parse_flags()); + if (cc->full()) + return new Regexp(kRegexpAnyChar, re->parse_flags()); + + return re->Incref(); +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/sparse_array.h b/contrib/libs/re2/re2/sparse_array.h index 343b1ffdf2..09ffe086b7 100644 --- a/contrib/libs/re2/re2/sparse_array.h +++ b/contrib/libs/re2/re2/sparse_array.h @@ -1,68 +1,68 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_SPARSE_ARRAY_H_ #define RE2_SPARSE_ARRAY_H_ -// DESCRIPTION +// DESCRIPTION // -// SparseArray<T>(m) is a map from integers in [0, m) to T values. -// It requires (sizeof(T)+sizeof(int))*m memory, but it provides -// fast iteration through the elements in the array and fast clearing -// of the array. The array has a concept of certain elements being -// uninitialized (having no value). +// SparseArray<T>(m) is a map from integers in [0, m) to T values. +// It requires (sizeof(T)+sizeof(int))*m memory, but it provides +// fast iteration through the elements in the array and fast clearing +// of the array. The array has a concept of certain elements being +// uninitialized (having no value). // -// Insertion and deletion are constant time operations. +// Insertion and deletion are constant time operations. // // Allocating the array is a constant time operation -// when memory allocation is a constant time operation. +// when memory allocation is a constant time operation. +// +// Clearing the array is a constant time operation (unusual!). // -// Clearing the array is a constant time operation (unusual!). +// Iterating through the array is an O(n) operation, where n +// is the number of items in the array (not O(m)). // -// Iterating through the array is an O(n) operation, where n -// is the number of items in the array (not O(m)). -// // The array iterator visits entries in the order they were first -// inserted into the array. It is safe to add items to the array while -// using an iterator: the iterator will visit indices added to the array -// during the iteration, but will not re-visit indices whose values -// change after visiting. Thus SparseArray can be a convenient -// implementation of a work queue. +// inserted into the array. It is safe to add items to the array while +// using an iterator: the iterator will visit indices added to the array +// during the iteration, but will not re-visit indices whose values +// change after visiting. Thus SparseArray can be a convenient +// implementation of a work queue. // -// The SparseArray implementation is NOT thread-safe. It is up to the -// caller to make sure only one thread is accessing the array. (Typically -// these arrays are temporary values and used in situations where speed is -// important.) +// The SparseArray implementation is NOT thread-safe. It is up to the +// caller to make sure only one thread is accessing the array. (Typically +// these arrays are temporary values and used in situations where speed is +// important.) // -// The SparseArray interface does not present all the usual STL bells and -// whistles. +// The SparseArray interface does not present all the usual STL bells and +// whistles. // -// Implemented with reference to Briggs & Torczon, An Efficient -// Representation for Sparse Sets, ACM Letters on Programming Languages -// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69. +// Implemented with reference to Briggs & Torczon, An Efficient +// Representation for Sparse Sets, ACM Letters on Programming Languages +// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69. // -// Briggs & Torczon popularized this technique, but it had been known -// long before their paper. They point out that Aho, Hopcroft, and -// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's -// 1986 Programming Pearls both hint at the technique in exercises to the -// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1 -// exercise 8). +// Briggs & Torczon popularized this technique, but it had been known +// long before their paper. They point out that Aho, Hopcroft, and +// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's +// 1986 Programming Pearls both hint at the technique in exercises to the +// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1 +// exercise 8). +// +// Briggs & Torczon describe a sparse set implementation. I have +// trivially generalized it to create a sparse array (actually the original +// target of the AHU and Bentley exercises). + +// IMPLEMENTATION // -// Briggs & Torczon describe a sparse set implementation. I have -// trivially generalized it to create a sparse array (actually the original -// target of the AHU and Bentley exercises). - -// IMPLEMENTATION -// // SparseArray is an array dense_ and an array sparse_ of identical size. // At any point, the number of elements in the sparse array is size_. // // The array dense_ contains the size_ elements in the sparse array (with -// their indices), -// in the order that the elements were first inserted. This array is dense: -// the size_ pairs are dense_[0] through dense_[size_-1]. -// +// their indices), +// in the order that the elements were first inserted. This array is dense: +// the size_ pairs are dense_[0] through dense_[size_-1]. +// // The array sparse_ maps from indices in [0,m) to indices in [0,size_). // For indices present in the array, dense_[sparse_[i]].index_ == i. // For indices not present in the array, sparse_ can contain any value at all, @@ -75,19 +75,19 @@ // dense_[sparse_[i]].index_ == i. // If both these properties hold, only then it is safe to refer to // dense_[sparse_[i]].value_ -// as the value associated with index i. -// +// as the value associated with index i. +// // To insert a new entry, set sparse_[i] to size_, -// initialize dense_[size_], and then increment size_. -// -// To make the sparse array as efficient as possible for non-primitive types, -// elements may or may not be destroyed when they are deleted from the sparse +// initialize dense_[size_], and then increment size_. +// +// To make the sparse array as efficient as possible for non-primitive types, +// elements may or may not be destroyed when they are deleted from the sparse // array through a call to resize(). They immediately become inaccessible, but // they are only guaranteed to be destroyed when the SparseArray destructor is // called. // // A moved-from SparseArray will be empty. - + // Doing this simplifies the logic below. #ifndef __has_feature #define __has_feature(x) 0 @@ -101,100 +101,100 @@ #include <algorithm> #include <memory> #include <utility> - + #include "re2/pod_array.h" -namespace re2 { - -template<typename Value> -class SparseArray { - public: - SparseArray(); +namespace re2 { + +template<typename Value> +class SparseArray { + public: + SparseArray(); explicit SparseArray(int max_size); - ~SparseArray(); - - // IndexValue pairs: exposed in SparseArray::iterator. - class IndexValue; - + ~SparseArray(); + + // IndexValue pairs: exposed in SparseArray::iterator. + class IndexValue; + typedef IndexValue* iterator; typedef const IndexValue* const_iterator; - + SparseArray(const SparseArray& src); SparseArray(SparseArray&& src); - + SparseArray& operator=(const SparseArray& src); SparseArray& operator=(SparseArray&& src); - // Return the number of entries in the array. - int size() const { - return size_; - } - + // Return the number of entries in the array. + int size() const { + return size_; + } + // Indicate whether the array is empty. int empty() const { return size_ == 0; } - // Iterate over the array. - iterator begin() { + // Iterate over the array. + iterator begin() { return dense_.data(); - } - iterator end() { + } + iterator end() { return dense_.data() + size_; - } - - const_iterator begin() const { + } + + const_iterator begin() const { return dense_.data(); - } - const_iterator end() const { + } + const_iterator end() const { return dense_.data() + size_; - } - - // Change the maximum size of the array. - // Invalidates all iterators. + } + + // Change the maximum size of the array. + // Invalidates all iterators. void resize(int new_max_size); - - // Return the maximum size of the array. - // Indices can be in the range [0, max_size). - int max_size() const { + + // Return the maximum size of the array. + // Indices can be in the range [0, max_size). + int max_size() const { if (dense_.data() != NULL) return dense_.size(); else return 0; - } - - // Clear the array. - void clear() { - size_ = 0; - } - - // Check whether index i is in the array. + } + + // Clear the array. + void clear() { + size_ = 0; + } + + // Check whether index i is in the array. bool has_index(int i) const; - - // Comparison function for sorting. - // Can sort the sparse array so that future iterations - // will visit indices in increasing order using + + // Comparison function for sorting. + // Can sort the sparse array so that future iterations + // will visit indices in increasing order using // std::sort(arr.begin(), arr.end(), arr.less); - static bool less(const IndexValue& a, const IndexValue& b); - - public: - // Set the value at index i to v. + static bool less(const IndexValue& a, const IndexValue& b); + + public: + // Set the value at index i to v. iterator set(int i, const Value& v) { return SetInternal(true, i, v); } - + // Set the value at new index i to v. // Fast but unsafe: only use if has_index(i) is false. iterator set_new(int i, const Value& v) { return SetInternal(false, i, v); } - + // Set the value at index i to v. - // Fast but unsafe: only use if has_index(i) is true. + // Fast but unsafe: only use if has_index(i) is true. iterator set_existing(int i, const Value& v) { return SetExistingInternal(i, v); } - + // Get the value at index i. // Fast but unsafe: only use if has_index(i) is true. Value& get_existing(int i) { @@ -205,8 +205,8 @@ class SparseArray { assert(has_index(i)); return dense_[sparse_[i]].value_; } - - private: + + private: iterator SetInternal(bool allow_existing, int i, const Value& v) { DebugCheckInvariants(); if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { @@ -234,18 +234,18 @@ class SparseArray { return dense_.data() + sparse_[i]; } - // Add the index i to the array. - // Only use if has_index(i) is known to be false. - // Since it doesn't set the value associated with i, - // this function is private, only intended as a helper - // for other methods. + // Add the index i to the array. + // Only use if has_index(i) is known to be false. + // Since it doesn't set the value associated with i, + // this function is private, only intended as a helper + // for other methods. void create_index(int i); - - // In debug mode, verify that some invariant properties of the class - // are being maintained. This is called at the end of the constructor - // and at the beginning and end of all public non-const member functions. + + // In debug mode, verify that some invariant properties of the class + // are being maintained. This is called at the end of the constructor + // and at the beginning and end of all public non-const member functions. void DebugCheckInvariants() const; - + // Initializes memory for elements [min, max). void MaybeInitializeMemory(int min, int max) { #if __has_feature(memory_sanitizer) @@ -260,11 +260,11 @@ class SparseArray { int size_ = 0; PODArray<int> sparse_; PODArray<IndexValue> dense_; -}; - -template<typename Value> +}; + +template<typename Value> SparseArray<Value>::SparseArray() = default; - + template<typename Value> SparseArray<Value>::SparseArray(const SparseArray& src) : size_(src.size_), @@ -305,28 +305,28 @@ SparseArray<Value>& SparseArray<Value>::operator=(SparseArray&& src) { return *this; } -// IndexValue pairs: exposed in SparseArray::iterator. -template<typename Value> -class SparseArray<Value>::IndexValue { - public: - int index() const { return index_; } +// IndexValue pairs: exposed in SparseArray::iterator. +template<typename Value> +class SparseArray<Value>::IndexValue { + public: + int index() const { return index_; } Value& value() { return value_; } const Value& value() const { return value_; } - + private: friend class SparseArray; int index_; Value value_; -}; - -// Change the maximum size of the array. -// Invalidates all iterators. -template<typename Value> +}; + +// Change the maximum size of the array. +// Invalidates all iterators. +template<typename Value> void SparseArray<Value>::resize(int new_max_size) { - DebugCheckInvariants(); + DebugCheckInvariants(); if (new_max_size > max_size()) { const int old_max_size = max_size(); - + // Construct these first for exception safety. PODArray<int> a(new_max_size); PODArray<IndexValue> b(new_max_size); @@ -338,55 +338,55 @@ void SparseArray<Value>::resize(int new_max_size) { dense_ = std::move(b); MaybeInitializeMemory(old_max_size, new_max_size); - } + } if (size_ > new_max_size) size_ = new_max_size; - DebugCheckInvariants(); -} - -// Check whether index i is in the array. -template<typename Value> -bool SparseArray<Value>::has_index(int i) const { + DebugCheckInvariants(); +} + +// Check whether index i is in the array. +template<typename Value> +bool SparseArray<Value>::has_index(int i) const { assert(i >= 0); assert(i < max_size()); if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { - return false; - } + return false; + } // Unsigned comparison avoids checking sparse_[i] < 0. return (uint32_t)sparse_[i] < (uint32_t)size_ && dense_[sparse_[i]].index_ == i; -} - -template<typename Value> -void SparseArray<Value>::create_index(int i) { +} + +template<typename Value> +void SparseArray<Value>::create_index(int i) { assert(!has_index(i)); assert(size_ < max_size()); sparse_[i] = size_; - dense_[size_].index_ = i; - size_++; -} - + dense_[size_].index_ = i; + size_++; +} + template<typename Value> SparseArray<Value>::SparseArray(int max_size) : sparse_(max_size), dense_(max_size) { MaybeInitializeMemory(size_, max_size); - DebugCheckInvariants(); -} - -template<typename Value> SparseArray<Value>::~SparseArray() { - DebugCheckInvariants(); -} - -template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const { + DebugCheckInvariants(); +} + +template<typename Value> SparseArray<Value>::~SparseArray() { + DebugCheckInvariants(); +} + +template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const { assert(0 <= size_); assert(size_ <= max_size()); -} - -// Comparison function for sorting. -template<typename Value> bool SparseArray<Value>::less(const IndexValue& a, - const IndexValue& b) { - return a.index_ < b.index_; -} - -} // namespace re2 - +} + +// Comparison function for sorting. +template<typename Value> bool SparseArray<Value>::less(const IndexValue& a, + const IndexValue& b) { + return a.index_ < b.index_; +} + +} // namespace re2 + #endif // RE2_SPARSE_ARRAY_H_ diff --git a/contrib/libs/re2/re2/sparse_set.h b/contrib/libs/re2/re2/sparse_set.h index 99b18051ef..06ed88d81b 100644 --- a/contrib/libs/re2/re2/sparse_set.h +++ b/contrib/libs/re2/re2/sparse_set.h @@ -1,52 +1,52 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_SPARSE_SET_H_ #define RE2_SPARSE_SET_H_ -// DESCRIPTION +// DESCRIPTION // // SparseSet(m) is a set of integers in [0, m). -// It requires sizeof(int)*m memory, but it provides -// fast iteration through the elements in the set and fast clearing -// of the set. +// It requires sizeof(int)*m memory, but it provides +// fast iteration through the elements in the set and fast clearing +// of the set. // -// Insertion and deletion are constant time operations. +// Insertion and deletion are constant time operations. // // Allocating the set is a constant time operation -// when memory allocation is a constant time operation. +// when memory allocation is a constant time operation. +// +// Clearing the set is a constant time operation (unusual!). // -// Clearing the set is a constant time operation (unusual!). +// Iterating through the set is an O(n) operation, where n +// is the number of items in the set (not O(m)). // -// Iterating through the set is an O(n) operation, where n -// is the number of items in the set (not O(m)). -// // The set iterator visits entries in the order they were first // inserted into the set. It is safe to add items to the set while -// using an iterator: the iterator will visit indices added to the set -// during the iteration, but will not re-visit indices whose values -// change after visiting. Thus SparseSet can be a convenient -// implementation of a work queue. +// using an iterator: the iterator will visit indices added to the set +// during the iteration, but will not re-visit indices whose values +// change after visiting. Thus SparseSet can be a convenient +// implementation of a work queue. // -// The SparseSet implementation is NOT thread-safe. It is up to the -// caller to make sure only one thread is accessing the set. (Typically -// these sets are temporary values and used in situations where speed is -// important.) +// The SparseSet implementation is NOT thread-safe. It is up to the +// caller to make sure only one thread is accessing the set. (Typically +// these sets are temporary values and used in situations where speed is +// important.) // -// The SparseSet interface does not present all the usual STL bells and -// whistles. +// The SparseSet interface does not present all the usual STL bells and +// whistles. // -// Implemented with reference to Briggs & Torczon, An Efficient -// Representation for Sparse Sets, ACM Letters on Programming Languages -// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69. +// Implemented with reference to Briggs & Torczon, An Efficient +// Representation for Sparse Sets, ACM Letters on Programming Languages +// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69. // // This is a specialization of sparse array; see sparse_array.h. - -// IMPLEMENTATION -// + +// IMPLEMENTATION +// // See sparse_array.h for implementation details. - + // Doing this simplifies the logic below. #ifndef __has_feature #define __has_feature(x) 0 @@ -60,31 +60,31 @@ #include <algorithm> #include <memory> #include <utility> - + #include "re2/pod_array.h" -namespace re2 { - +namespace re2 { + template<typename Value> class SparseSetT { - public: + public: SparseSetT(); explicit SparseSetT(int max_size); ~SparseSetT(); - + typedef int* iterator; typedef const int* const_iterator; // Return the number of entries in the set. int size() const { return size_; - } - + } + // Indicate whether the set is empty. int empty() const { return size_ == 0; - } - + } + // Iterate over the set. iterator begin() { return dense_.data(); @@ -92,18 +92,18 @@ class SparseSetT { iterator end() { return dense_.data() + size_; } - + const_iterator begin() const { return dense_.data(); } const_iterator end() const { return dense_.data() + size_; } - + // Change the maximum size of the set. - // Invalidates all iterators. + // Invalidates all iterators. void resize(int new_max_size); - + // Return the maximum size of the set. // Indices can be in the range [0, max_size). int max_size() const { @@ -111,16 +111,16 @@ class SparseSetT { return dense_.size(); else return 0; - } - + } + // Clear the set. void clear() { size_ = 0; } - + // Check whether index i is in the set. bool contains(int i) const; - + // Comparison function for sorting. // Can sort the sparse set so that future iterations // will visit indices in increasing order using @@ -131,24 +131,24 @@ class SparseSetT { // Insert index i into the set. iterator insert(int i) { return InsertInternal(true, i); - } - + } + // Insert index i into the set. // Fast but unsafe: only use if contains(i) is false. iterator insert_new(int i) { return InsertInternal(false, i); - } - + } + private: iterator InsertInternal(bool allow_existing, int i) { DebugCheckInvariants(); if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) { assert(false && "illegal index"); - // Semantically, end() would be better here, but we already know - // the user did something stupid, so begin() insulates them from - // dereferencing an invalid pointer. + // Semantically, end() would be better here, but we already know + // the user did something stupid, so begin() insulates them from + // dereferencing an invalid pointer. return begin(); - } + } if (!allow_existing) { assert(!contains(i)); create_index(i); @@ -158,19 +158,19 @@ class SparseSetT { } DebugCheckInvariants(); return dense_.data() + sparse_[i]; - } - + } + // Add the index i to the set. // Only use if contains(i) is known to be false. // This function is private, only intended as a helper // for other methods. void create_index(int i); - + // In debug mode, verify that some invariant properties of the class // are being maintained. This is called at the end of the constructor // and at the beginning and end of all public non-const member functions. void DebugCheckInvariants() const; - + // Initializes memory for elements [min, max). void MaybeInitializeMemory(int min, int max) { #if __has_feature(memory_sanitizer) @@ -185,8 +185,8 @@ class SparseSetT { int size_ = 0; PODArray<int> sparse_; PODArray<int> dense_; -}; - +}; + template<typename Value> SparseSetT<Value>::SparseSetT() = default; @@ -259,6 +259,6 @@ template<typename Value> bool SparseSetT<Value>::less(int a, int b) { typedef SparseSetT<void> SparseSet; -} // namespace re2 - +} // namespace re2 + #endif // RE2_SPARSE_SET_H_ diff --git a/contrib/libs/re2/re2/tostring.cc b/contrib/libs/re2/re2/tostring.cc index a2b2a7ddaf..9c1c038ca6 100644 --- a/contrib/libs/re2/re2/tostring.cc +++ b/contrib/libs/re2/re2/tostring.cc @@ -1,10 +1,10 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Format a regular expression structure as a string. -// Tested by parse_test.cc - +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Format a regular expression structure as a string. +// Tested by parse_test.cc + #include <string.h> #include <string> @@ -12,340 +12,340 @@ #include "util/logging.h" #include "util/strutil.h" #include "util/utf.h" -#include "re2/regexp.h" -#include "re2/walker-inl.h" - -namespace re2 { - -enum { - PrecAtom, - PrecUnary, - PrecConcat, - PrecAlternate, - PrecEmpty, - PrecParen, - PrecToplevel, -}; - -// Helper function. See description below. +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +enum { + PrecAtom, + PrecUnary, + PrecConcat, + PrecAlternate, + PrecEmpty, + PrecParen, + PrecToplevel, +}; + +// Helper function. See description below. static void AppendCCRange(std::string* t, Rune lo, Rune hi); - -// Walker to generate string in s_. -// The arg pointers are actually integers giving the -// context precedence. -// The child_args are always NULL. -class ToStringWalker : public Regexp::Walker<int> { - public: + +// Walker to generate string in s_. +// The arg pointers are actually integers giving the +// context precedence. +// The child_args are always NULL. +class ToStringWalker : public Regexp::Walker<int> { + public: explicit ToStringWalker(std::string* t) : t_(t) {} - - virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); - virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, - int* child_args, int nchild_args); - virtual int ShortVisit(Regexp* re, int parent_arg) { - return 0; - } - - private: + + virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); + virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args); + virtual int ShortVisit(Regexp* re, int parent_arg) { + return 0; + } + + private: std::string* t_; // The string the walker appends to. - + ToStringWalker(const ToStringWalker&) = delete; ToStringWalker& operator=(const ToStringWalker&) = delete; -}; - +}; + std::string Regexp::ToString() { std::string t; - ToStringWalker w(&t); - w.WalkExponential(this, PrecToplevel, 100000); - if (w.stopped_early()) - t += " [truncated]"; - return t; -} - -#define ToString DontCallToString // Avoid accidental recursion. - -// Visits re before children are processed. -// Appends ( if needed and passes new precedence to children. -int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { - int prec = parent_arg; - int nprec = PrecAtom; - - switch (re->op()) { - case kRegexpNoMatch: - case kRegexpEmptyMatch: - case kRegexpLiteral: - case kRegexpAnyChar: - case kRegexpAnyByte: - case kRegexpBeginLine: - case kRegexpEndLine: - case kRegexpBeginText: - case kRegexpEndText: - case kRegexpWordBoundary: - case kRegexpNoWordBoundary: - case kRegexpCharClass: - case kRegexpHaveMatch: - nprec = PrecAtom; - break; - - case kRegexpConcat: - case kRegexpLiteralString: - if (prec < PrecConcat) - t_->append("(?:"); - nprec = PrecConcat; - break; - - case kRegexpAlternate: - if (prec < PrecAlternate) - t_->append("(?:"); - nprec = PrecAlternate; - break; - - case kRegexpCapture: - t_->append("("); + ToStringWalker w(&t); + w.WalkExponential(this, PrecToplevel, 100000); + if (w.stopped_early()) + t += " [truncated]"; + return t; +} + +#define ToString DontCallToString // Avoid accidental recursion. + +// Visits re before children are processed. +// Appends ( if needed and passes new precedence to children. +int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { + int prec = parent_arg; + int nprec = PrecAtom; + + switch (re->op()) { + case kRegexpNoMatch: + case kRegexpEmptyMatch: + case kRegexpLiteral: + case kRegexpAnyChar: + case kRegexpAnyByte: + case kRegexpBeginLine: + case kRegexpEndLine: + case kRegexpBeginText: + case kRegexpEndText: + case kRegexpWordBoundary: + case kRegexpNoWordBoundary: + case kRegexpCharClass: + case kRegexpHaveMatch: + nprec = PrecAtom; + break; + + case kRegexpConcat: + case kRegexpLiteralString: + if (prec < PrecConcat) + t_->append("(?:"); + nprec = PrecConcat; + break; + + case kRegexpAlternate: + if (prec < PrecAlternate) + t_->append("(?:"); + nprec = PrecAlternate; + break; + + case kRegexpCapture: + t_->append("("); if (re->cap() == 0) LOG(DFATAL) << "kRegexpCapture cap() == 0"; - if (re->name()) { - t_->append("?P<"); - t_->append(*re->name()); - t_->append(">"); - } - nprec = PrecParen; - break; - - case kRegexpStar: - case kRegexpPlus: - case kRegexpQuest: - case kRegexpRepeat: - if (prec < PrecUnary) - t_->append("(?:"); - // The subprecedence here is PrecAtom instead of PrecUnary - // because PCRE treats two unary ops in a row as a parse error. - nprec = PrecAtom; - break; - } - - return nprec; -} - + if (re->name()) { + t_->append("?P<"); + t_->append(*re->name()); + t_->append(">"); + } + nprec = PrecParen; + break; + + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + if (prec < PrecUnary) + t_->append("(?:"); + // The subprecedence here is PrecAtom instead of PrecUnary + // because PCRE treats two unary ops in a row as a parse error. + nprec = PrecAtom; + break; + } + + return nprec; +} + static void AppendLiteral(std::string *t, Rune r, bool foldcase) { - if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { - t->append(1, '\\'); + if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { + t->append(1, '\\'); t->append(1, static_cast<char>(r)); - } else if (foldcase && 'a' <= r && r <= 'z') { + } else if (foldcase && 'a' <= r && r <= 'z') { r -= 'a' - 'A'; - t->append(1, '['); + t->append(1, '['); t->append(1, static_cast<char>(r)); t->append(1, static_cast<char>(r) + 'a' - 'A'); - t->append(1, ']'); - } else { - AppendCCRange(t, r, r); - } -} - -// Visits re after children are processed. -// For childless regexps, all the work is done here. -// For regexps with children, append any unary suffixes or ). -int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, - int* child_args, int nchild_args) { - int prec = parent_arg; - switch (re->op()) { - case kRegexpNoMatch: - // There's no simple symbol for "no match", but - // [^0-Runemax] excludes everything. - t_->append("[^\\x00-\\x{10ffff}]"); - break; - - case kRegexpEmptyMatch: - // Append (?:) to make empty string visible, - // unless this is already being parenthesized. - if (prec < PrecEmpty) - t_->append("(?:)"); - break; - - case kRegexpLiteral: + t->append(1, ']'); + } else { + AppendCCRange(t, r, r); + } +} + +// Visits re after children are processed. +// For childless regexps, all the work is done here. +// For regexps with children, append any unary suffixes or ). +int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) { + int prec = parent_arg; + switch (re->op()) { + case kRegexpNoMatch: + // There's no simple symbol for "no match", but + // [^0-Runemax] excludes everything. + t_->append("[^\\x00-\\x{10ffff}]"); + break; + + case kRegexpEmptyMatch: + // Append (?:) to make empty string visible, + // unless this is already being parenthesized. + if (prec < PrecEmpty) + t_->append("(?:)"); + break; + + case kRegexpLiteral: AppendLiteral(t_, re->rune(), (re->parse_flags() & Regexp::FoldCase) != 0); - break; - - case kRegexpLiteralString: - for (int i = 0; i < re->nrunes(); i++) + break; + + case kRegexpLiteralString: + for (int i = 0; i < re->nrunes(); i++) AppendLiteral(t_, re->runes()[i], (re->parse_flags() & Regexp::FoldCase) != 0); - if (prec < PrecConcat) - t_->append(")"); - break; - - case kRegexpConcat: - if (prec < PrecConcat) - t_->append(")"); - break; - - case kRegexpAlternate: - // Clumsy but workable: the children all appended | - // at the end of their strings, so just remove the last one. - if ((*t_)[t_->size()-1] == '|') - t_->erase(t_->size()-1); - else - LOG(DFATAL) << "Bad final char: " << t_; - if (prec < PrecAlternate) - t_->append(")"); - break; - - case kRegexpStar: - t_->append("*"); - if (re->parse_flags() & Regexp::NonGreedy) - t_->append("?"); - if (prec < PrecUnary) - t_->append(")"); - break; - - case kRegexpPlus: - t_->append("+"); - if (re->parse_flags() & Regexp::NonGreedy) - t_->append("?"); - if (prec < PrecUnary) - t_->append(")"); - break; - - case kRegexpQuest: - t_->append("?"); - if (re->parse_flags() & Regexp::NonGreedy) - t_->append("?"); - if (prec < PrecUnary) - t_->append(")"); - break; - - case kRegexpRepeat: - if (re->max() == -1) - t_->append(StringPrintf("{%d,}", re->min())); - else if (re->min() == re->max()) - t_->append(StringPrintf("{%d}", re->min())); - else - t_->append(StringPrintf("{%d,%d}", re->min(), re->max())); - if (re->parse_flags() & Regexp::NonGreedy) - t_->append("?"); - if (prec < PrecUnary) - t_->append(")"); - break; - - case kRegexpAnyChar: - t_->append("."); - break; - - case kRegexpAnyByte: - t_->append("\\C"); - break; - - case kRegexpBeginLine: - t_->append("^"); - break; - - case kRegexpEndLine: - t_->append("$"); - break; - - case kRegexpBeginText: - t_->append("(?-m:^)"); - break; - - case kRegexpEndText: - if (re->parse_flags() & Regexp::WasDollar) - t_->append("(?-m:$)"); - else - t_->append("\\z"); - break; - - case kRegexpWordBoundary: - t_->append("\\b"); - break; - - case kRegexpNoWordBoundary: - t_->append("\\B"); - break; - - case kRegexpCharClass: { - if (re->cc()->size() == 0) { - t_->append("[^\\x00-\\x{10ffff}]"); - break; - } - t_->append("["); - // Heuristic: show class as negated if it contains the + if (prec < PrecConcat) + t_->append(")"); + break; + + case kRegexpConcat: + if (prec < PrecConcat) + t_->append(")"); + break; + + case kRegexpAlternate: + // Clumsy but workable: the children all appended | + // at the end of their strings, so just remove the last one. + if ((*t_)[t_->size()-1] == '|') + t_->erase(t_->size()-1); + else + LOG(DFATAL) << "Bad final char: " << t_; + if (prec < PrecAlternate) + t_->append(")"); + break; + + case kRegexpStar: + t_->append("*"); + if (re->parse_flags() & Regexp::NonGreedy) + t_->append("?"); + if (prec < PrecUnary) + t_->append(")"); + break; + + case kRegexpPlus: + t_->append("+"); + if (re->parse_flags() & Regexp::NonGreedy) + t_->append("?"); + if (prec < PrecUnary) + t_->append(")"); + break; + + case kRegexpQuest: + t_->append("?"); + if (re->parse_flags() & Regexp::NonGreedy) + t_->append("?"); + if (prec < PrecUnary) + t_->append(")"); + break; + + case kRegexpRepeat: + if (re->max() == -1) + t_->append(StringPrintf("{%d,}", re->min())); + else if (re->min() == re->max()) + t_->append(StringPrintf("{%d}", re->min())); + else + t_->append(StringPrintf("{%d,%d}", re->min(), re->max())); + if (re->parse_flags() & Regexp::NonGreedy) + t_->append("?"); + if (prec < PrecUnary) + t_->append(")"); + break; + + case kRegexpAnyChar: + t_->append("."); + break; + + case kRegexpAnyByte: + t_->append("\\C"); + break; + + case kRegexpBeginLine: + t_->append("^"); + break; + + case kRegexpEndLine: + t_->append("$"); + break; + + case kRegexpBeginText: + t_->append("(?-m:^)"); + break; + + case kRegexpEndText: + if (re->parse_flags() & Regexp::WasDollar) + t_->append("(?-m:$)"); + else + t_->append("\\z"); + break; + + case kRegexpWordBoundary: + t_->append("\\b"); + break; + + case kRegexpNoWordBoundary: + t_->append("\\B"); + break; + + case kRegexpCharClass: { + if (re->cc()->size() == 0) { + t_->append("[^\\x00-\\x{10ffff}]"); + break; + } + t_->append("["); + // Heuristic: show class as negated if it contains the // non-character 0xFFFE and yet somehow isn't full. - CharClass* cc = re->cc(); + CharClass* cc = re->cc(); if (cc->Contains(0xFFFE) && !cc->full()) { - cc = cc->Negate(); - t_->append("^"); - } - for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) - AppendCCRange(t_, i->lo, i->hi); - if (cc != re->cc()) - cc->Delete(); - t_->append("]"); - break; - } - - case kRegexpCapture: - t_->append(")"); - break; - - case kRegexpHaveMatch: - // There's no syntax accepted by the parser to generate - // this node (it is generated by RE2::Set) so make something - // up that is readable but won't compile. + cc = cc->Negate(); + t_->append("^"); + } + for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) + AppendCCRange(t_, i->lo, i->hi); + if (cc != re->cc()) + cc->Delete(); + t_->append("]"); + break; + } + + case kRegexpCapture: + t_->append(")"); + break; + + case kRegexpHaveMatch: + // There's no syntax accepted by the parser to generate + // this node (it is generated by RE2::Set) so make something + // up that is readable but won't compile. t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id())); - break; - } - - // If the parent is an alternation, append the | for it. - if (prec == PrecAlternate) - t_->append("|"); - - return 0; -} - -// Appends a rune for use in a character class to the string t. + break; + } + + // If the parent is an alternation, append the | for it. + if (prec == PrecAlternate) + t_->append("|"); + + return 0; +} + +// Appends a rune for use in a character class to the string t. static void AppendCCChar(std::string* t, Rune r) { - if (0x20 <= r && r <= 0x7E) { - if (strchr("[]^-\\", r)) - t->append("\\"); + if (0x20 <= r && r <= 0x7E) { + if (strchr("[]^-\\", r)) + t->append("\\"); t->append(1, static_cast<char>(r)); - return; - } - switch (r) { - default: - break; - - case '\r': - t->append("\\r"); - return; - - case '\t': - t->append("\\t"); - return; - - case '\n': - t->append("\\n"); - return; - - case '\f': - t->append("\\f"); - return; - } - - if (r < 0x100) { + return; + } + switch (r) { + default: + break; + + case '\r': + t->append("\\r"); + return; + + case '\t': + t->append("\\t"); + return; + + case '\n': + t->append("\\n"); + return; + + case '\f': + t->append("\\f"); + return; + } + + if (r < 0x100) { *t += StringPrintf("\\x%02x", static_cast<int>(r)); - return; - } + return; + } *t += StringPrintf("\\x{%x}", static_cast<int>(r)); -} - +} + static void AppendCCRange(std::string* t, Rune lo, Rune hi) { - if (lo > hi) - return; - AppendCCChar(t, lo); - if (lo < hi) { - t->append("-"); - AppendCCChar(t, hi); - } -} - -} // namespace re2 + if (lo > hi) + return; + AppendCCChar(t, lo); + if (lo < hi) { + t->append("-"); + AppendCCChar(t, hi); + } +} + +} // namespace re2 diff --git a/contrib/libs/re2/re2/unicode_casefold.h b/contrib/libs/re2/re2/unicode_casefold.h index 70a597010f..8bdbb42fbc 100644 --- a/contrib/libs/re2/re2/unicode_casefold.h +++ b/contrib/libs/re2/re2/unicode_casefold.h @@ -1,78 +1,78 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_UNICODE_CASEFOLD_H_ #define RE2_UNICODE_CASEFOLD_H_ -// Unicode case folding tables. - -// The Unicode case folding tables encode the mapping from one Unicode point -// to the next largest Unicode point with equivalent folding. The largest -// point wraps back to the first. For example, the tables map: -// -// 'A' -> 'a' -// 'a' -> 'A' -// -// 'K' -> 'k' -// 'k' -> 'K' (Kelvin symbol) -// 'K' -> 'K' -// -// Like everything Unicode, these tables are big. If we represent the table +// Unicode case folding tables. + +// The Unicode case folding tables encode the mapping from one Unicode point +// to the next largest Unicode point with equivalent folding. The largest +// point wraps back to the first. For example, the tables map: +// +// 'A' -> 'a' +// 'a' -> 'A' +// +// 'K' -> 'k' +// 'k' -> 'K' (Kelvin symbol) +// 'K' -> 'K' +// +// Like everything Unicode, these tables are big. If we represent the table // as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB. -// Most table entries look like the ones around them: -// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. -// Instead of listing all the pairs explicitly, we make a list of ranges -// and deltas, so that the table entries for 'A' through 'Z' can be represented -// as a single entry { 'A', 'Z', +32 }. -// -// In addition to blocks that map to each other (A-Z mapping to a-z) -// there are blocks of pairs that individually map to each other -// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). -// For those, the special delta value EvenOdd marks even/odd pairs -// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. -// -// In this form, the table has 274 entries, about 3kB. If we were to split -// the table into one for 16-bit codes and an overflow table for larger ones, -// we could get it down to about 1.5kB, but that's not worth the complexity. -// -// The grouped form also allows for efficient fold range calculations -// rather than looping one character at a time. - +// Most table entries look like the ones around them: +// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. +// Instead of listing all the pairs explicitly, we make a list of ranges +// and deltas, so that the table entries for 'A' through 'Z' can be represented +// as a single entry { 'A', 'Z', +32 }. +// +// In addition to blocks that map to each other (A-Z mapping to a-z) +// there are blocks of pairs that individually map to each other +// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). +// For those, the special delta value EvenOdd marks even/odd pairs +// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. +// +// In this form, the table has 274 entries, about 3kB. If we were to split +// the table into one for 16-bit codes and an overflow table for larger ones, +// we could get it down to about 1.5kB, but that's not worth the complexity. +// +// The grouped form also allows for efficient fold range calculations +// rather than looping one character at a time. + #include <stdint.h> - + #include "util/util.h" #include "util/utf.h" - -namespace re2 { - -enum { - EvenOdd = 1, + +namespace re2 { + +enum { + EvenOdd = 1, OddEven = -1, EvenOddSkip = 1<<30, OddEvenSkip, -}; - -struct CaseFold { +}; + +struct CaseFold { Rune lo; Rune hi; int32_t delta; -}; - +}; + extern const CaseFold unicode_casefold[]; extern const int num_unicode_casefold; - + extern const CaseFold unicode_tolower[]; extern const int num_unicode_tolower; -// Returns the CaseFold* in the tables that contains rune. -// If rune is not in the tables, returns the first CaseFold* after rune. -// If rune is larger than any value in the tables, returns NULL. +// Returns the CaseFold* in the tables that contains rune. +// If rune is not in the tables, returns the first CaseFold* after rune. +// If rune is larger than any value in the tables, returns NULL. extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune); - + // Returns the result of applying the fold f to the rune r. extern Rune ApplyFold(const CaseFold *f, Rune r); -} // namespace re2 - +} // namespace re2 + #endif // RE2_UNICODE_CASEFOLD_H_ diff --git a/contrib/libs/re2/re2/unicode_groups.h b/contrib/libs/re2/re2/unicode_groups.h index 17a5900080..75f55daa61 100644 --- a/contrib/libs/re2/re2/unicode_groups.h +++ b/contrib/libs/re2/re2/unicode_groups.h @@ -1,67 +1,67 @@ -// Copyright 2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_UNICODE_GROUPS_H_ #define RE2_UNICODE_GROUPS_H_ -// Unicode character groups. - -// The codes get split into ranges of 16-bit codes -// and ranges of 32-bit codes. It would be simpler -// to use only 32-bit ranges, but these tables are large -// enough to warrant extra care. -// -// Using just 32-bit ranges gives 27 kB of data. -// Adding 16-bit ranges gives 18 kB of data. -// Adding an extra table of 16-bit singletons would reduce -// to 16.5 kB of data but make the data harder to use; -// we don't bother. - +// Unicode character groups. + +// The codes get split into ranges of 16-bit codes +// and ranges of 32-bit codes. It would be simpler +// to use only 32-bit ranges, but these tables are large +// enough to warrant extra care. +// +// Using just 32-bit ranges gives 27 kB of data. +// Adding 16-bit ranges gives 18 kB of data. +// Adding an extra table of 16-bit singletons would reduce +// to 16.5 kB of data but make the data harder to use; +// we don't bother. + #include <stdint.h> - + #include "util/util.h" #include "util/utf.h" - -namespace re2 { - -struct URange16 -{ + +namespace re2 { + +struct URange16 +{ uint16_t lo; uint16_t hi; -}; - -struct URange32 -{ +}; + +struct URange32 +{ Rune lo; Rune hi; -}; - -struct UGroup -{ - const char *name; - int sign; // +1 for [abc], -1 for [^abc] +}; + +struct UGroup +{ + const char *name; + int sign; // +1 for [abc], -1 for [^abc] const URange16 *r16; - int nr16; + int nr16; const URange32 *r32; - int nr32; -}; - -// Named by property or script name (e.g., "Nd", "N", "Han"). -// Negated groups are not included. + int nr32; +}; + +// Named by property or script name (e.g., "Nd", "N", "Han"). +// Negated groups are not included. extern const UGroup unicode_groups[]; extern const int num_unicode_groups; - -// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]"). -// Negated groups are included. + +// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]"). +// Negated groups are included. extern const UGroup posix_groups[]; extern const int num_posix_groups; - -// Named by Perl name (e.g., "\\d", "\\D"). -// Negated groups are included. + +// Named by Perl name (e.g., "\\d", "\\D"). +// Negated groups are included. extern const UGroup perl_groups[]; extern const int num_perl_groups; - -} // namespace re2 - + +} // namespace re2 + #endif // RE2_UNICODE_GROUPS_H_ diff --git a/contrib/libs/re2/re2/walker-inl.h b/contrib/libs/re2/re2/walker-inl.h index 336fa36290..4d064a0970 100644 --- a/contrib/libs/re2/re2/walker-inl.h +++ b/contrib/libs/re2/re2/walker-inl.h @@ -1,247 +1,247 @@ -// Copyright 2006 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #ifndef RE2_WALKER_INL_H_ #define RE2_WALKER_INL_H_ -// Helper class for traversing Regexps without recursion. -// Clients should declare their own subclasses that override -// the PreVisit and PostVisit methods, which are called before -// and after visiting the subexpressions. - -// Not quite the Visitor pattern, because (among other things) -// the Visitor pattern is recursive. - +// Helper class for traversing Regexps without recursion. +// Clients should declare their own subclasses that override +// the PreVisit and PostVisit methods, which are called before +// and after visiting the subexpressions. + +// Not quite the Visitor pattern, because (among other things) +// the Visitor pattern is recursive. + #include <stack> - + #include "util/logging.h" -#include "re2/regexp.h" - -namespace re2 { - -template<typename T> struct WalkState; - -template<typename T> class Regexp::Walker { - public: - Walker(); - virtual ~Walker(); - - // Virtual method called before visiting re's children. - // PreVisit passes ownership of its return value to its caller. - // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg - // and passed to the child PreVisits and PostVisits as parent_arg. - // At the top-most Regexp, parent_arg is arg passed to walk. - // If PreVisit sets *stop to true, the walk does not recurse - // into the children. Instead it behaves as though the return - // value from PreVisit is the return value from PostVisit. - // The default PreVisit returns parent_arg. - virtual T PreVisit(Regexp* re, T parent_arg, bool* stop); - - // Virtual method called after visiting re's children. - // The pre_arg is the T that PreVisit returned. - // The child_args is a vector of the T that the child PostVisits returned. - // PostVisit takes ownership of pre_arg. - // PostVisit takes ownership of the Ts - // in *child_args, but not the vector itself. - // PostVisit passes ownership of its return value - // to its caller. - // The default PostVisit simply returns pre_arg. - virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg, - T* child_args, int nchild_args); - - // Virtual method called to copy a T, - // when Walk notices that more than one child is the same re. - virtual T Copy(T arg); - - // Virtual method called to do a "quick visit" of the re, - // but not its children. Only called once the visit budget - // has been used up and we're trying to abort the walk - // as quickly as possible. Should return a value that - // makes sense for the parent PostVisits still to be run. - // This function is (hopefully) only called by - // WalkExponential, but must be implemented by all clients, - // just in case. - virtual T ShortVisit(Regexp* re, T parent_arg) = 0; - - // Walks over a regular expression. - // Top_arg is passed as parent_arg to PreVisit and PostVisit of re. - // Returns the T returned by PostVisit on re. - T Walk(Regexp* re, T top_arg); - - // Like Walk, but doesn't use Copy. This can lead to - // exponential runtimes on cross-linked Regexps like the - // ones generated by Simplify. To help limit this, - // at most max_visits nodes will be visited and then - // the walk will be cut off early. - // If the walk *is* cut off early, ShortVisit(re) - // will be called on regexps that cannot be fully - // visited rather than calling PreVisit/PostVisit. - T WalkExponential(Regexp* re, T top_arg, int max_visits); - - // Clears the stack. Should never be necessary, since - // Walk always enters and exits with an empty stack. - // Logs DFATAL if stack is not already clear. - void Reset(); - - // Returns whether walk was cut off. - bool stopped_early() { return stopped_early_; } - - private: - // Walk state for the entire traversal. +#include "re2/regexp.h" + +namespace re2 { + +template<typename T> struct WalkState; + +template<typename T> class Regexp::Walker { + public: + Walker(); + virtual ~Walker(); + + // Virtual method called before visiting re's children. + // PreVisit passes ownership of its return value to its caller. + // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg + // and passed to the child PreVisits and PostVisits as parent_arg. + // At the top-most Regexp, parent_arg is arg passed to walk. + // If PreVisit sets *stop to true, the walk does not recurse + // into the children. Instead it behaves as though the return + // value from PreVisit is the return value from PostVisit. + // The default PreVisit returns parent_arg. + virtual T PreVisit(Regexp* re, T parent_arg, bool* stop); + + // Virtual method called after visiting re's children. + // The pre_arg is the T that PreVisit returned. + // The child_args is a vector of the T that the child PostVisits returned. + // PostVisit takes ownership of pre_arg. + // PostVisit takes ownership of the Ts + // in *child_args, but not the vector itself. + // PostVisit passes ownership of its return value + // to its caller. + // The default PostVisit simply returns pre_arg. + virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg, + T* child_args, int nchild_args); + + // Virtual method called to copy a T, + // when Walk notices that more than one child is the same re. + virtual T Copy(T arg); + + // Virtual method called to do a "quick visit" of the re, + // but not its children. Only called once the visit budget + // has been used up and we're trying to abort the walk + // as quickly as possible. Should return a value that + // makes sense for the parent PostVisits still to be run. + // This function is (hopefully) only called by + // WalkExponential, but must be implemented by all clients, + // just in case. + virtual T ShortVisit(Regexp* re, T parent_arg) = 0; + + // Walks over a regular expression. + // Top_arg is passed as parent_arg to PreVisit and PostVisit of re. + // Returns the T returned by PostVisit on re. + T Walk(Regexp* re, T top_arg); + + // Like Walk, but doesn't use Copy. This can lead to + // exponential runtimes on cross-linked Regexps like the + // ones generated by Simplify. To help limit this, + // at most max_visits nodes will be visited and then + // the walk will be cut off early. + // If the walk *is* cut off early, ShortVisit(re) + // will be called on regexps that cannot be fully + // visited rather than calling PreVisit/PostVisit. + T WalkExponential(Regexp* re, T top_arg, int max_visits); + + // Clears the stack. Should never be necessary, since + // Walk always enters and exits with an empty stack. + // Logs DFATAL if stack is not already clear. + void Reset(); + + // Returns whether walk was cut off. + bool stopped_early() { return stopped_early_; } + + private: + // Walk state for the entire traversal. std::stack<WalkState<T>> stack_; - bool stopped_early_; - int max_visits_; - - T WalkInternal(Regexp* re, T top_arg, bool use_copy); - + bool stopped_early_; + int max_visits_; + + T WalkInternal(Regexp* re, T top_arg, bool use_copy); + Walker(const Walker&) = delete; Walker& operator=(const Walker&) = delete; -}; - -template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re, - T parent_arg, - bool* stop) { - return parent_arg; -} - -template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re, - T parent_arg, - T pre_arg, - T* child_args, - int nchild_args) { - return pre_arg; -} - -template<typename T> T Regexp::Walker<T>::Copy(T arg) { - return arg; -} - -// State about a single level in the traversal. -template<typename T> struct WalkState { +}; + +template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re, + T parent_arg, + bool* stop) { + return parent_arg; +} + +template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re, + T parent_arg, + T pre_arg, + T* child_args, + int nchild_args) { + return pre_arg; +} + +template<typename T> T Regexp::Walker<T>::Copy(T arg) { + return arg; +} + +// State about a single level in the traversal. +template<typename T> struct WalkState { WalkState(Regexp* re, T parent) - : re(re), - n(-1), - parent_arg(parent), - child_args(NULL) { } - - Regexp* re; // The regexp - int n; // The index of the next child to process; -1 means need to PreVisit - T parent_arg; // Accumulated arguments. - T pre_arg; - T child_arg; // One-element buffer for child_args. - T* child_args; -}; - -template<typename T> Regexp::Walker<T>::Walker() { - stopped_early_ = false; -} - -template<typename T> Regexp::Walker<T>::~Walker() { - Reset(); -} - -// Clears the stack. Should never be necessary, since -// Walk always enters and exits with an empty stack. -// Logs DFATAL if stack is not already clear. -template<typename T> void Regexp::Walker<T>::Reset() { + : re(re), + n(-1), + parent_arg(parent), + child_args(NULL) { } + + Regexp* re; // The regexp + int n; // The index of the next child to process; -1 means need to PreVisit + T parent_arg; // Accumulated arguments. + T pre_arg; + T child_arg; // One-element buffer for child_args. + T* child_args; +}; + +template<typename T> Regexp::Walker<T>::Walker() { + stopped_early_ = false; +} + +template<typename T> Regexp::Walker<T>::~Walker() { + Reset(); +} + +// Clears the stack. Should never be necessary, since +// Walk always enters and exits with an empty stack. +// Logs DFATAL if stack is not already clear. +template<typename T> void Regexp::Walker<T>::Reset() { if (!stack_.empty()) { - LOG(DFATAL) << "Stack not empty."; + LOG(DFATAL) << "Stack not empty."; while (!stack_.empty()) { if (stack_.top().re->nsub_ > 1) delete[] stack_.top().child_args; stack_.pop(); - } - } -} - -template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, - bool use_copy) { - Reset(); - - if (re == NULL) { - LOG(DFATAL) << "Walk NULL"; - return top_arg; - } - + } + } +} + +template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, + bool use_copy) { + Reset(); + + if (re == NULL) { + LOG(DFATAL) << "Walk NULL"; + return top_arg; + } + stack_.push(WalkState<T>(re, top_arg)); - - WalkState<T>* s; - for (;;) { - T t; + + WalkState<T>* s; + for (;;) { + T t; s = &stack_.top(); re = s->re; - switch (s->n) { - case -1: { - if (--max_visits_ < 0) { - stopped_early_ = true; - t = ShortVisit(re, s->parent_arg); - break; - } - bool stop = false; - s->pre_arg = PreVisit(re, s->parent_arg, &stop); - if (stop) { - t = s->pre_arg; - break; - } - s->n = 0; - s->child_args = NULL; - if (re->nsub_ == 1) - s->child_args = &s->child_arg; - else if (re->nsub_ > 1) - s->child_args = new T[re->nsub_]; + switch (s->n) { + case -1: { + if (--max_visits_ < 0) { + stopped_early_ = true; + t = ShortVisit(re, s->parent_arg); + break; + } + bool stop = false; + s->pre_arg = PreVisit(re, s->parent_arg, &stop); + if (stop) { + t = s->pre_arg; + break; + } + s->n = 0; + s->child_args = NULL; + if (re->nsub_ == 1) + s->child_args = &s->child_arg; + else if (re->nsub_ > 1) + s->child_args = new T[re->nsub_]; FALLTHROUGH_INTENDED; - } - default: { - if (re->nsub_ > 0) { - Regexp** sub = re->sub(); - if (s->n < re->nsub_) { - if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) { - s->child_args[s->n] = Copy(s->child_args[s->n - 1]); - s->n++; - } else { + } + default: { + if (re->nsub_ > 0) { + Regexp** sub = re->sub(); + if (s->n < re->nsub_) { + if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) { + s->child_args[s->n] = Copy(s->child_args[s->n - 1]); + s->n++; + } else { stack_.push(WalkState<T>(sub[s->n], s->pre_arg)); - } - continue; - } - } - - t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n); - if (re->nsub_ > 1) - delete[] s->child_args; - break; - } - } - + } + continue; + } + } + + t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n); + if (re->nsub_ > 1) + delete[] s->child_args; + break; + } + } + // We've finished stack_.top(). - // Update next guy down. + // Update next guy down. stack_.pop(); if (stack_.empty()) - return t; + return t; s = &stack_.top(); - if (s->child_args != NULL) - s->child_args[s->n] = t; - else - s->child_arg = t; - s->n++; - } -} - -template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) { - // Without the exponential walking behavior, - // this budget should be more than enough for any - // regexp, and yet not enough to get us in trouble - // as far as CPU time. - max_visits_ = 1000000; - return WalkInternal(re, top_arg, true); -} - -template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, - int max_visits) { - max_visits_ = max_visits; - return WalkInternal(re, top_arg, false); -} - -} // namespace re2 - + if (s->child_args != NULL) + s->child_args[s->n] = t; + else + s->child_arg = t; + s->n++; + } +} + +template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) { + // Without the exponential walking behavior, + // this budget should be more than enough for any + // regexp, and yet not enough to get us in trouble + // as far as CPU time. + max_visits_ = 1000000; + return WalkInternal(re, top_arg, true); +} + +template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, + int max_visits) { + max_visits_ = max_visits; + return WalkInternal(re, top_arg, false); +} + +} // namespace re2 + #endif // RE2_WALKER_INL_H_ diff --git a/contrib/libs/re2/util/rune.cc b/contrib/libs/re2/util/rune.cc index 824656f776..4f625ea380 100644 --- a/contrib/libs/re2/util/rune.cc +++ b/contrib/libs/re2/util/rune.cc @@ -1,260 +1,260 @@ -/* - * The authors of this software are Rob Pike and Ken Thompson. - * Copyright (c) 2002 by Lucent Technologies. - * Permission to use, copy, modify, and distribute this software for any - * purpose without fee is hereby granted, provided that this entire notice - * is included in all copies of any software which is or includes a copy - * or modification of this software and in all copies of the supporting - * documentation for such software. - * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY - * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY - * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. - */ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ -#include <stdarg.h> -#include <string.h> +#include <stdarg.h> +#include <string.h> -#include "util/utf.h" - -namespace re2 { - -enum -{ - Bit1 = 7, - Bitx = 6, - Bit2 = 5, - Bit3 = 4, - Bit4 = 3, - Bit5 = 2, - - T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ - Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ - T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ - T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ - T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ - T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ - Rune4 = (1<<(Bit4+3*Bitx))-1, - /* 0001 1111 1111 1111 1111 1111 */ - - Maskx = (1<<Bitx)-1, /* 0011 1111 */ - Testx = Maskx ^ 0xFF, /* 1100 0000 */ - - Bad = Runeerror, -}; - -int -chartorune(Rune *rune, const char *str) -{ - int c, c1, c2, c3; - long l; - - /* - * one character sequence - * 00000-0007F => T1 - */ - c = *(unsigned char*)str; - if(c < Tx) { - *rune = c; - return 1; - } - - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - c1 = *(unsigned char*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) - goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) - goto bad; - *rune = l; - return 2; - } - - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - c2 = *(unsigned char*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) - goto bad; - *rune = l; - return 3; - } - - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => T4 Tx Tx Tx - */ - c3 = *(unsigned char*)(str+3) ^ Tx; - if (c3 & Testx) - goto bad; - if (c < T5) { - l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; - if (l <= Rune3) - goto bad; - *rune = l; - return 4; - } - - /* - * Support for 5-byte or longer UTF-8 would go here, but - * since we don't have that, we'll just fall through to bad. - */ - - /* - * bad decoding - */ -bad: - *rune = Bad; - return 1; -} - -int -runetochar(char *str, const Rune *rune) -{ - /* Runes are signed, so convert to unsigned for range check. */ - unsigned long c; - - /* - * one character sequence - * 00000-0007F => 00-7F - */ - c = *rune; - if(c <= Rune1) { +#include "util/utf.h" + +namespace re2 { + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + Bad = Runeerror, +}; + +int +chartorune(Rune *rune, const char *str) +{ + int c, c1, c2, c3; + long l; + + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(unsigned char*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(unsigned char*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(unsigned char*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(unsigned char*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +runetochar(char *str, const Rune *rune) +{ + /* Runes are signed, so convert to unsigned for range check. */ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { str[0] = static_cast<char>(c); - return 1; - } - - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - if(c <= Rune2) { + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { str[0] = T2 | static_cast<char>(c >> 1*Bitx); - str[1] = Tx | (c & Maskx); - return 2; - } - - /* - * If the Rune is out of range, convert it to the error rune. - * Do this test here because the error rune encodes to three bytes. - * Doing it earlier would duplicate work, since an out of range - * Rune wouldn't have fit in one or two bytes. - */ - if (c > Runemax) - c = Runeerror; - - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - if (c <= Rune3) { + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { str[0] = T3 | static_cast<char>(c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); str[2] = Tx | (c & Maskx); - return 3; - } - - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => T4 Tx Tx Tx - */ + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ str[0] = T4 | static_cast<char>(c >> 3*Bitx); - str[1] = Tx | ((c >> 2*Bitx) & Maskx); - str[2] = Tx | ((c >> 1*Bitx) & Maskx); - str[3] = Tx | (c & Maskx); - return 4; -} - -int -runelen(Rune rune) -{ - char str[10]; - - return runetochar(str, &rune); -} - -int -fullrune(const char *str, int n) -{ - if (n > 0) { - int c = *(unsigned char*)str; - if (c < Tx) - return 1; - if (n > 1) { - if (c < T3) - return 1; - if (n > 2) { - if (c < T4 || n > 3) - return 1; - } - } - } - return 0; -} - - -int -utflen(const char *s) -{ - int c; - long n; - Rune rune; - - n = 0; - for(;;) { - c = *(unsigned char*)s; - if(c < Runeself) { - if(c == 0) - return n; - s++; - } else - s += chartorune(&rune, s); - n++; - } - return 0; -} - -char* -utfrune(const char *s, Rune c) -{ - long c1; - Rune r; - int n; - - if(c < Runesync) /* not part of utf sequence */ - return strchr((char*)s, c); - - for(;;) { - c1 = *(unsigned char*)s; - if(c1 < Runeself) { /* one byte rune */ - if(c1 == 0) - return 0; - if(c1 == c) - return (char*)s; - s++; - continue; - } - n = chartorune(&r, s); - if(r == c) - return (char*)s; - s += n; - } - return 0; -} - -} // namespace re2 + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(Rune rune) +{ + char str[10]; + + return runetochar(str, &rune); +} + +int +fullrune(const char *str, int n) +{ + if (n > 0) { + int c = *(unsigned char*)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; +} + + +int +utflen(const char *s) +{ + int c; + long n; + Rune rune; + + n = 0; + for(;;) { + c = *(unsigned char*)s; + if(c < Runeself) { + if(c == 0) + return n; + s++; + } else + s += chartorune(&rune, s); + n++; + } + return 0; +} + +char* +utfrune(const char *s, Rune c) +{ + long c1; + Rune r; + int n; + + if(c < Runesync) /* not part of utf sequence */ + return strchr((char*)s, c); + + for(;;) { + c1 = *(unsigned char*)s; + if(c1 < Runeself) { /* one byte rune */ + if(c1 == 0) + return 0; + if(c1 == c) + return (char*)s; + s++; + continue; + } + n = chartorune(&r, s); + if(r == c) + return (char*)s; + s += n; + } + return 0; +} + +} // namespace re2 diff --git a/contrib/libs/re2/util/strutil.cc b/contrib/libs/re2/util/strutil.cc index f151ab1b80..fb7e6b1b0c 100644 --- a/contrib/libs/re2/util/strutil.cc +++ b/contrib/libs/re2/util/strutil.cc @@ -1,10 +1,10 @@ -// Copyright 1999-2005 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - +// Copyright 1999-2005 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + #include <stdarg.h> #include <stdio.h> - + #include "util/strutil.h" #ifdef _WIN32 @@ -12,86 +12,86 @@ #define vsnprintf _vsnprintf #endif -namespace re2 { - -// ---------------------------------------------------------------------- -// CEscapeString() -// Copies 'src' to 'dest', escaping dangerous characters using -// C-style escape sequences. 'src' and 'dest' should not overlap. -// Returns the number of bytes written to 'dest' (not including the \0) +namespace re2 { + +// ---------------------------------------------------------------------- +// CEscapeString() +// Copies 'src' to 'dest', escaping dangerous characters using +// C-style escape sequences. 'src' and 'dest' should not overlap. +// Returns the number of bytes written to 'dest' (not including the \0) // or (size_t)-1 if there was insufficient space. -// ---------------------------------------------------------------------- +// ---------------------------------------------------------------------- static size_t CEscapeString(const char* src, size_t src_len, char* dest, size_t dest_len) { - const char* src_end = src + src_len; + const char* src_end = src + src_len; size_t used = 0; - - for (; src < src_end; src++) { + + for (; src < src_end; src++) { if (dest_len - used < 2) // space for two-character escape return (size_t)-1; - - unsigned char c = *src; - switch (c) { - case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break; - case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break; - case '\t': dest[used++] = '\\'; dest[used++] = 't'; break; - case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break; - case '\'': dest[used++] = '\\'; dest[used++] = '\''; break; - case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break; - default: - // Note that if we emit \xNN and the src character after that is a hex - // digit then that digit must be escaped too to prevent it being - // interpreted as part of the character code by C. - if (c < ' ' || c > '~') { + + unsigned char c = *src; + switch (c) { + case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break; + case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break; + case '\t': dest[used++] = '\\'; dest[used++] = 't'; break; + case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break; + case '\'': dest[used++] = '\\'; dest[used++] = '\''; break; + case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break; + default: + // Note that if we emit \xNN and the src character after that is a hex + // digit then that digit must be escaped too to prevent it being + // interpreted as part of the character code by C. + if (c < ' ' || c > '~') { if (dest_len - used < 5) // space for four-character escape + \0 return (size_t)-1; snprintf(dest + used, 5, "\\%03o", c); - used += 4; - } else { - dest[used++] = c; break; - } - } - } - - if (dest_len - used < 1) // make sure that there is room for \0 + used += 4; + } else { + dest[used++] = c; break; + } + } + } + + if (dest_len - used < 1) // make sure that there is room for \0 return (size_t)-1; - - dest[used] = '\0'; // doesn't count towards return value though - return used; -} - -// ---------------------------------------------------------------------- -// CEscape() -// Copies 'src' to result, escaping dangerous characters using -// C-style escape sequences. 'src' and 'dest' should not overlap. -// ---------------------------------------------------------------------- + + dest[used] = '\0'; // doesn't count towards return value though + return used; +} + +// ---------------------------------------------------------------------- +// CEscape() +// Copies 'src' to result, escaping dangerous characters using +// C-style escape sequences. 'src' and 'dest' should not overlap. +// ---------------------------------------------------------------------- std::string CEscape(const StringPiece& src) { const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion char* dest = new char[dest_len]; const size_t used = CEscapeString(src.data(), src.size(), dest, dest_len); std::string s = std::string(dest, used); - delete[] dest; - return s; -} - + delete[] dest; + return s; +} + void PrefixSuccessor(std::string* prefix) { - // We can increment the last character in the string and be done - // unless that character is 255, in which case we have to erase the - // last character and increment the previous character, unless that - // is 255, etc. If the string is empty or consists entirely of - // 255's, we just return the empty string. + // We can increment the last character in the string and be done + // unless that character is 255, in which case we have to erase the + // last character and increment the previous character, unless that + // is 255, etc. If the string is empty or consists entirely of + // 255's, we just return the empty string. while (!prefix->empty()) { char& c = prefix->back(); if (c == '\xff') { // char literal avoids signed/unsigned. prefix->pop_back(); - } else { + } else { ++c; break; - } - } -} - + } + } +} + static void StringAppendV(std::string* dst, const char* format, va_list ap) { // First try with a small fixed size buffer char space[1024]; @@ -146,4 +146,4 @@ std::string StringPrintf(const char* format, ...) { return result; } -} // namespace re2 +} // namespace re2 diff --git a/contrib/libs/re2/util/utf.h b/contrib/libs/re2/util/utf.h index f29404a561..85b4297239 100644 --- a/contrib/libs/re2/util/utf.h +++ b/contrib/libs/re2/util/utf.h @@ -1,44 +1,44 @@ -/* - * The authors of this software are Rob Pike and Ken Thompson. - * Copyright (c) 2002 by Lucent Technologies. - * Permission to use, copy, modify, and distribute this software for any - * purpose without fee is hereby granted, provided that this entire notice - * is included in all copies of any software which is or includes a copy - * or modification of this software and in all copies of the supporting - * documentation for such software. - * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY - * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY - * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. - * - * This file and rune.cc have been converted to compile as C++ code - * in name space re2. - */ - +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + * + * This file and rune.cc have been converted to compile as C++ code + * in name space re2. + */ + #ifndef UTIL_UTF_H_ #define UTIL_UTF_H_ #include <stdint.h> - -namespace re2 { - -typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ - -enum -{ - UTFmax = 4, /* maximum bytes per rune */ - Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ - Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0xFFFD, /* decoding error in UTF */ - Runemax = 0x10FFFF, /* maximum rune value */ -}; - -int runetochar(char* s, const Rune* r); -int chartorune(Rune* r, const char* s); -int fullrune(const char* s, int n); -int utflen(const char* s); -char* utfrune(const char*, Rune); - -} // namespace re2 - + +namespace re2 { + +typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ + +enum +{ + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ +}; + +int runetochar(char* s, const Rune* r); +int chartorune(Rune* r, const char* s); +int fullrune(const char* s, int n); +int utflen(const char* s); +char* utfrune(const char*, Rune); + +} // namespace re2 + #endif // UTIL_UTF_H_ diff --git a/contrib/libs/re2/ya.make b/contrib/libs/re2/ya.make index 0f49b2c6b5..8072de2eb2 100644 --- a/contrib/libs/re2/ya.make +++ b/contrib/libs/re2/ya.make @@ -1,11 +1,11 @@ # Generated by devtools/yamaker from nixpkgs 21.11. -LIBRARY() - +LIBRARY() + OWNER(g:cpp-contrib) VERSION(2022-02-01) - + ORIGINAL_SOURCE(https://github.com/google/re2/archive/2022-02-01.tar.gz) LICENSE( @@ -19,7 +19,7 @@ ADDINCL( GLOBAL contrib/libs/re2/include contrib/libs/re2 ) - + NO_COMPILER_WARNINGS() IF (WITH_VALGRIND) @@ -28,7 +28,7 @@ IF (WITH_VALGRIND) ) ENDIF() -SRCS( +SRCS( re2/bitstate.cc re2/compile.cc re2/dfa.cc @@ -51,9 +51,9 @@ SRCS( re2/unicode_groups.cc util/rune.cc util/strutil.cc -) - -END() +) + +END() RECURSE( re2/testing |