diff options
author | thegeorg <thegeorg@yandex-team.ru> | 2022-02-10 16:45:12 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:12 +0300 |
commit | 49116032d905455a7b1c994e4a696afc885c1e71 (patch) | |
tree | be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/libs/re2/util | |
parent | 4e839db24a3bbc9f1c610c43d6faaaa99824dcca (diff) | |
download | ydb-49116032d905455a7b1c994e4a696afc885c1e71.tar.gz |
Restoring authorship annotation for <thegeorg@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/re2/util')
-rw-r--r-- | contrib/libs/re2/util/flags.h | 52 | ||||
-rw-r--r-- | contrib/libs/re2/util/logging.h | 4 | ||||
-rw-r--r-- | contrib/libs/re2/util/mutex.h | 38 | ||||
-rw-r--r-- | contrib/libs/re2/util/pcre.cc | 2050 | ||||
-rw-r--r-- | contrib/libs/re2/util/pcre.h | 1362 | ||||
-rw-r--r-- | contrib/libs/re2/util/strutil.cc | 24 | ||||
-rw-r--r-- | contrib/libs/re2/util/strutil.h | 8 | ||||
-rw-r--r-- | contrib/libs/re2/util/test.cc | 68 | ||||
-rw-r--r-- | contrib/libs/re2/util/test.h | 100 | ||||
-rw-r--r-- | contrib/libs/re2/util/util.h | 50 |
10 files changed, 1878 insertions, 1878 deletions
diff --git a/contrib/libs/re2/util/flags.h b/contrib/libs/re2/util/flags.h index a3d5fc1234..3386b729d4 100644 --- a/contrib/libs/re2/util/flags.h +++ b/contrib/libs/re2/util/flags.h @@ -1,26 +1,26 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_FLAGS_H_ -#define UTIL_FLAGS_H_ - -// Simplified version of Google's command line flags. -// Does not support parsing the command line. -// If you want to do that, see -// https://gflags.github.io/gflags/ - -#define DEFINE_FLAG(type, name, deflt, desc) \ - namespace re2 { type FLAGS_##name = deflt; } - -#define DECLARE_FLAG(type, name) \ - namespace re2 { extern type FLAGS_##name; } - -namespace re2 { -template <typename T> -T GetFlag(const T& flag) { - return flag; -} -} // namespace re2 - -#endif // UTIL_FLAGS_H_ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_FLAGS_H_ +#define UTIL_FLAGS_H_ + +// Simplified version of Google's command line flags. +// Does not support parsing the command line. +// If you want to do that, see +// https://gflags.github.io/gflags/ + +#define DEFINE_FLAG(type, name, deflt, desc) \ + namespace re2 { type FLAGS_##name = deflt; } + +#define DECLARE_FLAG(type, name) \ + namespace re2 { extern type FLAGS_##name; } + +namespace re2 { +template <typename T> +T GetFlag(const T& flag) { + return flag; +} +} // namespace re2 + +#endif // UTIL_FLAGS_H_ diff --git a/contrib/libs/re2/util/logging.h b/contrib/libs/re2/util/logging.h index be5b4d4dbb..5b2217f29c 100644 --- a/contrib/libs/re2/util/logging.h +++ b/contrib/libs/re2/util/logging.h @@ -62,7 +62,7 @@ class LogMessage { } void Flush() { stream() << "\n"; - std::string s = str_.str(); + std::string s = str_.str(); size_t n = s.size(); if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc flushed_ = true; @@ -93,7 +93,7 @@ class LogMessageFatal : public LogMessage { public: LogMessageFatal(const char* file, int line) : LogMessage(file, line) {} - ATTRIBUTE_NORETURN ~LogMessageFatal() { + ATTRIBUTE_NORETURN ~LogMessageFatal() { Flush(); abort(); } diff --git a/contrib/libs/re2/util/mutex.h b/contrib/libs/re2/util/mutex.h index 0ad97ff1eb..158046bb5c 100644 --- a/contrib/libs/re2/util/mutex.h +++ b/contrib/libs/re2/util/mutex.h @@ -10,13 +10,13 @@ * You should assume the locks are *not* re-entrant. */ -#ifdef _WIN32 -// Requires Windows Vista or Windows Server 2008 at minimum. -#include <windows.h> -#if defined(WINVER) && WINVER >= 0x0600 -#define MUTEX_IS_WIN32_SRWLOCK -#endif -#else +#ifdef _WIN32 +// Requires Windows Vista or Windows Server 2008 at minimum. +#include <windows.h> +#if defined(WINVER) && WINVER >= 0x0600 +#define MUTEX_IS_WIN32_SRWLOCK +#endif +#else #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif @@ -26,9 +26,9 @@ #endif #endif -#if defined(MUTEX_IS_WIN32_SRWLOCK) -typedef SRWLOCK MutexType; -#elif defined(MUTEX_IS_PTHREAD_RWLOCK) +#if defined(MUTEX_IS_WIN32_SRWLOCK) +typedef SRWLOCK MutexType; +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) #include <pthread.h> #include <stdlib.h> typedef pthread_rwlock_t MutexType; @@ -64,17 +64,17 @@ class Mutex { Mutex& operator=(const Mutex&) = delete; }; -#if defined(MUTEX_IS_WIN32_SRWLOCK) +#if defined(MUTEX_IS_WIN32_SRWLOCK) Mutex::Mutex() : mutex_(SRWLOCK_INIT) { } -Mutex::~Mutex() { } -void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } -void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } -void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } -void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } - -#elif defined(MUTEX_IS_PTHREAD_RWLOCK) - +Mutex::~Mutex() { } +void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } +void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } +void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } +void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } + +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) + #define SAFE_PTHREAD(fncall) \ do { \ if ((fncall) != 0) abort(); \ diff --git a/contrib/libs/re2/util/pcre.cc b/contrib/libs/re2/util/pcre.cc index 93ffe9421b..b68985144f 100644 --- a/contrib/libs/re2/util/pcre.cc +++ b/contrib/libs/re2/util/pcre.cc @@ -1,1025 +1,1025 @@ -// Copyright 2003-2009 Google Inc. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This is a variant of PCRE's pcrecpp.cc, originally written at Google. -// The main changes are the addition of the HitLimit method and -// compilation as PCRE in namespace re2. - -#include <assert.h> -#include <ctype.h> -#include <errno.h> -#include <stdlib.h> -#include <string.h> -#include <limits> -#include <string> -#include <utility> - -#include "util/util.h" -#include "util/flags.h" -#include "util/logging.h" -#include "util/pcre.h" -#include "util/strutil.h" - -// Silence warnings about the wacky formatting in the operator() functions. -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 -#pragma GCC diagnostic ignored "-Wmisleading-indentation" -#endif - -#define PCREPORT(level) LOG(level) - -// Default PCRE limits. -// Defaults chosen to allow a plausible amount of CPU and -// not exceed main thread stacks. Note that other threads -// often have smaller stacks, and therefore tightening -// regexp_stack_limit may frequently be necessary. -DEFINE_FLAG(int, regexp_stack_limit, 256 << 10, - "default PCRE stack limit (bytes)"); -DEFINE_FLAG(int, regexp_match_limit, 1000000, - "default PCRE match limit (function calls)"); - -#ifndef USEPCRE - -// Fake just enough of the PCRE API to allow this file to build. :) - -struct pcre_extra { - int flags; - int match_limit; - int match_limit_recursion; -}; - -#define PCRE_EXTRA_MATCH_LIMIT 0 -#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 -#define PCRE_ANCHORED 0 -#define PCRE_NOTEMPTY 0 -#define PCRE_ERROR_NOMATCH 1 -#define PCRE_ERROR_MATCHLIMIT 2 -#define PCRE_ERROR_RECURSIONLIMIT 3 -#define PCRE_INFO_CAPTURECOUNT 0 - -void pcre_free(void*) { -} - -pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { - return NULL; -} - -int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) { - return 0; -} - -int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { - return 0; -} - -#endif - -namespace re2 { - -// Maximum number of args we can set -static const int kMaxArgs = 16; -static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace - -// Approximate size of a recursive invocation of PCRE's -// internal "match()" frame. This varies depending on the -// compiler and architecture, of course, so the constant is -// just a conservative estimate. To find the exact number, -// run regexp_unittest with --regexp_stack_limit=0 under -// a debugger and look at the frames when it crashes. -// The exact frame size was 656 in production on 2008/02/03. -static const int kPCREFrameSize = 700; - -// Special name for missing C++ arguments. -PCRE::Arg PCRE::no_more_args((void*)NULL); - -const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; -const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; -const PCRE::ConsumeFunctor PCRE::Consume = { }; -const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; - -// If a regular expression has no error, its error_ field points here -static const std::string empty_string; - -void PCRE::Init(const char* pattern, Option options, int match_limit, - int stack_limit, bool report_errors) { - pattern_ = pattern; - options_ = options; - match_limit_ = match_limit; - stack_limit_ = stack_limit; - hit_limit_ = false; - error_ = &empty_string; - report_errors_ = report_errors; - re_full_ = NULL; - re_partial_ = NULL; - - if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { - error_ = new std::string("illegal regexp option"); - PCREPORT(ERROR) - << "Error compiling '" << pattern << "': illegal regexp option"; - } else { - re_partial_ = Compile(UNANCHORED); - if (re_partial_ != NULL) { - re_full_ = Compile(ANCHOR_BOTH); - } - } -} - -PCRE::PCRE(const char* pattern) { - Init(pattern, None, 0, 0, true); -} -PCRE::PCRE(const char* pattern, Option option) { - Init(pattern, option, 0, 0, true); -} -PCRE::PCRE(const std::string& pattern) { - Init(pattern.c_str(), None, 0, 0, true); -} -PCRE::PCRE(const std::string& pattern, Option option) { - Init(pattern.c_str(), option, 0, 0, true); -} -PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) { - Init(pattern.c_str(), re_option.option(), re_option.match_limit(), - re_option.stack_limit(), re_option.report_errors()); -} - -PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { - Init(pattern, re_option.option(), re_option.match_limit(), - re_option.stack_limit(), re_option.report_errors()); -} - -PCRE::~PCRE() { - if (re_full_ != NULL) pcre_free(re_full_); - if (re_partial_ != NULL) pcre_free(re_partial_); - if (error_ != &empty_string) delete error_; -} - -pcre* PCRE::Compile(Anchor anchor) { - // Special treatment for anchoring. This is needed because at - // runtime pcre only provides an option for anchoring at the - // beginning of a string. - // - // There are three types of anchoring we want: - // UNANCHORED Compile the original pattern, and use - // a pcre unanchored match. - // ANCHOR_START Compile the original pattern, and use - // a pcre anchored match. - // ANCHOR_BOTH Tack a "\z" to the end of the original pattern - // and use a pcre anchored match. - - const char* error = ""; - int eoffset; - pcre* re; - if (anchor != ANCHOR_BOTH) { - re = pcre_compile(pattern_.c_str(), - (options_ & EnabledCompileOptions), - &error, &eoffset, NULL); - } else { - // Tack a '\z' at the end of PCRE. Parenthesize it first so that - // the '\z' applies to all top-level alternatives in the regexp. - std::string wrapped = "(?:"; // A non-counting grouping operator - wrapped += pattern_; - wrapped += ")\\z"; - re = pcre_compile(wrapped.c_str(), - (options_ & EnabledCompileOptions), - &error, &eoffset, NULL); - } - if (re == NULL) { - if (error_ == &empty_string) error_ = new std::string(error); - PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; - } - return re; -} - -/***** Convenience interfaces *****/ - -bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, - const PCRE& re, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { - const Arg* args[kMaxArgs]; - int n = 0; - if (&a0 == &no_more_args) goto done; args[n++] = &a0; - if (&a1 == &no_more_args) goto done; args[n++] = &a1; - if (&a2 == &no_more_args) goto done; args[n++] = &a2; - if (&a3 == &no_more_args) goto done; args[n++] = &a3; - if (&a4 == &no_more_args) goto done; args[n++] = &a4; - if (&a5 == &no_more_args) goto done; args[n++] = &a5; - if (&a6 == &no_more_args) goto done; args[n++] = &a6; - if (&a7 == &no_more_args) goto done; args[n++] = &a7; - if (&a8 == &no_more_args) goto done; args[n++] = &a8; - if (&a9 == &no_more_args) goto done; args[n++] = &a9; - if (&a10 == &no_more_args) goto done; args[n++] = &a10; - if (&a11 == &no_more_args) goto done; args[n++] = &a11; - if (&a12 == &no_more_args) goto done; args[n++] = &a12; - if (&a13 == &no_more_args) goto done; args[n++] = &a13; - if (&a14 == &no_more_args) goto done; args[n++] = &a14; - if (&a15 == &no_more_args) goto done; args[n++] = &a15; -done: - - size_t consumed; - int vec[kVecSize] = {}; - return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); -} - -bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, - const PCRE& re, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { - const Arg* args[kMaxArgs]; - int n = 0; - if (&a0 == &no_more_args) goto done; args[n++] = &a0; - if (&a1 == &no_more_args) goto done; args[n++] = &a1; - if (&a2 == &no_more_args) goto done; args[n++] = &a2; - if (&a3 == &no_more_args) goto done; args[n++] = &a3; - if (&a4 == &no_more_args) goto done; args[n++] = &a4; - if (&a5 == &no_more_args) goto done; args[n++] = &a5; - if (&a6 == &no_more_args) goto done; args[n++] = &a6; - if (&a7 == &no_more_args) goto done; args[n++] = &a7; - if (&a8 == &no_more_args) goto done; args[n++] = &a8; - if (&a9 == &no_more_args) goto done; args[n++] = &a9; - if (&a10 == &no_more_args) goto done; args[n++] = &a10; - if (&a11 == &no_more_args) goto done; args[n++] = &a11; - if (&a12 == &no_more_args) goto done; args[n++] = &a12; - if (&a13 == &no_more_args) goto done; args[n++] = &a13; - if (&a14 == &no_more_args) goto done; args[n++] = &a14; - if (&a15 == &no_more_args) goto done; args[n++] = &a15; -done: - - size_t consumed; - int vec[kVecSize] = {}; - return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); -} - -bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, - const PCRE& pattern, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { - const Arg* args[kMaxArgs]; - int n = 0; - if (&a0 == &no_more_args) goto done; args[n++] = &a0; - if (&a1 == &no_more_args) goto done; args[n++] = &a1; - if (&a2 == &no_more_args) goto done; args[n++] = &a2; - if (&a3 == &no_more_args) goto done; args[n++] = &a3; - if (&a4 == &no_more_args) goto done; args[n++] = &a4; - if (&a5 == &no_more_args) goto done; args[n++] = &a5; - if (&a6 == &no_more_args) goto done; args[n++] = &a6; - if (&a7 == &no_more_args) goto done; args[n++] = &a7; - if (&a8 == &no_more_args) goto done; args[n++] = &a8; - if (&a9 == &no_more_args) goto done; args[n++] = &a9; - if (&a10 == &no_more_args) goto done; args[n++] = &a10; - if (&a11 == &no_more_args) goto done; args[n++] = &a11; - if (&a12 == &no_more_args) goto done; args[n++] = &a12; - if (&a13 == &no_more_args) goto done; args[n++] = &a13; - if (&a14 == &no_more_args) goto done; args[n++] = &a14; - if (&a15 == &no_more_args) goto done; args[n++] = &a15; -done: - - size_t consumed; - int vec[kVecSize] = {}; - if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, - args, n, vec, kVecSize)) { - input->remove_prefix(consumed); - return true; - } else { - return false; - } -} - -bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, - const PCRE& pattern, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { - const Arg* args[kMaxArgs]; - int n = 0; - if (&a0 == &no_more_args) goto done; args[n++] = &a0; - if (&a1 == &no_more_args) goto done; args[n++] = &a1; - if (&a2 == &no_more_args) goto done; args[n++] = &a2; - if (&a3 == &no_more_args) goto done; args[n++] = &a3; - if (&a4 == &no_more_args) goto done; args[n++] = &a4; - if (&a5 == &no_more_args) goto done; args[n++] = &a5; - if (&a6 == &no_more_args) goto done; args[n++] = &a6; - if (&a7 == &no_more_args) goto done; args[n++] = &a7; - if (&a8 == &no_more_args) goto done; args[n++] = &a8; - if (&a9 == &no_more_args) goto done; args[n++] = &a9; - if (&a10 == &no_more_args) goto done; args[n++] = &a10; - if (&a11 == &no_more_args) goto done; args[n++] = &a11; - if (&a12 == &no_more_args) goto done; args[n++] = &a12; - if (&a13 == &no_more_args) goto done; args[n++] = &a13; - if (&a14 == &no_more_args) goto done; args[n++] = &a14; - if (&a15 == &no_more_args) goto done; args[n++] = &a15; -done: - - size_t consumed; - int vec[kVecSize] = {}; - if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, - args, n, vec, kVecSize)) { - input->remove_prefix(consumed); - return true; - } else { - return false; - } -} - -bool PCRE::Replace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite) { - int vec[kVecSize] = {}; - int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); - if (matches == 0) - return false; - - std::string s; - if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) - return false; - - assert(vec[0] >= 0); - assert(vec[1] >= 0); - str->replace(vec[0], vec[1] - vec[0], s); - return true; -} - -int PCRE::GlobalReplace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite) { - int count = 0; - int vec[kVecSize] = {}; - std::string out; - size_t start = 0; - bool last_match_was_empty_string = false; - - while (start <= str->size()) { - // If the previous match was for the empty string, we shouldn't - // just match again: we'll match in the same way and get an - // infinite loop. Instead, we do the match in a special way: - // anchored -- to force another try at the same position -- - // and with a flag saying that this time, ignore empty matches. - // If this special match returns, that means there's a non-empty - // match at this position as well, and we can continue. If not, - // we do what perl does, and just advance by one. - // Notice that perl prints '@@@' for this; - // perl -le '$_ = "aa"; s/b*|aa/@/g; print' - int matches; - if (last_match_was_empty_string) { - matches = pattern.TryMatch(*str, start, ANCHOR_START, false, - vec, kVecSize); - if (matches <= 0) { - if (start < str->size()) - out.push_back((*str)[start]); - start++; - last_match_was_empty_string = false; - continue; - } - } else { - matches = pattern.TryMatch(*str, start, UNANCHORED, true, - vec, kVecSize); - if (matches <= 0) - break; - } - size_t matchstart = vec[0], matchend = vec[1]; - assert(matchstart >= start); - assert(matchend >= matchstart); - - out.append(*str, start, matchstart - start); - pattern.Rewrite(&out, rewrite, *str, vec, matches); - start = matchend; - count++; - last_match_was_empty_string = (matchstart == matchend); - } - - if (count == 0) - return 0; - - if (start < str->size()) - out.append(*str, start, str->size() - start); - using std::swap; - swap(out, *str); - return count; -} - -bool PCRE::Extract(const StringPiece &text, - const PCRE& pattern, - const StringPiece &rewrite, - std::string *out) { - int vec[kVecSize] = {}; - int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); - if (matches == 0) - return false; - out->clear(); - return pattern.Rewrite(out, rewrite, text, vec, matches); -} - -std::string PCRE::QuoteMeta(const StringPiece& unquoted) { - std::string result; - result.reserve(unquoted.size() << 1); - - // Escape any ascii character not in [A-Za-z_0-9]. - // - // Note that it's legal to escape a character even if it has no - // special meaning in a regular expression -- so this function does - // that. (This also makes it identical to the perl function of the - // same name except for the null-character special case; - // see `perldoc -f quotemeta`.) - for (size_t ii = 0; ii < unquoted.size(); ++ii) { - // Note that using 'isalnum' here raises the benchmark time from - // 32ns to 58ns: - if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && - (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && - (unquoted[ii] < '0' || unquoted[ii] > '9') && - unquoted[ii] != '_' && - // If this is the part of a UTF8 or Latin1 character, we need - // to copy this byte without escaping. Experimentally this is - // what works correctly with the regexp library. - !(unquoted[ii] & 128)) { - if (unquoted[ii] == '\0') { // Special handling for null chars. - // Can't use "\\0" since the next character might be a digit. - result += "\\x00"; - continue; - } - result += '\\'; - } - result += unquoted[ii]; - } - - return result; -} - -/***** Actual matching and rewriting code *****/ - -bool PCRE::HitLimit() { - return hit_limit_ != 0; -} - -void PCRE::ClearHitLimit() { - hit_limit_ = 0; -} - -int PCRE::TryMatch(const StringPiece& text, - size_t startpos, - Anchor anchor, - bool empty_ok, - int *vec, - int vecsize) const { - pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; - if (re == NULL) { - PCREPORT(ERROR) << "Matching against invalid re: " << *error_; - return 0; - } - - int match_limit = match_limit_; - if (match_limit <= 0) { - match_limit = GetFlag(FLAGS_regexp_match_limit); - } - - int stack_limit = stack_limit_; - if (stack_limit <= 0) { - stack_limit = GetFlag(FLAGS_regexp_stack_limit); - } - - pcre_extra extra = { 0 }; - if (match_limit > 0) { - extra.flags |= PCRE_EXTRA_MATCH_LIMIT; - extra.match_limit = match_limit; - } - if (stack_limit > 0) { - extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; - extra.match_limit_recursion = stack_limit / kPCREFrameSize; - } - - int options = 0; - if (anchor != UNANCHORED) - options |= PCRE_ANCHORED; - if (!empty_ok) - options |= PCRE_NOTEMPTY; - - int rc = pcre_exec(re, // The regular expression object - &extra, - (text.data() == NULL) ? "" : text.data(), - static_cast<int>(text.size()), - static_cast<int>(startpos), - options, - vec, - vecsize); - - // Handle errors - if (rc == 0) { - // pcre_exec() returns 0 as a special case when the number of - // capturing subpatterns exceeds the size of the vector. - // When this happens, there is a match and the output vector - // is filled, but we miss out on the positions of the extra subpatterns. - rc = vecsize / 2; - } else if (rc < 0) { - switch (rc) { - case PCRE_ERROR_NOMATCH: - return 0; - case PCRE_ERROR_MATCHLIMIT: - // Writing to hit_limit is not safe if multiple threads - // are using the PCRE, but the flag is only intended - // for use by unit tests anyway, so we let it go. - hit_limit_ = true; - PCREPORT(WARNING) << "Exceeded match limit of " << match_limit - << " when matching '" << pattern_ << "'" - << " against text that is " << text.size() << " bytes."; - return 0; - case PCRE_ERROR_RECURSIONLIMIT: - // See comment about hit_limit above. - hit_limit_ = true; - PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit - << " when matching '" << pattern_ << "'" - << " against text that is " << text.size() << " bytes."; - return 0; - default: - // There are other return codes from pcre.h : - // PCRE_ERROR_NULL (-2) - // PCRE_ERROR_BADOPTION (-3) - // PCRE_ERROR_BADMAGIC (-4) - // PCRE_ERROR_UNKNOWN_NODE (-5) - // PCRE_ERROR_NOMEMORY (-6) - // PCRE_ERROR_NOSUBSTRING (-7) - // ... - PCREPORT(ERROR) << "Unexpected return code: " << rc - << " when matching '" << pattern_ << "'" - << ", re=" << re - << ", text=" << text - << ", vec=" << vec - << ", vecsize=" << vecsize; - return 0; - } - } - - return rc; -} - -bool PCRE::DoMatchImpl(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const* args, - int n, - int* vec, - int vecsize) const { - assert((1 + n) * 3 <= vecsize); // results + PCRE workspace - if (NumberOfCapturingGroups() < n) { - // RE has fewer capturing groups than number of Arg pointers passed in. - return false; - } - - int matches = TryMatch(text, 0, anchor, true, vec, vecsize); - assert(matches >= 0); // TryMatch never returns negatives - if (matches == 0) - return false; - - *consumed = vec[1]; - - if (n == 0 || args == NULL) { - // We are not interested in results - return true; - } - - // If we got here, we must have matched the whole pattern. - // We do not need (can not do) any more checks on the value of 'matches' here - // -- see the comment for TryMatch. - for (int i = 0; i < n; i++) { - const int start = vec[2*(i+1)]; - const int limit = vec[2*(i+1)+1]; - - // Avoid invoking undefined behavior when text.data() happens - // to be null and start happens to be -1, the latter being the - // case for an unmatched subexpression. Even if text.data() is - // not null, pointing one byte before was a longstanding bug. - const char* addr = NULL; - if (start != -1) { - addr = text.data() + start; - } - - if (!args[i]->Parse(addr, limit-start)) { - // TODO: Should we indicate what the error was? - return false; - } - } - - return true; -} - -bool PCRE::DoMatch(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const args[], - int n) const { - assert(n >= 0); - const int vecsize = (1 + n) * 3; // results + PCRE workspace - // (as for kVecSize) - int* vec = new int[vecsize]; - bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); - delete[] vec; - return b; -} - -bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite, - const StringPiece &text, int *vec, int veclen) const { - int number_of_capturing_groups = NumberOfCapturingGroups(); - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - int c = *s; - if (c == '\\') { - c = *++s; - if (isdigit(c)) { - int n = (c - '0'); - if (n >= veclen) { - if (n <= number_of_capturing_groups) { - // unmatched optional capturing group. treat - // its value as empty string; i.e., nothing to append. - } else { - PCREPORT(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); - return false; - } - } - int start = vec[2 * n]; - if (start >= 0) - out->append(text.data() + start, vec[2 * n + 1] - start); - } else if (c == '\\') { - out->push_back('\\'); - } else { - PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); - return false; - } - } else { - out->push_back(c); - } - } - return true; -} - -bool PCRE::CheckRewriteString(const StringPiece& rewrite, - std::string* error) const { - int max_token = -1; - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - int c = *s; - if (c != '\\') { - continue; - } - if (++s == end) { - *error = "Rewrite schema error: '\\' not allowed at end."; - return false; - } - c = *s; - if (c == '\\') { - continue; - } - if (!isdigit(c)) { - *error = "Rewrite schema error: " - "'\\' must be followed by a digit or '\\'."; - return false; - } - int n = (c - '0'); - if (max_token < n) { - max_token = n; - } - } - - if (max_token > NumberOfCapturingGroups()) { - *error = StringPrintf( - "Rewrite schema requests %d matches, but the regexp only has %d " - "parenthesized subexpressions.", - max_token, NumberOfCapturingGroups()); - return false; - } - return true; -} - - -// Return the number of capturing subpatterns, or -1 if the -// regexp wasn't valid on construction. -int PCRE::NumberOfCapturingGroups() const { - if (re_partial_ == NULL) return -1; - - int result; - int rc = pcre_fullinfo(re_partial_, // The regular expression object - NULL, // We did not study the pattern - PCRE_INFO_CAPTURECOUNT, - &result); - if (rc != 0) { - PCREPORT(ERROR) << "Unexpected return code: " << rc; - return -1; - } - return result; -} - - -/***** Parsers for various types *****/ - -bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) { - // We fail if somebody asked us to store into a non-NULL void* pointer - return (dest == NULL); -} - -bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - reinterpret_cast<std::string*>(dest)->assign(str, n); - return true; -} - -bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n); - return true; -} - -bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) { - if (n != 1) return false; - if (dest == NULL) return true; - *(reinterpret_cast<char*>(dest)) = str[0]; - return true; -} - -bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) { - if (n != 1) return false; - if (dest == NULL) return true; - *(reinterpret_cast<signed char*>(dest)) = str[0]; - return true; -} - -bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) { - if (n != 1) return false; - if (dest == NULL) return true; - *(reinterpret_cast<unsigned char*>(dest)) = str[0]; - return true; -} - -// Largest number spec that we are willing to parse -static const int kMaxNumberLength = 32; - -// PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 -// PCREQUIPCRES "n > 0" -// Copies "str" into "buf" and null-terminates if necessary. -// Returns one of: -// a. "str" if no termination is needed -// b. "buf" if the string was copied and null-terminated -// c. "" if the input was invalid and has no hope of being parsed -static const char* TerminateNumber(char* buf, const char* str, size_t n) { - if ((n > 0) && isspace(*str)) { - // We are less forgiving than the strtoxxx() routines and do not - // allow leading spaces. - return ""; - } - - // See if the character right after the input text may potentially - // look like a digit. - if (isdigit(str[n]) || - ((str[n] >= 'a') && (str[n] <= 'f')) || - ((str[n] >= 'A') && (str[n] <= 'F'))) { - if (n > kMaxNumberLength) return ""; // Input too big to be a valid number - memcpy(buf, str, n); - buf[n] = '\0'; - return buf; - } else { - // We can parse right out of the supplied string, so return it. - return str; - } -} - -bool PCRE::Arg::parse_long_radix(const char* str, - size_t n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); - char* end; - errno = 0; - long r = strtol(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *(reinterpret_cast<long*>(dest)) = r; - return true; -} - -bool PCRE::Arg::parse_ulong_radix(const char* str, - size_t n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); - if (str[0] == '-') { - // strtoul() will silently accept negative numbers and parse - // them. This module is more strict and treats them as errors. - return false; - } - - char* end; - errno = 0; - unsigned long r = strtoul(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *(reinterpret_cast<unsigned long*>(dest)) = r; - return true; -} - -bool PCRE::Arg::parse_short_radix(const char* str, - size_t n, - void* dest, - int radix) { - long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range - if (dest == NULL) return true; - *(reinterpret_cast<short*>(dest)) = (short)r; - return true; -} - -bool PCRE::Arg::parse_ushort_radix(const char* str, - size_t n, - void* dest, - int radix) { - unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((unsigned short)r != r) return false; // Out of range - if (dest == NULL) return true; - *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r; - return true; -} - -bool PCRE::Arg::parse_int_radix(const char* str, - size_t n, - void* dest, - int radix) { - long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range - if (dest == NULL) return true; - *(reinterpret_cast<int*>(dest)) = (int)r; - return true; -} - -bool PCRE::Arg::parse_uint_radix(const char* str, - size_t n, - void* dest, - int radix) { - unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((unsigned int)r != r) return false; // Out of range - if (dest == NULL) return true; - *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r; - return true; -} - -bool PCRE::Arg::parse_longlong_radix(const char* str, - size_t n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); - char* end; - errno = 0; - long long r = strtoll(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *(reinterpret_cast<long long*>(dest)) = r; - return true; -} - -bool PCRE::Arg::parse_ulonglong_radix(const char* str, - size_t n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, n); - if (str[0] == '-') { - // strtoull() will silently accept negative numbers and parse - // them. This module is more strict and treats them as errors. - return false; - } - char* end; - errno = 0; - unsigned long long r = strtoull(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *(reinterpret_cast<unsigned long long*>(dest)) = r; - return true; -} - -static bool parse_double_float(const char* str, size_t n, bool isfloat, - void* dest) { - if (n == 0) return false; - static const int kMaxLength = 200; - char buf[kMaxLength]; - if (n >= kMaxLength) return false; - memcpy(buf, str, n); - buf[n] = '\0'; - char* end; - errno = 0; - double r; - if (isfloat) { - r = strtof(buf, &end); - } else { - r = strtod(buf, &end); - } - if (end != buf + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - if (isfloat) { - *(reinterpret_cast<float*>(dest)) = (float)r; - } else { - *(reinterpret_cast<double*>(dest)) = r; - } - return true; -} - -bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) { - return parse_double_float(str, n, false, dest); -} - -bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) { - return parse_double_float(str, n, true, dest); -} - -#define DEFINE_INTEGER_PARSER(name) \ - bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 10); \ - } \ - bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 16); \ - } \ - bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \ - void* dest) { \ - return parse_##name##_radix(str, n, dest, 8); \ - } \ - bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \ - void* dest) { \ - return parse_##name##_radix(str, n, dest, 0); \ - } - -DEFINE_INTEGER_PARSER(short); -DEFINE_INTEGER_PARSER(ushort); -DEFINE_INTEGER_PARSER(int); -DEFINE_INTEGER_PARSER(uint); -DEFINE_INTEGER_PARSER(long); -DEFINE_INTEGER_PARSER(ulong); -DEFINE_INTEGER_PARSER(longlong); -DEFINE_INTEGER_PARSER(ulonglong); - -#undef DEFINE_INTEGER_PARSER - -} // namespace re2 +// Copyright 2003-2009 Google Inc. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is a variant of PCRE's pcrecpp.cc, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <limits> +#include <string> +#include <utility> + +#include "util/util.h" +#include "util/flags.h" +#include "util/logging.h" +#include "util/pcre.h" +#include "util/strutil.h" + +// Silence warnings about the wacky formatting in the operator() functions. +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#endif + +#define PCREPORT(level) LOG(level) + +// Default PCRE limits. +// Defaults chosen to allow a plausible amount of CPU and +// not exceed main thread stacks. Note that other threads +// often have smaller stacks, and therefore tightening +// regexp_stack_limit may frequently be necessary. +DEFINE_FLAG(int, regexp_stack_limit, 256 << 10, + "default PCRE stack limit (bytes)"); +DEFINE_FLAG(int, regexp_match_limit, 1000000, + "default PCRE match limit (function calls)"); + +#ifndef USEPCRE + +// Fake just enough of the PCRE API to allow this file to build. :) + +struct pcre_extra { + int flags; + int match_limit; + int match_limit_recursion; +}; + +#define PCRE_EXTRA_MATCH_LIMIT 0 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 +#define PCRE_ANCHORED 0 +#define PCRE_NOTEMPTY 0 +#define PCRE_ERROR_NOMATCH 1 +#define PCRE_ERROR_MATCHLIMIT 2 +#define PCRE_ERROR_RECURSIONLIMIT 3 +#define PCRE_INFO_CAPTURECOUNT 0 + +void pcre_free(void*) { +} + +pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { + return NULL; +} + +int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) { + return 0; +} + +int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { + return 0; +} + +#endif + +namespace re2 { + +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace + +// Approximate size of a recursive invocation of PCRE's +// internal "match()" frame. This varies depending on the +// compiler and architecture, of course, so the constant is +// just a conservative estimate. To find the exact number, +// run regexp_unittest with --regexp_stack_limit=0 under +// a debugger and look at the frames when it crashes. +// The exact frame size was 656 in production on 2008/02/03. +static const int kPCREFrameSize = 700; + +// Special name for missing C++ arguments. +PCRE::Arg PCRE::no_more_args((void*)NULL); + +const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; +const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; +const PCRE::ConsumeFunctor PCRE::Consume = { }; +const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; + +// If a regular expression has no error, its error_ field points here +static const std::string empty_string; + +void PCRE::Init(const char* pattern, Option options, int match_limit, + int stack_limit, bool report_errors) { + pattern_ = pattern; + options_ = options; + match_limit_ = match_limit; + stack_limit_ = stack_limit; + hit_limit_ = false; + error_ = &empty_string; + report_errors_ = report_errors; + re_full_ = NULL; + re_partial_ = NULL; + + if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { + error_ = new std::string("illegal regexp option"); + PCREPORT(ERROR) + << "Error compiling '" << pattern << "': illegal regexp option"; + } else { + re_partial_ = Compile(UNANCHORED); + if (re_partial_ != NULL) { + re_full_ = Compile(ANCHOR_BOTH); + } + } +} + +PCRE::PCRE(const char* pattern) { + Init(pattern, None, 0, 0, true); +} +PCRE::PCRE(const char* pattern, Option option) { + Init(pattern, option, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern) { + Init(pattern.c_str(), None, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern, Option option) { + Init(pattern.c_str(), option, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) { + Init(pattern.c_str(), re_option.option(), re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { + Init(pattern, re_option.option(), re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::~PCRE() { + if (re_full_ != NULL) pcre_free(re_full_); + if (re_partial_ != NULL) pcre_free(re_partial_); + if (error_ != &empty_string) delete error_; +} + +pcre* PCRE::Compile(Anchor anchor) { + // Special treatment for anchoring. This is needed because at + // runtime pcre only provides an option for anchoring at the + // beginning of a string. + // + // There are three types of anchoring we want: + // UNANCHORED Compile the original pattern, and use + // a pcre unanchored match. + // ANCHOR_START Compile the original pattern, and use + // a pcre anchored match. + // ANCHOR_BOTH Tack a "\z" to the end of the original pattern + // and use a pcre anchored match. + + const char* error = ""; + int eoffset; + pcre* re; + if (anchor != ANCHOR_BOTH) { + re = pcre_compile(pattern_.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } else { + // Tack a '\z' at the end of PCRE. Parenthesize it first so that + // the '\z' applies to all top-level alternatives in the regexp. + std::string wrapped = "(?:"; // A non-counting grouping operator + wrapped += pattern_; + wrapped += ")\\z"; + re = pcre_compile(wrapped.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } + if (re == NULL) { + if (error_ == &empty_string) error_ = new std::string(error); + PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; + } + return re; +} + +/***** Convenience interfaces *****/ + +bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::Replace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int vec[kVecSize] = {}; + int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + + std::string s; + if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) + return false; + + assert(vec[0] >= 0); + assert(vec[1] >= 0); + str->replace(vec[0], vec[1] - vec[0], s); + return true; +} + +int PCRE::GlobalReplace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int count = 0; + int vec[kVecSize] = {}; + std::string out; + size_t start = 0; + bool last_match_was_empty_string = false; + + while (start <= str->size()) { + // If the previous match was for the empty string, we shouldn't + // just match again: we'll match in the same way and get an + // infinite loop. Instead, we do the match in a special way: + // anchored -- to force another try at the same position -- + // and with a flag saying that this time, ignore empty matches. + // If this special match returns, that means there's a non-empty + // match at this position as well, and we can continue. If not, + // we do what perl does, and just advance by one. + // Notice that perl prints '@@@' for this; + // perl -le '$_ = "aa"; s/b*|aa/@/g; print' + int matches; + if (last_match_was_empty_string) { + matches = pattern.TryMatch(*str, start, ANCHOR_START, false, + vec, kVecSize); + if (matches <= 0) { + if (start < str->size()) + out.push_back((*str)[start]); + start++; + last_match_was_empty_string = false; + continue; + } + } else { + matches = pattern.TryMatch(*str, start, UNANCHORED, true, + vec, kVecSize); + if (matches <= 0) + break; + } + size_t matchstart = vec[0], matchend = vec[1]; + assert(matchstart >= start); + assert(matchend >= matchstart); + + out.append(*str, start, matchstart - start); + pattern.Rewrite(&out, rewrite, *str, vec, matches); + start = matchend; + count++; + last_match_was_empty_string = (matchstart == matchend); + } + + if (count == 0) + return 0; + + if (start < str->size()) + out.append(*str, start, str->size() - start); + using std::swap; + swap(out, *str); + return count; +} + +bool PCRE::Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + std::string *out) { + int vec[kVecSize] = {}; + int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + out->clear(); + return pattern.Rewrite(out, rewrite, text, vec, matches); +} + +std::string PCRE::QuoteMeta(const StringPiece& unquoted) { + std::string result; + result.reserve(unquoted.size() << 1); + + // Escape any ascii character not in [A-Za-z_0-9]. + // + // Note that it's legal to escape a character even if it has no + // special meaning in a regular expression -- so this function does + // that. (This also makes it identical to the perl function of the + // same name except for the null-character special case; + // see `perldoc -f quotemeta`.) + for (size_t ii = 0; ii < unquoted.size(); ++ii) { + // Note that using 'isalnum' here raises the benchmark time from + // 32ns to 58ns: + if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { // Special handling for null chars. + // Can't use "\\0" since the next character might be a digit. + result += "\\x00"; + continue; + } + result += '\\'; + } + result += unquoted[ii]; + } + + return result; +} + +/***** Actual matching and rewriting code *****/ + +bool PCRE::HitLimit() { + return hit_limit_ != 0; +} + +void PCRE::ClearHitLimit() { + hit_limit_ = 0; +} + +int PCRE::TryMatch(const StringPiece& text, + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const { + pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; + if (re == NULL) { + PCREPORT(ERROR) << "Matching against invalid re: " << *error_; + return 0; + } + + int match_limit = match_limit_; + if (match_limit <= 0) { + match_limit = GetFlag(FLAGS_regexp_match_limit); + } + + int stack_limit = stack_limit_; + if (stack_limit <= 0) { + stack_limit = GetFlag(FLAGS_regexp_stack_limit); + } + + pcre_extra extra = { 0 }; + if (match_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT; + extra.match_limit = match_limit; + } + if (stack_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; + extra.match_limit_recursion = stack_limit / kPCREFrameSize; + } + + int options = 0; + if (anchor != UNANCHORED) + options |= PCRE_ANCHORED; + if (!empty_ok) + options |= PCRE_NOTEMPTY; + + int rc = pcre_exec(re, // The regular expression object + &extra, + (text.data() == NULL) ? "" : text.data(), + static_cast<int>(text.size()), + static_cast<int>(startpos), + options, + vec, + vecsize); + + // Handle errors + if (rc == 0) { + // pcre_exec() returns 0 as a special case when the number of + // capturing subpatterns exceeds the size of the vector. + // When this happens, there is a match and the output vector + // is filled, but we miss out on the positions of the extra subpatterns. + rc = vecsize / 2; + } else if (rc < 0) { + switch (rc) { + case PCRE_ERROR_NOMATCH: + return 0; + case PCRE_ERROR_MATCHLIMIT: + // Writing to hit_limit is not safe if multiple threads + // are using the PCRE, but the flag is only intended + // for use by unit tests anyway, so we let it go. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded match limit of " << match_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + case PCRE_ERROR_RECURSIONLIMIT: + // See comment about hit_limit above. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + default: + // There are other return codes from pcre.h : + // PCRE_ERROR_NULL (-2) + // PCRE_ERROR_BADOPTION (-3) + // PCRE_ERROR_BADMAGIC (-4) + // PCRE_ERROR_UNKNOWN_NODE (-5) + // PCRE_ERROR_NOMEMORY (-6) + // PCRE_ERROR_NOSUBSTRING (-7) + // ... + PCREPORT(ERROR) << "Unexpected return code: " << rc + << " when matching '" << pattern_ << "'" + << ", re=" << re + << ", text=" << text + << ", vec=" << vec + << ", vecsize=" << vecsize; + return 0; + } + } + + return rc; +} + +bool PCRE::DoMatchImpl(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const* args, + int n, + int* vec, + int vecsize) const { + assert((1 + n) * 3 <= vecsize); // results + PCRE workspace + if (NumberOfCapturingGroups() < n) { + // RE has fewer capturing groups than number of Arg pointers passed in. + return false; + } + + int matches = TryMatch(text, 0, anchor, true, vec, vecsize); + assert(matches >= 0); // TryMatch never returns negatives + if (matches == 0) + return false; + + *consumed = vec[1]; + + if (n == 0 || args == NULL) { + // We are not interested in results + return true; + } + + // If we got here, we must have matched the whole pattern. + // We do not need (can not do) any more checks on the value of 'matches' here + // -- see the comment for TryMatch. + for (int i = 0; i < n; i++) { + const int start = vec[2*(i+1)]; + const int limit = vec[2*(i+1)+1]; + + // Avoid invoking undefined behavior when text.data() happens + // to be null and start happens to be -1, the latter being the + // case for an unmatched subexpression. Even if text.data() is + // not null, pointing one byte before was a longstanding bug. + const char* addr = NULL; + if (start != -1) { + addr = text.data() + start; + } + + if (!args[i]->Parse(addr, limit-start)) { + // TODO: Should we indicate what the error was? + return false; + } + } + + return true; +} + +bool PCRE::DoMatch(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n) const { + assert(n >= 0); + const int vecsize = (1 + n) * 3; // results + PCRE workspace + // (as for kVecSize) + int* vec = new int[vecsize]; + bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); + delete[] vec; + return b; +} + +bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite, + const StringPiece &text, int *vec, int veclen) const { + int number_of_capturing_groups = NumberOfCapturingGroups(); + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c == '\\') { + c = *++s; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (n <= number_of_capturing_groups) { + // unmatched optional capturing group. treat + // its value as empty string; i.e., nothing to append. + } else { + PCREPORT(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + return false; + } + } + int start = vec[2 * n]; + if (start >= 0) + out->append(text.data() + start, vec[2 * n + 1] - start); + } else if (c == '\\') { + out->push_back('\\'); + } else { + PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } else { + out->push_back(c); + } + } + return true; +} + +bool PCRE::CheckRewriteString(const StringPiece& rewrite, + std::string* error) const { + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c != '\\') { + continue; + } + if (++s == end) { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') { + continue; + } + if (!isdigit(c)) { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) { + max_token = n; + } + } + + if (max_token > NumberOfCapturingGroups()) { + *error = StringPrintf( + "Rewrite schema requests %d matches, but the regexp only has %d " + "parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); + return false; + } + return true; +} + + +// Return the number of capturing subpatterns, or -1 if the +// regexp wasn't valid on construction. +int PCRE::NumberOfCapturingGroups() const { + if (re_partial_ == NULL) return -1; + + int result; + int rc = pcre_fullinfo(re_partial_, // The regular expression object + NULL, // We did not study the pattern + PCRE_INFO_CAPTURECOUNT, + &result); + if (rc != 0) { + PCREPORT(ERROR) << "Unexpected return code: " << rc; + return -1; + } + return result; +} + + +/***** Parsers for various types *****/ + +bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) { + // We fail if somebody asked us to store into a non-NULL void* pointer + return (dest == NULL); +} + +bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast<std::string*>(dest)->assign(str, n); + return true; +} + +bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n); + return true; +} + +bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<char*>(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<signed char*>(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned char*>(dest)) = str[0]; + return true; +} + +// Largest number spec that we are willing to parse +static const int kMaxNumberLength = 32; + +// PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 +// PCREQUIPCRES "n > 0" +// Copies "str" into "buf" and null-terminates if necessary. +// Returns one of: +// a. "str" if no termination is needed +// b. "buf" if the string was copied and null-terminated +// c. "" if the input was invalid and has no hope of being parsed +static const char* TerminateNumber(char* buf, const char* str, size_t n) { + if ((n > 0) && isspace(*str)) { + // We are less forgiving than the strtoxxx() routines and do not + // allow leading spaces. + return ""; + } + + // See if the character right after the input text may potentially + // look like a digit. + if (isdigit(str[n]) || + ((str[n] >= 'a') && (str[n] <= 'f')) || + ((str[n] >= 'A') && (str[n] <= 'F'))) { + if (n > kMaxNumberLength) return ""; // Input too big to be a valid number + memcpy(buf, str, n); + buf[n] = '\0'; + return buf; + } else { + // We can parse right out of the supplied string, so return it. + return str; + } +} + +bool PCRE::Arg::parse_long_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long r = strtol(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + + char* end; + errno = 0; + unsigned long r = strtoul(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_short_radix(const char* str, + size_t n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<short*>(dest)) = (short)r; + return true; +} + +bool PCRE::Arg::parse_ushort_radix(const char* str, + size_t n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r; + return true; +} + +bool PCRE::Arg::parse_int_radix(const char* str, + size_t n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<int*>(dest)) = (int)r; + return true; +} + +bool PCRE::Arg::parse_uint_radix(const char* str, + size_t n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r; + return true; +} + +bool PCRE::Arg::parse_longlong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long long r = strtoll(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<long long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulonglong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoull() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + char* end; + errno = 0; + unsigned long long r = strtoull(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long long*>(dest)) = r; + return true; +} + +static bool parse_double_float(const char* str, size_t n, bool isfloat, + void* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength]; + if (n >= kMaxLength) return false; + memcpy(buf, str, n); + buf[n] = '\0'; + char* end; + errno = 0; + double r; + if (isfloat) { + r = strtof(buf, &end); + } else { + r = strtod(buf, &end); + } + if (end != buf + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + if (isfloat) { + *(reinterpret_cast<float*>(dest)) = (float)r; + } else { + *(reinterpret_cast<double*>(dest)) = r; + } + return true; +} + +bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, false, dest); +} + +bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, true, dest); +} + +#define DEFINE_INTEGER_PARSER(name) \ + bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 0); \ + } + +DEFINE_INTEGER_PARSER(short); +DEFINE_INTEGER_PARSER(ushort); +DEFINE_INTEGER_PARSER(int); +DEFINE_INTEGER_PARSER(uint); +DEFINE_INTEGER_PARSER(long); +DEFINE_INTEGER_PARSER(ulong); +DEFINE_INTEGER_PARSER(longlong); +DEFINE_INTEGER_PARSER(ulonglong); + +#undef DEFINE_INTEGER_PARSER + +} // namespace re2 diff --git a/contrib/libs/re2/util/pcre.h b/contrib/libs/re2/util/pcre.h index 500c56d283..896b0bdf89 100644 --- a/contrib/libs/re2/util/pcre.h +++ b/contrib/libs/re2/util/pcre.h @@ -1,681 +1,681 @@ -// Copyright 2003-2010 Google Inc. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_PCRE_H_ -#define UTIL_PCRE_H_ - -// This is a variant of PCRE's pcrecpp.h, originally written at Google. -// The main changes are the addition of the HitLimit method and -// compilation as PCRE in namespace re2. - -// C++ interface to the pcre regular-expression library. PCRE supports -// Perl-style regular expressions (with extensions like \d, \w, \s, -// ...). -// -// ----------------------------------------------------------------------- -// REGEXP SYNTAX: -// -// This module uses the pcre library and hence supports its syntax -// for regular expressions: -// -// http://www.google.com/search?q=pcre -// -// The syntax is pretty similar to Perl's. For those not familiar -// with Perl's regular expressions, here are some examples of the most -// commonly used extensions: -// -// "hello (\\w+) world" -- \w matches a "word" character -// "version (\\d+)" -- \d matches a digit -// "hello\\s+world" -- \s matches any whitespace character -// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary -// "(?i)hello" -- (?i) turns on case-insensitive matching -// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible -// -// ----------------------------------------------------------------------- -// MATCHING INTERFACE: -// -// The "FullMatch" operation checks that supplied text matches a -// supplied pattern exactly. -// -// Example: successful match -// CHECK(PCRE::FullMatch("hello", "h.*o")); -// -// Example: unsuccessful match (requires full match): -// CHECK(!PCRE::FullMatch("hello", "e")); -// -// ----------------------------------------------------------------------- -// UTF-8 AND THE MATCHING INTERFACE: -// -// By default, pattern and text are plain text, one byte per character. -// The UTF8 flag, passed to the constructor, causes both pattern -// and string to be treated as UTF-8 text, still a byte stream but -// potentially multiple bytes per character. In practice, the text -// is likelier to be UTF-8 than the pattern, but the match returned -// may depend on the UTF8 flag, so always use it when matching -// UTF8 text. E.g., "." will match one byte normally but with UTF8 -// set may match up to three bytes of a multi-byte character. -// -// Example: -// PCRE re(utf8_pattern, PCRE::UTF8); -// CHECK(PCRE::FullMatch(utf8_string, re)); -// -// ----------------------------------------------------------------------- -// MATCHING WITH SUBSTRING EXTRACTION: -// -// You can supply extra pointer arguments to extract matched substrings. -// -// Example: extracts "ruby" into "s" and 1234 into "i" -// int i; -// std::string s; -// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); -// -// Example: fails because string cannot be stored in integer -// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i)); -// -// Example: fails because there aren't enough sub-patterns: -// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s)); -// -// Example: does not try to extract any extra sub-patterns -// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); -// -// Example: does not try to extract into NULL -// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); -// -// Example: integer overflow causes failure -// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); -// -// ----------------------------------------------------------------------- -// PARTIAL MATCHES -// -// You can use the "PartialMatch" operation when you want the pattern -// to match any substring of the text. -// -// Example: simple search for a string: -// CHECK(PCRE::PartialMatch("hello", "ell")); -// -// Example: find first number in a string -// int number; -// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number)); -// CHECK_EQ(number, 100); -// -// ----------------------------------------------------------------------- -// PPCRE-COMPILED PCREGULAR EXPPCRESSIONS -// -// PCRE makes it easy to use any string as a regular expression, without -// requiring a separate compilation step. -// -// If speed is of the essence, you can create a pre-compiled "PCRE" -// object from the pattern and use it multiple times. If you do so, -// you can typically parse text faster than with sscanf. -// -// Example: precompile pattern for faster matching: -// PCRE pattern("h.*o"); -// while (ReadLine(&str)) { -// if (PCRE::FullMatch(str, pattern)) ...; -// } -// -// ----------------------------------------------------------------------- -// SCANNING TEXT INCPCREMENTALLY -// -// The "Consume" operation may be useful if you want to repeatedly -// match regular expressions at the front of a string and skip over -// them as they match. This requires use of the "StringPiece" type, -// which represents a sub-range of a real string. -// -// Example: read lines of the form "var = value" from a string. -// std::string contents = ...; // Fill string somehow -// StringPiece input(contents); // Wrap a StringPiece around it -// -// std::string var; -// int value; -// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { -// ...; -// } -// -// Each successful call to "Consume" will set "var/value", and also -// advance "input" so it points past the matched text. Note that if the -// regular expression matches an empty string, input will advance -// by 0 bytes. If the regular expression being used might match -// an empty string, the loop body must check for this case and either -// advance the string or break out of the loop. -// -// The "FindAndConsume" operation is similar to "Consume" but does not -// anchor your match at the beginning of the string. For example, you -// could extract all words from a string by repeatedly calling -// PCRE::FindAndConsume(&input, "(\\w+)", &word) -// -// ----------------------------------------------------------------------- -// PARSING HEX/OCTAL/C-RADIX NUMBERS -// -// By default, if you pass a pointer to a numeric value, the -// corresponding text is interpreted as a base-10 number. You can -// instead wrap the pointer with a call to one of the operators Hex(), -// Octal(), or CRadix() to interpret the text in another base. The -// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) -// prefixes, but defaults to base-10. -// -// Example: -// int a, b, c, d; -// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", -// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)); -// will leave 64 in a, b, c, and d. - -#include "util/util.h" -#include "re2/stringpiece.h" - -#ifdef USEPCRE -#include <pcre.h> -namespace re2 { -const bool UsingPCRE = true; -} // namespace re2 -#else -struct pcre; // opaque -namespace re2 { -const bool UsingPCRE = false; -} // namespace re2 -#endif - -namespace re2 { - -class PCRE_Options; - -// Interface for regular expression matching. Also corresponds to a -// pre-compiled regular expression. An "PCRE" object is safe for -// concurrent use by multiple threads. -class PCRE { - public: - // We convert user-passed pointers into special Arg objects - class Arg; - - // Marks end of arg list. - // ONLY USE IN OPTIONAL ARG DEFAULTS. - // DO NOT PASS EXPLICITLY. - static Arg no_more_args; - - // Options are same value as those in pcre. We provide them here - // to avoid users needing to include pcre.h and also to isolate - // users from pcre should we change the underlying library. - // Only those needed by Google programs are exposed here to - // avoid collision with options employed internally by regexp.cc - // Note that some options have equivalents that can be specified in - // the regexp itself. For example, prefixing your regexp with - // "(?s)" has the same effect as the PCRE_DOTALL option. - enum Option { - None = 0x0000, - UTF8 = 0x0800, // == PCRE_UTF8 - EnabledCompileOptions = UTF8, - EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag - }; - - // We provide implicit conversions from strings so that users can - // pass in a string or a "const char*" wherever an "PCRE" is expected. - PCRE(const char* pattern); - PCRE(const char* pattern, Option option); - PCRE(const std::string& pattern); - PCRE(const std::string& pattern, Option option); - PCRE(const char *pattern, const PCRE_Options& re_option); - PCRE(const std::string& pattern, const PCRE_Options& re_option); - - ~PCRE(); - - // The string specification for this PCRE. E.g. - // PCRE re("ab*c?d+"); - // re.pattern(); // "ab*c?d+" - const std::string& pattern() const { return pattern_; } - - // If PCRE could not be created properly, returns an error string. - // Else returns the empty string. - const std::string& error() const { return *error_; } - - // Whether the PCRE has hit a match limit during execution. - // Not thread safe. Intended only for testing. - // If hitting match limits is a problem, - // you should be using PCRE2 (re2/re2.h) - // instead of checking this flag. - bool HitLimit(); - void ClearHitLimit(); - - /***** The useful part: the matching interface *****/ - - // Matches "text" against "pattern". If pointer arguments are - // supplied, copies matched sub-patterns into them. - // - // You can pass in a "const char*" or a "std::string" for "text". - // You can pass in a "const char*" or a "std::string" or a "PCRE" for "pattern". - // - // The provided pointer arguments can be pointers to any scalar numeric - // type, or one of: - // std::string (matched piece is copied to string) - // StringPiece (StringPiece is mutated to point to matched piece) - // T (where "bool T::ParseFrom(const char*, size_t)" exists) - // (void*)NULL (the corresponding matched sub-pattern is not copied) - // - // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "pattern" exactly - // b. The number of matched sub-patterns is >= number of supplied pointers - // c. The "i"th argument has a suitable type for holding the - // string captured as the "i"th sub-pattern. If you pass in - // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, "i"th captured sub-pattern is - // ignored. - // - // CAVEAT: An optional sub-pattern that does not exist in the - // matched string is assigned the empty string. Therefore, the - // following will return false (because the empty string is not a - // valid number): - // int number; - // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number); - struct FullMatchFunctor { - bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args - const Arg& ptr1 = no_more_args, - const Arg& ptr2 = no_more_args, - const Arg& ptr3 = no_more_args, - const Arg& ptr4 = no_more_args, - const Arg& ptr5 = no_more_args, - const Arg& ptr6 = no_more_args, - const Arg& ptr7 = no_more_args, - const Arg& ptr8 = no_more_args, - const Arg& ptr9 = no_more_args, - const Arg& ptr10 = no_more_args, - const Arg& ptr11 = no_more_args, - const Arg& ptr12 = no_more_args, - const Arg& ptr13 = no_more_args, - const Arg& ptr14 = no_more_args, - const Arg& ptr15 = no_more_args, - const Arg& ptr16 = no_more_args) const; - }; - - static const FullMatchFunctor FullMatch; - - // Exactly like FullMatch(), except that "pattern" is allowed to match - // a substring of "text". - struct PartialMatchFunctor { - bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args - const Arg& ptr1 = no_more_args, - const Arg& ptr2 = no_more_args, - const Arg& ptr3 = no_more_args, - const Arg& ptr4 = no_more_args, - const Arg& ptr5 = no_more_args, - const Arg& ptr6 = no_more_args, - const Arg& ptr7 = no_more_args, - const Arg& ptr8 = no_more_args, - const Arg& ptr9 = no_more_args, - const Arg& ptr10 = no_more_args, - const Arg& ptr11 = no_more_args, - const Arg& ptr12 = no_more_args, - const Arg& ptr13 = no_more_args, - const Arg& ptr14 = no_more_args, - const Arg& ptr15 = no_more_args, - const Arg& ptr16 = no_more_args) const; - }; - - static const PartialMatchFunctor PartialMatch; - - // Like FullMatch() and PartialMatch(), except that pattern has to - // match a prefix of "text", and "input" is advanced past the matched - // text. Note: "input" is modified iff this routine returns true. - struct ConsumeFunctor { - bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args - const Arg& ptr1 = no_more_args, - const Arg& ptr2 = no_more_args, - const Arg& ptr3 = no_more_args, - const Arg& ptr4 = no_more_args, - const Arg& ptr5 = no_more_args, - const Arg& ptr6 = no_more_args, - const Arg& ptr7 = no_more_args, - const Arg& ptr8 = no_more_args, - const Arg& ptr9 = no_more_args, - const Arg& ptr10 = no_more_args, - const Arg& ptr11 = no_more_args, - const Arg& ptr12 = no_more_args, - const Arg& ptr13 = no_more_args, - const Arg& ptr14 = no_more_args, - const Arg& ptr15 = no_more_args, - const Arg& ptr16 = no_more_args) const; - }; - - static const ConsumeFunctor Consume; - - // Like Consume(..), but does not anchor the match at the beginning of the - // string. That is, "pattern" need not start its match at the beginning of - // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next - // word in "s" and stores it in "word". - struct FindAndConsumeFunctor { - bool operator ()(StringPiece* input, const PCRE& pattern, - const Arg& ptr1 = no_more_args, - const Arg& ptr2 = no_more_args, - const Arg& ptr3 = no_more_args, - const Arg& ptr4 = no_more_args, - const Arg& ptr5 = no_more_args, - const Arg& ptr6 = no_more_args, - const Arg& ptr7 = no_more_args, - const Arg& ptr8 = no_more_args, - const Arg& ptr9 = no_more_args, - const Arg& ptr10 = no_more_args, - const Arg& ptr11 = no_more_args, - const Arg& ptr12 = no_more_args, - const Arg& ptr13 = no_more_args, - const Arg& ptr14 = no_more_args, - const Arg& ptr15 = no_more_args, - const Arg& ptr16 = no_more_args) const; - }; - - static const FindAndConsumeFunctor FindAndConsume; - - // Replace the first match of "pattern" in "str" with "rewrite". - // Within "rewrite", backslash-escaped digits (\1 to \9) can be - // used to insert text matching corresponding parenthesized group - // from the pattern. \0 in "rewrite" refers to the entire matching - // text. E.g., - // - // std::string s = "yabba dabba doo"; - // CHECK(PCRE::Replace(&s, "b+", "d")); - // - // will leave "s" containing "yada dabba doo" - // - // Returns true if the pattern matches and a replacement occurs, - // false otherwise. - static bool Replace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite); - - // Like Replace(), except replaces all occurrences of the pattern in - // the string with the rewrite. Replacements are not subject to - // re-matching. E.g., - // - // std::string s = "yabba dabba doo"; - // CHECK(PCRE::GlobalReplace(&s, "b+", "d")); - // - // will leave "s" containing "yada dada doo" - // - // Returns the number of replacements made. - static int GlobalReplace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite); - - // Like Replace, except that if the pattern matches, "rewrite" - // is copied into "out" with substitutions. The non-matching - // portions of "text" are ignored. - // - // Returns true iff a match occurred and the extraction happened - // successfully; if no match occurs, the string is left unaffected. - static bool Extract(const StringPiece &text, - const PCRE& pattern, - const StringPiece &rewrite, - std::string *out); - - // Check that the given @p rewrite string is suitable for use with - // this PCRE. It checks that: - // * The PCRE has enough parenthesized subexpressions to satisfy all - // of the \N tokens in @p rewrite, and - // * The @p rewrite string doesn't have any syntax errors - // ('\' followed by anything besides [0-9] and '\'). - // Making this test will guarantee that "replace" and "extract" - // operations won't LOG(ERROR) or fail because of a bad rewrite - // string. - // @param rewrite The proposed rewrite string. - // @param error An error message is recorded here, iff we return false. - // Otherwise, it is unchanged. - // @return true, iff @p rewrite is suitable for use with the PCRE. - bool CheckRewriteString(const StringPiece& rewrite, - std::string* error) const; - - // Returns a copy of 'unquoted' with all potentially meaningful - // regexp characters backslash-escaped. The returned string, used - // as a regular expression, will exactly match the original string. - // For example, - // 1.5-2.0? - // becomes: - // 1\.5\-2\.0\? - static std::string QuoteMeta(const StringPiece& unquoted); - - /***** Generic matching interface (not so nice to use) *****/ - - // Type of match (TODO: Should be restructured as an Option) - enum Anchor { - UNANCHORED, // No anchoring - ANCHOR_START, // Anchor at start only - ANCHOR_BOTH, // Anchor at start and end - }; - - // General matching routine. Stores the length of the match in - // "*consumed" if successful. - bool DoMatch(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const* args, int n) const; - - // Return the number of capturing subpatterns, or -1 if the - // regexp wasn't valid on construction. - int NumberOfCapturingGroups() const; - - private: - void Init(const char* pattern, Option option, int match_limit, - int stack_limit, bool report_errors); - - // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with - // pairs of integers for the beginning and end positions of matched - // text. The first pair corresponds to the entire matched text; - // subsequent pairs correspond, in order, to parentheses-captured - // matches. Returns the number of pairs (one more than the number of - // the last subpattern with a match) if matching was successful - // and zero if the match failed. - // I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching - // against "foo", "bar", and "baz" respectively. - // When matching PCRE("(foo)|hello") against "hello", it will return 1. - // But the values for all subpattern are filled in into "vec". - int TryMatch(const StringPiece& text, - size_t startpos, - Anchor anchor, - bool empty_ok, - int *vec, - int vecsize) const; - - // Append the "rewrite" string, with backslash subsitutions from "text" - // and "vec", to string "out". - bool Rewrite(std::string *out, - const StringPiece &rewrite, - const StringPiece &text, - int *vec, - int veclen) const; - - // internal implementation for DoMatch - bool DoMatchImpl(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const args[], - int n, - int* vec, - int vecsize) const; - - // Compile the regexp for the specified anchoring mode - pcre* Compile(Anchor anchor); - - std::string pattern_; - Option options_; - pcre* re_full_; // For full matches - pcre* re_partial_; // For partial matches - const std::string* error_; // Error indicator (or empty string) - bool report_errors_; // Silences error logging if false - int match_limit_; // Limit on execution resources - int stack_limit_; // Limit on stack resources (bytes) - mutable int32_t hit_limit_; // Hit limit during execution (bool) - - PCRE(const PCRE&) = delete; - PCRE& operator=(const PCRE&) = delete; -}; - -// PCRE_Options allow you to set the PCRE::Options, plus any pcre -// "extra" options. The only extras are match_limit, which limits -// the CPU time of a match, and stack_limit, which limits the -// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default -// that should not cause too many problems in production code. -// If PCRE hits a limit during a match, it may return a false negative, -// but (hopefully) it won't crash. -// -// NOTE: If you are handling regular expressions specified by -// (external or internal) users, rather than hard-coded ones, -// you should be using PCRE2, which uses an alternate implementation -// that avoids these issues. See http://go/re2quick. -class PCRE_Options { - public: - // constructor - PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {} - // accessors - PCRE::Option option() const { return option_; } - void set_option(PCRE::Option option) { - option_ = option; - } - int match_limit() const { return match_limit_; } - void set_match_limit(int match_limit) { - match_limit_ = match_limit; - } - int stack_limit() const { return stack_limit_; } - void set_stack_limit(int stack_limit) { - stack_limit_ = stack_limit; - } - - // If the regular expression is malformed, an error message will be printed - // iff report_errors() is true. Default: true. - bool report_errors() const { return report_errors_; } - void set_report_errors(bool report_errors) { - report_errors_ = report_errors; - } - private: - PCRE::Option option_; - int match_limit_; - int stack_limit_; - bool report_errors_; -}; - - -/***** Implementation details *****/ - -// Hex/Octal/Binary? - -// Special class for parsing into objects that define a ParseFrom() method -template <typename T> -class _PCRE_MatchObject { - public: - static inline bool Parse(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - T* object = reinterpret_cast<T*>(dest); - return object->ParseFrom(str, n); - } -}; - -class PCRE::Arg { - public: - // Empty constructor so we can declare arrays of PCRE::Arg - Arg(); - - // Constructor specially designed for NULL arguments - Arg(void*); - - typedef bool (*Parser)(const char* str, size_t n, void* dest); - -// Type-specific parsers -#define MAKE_PARSER(type, name) \ - Arg(type* p) : arg_(p), parser_(name) {} \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} - - MAKE_PARSER(char, parse_char); - MAKE_PARSER(signed char, parse_schar); - MAKE_PARSER(unsigned char, parse_uchar); - MAKE_PARSER(float, parse_float); - MAKE_PARSER(double, parse_double); - MAKE_PARSER(std::string, parse_string); - MAKE_PARSER(StringPiece, parse_stringpiece); - - MAKE_PARSER(short, parse_short); - MAKE_PARSER(unsigned short, parse_ushort); - MAKE_PARSER(int, parse_int); - MAKE_PARSER(unsigned int, parse_uint); - MAKE_PARSER(long, parse_long); - MAKE_PARSER(unsigned long, parse_ulong); - MAKE_PARSER(long long, parse_longlong); - MAKE_PARSER(unsigned long long, parse_ulonglong); - -#undef MAKE_PARSER - - // Generic constructor - template <typename T> Arg(T*, Parser parser); - // Generic constructor template - template <typename T> Arg(T* p) - : arg_(p), parser_(_PCRE_MatchObject<T>::Parse) { - } - - // Parse the data - bool Parse(const char* str, size_t n) const; - - private: - void* arg_; - Parser parser_; - - static bool parse_null (const char* str, size_t n, void* dest); - static bool parse_char (const char* str, size_t n, void* dest); - static bool parse_schar (const char* str, size_t n, void* dest); - static bool parse_uchar (const char* str, size_t n, void* dest); - static bool parse_float (const char* str, size_t n, void* dest); - static bool parse_double (const char* str, size_t n, void* dest); - static bool parse_string (const char* str, size_t n, void* dest); - static bool parse_stringpiece (const char* str, size_t n, void* dest); - -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_##name(const char* str, size_t n, void* dest); \ - static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ - int radix); \ - \ - public: \ - static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ - static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ - static bool parse_##name##_cradix(const char* str, size_t n, void* dest) - - DECLARE_INTEGER_PARSER(short); - DECLARE_INTEGER_PARSER(ushort); - DECLARE_INTEGER_PARSER(int); - DECLARE_INTEGER_PARSER(uint); - DECLARE_INTEGER_PARSER(long); - DECLARE_INTEGER_PARSER(ulong); - DECLARE_INTEGER_PARSER(longlong); - DECLARE_INTEGER_PARSER(ulonglong); - -#undef DECLARE_INTEGER_PARSER - -}; - -inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { } -inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } - -inline bool PCRE::Arg::Parse(const char* str, size_t n) const { - return (*parser_)(str, n, arg_); -} - -// This part of the parser, appropriate only for ints, deals with bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline PCRE::Arg Hex(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \ - } \ - inline PCRE::Arg Octal(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \ - } \ - inline PCRE::Arg CRadix(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \ - } - -MAKE_INTEGER_PARSER(short, short); -MAKE_INTEGER_PARSER(unsigned short, ushort); -MAKE_INTEGER_PARSER(int, int); -MAKE_INTEGER_PARSER(unsigned int, uint); -MAKE_INTEGER_PARSER(long, long); -MAKE_INTEGER_PARSER(unsigned long, ulong); -MAKE_INTEGER_PARSER(long long, longlong); -MAKE_INTEGER_PARSER(unsigned long long, ulonglong); - -#undef MAKE_INTEGER_PARSER - -} // namespace re2 - -#endif // UTIL_PCRE_H_ +// Copyright 2003-2010 Google Inc. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_PCRE_H_ +#define UTIL_PCRE_H_ + +// This is a variant of PCRE's pcrecpp.h, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +// C++ interface to the pcre regular-expression library. PCRE supports +// Perl-style regular expressions (with extensions like \d, \w, \s, +// ...). +// +// ----------------------------------------------------------------------- +// REGEXP SYNTAX: +// +// This module uses the pcre library and hence supports its syntax +// for regular expressions: +// +// http://www.google.com/search?q=pcre +// +// The syntax is pretty similar to Perl's. For those not familiar +// with Perl's regular expressions, here are some examples of the most +// commonly used extensions: +// +// "hello (\\w+) world" -- \w matches a "word" character +// "version (\\d+)" -- \d matches a digit +// "hello\\s+world" -- \s matches any whitespace character +// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary +// "(?i)hello" -- (?i) turns on case-insensitive matching +// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible +// +// ----------------------------------------------------------------------- +// MATCHING INTERFACE: +// +// The "FullMatch" operation checks that supplied text matches a +// supplied pattern exactly. +// +// Example: successful match +// CHECK(PCRE::FullMatch("hello", "h.*o")); +// +// Example: unsuccessful match (requires full match): +// CHECK(!PCRE::FullMatch("hello", "e")); +// +// ----------------------------------------------------------------------- +// UTF-8 AND THE MATCHING INTERFACE: +// +// By default, pattern and text are plain text, one byte per character. +// The UTF8 flag, passed to the constructor, causes both pattern +// and string to be treated as UTF-8 text, still a byte stream but +// potentially multiple bytes per character. In practice, the text +// is likelier to be UTF-8 than the pattern, but the match returned +// may depend on the UTF8 flag, so always use it when matching +// UTF8 text. E.g., "." will match one byte normally but with UTF8 +// set may match up to three bytes of a multi-byte character. +// +// Example: +// PCRE re(utf8_pattern, PCRE::UTF8); +// CHECK(PCRE::FullMatch(utf8_string, re)); +// +// ----------------------------------------------------------------------- +// MATCHING WITH SUBSTRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched substrings. +// +// Example: extracts "ruby" into "s" and 1234 into "i" +// int i; +// std::string s; +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); +// +// Example: fails because string cannot be stored in integer +// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i)); +// +// Example: fails because there aren't enough sub-patterns: +// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s)); +// +// Example: does not try to extract any extra sub-patterns +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); +// +// Example: does not try to extract into NULL +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); +// +// Example: integer overflow causes failure +// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); +// +// ----------------------------------------------------------------------- +// PARTIAL MATCHES +// +// You can use the "PartialMatch" operation when you want the pattern +// to match any substring of the text. +// +// Example: simple search for a string: +// CHECK(PCRE::PartialMatch("hello", "ell")); +// +// Example: find first number in a string +// int number; +// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number)); +// CHECK_EQ(number, 100); +// +// ----------------------------------------------------------------------- +// PPCRE-COMPILED PCREGULAR EXPPCRESSIONS +// +// PCRE makes it easy to use any string as a regular expression, without +// requiring a separate compilation step. +// +// If speed is of the essence, you can create a pre-compiled "PCRE" +// object from the pattern and use it multiple times. If you do so, +// you can typically parse text faster than with sscanf. +// +// Example: precompile pattern for faster matching: +// PCRE pattern("h.*o"); +// while (ReadLine(&str)) { +// if (PCRE::FullMatch(str, pattern)) ...; +// } +// +// ----------------------------------------------------------------------- +// SCANNING TEXT INCPCREMENTALLY +// +// The "Consume" operation may be useful if you want to repeatedly +// match regular expressions at the front of a string and skip over +// them as they match. This requires use of the "StringPiece" type, +// which represents a sub-range of a real string. +// +// Example: read lines of the form "var = value" from a string. +// std::string contents = ...; // Fill string somehow +// StringPiece input(contents); // Wrap a StringPiece around it +// +// std::string var; +// int value; +// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { +// ...; +// } +// +// Each successful call to "Consume" will set "var/value", and also +// advance "input" so it points past the matched text. Note that if the +// regular expression matches an empty string, input will advance +// by 0 bytes. If the regular expression being used might match +// an empty string, the loop body must check for this case and either +// advance the string or break out of the loop. +// +// The "FindAndConsume" operation is similar to "Consume" but does not +// anchor your match at the beginning of the string. For example, you +// could extract all words from a string by repeatedly calling +// PCRE::FindAndConsume(&input, "(\\w+)", &word) +// +// ----------------------------------------------------------------------- +// PARSING HEX/OCTAL/C-RADIX NUMBERS +// +// By default, if you pass a pointer to a numeric value, the +// corresponding text is interpreted as a base-10 number. You can +// instead wrap the pointer with a call to one of the operators Hex(), +// Octal(), or CRadix() to interpret the text in another base. The +// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) +// prefixes, but defaults to base-10. +// +// Example: +// int a, b, c, d; +// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", +// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)); +// will leave 64 in a, b, c, and d. + +#include "util/util.h" +#include "re2/stringpiece.h" + +#ifdef USEPCRE +#include <pcre.h> +namespace re2 { +const bool UsingPCRE = true; +} // namespace re2 +#else +struct pcre; // opaque +namespace re2 { +const bool UsingPCRE = false; +} // namespace re2 +#endif + +namespace re2 { + +class PCRE_Options; + +// Interface for regular expression matching. Also corresponds to a +// pre-compiled regular expression. An "PCRE" object is safe for +// concurrent use by multiple threads. +class PCRE { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + + // Marks end of arg list. + // ONLY USE IN OPTIONAL ARG DEFAULTS. + // DO NOT PASS EXPLICITLY. + static Arg no_more_args; + + // Options are same value as those in pcre. We provide them here + // to avoid users needing to include pcre.h and also to isolate + // users from pcre should we change the underlying library. + // Only those needed by Google programs are exposed here to + // avoid collision with options employed internally by regexp.cc + // Note that some options have equivalents that can be specified in + // the regexp itself. For example, prefixing your regexp with + // "(?s)" has the same effect as the PCRE_DOTALL option. + enum Option { + None = 0x0000, + UTF8 = 0x0800, // == PCRE_UTF8 + EnabledCompileOptions = UTF8, + EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag + }; + + // We provide implicit conversions from strings so that users can + // pass in a string or a "const char*" wherever an "PCRE" is expected. + PCRE(const char* pattern); + PCRE(const char* pattern, Option option); + PCRE(const std::string& pattern); + PCRE(const std::string& pattern, Option option); + PCRE(const char *pattern, const PCRE_Options& re_option); + PCRE(const std::string& pattern, const PCRE_Options& re_option); + + ~PCRE(); + + // The string specification for this PCRE. E.g. + // PCRE re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const std::string& pattern() const { return pattern_; } + + // If PCRE could not be created properly, returns an error string. + // Else returns the empty string. + const std::string& error() const { return *error_; } + + // Whether the PCRE has hit a match limit during execution. + // Not thread safe. Intended only for testing. + // If hitting match limits is a problem, + // you should be using PCRE2 (re2/re2.h) + // instead of checking this flag. + bool HitLimit(); + void ClearHitLimit(); + + /***** The useful part: the matching interface *****/ + + // Matches "text" against "pattern". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "std::string" for "text". + // You can pass in a "const char*" or a "std::string" or a "PCRE" for "pattern". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // std::string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, size_t)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "pattern" exactly + // b. The number of matched sub-patterns is >= number of supplied pointers + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number); + struct FullMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FullMatchFunctor FullMatch; + + // Exactly like FullMatch(), except that "pattern" is allowed to match + // a substring of "text". + struct PartialMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const PartialMatchFunctor PartialMatch; + + // Like FullMatch() and PartialMatch(), except that pattern has to + // match a prefix of "text", and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true. + struct ConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const ConsumeFunctor Consume; + + // Like Consume(..), but does not anchor the match at the beginning of the + // string. That is, "pattern" need not start its match at the beginning of + // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next + // word in "s" and stores it in "word". + struct FindAndConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FindAndConsumeFunctor FindAndConsume; + + // Replace the first match of "pattern" in "str" with "rewrite". + // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(PCRE::Replace(&s, "b+", "d")); + // + // will leave "s" containing "yada dabba doo" + // + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. + static bool Replace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace(), except replaces all occurrences of the pattern in + // the string with the rewrite. Replacements are not subject to + // re-matching. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(PCRE::GlobalReplace(&s, "b+", "d")); + // + // will leave "s" containing "yada dada doo" + // + // Returns the number of replacements made. + static int GlobalReplace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. + // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + static bool Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + std::string *out); + + // Check that the given @p rewrite string is suitable for use with + // this PCRE. It checks that: + // * The PCRE has enough parenthesized subexpressions to satisfy all + // of the \N tokens in @p rewrite, and + // * The @p rewrite string doesn't have any syntax errors + // ('\' followed by anything besides [0-9] and '\'). + // Making this test will guarantee that "replace" and "extract" + // operations won't LOG(ERROR) or fail because of a bad rewrite + // string. + // @param rewrite The proposed rewrite string. + // @param error An error message is recorded here, iff we return false. + // Otherwise, it is unchanged. + // @return true, iff @p rewrite is suitable for use with the PCRE. + bool CheckRewriteString(const StringPiece& rewrite, + std::string* error) const; + + // Returns a copy of 'unquoted' with all potentially meaningful + // regexp characters backslash-escaped. The returned string, used + // as a regular expression, will exactly match the original string. + // For example, + // 1.5-2.0? + // becomes: + // 1\.5\-2\.0\? + static std::string QuoteMeta(const StringPiece& unquoted); + + /***** Generic matching interface (not so nice to use) *****/ + + // Type of match (TODO: Should be restructured as an Option) + enum Anchor { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH, // Anchor at start and end + }; + + // General matching routine. Stores the length of the match in + // "*consumed" if successful. + bool DoMatch(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const* args, int n) const; + + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. + int NumberOfCapturingGroups() const; + + private: + void Init(const char* pattern, Option option, int match_limit, + int stack_limit, bool report_errors); + + // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with + // pairs of integers for the beginning and end positions of matched + // text. The first pair corresponds to the entire matched text; + // subsequent pairs correspond, in order, to parentheses-captured + // matches. Returns the number of pairs (one more than the number of + // the last subpattern with a match) if matching was successful + // and zero if the match failed. + // I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching + // against "foo", "bar", and "baz" respectively. + // When matching PCRE("(foo)|hello") against "hello", it will return 1. + // But the values for all subpattern are filled in into "vec". + int TryMatch(const StringPiece& text, + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const; + + // Append the "rewrite" string, with backslash subsitutions from "text" + // and "vec", to string "out". + bool Rewrite(std::string *out, + const StringPiece &rewrite, + const StringPiece &text, + int *vec, + int veclen) const; + + // internal implementation for DoMatch + bool DoMatchImpl(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n, + int* vec, + int vecsize) const; + + // Compile the regexp for the specified anchoring mode + pcre* Compile(Anchor anchor); + + std::string pattern_; + Option options_; + pcre* re_full_; // For full matches + pcre* re_partial_; // For partial matches + const std::string* error_; // Error indicator (or empty string) + bool report_errors_; // Silences error logging if false + int match_limit_; // Limit on execution resources + int stack_limit_; // Limit on stack resources (bytes) + mutable int32_t hit_limit_; // Hit limit during execution (bool) + + PCRE(const PCRE&) = delete; + PCRE& operator=(const PCRE&) = delete; +}; + +// PCRE_Options allow you to set the PCRE::Options, plus any pcre +// "extra" options. The only extras are match_limit, which limits +// the CPU time of a match, and stack_limit, which limits the +// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default +// that should not cause too many problems in production code. +// If PCRE hits a limit during a match, it may return a false negative, +// but (hopefully) it won't crash. +// +// NOTE: If you are handling regular expressions specified by +// (external or internal) users, rather than hard-coded ones, +// you should be using PCRE2, which uses an alternate implementation +// that avoids these issues. See http://go/re2quick. +class PCRE_Options { + public: + // constructor + PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {} + // accessors + PCRE::Option option() const { return option_; } + void set_option(PCRE::Option option) { + option_ = option; + } + int match_limit() const { return match_limit_; } + void set_match_limit(int match_limit) { + match_limit_ = match_limit; + } + int stack_limit() const { return stack_limit_; } + void set_stack_limit(int stack_limit) { + stack_limit_ = stack_limit; + } + + // If the regular expression is malformed, an error message will be printed + // iff report_errors() is true. Default: true. + bool report_errors() const { return report_errors_; } + void set_report_errors(bool report_errors) { + report_errors_ = report_errors; + } + private: + PCRE::Option option_; + int match_limit_; + int stack_limit_; + bool report_errors_; +}; + + +/***** Implementation details *****/ + +// Hex/Octal/Binary? + +// Special class for parsing into objects that define a ParseFrom() method +template <typename T> +class _PCRE_MatchObject { + public: + static inline bool Parse(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + T* object = reinterpret_cast<T*>(dest); + return object->ParseFrom(str, n); + } +}; + +class PCRE::Arg { + public: + // Empty constructor so we can declare arrays of PCRE::Arg + Arg(); + + // Constructor specially designed for NULL arguments + Arg(void*); + + typedef bool (*Parser)(const char* str, size_t n, void* dest); + +// Type-specific parsers +#define MAKE_PARSER(type, name) \ + Arg(type* p) : arg_(p), parser_(name) {} \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} + + MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_schar); + MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(std::string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + + MAKE_PARSER(short, parse_short); + MAKE_PARSER(unsigned short, parse_ushort); + MAKE_PARSER(int, parse_int); + MAKE_PARSER(unsigned int, parse_uint); + MAKE_PARSER(long, parse_long); + MAKE_PARSER(unsigned long, parse_ulong); + MAKE_PARSER(long long, parse_longlong); + MAKE_PARSER(unsigned long long, parse_ulonglong); + +#undef MAKE_PARSER + + // Generic constructor + template <typename T> Arg(T*, Parser parser); + // Generic constructor template + template <typename T> Arg(T* p) + : arg_(p), parser_(_PCRE_MatchObject<T>::Parse) { + } + + // Parse the data + bool Parse(const char* str, size_t n) const; + + private: + void* arg_; + Parser parser_; + + static bool parse_null (const char* str, size_t n, void* dest); + static bool parse_char (const char* str, size_t n, void* dest); + static bool parse_schar (const char* str, size_t n, void* dest); + static bool parse_uchar (const char* str, size_t n, void* dest); + static bool parse_float (const char* str, size_t n, void* dest); + static bool parse_double (const char* str, size_t n, void* dest); + static bool parse_string (const char* str, size_t n, void* dest); + static bool parse_stringpiece (const char* str, size_t n, void* dest); + +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_##name(const char* str, size_t n, void* dest); \ + static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ + int radix); \ + \ + public: \ + static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ + static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ + static bool parse_##name##_cradix(const char* str, size_t n, void* dest) + + DECLARE_INTEGER_PARSER(short); + DECLARE_INTEGER_PARSER(ushort); + DECLARE_INTEGER_PARSER(int); + DECLARE_INTEGER_PARSER(uint); + DECLARE_INTEGER_PARSER(long); + DECLARE_INTEGER_PARSER(ulong); + DECLARE_INTEGER_PARSER(longlong); + DECLARE_INTEGER_PARSER(ulonglong); + +#undef DECLARE_INTEGER_PARSER + +}; + +inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { } +inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } + +inline bool PCRE::Arg::Parse(const char* str, size_t n) const { + return (*parser_)(str, n, arg_); +} + +// This part of the parser, appropriate only for ints, deals with bases +#define MAKE_INTEGER_PARSER(type, name) \ + inline PCRE::Arg Hex(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \ + } \ + inline PCRE::Arg Octal(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \ + } \ + inline PCRE::Arg CRadix(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \ + } + +MAKE_INTEGER_PARSER(short, short); +MAKE_INTEGER_PARSER(unsigned short, ushort); +MAKE_INTEGER_PARSER(int, int); +MAKE_INTEGER_PARSER(unsigned int, uint); +MAKE_INTEGER_PARSER(long, long); +MAKE_INTEGER_PARSER(unsigned long, ulong); +MAKE_INTEGER_PARSER(long long, longlong); +MAKE_INTEGER_PARSER(unsigned long long, ulonglong); + +#undef MAKE_INTEGER_PARSER + +} // namespace re2 + +#endif // UTIL_PCRE_H_ diff --git a/contrib/libs/re2/util/strutil.cc b/contrib/libs/re2/util/strutil.cc index f9af3a442c..fb7e6b1b0c 100644 --- a/contrib/libs/re2/util/strutil.cc +++ b/contrib/libs/re2/util/strutil.cc @@ -65,34 +65,34 @@ static size_t CEscapeString(const char* src, size_t src_len, // Copies 'src' to result, escaping dangerous characters using // C-style escape sequences. 'src' and 'dest' should not overlap. // ---------------------------------------------------------------------- -std::string CEscape(const StringPiece& src) { +std::string CEscape(const StringPiece& src) { const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion char* dest = new char[dest_len]; const size_t used = CEscapeString(src.data(), src.size(), dest, dest_len); - std::string s = std::string(dest, used); + std::string s = std::string(dest, used); delete[] dest; return s; } -void PrefixSuccessor(std::string* prefix) { +void PrefixSuccessor(std::string* prefix) { // We can increment the last character in the string and be done // unless that character is 255, in which case we have to erase the // last character and increment the previous character, unless that // is 255, etc. If the string is empty or consists entirely of // 255's, we just return the empty string. - while (!prefix->empty()) { - char& c = prefix->back(); - if (c == '\xff') { // char literal avoids signed/unsigned. - prefix->pop_back(); + while (!prefix->empty()) { + char& c = prefix->back(); + if (c == '\xff') { // char literal avoids signed/unsigned. + prefix->pop_back(); } else { - ++c; - break; + ++c; + break; } } } -static void StringAppendV(std::string* dst, const char* format, va_list ap) { +static void StringAppendV(std::string* dst, const char* format, va_list ap) { // First try with a small fixed size buffer char space[1024]; @@ -137,10 +137,10 @@ static void StringAppendV(std::string* dst, const char* format, va_list ap) { } } -std::string StringPrintf(const char* format, ...) { +std::string StringPrintf(const char* format, ...) { va_list ap; va_start(ap, format); - std::string result; + std::string result; StringAppendV(&result, format, ap); va_end(ap); return result; diff --git a/contrib/libs/re2/util/strutil.h b/contrib/libs/re2/util/strutil.h index 16631b0833..a69908a0dd 100644 --- a/contrib/libs/re2/util/strutil.h +++ b/contrib/libs/re2/util/strutil.h @@ -12,10 +12,10 @@ namespace re2 { -std::string CEscape(const StringPiece& src); -void PrefixSuccessor(std::string* prefix); -std::string StringPrintf(const char* format, ...); +std::string CEscape(const StringPiece& src); +void PrefixSuccessor(std::string* prefix); +std::string StringPrintf(const char* format, ...); } // namespace re2 - + #endif // UTIL_STRUTIL_H_ diff --git a/contrib/libs/re2/util/test.cc b/contrib/libs/re2/util/test.cc index 855295f5bf..028616b359 100644 --- a/contrib/libs/re2/util/test.cc +++ b/contrib/libs/re2/util/test.cc @@ -1,34 +1,34 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include <stdio.h> -#include <string> - -#include "util/test.h" - -namespace testing { -std::string TempDir() { return "/tmp/"; } -} // namespace testing - -struct Test { - void (*fn)(void); - const char *name; -}; - -static Test tests[10000]; -static int ntests; - -void RegisterTest(void (*fn)(void), const char *name) { - tests[ntests].fn = fn; - tests[ntests++].name = name; -} - -int main(int argc, char** argv) { - for (int i = 0; i < ntests; i++) { - printf("%s\n", tests[i].name); - tests[i].fn(); - } - printf("PASS\n"); - return 0; -} +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <stdio.h> +#include <string> + +#include "util/test.h" + +namespace testing { +std::string TempDir() { return "/tmp/"; } +} // namespace testing + +struct Test { + void (*fn)(void); + const char *name; +}; + +static Test tests[10000]; +static int ntests; + +void RegisterTest(void (*fn)(void), const char *name) { + tests[ntests].fn = fn; + tests[ntests++].name = name; +} + +int main(int argc, char** argv) { + for (int i = 0; i < ntests; i++) { + printf("%s\n", tests[i].name); + tests[i].fn(); + } + printf("PASS\n"); + return 0; +} diff --git a/contrib/libs/re2/util/test.h b/contrib/libs/re2/util/test.h index 40978b8fae..54e6f8fbbb 100644 --- a/contrib/libs/re2/util/test.h +++ b/contrib/libs/re2/util/test.h @@ -1,50 +1,50 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_TEST_H_ -#define UTIL_TEST_H_ - -#include "util/util.h" -#include "util/logging.h" - -namespace testing { -std::string TempDir(); -} // namespace testing - -#define TEST(x, y) \ - void x##y(void); \ - TestRegisterer r##x##y(x##y, # x "." # y); \ - void x##y(void) - -void RegisterTest(void (*)(void), const char*); - -class TestRegisterer { - public: - TestRegisterer(void (*fn)(void), const char *s) { - RegisterTest(fn, s); - } -}; - -// fatal assertions -#define ASSERT_TRUE CHECK -#define ASSERT_FALSE(x) CHECK(!(x)) -#define ASSERT_EQ CHECK_EQ -#define ASSERT_NE CHECK_NE -#define ASSERT_LT CHECK_LT -#define ASSERT_LE CHECK_LE -#define ASSERT_GT CHECK_GT -#define ASSERT_GE CHECK_GE - -// nonfatal assertions -// TODO(rsc): Do a better job? -#define EXPECT_TRUE CHECK -#define EXPECT_FALSE(x) CHECK(!(x)) -#define EXPECT_EQ CHECK_EQ -#define EXPECT_NE CHECK_NE -#define EXPECT_LT CHECK_LT -#define EXPECT_LE CHECK_LE -#define EXPECT_GT CHECK_GT -#define EXPECT_GE CHECK_GE - -#endif // UTIL_TEST_H_ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_TEST_H_ +#define UTIL_TEST_H_ + +#include "util/util.h" +#include "util/logging.h" + +namespace testing { +std::string TempDir(); +} // namespace testing + +#define TEST(x, y) \ + void x##y(void); \ + TestRegisterer r##x##y(x##y, # x "." # y); \ + void x##y(void) + +void RegisterTest(void (*)(void), const char*); + +class TestRegisterer { + public: + TestRegisterer(void (*fn)(void), const char *s) { + RegisterTest(fn, s); + } +}; + +// fatal assertions +#define ASSERT_TRUE CHECK +#define ASSERT_FALSE(x) CHECK(!(x)) +#define ASSERT_EQ CHECK_EQ +#define ASSERT_NE CHECK_NE +#define ASSERT_LT CHECK_LT +#define ASSERT_LE CHECK_LE +#define ASSERT_GT CHECK_GT +#define ASSERT_GE CHECK_GE + +// nonfatal assertions +// TODO(rsc): Do a better job? +#define EXPECT_TRUE CHECK +#define EXPECT_FALSE(x) CHECK(!(x)) +#define EXPECT_EQ CHECK_EQ +#define EXPECT_NE CHECK_NE +#define EXPECT_LT CHECK_LT +#define EXPECT_LE CHECK_LE +#define EXPECT_GT CHECK_GT +#define EXPECT_GE CHECK_GE + +#endif // UTIL_TEST_H_ diff --git a/contrib/libs/re2/util/util.h b/contrib/libs/re2/util/util.h index 0d28a8ca74..56e46c1a33 100644 --- a/contrib/libs/re2/util/util.h +++ b/contrib/libs/re2/util/util.h @@ -5,35 +5,35 @@ #ifndef UTIL_UTIL_H_ #define UTIL_UTIL_H_ -#define arraysize(array) (sizeof(array)/sizeof((array)[0])) +#define arraysize(array) (sizeof(array)/sizeof((array)[0])) -#ifndef ATTRIBUTE_NORETURN -#if defined(__GNUC__) -#define ATTRIBUTE_NORETURN __attribute__((noreturn)) -#elif defined(_MSC_VER) -#define ATTRIBUTE_NORETURN __declspec(noreturn) -#else -#define ATTRIBUTE_NORETURN -#endif -#endif +#ifndef ATTRIBUTE_NORETURN +#if defined(__GNUC__) +#define ATTRIBUTE_NORETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define ATTRIBUTE_NORETURN __declspec(noreturn) +#else +#define ATTRIBUTE_NORETURN +#endif +#endif + +#ifndef ATTRIBUTE_UNUSED +#if defined(__GNUC__) +#define ATTRIBUTE_UNUSED __attribute__((unused)) +#else +#define ATTRIBUTE_UNUSED +#endif +#endif -#ifndef ATTRIBUTE_UNUSED -#if defined(__GNUC__) -#define ATTRIBUTE_UNUSED __attribute__((unused)) -#else -#define ATTRIBUTE_UNUSED -#endif -#endif - #ifndef FALLTHROUGH_INTENDED -#if defined(__clang__) -#define FALLTHROUGH_INTENDED [[clang::fallthrough]] -#elif defined(__GNUC__) && __GNUC__ >= 7 -#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] -#else -#define FALLTHROUGH_INTENDED do {} while (0) +#if defined(__clang__) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#else +#define FALLTHROUGH_INTENDED do {} while (0) +#endif #endif -#endif #ifndef NO_THREAD_SAFETY_ANALYSIS #define NO_THREAD_SAFETY_ANALYSIS |