diff options
author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/re2/util |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/re2/util')
-rw-r--r-- | contrib/libs/re2/util/flags.h | 26 | ||||
-rw-r--r-- | contrib/libs/re2/util/logging.h | 109 | ||||
-rw-r--r-- | contrib/libs/re2/util/mix.h | 41 | ||||
-rw-r--r-- | contrib/libs/re2/util/mutex.h | 148 | ||||
-rw-r--r-- | contrib/libs/re2/util/pcre.cc | 1025 | ||||
-rw-r--r-- | contrib/libs/re2/util/pcre.h | 681 | ||||
-rw-r--r-- | contrib/libs/re2/util/rune.cc | 260 | ||||
-rw-r--r-- | contrib/libs/re2/util/strutil.cc | 149 | ||||
-rw-r--r-- | contrib/libs/re2/util/strutil.h | 21 | ||||
-rw-r--r-- | contrib/libs/re2/util/test.cc | 34 | ||||
-rw-r--r-- | contrib/libs/re2/util/test.h | 50 | ||||
-rw-r--r-- | contrib/libs/re2/util/utf.h | 44 | ||||
-rw-r--r-- | contrib/libs/re2/util/util.h | 42 |
13 files changed, 2630 insertions, 0 deletions
diff --git a/contrib/libs/re2/util/flags.h b/contrib/libs/re2/util/flags.h new file mode 100644 index 00000000000..3386b729d43 --- /dev/null +++ b/contrib/libs/re2/util/flags.h @@ -0,0 +1,26 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_FLAGS_H_ +#define UTIL_FLAGS_H_ + +// Simplified version of Google's command line flags. +// Does not support parsing the command line. +// If you want to do that, see +// https://gflags.github.io/gflags/ + +#define DEFINE_FLAG(type, name, deflt, desc) \ + namespace re2 { type FLAGS_##name = deflt; } + +#define DECLARE_FLAG(type, name) \ + namespace re2 { extern type FLAGS_##name; } + +namespace re2 { +template <typename T> +T GetFlag(const T& flag) { + return flag; +} +} // namespace re2 + +#endif // UTIL_FLAGS_H_ diff --git a/contrib/libs/re2/util/logging.h b/contrib/libs/re2/util/logging.h new file mode 100644 index 00000000000..5b2217f29ca --- /dev/null +++ b/contrib/libs/re2/util/logging.h @@ -0,0 +1,109 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_LOGGING_H_ +#define UTIL_LOGGING_H_ + +// Simplified version of Google's logging. + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <ostream> +#include <sstream> + +#include "util/util.h" + +// Debug-only checking. +#define DCHECK(condition) assert(condition) +#define DCHECK_EQ(val1, val2) assert((val1) == (val2)) +#define DCHECK_NE(val1, val2) assert((val1) != (val2)) +#define DCHECK_LE(val1, val2) assert((val1) <= (val2)) +#define DCHECK_LT(val1, val2) assert((val1) < (val2)) +#define DCHECK_GE(val1, val2) assert((val1) >= (val2)) +#define DCHECK_GT(val1, val2) assert((val1) > (val2)) + +// Always-on checking +#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x +#define CHECK_LT(x, y) CHECK((x) < (y)) +#define CHECK_GT(x, y) CHECK((x) > (y)) +#define CHECK_LE(x, y) CHECK((x) <= (y)) +#define CHECK_GE(x, y) CHECK((x) >= (y)) +#define CHECK_EQ(x, y) CHECK((x) == (y)) +#define CHECK_NE(x, y) CHECK((x) != (y)) + +#define LOG_INFO LogMessage(__FILE__, __LINE__) +#define LOG_WARNING LogMessage(__FILE__, __LINE__) +#define LOG_ERROR LogMessage(__FILE__, __LINE__) +#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__) +#define LOG_QFATAL LOG_FATAL + +// It seems that one of the Windows header files defines ERROR as 0. +#ifdef _WIN32 +#define LOG_0 LOG_INFO +#endif + +#ifdef NDEBUG +#define LOG_DFATAL LOG_ERROR +#else +#define LOG_DFATAL LOG_FATAL +#endif + +#define LOG(severity) LOG_ ## severity.stream() + +#define VLOG(x) if((x)>0){}else LOG_INFO.stream() + +class LogMessage { + public: + LogMessage(const char* file, int line) + : flushed_(false) { + stream() << file << ":" << line << ": "; + } + void Flush() { + stream() << "\n"; + std::string s = str_.str(); + size_t n = s.size(); + if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc + flushed_ = true; + } + ~LogMessage() { + if (!flushed_) { + Flush(); + } + } + std::ostream& stream() { return str_; } + + private: + bool flushed_; + std::ostringstream str_; + + LogMessage(const LogMessage&) = delete; + LogMessage& operator=(const LogMessage&) = delete; +}; + +// Silence "destructor never returns" warning for ~LogMessageFatal(). +// Since this is a header file, push and then pop to limit the scope. +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4722) +#endif + +class LogMessageFatal : public LogMessage { + public: + LogMessageFatal(const char* file, int line) + : LogMessage(file, line) {} + ATTRIBUTE_NORETURN ~LogMessageFatal() { + Flush(); + abort(); + } + private: + LogMessageFatal(const LogMessageFatal&) = delete; + LogMessageFatal& operator=(const LogMessageFatal&) = delete; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#endif // UTIL_LOGGING_H_ diff --git a/contrib/libs/re2/util/mix.h b/contrib/libs/re2/util/mix.h new file mode 100644 index 00000000000..d85c172ab0e --- /dev/null +++ b/contrib/libs/re2/util/mix.h @@ -0,0 +1,41 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_MIX_H_ +#define UTIL_MIX_H_ + +#include <stddef.h> +#include <limits> + +namespace re2 { + +// Silence "truncation of constant value" warning for kMul in 32-bit mode. +// Since this is a header file, push and then pop to limit the scope. +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4309) +#endif + +class HashMix { + public: + HashMix() : hash_(1) {} + explicit HashMix(size_t val) : hash_(val + 83) {} + void Mix(size_t val) { + static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL); + hash_ *= kMul; + hash_ = ((hash_ << 19) | + (hash_ >> (std::numeric_limits<size_t>::digits - 19))) + val; + } + size_t get() const { return hash_; } + private: + size_t hash_; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +} // namespace re2 + +#endif // UTIL_MIX_H_ diff --git a/contrib/libs/re2/util/mutex.h b/contrib/libs/re2/util/mutex.h new file mode 100644 index 00000000000..158046bb5c9 --- /dev/null +++ b/contrib/libs/re2/util/mutex.h @@ -0,0 +1,148 @@ +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_MUTEX_H_ +#define UTIL_MUTEX_H_ + +/* + * A simple mutex wrapper, supporting locks and read-write locks. + * You should assume the locks are *not* re-entrant. + */ + +#ifdef _WIN32 +// Requires Windows Vista or Windows Server 2008 at minimum. +#include <windows.h> +#if defined(WINVER) && WINVER >= 0x0600 +#define MUTEX_IS_WIN32_SRWLOCK +#endif +#else +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif +#include <unistd.h> +#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0 +#define MUTEX_IS_PTHREAD_RWLOCK +#endif +#endif + +#if defined(MUTEX_IS_WIN32_SRWLOCK) +typedef SRWLOCK MutexType; +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) +#include <pthread.h> +#include <stdlib.h> +typedef pthread_rwlock_t MutexType; +#else +#include <mutex> +typedef std::mutex MutexType; +#endif + +namespace re2 { + +class Mutex { + public: + inline Mutex(); + inline ~Mutex(); + inline void Lock(); // Block if needed until free then acquire exclusively + inline void Unlock(); // Release a lock acquired via Lock() + // Note that on systems that don't support read-write locks, these may + // be implemented as synonyms to Lock() and Unlock(). So you can use + // these for efficiency, but don't use them anyplace where being able + // to do shared reads is necessary to avoid deadlock. + inline void ReaderLock(); // Block until free or shared then acquire a share + inline void ReaderUnlock(); // Release a read share of this Mutex + inline void WriterLock() { Lock(); } // Acquire an exclusive lock + inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock() + + private: + MutexType mutex_; + + // Catch the error of writing Mutex when intending MutexLock. + Mutex(Mutex *ignored); + + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; +}; + +#if defined(MUTEX_IS_WIN32_SRWLOCK) + +Mutex::Mutex() : mutex_(SRWLOCK_INIT) { } +Mutex::~Mutex() { } +void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } +void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } +void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } +void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } + +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) + +#define SAFE_PTHREAD(fncall) \ + do { \ + if ((fncall) != 0) abort(); \ + } while (0) + +Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); } +Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); } +void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); } +void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } +void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); } +void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } + +#undef SAFE_PTHREAD + +#else + +Mutex::Mutex() { } +Mutex::~Mutex() { } +void Mutex::Lock() { mutex_.lock(); } +void Mutex::Unlock() { mutex_.unlock(); } +void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex. +void Mutex::ReaderUnlock() { Unlock(); } + +#endif + +// -------------------------------------------------------------------------- +// Some helper classes + +// MutexLock(mu) acquires mu when constructed and releases it when destroyed. +class MutexLock { + public: + explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); } + ~MutexLock() { mu_->Unlock(); } + private: + Mutex * const mu_; + + MutexLock(const MutexLock&) = delete; + MutexLock& operator=(const MutexLock&) = delete; +}; + +// ReaderMutexLock and WriterMutexLock do the same, for rwlocks +class ReaderMutexLock { + public: + explicit ReaderMutexLock(Mutex *mu) : mu_(mu) { mu_->ReaderLock(); } + ~ReaderMutexLock() { mu_->ReaderUnlock(); } + private: + Mutex * const mu_; + + ReaderMutexLock(const ReaderMutexLock&) = delete; + ReaderMutexLock& operator=(const ReaderMutexLock&) = delete; +}; + +class WriterMutexLock { + public: + explicit WriterMutexLock(Mutex *mu) : mu_(mu) { mu_->WriterLock(); } + ~WriterMutexLock() { mu_->WriterUnlock(); } + private: + Mutex * const mu_; + + WriterMutexLock(const WriterMutexLock&) = delete; + WriterMutexLock& operator=(const WriterMutexLock&) = delete; +}; + +// Catch bug where variable name is omitted, e.g. MutexLock (&mu); +#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name") +#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name") +#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name") + +} // namespace re2 + +#endif // UTIL_MUTEX_H_ diff --git a/contrib/libs/re2/util/pcre.cc b/contrib/libs/re2/util/pcre.cc new file mode 100644 index 00000000000..b68985144ff --- /dev/null +++ b/contrib/libs/re2/util/pcre.cc @@ -0,0 +1,1025 @@ +// Copyright 2003-2009 Google Inc. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is a variant of PCRE's pcrecpp.cc, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <limits> +#include <string> +#include <utility> + +#include "util/util.h" +#include "util/flags.h" +#include "util/logging.h" +#include "util/pcre.h" +#include "util/strutil.h" + +// Silence warnings about the wacky formatting in the operator() functions. +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmisleading-indentation" +#endif + +#define PCREPORT(level) LOG(level) + +// Default PCRE limits. +// Defaults chosen to allow a plausible amount of CPU and +// not exceed main thread stacks. Note that other threads +// often have smaller stacks, and therefore tightening +// regexp_stack_limit may frequently be necessary. +DEFINE_FLAG(int, regexp_stack_limit, 256 << 10, + "default PCRE stack limit (bytes)"); +DEFINE_FLAG(int, regexp_match_limit, 1000000, + "default PCRE match limit (function calls)"); + +#ifndef USEPCRE + +// Fake just enough of the PCRE API to allow this file to build. :) + +struct pcre_extra { + int flags; + int match_limit; + int match_limit_recursion; +}; + +#define PCRE_EXTRA_MATCH_LIMIT 0 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 +#define PCRE_ANCHORED 0 +#define PCRE_NOTEMPTY 0 +#define PCRE_ERROR_NOMATCH 1 +#define PCRE_ERROR_MATCHLIMIT 2 +#define PCRE_ERROR_RECURSIONLIMIT 3 +#define PCRE_INFO_CAPTURECOUNT 0 + +void pcre_free(void*) { +} + +pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { + return NULL; +} + +int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) { + return 0; +} + +int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { + return 0; +} + +#endif + +namespace re2 { + +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace + +// Approximate size of a recursive invocation of PCRE's +// internal "match()" frame. This varies depending on the +// compiler and architecture, of course, so the constant is +// just a conservative estimate. To find the exact number, +// run regexp_unittest with --regexp_stack_limit=0 under +// a debugger and look at the frames when it crashes. +// The exact frame size was 656 in production on 2008/02/03. +static const int kPCREFrameSize = 700; + +// Special name for missing C++ arguments. +PCRE::Arg PCRE::no_more_args((void*)NULL); + +const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; +const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; +const PCRE::ConsumeFunctor PCRE::Consume = { }; +const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; + +// If a regular expression has no error, its error_ field points here +static const std::string empty_string; + +void PCRE::Init(const char* pattern, Option options, int match_limit, + int stack_limit, bool report_errors) { + pattern_ = pattern; + options_ = options; + match_limit_ = match_limit; + stack_limit_ = stack_limit; + hit_limit_ = false; + error_ = &empty_string; + report_errors_ = report_errors; + re_full_ = NULL; + re_partial_ = NULL; + + if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { + error_ = new std::string("illegal regexp option"); + PCREPORT(ERROR) + << "Error compiling '" << pattern << "': illegal regexp option"; + } else { + re_partial_ = Compile(UNANCHORED); + if (re_partial_ != NULL) { + re_full_ = Compile(ANCHOR_BOTH); + } + } +} + +PCRE::PCRE(const char* pattern) { + Init(pattern, None, 0, 0, true); +} +PCRE::PCRE(const char* pattern, Option option) { + Init(pattern, option, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern) { + Init(pattern.c_str(), None, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern, Option option) { + Init(pattern.c_str(), option, 0, 0, true); +} +PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) { + Init(pattern.c_str(), re_option.option(), re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { + Init(pattern, re_option.option(), re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::~PCRE() { + if (re_full_ != NULL) pcre_free(re_full_); + if (re_partial_ != NULL) pcre_free(re_partial_); + if (error_ != &empty_string) delete error_; +} + +pcre* PCRE::Compile(Anchor anchor) { + // Special treatment for anchoring. This is needed because at + // runtime pcre only provides an option for anchoring at the + // beginning of a string. + // + // There are three types of anchoring we want: + // UNANCHORED Compile the original pattern, and use + // a pcre unanchored match. + // ANCHOR_START Compile the original pattern, and use + // a pcre anchored match. + // ANCHOR_BOTH Tack a "\z" to the end of the original pattern + // and use a pcre anchored match. + + const char* error = ""; + int eoffset; + pcre* re; + if (anchor != ANCHOR_BOTH) { + re = pcre_compile(pattern_.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } else { + // Tack a '\z' at the end of PCRE. Parenthesize it first so that + // the '\z' applies to all top-level alternatives in the regexp. + std::string wrapped = "(?:"; // A non-counting grouping operator + wrapped += pattern_; + wrapped += ")\\z"; + re = pcre_compile(wrapped.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } + if (re == NULL) { + if (error_ == &empty_string) error_ = new std::string(error); + PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; + } + return re; +} + +/***** Convenience interfaces *****/ + +bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + size_t consumed; + int vec[kVecSize] = {}; + if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::Replace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int vec[kVecSize] = {}; + int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + + std::string s; + if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) + return false; + + assert(vec[0] >= 0); + assert(vec[1] >= 0); + str->replace(vec[0], vec[1] - vec[0], s); + return true; +} + +int PCRE::GlobalReplace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int count = 0; + int vec[kVecSize] = {}; + std::string out; + size_t start = 0; + bool last_match_was_empty_string = false; + + while (start <= str->size()) { + // If the previous match was for the empty string, we shouldn't + // just match again: we'll match in the same way and get an + // infinite loop. Instead, we do the match in a special way: + // anchored -- to force another try at the same position -- + // and with a flag saying that this time, ignore empty matches. + // If this special match returns, that means there's a non-empty + // match at this position as well, and we can continue. If not, + // we do what perl does, and just advance by one. + // Notice that perl prints '@@@' for this; + // perl -le '$_ = "aa"; s/b*|aa/@/g; print' + int matches; + if (last_match_was_empty_string) { + matches = pattern.TryMatch(*str, start, ANCHOR_START, false, + vec, kVecSize); + if (matches <= 0) { + if (start < str->size()) + out.push_back((*str)[start]); + start++; + last_match_was_empty_string = false; + continue; + } + } else { + matches = pattern.TryMatch(*str, start, UNANCHORED, true, + vec, kVecSize); + if (matches <= 0) + break; + } + size_t matchstart = vec[0], matchend = vec[1]; + assert(matchstart >= start); + assert(matchend >= matchstart); + + out.append(*str, start, matchstart - start); + pattern.Rewrite(&out, rewrite, *str, vec, matches); + start = matchend; + count++; + last_match_was_empty_string = (matchstart == matchend); + } + + if (count == 0) + return 0; + + if (start < str->size()) + out.append(*str, start, str->size() - start); + using std::swap; + swap(out, *str); + return count; +} + +bool PCRE::Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + std::string *out) { + int vec[kVecSize] = {}; + int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + out->clear(); + return pattern.Rewrite(out, rewrite, text, vec, matches); +} + +std::string PCRE::QuoteMeta(const StringPiece& unquoted) { + std::string result; + result.reserve(unquoted.size() << 1); + + // Escape any ascii character not in [A-Za-z_0-9]. + // + // Note that it's legal to escape a character even if it has no + // special meaning in a regular expression -- so this function does + // that. (This also makes it identical to the perl function of the + // same name except for the null-character special case; + // see `perldoc -f quotemeta`.) + for (size_t ii = 0; ii < unquoted.size(); ++ii) { + // Note that using 'isalnum' here raises the benchmark time from + // 32ns to 58ns: + if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { // Special handling for null chars. + // Can't use "\\0" since the next character might be a digit. + result += "\\x00"; + continue; + } + result += '\\'; + } + result += unquoted[ii]; + } + + return result; +} + +/***** Actual matching and rewriting code *****/ + +bool PCRE::HitLimit() { + return hit_limit_ != 0; +} + +void PCRE::ClearHitLimit() { + hit_limit_ = 0; +} + +int PCRE::TryMatch(const StringPiece& text, + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const { + pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; + if (re == NULL) { + PCREPORT(ERROR) << "Matching against invalid re: " << *error_; + return 0; + } + + int match_limit = match_limit_; + if (match_limit <= 0) { + match_limit = GetFlag(FLAGS_regexp_match_limit); + } + + int stack_limit = stack_limit_; + if (stack_limit <= 0) { + stack_limit = GetFlag(FLAGS_regexp_stack_limit); + } + + pcre_extra extra = { 0 }; + if (match_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT; + extra.match_limit = match_limit; + } + if (stack_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; + extra.match_limit_recursion = stack_limit / kPCREFrameSize; + } + + int options = 0; + if (anchor != UNANCHORED) + options |= PCRE_ANCHORED; + if (!empty_ok) + options |= PCRE_NOTEMPTY; + + int rc = pcre_exec(re, // The regular expression object + &extra, + (text.data() == NULL) ? "" : text.data(), + static_cast<int>(text.size()), + static_cast<int>(startpos), + options, + vec, + vecsize); + + // Handle errors + if (rc == 0) { + // pcre_exec() returns 0 as a special case when the number of + // capturing subpatterns exceeds the size of the vector. + // When this happens, there is a match and the output vector + // is filled, but we miss out on the positions of the extra subpatterns. + rc = vecsize / 2; + } else if (rc < 0) { + switch (rc) { + case PCRE_ERROR_NOMATCH: + return 0; + case PCRE_ERROR_MATCHLIMIT: + // Writing to hit_limit is not safe if multiple threads + // are using the PCRE, but the flag is only intended + // for use by unit tests anyway, so we let it go. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded match limit of " << match_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + case PCRE_ERROR_RECURSIONLIMIT: + // See comment about hit_limit above. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + default: + // There are other return codes from pcre.h : + // PCRE_ERROR_NULL (-2) + // PCRE_ERROR_BADOPTION (-3) + // PCRE_ERROR_BADMAGIC (-4) + // PCRE_ERROR_UNKNOWN_NODE (-5) + // PCRE_ERROR_NOMEMORY (-6) + // PCRE_ERROR_NOSUBSTRING (-7) + // ... + PCREPORT(ERROR) << "Unexpected return code: " << rc + << " when matching '" << pattern_ << "'" + << ", re=" << re + << ", text=" << text + << ", vec=" << vec + << ", vecsize=" << vecsize; + return 0; + } + } + + return rc; +} + +bool PCRE::DoMatchImpl(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const* args, + int n, + int* vec, + int vecsize) const { + assert((1 + n) * 3 <= vecsize); // results + PCRE workspace + if (NumberOfCapturingGroups() < n) { + // RE has fewer capturing groups than number of Arg pointers passed in. + return false; + } + + int matches = TryMatch(text, 0, anchor, true, vec, vecsize); + assert(matches >= 0); // TryMatch never returns negatives + if (matches == 0) + return false; + + *consumed = vec[1]; + + if (n == 0 || args == NULL) { + // We are not interested in results + return true; + } + + // If we got here, we must have matched the whole pattern. + // We do not need (can not do) any more checks on the value of 'matches' here + // -- see the comment for TryMatch. + for (int i = 0; i < n; i++) { + const int start = vec[2*(i+1)]; + const int limit = vec[2*(i+1)+1]; + + // Avoid invoking undefined behavior when text.data() happens + // to be null and start happens to be -1, the latter being the + // case for an unmatched subexpression. Even if text.data() is + // not null, pointing one byte before was a longstanding bug. + const char* addr = NULL; + if (start != -1) { + addr = text.data() + start; + } + + if (!args[i]->Parse(addr, limit-start)) { + // TODO: Should we indicate what the error was? + return false; + } + } + + return true; +} + +bool PCRE::DoMatch(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n) const { + assert(n >= 0); + const int vecsize = (1 + n) * 3; // results + PCRE workspace + // (as for kVecSize) + int* vec = new int[vecsize]; + bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); + delete[] vec; + return b; +} + +bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite, + const StringPiece &text, int *vec, int veclen) const { + int number_of_capturing_groups = NumberOfCapturingGroups(); + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c == '\\') { + c = *++s; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (n <= number_of_capturing_groups) { + // unmatched optional capturing group. treat + // its value as empty string; i.e., nothing to append. + } else { + PCREPORT(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + return false; + } + } + int start = vec[2 * n]; + if (start >= 0) + out->append(text.data() + start, vec[2 * n + 1] - start); + } else if (c == '\\') { + out->push_back('\\'); + } else { + PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } else { + out->push_back(c); + } + } + return true; +} + +bool PCRE::CheckRewriteString(const StringPiece& rewrite, + std::string* error) const { + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c != '\\') { + continue; + } + if (++s == end) { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') { + continue; + } + if (!isdigit(c)) { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) { + max_token = n; + } + } + + if (max_token > NumberOfCapturingGroups()) { + *error = StringPrintf( + "Rewrite schema requests %d matches, but the regexp only has %d " + "parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); + return false; + } + return true; +} + + +// Return the number of capturing subpatterns, or -1 if the +// regexp wasn't valid on construction. +int PCRE::NumberOfCapturingGroups() const { + if (re_partial_ == NULL) return -1; + + int result; + int rc = pcre_fullinfo(re_partial_, // The regular expression object + NULL, // We did not study the pattern + PCRE_INFO_CAPTURECOUNT, + &result); + if (rc != 0) { + PCREPORT(ERROR) << "Unexpected return code: " << rc; + return -1; + } + return result; +} + + +/***** Parsers for various types *****/ + +bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) { + // We fail if somebody asked us to store into a non-NULL void* pointer + return (dest == NULL); +} + +bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast<std::string*>(dest)->assign(str, n); + return true; +} + +bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n); + return true; +} + +bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<char*>(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<signed char*>(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned char*>(dest)) = str[0]; + return true; +} + +// Largest number spec that we are willing to parse +static const int kMaxNumberLength = 32; + +// PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 +// PCREQUIPCRES "n > 0" +// Copies "str" into "buf" and null-terminates if necessary. +// Returns one of: +// a. "str" if no termination is needed +// b. "buf" if the string was copied and null-terminated +// c. "" if the input was invalid and has no hope of being parsed +static const char* TerminateNumber(char* buf, const char* str, size_t n) { + if ((n > 0) && isspace(*str)) { + // We are less forgiving than the strtoxxx() routines and do not + // allow leading spaces. + return ""; + } + + // See if the character right after the input text may potentially + // look like a digit. + if (isdigit(str[n]) || + ((str[n] >= 'a') && (str[n] <= 'f')) || + ((str[n] >= 'A') && (str[n] <= 'F'))) { + if (n > kMaxNumberLength) return ""; // Input too big to be a valid number + memcpy(buf, str, n); + buf[n] = '\0'; + return buf; + } else { + // We can parse right out of the supplied string, so return it. + return str; + } +} + +bool PCRE::Arg::parse_long_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long r = strtol(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + + char* end; + errno = 0; + unsigned long r = strtoul(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_short_radix(const char* str, + size_t n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<short*>(dest)) = (short)r; + return true; +} + +bool PCRE::Arg::parse_ushort_radix(const char* str, + size_t n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r; + return true; +} + +bool PCRE::Arg::parse_int_radix(const char* str, + size_t n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<int*>(dest)) = (int)r; + return true; +} + +bool PCRE::Arg::parse_uint_radix(const char* str, + size_t n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r; + return true; +} + +bool PCRE::Arg::parse_longlong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long long r = strtoll(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<long long*>(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulonglong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoull() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + char* end; + errno = 0; + unsigned long long r = strtoull(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast<unsigned long long*>(dest)) = r; + return true; +} + +static bool parse_double_float(const char* str, size_t n, bool isfloat, + void* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength]; + if (n >= kMaxLength) return false; + memcpy(buf, str, n); + buf[n] = '\0'; + char* end; + errno = 0; + double r; + if (isfloat) { + r = strtof(buf, &end); + } else { + r = strtod(buf, &end); + } + if (end != buf + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + if (isfloat) { + *(reinterpret_cast<float*>(dest)) = (float)r; + } else { + *(reinterpret_cast<double*>(dest)) = r; + } + return true; +} + +bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, false, dest); +} + +bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, true, dest); +} + +#define DEFINE_INTEGER_PARSER(name) \ + bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 0); \ + } + +DEFINE_INTEGER_PARSER(short); +DEFINE_INTEGER_PARSER(ushort); +DEFINE_INTEGER_PARSER(int); +DEFINE_INTEGER_PARSER(uint); +DEFINE_INTEGER_PARSER(long); +DEFINE_INTEGER_PARSER(ulong); +DEFINE_INTEGER_PARSER(longlong); +DEFINE_INTEGER_PARSER(ulonglong); + +#undef DEFINE_INTEGER_PARSER + +} // namespace re2 diff --git a/contrib/libs/re2/util/pcre.h b/contrib/libs/re2/util/pcre.h new file mode 100644 index 00000000000..896b0bdf893 --- /dev/null +++ b/contrib/libs/re2/util/pcre.h @@ -0,0 +1,681 @@ +// Copyright 2003-2010 Google Inc. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_PCRE_H_ +#define UTIL_PCRE_H_ + +// This is a variant of PCRE's pcrecpp.h, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +// C++ interface to the pcre regular-expression library. PCRE supports +// Perl-style regular expressions (with extensions like \d, \w, \s, +// ...). +// +// ----------------------------------------------------------------------- +// REGEXP SYNTAX: +// +// This module uses the pcre library and hence supports its syntax +// for regular expressions: +// +// http://www.google.com/search?q=pcre +// +// The syntax is pretty similar to Perl's. For those not familiar +// with Perl's regular expressions, here are some examples of the most +// commonly used extensions: +// +// "hello (\\w+) world" -- \w matches a "word" character +// "version (\\d+)" -- \d matches a digit +// "hello\\s+world" -- \s matches any whitespace character +// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary +// "(?i)hello" -- (?i) turns on case-insensitive matching +// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible +// +// ----------------------------------------------------------------------- +// MATCHING INTERFACE: +// +// The "FullMatch" operation checks that supplied text matches a +// supplied pattern exactly. +// +// Example: successful match +// CHECK(PCRE::FullMatch("hello", "h.*o")); +// +// Example: unsuccessful match (requires full match): +// CHECK(!PCRE::FullMatch("hello", "e")); +// +// ----------------------------------------------------------------------- +// UTF-8 AND THE MATCHING INTERFACE: +// +// By default, pattern and text are plain text, one byte per character. +// The UTF8 flag, passed to the constructor, causes both pattern +// and string to be treated as UTF-8 text, still a byte stream but +// potentially multiple bytes per character. In practice, the text +// is likelier to be UTF-8 than the pattern, but the match returned +// may depend on the UTF8 flag, so always use it when matching +// UTF8 text. E.g., "." will match one byte normally but with UTF8 +// set may match up to three bytes of a multi-byte character. +// +// Example: +// PCRE re(utf8_pattern, PCRE::UTF8); +// CHECK(PCRE::FullMatch(utf8_string, re)); +// +// ----------------------------------------------------------------------- +// MATCHING WITH SUBSTRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched substrings. +// +// Example: extracts "ruby" into "s" and 1234 into "i" +// int i; +// std::string s; +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); +// +// Example: fails because string cannot be stored in integer +// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i)); +// +// Example: fails because there aren't enough sub-patterns: +// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s)); +// +// Example: does not try to extract any extra sub-patterns +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); +// +// Example: does not try to extract into NULL +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); +// +// Example: integer overflow causes failure +// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); +// +// ----------------------------------------------------------------------- +// PARTIAL MATCHES +// +// You can use the "PartialMatch" operation when you want the pattern +// to match any substring of the text. +// +// Example: simple search for a string: +// CHECK(PCRE::PartialMatch("hello", "ell")); +// +// Example: find first number in a string +// int number; +// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number)); +// CHECK_EQ(number, 100); +// +// ----------------------------------------------------------------------- +// PPCRE-COMPILED PCREGULAR EXPPCRESSIONS +// +// PCRE makes it easy to use any string as a regular expression, without +// requiring a separate compilation step. +// +// If speed is of the essence, you can create a pre-compiled "PCRE" +// object from the pattern and use it multiple times. If you do so, +// you can typically parse text faster than with sscanf. +// +// Example: precompile pattern for faster matching: +// PCRE pattern("h.*o"); +// while (ReadLine(&str)) { +// if (PCRE::FullMatch(str, pattern)) ...; +// } +// +// ----------------------------------------------------------------------- +// SCANNING TEXT INCPCREMENTALLY +// +// The "Consume" operation may be useful if you want to repeatedly +// match regular expressions at the front of a string and skip over +// them as they match. This requires use of the "StringPiece" type, +// which represents a sub-range of a real string. +// +// Example: read lines of the form "var = value" from a string. +// std::string contents = ...; // Fill string somehow +// StringPiece input(contents); // Wrap a StringPiece around it +// +// std::string var; +// int value; +// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { +// ...; +// } +// +// Each successful call to "Consume" will set "var/value", and also +// advance "input" so it points past the matched text. Note that if the +// regular expression matches an empty string, input will advance +// by 0 bytes. If the regular expression being used might match +// an empty string, the loop body must check for this case and either +// advance the string or break out of the loop. +// +// The "FindAndConsume" operation is similar to "Consume" but does not +// anchor your match at the beginning of the string. For example, you +// could extract all words from a string by repeatedly calling +// PCRE::FindAndConsume(&input, "(\\w+)", &word) +// +// ----------------------------------------------------------------------- +// PARSING HEX/OCTAL/C-RADIX NUMBERS +// +// By default, if you pass a pointer to a numeric value, the +// corresponding text is interpreted as a base-10 number. You can +// instead wrap the pointer with a call to one of the operators Hex(), +// Octal(), or CRadix() to interpret the text in another base. The +// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) +// prefixes, but defaults to base-10. +// +// Example: +// int a, b, c, d; +// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", +// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)); +// will leave 64 in a, b, c, and d. + +#include "util/util.h" +#include "re2/stringpiece.h" + +#ifdef USEPCRE +#include <pcre.h> +namespace re2 { +const bool UsingPCRE = true; +} // namespace re2 +#else +struct pcre; // opaque +namespace re2 { +const bool UsingPCRE = false; +} // namespace re2 +#endif + +namespace re2 { + +class PCRE_Options; + +// Interface for regular expression matching. Also corresponds to a +// pre-compiled regular expression. An "PCRE" object is safe for +// concurrent use by multiple threads. +class PCRE { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + + // Marks end of arg list. + // ONLY USE IN OPTIONAL ARG DEFAULTS. + // DO NOT PASS EXPLICITLY. + static Arg no_more_args; + + // Options are same value as those in pcre. We provide them here + // to avoid users needing to include pcre.h and also to isolate + // users from pcre should we change the underlying library. + // Only those needed by Google programs are exposed here to + // avoid collision with options employed internally by regexp.cc + // Note that some options have equivalents that can be specified in + // the regexp itself. For example, prefixing your regexp with + // "(?s)" has the same effect as the PCRE_DOTALL option. + enum Option { + None = 0x0000, + UTF8 = 0x0800, // == PCRE_UTF8 + EnabledCompileOptions = UTF8, + EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag + }; + + // We provide implicit conversions from strings so that users can + // pass in a string or a "const char*" wherever an "PCRE" is expected. + PCRE(const char* pattern); + PCRE(const char* pattern, Option option); + PCRE(const std::string& pattern); + PCRE(const std::string& pattern, Option option); + PCRE(const char *pattern, const PCRE_Options& re_option); + PCRE(const std::string& pattern, const PCRE_Options& re_option); + + ~PCRE(); + + // The string specification for this PCRE. E.g. + // PCRE re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const std::string& pattern() const { return pattern_; } + + // If PCRE could not be created properly, returns an error string. + // Else returns the empty string. + const std::string& error() const { return *error_; } + + // Whether the PCRE has hit a match limit during execution. + // Not thread safe. Intended only for testing. + // If hitting match limits is a problem, + // you should be using PCRE2 (re2/re2.h) + // instead of checking this flag. + bool HitLimit(); + void ClearHitLimit(); + + /***** The useful part: the matching interface *****/ + + // Matches "text" against "pattern". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "std::string" for "text". + // You can pass in a "const char*" or a "std::string" or a "PCRE" for "pattern". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // std::string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, size_t)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "pattern" exactly + // b. The number of matched sub-patterns is >= number of supplied pointers + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number); + struct FullMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FullMatchFunctor FullMatch; + + // Exactly like FullMatch(), except that "pattern" is allowed to match + // a substring of "text". + struct PartialMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const PartialMatchFunctor PartialMatch; + + // Like FullMatch() and PartialMatch(), except that pattern has to + // match a prefix of "text", and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true. + struct ConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const ConsumeFunctor Consume; + + // Like Consume(..), but does not anchor the match at the beginning of the + // string. That is, "pattern" need not start its match at the beginning of + // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next + // word in "s" and stores it in "word". + struct FindAndConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FindAndConsumeFunctor FindAndConsume; + + // Replace the first match of "pattern" in "str" with "rewrite". + // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(PCRE::Replace(&s, "b+", "d")); + // + // will leave "s" containing "yada dabba doo" + // + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. + static bool Replace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace(), except replaces all occurrences of the pattern in + // the string with the rewrite. Replacements are not subject to + // re-matching. E.g., + // + // std::string s = "yabba dabba doo"; + // CHECK(PCRE::GlobalReplace(&s, "b+", "d")); + // + // will leave "s" containing "yada dada doo" + // + // Returns the number of replacements made. + static int GlobalReplace(std::string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. + // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + static bool Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + std::string *out); + + // Check that the given @p rewrite string is suitable for use with + // this PCRE. It checks that: + // * The PCRE has enough parenthesized subexpressions to satisfy all + // of the \N tokens in @p rewrite, and + // * The @p rewrite string doesn't have any syntax errors + // ('\' followed by anything besides [0-9] and '\'). + // Making this test will guarantee that "replace" and "extract" + // operations won't LOG(ERROR) or fail because of a bad rewrite + // string. + // @param rewrite The proposed rewrite string. + // @param error An error message is recorded here, iff we return false. + // Otherwise, it is unchanged. + // @return true, iff @p rewrite is suitable for use with the PCRE. + bool CheckRewriteString(const StringPiece& rewrite, + std::string* error) const; + + // Returns a copy of 'unquoted' with all potentially meaningful + // regexp characters backslash-escaped. The returned string, used + // as a regular expression, will exactly match the original string. + // For example, + // 1.5-2.0? + // becomes: + // 1\.5\-2\.0\? + static std::string QuoteMeta(const StringPiece& unquoted); + + /***** Generic matching interface (not so nice to use) *****/ + + // Type of match (TODO: Should be restructured as an Option) + enum Anchor { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH, // Anchor at start and end + }; + + // General matching routine. Stores the length of the match in + // "*consumed" if successful. + bool DoMatch(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const* args, int n) const; + + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. + int NumberOfCapturingGroups() const; + + private: + void Init(const char* pattern, Option option, int match_limit, + int stack_limit, bool report_errors); + + // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with + // pairs of integers for the beginning and end positions of matched + // text. The first pair corresponds to the entire matched text; + // subsequent pairs correspond, in order, to parentheses-captured + // matches. Returns the number of pairs (one more than the number of + // the last subpattern with a match) if matching was successful + // and zero if the match failed. + // I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching + // against "foo", "bar", and "baz" respectively. + // When matching PCRE("(foo)|hello") against "hello", it will return 1. + // But the values for all subpattern are filled in into "vec". + int TryMatch(const StringPiece& text, + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const; + + // Append the "rewrite" string, with backslash subsitutions from "text" + // and "vec", to string "out". + bool Rewrite(std::string *out, + const StringPiece &rewrite, + const StringPiece &text, + int *vec, + int veclen) const; + + // internal implementation for DoMatch + bool DoMatchImpl(const StringPiece& text, + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n, + int* vec, + int vecsize) const; + + // Compile the regexp for the specified anchoring mode + pcre* Compile(Anchor anchor); + + std::string pattern_; + Option options_; + pcre* re_full_; // For full matches + pcre* re_partial_; // For partial matches + const std::string* error_; // Error indicator (or empty string) + bool report_errors_; // Silences error logging if false + int match_limit_; // Limit on execution resources + int stack_limit_; // Limit on stack resources (bytes) + mutable int32_t hit_limit_; // Hit limit during execution (bool) + + PCRE(const PCRE&) = delete; + PCRE& operator=(const PCRE&) = delete; +}; + +// PCRE_Options allow you to set the PCRE::Options, plus any pcre +// "extra" options. The only extras are match_limit, which limits +// the CPU time of a match, and stack_limit, which limits the +// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default +// that should not cause too many problems in production code. +// If PCRE hits a limit during a match, it may return a false negative, +// but (hopefully) it won't crash. +// +// NOTE: If you are handling regular expressions specified by +// (external or internal) users, rather than hard-coded ones, +// you should be using PCRE2, which uses an alternate implementation +// that avoids these issues. See http://go/re2quick. +class PCRE_Options { + public: + // constructor + PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {} + // accessors + PCRE::Option option() const { return option_; } + void set_option(PCRE::Option option) { + option_ = option; + } + int match_limit() const { return match_limit_; } + void set_match_limit(int match_limit) { + match_limit_ = match_limit; + } + int stack_limit() const { return stack_limit_; } + void set_stack_limit(int stack_limit) { + stack_limit_ = stack_limit; + } + + // If the regular expression is malformed, an error message will be printed + // iff report_errors() is true. Default: true. + bool report_errors() const { return report_errors_; } + void set_report_errors(bool report_errors) { + report_errors_ = report_errors; + } + private: + PCRE::Option option_; + int match_limit_; + int stack_limit_; + bool report_errors_; +}; + + +/***** Implementation details *****/ + +// Hex/Octal/Binary? + +// Special class for parsing into objects that define a ParseFrom() method +template <typename T> +class _PCRE_MatchObject { + public: + static inline bool Parse(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + T* object = reinterpret_cast<T*>(dest); + return object->ParseFrom(str, n); + } +}; + +class PCRE::Arg { + public: + // Empty constructor so we can declare arrays of PCRE::Arg + Arg(); + + // Constructor specially designed for NULL arguments + Arg(void*); + + typedef bool (*Parser)(const char* str, size_t n, void* dest); + +// Type-specific parsers +#define MAKE_PARSER(type, name) \ + Arg(type* p) : arg_(p), parser_(name) {} \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} + + MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_schar); + MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(std::string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + + MAKE_PARSER(short, parse_short); + MAKE_PARSER(unsigned short, parse_ushort); + MAKE_PARSER(int, parse_int); + MAKE_PARSER(unsigned int, parse_uint); + MAKE_PARSER(long, parse_long); + MAKE_PARSER(unsigned long, parse_ulong); + MAKE_PARSER(long long, parse_longlong); + MAKE_PARSER(unsigned long long, parse_ulonglong); + +#undef MAKE_PARSER + + // Generic constructor + template <typename T> Arg(T*, Parser parser); + // Generic constructor template + template <typename T> Arg(T* p) + : arg_(p), parser_(_PCRE_MatchObject<T>::Parse) { + } + + // Parse the data + bool Parse(const char* str, size_t n) const; + + private: + void* arg_; + Parser parser_; + + static bool parse_null (const char* str, size_t n, void* dest); + static bool parse_char (const char* str, size_t n, void* dest); + static bool parse_schar (const char* str, size_t n, void* dest); + static bool parse_uchar (const char* str, size_t n, void* dest); + static bool parse_float (const char* str, size_t n, void* dest); + static bool parse_double (const char* str, size_t n, void* dest); + static bool parse_string (const char* str, size_t n, void* dest); + static bool parse_stringpiece (const char* str, size_t n, void* dest); + +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_##name(const char* str, size_t n, void* dest); \ + static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ + int radix); \ + \ + public: \ + static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ + static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ + static bool parse_##name##_cradix(const char* str, size_t n, void* dest) + + DECLARE_INTEGER_PARSER(short); + DECLARE_INTEGER_PARSER(ushort); + DECLARE_INTEGER_PARSER(int); + DECLARE_INTEGER_PARSER(uint); + DECLARE_INTEGER_PARSER(long); + DECLARE_INTEGER_PARSER(ulong); + DECLARE_INTEGER_PARSER(longlong); + DECLARE_INTEGER_PARSER(ulonglong); + +#undef DECLARE_INTEGER_PARSER + +}; + +inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { } +inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } + +inline bool PCRE::Arg::Parse(const char* str, size_t n) const { + return (*parser_)(str, n, arg_); +} + +// This part of the parser, appropriate only for ints, deals with bases +#define MAKE_INTEGER_PARSER(type, name) \ + inline PCRE::Arg Hex(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \ + } \ + inline PCRE::Arg Octal(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \ + } \ + inline PCRE::Arg CRadix(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \ + } + +MAKE_INTEGER_PARSER(short, short); +MAKE_INTEGER_PARSER(unsigned short, ushort); +MAKE_INTEGER_PARSER(int, int); +MAKE_INTEGER_PARSER(unsigned int, uint); +MAKE_INTEGER_PARSER(long, long); +MAKE_INTEGER_PARSER(unsigned long, ulong); +MAKE_INTEGER_PARSER(long long, longlong); +MAKE_INTEGER_PARSER(unsigned long long, ulonglong); + +#undef MAKE_INTEGER_PARSER + +} // namespace re2 + +#endif // UTIL_PCRE_H_ diff --git a/contrib/libs/re2/util/rune.cc b/contrib/libs/re2/util/rune.cc new file mode 100644 index 00000000000..4f625ea380f --- /dev/null +++ b/contrib/libs/re2/util/rune.cc @@ -0,0 +1,260 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ + +#include <stdarg.h> +#include <string.h> + +#include "util/utf.h" + +namespace re2 { + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + Bad = Runeerror, +}; + +int +chartorune(Rune *rune, const char *str) +{ + int c, c1, c2, c3; + long l; + + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(unsigned char*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(unsigned char*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(unsigned char*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(unsigned char*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +runetochar(char *str, const Rune *rune) +{ + /* Runes are signed, so convert to unsigned for range check. */ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { + str[0] = static_cast<char>(c); + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | static_cast<char>(c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | static_cast<char>(c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | static_cast<char>(c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(Rune rune) +{ + char str[10]; + + return runetochar(str, &rune); +} + +int +fullrune(const char *str, int n) +{ + if (n > 0) { + int c = *(unsigned char*)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; +} + + +int +utflen(const char *s) +{ + int c; + long n; + Rune rune; + + n = 0; + for(;;) { + c = *(unsigned char*)s; + if(c < Runeself) { + if(c == 0) + return n; + s++; + } else + s += chartorune(&rune, s); + n++; + } + return 0; +} + +char* +utfrune(const char *s, Rune c) +{ + long c1; + Rune r; + int n; + + if(c < Runesync) /* not part of utf sequence */ + return strchr((char*)s, c); + + for(;;) { + c1 = *(unsigned char*)s; + if(c1 < Runeself) { /* one byte rune */ + if(c1 == 0) + return 0; + if(c1 == c) + return (char*)s; + s++; + continue; + } + n = chartorune(&r, s); + if(r == c) + return (char*)s; + s += n; + } + return 0; +} + +} // namespace re2 diff --git a/contrib/libs/re2/util/strutil.cc b/contrib/libs/re2/util/strutil.cc new file mode 100644 index 00000000000..fb7e6b1b0c7 --- /dev/null +++ b/contrib/libs/re2/util/strutil.cc @@ -0,0 +1,149 @@ +// Copyright 1999-2005 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <stdarg.h> +#include <stdio.h> + +#include "util/strutil.h" + +#ifdef _WIN32 +#define snprintf _snprintf +#define vsnprintf _vsnprintf +#endif + +namespace re2 { + +// ---------------------------------------------------------------------- +// CEscapeString() +// Copies 'src' to 'dest', escaping dangerous characters using +// C-style escape sequences. 'src' and 'dest' should not overlap. +// Returns the number of bytes written to 'dest' (not including the \0) +// or (size_t)-1 if there was insufficient space. +// ---------------------------------------------------------------------- +static size_t CEscapeString(const char* src, size_t src_len, + char* dest, size_t dest_len) { + const char* src_end = src + src_len; + size_t used = 0; + + for (; src < src_end; src++) { + if (dest_len - used < 2) // space for two-character escape + return (size_t)-1; + + unsigned char c = *src; + switch (c) { + case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break; + case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break; + case '\t': dest[used++] = '\\'; dest[used++] = 't'; break; + case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break; + case '\'': dest[used++] = '\\'; dest[used++] = '\''; break; + case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break; + default: + // Note that if we emit \xNN and the src character after that is a hex + // digit then that digit must be escaped too to prevent it being + // interpreted as part of the character code by C. + if (c < ' ' || c > '~') { + if (dest_len - used < 5) // space for four-character escape + \0 + return (size_t)-1; + snprintf(dest + used, 5, "\\%03o", c); + used += 4; + } else { + dest[used++] = c; break; + } + } + } + + if (dest_len - used < 1) // make sure that there is room for \0 + return (size_t)-1; + + dest[used] = '\0'; // doesn't count towards return value though + return used; +} + +// ---------------------------------------------------------------------- +// CEscape() +// Copies 'src' to result, escaping dangerous characters using +// C-style escape sequences. 'src' and 'dest' should not overlap. +// ---------------------------------------------------------------------- +std::string CEscape(const StringPiece& src) { + const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion + char* dest = new char[dest_len]; + const size_t used = CEscapeString(src.data(), src.size(), + dest, dest_len); + std::string s = std::string(dest, used); + delete[] dest; + return s; +} + +void PrefixSuccessor(std::string* prefix) { + // We can increment the last character in the string and be done + // unless that character is 255, in which case we have to erase the + // last character and increment the previous character, unless that + // is 255, etc. If the string is empty or consists entirely of + // 255's, we just return the empty string. + while (!prefix->empty()) { + char& c = prefix->back(); + if (c == '\xff') { // char literal avoids signed/unsigned. + prefix->pop_back(); + } else { + ++c; + break; + } + } +} + +static void StringAppendV(std::string* dst, const char* format, va_list ap) { + // First try with a small fixed size buffer + char space[1024]; + + // It's possible for methods that use a va_list to invalidate + // the data in it upon use. The fix is to make a copy + // of the structure before using it and use that copy instead. + va_list backup_ap; + va_copy(backup_ap, ap); + int result = vsnprintf(space, sizeof(space), format, backup_ap); + va_end(backup_ap); + + if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) { + // It fit + dst->append(space, result); + return; + } + + // Repeatedly increase buffer size until it fits + int length = sizeof(space); + while (true) { + if (result < 0) { + // Older behavior: just try doubling the buffer size + length *= 2; + } else { + // We need exactly "result+1" characters + length = result+1; + } + char* buf = new char[length]; + + // Restore the va_list before we use it again + va_copy(backup_ap, ap); + result = vsnprintf(buf, length, format, backup_ap); + va_end(backup_ap); + + if ((result >= 0) && (result < length)) { + // It fit + dst->append(buf, result); + delete[] buf; + return; + } + delete[] buf; + } +} + +std::string StringPrintf(const char* format, ...) { + va_list ap; + va_start(ap, format); + std::string result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} + +} // namespace re2 diff --git a/contrib/libs/re2/util/strutil.h b/contrib/libs/re2/util/strutil.h new file mode 100644 index 00000000000..a69908a0dd9 --- /dev/null +++ b/contrib/libs/re2/util/strutil.h @@ -0,0 +1,21 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_STRUTIL_H_ +#define UTIL_STRUTIL_H_ + +#include <string> + +#include "re2/stringpiece.h" +#include "util/util.h" + +namespace re2 { + +std::string CEscape(const StringPiece& src); +void PrefixSuccessor(std::string* prefix); +std::string StringPrintf(const char* format, ...); + +} // namespace re2 + +#endif // UTIL_STRUTIL_H_ diff --git a/contrib/libs/re2/util/test.cc b/contrib/libs/re2/util/test.cc new file mode 100644 index 00000000000..028616b359a --- /dev/null +++ b/contrib/libs/re2/util/test.cc @@ -0,0 +1,34 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <stdio.h> +#include <string> + +#include "util/test.h" + +namespace testing { +std::string TempDir() { return "/tmp/"; } +} // namespace testing + +struct Test { + void (*fn)(void); + const char *name; +}; + +static Test tests[10000]; +static int ntests; + +void RegisterTest(void (*fn)(void), const char *name) { + tests[ntests].fn = fn; + tests[ntests++].name = name; +} + +int main(int argc, char** argv) { + for (int i = 0; i < ntests; i++) { + printf("%s\n", tests[i].name); + tests[i].fn(); + } + printf("PASS\n"); + return 0; +} diff --git a/contrib/libs/re2/util/test.h b/contrib/libs/re2/util/test.h new file mode 100644 index 00000000000..54e6f8fbbbc --- /dev/null +++ b/contrib/libs/re2/util/test.h @@ -0,0 +1,50 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_TEST_H_ +#define UTIL_TEST_H_ + +#include "util/util.h" +#include "util/logging.h" + +namespace testing { +std::string TempDir(); +} // namespace testing + +#define TEST(x, y) \ + void x##y(void); \ + TestRegisterer r##x##y(x##y, # x "." # y); \ + void x##y(void) + +void RegisterTest(void (*)(void), const char*); + +class TestRegisterer { + public: + TestRegisterer(void (*fn)(void), const char *s) { + RegisterTest(fn, s); + } +}; + +// fatal assertions +#define ASSERT_TRUE CHECK +#define ASSERT_FALSE(x) CHECK(!(x)) +#define ASSERT_EQ CHECK_EQ +#define ASSERT_NE CHECK_NE +#define ASSERT_LT CHECK_LT +#define ASSERT_LE CHECK_LE +#define ASSERT_GT CHECK_GT +#define ASSERT_GE CHECK_GE + +// nonfatal assertions +// TODO(rsc): Do a better job? +#define EXPECT_TRUE CHECK +#define EXPECT_FALSE(x) CHECK(!(x)) +#define EXPECT_EQ CHECK_EQ +#define EXPECT_NE CHECK_NE +#define EXPECT_LT CHECK_LT +#define EXPECT_LE CHECK_LE +#define EXPECT_GT CHECK_GT +#define EXPECT_GE CHECK_GE + +#endif // UTIL_TEST_H_ diff --git a/contrib/libs/re2/util/utf.h b/contrib/libs/re2/util/utf.h new file mode 100644 index 00000000000..85b42972390 --- /dev/null +++ b/contrib/libs/re2/util/utf.h @@ -0,0 +1,44 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + * + * This file and rune.cc have been converted to compile as C++ code + * in name space re2. + */ + +#ifndef UTIL_UTF_H_ +#define UTIL_UTF_H_ + +#include <stdint.h> + +namespace re2 { + +typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ + +enum +{ + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ +}; + +int runetochar(char* s, const Rune* r); +int chartorune(Rune* r, const char* s); +int fullrune(const char* s, int n); +int utflen(const char* s); +char* utfrune(const char*, Rune); + +} // namespace re2 + +#endif // UTIL_UTF_H_ diff --git a/contrib/libs/re2/util/util.h b/contrib/libs/re2/util/util.h new file mode 100644 index 00000000000..56e46c1a338 --- /dev/null +++ b/contrib/libs/re2/util/util.h @@ -0,0 +1,42 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_UTIL_H_ +#define UTIL_UTIL_H_ + +#define arraysize(array) (sizeof(array)/sizeof((array)[0])) + +#ifndef ATTRIBUTE_NORETURN +#if defined(__GNUC__) +#define ATTRIBUTE_NORETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define ATTRIBUTE_NORETURN __declspec(noreturn) +#else +#define ATTRIBUTE_NORETURN +#endif +#endif + +#ifndef ATTRIBUTE_UNUSED +#if defined(__GNUC__) +#define ATTRIBUTE_UNUSED __attribute__((unused)) +#else +#define ATTRIBUTE_UNUSED +#endif +#endif + +#ifndef FALLTHROUGH_INTENDED +#if defined(__clang__) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#else +#define FALLTHROUGH_INTENDED do {} while (0) +#endif +#endif + +#ifndef NO_THREAD_SAFETY_ANALYSIS +#define NO_THREAD_SAFETY_ANALYSIS +#endif + +#endif // UTIL_UTIL_H_ |