diff options
author | orivej <orivej@yandex-team.ru> | 2022-02-10 16:44:49 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:49 +0300 |
commit | 718c552901d703c502ccbefdfc3c9028d608b947 (patch) | |
tree | 46534a98bbefcd7b1f3faa5b52c138ab27db75b7 /contrib/libs/pcre/pcrecpp.cc | |
parent | e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (diff) | |
download | ydb-718c552901d703c502ccbefdfc3c9028d608b947.tar.gz |
Restoring authorship annotation for <orivej@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/pcre/pcrecpp.cc')
-rw-r--r-- | contrib/libs/pcre/pcrecpp.cc | 132 |
1 files changed, 66 insertions, 66 deletions
diff --git a/contrib/libs/pcre/pcrecpp.cc b/contrib/libs/pcre/pcrecpp.cc index 57daa1f59f..7a3ee6cb98 100644 --- a/contrib/libs/pcre/pcrecpp.cc +++ b/contrib/libs/pcre/pcrecpp.cc @@ -30,7 +30,7 @@ // Author: Sanjay Ghemawat #ifdef HAVE_CONFIG_H -#include "pcre_config.h" +#include "pcre_config.h" #endif #include <stdlib.h> @@ -66,8 +66,8 @@ Arg RE::no_arg((void*)NULL); // inclusive test if we ever needed it. (Note that not only the // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are // gnu-specific.) -#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) \ - && !defined(__INTEL_COMPILER) && !defined(__LCC__) +#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) \ + && !defined(__INTEL_COMPILER) && !defined(__LCC__) # define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x) # define ULP_AS_STRING_INTERNAL(x) #x # define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__) @@ -81,25 +81,25 @@ static const string empty_string; // If the user doesn't ask for any options, we just use this one static RE_Options default_options; -// Specials for the start of patterns. See comments where start_options is used -// below. (PH June 2018) -static const char *start_options[] = { - "(*UTF8)", - "(*UTF)", - "(*UCP)", - "(*NO_START_OPT)", - "(*NO_AUTO_POSSESS)", - "(*LIMIT_RECURSION=", - "(*LIMIT_MATCH=", - "(*CRLF)", - "(*LF)", - "(*CR)", - "(*BSR_UNICODE)", - "(*BSR_ANYCRLF)", - "(*ANYCRLF)", - "(*ANY)", - "" }; - +// Specials for the start of patterns. See comments where start_options is used +// below. (PH June 2018) +static const char *start_options[] = { + "(*UTF8)", + "(*UTF)", + "(*UCP)", + "(*NO_START_OPT)", + "(*NO_AUTO_POSSESS)", + "(*LIMIT_RECURSION=", + "(*LIMIT_MATCH=", + "(*CRLF)", + "(*LF)", + "(*CR)", + "(*BSR_UNICODE)", + "(*BSR_ANYCRLF)", + "(*ANYCRLF)", + "(*ANY)", + "" }; + void RE::Init(const string& pat, const RE_Options* options) { pattern_ = pat; if (options == NULL) { @@ -155,49 +155,49 @@ pcre* RE::Compile(Anchor anchor) { } else { // Tack a '\z' at the end of RE. Parenthesize it first so that // the '\z' applies to all top-level alternatives in the regexp. - - /* When this code was written (for PCRE 6.0) it was enough just to - parenthesize the entire pattern. Unfortunately, when the feature of - starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns, - this code was never updated. This bug was not noticed till 2018, long after - PCRE became obsolescent and its maintainer no longer around. Since PCRE is - frozen, I have added a hack to check for all the existing "start of - pattern" specials - knowing that no new ones will ever be added. I am not a - C++ programmer, so the code style is no doubt crude. It is also - inefficient, but is only run when the pattern starts with "(*". - PH June 2018. */ - - string wrapped = ""; - - if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') { - int kk, klen, kmat; - for (;;) { // Loop for any number of leading items - - for (kk = 0; start_options[kk][0] != 0; kk++) { - klen = strlen(start_options[kk]); - kmat = strncmp(pattern_.c_str(), start_options[kk], klen); - if (kmat >= 0) break; - } - if (kmat != 0) break; // Not found - - // If the item ended in "=" we must copy digits up to ")". - - if (start_options[kk][klen-1] == '=') { - while (isdigit(pattern_.c_str()[klen])) klen++; - if (pattern_.c_str()[klen] != ')') break; // Syntax error - klen++; - } - - // Move the item from the pattern to the start of the wrapped string. - - wrapped += pattern_.substr(0, klen); - pattern_.erase(0, klen); - } - } - - // Wrap the rest of the pattern. - - wrapped += "(?:"; // A non-counting grouping operator + + /* When this code was written (for PCRE 6.0) it was enough just to + parenthesize the entire pattern. Unfortunately, when the feature of + starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns, + this code was never updated. This bug was not noticed till 2018, long after + PCRE became obsolescent and its maintainer no longer around. Since PCRE is + frozen, I have added a hack to check for all the existing "start of + pattern" specials - knowing that no new ones will ever be added. I am not a + C++ programmer, so the code style is no doubt crude. It is also + inefficient, but is only run when the pattern starts with "(*". + PH June 2018. */ + + string wrapped = ""; + + if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') { + int kk, klen, kmat; + for (;;) { // Loop for any number of leading items + + for (kk = 0; start_options[kk][0] != 0; kk++) { + klen = strlen(start_options[kk]); + kmat = strncmp(pattern_.c_str(), start_options[kk], klen); + if (kmat >= 0) break; + } + if (kmat != 0) break; // Not found + + // If the item ended in "=" we must copy digits up to ")". + + if (start_options[kk][klen-1] == '=') { + while (isdigit(pattern_.c_str()[klen])) klen++; + if (pattern_.c_str()[klen] != ')') break; // Syntax error + klen++; + } + + // Move the item from the pattern to the start of the wrapped string. + + wrapped += pattern_.substr(0, klen); + pattern_.erase(0, klen); + } + } + + // Wrap the rest of the pattern. + + wrapped += "(?:"; // A non-counting grouping operator wrapped += pattern_; wrapped += ")\\z"; re = pcre_compile(wrapped.c_str(), pcre_options, @@ -477,7 +477,7 @@ int RE::GlobalReplace(const StringPiece& rewrite, matchend++; } // We also need to advance more than one char if we're in utf8 mode. -#ifdef SUPPORT_UTF +#ifdef SUPPORT_UTF if (options_.utf8()) { while (matchend < static_cast<int>(str->length()) && ((*str)[matchend] & 0xc0) == 0x80) |