diff options
author | robot-contrib <robot-contrib@yandex-team.com> | 2024-03-01 19:16:04 +0300 |
---|---|---|
committer | robot-contrib <robot-contrib@yandex-team.com> | 2024-03-01 19:27:19 +0300 |
commit | a388d4a51a6512436615d1006ddb75a623852ce2 (patch) | |
tree | 5ab63ecd3296ef66198879932f4955471e9245f6 | |
parent | 867ad5d3887ef7ee3d87ada618a3a1f2f8172026 (diff) | |
download | ydb-a388d4a51a6512436615d1006ddb75a623852ce2.tar.gz |
Update contrib/libs/re2 to 2024-03-01
253c083803d290c1a0d11dafcaef8a374bf0a38e
-rw-r--r-- | contrib/libs/re2/re2/parse.cc | 63 | ||||
-rw-r--r-- | contrib/libs/re2/re2/re2.h | 2 | ||||
-rw-r--r-- | contrib/libs/re2/re2/testing/dump.cc | 20 | ||||
-rw-r--r-- | contrib/libs/re2/re2/testing/parse_test.cc | 25 | ||||
-rw-r--r-- | contrib/libs/re2/re2/testing/re2_test.cc | 19 | ||||
-rw-r--r-- | contrib/libs/re2/util/pcre.cc | 2 | ||||
-rw-r--r-- | contrib/libs/re2/ya.make | 4 |
7 files changed, 107 insertions, 28 deletions
diff --git a/contrib/libs/re2/re2/parse.cc b/contrib/libs/re2/re2/parse.cc index c22f272553..a3e580f6db 100644 --- a/contrib/libs/re2/re2/parse.cc +++ b/contrib/libs/re2/re2/parse.cc @@ -338,6 +338,20 @@ Rune CycleFoldRune(Rune r) { } // Add lo-hi to the class, along with their fold-equivalent characters. +static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) { + while (lo <= hi) { + cc->AddRange(lo, lo); + if ('A' <= lo && lo <= 'Z') { + cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a'); + } + if ('a' <= lo && lo <= 'z') { + cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A'); + } + lo++; + } +} + +// Add lo-hi to the class, along with their fold-equivalent characters. // If lo-hi is already in the class, assume that the fold-equivalent // chars are there too, so there's no work to do. static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { @@ -394,17 +408,26 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { // Pushes the literal rune r onto the stack. bool Regexp::ParseState::PushLiteral(Rune r) { // Do case folding if needed. - if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { - Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); - re->ccb_ = new CharClassBuilder; - Rune r1 = r; - do { - if (!(flags_ & NeverNL) || r != '\n') { - re->ccb_->AddRange(r, r); - } - r = CycleFoldRune(r); - } while (r != r1); - return PushRegexp(re); + if (flags_ & FoldCase) { + if (flags_ & Latin1 && (('A' <= r && r <= 'Z') || + ('a' <= r && r <= 'z'))) { + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + AddFoldedRangeLatin1(re->ccb_, r, r); + return PushRegexp(re); + } + if (!(flags_ & Latin1) && CycleFoldRune(r) != r) { + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + Rune r1 = r; + do { + if (!(flags_ & NeverNL) || r != '\n') { + re->ccb_->AddRange(r, r); + } + r = CycleFoldRune(r); + } while (r != r1); + return PushRegexp(re); + } } // Exclude newline if applicable. @@ -776,7 +799,8 @@ Rune* Regexp::LeadingString(Regexp* re, int* nrune, while (re->op() == kRegexpConcat && re->nsub() > 0) re = re->sub()[0]; - *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); + *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & + (Regexp::FoldCase | Regexp::Latin1)); if (re->op() == kRegexpLiteral) { *nrune = 1; @@ -1175,7 +1199,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub, if (re->op() == kRegexpCharClass) { CharClass* cc = re->cc(); for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) - ccb.AddRange(it->lo, it->hi); + ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags()); } else if (re->op() == kRegexpLiteral) { if (re->parse_flags() & Regexp::FoldCase) { // AddFoldedRange() can terminate prematurely if the character class @@ -1194,7 +1218,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub, } re->Decref(); } - Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); + Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase); splices->emplace_back(re, sub + start, i - start); } @@ -1622,10 +1646,15 @@ void CharClassBuilder::AddRangeFlags( } // If folding case, add fold-equivalent characters too. - if (parse_flags & Regexp::FoldCase) - AddFoldedRange(this, lo, hi, 0); - else + if (parse_flags & Regexp::FoldCase) { + if (parse_flags & Regexp::Latin1) { + AddFoldedRangeLatin1(this, lo, hi); + } else { + AddFoldedRange(this, lo, hi, 0); + } + } else { AddRange(lo, hi); + } } // Look for a group with the given name. diff --git a/contrib/libs/re2/re2/re2.h b/contrib/libs/re2/re2/re2.h index cc76f382d8..672ebf9afa 100644 --- a/contrib/libs/re2/re2/re2.h +++ b/contrib/libs/re2/re2/re2.h @@ -1018,7 +1018,7 @@ inline RE2::Arg RE2::Octal(T* ptr) { } // Silence warnings about missing initializers for members of LazyRE2. -#if !defined(__clang__) && defined(__GNUC__) +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif diff --git a/contrib/libs/re2/re2/testing/dump.cc b/contrib/libs/re2/re2/testing/dump.cc index 5cddd23346..9e3c94a696 100644 --- a/contrib/libs/re2/re2/testing/dump.cc +++ b/contrib/libs/re2/re2/testing/dump.cc @@ -96,17 +96,25 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) { break; case kRegexpLiteral: { Rune r = re->rune(); - char buf[UTFmax+1]; - buf[runetochar(buf, &r)] = 0; - s->append(buf); + if (re->parse_flags() & Regexp::Latin1) { + s->push_back(r); + } else { + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } break; } case kRegexpLiteralString: for (int i = 0; i < re->nrunes(); i++) { Rune r = re->runes()[i]; - char buf[UTFmax+1]; - buf[runetochar(buf, &r)] = 0; - s->append(buf); + if (re->parse_flags() & Regexp::Latin1) { + s->push_back(r); + } else { + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } } break; case kRegexpConcat: diff --git a/contrib/libs/re2/re2/testing/parse_test.cc b/contrib/libs/re2/re2/testing/parse_test.cc index 7684b62a49..95294d5fff 100644 --- a/contrib/libs/re2/re2/testing/parse_test.cc +++ b/contrib/libs/re2/re2/testing/parse_test.cc @@ -225,6 +225,29 @@ static Test tests[] = { // Bug in Regexp::ToString() that emitted [^], which // would (obviously) fail to parse when fed back in. { "[\\s\\S]", "cc{0-0x10ffff}" }, + + // As per https://github.com/google/re2/issues/477, + // there were long-standing bugs involving Latin-1. + // Here, we exercise it WITHOUT case folding... + { "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 }, + { "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 }, + { "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 }, + { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 }, + { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 }, + { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 }, + { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 }, + { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 }, + // Here, we exercise it WITH case folding... + // 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2 + // should fold to 0xF1 and 0xF2, respectively. + { "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, }; bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { @@ -492,7 +515,7 @@ TEST(TestToString, EquivalentParse) { // << " t=" << t << " regexp=" << tests[i].regexp; // Test that if we parse the new regexp we get the same structure. - Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); + Regexp* nre = Regexp::Parse(t, f, &status); ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text(); std::string ss = nre->Dump(); std::string tt = nre->ToString(); diff --git a/contrib/libs/re2/re2/testing/re2_test.cc b/contrib/libs/re2/re2/testing/re2_test.cc index 151525f2d6..ddf8dbf8fb 100644 --- a/contrib/libs/re2/re2/testing/re2_test.cc +++ b/contrib/libs/re2/re2/testing/re2_test.cc @@ -1658,4 +1658,23 @@ TEST(RE2, Issue310) { ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; } +TEST(RE2, Issue477) { + // Regexp::LeadingString didn't output Latin1 into flags. + // In the given pattern, 0xA5 should be factored out, but + // shouldn't lose its Latin1-ness in the process. Because + // that was happening, the prefix for accel was 0xC2 0xA5 + // instead of 0xA5. Note that the former doesn't occur in + // the given input and so replacements weren't occurring. + + const char bytes[] = { + (char)0xa5, (char)0xd1, (char)0xa5, (char)0xd1, + (char)0x61, (char)0x63, (char)0xa5, (char)0x64, + }; + std::string s(bytes, ABSL_ARRAYSIZE(bytes)); + RE2 re("\xa5\xd1|\xa5\x64", RE2::Latin1); + int n = RE2::GlobalReplace(&s, re, ""); + ASSERT_EQ(n, 3); + ASSERT_EQ(s, "\x61\x63"); +} + } // namespace re2 diff --git a/contrib/libs/re2/util/pcre.cc b/contrib/libs/re2/util/pcre.cc index f54cb28f83..27aee3dc48 100644 --- a/contrib/libs/re2/util/pcre.cc +++ b/contrib/libs/re2/util/pcre.cc @@ -21,7 +21,7 @@ #include "util/pcre.h" // Silence warnings about the wacky formatting in the operator() functions. -#if !defined(__clang__) && defined(__GNUC__) +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wmisleading-indentation" #endif diff --git a/contrib/libs/re2/ya.make b/contrib/libs/re2/ya.make index 7f6fae30da..da93062de9 100644 --- a/contrib/libs/re2/ya.make +++ b/contrib/libs/re2/ya.make @@ -9,9 +9,9 @@ LICENSE( LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -VERSION(2024-02-01) +VERSION(2024-03-01) -ORIGINAL_SOURCE(https://github.com/google/re2/archive/2024-02-01.tar.gz) +ORIGINAL_SOURCE(https://github.com/google/re2/archive/2024-03-01.tar.gz) PEERDIR( contrib/restricted/abseil-cpp/absl/base |