about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author robot-contrib <robot-contrib@yandex-team.com> 2024-03-01 19:16:04 +0300
committer robot-contrib <robot-contrib@yandex-team.com> 2024-03-01 19:27:19 +0300
commit a388d4a51a6512436615d1006ddb75a623852ce2 (patch)
tree 5ab63ecd3296ef66198879932f4955471e9245f6
parent 867ad5d3887ef7ee3d87ada618a3a1f2f8172026 (diff)
download ydb-a388d4a51a6512436615d1006ddb75a623852ce2.tar.gz
Update contrib/libs/re2 to 2024-03-01
253c083803d290c1a0d11dafcaef8a374bf0a38e
-rw-r--r-- contrib/libs/re2/re2/parse.cc 63
-rw-r--r-- contrib/libs/re2/re2/re2.h 2
-rw-r--r-- contrib/libs/re2/re2/testing/dump.cc 20
-rw-r--r-- contrib/libs/re2/re2/testing/parse_test.cc 25
-rw-r--r-- contrib/libs/re2/re2/testing/re2_test.cc 19
-rw-r--r-- contrib/libs/re2/util/pcre.cc 2
-rw-r--r-- contrib/libs/re2/ya.make 4
7 files changed, 107 insertions, 28 deletions
diff --git a/contrib/libs/re2/re2/parse.cc b/contrib/libs/re2/re2/parse.cc
index c22f272553..a3e580f6db 100644
--- a/contrib/libs/re2/re2/parse.cc
+++ b/contrib/libs/re2/re2/parse.cc
@@ -338,6 +338,20 @@ Rune CycleFoldRune(Rune r) {
}
// Add lo-hi to the class, along with their fold-equivalent characters.
+static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) {
+ while (lo <= hi) {
+ cc->AddRange(lo, lo);
+ if ('A' <= lo && lo <= 'Z') {
+ cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a');
+ }
+ if ('a' <= lo && lo <= 'z') {
+ cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A');
+ }
+ lo++;
+ }
+}
+
+// Add lo-hi to the class, along with their fold-equivalent characters.
// If lo-hi is already in the class, assume that the fold-equivalent
// chars are there too, so there's no work to do.
static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
@@ -394,17 +408,26 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
// Pushes the literal rune r onto the stack.
bool Regexp::ParseState::PushLiteral(Rune r) {
// Do case folding if needed.
- if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
- Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
- re->ccb_ = new CharClassBuilder;
- Rune r1 = r;
- do {
- if (!(flags_ & NeverNL) || r != '\n') {
- re->ccb_->AddRange(r, r);
- }
- r = CycleFoldRune(r);
- } while (r != r1);
- return PushRegexp(re);
+ if (flags_ & FoldCase) {
+ if (flags_ & Latin1 && (('A' <= r && r <= 'Z') ||
+ ('a' <= r && r <= 'z'))) {
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ AddFoldedRangeLatin1(re->ccb_, r, r);
+ return PushRegexp(re);
+ }
+ if (!(flags_ & Latin1) && CycleFoldRune(r) != r) {
+ Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+ re->ccb_ = new CharClassBuilder;
+ Rune r1 = r;
+ do {
+ if (!(flags_ & NeverNL) || r != '\n') {
+ re->ccb_->AddRange(r, r);
+ }
+ r = CycleFoldRune(r);
+ } while (r != r1);
+ return PushRegexp(re);
+ }
}
// Exclude newline if applicable.
@@ -776,7 +799,8 @@ Rune* Regexp::LeadingString(Regexp* re, int* nrune,
while (re->op() == kRegexpConcat && re->nsub() > 0)
re = re->sub()[0];
- *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
+ *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ &
+ (Regexp::FoldCase | Regexp::Latin1));
if (re->op() == kRegexpLiteral) {
*nrune = 1;
@@ -1175,7 +1199,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
if (re->op() == kRegexpCharClass) {
CharClass* cc = re->cc();
for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
- ccb.AddRange(it->lo, it->hi);
+ ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags());
} else if (re->op() == kRegexpLiteral) {
if (re->parse_flags() & Regexp::FoldCase) {
// AddFoldedRange() can terminate prematurely if the character class
@@ -1194,7 +1218,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
}
re->Decref();
}
- Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
+ Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase);
splices->emplace_back(re, sub + start, i - start);
}
@@ -1622,10 +1646,15 @@ void CharClassBuilder::AddRangeFlags(
}
// If folding case, add fold-equivalent characters too.
- if (parse_flags & Regexp::FoldCase)
- AddFoldedRange(this, lo, hi, 0);
- else
+ if (parse_flags & Regexp::FoldCase) {
+ if (parse_flags & Regexp::Latin1) {
+ AddFoldedRangeLatin1(this, lo, hi);
+ } else {
+ AddFoldedRange(this, lo, hi, 0);
+ }
+ } else {
AddRange(lo, hi);
+ }
}
// Look for a group with the given name.
diff --git a/contrib/libs/re2/re2/re2.h b/contrib/libs/re2/re2/re2.h
index cc76f382d8..672ebf9afa 100644
--- a/contrib/libs/re2/re2/re2.h
+++ b/contrib/libs/re2/re2/re2.h
@@ -1018,7 +1018,7 @@ inline RE2::Arg RE2::Octal(T* ptr) {
}
// Silence warnings about missing initializers for members of LazyRE2.
-#if !defined(__clang__) && defined(__GNUC__)
+#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif
diff --git a/contrib/libs/re2/re2/testing/dump.cc b/contrib/libs/re2/re2/testing/dump.cc
index 5cddd23346..9e3c94a696 100644
--- a/contrib/libs/re2/re2/testing/dump.cc
+++ b/contrib/libs/re2/re2/testing/dump.cc
@@ -96,17 +96,25 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) {
break;
case kRegexpLiteral: {
Rune r = re->rune();
- char buf[UTFmax+1];
- buf[runetochar(buf, &r)] = 0;
- s->append(buf);
+ if (re->parse_flags() & Regexp::Latin1) {
+ s->push_back(r);
+ } else {
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ s->append(buf);
+ }
break;
}
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
- char buf[UTFmax+1];
- buf[runetochar(buf, &r)] = 0;
- s->append(buf);
+ if (re->parse_flags() & Regexp::Latin1) {
+ s->push_back(r);
+ } else {
+ char buf[UTFmax+1];
+ buf[runetochar(buf, &r)] = 0;
+ s->append(buf);
+ }
}
break;
case kRegexpConcat:
diff --git a/contrib/libs/re2/re2/testing/parse_test.cc b/contrib/libs/re2/re2/testing/parse_test.cc
index 7684b62a49..95294d5fff 100644
--- a/contrib/libs/re2/re2/testing/parse_test.cc
+++ b/contrib/libs/re2/re2/testing/parse_test.cc
@@ -225,6 +225,29 @@ static Test tests[] = {
// Bug in Regexp::ToString() that emitted [^], which
// would (obviously) fail to parse when fed back in.
{ "[\\s\\S]", "cc{0-0x10ffff}" },
+
+ // As per https://github.com/google/re2/issues/477,
+ // there were long-standing bugs involving Latin-1.
+ // Here, we exercise it WITHOUT case folding...
+ { "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 },
+ { "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 },
+ { "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 },
+ { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 },
+ { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
+ { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
+ { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
+ { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
+ // Here, we exercise it WITH case folding...
+ // 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2
+ // should fold to 0xF1 and 0xF2, respectively.
+ { "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
+ { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
};
bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
@@ -492,7 +515,7 @@ TEST(TestToString, EquivalentParse) {
// << " t=" << t << " regexp=" << tests[i].regexp;
// Test that if we parse the new regexp we get the same structure.
- Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
+ Regexp* nre = Regexp::Parse(t, f, &status);
ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
std::string ss = nre->Dump();
std::string tt = nre->ToString();
diff --git a/contrib/libs/re2/re2/testing/re2_test.cc b/contrib/libs/re2/re2/testing/re2_test.cc
index 151525f2d6..ddf8dbf8fb 100644
--- a/contrib/libs/re2/re2/testing/re2_test.cc
+++ b/contrib/libs/re2/re2/testing/re2_test.cc
@@ -1658,4 +1658,23 @@ TEST(RE2, Issue310) {
ASSERT_EQ(m, "") << " got m='" << m << "', want ''";
}
+TEST(RE2, Issue477) {
+ // Regexp::LeadingString didn't output Latin1 into flags.
+ // In the given pattern, 0xA5 should be factored out, but
+ // shouldn't lose its Latin1-ness in the process. Because
+ // that was happening, the prefix for accel was 0xC2 0xA5
+ // instead of 0xA5. Note that the former doesn't occur in
+ // the given input and so replacements weren't occurring.
+
+ const char bytes[] = {
+ (char)0xa5, (char)0xd1, (char)0xa5, (char)0xd1,
+ (char)0x61, (char)0x63, (char)0xa5, (char)0x64,
+ };
+ std::string s(bytes, ABSL_ARRAYSIZE(bytes));
+ RE2 re("\xa5\xd1|\xa5\x64", RE2::Latin1);
+ int n = RE2::GlobalReplace(&s, re, "");
+ ASSERT_EQ(n, 3);
+ ASSERT_EQ(s, "\x61\x63");
+}
+
} // namespace re2
diff --git a/contrib/libs/re2/util/pcre.cc b/contrib/libs/re2/util/pcre.cc
index f54cb28f83..27aee3dc48 100644
--- a/contrib/libs/re2/util/pcre.cc
+++ b/contrib/libs/re2/util/pcre.cc
@@ -21,7 +21,7 @@
#include "util/pcre.h"
// Silence warnings about the wacky formatting in the operator() functions.
-#if !defined(__clang__) && defined(__GNUC__)
+#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wmisleading-indentation"
#endif
diff --git a/contrib/libs/re2/ya.make b/contrib/libs/re2/ya.make
index 7f6fae30da..da93062de9 100644
--- a/contrib/libs/re2/ya.make
+++ b/contrib/libs/re2/ya.make
@@ -9,9 +9,9 @@ LICENSE(
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-VERSION(2024-02-01)
+VERSION(2024-03-01)
-ORIGINAL_SOURCE(https://github.com/google/re2/archive/2024-02-01.tar.gz)
+ORIGINAL_SOURCE(https://github.com/google/re2/archive/2024-03-01.tar.gz)
PEERDIR(
contrib/restricted/abseil-cpp/absl/base