Update contrib/libs/re2 to 2024-03-01

253c083803d290c1a0d11dafcaef8a374bf0a38e
author: robot-contrib <robot-contrib@yandex-team.com> 2024-03-01 19:16:04 +0300
committer: robot-contrib <robot-contrib@yandex-team.com> 2024-03-01 19:27:19 +0300
commit: a388d4a51a6512436615d1006ddb75a623852ce2 (patch)
tree: 5ab63ecd3296ef66198879932f4955471e9245f6
parent: 867ad5d3887ef7ee3d87ada618a3a1f2f8172026 (diff)
download: ydb-a388d4a51a6512436615d1006ddb75a623852ce2.tar.gz
7 files changed, 107 insertions, 28 deletions
diff --git a/contrib/libs/re2/re2/parse.cc b/contrib/libs/re2/re2/parse.cc
index c22f272553..a3e580f6db 100644
--- a/contrib/libs/re2/re2/parse.cc
+++ b/contrib/libs/re2/re2/parse.cc
@@ -338,6 +338,20 @@ Rune CycleFoldRune(Rune r) {
 }
 
 // Add lo-hi to the class, along with their fold-equivalent characters.
+static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) {
+  while (lo <= hi) {
+    cc->AddRange(lo, lo);
+    if ('A' <= lo && lo <= 'Z') {
+      cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a');
+    }
+    if ('a' <= lo && lo <= 'z') {
+      cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A');
+    }
+    lo++;
+  }
+}
+
+// Add lo-hi to the class, along with their fold-equivalent characters.
 // If lo-hi is already in the class, assume that the fold-equivalent
 // chars are there too, so there's no work to do.
 static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
@@ -394,17 +408,26 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
 // Pushes the literal rune r onto the stack.
 bool Regexp::ParseState::PushLiteral(Rune r) {
   // Do case folding if needed.
-  if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
-    Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
-    re->ccb_ = new CharClassBuilder;
-    Rune r1 = r;
-    do {
-      if (!(flags_ & NeverNL) || r != '\n') {
-        re->ccb_->AddRange(r, r);
-      }
-      r = CycleFoldRune(r);
-    } while (r != r1);
-    return PushRegexp(re);
+  if (flags_ & FoldCase) {
+    if (flags_ & Latin1 && (('A' <= r && r <= 'Z') ||
+                            ('a' <= r && r <= 'z'))) {
+      Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+      re->ccb_ = new CharClassBuilder;
+      AddFoldedRangeLatin1(re->ccb_, r, r);
+      return PushRegexp(re);
+    }
+    if (!(flags_ & Latin1) && CycleFoldRune(r) != r) {
+      Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
+      re->ccb_ = new CharClassBuilder;
+      Rune r1 = r;
+      do {
+        if (!(flags_ & NeverNL) || r != '\n') {
+          re->ccb_->AddRange(r, r);
+        }
+        r = CycleFoldRune(r);
+      } while (r != r1);
+      return PushRegexp(re);
+    }
   }
 
   // Exclude newline if applicable.
@@ -776,7 +799,8 @@ Rune* Regexp::LeadingString(Regexp* re, int* nrune,
   while (re->op() == kRegexpConcat && re->nsub() > 0)
     re = re->sub()[0];
 
-  *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
+  *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ &
+                                           (Regexp::FoldCase | Regexp::Latin1));
 
   if (re->op() == kRegexpLiteral) {
     *nrune = 1;
@@ -1175,7 +1199,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
         if (re->op() == kRegexpCharClass) {
           CharClass* cc = re->cc();
           for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
-            ccb.AddRange(it->lo, it->hi);
+            ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags());
         } else if (re->op() == kRegexpLiteral) {
           if (re->parse_flags() & Regexp::FoldCase) {
             // AddFoldedRange() can terminate prematurely if the character class
@@ -1194,7 +1218,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
         }
         re->Decref();
       }
-      Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
+      Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase);
       splices->emplace_back(re, sub + start, i - start);
     }
 
@@ -1622,10 +1646,15 @@ void CharClassBuilder::AddRangeFlags(
   }
 
   // If folding case, add fold-equivalent characters too.
-  if (parse_flags & Regexp::FoldCase)
-    AddFoldedRange(this, lo, hi, 0);
-  else
+  if (parse_flags & Regexp::FoldCase) {
+    if (parse_flags & Regexp::Latin1) {
+      AddFoldedRangeLatin1(this, lo, hi);
+    } else {
+      AddFoldedRange(this, lo, hi, 0);
+    }
+  } else {
     AddRange(lo, hi);
+  }
 }
 
 // Look for a group with the given name.
diff --git a/contrib/libs/re2/re2/re2.h b/contrib/libs/re2/re2/re2.h
index cc76f382d8..672ebf9afa 100644
--- a/contrib/libs/re2/re2/re2.h
+++ b/contrib/libs/re2/re2/re2.h
@@ -1018,7 +1018,7 @@ inline RE2::Arg RE2::Octal(T* ptr) {
 }
 
 // Silence warnings about missing initializers for members of LazyRE2.
-#if !defined(__clang__) && defined(__GNUC__)
+#if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
 #endif
 
diff --git a/contrib/libs/re2/re2/testing/dump.cc b/contrib/libs/re2/re2/testing/dump.cc
index 5cddd23346..9e3c94a696 100644
--- a/contrib/libs/re2/re2/testing/dump.cc
+++ b/contrib/libs/re2/re2/testing/dump.cc
@@ -96,17 +96,25 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) {
       break;
     case kRegexpLiteral: {
       Rune r = re->rune();
-      char buf[UTFmax+1];
-      buf[runetochar(buf, &r)] = 0;
-      s->append(buf);
+      if (re->parse_flags() & Regexp::Latin1) {
+        s->push_back(r);
+      } else {
+        char buf[UTFmax+1];
+        buf[runetochar(buf, &r)] = 0;
+        s->append(buf);
+      }
       break;
     }
     case kRegexpLiteralString:
       for (int i = 0; i < re->nrunes(); i++) {
         Rune r = re->runes()[i];
-        char buf[UTFmax+1];
-        buf[runetochar(buf, &r)] = 0;
-        s->append(buf);
+        if (re->parse_flags() & Regexp::Latin1) {
+          s->push_back(r);
+        } else {
+          char buf[UTFmax+1];
+          buf[runetochar(buf, &r)] = 0;
+          s->append(buf);
+        }
       }
       break;
     case kRegexpConcat:
diff --git a/contrib/libs/re2/re2/testing/parse_test.cc b/contrib/libs/re2/re2/testing/parse_test.cc
index 7684b62a49..95294d5fff 100644
--- a/contrib/libs/re2/re2/testing/parse_test.cc
+++ b/contrib/libs/re2/re2/testing/parse_test.cc
@@ -225,6 +225,29 @@ static Test tests[] = {
   // Bug in Regexp::ToString() that emitted [^], which
   // would (obviously) fail to parse when fed back in.
   { "[\\s\\S]", "cc{0-0x10ffff}" },
+
+  // As per https://github.com/google/re2/issues/477,
+  // there were long-standing bugs involving Latin-1.
+  // Here, we exercise it WITHOUT case folding...
+  { "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 },
+  { "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 },
+  { "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 },
+  { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 },
+  { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
+  { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
+  { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
+  { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
+  // Here, we exercise it WITH case folding...
+  // 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2
+  // should fold to 0xF1 and 0xF2, respectively.
+  { "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
+  { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
 };
 
 bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
@@ -492,7 +515,7 @@ TEST(TestToString, EquivalentParse) {
       //     << " t=" << t << " regexp=" << tests[i].regexp;
 
       // Test that if we parse the new regexp we get the same structure.
-      Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
+      Regexp* nre = Regexp::Parse(t, f, &status);
       ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
       std::string ss = nre->Dump();
       std::string tt = nre->ToString();
diff --git a/contrib/libs/re2/re2/testing/re2_test.cc b/contrib/libs/re2/re2/testing/re2_test.cc
index 151525f2d6..ddf8dbf8fb 100644
--- a/contrib/libs/re2/re2/testing/re2_test.cc
+++ b/contrib/libs/re2/re2/testing/re2_test.cc
@@ -1658,4 +1658,23 @@ TEST(RE2, Issue310) {
   ASSERT_EQ(m, "") << " got m='" << m << "', want ''";
 }
 
+TEST(RE2, Issue477) {
+  // Regexp::LeadingString didn't output Latin1 into flags.
+  // In the given pattern, 0xA5 should be factored out, but
+  // shouldn't lose its Latin1-ness in the process. Because
+  // that was happening, the prefix for accel was 0xC2 0xA5
+  // instead of 0xA5. Note that the former doesn't occur in
+  // the given input and so replacements weren't occurring.
+
+  const char bytes[] = {
+      (char)0xa5, (char)0xd1, (char)0xa5, (char)0xd1,
+      (char)0x61, (char)0x63, (char)0xa5, (char)0x64,
+  };
+  std::string s(bytes, ABSL_ARRAYSIZE(bytes));
+  RE2 re("\xa5\xd1|\xa5\x64", RE2::Latin1);
+  int n = RE2::GlobalReplace(&s, re, "");
+  ASSERT_EQ(n, 3);
+  ASSERT_EQ(s, "\x61\x63");
+}
+
 }  // namespace re2
diff --git a/contrib/libs/re2/util/pcre.cc b/contrib/libs/re2/util/pcre.cc
index f54cb28f83..27aee3dc48 100644
--- a/contrib/libs/re2/util/pcre.cc
+++ b/contrib/libs/re2/util/pcre.cc
@@ -21,7 +21,7 @@
 #include "util/pcre.h"
 
 // Silence warnings about the wacky formatting in the operator() functions.
-#if !defined(__clang__) && defined(__GNUC__)
+#if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wmisleading-indentation"
 #endif
 
diff --git a/contrib/libs/re2/ya.make b/contrib/libs/re2/ya.make
index 7f6fae30da..da93062de9 100644
--- a/contrib/libs/re2/ya.make
+++ b/contrib/libs/re2/ya.make
@@ -9,9 +9,9 @@ LICENSE(
 
 LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
 
-VERSION(2024-02-01)
+VERSION(2024-03-01)
 
-ORIGINAL_SOURCE(https://github.com/google/re2/archive/2024-02-01.tar.gz)
+ORIGINAL_SOURCE(https://github.com/google/re2/archive/2024-03-01.tar.gz)
 
 PEERDIR(
     contrib/restricted/abseil-cpp/absl/base
author	robot-contrib <robot-contrib@yandex-team.com>	2024-03-01 19:16:04 +0300
committer	robot-contrib <robot-contrib@yandex-team.com>	2024-03-01 19:27:19 +0300
commit	a388d4a51a6512436615d1006ddb75a623852ce2 (patch)
tree	5ab63ecd3296ef66198879932f4955471e9245f6
parent	867ad5d3887ef7ee3d87ada618a3a1f2f8172026 (diff)
download	ydb-a388d4a51a6512436615d1006ddb75a623852ce2.tar.gz