Intermediate changes

author: robot-piglet <robot-piglet@yandex-team.com> 2023-12-02 01:45:21 +0300
committer: robot-piglet <robot-piglet@yandex-team.com> 2023-12-02 02:42:50 +0300
commit: 9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch)
tree: 9f88a486917d371d099cd712efd91b4c122d209d /contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp
parent: 32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff)
download: ydb-9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c.tar.gz
1 files changed, 242 insertions, 0 deletions
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp b/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp
new file mode 100644
index 0000000000..294e9f1b21
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp
@@ -0,0 +1,242 @@
+/* Copyright (c) 2021 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#include <cassert>
+#include <cstdint>
+
+#include "support/Utf8.h"
+#include "support/Unicode.h"
+
+// The below implementation is based off of https://github.com/google/cel-cpp/internal/utf8.cc,
+// which is itself based off of https://go.googlesource.com/go/+/refs/heads/master/src/unicode/utf8/utf8.go.
+// If for some reason you feel the need to copy this implementation, please retain a comment
+// referencing the two source files and giving credit, as well as maintaining any and all
+// obligations required by the BSD 3-clause license that governs this file.
+
+namespace antlrcpp {
+
+namespace {
+  // File-local constants and tables for the UTF-8 routines below; each #undef presumably guards against a same-named macro.
+#undef SELF
+  constexpr uint8_t SELF = 0x80;  // decode() returns lead bytes below this value unchanged, as one-byte sequences.
+
+#undef LOW
+  constexpr uint8_t LOW = 0x80;  // Smallest value decode() accepts for a third or fourth byte.
+#undef HIGH
+  constexpr uint8_t HIGH = 0xbf;  // Largest value decode() accepts for a third or fourth byte.
+
+#undef MASKX
+  constexpr uint8_t MASKX = 0x3f;  // Payload bits of a continuation byte.
+#undef MASK2
+  constexpr uint8_t MASK2 = 0x1f;  // Payload bits of the lead byte of a 2-byte sequence.
+#undef MASK3
+  constexpr uint8_t MASK3 = 0xf;  // Payload bits of the lead byte of a 3-byte sequence.
+#undef MASK4
+  constexpr uint8_t MASK4 = 0x7;  // Payload bits of the lead byte of a 4-byte sequence.
+
+#undef TX
+  constexpr uint8_t TX = 0x80;  // Tag bits encode() sets on every continuation byte.
+#undef T2
+  constexpr uint8_t T2 = 0xc0;  // Tag bits encode() sets on the lead byte of a 2-byte sequence.
+#undef T3
+  constexpr uint8_t T3 = 0xe0;  // Tag bits encode() sets on the lead byte of a 3-byte sequence.
+#undef T4
+  constexpr uint8_t T4 = 0xf0;  // Tag bits encode() sets on the lead byte of a 4-byte sequence.
+  // Values stored in LEADING: decode() reads the low three bits as the sequence length and the high nibble as an ACCEPT index.
+#undef XX
+  constexpr uint8_t XX = 0xf1;  // Byte that cannot start a sequence; decode() reports it as malformed.
+#undef AS
+  constexpr uint8_t AS = 0xf0;  // Marks bytes 0x00-0x7F; decode() returns those before consulting LEADING.
+#undef S1
+  constexpr uint8_t S1 = 0x02;  // 2-byte sequence, second byte checked against ACCEPT[0].
+#undef S2
+  constexpr uint8_t S2 = 0x13;  // 3-byte sequence, second byte checked against ACCEPT[1].
+#undef S3
+  constexpr uint8_t S3 = 0x03;  // 3-byte sequence, second byte checked against ACCEPT[0].
+#undef S4
+  constexpr uint8_t S4 = 0x23;  // 3-byte sequence, second byte checked against ACCEPT[2].
+#undef S5
+  constexpr uint8_t S5 = 0x34;  // 4-byte sequence, second byte checked against ACCEPT[3].
+#undef S6
+  constexpr uint8_t S6 = 0x04;  // 4-byte sequence, second byte checked against ACCEPT[0].
+#undef S7
+  constexpr uint8_t S7 = 0x44;  // 4-byte sequence, second byte checked against ACCEPT[4].
+  // LEADING maps every possible first byte (the array index) to one of the values above.
+  // NOLINTBEGIN
+  // clang-format off
+#undef LEADING
+  constexpr uint8_t LEADING[256] = {
+    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
+    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
+    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
+    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
+    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
+    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
+    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
+    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
+    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
+  };
+  // clang-format on
+  // NOLINTEND
+  // ACCEPT[i] is the inclusive {lowest, highest} range decode() allows for a sequence's second byte; S1..S7 only select entries 0-4.
+#undef ACCEPT
+  constexpr std::pair<uint8_t, uint8_t> ACCEPT[16] = {
+      {LOW, HIGH}, {0xa0, HIGH}, {LOW, 0x9f}, {0x90, HIGH},
+      {LOW, 0x8f}, {0x0, 0x0},   {0x0, 0x0},  {0x0, 0x0},
+      {0x0, 0x0},  {0x0, 0x0},   {0x0, 0x0},  {0x0, 0x0},
+      {0x0, 0x0},  {0x0, 0x0},   {0x0, 0x0},  {0x0, 0x0},
+  };
+
+}  // namespace
+
+  // Decodes the single code point encoded at the front of `input`.
+  //
+  // `input` must not be empty (enforced only by assert); bytes following the
+  // first sequence are not examined.
+  //
+  // Returns {code point, bytes consumed} with a count of 1-4. Every malformed or
+  // truncated sequence is reported as {Unicode::REPLACEMENT_CHARACTER, 1}, which
+  // lets a caller resume one byte further on.
+  std::pair<char32_t, size_t> Utf8::decode(std::string_view input) {
+    assert(!input.empty());
+    const std::pair<char32_t, size_t> malformed(Unicode::REPLACEMENT_CHARACTER, 1);
+    // Reads the byte at `offset` as an unsigned value.
+    const auto byteAt = [&input](size_t offset) {
+      return static_cast<uint8_t>(input[offset]);
+    };
+
+    const uint8_t lead = byteAt(0);
+    if (lead < SELF) {
+      return {static_cast<char32_t>(lead), 1};  // The byte is the code point.
+    }
+    const uint8_t descriptor = LEADING[lead];
+    if (descriptor == XX) {
+      return malformed;
+    }
+
+    // The low three bits of the descriptor hold the total sequence length.
+    const size_t trailing = static_cast<size_t>(descriptor & 7) - 1;
+    if (input.size() <= trailing) {
+      return malformed;  // The input ends before the sequence does.
+    }
+
+    // Only S1..S7 descriptors get this far, so `trailing` is 1, 2 or 3.
+    static constexpr uint8_t leadMasks[] = {MASK2, MASK3, MASK4};
+    const auto& second = ACCEPT[descriptor >> 4];
+    char32_t codePoint = static_cast<char32_t>(lead & leadMasks[trailing - 1]);
+    for (size_t offset = 1; offset <= trailing; ++offset) {
+      // The second byte has a lead-specific range; later bytes use LOW..HIGH.
+      const uint8_t lowest = offset == 1 ? second.first : LOW;
+      const uint8_t highest = offset == 1 ? second.second : HIGH;
+      const uint8_t next = byteAt(offset);
+      if (next < lowest || next > highest) {
+        return malformed;
+      }
+      // Each continuation byte contributes its low six bits.
+      codePoint = (codePoint << 6) | static_cast<char32_t>(next & MASKX);
+    }
+    return {codePoint, trailing + 1};
+  }
+
+  // Decodes all of `input`; returns std::nullopt at the first illegal byte sequence.
+  std::optional<std::u32string> Utf8::strictDecode(std::string_view input) {
+    std::u32string decoded;
+    decoded.reserve(input.size());  // Upper bound: one code point per byte.
+    size_t offset = 0;
+    while (offset < input.size()) {
+      const auto [codePoint, length] = Utf8::decode(input.substr(offset));
+      if (length == 1 && codePoint == Unicode::REPLACEMENT_CHARACTER) {
+        return std::nullopt;  // Utf8::decode produces this pair only for illegal input.
+      }
+      decoded.push_back(codePoint);
+      offset += length;
+    }
+    decoded.shrink_to_fit();
+    return decoded;
+  }
+
+  // Decodes all of `input`, keeping whatever Utf8::decode yields at each position.
+  std::u32string Utf8::lenientDecode(std::string_view input) {
+    std::u32string decoded;
+    decoded.reserve(input.size());  // Upper bound: one code point per byte.
+    for (size_t offset = 0; offset < input.size();) {
+      const auto [codePoint, length] = Utf8::decode(input.substr(offset));
+      decoded.push_back(codePoint);
+      offset += length;
+    }
+    decoded.shrink_to_fit();
+    return decoded;
+  }
+
+  // Appends the UTF-8 encoding of `codePoint` to `*buffer` (must be non-null) and returns
+  // `*buffer`; a code point rejected by Unicode::isValid is encoded as REPLACEMENT_CHARACTER.
+  std::string& Utf8::encode(std::string* buffer, char32_t codePoint) {
+    assert(buffer != nullptr);
+    const char32_t scalar =
+        Unicode::isValid(codePoint) ? codePoint : Unicode::REPLACEMENT_CHARACTER;
+    // Lead byte: `tag` combined with the bits of `scalar` above `shift`.
+    const auto head = [scalar](uint8_t tag, unsigned shift) {
+      return static_cast<char>(tag | static_cast<uint8_t>(scalar >> shift));
+    };
+    // Continuation byte: TX combined with the six bits of `scalar` starting at `shift`.
+    const auto tail = [scalar](unsigned shift) {
+      return static_cast<char>(TX | (static_cast<uint8_t>(scalar >> shift) & MASKX));
+    };
+    if (scalar <= 0x7f) {
+      *buffer += static_cast<char>(static_cast<uint8_t>(scalar));
+    } else if (scalar <= 0x7ff) {
+      *buffer += head(T2, 6);
+      *buffer += tail(0);
+    } else if (scalar <= 0xffff) {
+      *buffer += head(T3, 12);
+      *buffer += tail(6);
+      *buffer += tail(0);
+    } else {
+      *buffer += head(T4, 18);
+      *buffer += tail(12);
+      *buffer += tail(6);
+      *buffer += tail(0);
+    }
+    return *buffer;
+  }
+
+  // Encodes `input` as UTF-8, or returns std::nullopt if any element fails Unicode::isValid.
+  std::optional<std::string> Utf8::strictEncode(std::u32string_view input) {
+    std::string encoded;
+    encoded.reserve(4 * input.size());  // Upper bound: four bytes per code point.
+    for (const char32_t codePoint : input) {
+      if (!Unicode::isValid(codePoint)) {
+        return std::nullopt;
+      }
+      Utf8::encode(&encoded, codePoint);
+    }
+    encoded.shrink_to_fit();
+    return encoded;
+  }
+
+  // Encodes `input` as UTF-8, substituting Unicode::REPLACEMENT_CHARACTER for every
+  // element that fails Unicode::isValid.
+  std::string Utf8::lenientEncode(std::u32string_view input) {
+    std::string encoded;
+    encoded.reserve(4 * input.size());  // Upper bound: four bytes per code point.
+    for (const char32_t element : input) {
+      const char32_t safe =
+          Unicode::isValid(element) ? element : Unicode::REPLACEMENT_CHARACTER;
+      Utf8::encode(&encoded, safe);
+    }
+    encoded.shrink_to_fit();
+    return encoded;
+  }
+
+}
author	robot-piglet <robot-piglet@yandex-team.com>	2023-12-02 01:45:21 +0300
committer	robot-piglet <robot-piglet@yandex-team.com>	2023-12-02 02:42:50 +0300
commit	9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch)
tree	9f88a486917d371d099cd712efd91b4c122d209d /contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp
parent	32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff)
download	ydb-9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c.tar.gz