diff options
author | robot-piglet <robot-piglet@yandex-team.com> | 2023-12-02 01:45:21 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2023-12-02 02:42:50 +0300 |
commit | 9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch) | |
tree | 9f88a486917d371d099cd712efd91b4c122d209d /contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp | |
parent | 32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff) | |
download | ydb-9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c.tar.gz |
Intermediate changes
Diffstat (limited to 'contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp')
-rw-r--r-- | contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp b/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp new file mode 100644 index 0000000000..294e9f1b21 --- /dev/null +++ b/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp @@ -0,0 +1,242 @@ +/* Copyright (c) 2021 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +#include <cassert> +#include <cstdint> + +#include "support/Utf8.h" +#include "support/Unicode.h" + +// The below implementation is based off of https://github.com/google/cel-cpp/internal/utf8.cc, +// which is itself based off of https://go.googlesource.com/go/+/refs/heads/master/src/unicode/utf8/utf8.go. +// If for some reason you feel the need to copy this implementation, please retain a comment +// referencing the two source files and giving credit, as well as maintaining any and all +// obligations required by the BSD 3-clause license that governs this file. + +namespace antlrcpp { + +namespace { + +#undef SELF + constexpr uint8_t SELF = 0x80; + +#undef LOW + constexpr uint8_t LOW = 0x80; +#undef HIGH + constexpr uint8_t HIGH = 0xbf; + +#undef MASKX + constexpr uint8_t MASKX = 0x3f; +#undef MASK2 + constexpr uint8_t MASK2 = 0x1f; +#undef MASK3 + constexpr uint8_t MASK3 = 0xf; +#undef MASK4 + constexpr uint8_t MASK4 = 0x7; + +#undef TX + constexpr uint8_t TX = 0x80; +#undef T2 + constexpr uint8_t T2 = 0xc0; +#undef T3 + constexpr uint8_t T3 = 0xe0; +#undef T4 + constexpr uint8_t T4 = 0xf0; + +#undef XX + constexpr uint8_t XX = 0xf1; +#undef AS + constexpr uint8_t AS = 0xf0; +#undef S1 + constexpr uint8_t S1 = 0x02; +#undef S2 + constexpr uint8_t S2 = 0x13; +#undef S3 + constexpr uint8_t S3 = 0x03; +#undef S4 + constexpr uint8_t S4 = 0x23; +#undef S5 + constexpr uint8_t S5 = 0x34; +#undef S6 + constexpr uint8_t S6 = 0x04; +#undef S7 + constexpr uint8_t S7 = 0x44; + + // NOLINTBEGIN + // clang-format off +#undef LEADING + constexpr uint8_t LEADING[256] = { + // 1 2 3 4 5 6 7 8 9 A B C D E F + AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F + AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F + AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F + AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F + AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F + AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F + AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F + AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F + // 1 2 3 4 5 6 7 8 9 A B C D E F + XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F + XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F + XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF + XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF + XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF + S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF + S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF + S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF + }; + // clang-format on + // NOLINTEND + +#undef ACCEPT + constexpr std::pair<uint8_t, uint8_t> ACCEPT[16] = { + {LOW, HIGH}, {0xa0, HIGH}, {LOW, 0x9f}, {0x90, HIGH}, + {LOW, 0x8f}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, + {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, + {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, + }; + +} // namespace + + std::pair<char32_t, size_t> Utf8::decode(std::string_view input) { + assert(!input.empty()); + const auto b = static_cast<uint8_t>(input.front()); + input.remove_prefix(1); + if (b < SELF) { + return {static_cast<char32_t>(b), 1}; + } + const auto leading = LEADING[b]; + if (leading == XX) { + return {Unicode::REPLACEMENT_CHARACTER, 1}; + } + auto size = static_cast<size_t>(leading & 7) - 1; + if (size > input.size()) { + return {Unicode::REPLACEMENT_CHARACTER, 1}; + } + const auto& accept = ACCEPT[leading >> 4]; + const auto b1 = static_cast<uint8_t>(input.front()); + input.remove_prefix(1); + if (b1 < accept.first || b1 > accept.second) { + return {Unicode::REPLACEMENT_CHARACTER, 1}; + } + if (size <= 1) { + return {(static_cast<char32_t>(b & MASK2) << 6) | + static_cast<char32_t>(b1 & MASKX), + 2}; + } + const auto b2 = static_cast<uint8_t>(input.front()); + input.remove_prefix(1); + if (b2 < LOW || b2 > HIGH) { + return {Unicode::REPLACEMENT_CHARACTER, 1}; + } + if (size <= 2) { + return {(static_cast<char32_t>(b & MASK3) << 12) | + (static_cast<char32_t>(b1 & MASKX) << 6) | + static_cast<char32_t>(b2 & MASKX), + 3}; + } + const auto b3 = static_cast<uint8_t>(input.front()); + input.remove_prefix(1); + if (b3 < LOW || b3 > HIGH) { + return {Unicode::REPLACEMENT_CHARACTER, 1}; + } + return {(static_cast<char32_t>(b & MASK4) << 18) | + (static_cast<char32_t>(b1 & MASKX) << 12) | + (static_cast<char32_t>(b2 & MASKX) << 6) | + static_cast<char32_t>(b3 & MASKX), + 4}; + } + + std::optional<std::u32string> Utf8::strictDecode(std::string_view input) { + std::u32string output; + char32_t codePoint; + size_t codeUnits; + output.reserve(input.size()); // Worst case is each byte is a single Unicode code point. + for (size_t index = 0; index < input.size(); index += codeUnits) { + std::tie(codePoint, codeUnits) = Utf8::decode(input.substr(index)); + if (codePoint == Unicode::REPLACEMENT_CHARACTER && codeUnits == 1) { + // Condition is only met when an illegal byte sequence is encountered. See Utf8::decode. + return std::nullopt; + } + output.push_back(codePoint); + } + output.shrink_to_fit(); + return output; + } + + std::u32string Utf8::lenientDecode(std::string_view input) { + std::u32string output; + char32_t codePoint; + size_t codeUnits; + output.reserve(input.size()); // Worst case is each byte is a single Unicode code point. + for (size_t index = 0; index < input.size(); index += codeUnits) { + std::tie(codePoint, codeUnits) = Utf8::decode(input.substr(index)); + output.push_back(codePoint); + } + output.shrink_to_fit(); + return output; + } + + std::string& Utf8::encode(std::string* buffer, char32_t codePoint) { + assert(buffer != nullptr); + if (!Unicode::isValid(codePoint)) { + codePoint = Unicode::REPLACEMENT_CHARACTER; + } + if (codePoint <= 0x7f) { + buffer->push_back(static_cast<char>(static_cast<uint8_t>(codePoint))); + } else if (codePoint <= 0x7ff) { + buffer->push_back( + static_cast<char>(T2 | static_cast<uint8_t>(codePoint >> 6))); + buffer->push_back( + static_cast<char>(TX | (static_cast<uint8_t>(codePoint) & MASKX))); + } else if (codePoint <= 0xffff) { + buffer->push_back( + static_cast<char>(T3 | static_cast<uint8_t>(codePoint >> 12))); + buffer->push_back(static_cast<char>( + TX | (static_cast<uint8_t>(codePoint >> 6) & MASKX))); + buffer->push_back( + static_cast<char>(TX | (static_cast<uint8_t>(codePoint) & MASKX))); + } else { + buffer->push_back( + static_cast<char>(T4 | static_cast<uint8_t>(codePoint >> 18))); + buffer->push_back(static_cast<char>( + TX | (static_cast<uint8_t>(codePoint >> 12) & MASKX))); + buffer->push_back(static_cast<char>( + TX | (static_cast<uint8_t>(codePoint >> 6) & MASKX))); + buffer->push_back( + static_cast<char>(TX | (static_cast<uint8_t>(codePoint) & MASKX))); + } + return *buffer; + } + + std::optional<std::string> Utf8::strictEncode(std::u32string_view input) { + std::string output; + output.reserve(input.size() * 4); // Worst case is each Unicode code point encodes to 4 bytes. + for (size_t index = 0; index < input.size(); index++) { + char32_t codePoint = input[index]; + if (!Unicode::isValid(codePoint)) { + return std::nullopt; + } + Utf8::encode(&output, codePoint); + } + output.shrink_to_fit(); + return output; + } + + std::string Utf8::lenientEncode(std::u32string_view input) { + std::string output; + output.reserve(input.size() * 4); // Worst case is each Unicode code point encodes to 4 bytes. + for (size_t index = 0; index < input.size(); index++) { + char32_t codePoint = input[index]; + if (!Unicode::isValid(codePoint)) { + codePoint = Unicode::REPLACEMENT_CHARACTER; + } + Utf8::encode(&output, codePoint); + } + output.shrink_to_fit(); + return output; + } + +} |