diff options
author | robot-piglet <[email protected]> | 2023-12-02 01:45:21 +0300 |
---|---|---|
committer | robot-piglet <[email protected]> | 2023-12-02 02:42:50 +0300 |
commit | 9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch) | |
tree | 9f88a486917d371d099cd712efd91b4c122d209d /contrib/libs/antlr4_cpp_runtime/src/support | |
parent | 32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff) |
Intermediate changes
Diffstat (limited to 'contrib/libs/antlr4_cpp_runtime/src/support')
14 files changed, 1137 insertions, 0 deletions
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Any.cpp b/contrib/libs/antlr4_cpp_runtime/src/support/Any.cpp new file mode 100644 index 00000000000..a1ed50d4563 --- /dev/null +++ b/contrib/libs/antlr4_cpp_runtime/src/support/Any.cpp @@ -0,0 +1,8 @@ +/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +#include "Any.h" + +using namespace antlrcpp; diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Any.h b/contrib/libs/antlr4_cpp_runtime/src/support/Any.h new file mode 100644 index 00000000000..fa5df58946e --- /dev/null +++ b/contrib/libs/antlr4_cpp_runtime/src/support/Any.h @@ -0,0 +1,16 @@ +/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. + */ + +// A standard C++ class loosely modeled after boost::Any. + +#pragma once + +#include "antlr4-common.h" + +namespace antlrcpp { + + using Any = std::any; + +} // namespace antlrcpp diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Arrays.cpp b/contrib/libs/antlr4_cpp_runtime/src/support/Arrays.cpp new file mode 100644 index 00000000000..b3c4f94f2fa --- /dev/null +++ b/contrib/libs/antlr4_cpp_runtime/src/support/Arrays.cpp @@ -0,0 +1,43 @@ +/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. + * Use of this file is governed by the BSD 3-clause license that + * can be found in the LICENSE.txt file in the project root. 
+ */
+
+#include "tree/ParseTree.h"
+#include "Exceptions.h"
+
+#include "support/Arrays.h"
+
+using namespace antlrcpp;
+
+// Renders `list` as "[e0<separator>e1<separator>...]".
+// The separator is written between neighbouring entries only: never before
+// the first entry and never after the last one. An empty list yields "[]".
+std::string Arrays::listToString(const std::vector<std::string> &list, const std::string &separator)
+{
+  std::stringstream ss;
+  bool firstEntry = true;
+
+  ss << '[';
+  for (const auto &entry : list) {
+    // Emit the separator in front of every entry except the first.
+    if (!firstEntry) {
+      ss << separator;
+    }
+    firstEntry = false;
+    ss << entry;
+  }
+
+  ss << ']';
+  return ss.str();
+}
+
+// Specialization for vectors of parse tree pointers: every element is printed
+// with toStringTree() and the elements are joined with ", " inside brackets.
+// Elements are dereferenced unconditionally, so they must not be null.
+template <>
+std::string Arrays::toString(const std::vector<antlr4::tree::ParseTree*> &source) {
+  std::string result = "[";
+  bool firstEntry = true;
+  for (auto *value : source) {
+    // Emit the separator in front of every entry except the first.
+    if (!firstEntry) {
+      result += ", ";
+    }
+    firstEntry = false;
+    result += value->toStringTree();
+  }
+  return result + "]";
+}
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Arrays.h b/contrib/libs/antlr4_cpp_runtime/src/support/Arrays.h
new file mode 100644
index 00000000000..04b852d9860
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/Arrays.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#pragma once
+
+#include "antlr4-common.h"
+
+namespace antlrcpp {
+
+  // Static helpers for comparing and printing the contents of std::vector.
+  class ANTLR4CPP_PUBLIC Arrays {
+  public:
+
+    // Renders `list` as a bracketed string using `separator`; defined in Arrays.cpp.
+    static std::string listToString(const std::vector<std::string> &list, const std::string &separator);
+
+    // Element-wise comparison of two vectors of values using operator==.
+    template <typename T>
+    static bool equals(const std::vector<T> &a, const std::vector<T> &b) {
+      if (a.size() != b.size())
+        return false;
+
+      for (size_t i = 0; i < a.size(); ++i)
+        if (!(a[i] == b[i]))
+          return false;
+
+      return true;
+    }
+
+    // Element-wise comparison of two vectors of raw pointers. Two slots match
+    // when both are null, both hold the same address, or the pointees compare
+    // equal with operator==.
+    template <typename T>
+    static bool equals(const std::vector<T *> &a, const std::vector<T *> &b) {
+      if (a.size() != b.size())
+        return false;
+
+      for (size_t i = 0; i < a.size(); ++i) {
+        if (!a[i] && !b[i])
+          continue;
+        if (!a[i] || !b[i])
+          return false;
+        if (a[i] == b[i])
+          continue;
+
+        if (!(*a[i] == *b[i]))
+          return false;
+      }
+
+      return true;
+    }
+
+    // Same contract as the raw pointer overload, for vectors of Ref<T>.
+    template <typename T>
+    static bool equals(const std::vector<Ref<T>> &a, const std::vector<Ref<T>> &b) {
+      if (a.size() != b.size())
+        return false;
+
+      for (size_t i = 0; i < a.size(); ++i) {
+        if (!a[i] && !b[i])
+          continue;
+        if (!a[i] || !b[i])
+          return false;
+        if (a[i] == b[i])
+          continue;
+
+        if (!(*a[i] == *b[i]))
+          return false;
+      }
+
+      return true;
+    }
+
+    // Same contract as the raw pointer overload, for vectors of std::unique_ptr<T>.
+    template <typename T>
+    static bool equals(const std::vector<std::unique_ptr<T>> &a, const std::vector<std::unique_ptr<T>> &b) {
+      if (a.size() != b.size())
+        return false;
+
+      for (size_t i = 0; i < a.size(); ++i) {
+        if (!a[i] && !b[i])
+          continue;
+        if (!a[i] || !b[i])
+          return false;
+        if (a[i] == b[i])
+          continue;
+
+        if (!(*a[i] == *b[i]))
+          return false;
+      }
+
+      return true;
+    }
+
+    // Renders a vector of values as "[a, b, c]" using each element's toString().
+    template <typename T>
+    static std::string toString(const std::vector<T> &source) {
+      std::string result = "[";
+      bool firstEntry = true;
+      for (auto &value : source) {
+        // The separator goes in front of every entry except the first.
+        if (!firstEntry) {
+          result += ", ";
+        }
+        firstEntry = false;
+        result += value.toString();
+      }
+      return result + "]";
+    }
+
+    // Renders a vector of Ref<T> as "[a, b, c]". Elements are dereferenced
+    // unconditionally, so they must not be empty.
+    template <typename T>
+    static std::string toString(const std::vector<Ref<T>> &source) {
+      std::string result = "[";
+      bool firstEntry = true;
+      for (auto &value : source) {
+        if (!firstEntry) {
+          result += ", ";
+        }
+        firstEntry = false;
+        result += value->toString();
+      }
+      return result + "]";
+    }
+
+    // Renders a vector of std::unique_ptr<T> as "[a, b, c]". Elements are
+    // dereferenced unconditionally, so they must not be empty.
+    template <typename T>
+    static std::string toString(const std::vector<std::unique_ptr<T>> &source) {
+      std::string result = "[";
+      bool firstEntry = true;
+      for (auto &value : source) {
+        if (!firstEntry) {
+          result += ", ";
+        }
+        firstEntry = false;
+        result += value->toString();
+      }
+      return result + "]";
+    }
+
+    // Renders a vector of raw pointers as "[a, b, c]". Elements are
+    // dereferenced unconditionally, so they must not be null.
+    template <typename T>
+    static std::string toString(const std::vector<T *> &source) {
+      std::string result = "[";
+      bool firstEntry = true;
+      for (auto value : source) {
+        if (!firstEntry) {
+          result += ", ";
+        }
+        firstEntry = false;
+        result += value->toString();
+      }
+      return result + "]";
+    }
+
+  };
+
+  // Parse trees are printed with toStringTree(); the definition lives in Arrays.cpp.
+  template <>
+  std::string Arrays::toString(const std::vector<antlr4::tree::ParseTree *> &source);
+}
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/BitSet.h b/contrib/libs/antlr4_cpp_runtime/src/support/BitSet.h
new file mode 100644
index 00000000000..bb30364be08
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/BitSet.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#pragma once
+
+#include "antlr4-common.h"
+
+namespace antlrcpp {
+
+  // A fixed-size (2048 bit) bitset with a few convenience helpers.
+  class ANTLR4CPP_PUBLIC BitSet : public std::bitset<2048> {
+  public:
+    // Returns the index of the first set bit at or after `pos`, or
+    // INVALID_INDEX when no such bit exists.
+    size_t nextSetBit(size_t pos) const {
+      for (size_t i = pos; i < size(); i++){
+        if (test(i)) {
+          return i;
+        }
+      }
+
+      return INVALID_INDEX;
+    }
+
+    // Prints the index of every set bit as "{i0, i1, ...}", the same format
+    // toString() produces.
+    friend std::wostream& operator << (std::wostream& os, const BitSet& obj)
+    {
+      os << "{";
+      bool valueAdded = false;
+      for (size_t i = 0; i < obj.size(); i++){
+        if (obj.test(i)){
+          // The separator goes in front of every index except the first.
+          if (valueAdded) {
+            os << ", ";
+          }
+          os << i;
+          valueAdded = true;
+        }
+      }
+
+      os << "}";
+      return os;
+    }
+
+    // Concatenates toString() of every element in [begin, end] with no separator.
+    // NOTE(review): `end` is dereferenced, i.e. it is treated as the last
+    // element (inclusive). Passing a container's end() iterator would be
+    // undefined behaviour - confirm against callers before changing this.
+    static std::string subStringRepresentation(const std::vector<BitSet>::iterator &begin,
+                                               const std::vector<BitSet>::iterator &end) {
+      std::string result;
+      std::vector<BitSet>::iterator vectorIterator;
+
+      for (vectorIterator = begin; vectorIterator != end; vectorIterator++) {
+        result += vectorIterator->toString();
+      }
+      // Grab the end
+      result += end->toString();
+
+      return result;
+    }
+
+    // Returns the index of every set bit as "{i0, i1, ...}"; "{}" when empty.
+    std::string toString() const {
+      std::stringstream stream;
+      stream << "{";
+      bool valueAdded = false;
+      for (size_t i = 0; i < size(); ++i){
+        if (test(i)){
+          if (valueAdded) {
+            stream << ", ";
+          }
+          stream << i;
+          valueAdded = true;
+        }
+      }
+
+      stream << "}";
+      return stream.str();
+    }
+
+  };
+}
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/CPPUtils.cpp b/contrib/libs/antlr4_cpp_runtime/src/support/CPPUtils.cpp
new file mode 100644
index 00000000000..95321b3dc17
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/CPPUtils.cpp
@@ -0,0 +1,207 @@
+/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#include "support/CPPUtils.h"
+
+namespace antlrcpp {
+
+  // Concatenates `strings`, inserting `separator` between neighbouring items.
+  std::string join(const std::vector<std::string> &strings, const std::string &separator) {
+    std::string str;
+    bool firstItem = true;
+    for (const std::string &s : strings) {
+      if (!firstItem) {
+        str.append(separator);
+      }
+      firstItem = false;
+      str.append(s);
+    }
+    return str;
+  }
+
+  // Maps every key to its index in `keys`. std::map::insert keeps an existing
+  // entry, so a repeated key keeps the index of its first occurrence.
+  std::map<std::string, size_t> toMap(const std::vector<std::string> &keys) {
+    std::map<std::string, size_t> result;
+    for (size_t i = 0; i < keys.size(); ++i) {
+      result.insert({ keys[i], i });
+    }
+    return result;
+  }
+
+  // Replaces newline, carriage return and tab with the two-character escapes
+  // \n, \r and \t. When `escapeSpaces` is set, blanks are replaced by the
+  // "\u00B7" literal; all other characters are copied unchanged.
+  std::string escapeWhitespace(std::string str, bool escapeSpaces) {
+    std::string result;
+    for (auto c : str) {
+      switch (c) {
+        case '\n':
+          result += "\\n";
+          break;
+
+        case '\r':
+          result += "\\r";
+          break;
+
+        case '\t':
+          result += "\\t";
+          break;
+
+        case ' ':
+          if (escapeSpaces) {
+            result += "\u00B7";
+            break;
+          }
+          result += c;
+          break;
+
+        default:
+          result += c;
+          break;
+      }
+    }
+
+    return result;
+  }
+
+  // Formats `t` as upper-case hexadecimal without any prefix.
+  std::string toHexString(const int t) {
+    std::stringstream stream;
+    stream << std::uppercase << std::hex << t;
+    return stream.str();
+  }
+
+  // Concatenates all entries of `data` without a separator.
+  std::string arrayToString(const std::vector<std::string> &data) {
+    std::string answer;
+    size_t toReserve = 0;
+    for (const auto &sub : data) {
+      toReserve += sub.size();
+    }
+    answer.reserve(toReserve);
+    for (const auto &sub: data) {
+      answer.append(sub);
+    }
+    return answer;
+  }
+
+  // Returns a copy of `s` in which every occurrence of `from` is replaced by `to`.
+  // An empty `from` matches nothing and `s` is returned unchanged.
+  std::string replaceString(const std::string &s, const std::string &from, const std::string &to) {
+    // find("") succeeds at offset 0 without consuming any input, so the loop
+    // below would never terminate for an empty pattern.
+    if (from.empty())
+      return s;
+
+    std::string::size_type p;
+    std::string ss, res;
+
+    ss = s;
+    p = ss.find(from);
+    while (p != std::string::npos) {
+      if (p > 0)
+        res.append(ss.substr(0, p)).append(to);
+      else
+        res.append(to);
+      ss = ss.substr(p + from.size());
+      p = ss.find(from);
+    }
+    res.append(ss);
+
+    return res;
+  }
+
+  // Splits `s` at each occurrence of `sep`. At most `count` splits are made;
+  // a `count` of zero or less means "no limit". An empty `s` yields an empty
+  // vector, and an empty `sep` yields `s` as the only part.
+  std::vector<std::string> split(const std::string &s, const std::string &sep, int count) {
+    std::vector<std::string> parts;
+    std::string ss = s;
+
+    std::string::size_type p;
+
+    if (s.empty())
+      return parts;
+
+    // An empty separator matches at offset 0 without consuming input, which
+    // would make the loop below spin forever.
+    if (sep.empty()) {
+      parts.push_back(ss);
+      return parts;
+    }
+
+    if (count == 0)
+      count= -1;
+
+    p = ss.find(sep);
+    while (!ss.empty() && p != std::string::npos && (count < 0 || count > 0)) {
+      parts.push_back(ss.substr(0, p));
+      ss = ss.substr(p+sep.size());
+
+      // Only a real limit is counted down; the negative "unlimited" marker is
+      // left alone so it cannot overflow.
+      if (count > 0)
+        --count;
+      p = ss.find(sep);
+    }
+    parts.push_back(ss);
+
+    return parts;
+  }
+
+  //--------------------------------------------------------------------------------------------------
+
+  // Debugging helper. Adds indentation to all lines in the given string.
+  // The first line is skipped when `includingFirst` is false.
+  std::string indent(const std::string &s, const std::string &indentation, bool includingFirst) {
+    std::vector<std::string> parts = split(s, "\n", -1);
+    for (size_t i = 0; i < parts.size(); ++i) {
+      if (i == 0 && !includingFirst)
+        continue;
+      parts[i].insert(0, indentation);
+    }
+
+    return join(parts, "\n");
+  }
+
+  //--------------------------------------------------------------------------------------------------
+
+  // Recursively get the error from a, possibly nested, exception.
+#if defined(_MSC_FULL_VER) && _MSC_FULL_VER < 190023026
+  // No nested exceptions before VS 2015.
+  template <typename T>
+  std::exception_ptr get_nested(const T &/*e*/) {
+    try {
+      return nullptr;
+    }
+    catch (const std::bad_cast &) {
+      return nullptr;
+    }
+  }
+#else
+  // Returns the exception nested inside `e`, or nullptr when `e` does not
+  // derive from std::nested_exception (the reference cast then throws
+  // std::bad_cast, which is caught here).
+  template <typename T>
+  std::exception_ptr get_nested(const T &e) {
+    try {
+      // Bind by reference: copying the nested_exception is not needed.
+      const auto &nested = dynamic_cast<const std::nested_exception&>(e);
+      return nested.nested_ptr();
+    }
+    catch (const std::bad_cast &) {
+      return nullptr;
+    }
+  }
+#endif
+
+  // Builds a message of the form "outer (inner (innermost))" from `eptr` and
+  // the exceptions nested inside it. Throws std::bad_exception when `eptr` is null.
+  std::string what(std::exception_ptr eptr) {
+    if (!eptr) {
+      throw std::bad_exception();
+    }
+
+    std::string result;
+    std::size_t nestCount = 0;
+
+    next: {
+      try {
+        // Move the pointer into a local so that `eptr` is null unless a
+        // handler below stores a nested exception in it.
+        std::exception_ptr yeptr;
+        std::swap(eptr, yeptr);
+        std::rethrow_exception(yeptr);
+      }
+      catch (const std::exception &e) {
+        result += e.what();
+        eptr = get_nested(e);
+      }
+      catch (const std::string &e) {
+        result += e;
+      }
+      catch (const char *e) {
+        result += e;
+      }
+      catch (...) {
+        result += "cannot be determined";
+      }
+
+      if (eptr) {
+        // One more nesting level: open a parenthesis and process it.
+        result += " (";
+        ++nestCount;
+        goto next;
+      }
+    }
+
+    // Close every parenthesis opened above.
+    result += std::string(nestCount, ')');
+    return result;
+  }
+
+} // namespace antlrcpp
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/CPPUtils.h b/contrib/libs/antlr4_cpp_runtime/src/support/CPPUtils.h
new file mode 100644
index 00000000000..2eb1a36037a
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/CPPUtils.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#pragma once
+
+#include "antlr4-common.h"
+
+namespace antlrcpp {
+
+  ANTLR4CPP_PUBLIC std::string join(const std::vector<std::string> &strings, const std::string &separator);
+  ANTLR4CPP_PUBLIC std::map<std::string, size_t> toMap(const std::vector<std::string> &keys);
+  ANTLR4CPP_PUBLIC std::string escapeWhitespace(std::string str, bool escapeSpaces);
+  ANTLR4CPP_PUBLIC std::string toHexString(const int t);
+  ANTLR4CPP_PUBLIC std::string arrayToString(const std::vector<std::string> &data);
+  ANTLR4CPP_PUBLIC std::string replaceString(const std::string &s, const std::string &from, const std::string &to);
+  ANTLR4CPP_PUBLIC std::vector<std::string> split(const std::string &s, const std::string &sep, int count);
+  ANTLR4CPP_PUBLIC std::string indent(const std::string &s, const std::string &indentation, bool includingFirst = true);
+
+  // Using RAII + a lambda to implement a "finally" replacement.
+  // The stored callable runs in the destructor unless disable() was called or
+  // the object was moved from.
+  template <typename OnEnd>
+  struct FinalAction {
+    FinalAction(OnEnd f) : _cleanUp { std::move(f) } {}
+    FinalAction(FinalAction &&other) :
+    _cleanUp(std::move(other._cleanUp)), _enabled(other._enabled) {
+      other._enabled = false; // Don't trigger the lambda after ownership has moved.
+    }
+    ~FinalAction() { if (_enabled) _cleanUp(); }
+
+    void disable() { _enabled = false; }
+  private:
+    OnEnd _cleanUp;
+    bool _enabled {true};
+  };
+
+  // Wraps `f` in a FinalAction; keep the returned object alive for as long as
+  // the clean-up should be deferred.
+  template <typename OnEnd>
+  FinalAction<OnEnd> finally(OnEnd f) {
+    return FinalAction<OnEnd>(std::move(f));
+  }
+
+  // Convenience functions to avoid lengthy dynamic_cast() != nullptr checks in many places.
+  // T1 is the pointer type to test for, e.g. is<Derived *>(obj).
+  template <typename T1, typename T2>
+  inline bool is(T2 *obj) { // For pointer types.
+    return dynamic_cast<typename std::add_const<T1>::type>(obj) != nullptr;
+  }
+
+  // Here T1 is the pointee type: is<Derived>(ref).
+  template <typename T1, typename T2>
+  inline bool is(Ref<T2> const& obj) { // For shared pointers.
+    return dynamic_cast<T1 *>(obj.get()) != nullptr;
+  }
+
+  // Returns "<type name>@<address in hex>" for `o`.
+  template <typename T>
+  std::string toString(const T &o) {
+    std::stringstream ss;
+    // typeid gives the mangled class name, but that's all that is possible
+    // in a portable way.
+    ss << typeid(o).name() << "@" << std::hex << reinterpret_cast<uintptr_t>(&o);
+    return ss.str();
+  }
+
+  // Get the error text from an exception pointer or the current exception.
+  ANTLR4CPP_PUBLIC std::string what(std::exception_ptr eptr = std::current_exception());
+
+} // namespace antlrcpp
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Casts.h b/contrib/libs/antlr4_cpp_runtime/src/support/Casts.h
new file mode 100644
index 00000000000..2ded955dcd7
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/Casts.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2012-2021 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <type_traits>
+
+namespace antlrcpp {
+
+  // Statically casts a base class pointer to the derived pointer type `To`.
+  // Where RTTI is available the cast is double-checked with dynamic_cast in
+  // an assert; a null pointer is passed through unchanged.
+  template <typename To, typename From>
+  To downCast(From* from) {
+    using Pointee = std::remove_pointer_t<To>;
+    static_assert(std::is_pointer_v<To>, "Target type not a pointer.");
+    static_assert(std::is_base_of_v<From, Pointee>, "Target type not derived from source type.");
+  #if !defined(__GNUC__) || defined(__GXX_RTTI)
+    assert(!from || dynamic_cast<To>(from));
+  #endif
+    return static_cast<To>(from);
+  }
+
+  // Reference flavour of the helper above: `To` must be an lvalue reference
+  // to a type derived from `From`.
+  template <typename To, typename From>
+  To downCast(From& from) {
+    using Referee = std::remove_reference_t<To>;
+    static_assert(std::is_lvalue_reference_v<To>, "Target type not a lvalue reference.");
+    static_assert(std::is_base_of_v<From, Referee>, "Target type not derived from source type.");
+  #if !defined(__GNUC__) || defined(__GXX_RTTI)
+    assert(dynamic_cast<std::add_pointer_t<Referee>>(std::addressof(from)));
+  #endif
+    return static_cast<To>(from);
+  }
+
+}
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Declarations.h b/contrib/libs/antlr4_cpp_runtime/src/support/Declarations.h
new file mode 100644
index 00000000000..8e960676cf2
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/Declarations.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */ + +#pragma once + +namespace antlr4 { + class ANTLRErrorListener; + class ANTLRErrorStrategy; + class ANTLRFileStream; + class ANTLRInputStream; + class BailErrorStrategy; + class BaseErrorListener; + class BufferedTokenStream; + class CharStream; + class CommonToken; + class CommonTokenFactory; + class CommonTokenStream; + class ConsoleErrorListener; + class DefaultErrorStrategy; + class DiagnosticErrorListener; + class EmptyStackException; + class FailedPredicateException; + class IllegalArgumentException; + class IllegalStateException; + class InputMismatchException; + class IntStream; + class InterpreterRuleContext; + class Lexer; + class LexerInterpreter; + class LexerNoViableAltException; + class ListTokenSource; + class NoSuchElementException; + class NoViableAltException; + class NullPointerException; + class ParseCancellationException; + class Parser; + class ParserInterpreter; + class ParserRuleContext; + class ProxyErrorListener; + class RecognitionException; + class Recognizer; + class RuleContext; + class Token; + template<typename Symbol> class TokenFactory; + class TokenSource; + class TokenStream; + class TokenStreamRewriter; + class UnbufferedCharStream; + class UnbufferedTokenStream; + class WritableToken; + + namespace misc { + class InterpreterDataReader; + class Interval; + class IntervalSet; + class MurmurHash; + class Utils; + class Predicate; + } + namespace atn { + class ATN; + class ATNConfig; + class ATNConfigSet; + class ATNDeserializationOptions; + class ATNDeserializer; + class ATNSerializer; + class ATNSimulator; + class ATNState; + enum class ATNType; + class ActionTransition; + class ArrayPredictionContext; + class AtomTransition; + class BasicBlockStartState; + class BasicState; + class BlockEndState; + class BlockStartState; + class DecisionState; + class EpsilonTransition; + class LL1Analyzer; + class LexerAction; + class LexerActionExecutor; + class LexerATNConfig; + class LexerATNSimulator; + class LexerMoreAction; + class 
LexerPopModeAction; + class LexerSkipAction; + class LookaheadEventInfo; + class LoopEndState; + class NotSetTransition; + class OrderedATNConfigSet; + class ParseInfo; + class ParserATNSimulator; + class PlusBlockStartState; + class PlusLoopbackState; + class PrecedencePredicateTransition; + class PredicateTransition; + class PredictionContext; + enum class PredictionMode; + class PredictionModeClass; + class RangeTransition; + class RuleStartState; + class RuleStopState; + class RuleTransition; + class SemanticContext; + class SetTransition; + class SingletonPredictionContext; + class StarBlockStartState; + class StarLoopEntryState; + class StarLoopbackState; + class TokensStartState; + class Transition; + class WildcardTransition; + } + namespace dfa { + class DFA; + class DFASerializer; + class DFAState; + class LexerDFASerializer; + class Vocabulary; + } + namespace tree { + class AbstractParseTreeVisitor; + class ErrorNode; + class ErrorNodeImpl; + class ParseTree; + class ParseTreeListener; + template<typename T> class ParseTreeProperty; + class ParseTreeVisitor; + class ParseTreeWalker; + class SyntaxTree; + class TerminalNode; + class TerminalNodeImpl; + class Tree; + class Trees; + + namespace pattern { + class Chunk; + class ParseTreeMatch; + class ParseTreePattern; + class ParseTreePatternMatcher; + class RuleTagToken; + class TagChunk; + class TextChunk; + class TokenTagToken; + } + + namespace xpath { + class XPath; + class XPathElement; + class XPathLexerErrorListener; + class XPathRuleAnywhereElement; + class XPathRuleElement; + class XPathTokenAnywhereElement; + class XPathTokenElement; + class XPathWildcardAnywhereElement; + class XPathWildcardElement; + } + } +} diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/StringUtils.cpp b/contrib/libs/antlr4_cpp_runtime/src/support/StringUtils.cpp new file mode 100644 index 00000000000..9ee274c8de4 --- /dev/null +++ b/contrib/libs/antlr4_cpp_runtime/src/support/StringUtils.cpp @@ -0,0 +1,38 @@ +/* 
Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#include "support/StringUtils.h"
+
+namespace antlrcpp {
+
+  // Returns a copy of `in` with tab, carriage return and newline replaced by
+  // the two-character escapes \t, \r and \n.
+  std::string escapeWhitespace(std::string_view in) {
+    std::string out;
+    escapeWhitespace(out, in);
+    out.shrink_to_fit();
+    return out;
+  }
+
+  // Appends the escaped form of `in` to `out` and returns `out`.
+  // Text already in `out` is kept.
+  std::string& escapeWhitespace(std::string& out, std::string_view in) {
+    // Best case, no escaping. The current length is included because the
+    // escaped text is appended behind whatever `out` already holds.
+    out.reserve(out.size() + in.size());
+    for (const auto &c : in) {
+      switch (c) {
+        case '\t':
+          out.append("\\t");
+          break;
+        case '\r':
+          out.append("\\r");
+          break;
+        case '\n':
+          out.append("\\n");
+          break;
+        default:
+          out.push_back(c);
+          break;
+      }
+    }
+    return out;
+  }
+
+} // namespace antlrcpp
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/StringUtils.h b/contrib/libs/antlr4_cpp_runtime/src/support/StringUtils.h
new file mode 100644
index 00000000000..aee0d46d6e7
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/StringUtils.h
@@ -0,0 +1,16 @@
+/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#pragma once
+
+#include "antlr4-common.h"
+
+namespace antlrcpp {
+
+  ANTLR4CPP_PUBLIC std::string escapeWhitespace(std::string_view in);
+
+  ANTLR4CPP_PUBLIC std::string& escapeWhitespace(std::string& out, std::string_view in);
+
+}
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Unicode.h b/contrib/libs/antlr4_cpp_runtime/src/support/Unicode.h
new file mode 100644
index 00000000000..f0f84375add
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/Unicode.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2021 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#pragma once
+
+#include "antlr4-common.h"
+
+namespace antlrcpp {
+
+  // Constants and checks for Unicode code points. The class only groups
+  // static members and cannot be instantiated, copied or moved.
+  class ANTLR4CPP_PUBLIC Unicode final {
+  public:
+    static constexpr char32_t REPLACEMENT_CHARACTER = 0xfffd;
+
+    // True for every value up to 0x10ffff that is not in the surrogate
+    // range 0xd800-0xdfff.
+    static constexpr bool isValid(char32_t codePoint) {
+      const bool isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
+      return !isSurrogate && codePoint <= 0x10ffff;
+    }
+
+  private:
+    Unicode() = delete;
+    Unicode(const Unicode&) = delete;
+    Unicode(Unicode&&) = delete;
+    Unicode& operator=(const Unicode&) = delete;
+    Unicode& operator=(Unicode&&) = delete;
+  };
+
+}
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp b/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp
new file mode 100644
index 00000000000..294e9f1b215
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.cpp
@@ -0,0 +1,242 @@
+/* Copyright (c) 2021 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#include <cassert>
+#include <cstdint>
+
+#include "support/Utf8.h"
+#include "support/Unicode.h"
+
+// The below implementation is based off of https://github.com/google/cel-cpp/internal/utf8.cc,
+// which is itself based off of https://go.googlesource.com/go/+/refs/heads/master/src/unicode/utf8/utf8.go.
+// If for some reason you feel the need to copy this implementation, please retain a comment
+// referencing the two source files and giving credit, as well as maintaining any and all
+// obligations required by the BSD 3-clause license that governs this file.
+
+namespace antlrcpp {
+
+namespace {
+
+  // decode() returns any byte below SELF directly as a one-byte code point.
+#undef SELF
+  constexpr uint8_t SELF = 0x80;
+
+  // Inclusive bounds decode() applies to the third and fourth byte of a
+  // sequence; they are also the default bounds in ACCEPT.
+#undef LOW
+  constexpr uint8_t LOW = 0x80;
+#undef HIGH
+  constexpr uint8_t HIGH = 0xbf;
+
+  // Payload masks: MASKX is applied to trailing bytes, MASK2/MASK3/MASK4 to
+  // the first byte of a 2/3/4 byte sequence.
+#undef MASKX
+  constexpr uint8_t MASKX = 0x3f;
+#undef MASK2
+  constexpr uint8_t MASK2 = 0x1f;
+#undef MASK3
+  constexpr uint8_t MASK3 = 0xf;
+#undef MASK4
+  constexpr uint8_t MASK4 = 0x7;
+
+  // Tag bits encode() ORs into the bytes it writes: TX for trailing bytes,
+  // T2/T3/T4 for the first byte of a 2/3/4 byte sequence.
+#undef TX
+  constexpr uint8_t TX = 0x80;
+#undef T2
+  constexpr uint8_t T2 = 0xc0;
+#undef T3
+  constexpr uint8_t T3 = 0xe0;
+#undef T4
+  constexpr uint8_t T4 = 0xf0;
+
+  // Entries of the LEADING table. XX marks a byte that cannot start a
+  // sequence; decode() answers it with the replacement character. AS fills
+  // the slots for 0x00-0x7F, which decode() handles before it consults the
+  // table. For S1-S7 decode() reads two fields: the low three bits minus one
+  // give the number of bytes that must follow the first byte, and the high
+  // nibble is the index into ACCEPT used to check the second byte.
+#undef XX
+  constexpr uint8_t XX = 0xf1;
+#undef AS
+  constexpr uint8_t AS = 0xf0;
+#undef S1
+  constexpr uint8_t S1 = 0x02;
+#undef S2
+  constexpr uint8_t S2 = 0x13;
+#undef S3
+  constexpr uint8_t S3 = 0x03;
+#undef S4
+  constexpr uint8_t S4 = 0x23;
+#undef S5
+  constexpr uint8_t S5 = 0x34;
+#undef S6
+  constexpr uint8_t S6 = 0x04;
+#undef S7
+  constexpr uint8_t S7 = 0x44;
+
+  // Classification of every possible first byte, indexed by the byte value.
+  // NOLINTBEGIN
+  // clang-format off
+#undef LEADING
+  constexpr uint8_t LEADING[256] = {
+    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
+    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
+    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
+    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
+    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
+    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
+    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
+    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
+    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
+    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
+  };
+  // clang-format on
+  // NOLINTEND
+
+  // Inclusive {min, max} bounds for the second byte of a sequence, selected
+  // by the high nibble of the LEADING entry of the first byte.
+#undef ACCEPT
+  constexpr std::pair<uint8_t, uint8_t> ACCEPT[16] = {
+    {LOW, HIGH}, {0xa0, HIGH}, {LOW, 0x9f}, {0x90, HIGH},
+    {LOW, 0x8f}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0},
+    {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0},
+    {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0},
+  };
+
+}  // namespace
+
+  // Decodes the code point at the start of `input`, which must not be empty.
+  // Returns the code point and the number of bytes it occupies; every
+  // rejected sequence yields {REPLACEMENT_CHARACTER, 1}.
+  std::pair<char32_t, size_t> Utf8::decode(std::string_view input) {
+    assert(!input.empty());
+    const auto b = static_cast<uint8_t>(input.front());
+    input.remove_prefix(1);
+    if (b < SELF) {
+      return {static_cast<char32_t>(b), 1};
+    }
+    const auto leading = LEADING[b];
+    if (leading == XX) {
+      return {Unicode::REPLACEMENT_CHARACTER, 1};
+    }
+    // Number of bytes that have to follow the first one.
+    auto size = static_cast<size_t>(leading & 7) - 1;
+    if (size > input.size()) {
+      // The input ends before the sequence is complete.
+      return {Unicode::REPLACEMENT_CHARACTER, 1};
+    }
+    const auto& accept = ACCEPT[leading >> 4];
+    const auto b1 = static_cast<uint8_t>(input.front());
+    input.remove_prefix(1);
+    if (b1 < accept.first || b1 > accept.second) {
+      return {Unicode::REPLACEMENT_CHARACTER, 1};
+    }
+    if (size <= 1) {
+      return {(static_cast<char32_t>(b & MASK2) << 6) |
+                  static_cast<char32_t>(b1 & MASKX),
+              2};
+    }
+    const auto b2 = static_cast<uint8_t>(input.front());
+    input.remove_prefix(1);
+    if (b2 < LOW || b2 > HIGH) {
+      return {Unicode::REPLACEMENT_CHARACTER, 1};
+    }
+    if (size <= 2) {
+      return {(static_cast<char32_t>(b & MASK3) << 12) |
+                  (static_cast<char32_t>(b1 & MASKX) << 6) |
+                  static_cast<char32_t>(b2 & MASKX),
+              3};
+    }
+    const auto b3 = static_cast<uint8_t>(input.front());
+    input.remove_prefix(1);
+    if (b3 < LOW || b3 > HIGH) {
+      return {Unicode::REPLACEMENT_CHARACTER, 1};
+    }
+    return {(static_cast<char32_t>(b & MASK4) << 18) |
+                (static_cast<char32_t>(b1 & MASKX) << 12) |
+                (static_cast<char32_t>(b2 & MASKX) << 6) |
+                static_cast<char32_t>(b3 & MASKX),
+            4};
+  }
+
+  // Decodes all of `input`; returns std::nullopt as soon as decode() reports
+  // an illegal byte sequence.
+  std::optional<std::u32string> Utf8::strictDecode(std::string_view input) {
+    std::u32string output;
+    char32_t codePoint;
+    size_t codeUnits;
+    output.reserve(input.size()); // Worst case is each byte is a single Unicode code point.
+    for (size_t index = 0; index < input.size(); index += codeUnits) {
+      std::tie(codePoint, codeUnits) = Utf8::decode(input.substr(index));
+      if (codePoint == Unicode::REPLACEMENT_CHARACTER && codeUnits == 1) {
+        // Condition is only met when an illegal byte sequence is encountered. See Utf8::decode.
+        return std::nullopt;
+      }
+      output.push_back(codePoint);
+    }
+    output.shrink_to_fit();
+    return output;
+  }
+
+  // Decodes all of `input`; illegal bytes show up in the result as the
+  // replacement character that decode() returns for them.
+  std::u32string Utf8::lenientDecode(std::string_view input) {
+    std::u32string output;
+    char32_t codePoint;
+    size_t codeUnits;
+    output.reserve(input.size()); // Worst case is each byte is a single Unicode code point.
+    for (size_t index = 0; index < input.size(); index += codeUnits) {
+      std::tie(codePoint, codeUnits) = Utf8::decode(input.substr(index));
+      output.push_back(codePoint);
+    }
+    output.shrink_to_fit();
+    return output;
+  }
+
+  // Appends the UTF-8 form of `codePoint` to `*buffer` (which must not be
+  // null) and returns a reference to the buffer. A code point rejected by
+  // Unicode::isValid() is written as the replacement character.
+  std::string& Utf8::encode(std::string* buffer, char32_t codePoint) {
+    assert(buffer != nullptr);
+    if (!Unicode::isValid(codePoint)) {
+      codePoint = Unicode::REPLACEMENT_CHARACTER;
+    }
+    if (codePoint <= 0x7f) {
+      buffer->push_back(static_cast<char>(static_cast<uint8_t>(codePoint)));
+    } else if (codePoint <= 0x7ff) {
+      buffer->push_back(
+          static_cast<char>(T2 | static_cast<uint8_t>(codePoint >> 6)));
+      buffer->push_back(
+          static_cast<char>(TX | (static_cast<uint8_t>(codePoint) & MASKX)));
+    } else if (codePoint <= 0xffff) {
+      buffer->push_back(
+          static_cast<char>(T3 | static_cast<uint8_t>(codePoint >> 12)));
+      buffer->push_back(static_cast<char>(
+          TX | (static_cast<uint8_t>(codePoint >> 6) & MASKX)));
+      buffer->push_back(
+          static_cast<char>(TX | (static_cast<uint8_t>(codePoint) & MASKX)));
+    } else {
+      buffer->push_back(
+          static_cast<char>(T4 | static_cast<uint8_t>(codePoint >> 18)));
+      buffer->push_back(static_cast<char>(
+          TX | (static_cast<uint8_t>(codePoint >> 12) & MASKX)));
+      buffer->push_back(static_cast<char>(
+          TX | (static_cast<uint8_t>(codePoint >> 6) & MASKX)));
+      buffer->push_back(
+          static_cast<char>(TX | (static_cast<uint8_t>(codePoint) & MASKX)));
+    }
+    return *buffer;
+  }
+
+  // Encodes all of `input`; returns std::nullopt at the first code point
+  // rejected by Unicode::isValid().
+  std::optional<std::string> Utf8::strictEncode(std::u32string_view input) {
+    std::string output;
+    output.reserve(input.size() * 4); // Worst case is each Unicode code point encodes to 4 bytes.
+    for (size_t index = 0; index < input.size(); index++) {
+      char32_t codePoint = input[index];
+      if (!Unicode::isValid(codePoint)) {
+        return std::nullopt;
+      }
+      Utf8::encode(&output, codePoint);
+    }
+    output.shrink_to_fit();
+    return output;
+  }
+
+  // Encodes all of `input`; code points rejected by Unicode::isValid() are
+  // written as the replacement character.
+  std::string Utf8::lenientEncode(std::u32string_view input) {
+    std::string output;
+    output.reserve(input.size() * 4); // Worst case is each Unicode code point encodes to 4 bytes.
+    for (size_t index = 0; index < input.size(); index++) {
+      char32_t codePoint = input[index];
+      if (!Unicode::isValid(codePoint)) {
+        codePoint = Unicode::REPLACEMENT_CHARACTER;
+      }
+      Utf8::encode(&output, codePoint);
+    }
+    output.shrink_to_fit();
+    return output;
+  }
+
+}
diff --git a/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.h b/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.h
new file mode 100644
index 00000000000..e4828441cdc
--- /dev/null
+++ b/contrib/libs/antlr4_cpp_runtime/src/support/Utf8.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2021 The ANTLR Project. All rights reserved.
+ * Use of this file is governed by the BSD 3-clause license that
+ * can be found in the LICENSE.txt file in the project root.
+ */
+
+#pragma once
+
+#include <optional>
+#include <string>
+#include <string_view>
+#include <tuple>
+
+#include "antlr4-common.h"
+
+namespace antlrcpp {
+
+  class ANTLR4CPP_PUBLIC Utf8 final {
+  public:
+    // Decodes the next code point, returning the decoded code point and the number
+    // of code units (a.k.a. bytes) consumed. In the event that an invalid code unit
+    // sequence is encountered the replacement character, U+FFFD, is returned with a
+    // code unit count of 1. As U+FFFD requires 3 code units when encoded, this can
+    // be used to differentiate valid input from malformed input.
+    static std::pair<char32_t, size_t> decode(std::string_view input);
+
+    // Decodes the given UTF-8 encoded input into a string of code points.
+    static std::optional<std::u32string> strictDecode(std::string_view input);
+
+    // Decodes the given UTF-8 encoded input into a string of code points. Unlike strictDecode(),
+    // each byte in an illegal byte sequence is replaced with the Unicode replacement character,
+    // U+FFFD.
+    static std::u32string lenientDecode(std::string_view input);
+
+    // Encodes the given code point and appends it to the buffer. If the code point
+    // is an unpaired surrogate or outside of the valid Unicode range it is replaced
+    // with the replacement character, U+FFFD.
+    static std::string& encode(std::string *buffer, char32_t codePoint);
+
+    // Encodes the given Unicode code point string as UTF-8.
+    static std::optional<std::string> strictEncode(std::u32string_view input);
+
+    // Encodes the given Unicode code point string as UTF-8. Unlike strictEncode(),
+    // each invalid Unicode code point is replaced with the Unicode replacement character, U+FFFD.
+    static std::string lenientEncode(std::u32string_view input);
+
+  private:
+    Utf8() = delete;
+    Utf8(const Utf8&) = delete;
+    Utf8(Utf8&&) = delete;
+    Utf8& operator=(const Utf8&) = delete;
+    Utf8& operator=(Utf8&&) = delete;
+  };
+
+} |