diff options
author | thegeorg <thegeorg@yandex-team.com> | 2024-08-06 11:28:07 +0300 |
---|---|---|
committer | thegeorg <thegeorg@yandex-team.com> | 2024-08-06 12:50:21 +0300 |
commit | de4d7efd8871b850e3ea79164d7661e2299836b7 (patch) | |
tree | 47d8cf597b3789a807a4b1cec0a9fd66788767c2 /contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal | |
parent | e003b4c129e1381591dcb75a96bf9a970b2b47fb (diff) | |
download | ydb-de4d7efd8871b850e3ea79164d7661e2299836b7.tar.gz |
Update contrib/restricted/abseil-cpp-tstring to 20240722.0
83a5727000e16bc5a94523a0cf1cce75fa86a191
Diffstat (limited to 'contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal')
12 files changed, 2644 insertions, 116 deletions
diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/bounded_utf8_length_sequence.h b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/bounded_utf8_length_sequence.h new file mode 100644 index 0000000000..df2804299b --- /dev/null +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/bounded_utf8_length_sequence.h @@ -0,0 +1,126 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef Y_ABSL_DEBUGGING_INTERNAL_BOUNDED_UTF8_LENGTH_SEQUENCE_H_ +#define Y_ABSL_DEBUGGING_INTERNAL_BOUNDED_UTF8_LENGTH_SEQUENCE_H_ + +#include <cstdint> + +#include "y_absl/base/config.h" +#include "y_absl/numeric/bits.h" + +namespace y_absl { +Y_ABSL_NAMESPACE_BEGIN +namespace debugging_internal { + +// A sequence of up to max_elements integers between 1 and 4 inclusive, whose +// insertion operation computes the sum of all the elements before the insertion +// point. This is useful in decoding Punycode, where one needs to know where in +// a UTF-8 byte stream the n-th code point begins. +// +// BoundedUtf8LengthSequence is async-signal-safe and suitable for use in +// symbolizing stack traces in a signal handler, provided max_elements is not +// improvidently large. For inputs of lengths accepted by the Rust demangler, +// up to a couple hundred code points, InsertAndReturnSumOfPredecessors should +// run in a few dozen clock cycles, on par with the other arithmetic required +// for Punycode decoding. +template <uint32_t max_elements> +class BoundedUtf8LengthSequence { + public: + // Constructs an empty sequence. + BoundedUtf8LengthSequence() = default; + + // Inserts `utf_length` at position `index`, shifting any existing elements at + // or beyond `index` one position to the right. If the sequence is already + // full, the rightmost element is discarded. + // + // Returns the sum of the elements at positions 0 to `index - 1` inclusive. + // If `index` is greater than the number of elements already inserted, the + // excess positions in the range count 1 apiece. + // + // REQUIRES: index < max_elements and 1 <= utf8_length <= 4. + uint32_t InsertAndReturnSumOfPredecessors( + uint32_t index, uint32_t utf8_length) { + // The caller shouldn't pass out-of-bounds inputs, but if it does happen, + // clamp the values and try to continue. If we're being called from a + // signal handler, the last thing we want to do is crash. Emitting + // malformed UTF-8 is a lesser evil. + if (index >= max_elements) index = max_elements - 1; + if (utf8_length == 0 || utf8_length > 4) utf8_length = 1; + + const uint32_t word_index = index/32; + const uint32_t bit_index = 2 * (index % 32); + const uint64_t ones_bit = uint64_t{1} << bit_index; + + // Compute the sum of predecessors. + // - Each value from 1 to 4 is represented by a bit field with value from + // 0 to 3, so the desired sum is index plus the sum of the + // representations actually stored. + // - For each bit field, a set low bit should contribute 1 to the sum, and + // a set high bit should contribute 2. + // - Another way to say the same thing is that each set bit contributes 1, + // and each set high bit contributes an additional 1. + // - So the sum we want is index + popcount(everything) + popcount(bits in + // odd positions). + const uint64_t odd_bits_mask = 0xaaaaaaaaaaaaaaaa; + const uint64_t lower_seminibbles_mask = ones_bit - 1; + const uint64_t higher_seminibbles_mask = ~lower_seminibbles_mask; + const uint64_t same_word_bits_below_insertion = + rep_[word_index] & lower_seminibbles_mask; + int full_popcount = y_absl::popcount(same_word_bits_below_insertion); + int odd_popcount = + y_absl::popcount(same_word_bits_below_insertion & odd_bits_mask); + for (uint32_t j = word_index; j > 0; --j) { + const uint64_t word_below_insertion = rep_[j - 1]; + full_popcount += y_absl::popcount(word_below_insertion); + odd_popcount += y_absl::popcount(word_below_insertion & odd_bits_mask); + } + const uint32_t sum_of_predecessors = + index + static_cast<uint32_t>(full_popcount + odd_popcount); + + // Now insert utf8_length's representation, shifting successors up one + // place. + for (uint32_t j = max_elements/32 - 1; j > word_index; --j) { + rep_[j] = (rep_[j] << 2) | (rep_[j - 1] >> 62); + } + rep_[word_index] = + (rep_[word_index] & lower_seminibbles_mask) | + (uint64_t{utf8_length - 1} << bit_index) | + ((rep_[word_index] & higher_seminibbles_mask) << 2); + + return sum_of_predecessors; + } + + private: + // If the (32 * i + j)-th element of the represented sequence has the value k + // (0 <= j < 32, 1 <= k <= 4), then bits 2 * j and 2 * j + 1 of rep_[i] + // contain the seminibble (k - 1). + // + // In particular, the zero-initialization of rep_ makes positions not holding + // any inserted element count as 1 in InsertAndReturnSumOfPredecessors. + // + // Example: rep_ = {0xb1, ... the rest zeroes ...} represents the sequence + // (2, 1, 4, 3, ... the rest 1's ...). Constructing the sequence of Unicode + // code points "Àa🂻中" = {U+00C0, U+0061, U+1F0BB, U+4E2D} (among many + // other examples) would yield this value of rep_. + static_assert(max_elements > 0 && max_elements % 32 == 0, + "max_elements must be a positive multiple of 32"); + uint64_t rep_[max_elements/32] = {}; +}; + +} // namespace debugging_internal +Y_ABSL_NAMESPACE_END +} // namespace y_absl + +#endif // Y_ABSL_DEBUGGING_INTERNAL_BOUNDED_UTF8_LENGTH_SEQUENCE_H_ diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/decode_rust_punycode.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/decode_rust_punycode.cc new file mode 100644 index 0000000000..bb9fe3cb0e --- /dev/null +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/decode_rust_punycode.cc @@ -0,0 +1,258 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "y_absl/debugging/internal/decode_rust_punycode.h" + +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "y_absl/base/config.h" +#include "y_absl/base/nullability.h" +#include "y_absl/debugging/internal/bounded_utf8_length_sequence.h" +#include "y_absl/debugging/internal/utf8_for_code_point.h" + +namespace y_absl { +Y_ABSL_NAMESPACE_BEGIN +namespace debugging_internal { + +namespace { + +// Decoding Punycode requires repeated random-access insertion into a stream of +// variable-length UTF-8 code-point encodings. We need this to be tolerably +// fast (no N^2 slowdown for unfortunate inputs), and we can't allocate any data +// structures on the heap (async-signal-safety). +// +// It is pragmatic to impose a moderately low limit on the identifier length and +// bail out if we ever hit it. Then BoundedUtf8LengthSequence efficiently +// determines where to insert the next code point, and memmove efficiently makes +// room for it. +// +// The chosen limit is a round number several times larger than identifiers +// expected in practice, yet still small enough that a memmove of this many +// UTF-8 characters is not much more expensive than the division and modulus +// operations that Punycode decoding requires. +constexpr uint32_t kMaxChars = 256; + +// Constants from RFC 3492 section 5. +constexpr uint32_t kBase = 36, kTMin = 1, kTMax = 26, kSkew = 38, kDamp = 700; + +constexpr uint32_t kMaxCodePoint = 0x10ffff; + +// Overflow threshold in DecodeRustPunycode's inner loop; see comments there. +constexpr uint32_t kMaxI = 1 << 30; + +// If punycode_begin .. punycode_end begins with a prefix matching the regular +// expression [0-9a-zA-Z_]+_, removes that prefix, copies all but the final +// underscore into out_begin .. out_end, sets num_ascii_chars to the number of +// bytes copied, and returns true. (A prefix of this sort represents the +// nonempty subsequence of ASCII characters in the corresponding plaintext.) +// +// If punycode_begin .. punycode_end does not contain an underscore, sets +// num_ascii_chars to zero and returns true. (The encoding of a plaintext +// without any ASCII characters does not carry such a prefix.) +// +// Returns false and zeroes num_ascii_chars on failure (either parse error or +// not enough space in the output buffer). +bool ConsumeOptionalAsciiPrefix(const char*& punycode_begin, + const char* const punycode_end, + char* const out_begin, + char* const out_end, + uint32_t& num_ascii_chars) { + num_ascii_chars = 0; + + // Remember the last underscore if any. Also use the same string scan to + // reject any ASCII bytes that do not belong in an identifier, including NUL, + // as well as non-ASCII bytes, which should have been delta-encoded instead. + int last_underscore = -1; + for (int i = 0; i < punycode_end - punycode_begin; ++i) { + const char c = punycode_begin[i]; + if (c == '_') { + last_underscore = i; + continue; + } + // We write out the meaning of y_absl::ascii_isalnum rather than call that + // function because its documentation does not promise it will remain + // async-signal-safe under future development. + if ('a' <= c && c <= 'z') continue; + if ('A' <= c && c <= 'Z') continue; + if ('0' <= c && c <= '9') continue; + return false; + } + + // If there was no underscore, that means there were no ASCII characters in + // the plaintext, so there is no prefix to consume. Our work is done. + if (last_underscore < 0) return true; + + // Otherwise there will be an underscore delimiter somewhere. It can't be + // initial because then there would be no ASCII characters to its left, and no + // delimiter would have been added in that case. + if (last_underscore == 0) return false; + + // Any other position is reasonable. Make sure there's room in the buffer. + if (last_underscore + 1 > out_end - out_begin) return false; + + // Consume and write out the ASCII characters. + num_ascii_chars = static_cast<uint32_t>(last_underscore); + std::memcpy(out_begin, punycode_begin, num_ascii_chars); + out_begin[num_ascii_chars] = '\0'; + punycode_begin += num_ascii_chars + 1; + return true; +} + +// Returns the value of `c` as a base-36 digit according to RFC 3492 section 5, +// or -1 if `c` is not such a digit. +int DigitValue(char c) { + if ('0' <= c && c <= '9') return c - '0' + 26; + if ('a' <= c && c <= 'z') return c - 'a'; + if ('A' <= c && c <= 'Z') return c - 'A'; + return -1; +} + +// Consumes the next delta encoding from punycode_begin .. punycode_end, +// updating i accordingly. Returns true on success. Returns false on parse +// failure or arithmetic overflow. +bool ScanNextDelta(const char*& punycode_begin, const char* const punycode_end, + uint32_t bias, uint32_t& i) { + uint64_t w = 1; // 64 bits to prevent overflow in w *= kBase - t + + // "for k = base to infinity in steps of base do begin ... end" in RFC 3492 + // section 6.2. Each loop iteration scans one digit of the delta. + for (uint32_t k = kBase; punycode_begin != punycode_end; k += kBase) { + const int digit_value = DigitValue(*punycode_begin++); + if (digit_value < 0) return false; + + // Compute this in 64-bit arithmetic so we can check for overflow afterward. + const uint64_t new_i = i + static_cast<uint64_t>(digit_value) * w; + + // Valid deltas are bounded by (#chars already emitted) * kMaxCodePoint, but + // invalid input could encode an arbitrarily large delta. Nip that in the + // bud here. + static_assert( + kMaxI >= kMaxChars * kMaxCodePoint, + "kMaxI is too small to prevent spurious failures on good input"); + if (new_i > kMaxI) return false; + + static_assert( + kMaxI < (uint64_t{1} << 32), + "Make kMaxI smaller or i 64 bits wide to prevent silent wraparound"); + i = static_cast<uint32_t>(new_i); + + // Compute the threshold that determines whether this is the last digit and + // (if not) what the next digit's place value will be. This logic from RFC + // 3492 section 6.2 is explained in section 3.3. + uint32_t t; + if (k <= bias + kTMin) { + t = kTMin; + } else if (k >= bias + kTMax) { + t = kTMax; + } else { + t = k - bias; + } + if (static_cast<uint32_t>(digit_value) < t) return true; + + // If this gets too large, the range check on new_i in the next iteration + // will catch it. We know this multiplication will not overwrap because w + // is 64 bits wide. + w *= kBase - t; + } + return false; +} + +} // namespace + +y_absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options) { + const char* punycode_begin = options.punycode_begin; + const char* const punycode_end = options.punycode_end; + char* const out_begin = options.out_begin; + char* const out_end = options.out_end; + + // Write a NUL terminator first. Later memcpy calls will keep bumping it + // along to its new right place. + const size_t out_size = static_cast<size_t>(out_end - out_begin); + if (out_size == 0) return nullptr; + *out_begin = '\0'; + + // RFC 3492 section 6.2 begins here. We retain the names of integer variables + // appearing in that text. + uint32_t n = 128, i = 0, bias = 72, num_chars = 0; + + // If there are any ASCII characters, consume them and their trailing + // underscore delimiter. + if (!ConsumeOptionalAsciiPrefix(punycode_begin, punycode_end, + out_begin, out_end, num_chars)) { + return nullptr; + } + uint32_t total_utf8_bytes = num_chars; + + BoundedUtf8LengthSequence<kMaxChars> utf8_lengths; + + // "while the input is not exhausted do begin ... end" + while (punycode_begin != punycode_end) { + if (num_chars >= kMaxChars) return nullptr; + + const uint32_t old_i = i; + + if (!ScanNextDelta(punycode_begin, punycode_end, bias, i)) return nullptr; + + // Update bias as in RFC 3492 section 6.1. (We have inlined adapt.) + uint32_t delta = i - old_i; + delta /= (old_i == 0 ? kDamp : 2); + delta += delta/(num_chars + 1); + bias = 0; + while (delta > ((kBase - kTMin) * kTMax)/2) { + delta /= kBase - kTMin; + bias += kBase; + } + bias += ((kBase - kTMin + 1) * delta)/(delta + kSkew); + + // Back in section 6.2, compute the new code point and insertion index. + static_assert( + kMaxI + kMaxCodePoint < (uint64_t{1} << 32), + "Make kMaxI smaller or n 64 bits wide to prevent silent wraparound"); + n += i/(num_chars + 1); + i %= num_chars + 1; + + // To actually insert, we need to convert the code point n to UTF-8 and the + // character index i to an index into the byte stream emitted so far. First + // prepare the UTF-8 encoding for n, rejecting surrogates, overlarge values, + // and anything that won't fit into the remaining output storage. + Utf8ForCodePoint utf8_for_code_point(n); + if (!utf8_for_code_point.ok()) return nullptr; + if (total_utf8_bytes + utf8_for_code_point.length + 1 > out_size) { + return nullptr; + } + + // Now insert the new character into both our length map and the output. + uint32_t n_index = + utf8_lengths.InsertAndReturnSumOfPredecessors( + i, utf8_for_code_point.length); + std::memmove( + out_begin + n_index + utf8_for_code_point.length, out_begin + n_index, + total_utf8_bytes + 1 - n_index); + std::memcpy(out_begin + n_index, utf8_for_code_point.bytes, + utf8_for_code_point.length); + total_utf8_bytes += utf8_for_code_point.length; + ++num_chars; + + // Finally, advance to the next state before continuing. + ++i; + } + + return out_begin + total_utf8_bytes; +} + +} // namespace debugging_internal +Y_ABSL_NAMESPACE_END +} // namespace y_absl diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/decode_rust_punycode.h b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/decode_rust_punycode.h new file mode 100644 index 0000000000..b232b4c12f --- /dev/null +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/decode_rust_punycode.h @@ -0,0 +1,55 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef Y_ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_ +#define Y_ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_ + +#include "y_absl/base/config.h" +#include "y_absl/base/nullability.h" + +namespace y_absl { +Y_ABSL_NAMESPACE_BEGIN +namespace debugging_internal { + +struct DecodeRustPunycodeOptions { + const char* punycode_begin; + const char* punycode_end; + char* out_begin; + char* out_end; +}; + +// Given Rust Punycode in `punycode_begin .. punycode_end`, writes the +// corresponding UTF-8 plaintext into `out_begin .. out_end`, followed by a NUL +// character, and returns a pointer to that final NUL on success. On failure +// returns a null pointer, and the contents of `out_begin .. out_end` are +// unspecified. +// +// Failure occurs in precisely these cases: +// - Any input byte does not match [0-9a-zA-Z_]. +// - The first input byte is an underscore, but no other underscore appears in +// the input. +// - The delta sequence does not represent a valid sequence of code-point +// insertions. +// - The plaintext would contain more than 256 code points. +// +// DecodeRustPunycode is async-signal-safe with bounded runtime and a small +// stack footprint, making it suitable for use in demangling Rust symbol names +// from a signal handler. +y_absl::Nullable<char*> DecodeRustPunycode(DecodeRustPunycodeOptions options); + +} // namespace debugging_internal +Y_ABSL_NAMESPACE_END +} // namespace y_absl + +#endif // Y_ABSL_DEBUGGING_INTERNAL_DECODE_RUST_PUNYCODE_H_ diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle.cc index f79c798eef..22848f9861 100644 --- a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle.cc +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle.cc @@ -14,18 +14,19 @@ // For reference check out: // https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling -// -// Note that we only have partial C++11 support yet. #include "y_absl/debugging/internal/demangle.h" +#include <cstddef> #include <cstdint> #include <cstdio> #include <cstdlib> +#include <cstring> #include <limits> #include <util/generic/string.h> #include "y_absl/base/config.h" +#include "y_absl/debugging/internal/demangle_rust.h" #if Y_ABSL_INTERNAL_HAS_CXA_DEMANGLE #include <cxxabi.h> @@ -44,14 +45,16 @@ typedef struct { // List of operators from Itanium C++ ABI. static const AbbrevPair kOperatorList[] = { - // New has special syntax (not currently supported). + // New has special syntax. {"nw", "new", 0}, {"na", "new[]", 0}, - // Works except that the 'gs' prefix is not supported. + // Special-cased elsewhere to support the optional gs prefix. {"dl", "delete", 1}, {"da", "delete[]", 1}, + {"aw", "co_await", 1}, + {"ps", "+", 1}, // "positive" {"ng", "-", 1}, // "negative" {"ad", "&", 1}, // "address-of" @@ -79,6 +82,7 @@ static const AbbrevPair kOperatorList[] = { {"rs", ">>", 2}, {"lS", "<<=", 2}, {"rS", ">>=", 2}, + {"ss", "<=>", 2}, {"eq", "==", 2}, {"ne", "!=", 2}, {"lt", "<", 2}, @@ -98,6 +102,7 @@ static const AbbrevPair kOperatorList[] = { {"qu", "?", 3}, {"st", "sizeof", 0}, // Special syntax {"sz", "sizeof", 1}, // Not a real operator name, but used in expressions. + {"sZ", "sizeof...", 0}, // Special syntax {nullptr, nullptr, 0}, }; @@ -187,9 +192,50 @@ typedef struct { int recursion_depth; // For stack exhaustion prevention. int steps; // Cap how much work we'll do, regardless of depth. ParseState parse_state; // Backtrackable state copied for most frames. + + // Conditionally compiled support for marking the position of the first + // construct Demangle couldn't parse. This preprocessor symbol is intended + // for use by Abseil demangler maintainers only; its behavior is not part of + // Abseil's public interface. +#ifdef Y_ABSL_INTERNAL_DEMANGLE_RECORDS_HIGH_WATER_MARK + int high_water_mark; // Input position where parsing failed. + bool too_complex; // True if any guard.IsTooComplex() call returned true. +#endif } State; namespace { + +#ifdef Y_ABSL_INTERNAL_DEMANGLE_RECORDS_HIGH_WATER_MARK +void UpdateHighWaterMark(State *state) { + if (state->high_water_mark < state->parse_state.mangled_idx) { + state->high_water_mark = state->parse_state.mangled_idx; + } +} + +void ReportHighWaterMark(State *state) { + // Write out the mangled name with the trouble point marked, provided that the + // output buffer is large enough and the mangled name did not hit a complexity + // limit (in which case the high water mark wouldn't point out an unparsable + // construct, only the point where a budget ran out). + const size_t input_length = std::strlen(state->mangled_begin); + if (input_length + 6 > static_cast<size_t>(state->out_end_idx) || + state->too_complex) { + if (state->out_end_idx > 0) state->out[0] = '\0'; + return; + } + const size_t high_water_mark = static_cast<size_t>(state->high_water_mark); + std::memcpy(state->out, state->mangled_begin, high_water_mark); + std::memcpy(state->out + high_water_mark, "--!--", 5); + std::memcpy(state->out + high_water_mark + 5, + state->mangled_begin + high_water_mark, + input_length - high_water_mark); + state->out[input_length + 5] = '\0'; +} +#else +void UpdateHighWaterMark(State *) {} +void ReportHighWaterMark(State *) {} +#endif + // Prevent deep recursion / stack exhaustion. // Also prevent unbounded handling of complex inputs. class ComplexityGuard { @@ -201,7 +247,7 @@ class ComplexityGuard { ~ComplexityGuard() { --state_->recursion_depth; } // 256 levels of recursion seems like a reasonable upper limit on depth. - // 128 is not enough to demagle synthetic tests from demangle_unittest.txt: + // 128 is not enough to demangle synthetic tests from demangle_unittest.txt: // "_ZaaZZZZ..." and "_ZaaZcvZcvZ..." static constexpr int kRecursionDepthLimit = 256; @@ -222,8 +268,14 @@ class ComplexityGuard { static constexpr int kParseStepsLimit = 1 << 17; bool IsTooComplex() const { - return state_->recursion_depth > kRecursionDepthLimit || - state_->steps > kParseStepsLimit; + if (state_->recursion_depth > kRecursionDepthLimit || + state_->steps > kParseStepsLimit) { +#ifdef Y_ABSL_INTERNAL_DEMANGLE_RECORDS_HIGH_WATER_MARK + state_->too_complex = true; +#endif + return true; + } + return false; } private: @@ -270,6 +322,10 @@ static void InitState(State* state, state->out_end_idx = static_cast<int>(out_size); state->recursion_depth = 0; state->steps = 0; +#ifdef Y_ABSL_INTERNAL_DEMANGLE_RECORDS_HIGH_WATER_MARK + state->high_water_mark = 0; + state->too_complex = false; +#endif state->parse_state.mangled_idx = 0; state->parse_state.out_cur_idx = 0; @@ -291,13 +347,14 @@ static bool ParseOneCharToken(State *state, const char one_char_token) { if (guard.IsTooComplex()) return false; if (RemainingInput(state)[0] == one_char_token) { ++state->parse_state.mangled_idx; + UpdateHighWaterMark(state); return true; } return false; } -// Returns true and advances "mangled_cur" if we find "two_char_token" -// at "mangled_cur" position. It is assumed that "two_char_token" does +// Returns true and advances "mangled_idx" if we find "two_char_token" +// at "mangled_idx" position. It is assumed that "two_char_token" does // not contain '\0'. static bool ParseTwoCharToken(State *state, const char *two_char_token) { ComplexityGuard guard(state); @@ -305,11 +362,45 @@ static bool ParseTwoCharToken(State *state, const char *two_char_token) { if (RemainingInput(state)[0] == two_char_token[0] && RemainingInput(state)[1] == two_char_token[1]) { state->parse_state.mangled_idx += 2; + UpdateHighWaterMark(state); return true; } return false; } +// Returns true and advances "mangled_idx" if we find "three_char_token" +// at "mangled_idx" position. It is assumed that "three_char_token" does +// not contain '\0'. +static bool ParseThreeCharToken(State *state, const char *three_char_token) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + if (RemainingInput(state)[0] == three_char_token[0] && + RemainingInput(state)[1] == three_char_token[1] && + RemainingInput(state)[2] == three_char_token[2]) { + state->parse_state.mangled_idx += 3; + UpdateHighWaterMark(state); + return true; + } + return false; +} + +// Returns true and advances "mangled_idx" if we find a copy of the +// NUL-terminated string "long_token" at "mangled_idx" position. +static bool ParseLongToken(State *state, const char *long_token) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + int i = 0; + for (; long_token[i] != '\0'; ++i) { + // Note that we cannot run off the end of the NUL-terminated input here. + // Inside the loop body, long_token[i] is known to be different from NUL. + // So if we read the NUL on the end of the input here, we return at once. + if (RemainingInput(state)[i] != long_token[i]) return false; + } + state->parse_state.mangled_idx += i; + UpdateHighWaterMark(state); + return true; +} + // Returns true and advances "mangled_cur" if we find any character in // "char_class" at "mangled_cur" position. static bool ParseCharClass(State *state, const char *char_class) { @@ -322,6 +413,7 @@ static bool ParseCharClass(State *state, const char *char_class) { for (; *p != '\0'; ++p) { if (RemainingInput(state)[0] == *p) { ++state->parse_state.mangled_idx; + UpdateHighWaterMark(state); return true; } } @@ -554,6 +646,7 @@ static bool ParseFloatNumber(State *state); static bool ParseSeqId(State *state); static bool ParseIdentifier(State *state, size_t length); static bool ParseOperatorName(State *state, int *arity); +static bool ParseConversionOperatorType(State *state); static bool ParseSpecialName(State *state); static bool ParseCallOffset(State *state); static bool ParseNVOffset(State *state); @@ -563,21 +656,33 @@ static bool ParseCtorDtorName(State *state); static bool ParseDecltype(State *state); static bool ParseType(State *state); static bool ParseCVQualifiers(State *state); +static bool ParseExtendedQualifier(State *state); static bool ParseBuiltinType(State *state); +static bool ParseVendorExtendedType(State *state); static bool ParseFunctionType(State *state); static bool ParseBareFunctionType(State *state); +static bool ParseOverloadAttribute(State *state); static bool ParseClassEnumType(State *state); static bool ParseArrayType(State *state); static bool ParsePointerToMemberType(State *state); static bool ParseTemplateParam(State *state); +static bool ParseTemplateParamDecl(State *state); static bool ParseTemplateTemplateParam(State *state); static bool ParseTemplateArgs(State *state); static bool ParseTemplateArg(State *state); static bool ParseBaseUnresolvedName(State *state); static bool ParseUnresolvedName(State *state); +static bool ParseUnresolvedQualifierLevel(State *state); +static bool ParseUnionSelector(State* state); +static bool ParseFunctionParam(State* state); +static bool ParseBracedExpression(State *state); static bool ParseExpression(State *state); +static bool ParseInitializer(State *state); static bool ParseExprPrimary(State *state); -static bool ParseExprCastValue(State *state); +static bool ParseExprCastValueAndTrailingE(State *state); +static bool ParseQRequiresClauseExpr(State *state); +static bool ParseRequirement(State *state); +static bool ParseTypeConstraint(State *state); static bool ParseLocalName(State *state); static bool ParseLocalNameSuffix(State *state); static bool ParseDiscriminator(State *state); @@ -622,22 +727,34 @@ static bool ParseMangledName(State *state) { } // <encoding> ::= <(function) name> <bare-function-type> +// [`Q` <requires-clause expr>] // ::= <(data) name> // ::= <special-name> +// +// NOTE: Based on http://shortn/_Hoq9qG83rx static bool ParseEncoding(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; - // Implementing the first two productions together as <name> - // [<bare-function-type>] avoids exponential blowup of backtracking. + // Since the first two productions both start with <name>, attempt + // to parse it only once to avoid exponential blowup of backtracking. // - // Since Optional(...) can't fail, there's no need to copy the state for - // backtracking. - if (ParseName(state) && Optional(ParseBareFunctionType(state))) { + // We're careful about exponential blowup because <encoding> recursively + // appears in other productions downstream of its first two productions, + // which means that every call to `ParseName` would possibly indirectly + // result in two calls to `ParseName` etc. + if (ParseName(state)) { + if (!ParseBareFunctionType(state)) { + return true; // <(data) name> + } + + // Parsed: <(function) name> <bare-function-type> + // Pending: [`Q` <requires-clause expr>] + ParseQRequiresClauseExpr(state); // restores state on failure return true; } if (ParseSpecialName(state)) { - return true; + return true; // <special-name> } return false; } @@ -723,19 +840,26 @@ static bool ParseNestedName(State *state) { // <prefix> ::= <prefix> <unqualified-name> // ::= <template-prefix> <template-args> // ::= <template-param> +// ::= <decltype> // ::= <substitution> // ::= # empty // <template-prefix> ::= <prefix> <(template) unqualified-name> // ::= <template-param> // ::= <substitution> +// ::= <vendor-extended-type> static bool ParsePrefix(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; bool has_something = false; while (true) { MaybeAppendSeparator(state); - if (ParseTemplateParam(state) || + if (ParseTemplateParam(state) || ParseDecltype(state) || ParseSubstitution(state, /*accept_std=*/true) || + // Although the official grammar does not mention it, nested-names + // shaped like Nu14__some_builtinIiE6memberE occur in practice, and it + // is not clear what else a compiler is supposed to do when a + // vendor-extended type has named members. + ParseVendorExtendedType(state) || ParseUnscopedName(state) || (ParseOneCharToken(state, 'M') && ParseUnnamedTypeName(state))) { has_something = true; @@ -757,8 +881,14 @@ static bool ParsePrefix(State *state) { // ::= <source-name> [<abi-tags>] // ::= <local-source-name> [<abi-tags>] // ::= <unnamed-type-name> [<abi-tags>] +// ::= DC <source-name>+ E # C++17 structured binding +// ::= F <source-name> # C++20 constrained friend +// ::= F <operator-name> # C++20 constrained friend // // <local-source-name> is a GCC extension; see below. +// +// For the F notation for constrained friends, see +// https://github.com/itanium-cxx-abi/cxx-abi/issues/24#issuecomment-1491130332. static bool ParseUnqualifiedName(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; @@ -767,6 +897,23 @@ static bool ParseUnqualifiedName(State *state) { ParseUnnamedTypeName(state)) { return ParseAbiTags(state); } + + // DC <source-name>+ E + ParseState copy = state->parse_state; + if (ParseTwoCharToken(state, "DC") && OneOrMore(ParseSourceName, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + + // F <source-name> + // F <operator-name> + if (ParseOneCharToken(state, 'F') && MaybeAppend(state, "friend ") && + (ParseSourceName(state) || ParseOperatorName(state, nullptr))) { + return true; + } + state->parse_state = copy; + return false; } @@ -824,7 +971,11 @@ static bool ParseLocalSourceName(State *state) { // <unnamed-type-name> ::= Ut [<(nonnegative) number>] _ // ::= <closure-type-name> // <closure-type-name> ::= Ul <lambda-sig> E [<(nonnegative) number>] _ -// <lambda-sig> ::= <(parameter) type>+ +// <lambda-sig> ::= <template-param-decl>* <(parameter) type>+ +// +// For <template-param-decl>* in <lambda-sig> see: +// +// https://github.com/itanium-cxx-abi/cxx-abi/issues/31 static bool ParseUnnamedTypeName(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; @@ -847,6 +998,7 @@ static bool ParseUnnamedTypeName(State *state) { // Closure type. which = -1; if (ParseTwoCharToken(state, "Ul") && DisableAppend(state) && + ZeroOrMore(ParseTemplateParamDecl, state) && OneOrMore(ParseType, state) && RestoreAppend(state, copy.append) && ParseOneCharToken(state, 'E') && Optional(ParseNumber(state, &which)) && which <= std::numeric_limits<int>::max() - 2 && // Don't overflow. @@ -888,6 +1040,7 @@ static bool ParseNumber(State *state, int *number_out) { } if (p != RemainingInput(state)) { // Conversion succeeded. state->parse_state.mangled_idx += p - RemainingInput(state); + UpdateHighWaterMark(state); if (number_out != nullptr) { // Note: possibly truncate "number". *number_out = static_cast<int>(number); @@ -910,6 +1063,7 @@ static bool ParseFloatNumber(State *state) { } if (p != RemainingInput(state)) { // Conversion succeeded. state->parse_state.mangled_idx += p - RemainingInput(state); + UpdateHighWaterMark(state); return true; } return false; @@ -928,6 +1082,7 @@ static bool ParseSeqId(State *state) { } if (p != RemainingInput(state)) { // Conversion succeeded. state->parse_state.mangled_idx += p - RemainingInput(state); + UpdateHighWaterMark(state); return true; } return false; @@ -946,11 +1101,13 @@ static bool ParseIdentifier(State *state, size_t length) { MaybeAppendWithLength(state, RemainingInput(state), length); } state->parse_state.mangled_idx += length; + UpdateHighWaterMark(state); return true; } // <operator-name> ::= nw, and other two letters cases // ::= cv <type> # (cast) +// ::= li <source-name> # C++11 user-defined literal // ::= v <digit> <source-name> # vendor extended operator static bool ParseOperatorName(State *state, int *arity) { ComplexityGuard guard(state); @@ -961,7 +1118,7 @@ static bool ParseOperatorName(State *state, int *arity) { // First check with "cv" (cast) case. ParseState copy = state->parse_state; if (ParseTwoCharToken(state, "cv") && MaybeAppend(state, "operator ") && - EnterNestedName(state) && ParseType(state) && + EnterNestedName(state) && ParseConversionOperatorType(state) && LeaveNestedName(state, copy.nest_level)) { if (arity != nullptr) { *arity = 1; @@ -970,6 +1127,13 @@ static bool ParseOperatorName(State *state, int *arity) { } state->parse_state = copy; + // Then user-defined literals. + if (ParseTwoCharToken(state, "li") && MaybeAppend(state, "operator\"\" ") && + ParseSourceName(state)) { + return true; + } + state->parse_state = copy; + // Then vendor extended operators. if (ParseOneCharToken(state, 'v') && ParseDigit(state, arity) && ParseSourceName(state)) { @@ -997,36 +1161,120 @@ static bool ParseOperatorName(State *state, int *arity) { } MaybeAppend(state, p->real_name); state->parse_state.mangled_idx += 2; + UpdateHighWaterMark(state); return true; } } return false; } +// <operator-name> ::= cv <type> # (cast) +// +// The name of a conversion operator is the one place where cv-qualifiers, *, &, +// and other simple type combinators are expected to appear in our stripped-down +// demangling (elsewhere they appear in function signatures or template +// arguments, which we omit from the output). We make reasonable efforts to +// render simple cases accurately. +static bool ParseConversionOperatorType(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + ParseState copy = state->parse_state; + + // Scan pointers, const, and other easy mangling prefixes with postfix + // demanglings. Remember the range of input for later rescanning. + // + // See `ParseType` and the `switch` below for the meaning of each char. + const char* begin_simple_prefixes = RemainingInput(state); + while (ParseCharClass(state, "OPRCGrVK")) {} + const char* end_simple_prefixes = RemainingInput(state); + + // Emit the base type first. + if (!ParseType(state)) { + state->parse_state = copy; + return false; + } + + // Then rescan the easy type combinators in reverse order to emit their + // demanglings in the expected output order. + while (begin_simple_prefixes != end_simple_prefixes) { + switch (*--end_simple_prefixes) { + case 'P': + MaybeAppend(state, "*"); + break; + case 'R': + MaybeAppend(state, "&"); + break; + case 'O': + MaybeAppend(state, "&&"); + break; + case 'C': + MaybeAppend(state, " _Complex"); + break; + case 'G': + MaybeAppend(state, " _Imaginary"); + break; + case 'r': + MaybeAppend(state, " restrict"); + break; + case 'V': + MaybeAppend(state, " volatile"); + break; + case 'K': + MaybeAppend(state, " const"); + break; + } + } + return true; +} + // <special-name> ::= TV <type> // ::= TT <type> // ::= TI <type> // ::= TS <type> -// ::= TH <type> # thread-local +// ::= TW <name> # thread-local wrapper +// ::= TH <name> # thread-local initialization // ::= Tc <call-offset> <call-offset> <(base) encoding> // ::= GV <(object) name> +// ::= GR <(object) name> [<seq-id>] _ // ::= T <call-offset> <(base) encoding> +// ::= GTt <encoding> # transaction-safe entry point +// ::= TA <template-arg> # nontype template parameter object // G++ extensions: // ::= TC <type> <(offset) number> _ <(base) type> // ::= TF <type> // ::= TJ <type> -// ::= GR <name> +// ::= GR <name> # without final _, perhaps an earlier form? // ::= GA <encoding> // ::= Th <call-offset> <(base) encoding> // ::= Tv <call-offset> <(base) encoding> // -// Note: we don't care much about them since they don't appear in -// stack traces. The are special data. +// Note: Most of these are special data, not functions that occur in stack +// traces. Exceptions are TW and TH, which denote functions supporting the +// thread_local feature. For these see: +// +// https://maskray.me/blog/2021-02-14-all-about-thread-local-storage +// +// For TA see https://github.com/itanium-cxx-abi/cxx-abi/issues/63. static bool ParseSpecialName(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; ParseState copy = state->parse_state; - if (ParseOneCharToken(state, 'T') && ParseCharClass(state, "VTISH") && + + if (ParseTwoCharToken(state, "TW")) { + MaybeAppend(state, "thread-local wrapper routine for "); + if (ParseName(state)) return true; + state->parse_state = copy; + return false; + } + + if (ParseTwoCharToken(state, "TH")) { + MaybeAppend(state, "thread-local initialization routine for "); + if (ParseName(state)) return true; + state->parse_state = copy; + return false; + } + + if (ParseOneCharToken(state, 'T') && ParseCharClass(state, "VTIS") && ParseType(state)) { return true; } @@ -1064,21 +1312,51 @@ static bool ParseSpecialName(State *state) { } state->parse_state = copy; - if (ParseTwoCharToken(state, "GR") && ParseName(state)) { + // <special-name> ::= GR <(object) name> [<seq-id>] _ # modern standard + // ::= GR <(object) name> # also recognized + if (ParseTwoCharToken(state, "GR")) { + MaybeAppend(state, "reference temporary for "); + if (!ParseName(state)) { + state->parse_state = copy; + return false; + } + const bool has_seq_id = ParseSeqId(state); + const bool has_underscore = ParseOneCharToken(state, '_'); + if (has_seq_id && !has_underscore) { + state->parse_state = copy; + return false; + } return true; } - state->parse_state = copy; if (ParseTwoCharToken(state, "GA") && ParseEncoding(state)) { return true; } state->parse_state = copy; + if (ParseThreeCharToken(state, "GTt") && + MaybeAppend(state, "transaction clone for ") && ParseEncoding(state)) { + return true; + } + state->parse_state = copy; + if (ParseOneCharToken(state, 'T') && ParseCharClass(state, "hv") && ParseCallOffset(state) && ParseEncoding(state)) { return true; } state->parse_state = copy; + + if (ParseTwoCharToken(state, "TA")) { + bool append = state->parse_state.append; + DisableAppend(state); + if (ParseTemplateArg(state)) { + RestoreAppend(state, append); + MaybeAppend(state, "template parameter object"); + return true; + } + } + state->parse_state = copy; + return false; } @@ -1182,7 +1460,6 @@ static bool ParseDecltype(State *state) { // ::= O <type> # rvalue reference-to (C++0x) // ::= C <type> # complex pair (C 2000) // ::= G <type> # imaginary (C 2000) -// ::= U <source-name> <type> # vendor extended type qualifier // ::= <builtin-type> // ::= <function-type> // ::= <class-enum-type> # note: just an alias for <name> @@ -1193,7 +1470,9 @@ static bool ParseDecltype(State *state) { // ::= <decltype> // ::= <substitution> // ::= Dp <type> # pack expansion of (C++0x) -// ::= Dv <num-elems> _ # GNU vector extension +// ::= Dv <(elements) number> _ <type> # GNU vector extension +// ::= Dv <(bytes) expression> _ <type> +// ::= Dk <type-constraint> # constrained auto // static bool ParseType(State *state) { ComplexityGuard guard(state); @@ -1236,12 +1515,6 @@ static bool ParseType(State *state) { } state->parse_state = copy; - if (ParseOneCharToken(state, 'U') && ParseSourceName(state) && - ParseType(state)) { - return true; - } - state->parse_state = copy; - if (ParseBuiltinType(state) || ParseFunctionType(state) || ParseClassEnumType(state) || ParseArrayType(state) || ParsePointerToMemberType(state) || ParseDecltype(state) || @@ -1260,54 +1533,160 @@ static bool ParseType(State *state) { return true; } + // GNU vector extension Dv <number> _ <type> if (ParseTwoCharToken(state, "Dv") && ParseNumber(state, nullptr) && - ParseOneCharToken(state, '_')) { + ParseOneCharToken(state, '_') && ParseType(state)) { return true; } state->parse_state = copy; - return false; + // GNU vector extension Dv <expression> _ <type> + if (ParseTwoCharToken(state, "Dv") && ParseExpression(state) && + ParseOneCharToken(state, '_') && ParseType(state)) { + return true; + } + state->parse_state = copy; + + if (ParseTwoCharToken(state, "Dk") && ParseTypeConstraint(state)) { + return true; + } + state->parse_state = copy; + + // For this notation see CXXNameMangler::mangleType in Clang's source code. + // The relevant logic and its comment "not clear how to mangle this!" date + // from 2011, so it may be with us awhile. + return ParseLongToken(state, "_SUBSTPACK_"); } +// <qualifiers> ::= <extended-qualifier>* <CV-qualifiers> // <CV-qualifiers> ::= [r] [V] [K] +// // We don't allow empty <CV-qualifiers> to avoid infinite loop in // ParseType(). static bool ParseCVQualifiers(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; int num_cv_qualifiers = 0; + while (ParseExtendedQualifier(state)) ++num_cv_qualifiers; num_cv_qualifiers += ParseOneCharToken(state, 'r'); num_cv_qualifiers += ParseOneCharToken(state, 'V'); num_cv_qualifiers += ParseOneCharToken(state, 'K'); return num_cv_qualifiers > 0; } +// <extended-qualifier> ::= U <source-name> [<template-args>] +static bool ParseExtendedQualifier(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + ParseState copy = state->parse_state; + + if (!ParseOneCharToken(state, 'U')) return false; + + bool append = state->parse_state.append; + DisableAppend(state); + if (!ParseSourceName(state)) { + state->parse_state = copy; + return false; + } + Optional(ParseTemplateArgs(state)); + RestoreAppend(state, append); + return true; +} + // <builtin-type> ::= v, etc. # single-character builtin types -// ::= u <source-name> +// ::= <vendor-extended-type> // ::= Dd, etc. # two-character builtin types +// ::= DB (<number> | <expression>) _ # _BitInt(N) +// ::= DU (<number> | <expression>) _ # unsigned _BitInt(N) +// ::= DF <number> _ # _FloatN (N bits) +// ::= DF <number> x # _FloatNx +// ::= DF16b # std::bfloat16_t // // Not supported: -// ::= DF <number> _ # _FloatN (N bits) -// +// ::= [DS] DA <fixed-point-size> +// ::= [DS] DR <fixed-point-size> +// because real implementations of N1169 fixed-point are scant. static bool ParseBuiltinType(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; - const AbbrevPair *p; - for (p = kBuiltinTypeList; p->abbrev != nullptr; ++p) { + ParseState copy = state->parse_state; + + // DB (<number> | <expression>) _ # _BitInt(N) + // DU (<number> | <expression>) _ # unsigned _BitInt(N) + if (ParseTwoCharToken(state, "DB") || + (ParseTwoCharToken(state, "DU") && MaybeAppend(state, "unsigned "))) { + bool append = state->parse_state.append; + DisableAppend(state); + int number = -1; + if (!ParseNumber(state, &number) && !ParseExpression(state)) { + state->parse_state = copy; + return false; + } + RestoreAppend(state, append); + + if (!ParseOneCharToken(state, '_')) { + state->parse_state = copy; + return false; + } + + MaybeAppend(state, "_BitInt("); + if (number >= 0) { + MaybeAppendDecimal(state, number); + } else { + MaybeAppend(state, "?"); // the best we can do for dependent sizes + } + MaybeAppend(state, ")"); + return true; + } + + // DF <number> _ # _FloatN + // DF <number> x # _FloatNx + // DF16b # std::bfloat16_t + if (ParseTwoCharToken(state, "DF")) { + if (ParseThreeCharToken(state, "16b")) { + MaybeAppend(state, "std::bfloat16_t"); + return true; + } + int number = 0; + if (!ParseNumber(state, &number)) { + state->parse_state = copy; + return false; + } + MaybeAppend(state, "_Float"); + MaybeAppendDecimal(state, number); + if (ParseOneCharToken(state, 'x')) { + MaybeAppend(state, "x"); + return true; + } + if (ParseOneCharToken(state, '_')) return true; + state->parse_state = copy; + return false; + } + + for (const AbbrevPair *p = kBuiltinTypeList; p->abbrev != nullptr; ++p) { // Guaranteed only 1- or 2-character strings in kBuiltinTypeList. if (p->abbrev[1] == '\0') { if (ParseOneCharToken(state, p->abbrev[0])) { MaybeAppend(state, p->real_name); - return true; + return true; // ::= v, etc. # single-character builtin types } } else if (p->abbrev[2] == '\0' && ParseTwoCharToken(state, p->abbrev)) { MaybeAppend(state, p->real_name); - return true; + return true; // ::= Dd, etc. # two-character builtin types } } + return ParseVendorExtendedType(state); +} + +// <vendor-extended-type> ::= u <source-name> [<template-args>] +static bool ParseVendorExtendedType(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + ParseState copy = state->parse_state; - if (ParseOneCharToken(state, 'u') && ParseSourceName(state)) { + if (ParseOneCharToken(state, 'u') && ParseSourceName(state) && + Optional(ParseTemplateArgs(state))) { return true; } state->parse_state = copy; @@ -1342,28 +1721,44 @@ static bool ParseExceptionSpec(State *state) { return false; } -// <function-type> ::= [exception-spec] F [Y] <bare-function-type> [O] E +// <function-type> ::= +// [exception-spec] [Dx] F [Y] <bare-function-type> [<ref-qualifier>] E +// +// <ref-qualifier> ::= R | O static bool ParseFunctionType(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; ParseState copy = state->parse_state; - if (Optional(ParseExceptionSpec(state)) && ParseOneCharToken(state, 'F') && - Optional(ParseOneCharToken(state, 'Y')) && ParseBareFunctionType(state) && - Optional(ParseOneCharToken(state, 'O')) && - ParseOneCharToken(state, 'E')) { - return true; + Optional(ParseExceptionSpec(state)); + Optional(ParseTwoCharToken(state, "Dx")); + if (!ParseOneCharToken(state, 'F')) { + state->parse_state = copy; + return false; } - state->parse_state = copy; - return false; + Optional(ParseOneCharToken(state, 'Y')); + if (!ParseBareFunctionType(state)) { + state->parse_state = copy; + return false; + } + Optional(ParseCharClass(state, "RO")); + if (!ParseOneCharToken(state, 'E')) { + state->parse_state = copy; + return false; + } + return true; } -// <bare-function-type> ::= <(signature) type>+ +// <bare-function-type> ::= <overload-attribute>* <(signature) type>+ +// +// The <overload-attribute>* prefix is nonstandard; see the comment on +// ParseOverloadAttribute. static bool ParseBareFunctionType(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; ParseState copy = state->parse_state; DisableAppend(state); - if (OneOrMore(ParseType, state)) { + if (ZeroOrMore(ParseOverloadAttribute, state) && + OneOrMore(ParseType, state)) { RestoreAppend(state, copy.append); MaybeAppend(state, "()"); return true; @@ -1372,11 +1767,43 @@ static bool ParseBareFunctionType(State *state) { return false; } +// <overload-attribute> ::= Ua <name> +// +// The nonstandard <overload-attribute> production is sufficient to accept the +// current implementation of __attribute__((enable_if(condition, "message"))) +// and future attributes of a similar shape. See +// https://clang.llvm.org/docs/AttributeReference.html#enable-if and the +// definition of CXXNameMangler::mangleFunctionEncodingBareType in Clang's +// source code. +static bool ParseOverloadAttribute(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + ParseState copy = state->parse_state; + if (ParseTwoCharToken(state, "Ua") && ParseName(state)) { + return true; + } + state->parse_state = copy; + return false; +} + // <class-enum-type> ::= <name> +// ::= Ts <name> # struct Name or class Name +// ::= Tu <name> # union Name +// ::= Te <name> # enum Name +// +// See http://shortn/_W3YrltiEd0. static bool ParseClassEnumType(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; - return ParseName(state); + ParseState copy = state->parse_state; + if (Optional(ParseTwoCharToken(state, "Ts") || + ParseTwoCharToken(state, "Tu") || + ParseTwoCharToken(state, "Te")) && + ParseName(state)) { + return true; + } + state->parse_state = copy; + return false; } // <array-type> ::= A <(positive dimension) number> _ <(element) type> @@ -1413,21 +1840,83 @@ static bool ParsePointerToMemberType(State *state) { // <template-param> ::= T_ // ::= T <parameter-2 non-negative number> _ +// ::= TL <level-1> __ +// ::= TL <level-1> _ <parameter-2 non-negative number> _ static bool ParseTemplateParam(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; if (ParseTwoCharToken(state, "T_")) { MaybeAppend(state, "?"); // We don't support template substitutions. - return true; + return true; // ::= T_ } ParseState copy = state->parse_state; if (ParseOneCharToken(state, 'T') && ParseNumber(state, nullptr) && ParseOneCharToken(state, '_')) { MaybeAppend(state, "?"); // We don't support template substitutions. + return true; // ::= T <parameter-2 non-negative number> _ + } + state->parse_state = copy; + + if (ParseTwoCharToken(state, "TL") && ParseNumber(state, nullptr)) { + if (ParseTwoCharToken(state, "__")) { + MaybeAppend(state, "?"); // We don't support template substitutions. + return true; // ::= TL <level-1> __ + } + + if (ParseOneCharToken(state, '_') && ParseNumber(state, nullptr) && + ParseOneCharToken(state, '_')) { + MaybeAppend(state, "?"); // We don't support template substitutions. + return true; // ::= TL <level-1> _ <parameter-2 non-negative number> _ + } + } + state->parse_state = copy; + return false; +} + +// <template-param-decl> +// ::= Ty # template type parameter +// ::= Tk <concept name> [<template-args>] # constrained type parameter +// ::= Tn <type> # template non-type parameter +// ::= Tt <template-param-decl>* E # template template parameter +// ::= Tp <template-param-decl> # template parameter pack +// +// NOTE: <concept name> is just a <name>: http://shortn/_MqJVyr0fc1 +// TODO(b/324066279): Implement optional suffix for `Tt`: +// [Q <requires-clause expr>] +static bool ParseTemplateParamDecl(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + ParseState copy = state->parse_state; + + if (ParseTwoCharToken(state, "Ty")) { + return true; + } + state->parse_state = copy; + + if (ParseTwoCharToken(state, "Tk") && ParseName(state) && + Optional(ParseTemplateArgs(state))) { + return true; + } + state->parse_state = copy; + + if (ParseTwoCharToken(state, "Tn") && ParseType(state)) { return true; } state->parse_state = copy; + + if (ParseTwoCharToken(state, "Tt") && + ZeroOrMore(ParseTemplateParamDecl, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + + if (ParseTwoCharToken(state, "Tp") && ParseTemplateParamDecl(state)) { + return true; + } + state->parse_state = copy; + return false; } @@ -1441,13 +1930,14 @@ static bool ParseTemplateTemplateParam(State *state) { ParseSubstitution(state, /*accept_std=*/false)); } -// <template-args> ::= I <template-arg>+ E +// <template-args> ::= I <template-arg>+ [Q <requires-clause expr>] E static bool ParseTemplateArgs(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; ParseState copy = state->parse_state; DisableAppend(state); if (ParseOneCharToken(state, 'I') && OneOrMore(ParseTemplateArg, state) && + Optional(ParseQRequiresClauseExpr(state)) && ParseOneCharToken(state, 'E')) { RestoreAppend(state, copy.append); MaybeAppend(state, "<>"); @@ -1457,7 +1947,8 @@ static bool ParseTemplateArgs(State *state) { return false; } -// <template-arg> ::= <type> +// <template-arg> ::= <template-param-decl> <template-arg> +// ::= <type> // ::= <expr-primary> // ::= J <template-arg>* E # argument pack // ::= X <expression> E @@ -1541,7 +2032,7 @@ static bool ParseTemplateArg(State *state) { // ::= L <source-name> [<template-args>] [<expr-cast-value> E] if (ParseLocalSourceName(state) && Optional(ParseTemplateArgs(state))) { copy = state->parse_state; - if (ParseExprCastValue(state) && ParseOneCharToken(state, 'E')) { + if (ParseExprCastValueAndTrailingE(state)) { return true; } state->parse_state = copy; @@ -1560,6 +2051,12 @@ static bool ParseTemplateArg(State *state) { return true; } state->parse_state = copy; + + if (ParseTemplateParamDecl(state) && ParseTemplateArg(state)) { + return true; + } + state->parse_state = copy; + return false; } @@ -1614,6 +2111,13 @@ static bool ParseBaseUnresolvedName(State *state) { // <base-unresolved-name> // ::= [gs] sr <unresolved-qualifier-level>+ E // <base-unresolved-name> +// ::= sr St <simple-id> <simple-id> # nonstandard +// +// The last case is not part of the official grammar but has been observed in +// real-world examples that the GNU demangler (but not the LLVM demangler) is +// able to decode; see demangle_test.cc for one such symbol name. The shape +// sr St <simple-id> <simple-id> was inferred by closed-box testing of the GNU +// demangler. static bool ParseUnresolvedName(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; @@ -1633,7 +2137,7 @@ static bool ParseUnresolvedName(State *state) { if (ParseTwoCharToken(state, "sr") && ParseOneCharToken(state, 'N') && ParseUnresolvedType(state) && - OneOrMore(/* <unresolved-qualifier-level> ::= */ ParseSimpleId, state) && + OneOrMore(ParseUnresolvedQualifierLevel, state) && ParseOneCharToken(state, 'E') && ParseBaseUnresolvedName(state)) { return true; } @@ -1641,35 +2145,160 @@ static bool ParseUnresolvedName(State *state) { if (Optional(ParseTwoCharToken(state, "gs")) && ParseTwoCharToken(state, "sr") && - OneOrMore(/* <unresolved-qualifier-level> ::= */ ParseSimpleId, state) && + OneOrMore(ParseUnresolvedQualifierLevel, state) && ParseOneCharToken(state, 'E') && ParseBaseUnresolvedName(state)) { return true; } state->parse_state = copy; + if (ParseTwoCharToken(state, "sr") && ParseTwoCharToken(state, "St") && + ParseSimpleId(state) && ParseSimpleId(state)) { + return true; + } + state->parse_state = copy; + return false; } +// <unresolved-qualifier-level> ::= <simple-id> +// ::= <substitution> <template-args> +// +// The production <substitution> <template-args> is nonstandard but is observed +// in practice. An upstream discussion on the best shape of <unresolved-name> +// has not converged: +// +// https://github.com/itanium-cxx-abi/cxx-abi/issues/38 +static bool ParseUnresolvedQualifierLevel(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + + if (ParseSimpleId(state)) return true; + + ParseState copy = state->parse_state; + if (ParseSubstitution(state, /*accept_std=*/false) && + ParseTemplateArgs(state)) { + return true; + } + state->parse_state = copy; + return false; +} + +// <union-selector> ::= _ [<number>] +// +// https://github.com/itanium-cxx-abi/cxx-abi/issues/47 +static bool ParseUnionSelector(State *state) { + return ParseOneCharToken(state, '_') && Optional(ParseNumber(state, nullptr)); +} + +// <function-param> ::= fp <(top-level) CV-qualifiers> _ +// ::= fp <(top-level) CV-qualifiers> <number> _ +// ::= fL <number> p <(top-level) CV-qualifiers> _ +// ::= fL <number> p <(top-level) CV-qualifiers> <number> _ +// ::= fpT # this +static bool ParseFunctionParam(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + + ParseState copy = state->parse_state; + + // Function-param expression (level 0). + if (ParseTwoCharToken(state, "fp") && Optional(ParseCVQualifiers(state)) && + Optional(ParseNumber(state, nullptr)) && ParseOneCharToken(state, '_')) { + return true; + } + state->parse_state = copy; + + // Function-param expression (level 1+). + if (ParseTwoCharToken(state, "fL") && Optional(ParseNumber(state, nullptr)) && + ParseOneCharToken(state, 'p') && Optional(ParseCVQualifiers(state)) && + Optional(ParseNumber(state, nullptr)) && ParseOneCharToken(state, '_')) { + return true; + } + state->parse_state = copy; + + return ParseThreeCharToken(state, "fpT"); +} + +// <braced-expression> ::= <expression> +// ::= di <field source-name> <braced-expression> +// ::= dx <index expression> <braced-expression> +// ::= dX <expression> <expression> <braced-expression> +static bool ParseBracedExpression(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + + ParseState copy = state->parse_state; + + if (ParseTwoCharToken(state, "di") && ParseSourceName(state) && + ParseBracedExpression(state)) { + return true; + } + state->parse_state = copy; + + if (ParseTwoCharToken(state, "dx") && ParseExpression(state) && + ParseBracedExpression(state)) { + return true; + } + state->parse_state = copy; + + if (ParseTwoCharToken(state, "dX") && + ParseExpression(state) && ParseExpression(state) && + ParseBracedExpression(state)) { + return true; + } + state->parse_state = copy; + + return ParseExpression(state); +} + // <expression> ::= <1-ary operator-name> <expression> // ::= <2-ary operator-name> <expression> <expression> // ::= <3-ary operator-name> <expression> <expression> <expression> +// ::= pp_ <expression> # ++e; pp <expression> is e++ +// ::= mm_ <expression> # --e; mm <expression> is e-- // ::= cl <expression>+ E // ::= cp <simple-id> <expression>* E # Clang-specific. +// ::= so <type> <expression> [<number>] <union-selector>* [p] E // ::= cv <type> <expression> # type (expression) // ::= cv <type> _ <expression>* E # type (expr-list) +// ::= tl <type> <braced-expression>* E +// ::= il <braced-expression>* E +// ::= [gs] nw <expression>* _ <type> E +// ::= [gs] nw <expression>* _ <type> <initializer> +// ::= [gs] na <expression>* _ <type> E +// ::= [gs] na <expression>* _ <type> <initializer> +// ::= [gs] dl <expression> +// ::= [gs] da <expression> +// ::= dc <type> <expression> +// ::= sc <type> <expression> +// ::= cc <type> <expression> +// ::= rc <type> <expression> +// ::= ti <type> +// ::= te <expression> // ::= st <type> +// ::= at <type> +// ::= az <expression> +// ::= nx <expression> // ::= <template-param> // ::= <function-param> +// ::= sZ <template-param> +// ::= sZ <function-param> +// ::= sP <template-arg>* E // ::= <expr-primary> // ::= dt <expression> <unresolved-name> # expr.name // ::= pt <expression> <unresolved-name> # expr->name // ::= sp <expression> # argument pack expansion +// ::= fl <binary operator-name> <expression> +// ::= fr <binary operator-name> <expression> +// ::= fL <binary operator-name> <expression> <expression> +// ::= fR <binary operator-name> <expression> <expression> +// ::= tw <expression> +// ::= tr // ::= sr <type> <unqualified-name> <template-args> // ::= sr <type> <unqualified-name> -// <function-param> ::= fp <(top-level) CV-qualifiers> _ -// ::= fp <(top-level) CV-qualifiers> <number> _ -// ::= fL <number> p <(top-level) CV-qualifiers> _ -// ::= fL <number> p <(top-level) CV-qualifiers> <number> _ +// ::= u <source-name> <template-arg>* E # vendor extension +// ::= rq <requirement>+ E +// ::= rQ <bare-function-type> _ <requirement>+ E static bool ParseExpression(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; @@ -1686,6 +2315,15 @@ static bool ParseExpression(State *state) { } state->parse_state = copy; + // Preincrement and predecrement. Postincrement and postdecrement are handled + // by the operator-name logic later on. + if ((ParseThreeCharToken(state, "pp_") || + ParseThreeCharToken(state, "mm_")) && + ParseExpression(state)) { + return true; + } + state->parse_state = copy; + // Clang-specific "cp <simple-id> <expression>* E" // https://clang.llvm.org/doxygen/ItaniumMangle_8cpp_source.html#l04338 if (ParseTwoCharToken(state, "cp") && ParseSimpleId(state) && @@ -1694,17 +2332,65 @@ static bool ParseExpression(State *state) { } state->parse_state = copy; - // Function-param expression (level 0). - if (ParseTwoCharToken(state, "fp") && Optional(ParseCVQualifiers(state)) && - Optional(ParseNumber(state, nullptr)) && ParseOneCharToken(state, '_')) { + // <expression> ::= so <type> <expression> [<number>] <union-selector>* [p] E + // + // https://github.com/itanium-cxx-abi/cxx-abi/issues/47 + if (ParseTwoCharToken(state, "so") && ParseType(state) && + ParseExpression(state) && Optional(ParseNumber(state, nullptr)) && + ZeroOrMore(ParseUnionSelector, state) && + Optional(ParseOneCharToken(state, 'p')) && + ParseOneCharToken(state, 'E')) { return true; } state->parse_state = copy; - // Function-param expression (level 1+). - if (ParseTwoCharToken(state, "fL") && Optional(ParseNumber(state, nullptr)) && - ParseOneCharToken(state, 'p') && Optional(ParseCVQualifiers(state)) && - Optional(ParseNumber(state, nullptr)) && ParseOneCharToken(state, '_')) { + // <expression> ::= <function-param> + if (ParseFunctionParam(state)) return true; + state->parse_state = copy; + + // <expression> ::= tl <type> <braced-expression>* E + if (ParseTwoCharToken(state, "tl") && ParseType(state) && + ZeroOrMore(ParseBracedExpression, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + + // <expression> ::= il <braced-expression>* E + if (ParseTwoCharToken(state, "il") && + ZeroOrMore(ParseBracedExpression, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + + // <expression> ::= [gs] nw <expression>* _ <type> E + // ::= [gs] nw <expression>* _ <type> <initializer> + // ::= [gs] na <expression>* _ <type> E + // ::= [gs] na <expression>* _ <type> <initializer> + if (Optional(ParseTwoCharToken(state, "gs")) && + (ParseTwoCharToken(state, "nw") || ParseTwoCharToken(state, "na")) && + ZeroOrMore(ParseExpression, state) && ParseOneCharToken(state, '_') && + ParseType(state) && + (ParseOneCharToken(state, 'E') || ParseInitializer(state))) { + return true; + } + state->parse_state = copy; + + // <expression> ::= [gs] dl <expression> + // ::= [gs] da <expression> + if (Optional(ParseTwoCharToken(state, "gs")) && + (ParseTwoCharToken(state, "dl") || ParseTwoCharToken(state, "da")) && + ParseExpression(state)) { + return true; + } + state->parse_state = copy; + + // dynamic_cast, static_cast, const_cast, reinterpret_cast. + // + // <expression> ::= (dc | sc | cc | rc) <type> <expression> + if (ParseCharClass(state, "dscr") && ParseOneCharToken(state, 'c') && + ParseType(state) && ParseExpression(state)) { return true; } state->parse_state = copy; @@ -1746,15 +2432,96 @@ static bool ParseExpression(State *state) { } state->parse_state = copy; + // typeid(type) + if (ParseTwoCharToken(state, "ti") && ParseType(state)) { + return true; + } + state->parse_state = copy; + + // typeid(expression) + if (ParseTwoCharToken(state, "te") && ParseExpression(state)) { + return true; + } + state->parse_state = copy; + // sizeof type if (ParseTwoCharToken(state, "st") && ParseType(state)) { return true; } state->parse_state = copy; + // alignof(type) + if (ParseTwoCharToken(state, "at") && ParseType(state)) { + return true; + } + state->parse_state = copy; + + // alignof(expression), a GNU extension + if (ParseTwoCharToken(state, "az") && ParseExpression(state)) { + return true; + } + state->parse_state = copy; + + // noexcept(expression) appearing as an expression in a dependent signature + if (ParseTwoCharToken(state, "nx") && ParseExpression(state)) { + return true; + } + state->parse_state = copy; + + // sizeof...(pack) + // + // <expression> ::= sZ <template-param> + // ::= sZ <function-param> + if (ParseTwoCharToken(state, "sZ") && + (ParseFunctionParam(state) || ParseTemplateParam(state))) { + return true; + } + state->parse_state = copy; + + // sizeof...(pack) captured from an alias template + // + // <expression> ::= sP <template-arg>* E + if (ParseTwoCharToken(state, "sP") && ZeroOrMore(ParseTemplateArg, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + + // Unary folds (... op pack) and (pack op ...). + // + // <expression> ::= fl <binary operator-name> <expression> + // ::= fr <binary operator-name> <expression> + if ((ParseTwoCharToken(state, "fl") || ParseTwoCharToken(state, "fr")) && + ParseOperatorName(state, nullptr) && ParseExpression(state)) { + return true; + } + state->parse_state = copy; + + // Binary folds (init op ... op pack) and (pack op ... op init). + // + // <expression> ::= fL <binary operator-name> <expression> <expression> + // ::= fR <binary operator-name> <expression> <expression> + if ((ParseTwoCharToken(state, "fL") || ParseTwoCharToken(state, "fR")) && + ParseOperatorName(state, nullptr) && ParseExpression(state) && + ParseExpression(state)) { + return true; + } + state->parse_state = copy; + + // tw <expression>: throw e + if (ParseTwoCharToken(state, "tw") && ParseExpression(state)) { + return true; + } + state->parse_state = copy; + + // tr: throw (rethrows an exception from the handler that caught it) + if (ParseTwoCharToken(state, "tr")) return true; + // Object and pointer member access expressions. + // + // <expression> ::= (dt | pt) <expression> <unresolved-name> if ((ParseTwoCharToken(state, "dt") || ParseTwoCharToken(state, "pt")) && - ParseExpression(state) && ParseType(state)) { + ParseExpression(state) && ParseUnresolvedName(state)) { return true; } state->parse_state = copy; @@ -1774,9 +2541,61 @@ static bool ParseExpression(State *state) { } state->parse_state = copy; + // Vendor extended expressions + if (ParseOneCharToken(state, 'u') && ParseSourceName(state) && + ZeroOrMore(ParseTemplateArg, state) && ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + + // <expression> ::= rq <requirement>+ E + // + // https://github.com/itanium-cxx-abi/cxx-abi/issues/24 + if (ParseTwoCharToken(state, "rq") && OneOrMore(ParseRequirement, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + + // <expression> ::= rQ <bare-function-type> _ <requirement>+ E + // + // https://github.com/itanium-cxx-abi/cxx-abi/issues/24 + if (ParseTwoCharToken(state, "rQ") && ParseBareFunctionType(state) && + ParseOneCharToken(state, '_') && OneOrMore(ParseRequirement, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + return ParseUnresolvedName(state); } +// <initializer> ::= pi <expression>* E +// ::= il <braced-expression>* E +// +// The il ... E form is not in the ABI spec but is seen in practice for +// braced-init-lists in new-expressions, which are standard syntax from C++11 +// on. +static bool ParseInitializer(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + ParseState copy = state->parse_state; + + if (ParseTwoCharToken(state, "pi") && ZeroOrMore(ParseExpression, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + + if (ParseTwoCharToken(state, "il") && + ZeroOrMore(ParseBracedExpression, state) && + ParseOneCharToken(state, 'E')) { + return true; + } + state->parse_state = copy; + return false; +} + // <expr-primary> ::= L <type> <(value) number> E // ::= L <type> <(value) float> E // ::= L <mangled-name> E @@ -1819,10 +2638,35 @@ static bool ParseExprPrimary(State *state) { return false; } - // The merged cast production. - if (ParseOneCharToken(state, 'L') && ParseType(state) && - ParseExprCastValue(state)) { - return true; + if (ParseOneCharToken(state, 'L')) { + // There are two special cases in which a literal may or must contain a type + // without a value. The first is that both LDnE and LDn0E are valid + // encodings of nullptr, used in different situations. Recognize LDnE here, + // leaving LDn0E to be recognized by the general logic afterward. + if (ParseThreeCharToken(state, "DnE")) return true; + + // The second special case is a string literal, currently mangled in C++98 + // style as LA<length + 1>_KcE. This is inadequate to support C++11 and + // later versions, and the discussion of this problem has not converged. + // + // https://github.com/itanium-cxx-abi/cxx-abi/issues/64 + // + // For now the bare-type mangling is what's used in practice, so we + // recognize this form and only this form if an array type appears here. + // Someday we'll probably have to accept a new form of value mangling in + // LA...E constructs. (Note also that C++20 allows a wide range of + // class-type objects as template arguments, so someday their values will be + // mangled and we'll have to recognize them here too.) + if (RemainingInput(state)[0] == 'A' /* an array type follows */) { + if (ParseType(state) && ParseOneCharToken(state, 'E')) return true; + state->parse_state = copy; + return false; + } + + // The merged cast production. + if (ParseType(state) && ParseExprCastValueAndTrailingE(state)) { + return true; + } } state->parse_state = copy; @@ -1836,7 +2680,7 @@ static bool ParseExprPrimary(State *state) { } // <number> or <float>, followed by 'E', as described above ParseExprPrimary. -static bool ParseExprCastValue(State *state) { +static bool ParseExprCastValueAndTrailingE(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; // We have to be able to backtrack after accepting a number because we could @@ -1848,39 +2692,148 @@ static bool ParseExprCastValue(State *state) { } state->parse_state = copy; - if (ParseFloatNumber(state) && ParseOneCharToken(state, 'E')) { + if (ParseFloatNumber(state)) { + // <float> for ordinary floating-point types + if (ParseOneCharToken(state, 'E')) return true; + + // <float> _ <float> for complex floating-point types + if (ParseOneCharToken(state, '_') && ParseFloatNumber(state) && + ParseOneCharToken(state, 'E')) { + return true; + } + } + state->parse_state = copy; + + return false; +} + +// Parses `Q <requires-clause expr>`. +// If parsing fails, applies backtracking to `state`. +// +// This function covers two symbols instead of one for convenience, +// because in LLVM's Itanium ABI mangling grammar, <requires-clause expr> +// always appears after Q. +// +// Does not emit the parsed `requires` clause to simplify the implementation. +// In other words, these two functions' mangled names will demangle identically: +// +// template <typename T> +// int foo(T) requires IsIntegral<T>; +// +// vs. +// +// template <typename T> +// int foo(T); +static bool ParseQRequiresClauseExpr(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + ParseState copy = state->parse_state; + DisableAppend(state); + + // <requires-clause expr> is just an <expression>: http://shortn/_9E1Ul0rIM8 + if (ParseOneCharToken(state, 'Q') && ParseExpression(state)) { + RestoreAppend(state, copy.append); + return true; + } + + // also restores append + state->parse_state = copy; + return false; +} + +// <requirement> ::= X <expression> [N] [R <type-constraint>] +// <requirement> ::= T <type> +// <requirement> ::= Q <constraint-expression> +// +// <constraint-expression> ::= <expression> +// +// https://github.com/itanium-cxx-abi/cxx-abi/issues/24 +static bool ParseRequirement(State *state) { + ComplexityGuard guard(state); + if (guard.IsTooComplex()) return false; + + ParseState copy = state->parse_state; + + if (ParseOneCharToken(state, 'X') && ParseExpression(state) && + Optional(ParseOneCharToken(state, 'N')) && + // This logic backtracks cleanly if we eat an R but a valid type doesn't + // follow it. + (!ParseOneCharToken(state, 'R') || ParseTypeConstraint(state))) { return true; } state->parse_state = copy; + if (ParseOneCharToken(state, 'T') && ParseType(state)) return true; + state->parse_state = copy; + + if (ParseOneCharToken(state, 'Q') && ParseExpression(state)) return true; + state->parse_state = copy; + return false; } +// <type-constraint> ::= <name> +static bool ParseTypeConstraint(State *state) { + return ParseName(state); +} + // <local-name> ::= Z <(function) encoding> E <(entity) name> [<discriminator>] // ::= Z <(function) encoding> E s [<discriminator>] +// ::= Z <(function) encoding> E d [<(parameter) number>] _ <name> // // Parsing a common prefix of these two productions together avoids an // exponential blowup of backtracking. Parse like: // <local-name> := Z <encoding> E <local-name-suffix> // <local-name-suffix> ::= s [<discriminator>] +// ::= d [<(parameter) number>] _ <name> // ::= <name> [<discriminator>] static bool ParseLocalNameSuffix(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; + ParseState copy = state->parse_state; + + // <local-name-suffix> ::= d [<(parameter) number>] _ <name> + if (ParseOneCharToken(state, 'd') && + (IsDigit(RemainingInput(state)[0]) || RemainingInput(state)[0] == '_')) { + int number = -1; + Optional(ParseNumber(state, &number)); + if (number < -1 || number > 2147483645) { + // Work around overflow cases. We do not expect these outside of a fuzzer + // or other source of adversarial input. If we do detect overflow here, + // we'll print {default arg#1}. + number = -1; + } + number += 2; + + // The ::{default arg#1}:: infix must be rendered before the lambda itself, + // so print this before parsing the rest of the <local-name-suffix>. + MaybeAppend(state, "::{default arg#"); + MaybeAppendDecimal(state, number); + MaybeAppend(state, "}::"); + if (ParseOneCharToken(state, '_') && ParseName(state)) return true; + + // On late parse failure, roll back not only the input but also the output, + // whose trailing NUL was overwritten. + state->parse_state = copy; + if (state->parse_state.append) { + state->out[state->parse_state.out_cur_idx] = '\0'; + } + return false; + } + state->parse_state = copy; + // <local-name-suffix> ::= <name> [<discriminator>] if (MaybeAppend(state, "::") && ParseName(state) && Optional(ParseDiscriminator(state))) { return true; } - - // Since we're not going to overwrite the above "::" by re-parsing the - // <encoding> (whose trailing '\0' byte was in the byte now holding the - // first ':'), we have to rollback the "::" if the <name> parse failed. + state->parse_state = copy; if (state->parse_state.append) { - state->out[state->parse_state.out_cur_idx - 2] = '\0'; + state->out[state->parse_state.out_cur_idx] = '\0'; } + // <local-name-suffix> ::= s [<discriminator>] return ParseOneCharToken(state, 's') && Optional(ParseDiscriminator(state)); } @@ -1896,12 +2849,22 @@ static bool ParseLocalName(State *state) { return false; } -// <discriminator> := _ <(non-negative) number> +// <discriminator> := _ <digit> +// := __ <number (>= 10)> _ static bool ParseDiscriminator(State *state) { ComplexityGuard guard(state); if (guard.IsTooComplex()) return false; ParseState copy = state->parse_state; - if (ParseOneCharToken(state, '_') && ParseNumber(state, nullptr)) { + + // Both forms start with _ so parse that first. + if (!ParseOneCharToken(state, '_')) return false; + + // <digit> + if (ParseDigit(state, nullptr)) return true; + + // _ <number> _ + if (ParseOneCharToken(state, '_') && ParseNumber(state, nullptr) && + ParseOneCharToken(state, '_')) { return true; } state->parse_state = copy; @@ -1947,6 +2910,7 @@ static bool ParseSubstitution(State *state, bool accept_std) { MaybeAppend(state, p->real_name); } ++state->parse_state.mangled_idx; + UpdateHighWaterMark(state); return true; } } @@ -1972,10 +2936,13 @@ static bool ParseTopLevelMangledName(State *state) { MaybeAppend(state, RemainingInput(state)); return true; } + ReportHighWaterMark(state); return false; // Unconsumed suffix. } return true; } + + ReportHighWaterMark(state); return false; } @@ -1985,6 +2952,10 @@ static bool Overflowed(const State *state) { // The demangler entry point. bool Demangle(const char* mangled, char* out, size_t out_size) { + if (mangled[0] == '_' && mangled[1] == 'R') { + return DemangleRustSymbolEncoding(mangled, out, out_size); + } + State state; InitState(&state, mangled, out, out_size); return ParseTopLevelMangledName(&state) && !Overflowed(&state) && diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle.h b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle.h index 6acfee2c50..a1166f9e7c 100644 --- a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle.h +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle.h @@ -56,6 +56,9 @@ namespace debugging_internal { // // See the unit test for more examples. // +// Demangle also recognizes Rust mangled names by delegating the parsing of +// anything that starts with _R to DemangleRustSymbolEncoding (demangle_rust.h). +// // Note: we might want to write demanglers for ABIs other than Itanium // C++ ABI in the future. bool Demangle(const char* mangled, char* out, size_t out_size); diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle_rust.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle_rust.cc new file mode 100644 index 0000000000..086e1457f9 --- /dev/null +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle_rust.cc @@ -0,0 +1,925 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "y_absl/debugging/internal/demangle_rust.h" + +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <limits> + +#include "y_absl/base/attributes.h" +#include "y_absl/base/config.h" +#include "y_absl/debugging/internal/decode_rust_punycode.h" + +namespace y_absl { +Y_ABSL_NAMESPACE_BEGIN +namespace debugging_internal { + +namespace { + +// Same step limit as the C++ demangler in demangle.cc uses. +constexpr int kMaxReturns = 1 << 17; + +bool IsDigit(char c) { return '0' <= c && c <= '9'; } +bool IsLower(char c) { return 'a' <= c && c <= 'z'; } +bool IsUpper(char c) { return 'A' <= c && c <= 'Z'; } +bool IsAlpha(char c) { return IsLower(c) || IsUpper(c); } +bool IsIdentifierChar(char c) { return IsAlpha(c) || IsDigit(c) || c == '_'; } +bool IsLowerHexDigit(char c) { return IsDigit(c) || ('a' <= c && c <= 'f'); } + +const char* BasicTypeName(char c) { + switch (c) { + case 'a': return "i8"; + case 'b': return "bool"; + case 'c': return "char"; + case 'd': return "f64"; + case 'e': return "str"; + case 'f': return "f32"; + case 'h': return "u8"; + case 'i': return "isize"; + case 'j': return "usize"; + case 'l': return "i32"; + case 'm': return "u32"; + case 'n': return "i128"; + case 'o': return "u128"; + case 'p': return "_"; + case 's': return "i16"; + case 't': return "u16"; + case 'u': return "()"; + case 'v': return "..."; + case 'x': return "i64"; + case 'y': return "u64"; + case 'z': return "!"; + } + return nullptr; +} + +// Parser for Rust symbol mangling v0, whose grammar is defined here: +// +// https://doc.rust-lang.org/rustc/symbol-mangling/v0.html#symbol-grammar-summary +class RustSymbolParser { + public: + // Prepares to demangle the given encoding, a Rust symbol name starting with + // _R, into the output buffer [out, out_end). The caller is expected to + // continue by calling the new object's Parse function. + RustSymbolParser(const char* encoding, char* out, char* const out_end) + : encoding_(encoding), out_(out), out_end_(out_end) { + if (out_ != out_end_) *out_ = '\0'; + } + + // Parses the constructor's encoding argument, writing output into the range + // [out, out_end). Returns true on success and false for input whose + // structure was not recognized or exceeded implementation limits, such as by + // nesting structures too deep. In either case *this should not be used + // again. + Y_ABSL_MUST_USE_RESULT bool Parse() && { + // Recursively parses the grammar production named by callee, then resumes + // execution at the next statement. + // + // Recursive-descent parsing is a beautifully readable translation of a + // grammar, but it risks stack overflow if implemented by naive recursion on + // the C++ call stack. So we simulate recursion by goto and switch instead, + // keeping a bounded stack of "return addresses" in the recursion_stack_ + // member. + // + // The callee argument is a statement label. We goto that label after + // saving the "return address" on recursion_stack_. The next continue + // statement in the for loop below "returns" from this "call". + // + // The caller argument names the return point. Each value of caller must + // appear in only one Y_ABSL_DEMANGLER_RECURSE call and be listed in the + // definition of enum ReturnAddress. The switch implements the control + // transfer from the end of a "called" subroutine back to the statement + // after the "call". + // + // Note that not all the grammar productions have to be packed into the + // switch, but only those which appear in a cycle in the grammar. Anything + // acyclic can be written as ordinary functions and function calls, e.g., + // ParseIdentifier. +#define Y_ABSL_DEMANGLER_RECURSE(callee, caller) \ + do { \ + if (recursion_depth_ == kStackSize) return false; \ + /* The next continue will switch on this saved value ... */ \ + recursion_stack_[recursion_depth_++] = caller; \ + goto callee; \ + /* ... and will land here, resuming the suspended code. */ \ + case caller: {} \ + } while (0) + + // Parse the encoding, counting completed recursive calls to guard against + // excessively complex input and infinite-loop bugs. + int iter = 0; + goto whole_encoding; + for (; iter < kMaxReturns && recursion_depth_ > 0; ++iter) { + // This switch resumes the code path most recently suspended by + // Y_ABSL_DEMANGLER_RECURSE. + switch (recursion_stack_[--recursion_depth_]) { + // + // symbol-name -> + // _R decimal-number? path instantiating-crate? vendor-specific-suffix? + whole_encoding: + if (!Eat('_') || !Eat('R')) return false; + // decimal-number? is always empty today, so proceed to path, which + // can't start with a decimal digit. + Y_ABSL_DEMANGLER_RECURSE(path, kInstantiatingCrate); + if (IsAlpha(Peek())) { + ++silence_depth_; // Print nothing more from here on. + Y_ABSL_DEMANGLER_RECURSE(path, kVendorSpecificSuffix); + } + switch (Take()) { + case '.': case '$': case '\0': return true; + } + return false; // unexpected trailing content + + // path -> crate-root | inherent-impl | trait-impl | trait-definition | + // nested-path | generic-args | backref + // + // Note that Y_ABSL_DEMANGLER_RECURSE does not work inside a nested switch + // (which would hide the generated case label). Thus we jump out of the + // inner switch with gotos before performing any fake recursion. + path: + switch (Take()) { + case 'C': goto crate_root; + case 'M': goto inherent_impl; + case 'X': goto trait_impl; + case 'Y': goto trait_definition; + case 'N': goto nested_path; + case 'I': goto generic_args; + case 'B': goto path_backref; + default: return false; + } + + // crate-root -> C identifier (C consumed above) + crate_root: + if (!ParseIdentifier()) return false; + continue; + + // inherent-impl -> M impl-path type (M already consumed) + inherent_impl: + if (!Emit("<")) return false; + Y_ABSL_DEMANGLER_RECURSE(impl_path, kInherentImplType); + Y_ABSL_DEMANGLER_RECURSE(type, kInherentImplEnding); + if (!Emit(">")) return false; + continue; + + // trait-impl -> X impl-path type path (X already consumed) + trait_impl: + if (!Emit("<")) return false; + Y_ABSL_DEMANGLER_RECURSE(impl_path, kTraitImplType); + Y_ABSL_DEMANGLER_RECURSE(type, kTraitImplInfix); + if (!Emit(" as ")) return false; + Y_ABSL_DEMANGLER_RECURSE(path, kTraitImplEnding); + if (!Emit(">")) return false; + continue; + + // impl-path -> disambiguator? path (but never print it!) + impl_path: + ++silence_depth_; + { + int ignored_disambiguator; + if (!ParseDisambiguator(ignored_disambiguator)) return false; + } + Y_ABSL_DEMANGLER_RECURSE(path, kImplPathEnding); + --silence_depth_; + continue; + + // trait-definition -> Y type path (Y already consumed) + trait_definition: + if (!Emit("<")) return false; + Y_ABSL_DEMANGLER_RECURSE(type, kTraitDefinitionInfix); + if (!Emit(" as ")) return false; + Y_ABSL_DEMANGLER_RECURSE(path, kTraitDefinitionEnding); + if (!Emit(">")) return false; + continue; + + // nested-path -> N namespace path identifier (N already consumed) + // namespace -> lower | upper + nested_path: + // Uppercase namespaces must be saved on a stack so we can print + // ::{closure#0} or ::{shim:vtable#0} or ::{X:name#0} as needed. + if (IsUpper(Peek())) { + if (!PushNamespace(Take())) return false; + Y_ABSL_DEMANGLER_RECURSE(path, kIdentifierInUppercaseNamespace); + if (!Emit("::")) return false; + if (!ParseIdentifier(PopNamespace())) return false; + continue; + } + + // Lowercase namespaces, however, are never represented in the output; + // they all emit just ::name. + if (IsLower(Take())) { + Y_ABSL_DEMANGLER_RECURSE(path, kIdentifierInLowercaseNamespace); + if (!Emit("::")) return false; + if (!ParseIdentifier()) return false; + continue; + } + + // Neither upper or lower + return false; + + // type -> basic-type | array-type | slice-type | tuple-type | + // ref-type | mut-ref-type | const-ptr-type | mut-ptr-type | + // fn-type | dyn-trait-type | path | backref + // + // We use ifs instead of switch (Take()) because the default case jumps + // to path, which will need to see the first character not yet Taken + // from the input. Because we do not use a nested switch here, + // Y_ABSL_DEMANGLER_RECURSE works fine in the 'S' case. + type: + if (IsLower(Peek())) { + const char* type_name = BasicTypeName(Take()); + if (type_name == nullptr || !Emit(type_name)) return false; + continue; + } + if (Eat('A')) { + // array-type = A type const + if (!Emit("[")) return false; + Y_ABSL_DEMANGLER_RECURSE(type, kArraySize); + if (!Emit("; ")) return false; + Y_ABSL_DEMANGLER_RECURSE(constant, kFinishArray); + if (!Emit("]")) return false; + continue; + } + if (Eat('S')) { + if (!Emit("[")) return false; + Y_ABSL_DEMANGLER_RECURSE(type, kSliceEnding); + if (!Emit("]")) return false; + continue; + } + if (Eat('T')) goto tuple_type; + if (Eat('R')) { + if (!Emit("&")) return false; + if (!ParseOptionalLifetime()) return false; + goto type; + } + if (Eat('Q')) { + if (!Emit("&mut ")) return false; + if (!ParseOptionalLifetime()) return false; + goto type; + } + if (Eat('P')) { + if (!Emit("*const ")) return false; + goto type; + } + if (Eat('O')) { + if (!Emit("*mut ")) return false; + goto type; + } + if (Eat('F')) goto fn_type; + if (Eat('D')) goto dyn_trait_type; + if (Eat('B')) goto type_backref; + goto path; + + // tuple-type -> T type* E (T already consumed) + tuple_type: + if (!Emit("(")) return false; + + // The toolchain should call the unit type u instead of TE, but the + // grammar and other demanglers also recognize TE, so we do too. + if (Eat('E')) { + if (!Emit(")")) return false; + continue; + } + + // A tuple with one element is rendered (type,) instead of (type). + Y_ABSL_DEMANGLER_RECURSE(type, kAfterFirstTupleElement); + if (Eat('E')) { + if (!Emit(",)")) return false; + continue; + } + + // A tuple with two elements is of course (x, y). + if (!Emit(", ")) return false; + Y_ABSL_DEMANGLER_RECURSE(type, kAfterSecondTupleElement); + if (Eat('E')) { + if (!Emit(")")) return false; + continue; + } + + // And (x, y, z) for three elements. + if (!Emit(", ")) return false; + Y_ABSL_DEMANGLER_RECURSE(type, kAfterThirdTupleElement); + if (Eat('E')) { + if (!Emit(")")) return false; + continue; + } + + // For longer tuples we write (x, y, z, ...), printing none of the + // content of the fourth and later types. Thus we avoid exhausting + // output buffers and human readers' patience when some library has a + // long tuple as an implementation detail, without having to + // completely obfuscate all tuples. + if (!Emit(", ...)")) return false; + ++silence_depth_; + while (!Eat('E')) { + Y_ABSL_DEMANGLER_RECURSE(type, kAfterSubsequentTupleElement); + } + --silence_depth_; + continue; + + // fn-type -> F fn-sig (F already consumed) + // fn-sig -> binder? U? (K abi)? type* E type + // abi -> C | undisambiguated-identifier + // + // We follow the C++ demangler in suppressing details of function + // signatures. Every function type is rendered "fn...". + fn_type: + if (!Emit("fn...")) return false; + ++silence_depth_; + if (!ParseOptionalBinder()) return false; + (void)Eat('U'); + if (Eat('K')) { + if (!Eat('C') && !ParseUndisambiguatedIdentifier()) return false; + } + while (!Eat('E')) { + Y_ABSL_DEMANGLER_RECURSE(type, kContinueParameterList); + } + Y_ABSL_DEMANGLER_RECURSE(type, kFinishFn); + --silence_depth_; + continue; + + // dyn-trait-type -> D dyn-bounds lifetime (D already consumed) + // dyn-bounds -> binder? dyn-trait* E + // + // The grammar strangely allows an empty trait list, even though the + // compiler should never output one. We follow existing demanglers in + // rendering DEL_ as "dyn ". + // + // Because auto traits lengthen a type name considerably without + // providing much value to a search for related source code, it would be + // desirable to abbreviate + // dyn main::Trait + std::marker::Copy + std::marker::Send + // to + // dyn main::Trait + ..., + // eliding the auto traits. But it is difficult to do so correctly, in + // part because there is no guarantee that the mangling will list the + // main trait first. So we just print all the traits in their order of + // appearance in the mangled name. + dyn_trait_type: + if (!Emit("dyn ")) return false; + if (!ParseOptionalBinder()) return false; + if (!Eat('E')) { + Y_ABSL_DEMANGLER_RECURSE(dyn_trait, kBeginAutoTraits); + while (!Eat('E')) { + if (!Emit(" + ")) return false; + Y_ABSL_DEMANGLER_RECURSE(dyn_trait, kContinueAutoTraits); + } + } + if (!ParseRequiredLifetime()) return false; + continue; + + // dyn-trait -> path dyn-trait-assoc-binding* + // dyn-trait-assoc-binding -> p undisambiguated-identifier type + // + // We render nonempty binding lists as <>, omitting their contents as + // for generic-args. + dyn_trait: + Y_ABSL_DEMANGLER_RECURSE(path, kContinueDynTrait); + if (Peek() == 'p') { + if (!Emit("<>")) return false; + ++silence_depth_; + while (Eat('p')) { + if (!ParseUndisambiguatedIdentifier()) return false; + Y_ABSL_DEMANGLER_RECURSE(type, kContinueAssocBinding); + } + --silence_depth_; + } + continue; + + // const -> type const-data | p | backref + // + // const is a C++ keyword, so we use the label `constant` instead. + constant: + if (Eat('B')) goto const_backref; + if (Eat('p')) { + if (!Emit("_")) return false; + continue; + } + + // Scan the type without printing it. + // + // The Rust language restricts the type of a const generic argument + // much more than the mangling grammar does. We do not enforce this. + // + // We also do not bother printing false, true, 'A', and '\u{abcd}' for + // the types bool and char. Because we do not print generic-args + // contents, we expect to print constants only in array sizes, and + // those should not be bool or char. + ++silence_depth_; + Y_ABSL_DEMANGLER_RECURSE(type, kConstData); + --silence_depth_; + + // const-data -> n? hex-digit* _ + // + // Although the grammar doesn't say this, existing demanglers expect + // that zero is 0, not an empty digit sequence, and no nonzero value + // may have leading zero digits. Also n0_ is accepted and printed as + // -0, though a toolchain will probably never write that encoding. + if (Eat('n') && !EmitChar('-')) return false; + if (!Emit("0x")) return false; + if (Eat('0')) { + if (!EmitChar('0')) return false; + if (!Eat('_')) return false; + continue; + } + while (IsLowerHexDigit(Peek())) { + if (!EmitChar(Take())) return false; + } + if (!Eat('_')) return false; + continue; + + // generic-args -> I path generic-arg* E (I already consumed) + // + // We follow the C++ demangler in omitting all the arguments from the + // output, printing only the list opening and closing tokens. + generic_args: + Y_ABSL_DEMANGLER_RECURSE(path, kBeginGenericArgList); + if (!Emit("::<>")) return false; + ++silence_depth_; + while (!Eat('E')) { + Y_ABSL_DEMANGLER_RECURSE(generic_arg, kContinueGenericArgList); + } + --silence_depth_; + continue; + + // generic-arg -> lifetime | type | K const + generic_arg: + if (Peek() == 'L') { + if (!ParseOptionalLifetime()) return false; + continue; + } + if (Eat('K')) goto constant; + goto type; + + // backref -> B base-62-number (B already consumed) + // + // The BeginBackref call parses and range-checks the base-62-number. We + // always do that much. + // + // The recursive call parses and prints what the backref points at. We + // save CPU and stack by skipping this work if the output would be + // suppressed anyway. + path_backref: + if (!BeginBackref()) return false; + if (silence_depth_ == 0) { + Y_ABSL_DEMANGLER_RECURSE(path, kPathBackrefEnding); + } + EndBackref(); + continue; + + // This represents the same backref production as in path_backref but + // parses the target as a type instead of a path. + type_backref: + if (!BeginBackref()) return false; + if (silence_depth_ == 0) { + Y_ABSL_DEMANGLER_RECURSE(type, kTypeBackrefEnding); + } + EndBackref(); + continue; + + const_backref: + if (!BeginBackref()) return false; + if (silence_depth_ == 0) { + Y_ABSL_DEMANGLER_RECURSE(constant, kConstantBackrefEnding); + } + EndBackref(); + continue; + } + } + + return false; // hit iteration limit or a bug in our stack handling + } + + private: + // Enumerates resumption points for Y_ABSL_DEMANGLER_RECURSE calls. + enum ReturnAddress : uint8_t { + kInstantiatingCrate, + kVendorSpecificSuffix, + kIdentifierInUppercaseNamespace, + kIdentifierInLowercaseNamespace, + kInherentImplType, + kInherentImplEnding, + kTraitImplType, + kTraitImplInfix, + kTraitImplEnding, + kImplPathEnding, + kTraitDefinitionInfix, + kTraitDefinitionEnding, + kArraySize, + kFinishArray, + kSliceEnding, + kAfterFirstTupleElement, + kAfterSecondTupleElement, + kAfterThirdTupleElement, + kAfterSubsequentTupleElement, + kContinueParameterList, + kFinishFn, + kBeginAutoTraits, + kContinueAutoTraits, + kContinueDynTrait, + kContinueAssocBinding, + kConstData, + kBeginGenericArgList, + kContinueGenericArgList, + kPathBackrefEnding, + kTypeBackrefEnding, + kConstantBackrefEnding, + }; + + // Element counts for the stack arrays. Larger stack sizes accommodate more + // deeply nested names at the cost of a larger footprint on the C++ call + // stack. + enum { + // Maximum recursive calls outstanding at one time. + kStackSize = 256, + + // Maximum N<uppercase> nested-paths open at once. We do not expect + // closures inside closures inside closures as much as functions inside + // modules inside other modules, so we can use a smaller array here. + kNamespaceStackSize = 64, + + // Maximum number of nested backrefs. We can keep this stack pretty small + // because we do not follow backrefs inside generic-args or other contexts + // that suppress printing, so deep stacking is unlikely in practice. + kPositionStackSize = 16, + }; + + // Returns the next input character without consuming it. + char Peek() const { return encoding_[pos_]; } + + // Consumes and returns the next input character. + char Take() { return encoding_[pos_++]; } + + // If the next input character is the given character, consumes it and returns + // true; otherwise returns false without consuming a character. + Y_ABSL_MUST_USE_RESULT bool Eat(char want) { + if (encoding_[pos_] != want) return false; + ++pos_; + return true; + } + + // Provided there is enough remaining output space, appends c to the output, + // writing a fresh NUL terminator afterward, and returns true. Returns false + // if the output buffer had less than two bytes free. + Y_ABSL_MUST_USE_RESULT bool EmitChar(char c) { + if (silence_depth_ > 0) return true; + if (out_end_ - out_ < 2) return false; + *out_++ = c; + *out_ = '\0'; + return true; + } + + // Provided there is enough remaining output space, appends the C string token + // to the output, followed by a NUL character, and returns true. Returns + // false if not everything fit into the output buffer. + Y_ABSL_MUST_USE_RESULT bool Emit(const char* token) { + if (silence_depth_ > 0) return true; + const size_t token_length = std::strlen(token); + const size_t bytes_to_copy = token_length + 1; // token and final NUL + if (static_cast<size_t>(out_end_ - out_) < bytes_to_copy) return false; + std::memcpy(out_, token, bytes_to_copy); + out_ += token_length; + return true; + } + + // Provided there is enough remaining output space, appends the decimal form + // of disambiguator (if it's nonnegative) or "?" (if it's negative) to the + // output, followed by a NUL character, and returns true. Returns false if + // not everything fit into the output buffer. + Y_ABSL_MUST_USE_RESULT bool EmitDisambiguator(int disambiguator) { + if (disambiguator < 0) return EmitChar('?'); // parsed but too large + if (disambiguator == 0) return EmitChar('0'); + // Convert disambiguator to decimal text. Three digits per byte is enough + // because 999 > 256. The bound will remain correct even if future + // maintenance changes the type of the disambiguator variable. + char digits[3 * sizeof(disambiguator)] = {}; + size_t leading_digit_index = sizeof(digits) - 1; + for (; disambiguator > 0; disambiguator /= 10) { + digits[--leading_digit_index] = + static_cast<char>('0' + disambiguator % 10); + } + return Emit(digits + leading_digit_index); + } + + // Consumes an optional disambiguator (s123_) from the input. + // + // On success returns true and fills value with the encoded value if it was + // not too big, otherwise with -1. If the optional disambiguator was omitted, + // value is 0. On parse failure returns false and sets value to -1. + Y_ABSL_MUST_USE_RESULT bool ParseDisambiguator(int& value) { + value = -1; + + // disambiguator = s base-62-number + // + // Disambiguators are optional. An omitted disambiguator is zero. + if (!Eat('s')) { + value = 0; + return true; + } + int base_62_value = 0; + if (!ParseBase62Number(base_62_value)) return false; + value = base_62_value < 0 ? -1 : base_62_value + 1; + return true; + } + + // Consumes a base-62 number like _ or 123_ from the input. + // + // On success returns true and fills value with the encoded value if it was + // not too big, otherwise with -1. On parse failure returns false and sets + // value to -1. + Y_ABSL_MUST_USE_RESULT bool ParseBase62Number(int& value) { + value = -1; + + // base-62-number = (digit | lower | upper)* _ + // + // An empty base-62 digit sequence means 0. + if (Eat('_')) { + value = 0; + return true; + } + + // A nonempty digit sequence denotes its base-62 value plus 1. + int encoded_number = 0; + bool overflowed = false; + while (IsAlpha(Peek()) || IsDigit(Peek())) { + const char c = Take(); + if (encoded_number >= std::numeric_limits<int>::max()/62) { + // If we are close to overflowing an int, keep parsing but stop updating + // encoded_number and remember to return -1 at the end. The point is to + // avoid undefined behavior while parsing crate-root disambiguators, + // which are large in practice but not shown in demangling, while + // successfully computing closure and shim disambiguators, which are + // typically small and are printed out. + overflowed = true; + } else { + int digit; + if (IsDigit(c)) { + digit = c - '0'; + } else if (IsLower(c)) { + digit = c - 'a' + 10; + } else { + digit = c - 'A' + 36; + } + encoded_number = 62 * encoded_number + digit; + } + } + + if (!Eat('_')) return false; + if (!overflowed) value = encoded_number + 1; + return true; + } + + // Consumes an identifier from the input, returning true on success. + // + // A nonzero uppercase_namespace specifies the character after the N in a + // nested-identifier, e.g., 'C' for a closure, allowing ParseIdentifier to + // write out the name with the conventional decoration for that namespace. + Y_ABSL_MUST_USE_RESULT bool ParseIdentifier(char uppercase_namespace = '\0') { + // identifier -> disambiguator? undisambiguated-identifier + int disambiguator = 0; + if (!ParseDisambiguator(disambiguator)) return false; + + return ParseUndisambiguatedIdentifier(uppercase_namespace, disambiguator); + } + + // Consumes from the input an identifier with no preceding disambiguator, + // returning true on success. + // + // When ParseIdentifier calls this, it passes the N<namespace> character and + // disambiguator value so that "{closure#42}" and similar forms can be + // rendered correctly. + // + // At other appearances of undisambiguated-identifier in the grammar, this + // treatment is not applicable, and the call site omits both arguments. + Y_ABSL_MUST_USE_RESULT bool ParseUndisambiguatedIdentifier( + char uppercase_namespace = '\0', int disambiguator = 0) { + // undisambiguated-identifier -> u? decimal-number _? bytes + const bool is_punycoded = Eat('u'); + if (!IsDigit(Peek())) return false; + int num_bytes = 0; + if (!ParseDecimalNumber(num_bytes)) return false; + (void)Eat('_'); // optional separator, needed if a digit follows + if (is_punycoded) { + DecodeRustPunycodeOptions options; + options.punycode_begin = &encoding_[pos_]; + options.punycode_end = &encoding_[pos_] + num_bytes; + options.out_begin = out_; + options.out_end = out_end_; + out_ = DecodeRustPunycode(options); + if (out_ == nullptr) return false; + pos_ += static_cast<size_t>(num_bytes); + } + + // Emit the beginnings of braced forms like {shim:vtable#0}. + if (uppercase_namespace != '\0') { + switch (uppercase_namespace) { + case 'C': + if (!Emit("{closure")) return false; + break; + case 'S': + if (!Emit("{shim")) return false; + break; + default: + if (!EmitChar('{') || !EmitChar(uppercase_namespace)) return false; + break; + } + if (num_bytes > 0 && !Emit(":")) return false; + } + + // Emit the name itself. + if (!is_punycoded) { + for (int i = 0; i < num_bytes; ++i) { + const char c = Take(); + if (!IsIdentifierChar(c) && + // The spec gives toolchains the choice of Punycode or raw UTF-8 for + // identifiers containing code points above 0x7f, so accept bytes + // with the high bit set. + (c & 0x80) == 0) { + return false; + } + if (!EmitChar(c)) return false; + } + } + + // Emit the endings of braced forms, e.g., "#42}". + if (uppercase_namespace != '\0') { + if (!EmitChar('#')) return false; + if (!EmitDisambiguator(disambiguator)) return false; + if (!EmitChar('}')) return false; + } + + return true; + } + + // Consumes a decimal number like 0 or 123 from the input. On success returns + // true and fills value with the encoded value. If the encoded value is too + // large or otherwise unparsable, returns false and sets value to -1. + Y_ABSL_MUST_USE_RESULT bool ParseDecimalNumber(int& value) { + value = -1; + if (!IsDigit(Peek())) return false; + int encoded_number = Take() - '0'; + if (encoded_number == 0) { + // Decimal numbers are never encoded with extra leading zeroes. + value = 0; + return true; + } + while (IsDigit(Peek()) && + // avoid overflow + encoded_number < std::numeric_limits<int>::max()/10) { + encoded_number = 10 * encoded_number + (Take() - '0'); + } + if (IsDigit(Peek())) return false; // too big + value = encoded_number; + return true; + } + + // Consumes a binder of higher-ranked lifetimes if one is present. On success + // returns true and discards the encoded lifetime count. On parse failure + // returns false. + Y_ABSL_MUST_USE_RESULT bool ParseOptionalBinder() { + // binder -> G base-62-number + if (!Eat('G')) return true; + int ignored_binding_count; + return ParseBase62Number(ignored_binding_count); + } + + // Consumes a lifetime if one is present. + // + // On success returns true and discards the lifetime index. We do not print + // or even range-check lifetimes because they are a finer detail than other + // things we omit from output, such as the entire contents of generic-args. + // + // On parse failure returns false. + Y_ABSL_MUST_USE_RESULT bool ParseOptionalLifetime() { + // lifetime -> L base-62-number + if (!Eat('L')) return true; + int ignored_de_bruijn_index; + return ParseBase62Number(ignored_de_bruijn_index); + } + + // Consumes a lifetime just like ParseOptionalLifetime, but returns false if + // there is no lifetime here. + Y_ABSL_MUST_USE_RESULT bool ParseRequiredLifetime() { + if (Peek() != 'L') return false; + return ParseOptionalLifetime(); + } + + // Pushes ns onto the namespace stack and returns true if the stack is not + // full, else returns false. + Y_ABSL_MUST_USE_RESULT bool PushNamespace(char ns) { + if (namespace_depth_ == kNamespaceStackSize) return false; + namespace_stack_[namespace_depth_++] = ns; + return true; + } + + // Pops the last pushed namespace. Requires that the namespace stack is not + // empty (namespace_depth_ > 0). + char PopNamespace() { return namespace_stack_[--namespace_depth_]; } + + // Pushes position onto the position stack and returns true if the stack is + // not full, else returns false. + Y_ABSL_MUST_USE_RESULT bool PushPosition(int position) { + if (position_depth_ == kPositionStackSize) return false; + position_stack_[position_depth_++] = position; + return true; + } + + // Pops the last pushed input position. Requires that the position stack is + // not empty (position_depth_ > 0). + int PopPosition() { return position_stack_[--position_depth_]; } + + // Consumes a base-62-number denoting a backref target, pushes the current + // input position on the data stack, and sets the input position to the + // beginning of the backref target. Returns true on success. Returns false + // if parsing failed, the stack is exhausted, or the backref target position + // is out of range. + Y_ABSL_MUST_USE_RESULT bool BeginBackref() { + // backref = B base-62-number (B already consumed) + // + // Reject backrefs that don't parse, overflow int, or don't point backward. + // If the offset looks fine, adjust it to account for the _R prefix. + int offset = 0; + const int offset_of_this_backref = + pos_ - 2 /* _R */ - 1 /* B already consumed */; + if (!ParseBase62Number(offset) || offset < 0 || + offset >= offset_of_this_backref) { + return false; + } + offset += 2; + + // Save the old position to restore later. + if (!PushPosition(pos_)) return false; + + // Move the input position to the backref target. + // + // Note that we do not check whether the new position points to the + // beginning of a construct matching the context in which the backref + // appeared. We just jump to it and see whether nested parsing succeeds. + // We therefore accept various wrong manglings, e.g., a type backref + // pointing to an 'l' character inside an identifier, which happens to mean + // i32 when parsed as a type mangling. This saves the complexity and RAM + // footprint of remembering which offsets began which kinds of + // substructures. Existing demanglers take similar shortcuts. + pos_ = offset; + return true; + } + + // Cleans up after a backref production by restoring the previous input + // position from the data stack. + void EndBackref() { pos_ = PopPosition(); } + + // The leftmost recursion_depth_ elements of recursion_stack_ contain the + // ReturnAddresses pushed by Y_ABSL_DEMANGLER_RECURSE calls not yet completed. + ReturnAddress recursion_stack_[kStackSize] = {}; + int recursion_depth_ = 0; + + // The leftmost namespace_depth_ elements of namespace_stack_ contain the + // uppercase namespace identifiers for open nested-paths, e.g., 'C' for a + // closure. + char namespace_stack_[kNamespaceStackSize] = {}; + int namespace_depth_ = 0; + + // The leftmost position_depth_ elements of position_stack_ contain the input + // positions to return to after fully printing the targets of backrefs. + int position_stack_[kPositionStackSize] = {}; + int position_depth_ = 0; + + // Anything parsed while silence_depth_ > 0 contributes nothing to the + // demangled output. For constructs omitted from the demangling, such as + // impl-path and the contents of generic-args, we will increment + // silence_depth_ on the way in and decrement silence_depth_ on the way out. + int silence_depth_ = 0; + + // Input: encoding_ points to a Rust mangled symbol, and encoding_[pos_] is + // the next input character to be scanned. + int pos_ = 0; + const char* encoding_ = nullptr; + + // Output: *out_ is where the next output character should be written, and + // out_end_ points past the last byte of available space. + char* out_ = nullptr; + char* out_end_ = nullptr; +}; + +} // namespace + +bool DemangleRustSymbolEncoding(const char* mangled, char* out, + size_t out_size) { + return RustSymbolParser(mangled, out, out + out_size).Parse(); +} + +} // namespace debugging_internal +Y_ABSL_NAMESPACE_END +} // namespace y_absl diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle_rust.h b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle_rust.h new file mode 100644 index 0000000000..70ec90d3b9 --- /dev/null +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/demangle_rust.h @@ -0,0 +1,42 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef Y_ABSL_DEBUGGING_INTERNAL_DEMANGLE_RUST_H_ +#define Y_ABSL_DEBUGGING_INTERNAL_DEMANGLE_RUST_H_ + +#include <cstddef> + +#include "y_absl/base/config.h" + +namespace y_absl { +Y_ABSL_NAMESPACE_BEGIN +namespace debugging_internal { + +// Demangle the Rust encoding `mangled`. On success, return true and write the +// demangled symbol name to `out`. Otherwise, return false, leaving unspecified +// contents in `out`. For example, calling DemangleRustSymbolEncoding with +// `mangled = "_RNvC8my_crate7my_func"` will yield `my_crate::my_func` in `out`, +// provided `out_size` is large enough for that value and its trailing NUL. +// +// DemangleRustSymbolEncoding is async-signal-safe and runs in bounded C++ +// call-stack space. It is suitable for symbolizing stack traces in a signal +// handler. +bool DemangleRustSymbolEncoding(const char* mangled, char* out, + size_t out_size); + +} // namespace debugging_internal +Y_ABSL_NAMESPACE_END +} // namespace y_absl + +#endif // Y_ABSL_DEBUGGING_INTERNAL_DEMANGLE_RUST_H_ diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/elf_mem_image.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/elf_mem_image.cc index cead525c93..f9fc638597 100644 --- a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/elf_mem_image.cc +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/elf_mem_image.cc @@ -20,8 +20,11 @@ #ifdef Y_ABSL_HAVE_ELF_MEM_IMAGE // defined in elf_mem_image.h #include <string.h> + #include <cassert> #include <cstddef> +#include <cstdint> + #include "y_absl/base/config.h" #include "y_absl/base/internal/raw_logging.h" @@ -86,20 +89,14 @@ ElfMemImage::ElfMemImage(const void *base) { Init(base); } -int ElfMemImage::GetNumSymbols() const { - if (!hash_) { - return 0; - } - // See http://www.caldera.com/developers/gabi/latest/ch5.dynamic.html#hash - return static_cast<int>(hash_[1]); -} +uint32_t ElfMemImage::GetNumSymbols() const { return num_syms_; } -const ElfW(Sym) *ElfMemImage::GetDynsym(int index) const { +const ElfW(Sym) * ElfMemImage::GetDynsym(uint32_t index) const { Y_ABSL_RAW_CHECK(index < GetNumSymbols(), "index out of range"); return dynsym_ + index; } -const ElfW(Versym) *ElfMemImage::GetVersym(int index) const { +const ElfW(Versym) *ElfMemImage::GetVersym(uint32_t index) const { Y_ABSL_RAW_CHECK(index < GetNumSymbols(), "index out of range"); return versym_ + index; } @@ -154,7 +151,7 @@ void ElfMemImage::Init(const void *base) { dynstr_ = nullptr; versym_ = nullptr; verdef_ = nullptr; - hash_ = nullptr; + num_syms_ = 0; strsize_ = 0; verdefnum_ = 0; // Sentinel: PT_LOAD .p_vaddr can't possibly be this. @@ -219,12 +216,17 @@ void ElfMemImage::Init(const void *base) { base_as_char - reinterpret_cast<const char *>(link_base_); ElfW(Dyn)* dynamic_entry = reinterpret_cast<ElfW(Dyn)*>( static_cast<intptr_t>(dynamic_program_header->p_vaddr) + relocation); + uint32_t *sysv_hash = nullptr; + uint32_t *gnu_hash = nullptr; for (; dynamic_entry->d_tag != DT_NULL; ++dynamic_entry) { const auto value = static_cast<intptr_t>(dynamic_entry->d_un.d_val) + relocation; switch (dynamic_entry->d_tag) { case DT_HASH: - hash_ = reinterpret_cast<ElfW(Word) *>(value); + sysv_hash = reinterpret_cast<uint32_t *>(value); + break; + case DT_GNU_HASH: + gnu_hash = reinterpret_cast<uint32_t *>(value); break; case DT_SYMTAB: dynsym_ = reinterpret_cast<ElfW(Sym) *>(value); @@ -249,13 +251,38 @@ void ElfMemImage::Init(const void *base) { break; } } - if (!hash_ || !dynsym_ || !dynstr_ || !versym_ || + if ((!sysv_hash && !gnu_hash) || !dynsym_ || !dynstr_ || !versym_ || !verdef_ || !verdefnum_ || !strsize_) { assert(false); // invalid VDSO // Mark this image as not present. Can not recur infinitely. Init(nullptr); return; } + if (sysv_hash) { + num_syms_ = sysv_hash[1]; + } else { + assert(gnu_hash); + // Compute the number of symbols for DT_GNU_HASH, which is specified by + // https://sourceware.org/gnu-gabi/program-loading-and-dynamic-linking.txt + uint32_t nbuckets = gnu_hash[0]; + // The buckets array is located after the header (4 uint32) and the bloom + // filter (size_t array of gnu_hash[2] elements). + uint32_t *buckets = gnu_hash + 4 + sizeof(size_t) / 4 * gnu_hash[2]; + // Find the chain of the last non-empty bucket. + uint32_t idx = 0; + for (uint32_t i = nbuckets; i > 0;) { + idx = buckets[--i]; + if (idx != 0) break; + } + if (idx != 0) { + // Find the last element of the chain, which has an odd value. + // Add one to get the number of symbols. + uint32_t *chain = buckets + nbuckets - gnu_hash[1]; + while (chain[idx++] % 2 == 0) { + } + } + num_syms_ = idx; + } } bool ElfMemImage::LookupSymbol(const char *name, @@ -300,9 +327,9 @@ bool ElfMemImage::LookupSymbolByAddress(const void *address, return false; } -ElfMemImage::SymbolIterator::SymbolIterator(const void *const image, int index) - : index_(index), image_(image) { -} +ElfMemImage::SymbolIterator::SymbolIterator(const void *const image, + uint32_t index) + : index_(index), image_(image) {} const ElfMemImage::SymbolInfo *ElfMemImage::SymbolIterator::operator->() const { return &info_; @@ -335,7 +362,7 @@ ElfMemImage::SymbolIterator ElfMemImage::end() const { return SymbolIterator(this, GetNumSymbols()); } -void ElfMemImage::SymbolIterator::Update(int increment) { +void ElfMemImage::SymbolIterator::Update(uint32_t increment) { const ElfMemImage *image = reinterpret_cast<const ElfMemImage *>(image_); Y_ABSL_RAW_CHECK(image->IsPresent() || increment == 0, ""); if (!image->IsPresent()) { diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/elf_mem_image.h b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/elf_mem_image.h index 9c5e588fbb..4a3ee56edf 100644 --- a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/elf_mem_image.h +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/elf_mem_image.h @@ -22,6 +22,7 @@ // Including this will define the __GLIBC__ macro if glibc is being // used. #include <climits> +#include <cstdint> #include "y_absl/base/config.h" @@ -82,10 +83,10 @@ class ElfMemImage { bool operator!=(const SymbolIterator &rhs) const; bool operator==(const SymbolIterator &rhs) const; private: - SymbolIterator(const void *const image, int index); - void Update(int incr); + SymbolIterator(const void *const image, uint32_t index); + void Update(uint32_t incr); SymbolInfo info_; - int index_; + uint32_t index_; const void *const image_; }; @@ -94,14 +95,14 @@ class ElfMemImage { void Init(const void *base); bool IsPresent() const { return ehdr_ != nullptr; } const ElfW(Phdr)* GetPhdr(int index) const; - const ElfW(Sym)* GetDynsym(int index) const; - const ElfW(Versym)* GetVersym(int index) const; + const ElfW(Sym) * GetDynsym(uint32_t index) const; + const ElfW(Versym)* GetVersym(uint32_t index) const; const ElfW(Verdef)* GetVerdef(int index) const; const ElfW(Verdaux)* GetVerdefAux(const ElfW(Verdef) *verdef) const; const char* GetDynstr(ElfW(Word) offset) const; const void* GetSymAddr(const ElfW(Sym) *sym) const; const char* GetVerstr(ElfW(Word) offset) const; - int GetNumSymbols() const; + uint32_t GetNumSymbols() const; SymbolIterator begin() const; SymbolIterator end() const; @@ -124,8 +125,8 @@ class ElfMemImage { const ElfW(Sym) *dynsym_; const ElfW(Versym) *versym_; const ElfW(Verdef) *verdef_; - const ElfW(Word) *hash_; const char *dynstr_; + uint32_t num_syms_; size_t strsize_; size_t verdefnum_; ElfW(Addr) link_base_; // Link-time base (p_vaddr of first PT_LOAD). diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/stacktrace_aarch64-inl.inc b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/stacktrace_aarch64-inl.inc index 7c7c0304e3..1bf78f03e1 100644 --- a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/stacktrace_aarch64-inl.inc +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/stacktrace_aarch64-inl.inc @@ -89,6 +89,8 @@ struct StackInfo { static bool InsideSignalStack(void** ptr, const StackInfo* stack_info) { uintptr_t comparable_ptr = reinterpret_cast<uintptr_t>(ptr); + if (stack_info->sig_stack_high == kUnknownStackEnd) + return false; return (comparable_ptr >= stack_info->sig_stack_low && comparable_ptr < stack_info->sig_stack_high); } @@ -122,13 +124,6 @@ static void **NextStackFrame(void **old_frame_pointer, const void *uc, if (pre_signal_frame_pointer >= old_frame_pointer) { new_frame_pointer = pre_signal_frame_pointer; } - // Check that alleged frame pointer is actually readable. This is to - // prevent "double fault" in case we hit the first fault due to e.g. - // stack corruption. - if (!y_absl::debugging_internal::AddressIsReadable( - new_frame_pointer)) - return nullptr; - } } #endif @@ -136,6 +131,14 @@ static void **NextStackFrame(void **old_frame_pointer, const void *uc, if ((reinterpret_cast<uintptr_t>(new_frame_pointer) & 7) != 0) return nullptr; + // Check that alleged frame pointer is actually readable. This is to + // prevent "double fault" in case we hit the first fault due to e.g. + // stack corruption. + if (!y_absl::debugging_internal::AddressIsReadable( + new_frame_pointer)) + return nullptr; + } + // Only check the size if both frames are in the same stack. if (InsideSignalStack(new_frame_pointer, stack_info) == InsideSignalStack(old_frame_pointer, stack_info)) { diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/utf8_for_code_point.cc b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/utf8_for_code_point.cc new file mode 100644 index 0000000000..ed158c62b3 --- /dev/null +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/utf8_for_code_point.cc @@ -0,0 +1,70 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "y_absl/debugging/internal/utf8_for_code_point.h" + +#include <cstdint> + +#include "y_absl/base/config.h" + +namespace y_absl { +Y_ABSL_NAMESPACE_BEGIN +namespace debugging_internal { +namespace { + +// UTF-8 encoding bounds. +constexpr uint32_t kMinSurrogate = 0xd800, kMaxSurrogate = 0xdfff; +constexpr uint32_t kMax1ByteCodePoint = 0x7f; +constexpr uint32_t kMax2ByteCodePoint = 0x7ff; +constexpr uint32_t kMax3ByteCodePoint = 0xffff; +constexpr uint32_t kMaxCodePoint = 0x10ffff; + +} // namespace + +Utf8ForCodePoint::Utf8ForCodePoint(uint64_t code_point) { + if (code_point <= kMax1ByteCodePoint) { + length = 1; + bytes[0] = static_cast<char>(code_point); + return; + } + + if (code_point <= kMax2ByteCodePoint) { + length = 2; + bytes[0] = static_cast<char>(0xc0 | (code_point >> 6)); + bytes[1] = static_cast<char>(0x80 | (code_point & 0x3f)); + return; + } + + if (kMinSurrogate <= code_point && code_point <= kMaxSurrogate) return; + + if (code_point <= kMax3ByteCodePoint) { + length = 3; + bytes[0] = static_cast<char>(0xe0 | (code_point >> 12)); + bytes[1] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3f)); + bytes[2] = static_cast<char>(0x80 | (code_point & 0x3f)); + return; + } + + if (code_point > kMaxCodePoint) return; + + length = 4; + bytes[0] = static_cast<char>(0xf0 | (code_point >> 18)); + bytes[1] = static_cast<char>(0x80 | ((code_point >> 12) & 0x3f)); + bytes[2] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3f)); + bytes[3] = static_cast<char>(0x80 | (code_point & 0x3f)); +} + +} // namespace debugging_internal +Y_ABSL_NAMESPACE_END +} // namespace y_absl diff --git a/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/utf8_for_code_point.h b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/utf8_for_code_point.h new file mode 100644 index 0000000000..aa0dc65b82 --- /dev/null +++ b/contrib/restricted/abseil-cpp-tstring/y_absl/debugging/internal/utf8_for_code_point.h @@ -0,0 +1,47 @@ +// Copyright 2024 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef Y_ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_ +#define Y_ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_ + +#include <cstdint> + +#include "y_absl/base/config.h" + +namespace y_absl { +Y_ABSL_NAMESPACE_BEGIN +namespace debugging_internal { + +struct Utf8ForCodePoint { + // Converts a Unicode code point to the corresponding UTF-8 byte sequence. + // Async-signal-safe to support use in symbolizing stack traces from a signal + // handler. + explicit Utf8ForCodePoint(uint64_t code_point); + + // Returns true if the constructor's code_point argument was valid. + bool ok() const { return length != 0; } + + // If code_point was in range, then 1 <= length <= 4, and the UTF-8 encoding + // is found in bytes[0 .. (length - 1)]. If code_point was invalid, then + // length == 0. In either case, the contents of bytes[length .. 3] are + // unspecified. + char bytes[4] = {}; + uint32_t length = 0; +}; + +} // namespace debugging_internal +Y_ABSL_NAMESPACE_END +} // namespace y_absl + +#endif // Y_ABSL_DEBUGGING_INTERNAL_UTF8_FOR_CODE_POINT_H_ |