diff options
author | arkady-e1ppa <arkady-e1ppa@yandex-team.com> | 2024-11-18 14:48:05 +0300 |
---|---|---|
committer | arkady-e1ppa <arkady-e1ppa@yandex-team.com> | 2024-11-18 15:04:54 +0300 |
commit | 9e876c7c66440327e3bba353d37e99d68eabb0b9 (patch) | |
tree | b7daabaa3386ab7b1a783fad91da2f014f9354e7 | |
parent | 13bff9a72fbc1bd6a2643251982afcc3b4a7e93a (diff) | |
download | ydb-9e876c7c66440327e3bba353d37e99d68eabb0b9.tar.gz |
YT-23435: Parse format string at compile time
commit_hash:804530d1ee861ff42d7d8cad25d9f569b4feaacf
-rw-r--r-- | library/cpp/yt/string/format-inl.h | 236 | ||||
-rw-r--r-- | library/cpp/yt/string/format_analyser.h | 42 | ||||
-rw-r--r-- | library/cpp/yt/string/format_arg.h | 2 | ||||
-rw-r--r-- | library/cpp/yt/string/format_string-inl.h | 16 | ||||
-rw-r--r-- | library/cpp/yt/string/format_string.h | 4 | ||||
-rw-r--r-- | library/cpp/yt/string/unittests/format_ut.cpp | 5 |
6 files changed, 251 insertions, 54 deletions
diff --git a/library/cpp/yt/string/format-inl.h b/library/cpp/yt/string/format-inl.h index 7f1f725966..e90d68bfe1 100644 --- a/library/cpp/yt/string/format-inl.h +++ b/library/cpp/yt/string/format-inl.h @@ -871,12 +871,43 @@ concept CFormatter = CInvocable<T, void(size_t, TStringBuilderBase*, TStringBuf) //////////////////////////////////////////////////////////////////////////////// template <CFormatter TFormatter> -void RunFormatter( +void RunFormatterAt( + const TFormatter& formatter, + size_t index, + TStringBuilderBase* builder, + TStringBuf spec, + bool singleQuotes, + bool doubleQuotes) +{ + // 'n' means 'nothing'; skip the argument. + if (!spec.Contains('n')) { + if (singleQuotes) { + builder->AppendChar('\''); + } + if (doubleQuotes) { + builder->AppendChar('"'); + } + + formatter(index, builder, spec); + + if (singleQuotes) { + builder->AppendChar('\''); + } + if (doubleQuotes) { + builder->AppendChar('"'); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <CFormatter TFormatter> +void RunFormatterFullScan( TStringBuilderBase* builder, TStringBuf format, - const TFormatter& formatter) + const TFormatter& formatter, + int argIndex = 0) { - size_t argIndex = 0; auto current = std::begin(format); auto end = std::end(format); while (true) { @@ -912,27 +943,13 @@ void RunFormatter( bool singleQuotes = false; bool doubleQuotes = false; + static constexpr TStringBuf conversionSpecifiers = + "diuoxXfFeEgGaAcspn"; + while ( argFormatEnd != end && - *argFormatEnd != GenericSpecSymbol && // value in generic format - *argFormatEnd != 'd' && // others are standard specifiers supported by printf - *argFormatEnd != 'i' && - *argFormatEnd != 'u' && - *argFormatEnd != 'o' && - *argFormatEnd != 'x' && - *argFormatEnd != 'X' && - *argFormatEnd != 'f' && - *argFormatEnd != 'F' && - *argFormatEnd != 'e' && - *argFormatEnd != 'E' && - *argFormatEnd != 'g' && - *argFormatEnd != 'G' && - *argFormatEnd != 'a' && - *argFormatEnd != 'A' && - *argFormatEnd != 'c' && - *argFormatEnd != 's' && - *argFormatEnd != 'p' && - *argFormatEnd != 'n') + *argFormatEnd != GenericSpecSymbol && // value in generic format + !conversionSpecifiers.Contains(*argFormatEnd)) // others are standard specifiers supported by printf { switch (*argFormatEnd) { case 'q': @@ -952,27 +969,162 @@ void RunFormatter( ++argFormatEnd; } - // 'n' means 'nothing'; skip the argument. - if (*argFormatBegin != 'n') { - // Format argument. - TStringBuf argFormat(argFormatBegin, argFormatEnd); - if (singleQuotes) { - builder->AppendChar('\''); - } - if (doubleQuotes) { - builder->AppendChar('"'); - } - formatter(argIndex++, builder, argFormat); - if (singleQuotes) { - builder->AppendChar('\''); - } - if (doubleQuotes) { - builder->AppendChar('"'); + RunFormatterAt( + formatter, + argIndex++, + builder, + TStringBuf{argFormatBegin, argFormatEnd}, + singleQuotes, + doubleQuotes); + + current = argFormatEnd; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <CFormatter TFormatter, class... TArgs> +void RunFormatter( + TStringBuilderBase* builder, + TBasicFormatString<TArgs...> formatString, + const TFormatter& formatter) +{ + auto isValidLocations = [] (const auto& t) { + return std::get<0>(t) != std::get<1>(t); + }; + // Generally marker is simply "%v" e.g. 2 symbols. + // We assume it is used to insert something for roughly 5 symbols + // of size. + builder->Preallocate(std::size(formatString.Get()) + sizeof...(TArgs) * (5 - 2)); + + // Empty marker positions -- fallback to the normal impl. + if constexpr (sizeof...(TArgs) != 0) { + if (!isValidLocations(formatString.Markers[0])) { + RunFormatterFullScan(builder, formatString.Get(), formatter); + return; + } + } else { + if (formatString.Escapes[0] == -2) { + RunFormatterFullScan(builder, formatString.Get(), formatter); + return; + } + } + + int escapesFound = 0; + int currentPos = 0; + + auto beginIt = formatString.Get().begin(); + auto size = formatString.Get().size(); + + const auto& [markers, escapes] = std::tie(formatString.Markers, formatString.Escapes); + + auto appendVerbatim = [&] (int offsetToEnd) { + builder->AppendString(TStringBuf{beginIt + currentPos, beginIt + offsetToEnd}); + }; + + auto processEscape = [&] () mutable { + // OpenMP doesn't support structured bindings :(. + auto escapePos = formatString.Escapes[escapesFound]; + + // Append everything that's present until %%. + appendVerbatim(escapePos); + + // Append '%'. + builder->AppendChar('%'); + + // Advance position to first '%' pos + 2. + currentPos = escapePos + 2; + }; + + int argIndex = 0; + + while(argIndex < std::ssize(markers)) { + auto [markerStart, markerEnd] = markers[argIndex]; + + if ( + escapes[escapesFound] != -1 && + escapes[escapesFound] - currentPos < markerStart - currentPos) + { + // Escape sequence is closer. + processEscape(); + ++escapesFound; + } else { + // Normal marker is closer. + + // Append everything that's present until marker start. + appendVerbatim(markerStart); + + // Parsing format string. + + // We skip '%' here since spec does not contain it. + auto spec = TStringBuf{beginIt + markerStart + 1, beginIt + markerEnd}; + bool singleQuotes = false; + bool doubleQuotes = false; + for (auto c : spec) { + if (c == 'q') { + singleQuotes = true; + } + if (c == 'Q') { + doubleQuotes = true; + } } + RunFormatterAt( + formatter, + argIndex, + builder, + spec, + singleQuotes, + doubleQuotes); + + // Advance position past marker's end. + currentPos = markerEnd; + ++argIndex; + continue; } - current = argFormatEnd; + // Check if the number of escapes we have found has exceeded the recorded limit + // e.g. we have to manually scan the rest of the formatString. + if (escapesFound == std::ssize(escapes)) { + break; + } } + + // Process remaining escapes. + while (escapesFound < std::ssize(escapes)) { + if (escapes[escapesFound] == -1) { + break; + } + + processEscape(); + ++escapesFound; + } + + // We either ran out of markers or reached the limit of allowed + // escape sequences. + + // Happy path: no limit on escape sequences. + if (escapesFound != std::ssize(escapes)) { + // Append whatever's left until the end. + appendVerbatim(size); + return; + } + + // Sad path: we have to fully parse remainder of format. + RunFormatterFullScan(builder, TStringBuf{beginIt + currentPos, beginIt + size}, formatter, argIndex); +} + +//////////////////////////////////////////////////////////////////////////////// + +// For benchmarking purposes. +template <class... TArgs> +TString FormatOld(TFormatString<TArgs...> format, TArgs&&... args) +{ + TStringBuilder builder; + if constexpr ((CFormattable<TArgs> && ...)) { + NYT::NDetail::TValueFormatter<0, TArgs...> formatter(args...); + NYT::NDetail::RunFormatterFullScan(&builder, format.Get(), formatter); + } + return builder.Flush(); } } // namespace NDetail @@ -991,7 +1143,7 @@ void Format(TStringBuilderBase* builder, TFormatString<TArgs...> format, TArgs&& // a second error. if constexpr ((CFormattable<TArgs> && ...)) { NYT::NDetail::TValueFormatter<0, TArgs...> formatter(args...); - NYT::NDetail::RunFormatter(builder, format.Get(), formatter); + NYT::NDetail::RunFormatter(builder, format, formatter); } } @@ -1012,7 +1164,7 @@ void FormatVector( const TVector& vec) { NYT::NDetail::TRangeFormatter<typename TVector::value_type> formatter(vec); - NYT::NDetail::RunFormatter(builder, format, formatter); + NYT::NDetail::RunFormatterFullScan(builder, format, formatter); } template <class TVector> @@ -1022,7 +1174,7 @@ void FormatVector( const TVector& vec) { NYT::NDetail::TRangeFormatter<typename TVector::value_type> formatter(vec); - NYT::NDetail::RunFormatter(builder, format, formatter); + NYT::NDetail::RunFormatterFullScan(builder, format, formatter); } template <size_t Length, class TVector> diff --git a/library/cpp/yt/string/format_analyser.h b/library/cpp/yt/string/format_analyser.h index 20eee60580..9f194144dc 100644 --- a/library/cpp/yt/string/format_analyser.h +++ b/library/cpp/yt/string/format_analyser.h @@ -4,6 +4,7 @@ #include <util/generic/strbuf.h> +#include <algorithm> #include <array> #include <string_view> @@ -14,13 +15,26 @@ namespace NYT::NDetail { struct TFormatAnalyser { public: + using TMarkerLocation = std::tuple<int, int>; + // NB(arkady-e1ppa): Location is considered invalid (e.g. not filled) + // if get<0> == get<1> == 0. + template <class... TArgs> + using TMarkerLocations = std::array<TMarkerLocation, sizeof...(TArgs)>; + // NB(arkady-e1ppa): We can't cover all of them since that would require + // dynamic storage for their coordinates and we do not have + // constexpr context large enough to deallocate dynamic memory at the + // correct time. Thus we store first 5 position and scanning afterwards + // is pessimized. |-1| is for no position at all. + // |-2| is used to imply runtime format. + using TEscapeLocations = std::array<int, 5>; + // TODO(arkady-e1ppa): Until clang-19 consteval functions // defined out of line produce symbols in rare cases // causing linker to crash. template <class... TArgs> - static consteval void ValidateFormat(std::string_view fmt) + static consteval auto AnalyzeFormat(std::string_view fmt) { - DoValidateFormat<TArgs...>(fmt); + return DoAnalyzeFormat<TArgs...>(fmt); } private: @@ -51,11 +65,16 @@ private: static constexpr char IntroductorySymbol = '%'; template <class... TArgs> - static consteval void DoValidateFormat(std::string_view format) + static consteval auto DoAnalyzeFormat(std::string_view format) { - std::array<std::string_view, sizeof...(TArgs)> markers = {}; std::array<TSpecifiers, sizeof...(TArgs)> specifiers{GetSpecifiers<TArgs>()...}; + TMarkerLocations<TArgs...> markerLocations = {}; + TEscapeLocations escapeLocations = {}; + std::ranges::fill(escapeLocations, -1); + + int escapesCount = 0; + int markerCount = 0; int currentMarkerStart = -1; @@ -81,12 +100,17 @@ private: throw "You may not terminate flag sequence other than %% with \'%\' symbol"; } // '%%' detected --- skip + if (escapesCount < std::ssize(escapeLocations)) { + escapeLocations[escapesCount] = currentMarkerStart; + ++escapesCount; + } + currentMarkerStart = -1; continue; } // We are inside of marker. - if (markerCount == std::ssize(markers)) { + if (markerCount == std::ssize(markerLocations)) { // Too many markers throw "Number of arguments supplied to format is smaller than the number of flag sequences"; } @@ -94,8 +118,8 @@ private: if (Contains(specifiers[markerCount].Conversion, symbol)) { // Marker has finished. - markers[markerCount] - = std::string_view(format.begin() + currentMarkerStart, index - currentMarkerStart + 1); + markerLocations[markerCount] + = std::tuple{currentMarkerStart, index + 1}; currentMarkerStart = -1; ++markerCount; @@ -110,16 +134,16 @@ private: if (currentMarkerStart != -1) { // Runaway marker. throw "Unterminated flag sequence detected; Use \'%%\' to type plain %"; - return; } - if (markerCount < std::ssize(markers)) { + if (markerCount < std::ssize(markerLocations)) { // Missing markers. throw "Number of arguments supplied to format is greater than the number of flag sequences"; } // TODO(arkady-e1ppa): Consider per-type verification // of markers. + return std::tuple(markerLocations, escapeLocations); } }; diff --git a/library/cpp/yt/string/format_arg.h b/library/cpp/yt/string/format_arg.h index 544e265766..4dc7be06e8 100644 --- a/library/cpp/yt/string/format_arg.h +++ b/library/cpp/yt/string/format_arg.h @@ -32,7 +32,7 @@ class TFormatArgBase public: // TODO(arkady-e1ppa): Consider more strict formatting rules. static constexpr std::array ConversionSpecifiers = { - 'v', '1', 'c', 's', 'd', 'i', 'o', + 'v', 'c', 's', 'd', 'i', 'o', 'x', 'X', 'u', 'f', 'F', 'e', 'E', 'a', 'A', 'g', 'G', 'n', 'p' }; diff --git a/library/cpp/yt/string/format_string-inl.h b/library/cpp/yt/string/format_string-inl.h index a692d9648d..67f9bad45c 100644 --- a/library/cpp/yt/string/format_string-inl.h +++ b/library/cpp/yt/string/format_string-inl.h @@ -4,6 +4,8 @@ #include "format_string.h" #endif +#include <algorithm> + namespace NYT { //////////////////////////////////////////////////////////////////////////////// @@ -15,8 +17,13 @@ consteval TBasicFormatString<TArgs...>::TBasicFormatString(const T& fmt) : Format_(fmt) { CheckFormattability(); -#if !defined(NDEBUG) && !defined(YT_DISABLE_FORMAT_STATIC_ANALYSIS) - NDetail::TFormatAnalyser::ValidateFormat<std::remove_cvref_t<TArgs>...>(Format_); +#if !defined(YT_DISABLE_FORMAT_STATIC_ANALYSIS) + std::tie(Markers, Escapes) = NDetail::TFormatAnalyser::AnalyzeFormat<std::remove_cvref_t<TArgs>...>(Format_); +#else + std::ranges::fill_n(std::ranges::begin(Escapes), 1, -1); + if constexpr (sizeof...(TArgs) != 0) { + std::ranges::fill_n(std::ranges::begin(Markers), 1, std::tuple{0, 0}); + } #endif } @@ -46,6 +53,11 @@ template <class... TArgs> TBasicFormatString<TArgs...>::TBasicFormatString(TRuntimeFormat fmt) : Format_(fmt.Get()) { + std::ranges::fill_n(std::ranges::begin(Escapes), 1, -1); + if constexpr (sizeof...(TArgs) != 0) { + std::ranges::fill_n(std::ranges::begin(Markers), 1, std::tuple{0, 0}); + } + // NB(arkady-e1ppa): StaticFormat performs the // formattability check of the args in a way // that provides more useful information diff --git a/library/cpp/yt/string/format_string.h b/library/cpp/yt/string/format_string.h index 786c2e39ed..1008ccb453 100644 --- a/library/cpp/yt/string/format_string.h +++ b/library/cpp/yt/string/format_string.h @@ -43,6 +43,10 @@ public: static consteval void CheckFormattability(); + // Data used for compile-time slicing of the format string. + NDetail::TFormatAnalyser::TMarkerLocations<TArgs...> Markers = {}; + NDetail::TFormatAnalyser::TEscapeLocations Escapes = {}; + private: std::string_view Format_; diff --git a/library/cpp/yt/string/unittests/format_ut.cpp b/library/cpp/yt/string/unittests/format_ut.cpp index ac3be99ad9..8aca8c8e29 100644 --- a/library/cpp/yt/string/unittests/format_ut.cpp +++ b/library/cpp/yt/string/unittests/format_ut.cpp @@ -326,6 +326,11 @@ TEST(TFormatTest, CustomFlagsCollectionTwoLevels) EXPECT_EQ(Format("%NRv", arr), toCollectionD2("RNP")); } +TEST(TFormatTest, ManyEscapes) +{ + EXPECT_EQ("a%b%c%d%e%f%g", Format("%v%%%v%%%v%%%v%%%v%%%v%%%g", "a", "b", "c", "d", "e", "f", "g")); +} + //////////////////////////////////////////////////////////////////////////////// } // namespace |