diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /util/string | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'util/string')
74 files changed, 8446 insertions, 0 deletions
diff --git a/util/string/ascii.cpp b/util/string/ascii.cpp new file mode 100644 index 0000000000..95edb95cc8 --- /dev/null +++ b/util/string/ascii.cpp @@ -0,0 +1,59 @@ +#include "ascii.h" + +#include <util/system/yassert.h> +#include <util/system/compat.h> + +// clang-format off +extern const unsigned char NPrivate::ASCII_CLASS[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x68, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x74, 0x74, 0x74, 0x74, 0x74, 0x74, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, + 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x80, 0x80, 0x80, 0x80, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern const unsigned char NPrivate::ASCII_LOWER[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, +}; +// clang-format on + +int AsciiCompareIgnoreCase(const TStringBuf s1, const TStringBuf s2) noexcept { + if (s1.size() <= s2.size()) { + if (int cmp = strnicmp(s1.data(), s2.data(), s1.size())) { + return cmp; + } + return (s1.size() < s2.size()) ? 
-1 : 0; + } + + Y_ASSERT(s1.size() > s2.size()); + if (int cmp = strnicmp(s1.data(), s2.data(), s2.size())) { + return cmp; + } + return 1; +} diff --git a/util/string/ascii.h b/util/string/ascii.h new file mode 100644 index 0000000000..10344384d3 --- /dev/null +++ b/util/string/ascii.h @@ -0,0 +1,247 @@ +#pragma once + +#include <util/system/defaults.h> +#include <util/system/compat.h> +#include <util/generic/string.h> + +// ctype.h-like functions, locale-independent: +// IsAscii{Upper,Lower,Digit,Alpha,Alnum,Space} and +// AsciiTo{Upper,Lower} +// +// standard functions from <ctype.h> are locale dependent, +// and cause undefined behavior when called on chars outside [0..127] range + +namespace NPrivate { + enum ECharClass { + CC_SPACE = 1, + CC_UPPER = 2, + CC_LOWER = 4, + CC_DIGIT = 8, + CC_ALPHA = 16, + CC_ALNUM = 32, + CC_ISHEX = 64, + CC_PUNCT = 128, + }; + + extern const unsigned char ASCII_CLASS[256]; + extern const unsigned char ASCII_LOWER[256]; + + template <class T> + struct TDereference { + using type = T; + }; + +#ifndef TSTRING_IS_STD_STRING + template <class String> + struct TDereference<TBasicCharRef<String>> { + using type = typename String::value_type; + }; +#endif + + template <class T> + using TDereferenced = typename TDereference<T>::type; + + template <class T> + bool RangeOk(T c) noexcept { + static_assert(std::is_integral<T>::value, "Integral type character expected"); + + if (sizeof(T) == 1) { + return true; + } + + return c >= static_cast<T>(0) && c <= static_cast<T>(127); + } + +#ifndef TSTRING_IS_STD_STRING + template <class String> + bool RangeOk(const TBasicCharRef<String>& c) { + return RangeOk(static_cast<typename String::value_type>(c)); + } +#endif +} + +constexpr bool IsAscii(const int c) noexcept { + return !(c & ~0x7f); +} + +inline bool IsAsciiSpace(unsigned char c) { + return ::NPrivate::ASCII_CLASS[c] & ::NPrivate::CC_SPACE; +} + +inline bool IsAsciiUpper(unsigned char c) { + return ::NPrivate::ASCII_CLASS[c] & 
::NPrivate::CC_UPPER; +} + +inline bool IsAsciiLower(unsigned char c) { + return ::NPrivate::ASCII_CLASS[c] & ::NPrivate::CC_LOWER; +} + +inline bool IsAsciiDigit(unsigned char c) { + return ::NPrivate::ASCII_CLASS[c] & ::NPrivate::CC_DIGIT; +} + +inline bool IsAsciiAlpha(unsigned char c) { + return ::NPrivate::ASCII_CLASS[c] & ::NPrivate::CC_ALPHA; +} + +inline bool IsAsciiAlnum(unsigned char c) { + return ::NPrivate::ASCII_CLASS[c] & ::NPrivate::CC_ALNUM; +} + +inline bool IsAsciiHex(unsigned char c) { + return ::NPrivate::ASCII_CLASS[c] & ::NPrivate::CC_ISHEX; +} + +inline bool IsAsciiPunct(unsigned char c) { + return ::NPrivate::ASCII_CLASS[c] & ::NPrivate::CC_PUNCT; +} + +// some overloads + +template <class T> +inline bool IsAsciiSpace(T c) { + return ::NPrivate::RangeOk(c) && IsAsciiSpace(static_cast<unsigned char>(c)); +} + +template <class T> +inline bool IsAsciiUpper(T c) { + return ::NPrivate::RangeOk(c) && IsAsciiUpper(static_cast<unsigned char>(c)); +} + +template <class T> +inline bool IsAsciiLower(T c) { + return ::NPrivate::RangeOk(c) && IsAsciiLower(static_cast<unsigned char>(c)); +} + +template <class T> +inline bool IsAsciiDigit(T c) { + return ::NPrivate::RangeOk(c) && IsAsciiDigit(static_cast<unsigned char>(c)); +} + +template <class T> +inline bool IsAsciiAlpha(T c) { + return ::NPrivate::RangeOk(c) && IsAsciiAlpha(static_cast<unsigned char>(c)); +} + +template <class T> +inline bool IsAsciiAlnum(T c) { + return ::NPrivate::RangeOk(c) && IsAsciiAlnum(static_cast<unsigned char>(c)); +} + +template <class T> +inline bool IsAsciiHex(T c) { + return ::NPrivate::RangeOk(c) && IsAsciiHex(static_cast<unsigned char>(c)); +} + +template <class T> +inline bool IsAsciiPunct(T c) { + return ::NPrivate::RangeOk(c) && IsAsciiPunct(static_cast<unsigned char>(c)); +} + +// some extra helpers +inline ui8 AsciiToLower(ui8 c) noexcept { + return ::NPrivate::ASCII_LOWER[c]; +} + +inline char AsciiToLower(char c) noexcept { + return (char)AsciiToLower((ui8)c); +} 
+ +template <class T> +inline ::NPrivate::TDereferenced<T> AsciiToLower(T c) noexcept { + return (c >= 0 && c <= 127) ? (::NPrivate::TDereferenced<T>)AsciiToLower((ui8)c) : c; +} + +template <class T> +inline ::NPrivate::TDereferenced<T> AsciiToUpper(T c) noexcept { + return IsAsciiLower(c) ? (c + ('A' - 'a')) : c; +} + +/** + * ASCII case-insensitive string comparison (for proper UTF8 strings + * case-insensitive comparison consider using @c library/cpp/charset). + * + * BUGS: Currently will NOT work properly with strings that contain + * 0-terminator character inside. See IGNIETFERRO-1641 for details. + * + * @return true iff @c s1 and @c s2 are case-insensitively equal. + */ +static inline bool AsciiEqualsIgnoreCase(const char* s1, const char* s2) noexcept { + return stricmp(s1, s2) == 0; +} + +/** + * ASCII case-insensitive string comparison (for proper UTF8 strings + * case-insensitive comparison consider using @c library/cpp/charset). + * + * BUGS: Currently will NOT work properly with strings that contain + * 0-terminator character inside. See IGNIETFERRO-1641 for details. + * + * @return true iff @c s1 and @c s2 are case-insensitively equal. + */ +static inline bool AsciiEqualsIgnoreCase(const TStringBuf s1, const TStringBuf s2) noexcept { + return (s1.size() == s2.size()) && strnicmp(s1.data(), s2.data(), s1.size()) == 0; +} + +/** + * ASCII case-insensitive string comparison (for proper UTF8 strings + * case-insensitive comparison consider using @c library/cpp/charset). + * + * BUGS: Currently will NOT work properly with strings that contain + * 0-terminator character inside. See IGNIETFERRO-1641 for details. + * + * @return 0 if strings are equal, negative if @c s1 < @c s2 + * and positive otherwise. + * (same value as @c stricmp does). 
+ */ +static inline int AsciiCompareIgnoreCase(const char* s1, const char* s2) noexcept { + return stricmp(s1, s2); +} + +/** + * ASCII case-insensitive string comparison (for proper UTF8 strings + * case-insensitive comparison consider using @c library/cpp/charset). + * + * BUGS: Currently will NOT work properly with strings that contain + * 0-terminator character inside. See IGNIETFERRO-1641 for details. + * + * @return + * - zero if strings are equal + * - negative if @c s1 < @c s2 + * - positive otherwise, + * similar to stricmp. + */ +Y_PURE_FUNCTION int AsciiCompareIgnoreCase(const TStringBuf s1, const TStringBuf s2) noexcept; + +/** + * ASCII case-sensitive string comparison (for proper UTF8 strings + * case-sensitive comparison consider using @c library/cpp/charset). + * + * BUGS: Currently will NOT work properly with strings that contain + * 0-terminator character inside. See IGNIETFERRO-1641 for details. + * + * @return true iff @c s2 is a case-sensitive prefix of @c s1. + */ +static inline bool AsciiHasPrefix(const TStringBuf s1, const TStringBuf s2) noexcept { + return (s1.size() >= s2.size()) && memcmp(s1.data(), s2.data(), s2.size()) == 0; +} + +/** + * ASCII case-insensitive string comparison (for proper UTF8 strings + * case-insensitive comparison consider using @c library/cpp/charset). + * + * @return true iff @c s2 is a case-insensitive prefix of @c s1. + */ +static inline bool AsciiHasPrefixIgnoreCase(const TStringBuf s1, const TStringBuf s2) noexcept { + return (s1.size() >= s2.size()) && strnicmp(s1.data(), s2.data(), s2.size()) == 0; +} + +/** + * ASCII case-insensitive string comparison (for proper UTF8 strings + * case-insensitive comparison consider using @c library/cpp/charset). + * + * @return true iff @c s2 is a case-insensitive suffix of @c s1. 
+ */ +static inline bool AsciiHasSuffixIgnoreCase(const TStringBuf s1, const TStringBuf s2) noexcept { + return (s1.size() >= s2.size()) && strnicmp((s1.data() + (s1.size() - s2.size())), s2.data(), s2.size()) == 0; +} diff --git a/util/string/ascii_ut.cpp b/util/string/ascii_ut.cpp new file mode 100644 index 0000000000..89069fee50 --- /dev/null +++ b/util/string/ascii_ut.cpp @@ -0,0 +1,98 @@ +#include "ascii.h" +#include <ctype.h> + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(TAsciiTest) { + Y_UNIT_TEST(TestAscii) { + UNIT_ASSERT(IsAsciiDigit('3')); + UNIT_ASSERT(!IsAsciiDigit('x')); + + UNIT_ASSERT(IsAsciiAlpha('r')); + UNIT_ASSERT(IsAsciiAlpha('R')); + UNIT_ASSERT(!IsAsciiAlpha('3')); + + UNIT_ASSERT_EQUAL(AsciiToLower('3'), '3'); + UNIT_ASSERT_EQUAL(AsciiToLower('A'), 'a'); + UNIT_ASSERT_EQUAL(AsciiToLower('a'), 'a'); + + UNIT_ASSERT_EQUAL(AsciiToUpper('3'), '3'); + UNIT_ASSERT_EQUAL(AsciiToUpper('A'), 'A'); + UNIT_ASSERT_EQUAL(AsciiToUpper('a'), 'A'); + + UNIT_ASSERT(IsAscii('a')); + UNIT_ASSERT(!IsAscii(-100)); + UNIT_ASSERT(!IsAscii(+200)); + UNIT_ASSERT(!IsAscii(int('a') + 256)); + + for (int i = 0; i < 128; ++i) { + UNIT_ASSERT_VALUES_EQUAL((bool)isxdigit(i), IsAsciiHex(i)); + UNIT_ASSERT_VALUES_EQUAL((bool)isspace(i), IsAsciiSpace((char)i)); + UNIT_ASSERT_VALUES_EQUAL((bool)isspace(i), IsAsciiSpace((char)i)); + UNIT_ASSERT_VALUES_EQUAL((bool)isalnum(i), IsAsciiAlnum((char)i)); + UNIT_ASSERT_VALUES_EQUAL((bool)isalpha(i), IsAsciiAlpha((char)i)); + UNIT_ASSERT_VALUES_EQUAL((bool)isupper(i), IsAsciiUpper((char)i)); + UNIT_ASSERT_VALUES_EQUAL((bool)islower(i), IsAsciiLower((char)i)); + UNIT_ASSERT_VALUES_EQUAL((bool)isdigit(i), IsAsciiDigit((char)i)); + UNIT_ASSERT_VALUES_EQUAL((bool)ispunct(i), IsAsciiPunct((char)i)); + } + } + + Y_UNIT_TEST(Test1) { + for (int i = 128; i < 1000; ++i) { + UNIT_ASSERT(!IsAsciiHex(i)); + UNIT_ASSERT(!IsAsciiSpace(i)); + UNIT_ASSERT(!IsAsciiAlnum(i)); + UNIT_ASSERT(!IsAsciiAlpha(i)); + 
UNIT_ASSERT(!IsAsciiUpper(i)); + UNIT_ASSERT(!IsAsciiLower(i)); + UNIT_ASSERT(!IsAsciiDigit(i)); + UNIT_ASSERT(!IsAsciiPunct(i)); + } + + for (int i = -1000; i < 0; ++i) { + UNIT_ASSERT(!IsAsciiHex(i)); + UNIT_ASSERT(!IsAsciiSpace(i)); + UNIT_ASSERT(!IsAsciiAlnum(i)); + UNIT_ASSERT(!IsAsciiAlpha(i)); + UNIT_ASSERT(!IsAsciiUpper(i)); + UNIT_ASSERT(!IsAsciiLower(i)); + UNIT_ASSERT(!IsAsciiDigit(i)); + UNIT_ASSERT(!IsAsciiPunct(i)); + } + } + + Y_UNIT_TEST(CompareTest) { + UNIT_ASSERT(AsciiEqualsIgnoreCase("qqq", "qQq")); + UNIT_ASSERT(AsciiEqualsIgnoreCase("qqq", TStringBuf("qQq"))); + TString qq = "qq"; + TString qQ = "qQ"; + UNIT_ASSERT(AsciiEqualsIgnoreCase(qq, qQ)); + + TString x = "qqqA"; + TString y = "qQqB"; + TString z = "qQnB"; + TString zz = "qQqq"; + TString zzz = "qQqqq"; + TStringBuf xs = TStringBuf(x.data(), 3); + TStringBuf ys = TStringBuf(y.data(), 3); + TStringBuf zs = TStringBuf(z.data(), 3); + UNIT_ASSERT(AsciiCompareIgnoreCase(xs, ys) == 0); + UNIT_ASSERT(AsciiCompareIgnoreCase(xs, zs) > 0); + UNIT_ASSERT(AsciiCompareIgnoreCase(xs, zz) < 0); + UNIT_ASSERT(AsciiCompareIgnoreCase(zzz, zz) > 0); + + UNIT_ASSERT(AsciiCompareIgnoreCase("qqQ", "qq") > 0); + UNIT_ASSERT(AsciiCompareIgnoreCase("qq", "qq") == 0); + + UNIT_ASSERT_EQUAL(AsciiHasPrefix("qweasd", "qwe"), true); + UNIT_ASSERT_EQUAL(AsciiHasPrefix("qweasd", "qWe"), false); + UNIT_ASSERT_EQUAL(AsciiHasPrefix("qweasd", "eWq"), false); + + UNIT_ASSERT_EQUAL(AsciiHasPrefixIgnoreCase("qweasd", "qWe"), true); + UNIT_ASSERT_EQUAL(AsciiHasPrefixIgnoreCase("qweasd", "eWq"), false); + + UNIT_ASSERT_EQUAL(AsciiHasSuffixIgnoreCase("qweasd", "asD"), true); + UNIT_ASSERT_EQUAL(AsciiHasSuffixIgnoreCase("qweasd", "ast"), false); + } +} diff --git a/util/string/benchmark/ascii/main.cpp b/util/string/benchmark/ascii/main.cpp new file mode 100644 index 0000000000..673047025d --- /dev/null +++ b/util/string/benchmark/ascii/main.cpp @@ -0,0 +1,123 @@ +#include <library/cpp/testing/benchmark/bench.h> + +#include 
<util/generic/xrange.h> +#include <util/string/ascii.h> +#include <util/generic/bitmap.h> +#include <util/generic/singleton.h> + +namespace { + struct TUpperMap: public TBitMap<256> { + inline TUpperMap() noexcept { + for (unsigned i = 'A'; i <= 'Z'; ++i) { + Set((ui8)i); + } + } + + inline char ToLower(char x) const noexcept { + return Get((ui8)x) ? x + ('a' - 'A') : x; + } + }; + + struct TToLowerLookup { + char Table[256]; + + TToLowerLookup() { + for (size_t i : xrange(256)) { + Table[i] = AsciiToLower(i); + } + } + + char ToLower(char x) const noexcept { + return Table[(ui8)x]; + } + }; +} + +static inline char FastAsciiToLower(char c) { + return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; +} + +static inline char FastAsciiToLower2(char c) { + return c + ('a' - 'A') * (int)(c >= 'A' && c <= 'Z'); +} + +Y_CPU_BENCHMARK(AsciiToLower, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + + for (int j = 0; j < 256; ++j) { + Y_DO_NOT_OPTIMIZE_AWAY(AsciiToLower(j)); + } + } +} + +Y_CPU_BENCHMARK(AsciiToLowerChar, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + + for (int j = 0; j < 256; ++j) { + Y_DO_NOT_OPTIMIZE_AWAY(AsciiToLower((char)j)); + } + } +} + +Y_CPU_BENCHMARK(FastAsciiToLower, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + + for (int j = 0; j < 256; ++j) { + Y_DO_NOT_OPTIMIZE_AWAY(FastAsciiToLower(j)); + } + } +} + +Y_CPU_BENCHMARK(FastAsciiToLower2, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + + for (int j = 0; j < 256; ++j) { + Y_DO_NOT_OPTIMIZE_AWAY(FastAsciiToLower2(j)); + } + } +} + +Y_CPU_BENCHMARK(BitMapAsciiToLower, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + + for (int j = 0; j < 256; ++j) { + Y_DO_NOT_OPTIMIZE_AWAY(Singleton<TUpperMap>()->ToLower(j)); + } + } +} + +Y_CPU_BENCHMARK(LookupAsciiToLower, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + + for (int j = 
0; j < 256; ++j) { + Y_DO_NOT_OPTIMIZE_AWAY(Singleton<TToLowerLookup>()->ToLower(j)); + } + } +} + +Y_CPU_BENCHMARK(LookupAsciiToLowerNoSingleton, iface) { + TToLowerLookup lookup; + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + + for (int j = 0; j < 256; ++j) { + Y_DO_NOT_OPTIMIZE_AWAY(lookup.ToLower(j)); + } + } +} + +Y_CPU_BENCHMARK(tolower, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + + for (int j = 0; j < 256; ++j) { + Y_DO_NOT_OPTIMIZE_AWAY(tolower(j)); + } + } +} diff --git a/util/string/benchmark/ascii/ya.make b/util/string/benchmark/ascii/ya.make new file mode 100644 index 0000000000..f95b9e0fa8 --- /dev/null +++ b/util/string/benchmark/ascii/ya.make @@ -0,0 +1,9 @@ +Y_BENCHMARK() + +OWNER(pg) + +SRCS( + main.cpp +) + +END() diff --git a/util/string/benchmark/cast/main.cpp b/util/string/benchmark/cast/main.cpp new file mode 100644 index 0000000000..f604712ab6 --- /dev/null +++ b/util/string/benchmark/cast/main.cpp @@ -0,0 +1,66 @@ +#include <library/cpp/testing/benchmark/bench.h> + +#include <util/string/cast.h> +#include <util/generic/xrange.h> + +char str1[] = "1"; +char str12[] = "12"; +char str1234[] = "1234"; +char str12345678[] = "12345678"; + +Y_CPU_BENCHMARK(Parse_1, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + Y_DO_NOT_OPTIMIZE_AWAY(FromString<ui32>(str1, 1)); + } +} + +Y_CPU_BENCHMARK(Parse_12, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + Y_DO_NOT_OPTIMIZE_AWAY(FromString<ui32>(str12, 2)); + } +} + +Y_CPU_BENCHMARK(Parse_1234, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + Y_DO_NOT_OPTIMIZE_AWAY(FromString<ui32>(str1234, 4)); + } +} + +Y_CPU_BENCHMARK(Parse_12345678, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + Y_DO_NOT_OPTIMIZE_AWAY(FromString<ui32>(str12345678, 8)); + } +} + +//atoi +Y_CPU_BENCHMARK(Atoi_1, iface) { + for (const auto i : 
xrange(iface.Iterations())) { + Y_UNUSED(i); + Y_DO_NOT_OPTIMIZE_AWAY(atoi(str1)); + } +} + +Y_CPU_BENCHMARK(Atoi_12, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + Y_DO_NOT_OPTIMIZE_AWAY(atoi(str12)); + } +} + +Y_CPU_BENCHMARK(Atoi_1234, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + Y_DO_NOT_OPTIMIZE_AWAY(atoi(str1234)); + } +} + +Y_CPU_BENCHMARK(Atoi_12345678, iface) { + for (const auto i : xrange(iface.Iterations())) { + Y_UNUSED(i); + Y_DO_NOT_OPTIMIZE_AWAY(atoi(str12345678)); + } +} diff --git a/util/string/benchmark/cast/ya.make b/util/string/benchmark/cast/ya.make new file mode 100644 index 0000000000..f95b9e0fa8 --- /dev/null +++ b/util/string/benchmark/cast/ya.make @@ -0,0 +1,9 @@ +Y_BENCHMARK() + +OWNER(pg) + +SRCS( + main.cpp +) + +END() diff --git a/util/string/benchmark/float_to_string/main.cpp b/util/string/benchmark/float_to_string/main.cpp new file mode 100644 index 0000000000..1c7c0684a3 --- /dev/null +++ b/util/string/benchmark/float_to_string/main.cpp @@ -0,0 +1,253 @@ +#include <library/cpp/testing/benchmark/bench.h> + +#include <util/generic/singleton.h> +#include <util/generic/vector.h> +#include <util/generic/xrange.h> +#include <util/generic/ymath.h> +#include <util/random/fast.h> +#include <util/string/cast.h> +#include <util/string/printf.h> + +#include <limits> + +#include <cmath> + +/* Please be careful before making any decisions based on this benchmark. + * + * Only `Sprintf("%.<decimals>f", x)` and `FloatToString(x, PREC_POINT_DIGITS, decimals)` produce + * equal results in the general case. However, results for cases when x \in [0, 1) must be equal for + * both `Sprintf` and `FloatToString`. 
+ * + * Read more about formatting in STL [1, 2] and Yandex Util formatting [3] + * + * [1] http://www.cplusplus.com/reference/cstdio/printf/ + * [2] http://en.cppreference.com/w/c/io/fprintf + * [3] https://a.yandex-team.ru/arc/trunk/arcadia/util/string/cast.h?rev=2432660#L29 + */ + +namespace { + template <typename T> + struct TExample { + T Value{}; + int DigitsCount{}; + }; + + template <typename T, size_t N> + struct TExamplesHolder { + TVector<TExample<T>> Examples; + + TExamplesHolder() + : Examples(N) + { + TFastRng<ui64> prng{N * sizeof(T) * 42}; + for (auto& x : Examples) { + x.Value = prng.GenRandReal4() + prng.Uniform(Max<ui16>()); + x.DigitsCount = prng.Uniform(std::numeric_limits<T>::max_digits10 + 1); + } + } + }; + + template <typename T, size_t N> + struct TNearZeroExamplesHolder { + TVector<TExample<T>> Examples; + + TNearZeroExamplesHolder() + : Examples(N) + { + TFastRng<ui64> prng{N * sizeof(T) * 42}; + for (auto& x : Examples) { + x.Value = prng.GenRandReal4(); + x.DigitsCount = prng.Uniform(std::numeric_limits<T>::max_digits10 + 1); + } + } + }; +} + +static const char* FORMAT_FIXED[] = { + "%.0f", + "%.1f", + "%.2f", + "%.3f", + "%.4f", + "%.5f", + "%.6f", + "%.7f", + "%.8f", + "%.9f", + "%.10f", + "%.11f", + "%.12f", + "%.13f", + "%.14f", + "%.15f", + "%.16f", + "%.17f", +}; + +static const char* FORMAT_SIGNIFICANT[] = { + "%.0g", + "%.1g", + "%.2g", + "%.3g", + "%.4g", + "%.5g", + "%.6g", + "%.7g", + "%.8g", + "%.9g", + "%.10g", + "%.11g", + "%.12g", + "%.13g", + "%.14g", + "%.15g", + "%.16g", + "%.17g", +}; + +#define DEFINE_BENCHMARK(type, count) \ + Y_CPU_BENCHMARK(SprintfAuto_##type##_##count, iface) { \ + const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + /* this is in fact equal to Sprintf("%.6f", e.Value) and that is why it is faster */ \ + /* than FloatToString(e.Value) */ \ + 
Y_DO_NOT_OPTIMIZE_AWAY(Sprintf("%f", e.Value)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(FloatToStringAuto_##type##_##count, iface) { \ + const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(FloatToString(e.Value)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(SprintfFixed_##type##_##count, iface) { \ + const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(Sprintf(FORMAT_FIXED[e.DigitsCount], e.Value)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(FloatToStringFixed_##type##_##count, iface) { \ + const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(FloatToString(e.Value, PREC_NDIGITS, e.DigitsCount)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(SprintfSignificant_##type##_##count, iface) { \ + const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(Sprintf(FORMAT_SIGNIFICANT[e.DigitsCount], e.Value)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(FloatToStringSignificant_##type##_##count, iface) { \ + const auto& examples = Default<TExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(FloatToString(e.Value, PREC_POINT_DIGITS, e.DigitsCount)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(NearZeroSprintfAuto_##type##_##count, iface) { \ + const auto& examples = Default<TNearZeroExamplesHolder<type, count>>().Examples; \ + for (const auto i : 
xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + /* this is in fact equal to Sprintf("%.6f", e.Value) and that is why it is faster */ \ + /* than FloatToString(e.Value) */ \ + Y_DO_NOT_OPTIMIZE_AWAY(Sprintf("%f", e.Value)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(NearZeroFloatToStringAuto_##type##_##count, iface) { \ + const auto& examples = Default<TNearZeroExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(FloatToString(e.Value)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(NearZeroSprintfFixed_##type##_##count, iface) { \ + const auto& examples = Default<TNearZeroExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(Sprintf(FORMAT_FIXED[e.DigitsCount], e.Value)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(NearZeroFloatToStringFixed_##type##_##count, iface) { \ + const auto& examples = Default<TNearZeroExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(FloatToString(e.Value, PREC_NDIGITS, e.DigitsCount)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(NearZeroSprintfSignificant_##type##_##count, iface) { \ + const auto& examples = Default<TNearZeroExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(Sprintf(FORMAT_SIGNIFICANT[e.DigitsCount], e.Value)); \ + } \ + } \ + } \ + \ + Y_CPU_BENCHMARK(NearZeroFloatToStringSignificant_##type##_##count, iface) { \ + const auto& examples = Default<TNearZeroExamplesHolder<type, count>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + 
Y_DO_NOT_OPTIMIZE_AWAY(FloatToString(e.Value, PREC_POINT_DIGITS, e.DigitsCount)); \ + } \ + } \ + } + +DEFINE_BENCHMARK(float, 1); +DEFINE_BENCHMARK(float, 2); +DEFINE_BENCHMARK(float, 4); +DEFINE_BENCHMARK(float, 8); +DEFINE_BENCHMARK(float, 16); +DEFINE_BENCHMARK(float, 32); +DEFINE_BENCHMARK(float, 64); +DEFINE_BENCHMARK(float, 128); +DEFINE_BENCHMARK(float, 256); + +DEFINE_BENCHMARK(double, 1); +DEFINE_BENCHMARK(double, 2); +DEFINE_BENCHMARK(double, 4); +DEFINE_BENCHMARK(double, 8); +DEFINE_BENCHMARK(double, 16); +DEFINE_BENCHMARK(double, 32); +DEFINE_BENCHMARK(double, 64); +DEFINE_BENCHMARK(double, 128); +DEFINE_BENCHMARK(double, 256); + +#undef DEFINE_BENCHMARK diff --git a/util/string/benchmark/float_to_string/metrics/main.py b/util/string/benchmark/float_to_string/metrics/main.py new file mode 100644 index 0000000000..e9d4b7ac1d --- /dev/null +++ b/util/string/benchmark/float_to_string/metrics/main.py @@ -0,0 +1,5 @@ +import yatest.common as yc + + +def test_export_metrics(metrics): + metrics.set_benchmark(yc.execute_benchmark('util/string/benchmark/float_to_string/float_to_string', threads=8)) diff --git a/util/string/benchmark/float_to_string/metrics/ya.make b/util/string/benchmark/float_to_string/metrics/ya.make new file mode 100644 index 0000000000..4b8c4cc07d --- /dev/null +++ b/util/string/benchmark/float_to_string/metrics/ya.make @@ -0,0 +1,21 @@ +OWNER( + yazevnul + g:util +) +SUBSCRIBER(g:util-subscribers) + +PY2TEST() + +SIZE(LARGE) + +TAG( + ya:force_sandbox + sb:intel_e5_2660v1 + ya:fat +) + +TEST_SRCS(main.py) + +DEPENDS(util/string/benchmark/float_to_string) + +END() diff --git a/util/string/benchmark/float_to_string/ya.make b/util/string/benchmark/float_to_string/ya.make new file mode 100644 index 0000000000..8136ad34f0 --- /dev/null +++ b/util/string/benchmark/float_to_string/ya.make @@ -0,0 +1,12 @@ +OWNER(yazevnul) + +Y_BENCHMARK() + +# to minimize allocations overhead +ALLOCATOR(B) + +SRCS( + main.cpp +) + +END() diff --git 
a/util/string/benchmark/join/main.cpp b/util/string/benchmark/join/main.cpp new file mode 100644 index 0000000000..1a8633d3a8 --- /dev/null +++ b/util/string/benchmark/join/main.cpp @@ -0,0 +1,95 @@ +#include <library/cpp/testing/benchmark/bench.h> + +#include <util/generic/function.h> +#include <util/generic/singleton.h> +#include <util/generic/vector.h> +#include <util/generic/xrange.h> +#include <util/random/fast.h> +#include <util/string/cast.h> +#include <util/string/join.h> + +namespace { + // This class assigns random values to variadic lists of variables of different types. + // It can be used to randomize a tuple via Apply() (arcadia version of std::apply). + class TRandomizer { + public: + TRandomizer(ui64 seed) + : Prng(seed) + { + } + + void Randomize(ui16& i) { + i = static_cast<ui16>(Prng.GenRand()); + } + + void Randomize(ui32& i) { + i = static_cast<ui32>(Prng.GenRand()); + } + + void Randomize(double& d) { + d = Prng.GenRandReal4() + Prng.Uniform(Max<ui16>()); + } + + void Randomize(TString& s) { + s = ::ToString(Prng.GenRand()); + } + + template <typename T, typename... TArgs> + void Randomize(T& t, TArgs&... args) { + Randomize(t); + Randomize(args...); + } + + private: + TFastRng<ui64> Prng; + }; + + template <size_t N, typename... T> + struct TExamplesHolder { + using TExamples = TVector<std::tuple<T...>>; + TExamples Examples; + + TExamplesHolder() + : Examples(N) + { + TRandomizer r{N * sizeof(typename TExamples::value_type) * 42}; + for (auto& x : Examples) { + Apply([&r](T&... t) { r.Randomize(t...); }, x); + } + } + }; + + template <typename... TArgs> + TString JoinTuple(std::tuple<TArgs...> t) { + return Apply([](TArgs... x) -> TString { return Join("-", x...); }, t); + } +} + +#define DEFINE_BENCHMARK(count, types, ...) 
\ + Y_CPU_BENCHMARK(Join_##count##_##types, iface) { \ + const auto& examples = Default<TExamplesHolder<count, __VA_ARGS__>>().Examples; \ + for (const auto i : xrange(iface.Iterations())) { \ + Y_UNUSED(i); \ + for (const auto e : examples) { \ + Y_DO_NOT_OPTIMIZE_AWAY(JoinTuple(e)); \ + } \ + } \ + } + +DEFINE_BENCHMARK(100, SS, TString, TString); +DEFINE_BENCHMARK(100, SSS, TString, TString, TString); +DEFINE_BENCHMARK(100, SSSSS, TString, TString, TString, TString, TString); + +DEFINE_BENCHMARK(100, ss, ui16, ui16); +DEFINE_BENCHMARK(100, SsS, TString, ui16, TString); +DEFINE_BENCHMARK(100, SsSsS, TString, ui16, TString, ui16, TString); + +DEFINE_BENCHMARK(100, ii, ui32, ui32); +DEFINE_BENCHMARK(100, SiS, TString, ui32, TString); +DEFINE_BENCHMARK(100, SiSiS, TString, ui32, TString, ui32, TString); + +DEFINE_BENCHMARK(100, dd, double, double); +DEFINE_BENCHMARK(100, SdS, TString, double, TString); +DEFINE_BENCHMARK(100, SdSdS, TString, double, TString, double, TString); + +#undef DEFINE_BENCHMARK diff --git a/util/string/benchmark/join/metrics/main.py b/util/string/benchmark/join/metrics/main.py new file mode 100644 index 0000000000..1ed5014808 --- /dev/null +++ b/util/string/benchmark/join/metrics/main.py @@ -0,0 +1,5 @@ +import yatest.common as yc + + +def test_export_metrics(metrics): + metrics.set_benchmark(yc.execute_benchmark('util/string/benchmark/join/join', threads=8)) diff --git a/util/string/benchmark/join/metrics/ya.make b/util/string/benchmark/join/metrics/ya.make new file mode 100644 index 0000000000..08ff3a149f --- /dev/null +++ b/util/string/benchmark/join/metrics/ya.make @@ -0,0 +1,21 @@ +OWNER( + salmin + g:util +) +SUBSCRIBER(g:util-subscribers) + +PY2TEST() + +SIZE(LARGE) + +TAG( + ya:force_sandbox + sb:intel_e5_2660v1 + ya:fat +) + +TEST_SRCS(main.py) + +DEPENDS(util/string/benchmark/join) + +END() diff --git a/util/string/benchmark/join/ya.make b/util/string/benchmark/join/ya.make new file mode 100644 index 0000000000..dfcc1d264e --- 
/dev/null +++ b/util/string/benchmark/join/ya.make @@ -0,0 +1,13 @@ +Y_BENCHMARK() + +OWNER( + salmin + g:util +) +SUBSCRIBER(g:util-subscribers) + +SRCS( + main.cpp +) + +END() diff --git a/util/string/benchmark/subst_global/main.cpp b/util/string/benchmark/subst_global/main.cpp new file mode 100644 index 0000000000..e0decfa042 --- /dev/null +++ b/util/string/benchmark/subst_global/main.cpp @@ -0,0 +1,203 @@ +#include <library/cpp/testing/benchmark/bench.h> + +#include <util/generic/cast.h> +#include <util/generic/singleton.h> +#include <util/generic/string.h> +#include <util/generic/xrange.h> +#include <util/random/fast.h> +#include <util/string/cast.h> +#include <util/string/subst.h> + +namespace { + template <size_t N, char What, char With> + struct TNoMatches { + enum : char { + WHAT = What, + WITH = With + }; + TString Str; + + TNoMatches() { + for (const auto dummy : xrange(N)) { + Y_UNUSED(dummy); + Str += WHAT + 1; + } + } + }; + + template <size_t N, char What, char With> + struct TOneMatchInTheBeginning { + enum : char { + WHAT = What, + WITH = With + }; + TString Str; + + TOneMatchInTheBeginning() { + if (!N) { + return; + } + + Str += WHAT; + if (N > 1) { + for (const auto dummy : xrange(N - 1)) { + Y_UNUSED(dummy); + Str += WHAT + 1; + } + } + } + }; + + template <size_t N, char What, char With> + struct TOneMatchInTheEnd { + enum : char { + WHAT = What, + WITH = With + }; + TString Str; + + TOneMatchInTheEnd() { + if (!N) { + return; + } + + if (N > 1) { + for (const auto dummy : xrange(N - 1)) { + Y_UNUSED(dummy); + Str += WHAT + 1; + } + } + Str += WHAT; + } + }; + + template <size_t N, char What, char With> + struct TOneMatchInTheMiddle { + enum : char { + WHAT = What, + WITH = With + }; + TString Str; + + TOneMatchInTheMiddle() { + if (!N) { + return; + } + + for (size_t i = 0; i < N / 2; ++i) { + Str += WHAT + 1; + } + Str += WHAT; + for (; Str.size() < N;) { + Str += WHAT + 1; + } + } + }; + + template <size_t N, char What, char With> + struct 
TFirstHalfMatches { + enum : char { + WHAT = What, + WITH = With + }; + TString Str; + + TFirstHalfMatches() { + for (size_t i = 0; i < N / 2; ++i) { + Str += WHAT; + } + for (; Str.size() != N;) { + Str += WHAT + 1; + } + } + }; + + template <size_t N, char What, char With> + struct TSecondHalfMatches { + enum : char { + WHAT = What, + WITH = With + }; + TString Str; + + TSecondHalfMatches() { + for (size_t i = 0; i < N / 2; ++i) { + Str += WHAT + 1; + } + for (; Str.size() != N;) { + Str += WHAT; + } + } + }; + + template <size_t N, size_t K, char What, char With> + struct TEveryKth { + enum : char { + WHAT = What, + WITH = With + }; + TString Str; + + TEveryKth() { + TFastRng<ui64> prng{N * K * 101}; + for (size_t i = 0; i < N; ++i) { + Str += (prng() % K) ? (WHAT + 1) : WHAT; + } + } + }; +} + +#define DEFINE_BENCHMARK(type, N) \ + Y_CPU_BENCHMARK(type##_##N, i) { \ + using D = T##type<N, 'a', 'z'>; \ + const auto& str = Default<D>().Str; \ + for (const auto dummy : xrange(i.Iterations())) { \ + Y_UNUSED(dummy); \ + auto s = str; \ + NBench::Escape(s.data()); \ + Y_DO_NOT_OPTIMIZE_AWAY(SubstGlobal(s, ToUnderlying(D::WHAT), ToUnderlying(D::WITH))); \ + NBench::Clobber(); \ + } \ + } + +#define DEFINE_RNG_BENCHMARK(N, K) \ + Y_CPU_BENCHMARK(Random_##N##_##K, i) { \ + using D = TEveryKth<N, K, 'a', 'z'>; \ + const auto& str = Default<D>().Str; \ + for (const auto dummy : xrange(i.Iterations())) { \ + Y_UNUSED(dummy); \ + auto s = str; \ + NBench::Escape(s.data()); \ + Y_DO_NOT_OPTIMIZE_AWAY(SubstGlobal(s, ToUnderlying(D::WHAT), ToUnderlying(D::WITH))); \ + NBench::Clobber(); \ + } \ + } + +DEFINE_BENCHMARK(NoMatches, 0) +DEFINE_BENCHMARK(NoMatches, 1) +DEFINE_BENCHMARK(NoMatches, 128) +DEFINE_BENCHMARK(NoMatches, 4096) +DEFINE_BENCHMARK(OneMatchInTheBeginning, 1) +DEFINE_BENCHMARK(OneMatchInTheBeginning, 16) +DEFINE_BENCHMARK(OneMatchInTheBeginning, 128) +DEFINE_BENCHMARK(OneMatchInTheBeginning, 4096) +DEFINE_BENCHMARK(OneMatchInTheEnd, 16) 
+DEFINE_BENCHMARK(OneMatchInTheEnd, 128) +DEFINE_BENCHMARK(OneMatchInTheEnd, 4096) +DEFINE_BENCHMARK(OneMatchInTheMiddle, 16) +DEFINE_BENCHMARK(OneMatchInTheMiddle, 128) +DEFINE_BENCHMARK(OneMatchInTheMiddle, 4096) +DEFINE_BENCHMARK(FirstHalfMatches, 16) +DEFINE_BENCHMARK(FirstHalfMatches, 128) +DEFINE_BENCHMARK(FirstHalfMatches, 4096) +DEFINE_BENCHMARK(SecondHalfMatches, 16) +DEFINE_BENCHMARK(SecondHalfMatches, 128) +DEFINE_BENCHMARK(SecondHalfMatches, 4096) + +DEFINE_RNG_BENCHMARK(4096, 1) +DEFINE_RNG_BENCHMARK(4096, 2) +DEFINE_RNG_BENCHMARK(4096, 3) +DEFINE_RNG_BENCHMARK(4096, 4) +DEFINE_RNG_BENCHMARK(4096, 10) +DEFINE_RNG_BENCHMARK(4096, 32) +DEFINE_RNG_BENCHMARK(4096, 100) diff --git a/util/string/benchmark/subst_global/metrics/main.py b/util/string/benchmark/subst_global/metrics/main.py new file mode 100644 index 0000000000..62f2f3d76d --- /dev/null +++ b/util/string/benchmark/subst_global/metrics/main.py @@ -0,0 +1,5 @@ +import yatest.common as yc + + +def test_export_metrics(metrics): + metrics.set_benchmark(yc.execute_benchmark('util/string/benchmark/subst_global/subst_global', threads=8)) diff --git a/util/string/benchmark/subst_global/metrics/ya.make b/util/string/benchmark/subst_global/metrics/ya.make new file mode 100644 index 0000000000..d8c30ad460 --- /dev/null +++ b/util/string/benchmark/subst_global/metrics/ya.make @@ -0,0 +1,21 @@ +OWNER( + yazevnul + g:util +) +SUBSCRIBER(g:util-subscribers) + +PY2TEST() + +SIZE(LARGE) + +TAG( + ya:force_sandbox + sb:intel_e5_2660v1 + ya:fat +) + +TEST_SRCS(main.py) + +DEPENDS(util/string/benchmark/subst_global) + +END() diff --git a/util/string/benchmark/subst_global/ya.make b/util/string/benchmark/subst_global/ya.make new file mode 100644 index 0000000000..8136ad34f0 --- /dev/null +++ b/util/string/benchmark/subst_global/ya.make @@ -0,0 +1,12 @@ +OWNER(yazevnul) + +Y_BENCHMARK() + +# to minimize allocations overhead +ALLOCATOR(B) + +SRCS( + main.cpp +) + +END() diff --git a/util/string/benchmark/ya.make 
b/util/string/benchmark/ya.make new file mode 100644 index 0000000000..266b53c7b3 --- /dev/null +++ b/util/string/benchmark/ya.make @@ -0,0 +1,16 @@ +OWNER( + g:util + yazevnul +) +SUBSCRIBER(g:util-subscribers) + +RECURSE( + ascii + cast + float_to_string + float_to_string/metrics + join + join/metrics + subst_global + subst_global/metrics +) diff --git a/util/string/builder.cpp b/util/string/builder.cpp new file mode 100644 index 0000000000..a3821d3399 --- /dev/null +++ b/util/string/builder.cpp @@ -0,0 +1,8 @@ +#include "builder.h" + +#include <util/stream/output.h> + +template <> +void Out<TStringBuilder>(IOutputStream& os, const TStringBuilder& sb) { + os << static_cast<const TString&>(sb); +} diff --git a/util/string/builder.h b/util/string/builder.h new file mode 100644 index 0000000000..7b54821151 --- /dev/null +++ b/util/string/builder.h @@ -0,0 +1,39 @@ +#pragma once + +#include <util/stream/str.h> +#include <utility> +#include <util/generic/string.h> + +namespace NPrivateStringBuilder { + class TStringBuilder: public TString { + public: + inline TStringBuilder() + : Out(*this) + { + } + + TStringBuilder(TStringBuilder&& rhs) + : TString(std::move(rhs)) + , Out(*this) + { + } + + TStringOutput Out; + }; + + template <class T> + static inline TStringBuilder& operator<<(TStringBuilder& builder, const T& t) { + builder.Out << t; + + return builder; + } + + template <class T> + static inline TStringBuilder&& operator<<(TStringBuilder&& builder, const T& t) { + builder.Out << t; + + return std::move(builder); + } +} + +using TStringBuilder = NPrivateStringBuilder::TStringBuilder; diff --git a/util/string/builder_ut.cpp b/util/string/builder_ut.cpp new file mode 100644 index 0000000000..22def683ec --- /dev/null +++ b/util/string/builder_ut.cpp @@ -0,0 +1,63 @@ +#include "builder.h" + +#include <library/cpp/testing/unittest/registar.h> + +static void TestEquals(const TString& expected, const TString& actual) { + UNIT_ASSERT_VALUES_EQUAL(expected, actual); +} + 
+struct TClassWithStreamOperator { + ui32 Id; + TString Name; + + TClassWithStreamOperator(ui32 id, const TString& name) + : Id(id) + , Name(name) + { + } +}; + +IOutputStream& operator<<(IOutputStream& out, const TClassWithStreamOperator& value) { + return out << value.Id << " " << value.Name; +} + +Y_UNIT_TEST_SUITE(TStringBuilderTest) { + Y_UNIT_TEST(TestStringBuilder) { + TestEquals("", TStringBuilder()); + TestEquals("a", TStringBuilder() << "a"); + TestEquals("a1", TStringBuilder() << "a" << 1); + TestEquals("value: 123 name", TStringBuilder() << "value: " << TClassWithStreamOperator(123, "name")); + } + + Y_UNIT_TEST(TestStringBuilderOut) { + TString s; + TStringOutput out(s); + TStringBuilder sb; + sb << "a"; + out << sb; + TestEquals("a", s); + } + + Y_UNIT_TEST(TestStringBuilderRValue) { + struct TRValueAcceptTester { + static bool IsRValue(const TString&) { + return false; + } + + static bool IsRValue(TString&&) { + return true; + } + }; + + UNIT_ASSERT(TRValueAcceptTester::IsRValue(TStringBuilder() << "a" << 1)); + + TStringBuilder b; + UNIT_ASSERT(!TRValueAcceptTester::IsRValue(b << "a" << 1)); + TStringBuilder b2; + UNIT_ASSERT(!TRValueAcceptTester::IsRValue(b2 << "a" << 1 << TStringBuilder() << "a")); + UNIT_ASSERT_VALUES_EQUAL("a1a", b2); + + UNIT_ASSERT(TRValueAcceptTester::IsRValue(TStringBuilder() << b2)); + UNIT_ASSERT_VALUES_EQUAL("a1a", TStringBuilder() << b2); + } +} diff --git a/util/string/cast.cpp b/util/string/cast.cpp new file mode 100644 index 0000000000..aa1e65a8e9 --- /dev/null +++ b/util/string/cast.cpp @@ -0,0 +1,844 @@ +#include <util/system/defaults.h> + +#if defined(_freebsd_) && !defined(__LONG_LONG_SUPPORTED) + #define __LONG_LONG_SUPPORTED +#endif + +#include <cstdio> +#include <string> +#include <cmath> + +#include <util/string/type.h> +#include <util/string/cast.h> +#include <util/string/escape.h> + +#include <contrib/libs/double-conversion/double-conversion.h> + +#include <util/generic/string.h> +#include 
<util/system/yassert.h> +#include <util/generic/yexception.h> +#include <util/generic/typetraits.h> +#include <util/generic/ylimits.h> +#include <util/generic/singleton.h> +#include <util/generic/utility.h> + +using double_conversion::DoubleToStringConverter; +using double_conversion::StringBuilder; +using double_conversion::StringToDoubleConverter; + +/* + * ------------------------------ formatters ------------------------------ + */ + +namespace { + constexpr char IntToChar[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + + static_assert(Y_ARRAY_SIZE(IntToChar) == 16, "expect Y_ARRAY_SIZE(IntToChar) == 16"); + + // clang-format off + constexpr int LetterToIntMap[] = { + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 20, 20, + 20, 20, 20, 20, 20, 10, 11, 12, 13, 14, + 15, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 10, 11, 12, + 13, 14, 15, + }; + // clang-format on + + template <class T> + std::enable_if_t<std::is_signed<T>::value, std::make_unsigned_t<T>> NegateNegativeSigned(T value) noexcept { + return std::make_unsigned_t<T>(-(value + 1)) + std::make_unsigned_t<T>(1); + } + + template <class T> + std::enable_if_t<std::is_unsigned<T>::value, std::make_unsigned_t<T>> NegateNegativeSigned(T) noexcept { + Y_UNREACHABLE(); + } + + template <class T> + std::make_signed_t<T> NegatePositiveSigned(T value) noexcept { + return value > 0 ? 
(-std::make_signed_t<T>(value - 1) - 1) : 0; + } + + template <class T, unsigned base, class TChar> + struct TBasicIntFormatter { + static_assert(1 < base && base < 17, "expect 1 < base && base < 17"); + static_assert(std::is_unsigned<T>::value, "TBasicIntFormatter can only handle unsigned integers."); + + static inline size_t Format(T value, TChar* buf, size_t len) { + Y_ENSURE(len, TStringBuf("zero length")); + + TChar* tmp = buf; + + do { + // divide only once, do not use mod + const T nextVal = static_cast<T>(value / base); + *tmp++ = IntToChar[base == 2 || base == 4 || base == 8 || base == 16 ? value & (base - 1) : value - base * nextVal]; + value = nextVal; + } while (value && --len); + + Y_ENSURE(!value, TStringBuf("not enough room in buffer")); + + const size_t result = tmp - buf; + + --tmp; + + while (buf < tmp) { + TChar c = *buf; + + *buf = *tmp; + *tmp = c; + ++buf; + --tmp; + } + + return result; + } + }; + + template <class T, unsigned base, class TChar> + struct TIntFormatter { + static_assert(1 < base && base < 17, "expect 1 < base && base < 17"); + static_assert(std::is_integral<T>::value, "T must be an integral type."); + + static inline size_t Format(T value, TChar* buf, size_t len) { + using TUFmt = TBasicIntFormatter<std::make_unsigned_t<T>, base, TChar>; + + if (std::is_signed<T>::value && value < 0) { + Y_ENSURE(len >= 2, TStringBuf("not enough room in buffer")); + + *buf = '-'; + + return 1 + TUFmt::Format(NegateNegativeSigned(value), buf + 1, len - 1); + } + + return TUFmt::Format(value, buf, len); + } + }; + + template <class T> + struct TFltModifiers; + + template <class T, int base, class TChar> + Y_NO_INLINE size_t FormatInt(T value, TChar* buf, size_t len) { + return TIntFormatter<T, base, TChar>::Format(value, buf, len); + } + + template <class T> + inline size_t FormatFlt(T t, char* buf, size_t len) { + const int ret = snprintf(buf, len, TFltModifiers<T>::ModifierWrite, t); + + Y_ENSURE(ret >= 0 && (size_t)ret <= len, 
TStringBuf("cannot format float")); + + return (size_t)ret; + } + + enum EParseStatus { + PS_OK = 0, + PS_EMPTY_STRING, + PS_PLUS_STRING, + PS_MINUS_STRING, + PS_BAD_SYMBOL, + PS_OVERFLOW, + }; + + constexpr ui8 SAFE_LENS[4][17] = { + {0, 0, 7, 5, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1}, + {0, 0, 15, 10, 7, 6, 6, 5, 5, 5, 4, 4, 4, 4, 4, 4, 3}, + {0, 0, 31, 20, 15, 13, 12, 11, 10, 10, 9, 9, 8, 8, 8, 8, 7}, + {0, 0, 63, 40, 31, 27, 24, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15}, + }; + + inline constexpr ui8 ConstLog2(ui8 x) noexcept { + return x == 1 ? 0 : 1 + ConstLog2(x / 2); + } + + template <unsigned BASE, class TChar, class T> + inline std::enable_if_t<(BASE > 10), bool> CharToDigit(TChar c, T* digit) noexcept { + unsigned uc = c; + + if (uc >= Y_ARRAY_SIZE(LetterToIntMap)) { + return false; + } + + *digit = LetterToIntMap[uc]; + + return *digit < BASE; + } + + template <unsigned BASE, class TChar, class T> + inline std::enable_if_t<(BASE <= 10), bool> CharToDigit(TChar c, T* digit) noexcept { + return (c >= '0') && ((*digit = (c - '0')) < BASE); + } + + template <class T, unsigned base, class TChar> + struct TBasicIntParser { + static_assert(1 < base && base < 17, "Expect 1 < base && base < 17."); + static_assert(std::is_unsigned<T>::value, "TBasicIntParser can only handle unsigned integers."); + + enum : unsigned { + BASE_POW_2 = base * base, + }; + + static inline EParseStatus Parse(const TChar** ppos, const TChar* end, T max, T* target) noexcept { + Y_ASSERT(*ppos != end); /* This check should be somewhere up the stack. 
*/ + const size_t maxSafeLen = SAFE_LENS[ConstLog2(sizeof(T))][base]; + + // can parse without overflow + if (size_t(end - *ppos) <= maxSafeLen) { + T result; + + if (ParseFast(*ppos, end, &result) && result <= max) { + *target = result; + + return PS_OK; + } + } + + return ParseSlow(ppos, end, max, target); + } + + static inline bool ParseFast(const TChar* pos, const TChar* end, T* target) noexcept { + T result = T(); + T d1; + T d2; + + // we have end > pos + auto beforeEnd = end - 1; + + while (pos < beforeEnd && CharToDigit<base>(*pos, &d1) && CharToDigit<base>(*(pos + 1), &d2)) { + result = result * BASE_POW_2 + d1 * base + d2; + pos += 2; + } + + while (pos != end && CharToDigit<base>(*pos, &d1)) { + result = result * base + d1; + ++pos; + } + + *target = result; + + return pos == end; + } + + static inline EParseStatus ParseSlow(const TChar** ppos, const TChar* end, T max, T* target) noexcept { + T result = T(); + T preMulMax = max / base; + const TChar* pos = *ppos; + + while (pos != end) { + T digit; + + if (!CharToDigit<base>(*pos, &digit)) { + *ppos = pos; + + return PS_BAD_SYMBOL; + } + + if (result > preMulMax) { + return PS_OVERFLOW; + } + + result *= base; + + if (result > max - digit) { + return PS_OVERFLOW; + } + + result += digit; + pos++; + } + + *target = result; + + return PS_OK; + } + }; + + template <class T> + struct TBounds { + T PositiveMax; + T NegativeMax; + }; + + template <class T, unsigned base, class TChar> + struct TIntParser { + static_assert(1 < base && base < 17, "Expect 1 < base && base < 17."); + static_assert(std::is_integral<T>::value, "T must be an integral type."); + + enum { + IsSigned = std::is_signed<T>::value + }; + + using TUnsigned = std::make_unsigned_t<T>; + + static inline EParseStatus Parse(const TChar** ppos, const TChar* end, const TBounds<TUnsigned>& bounds, T* target) { + const TChar* pos = *ppos; + if (pos == end) { + return PS_EMPTY_STRING; + } + + bool negative = false; + TUnsigned max; + if (*pos == '+') { 
+ pos++; + max = bounds.PositiveMax; + + if (pos == end) { + return PS_PLUS_STRING; + } + } else if (IsSigned && *pos == '-') { + pos++; + max = bounds.NegativeMax; + negative = true; + + if (pos == end) { + return PS_MINUS_STRING; + } + } else { + max = bounds.PositiveMax; + } + + TUnsigned result; + EParseStatus error = TBasicIntParser<TUnsigned, base, TChar>::Parse(&pos, end, max, &result); + if (error != PS_OK) { + *ppos = pos; + return error; + } + + if (IsSigned) { + *target = negative ? NegatePositiveSigned(result) : static_cast<T>(result); + } else { + *target = result; + } + return PS_OK; + } + }; + + template <class TChar> + [[noreturn]] static Y_NO_INLINE void ThrowParseError(EParseStatus status, const TChar* data, size_t len, const TChar* pos) { + Y_ASSERT(status != PS_OK); + + typedef TBasicString<TChar> TStringType; + + switch (status) { + case PS_EMPTY_STRING: + ythrow TFromStringException() << TStringBuf("Cannot parse empty string as number. "); + case PS_PLUS_STRING: + ythrow TFromStringException() << TStringBuf("Cannot parse string \"+\" as number. "); + case PS_MINUS_STRING: + ythrow TFromStringException() << TStringBuf("Cannot parse string \"-\" as number. "); + case PS_BAD_SYMBOL: + ythrow TFromStringException() << TStringBuf("Unexpected symbol \"") << EscapeC(*pos) << TStringBuf("\" at pos ") << (pos - data) << TStringBuf(" in string ") << TStringType(data, len).Quote() << TStringBuf(". "); + case PS_OVERFLOW: + ythrow TFromStringException() << TStringBuf("Integer overflow in string ") << TStringType(data, len).Quote() << TStringBuf(". "); + default: + ythrow yexception() << TStringBuf("Unknown error code in string converter. 
"); + } + } + + template <typename T, typename TUnsigned, int base, typename TChar> + Y_NO_INLINE T ParseInt(const TChar* data, size_t len, const TBounds<TUnsigned>& bounds) { + T result; + const TChar* pos = data; + EParseStatus status = TIntParser<T, base, TChar>::Parse(&pos, pos + len, bounds, &result); + + if (status == PS_OK) { + return result; + } else { + ThrowParseError(status, data, len, pos); + } + } + + template <typename T, typename TUnsigned, int base, typename TChar> + Y_NO_INLINE bool TryParseInt(const TChar* data, size_t len, const TBounds<TUnsigned>& bounds, T* result) { + return TIntParser<T, base, TChar>::Parse(&data, data + len, bounds, result) == PS_OK; + } + + template <class T> + inline T ParseFlt(const char* data, size_t len) { + /* + * TODO + */ + + if (len > 256) { + len = 256; + } + + char* c = (char*)alloca(len + 1); + memcpy(c, data, len); + c[len] = 0; + + T ret; + char ec; + + // try to read a value and an extra character in order to catch cases when + // the string start with a valid float but is followed by unexpected characters + if (sscanf(c, TFltModifiers<T>::ModifierReadAndChar, &ret, &ec) == 1) { + return ret; + } + + ythrow TFromStringException() << TStringBuf("cannot parse float(") << TStringBuf(data, len) << TStringBuf(")"); + } + +#define DEF_FLT_MOD(type, modifierWrite, modifierRead) \ + template <> \ + struct TFltModifiers<type> { \ + static const char* const ModifierWrite; \ + static const char* const ModifierReadAndChar; \ + }; \ + \ + const char* const TFltModifiers<type>::ModifierWrite = modifierWrite; \ + const char* const TFltModifiers<type>::ModifierReadAndChar = modifierRead "%c"; + + DEF_FLT_MOD(long double, "%.10Lg", "%Lg") + +#undef DEF_FLT_MOD + + /* The following constants are initialized in terms of <climits> constants to make + * sure they go into binary as actual values and there is no associated + * initialization code. 
+ * */ + constexpr TBounds<ui64> bSBounds = {static_cast<ui64>(SCHAR_MAX), static_cast<ui64>(UCHAR_MAX - SCHAR_MAX)}; + constexpr TBounds<ui64> bUBounds = {static_cast<ui64>(UCHAR_MAX), 0}; + constexpr TBounds<ui64> sSBounds = {static_cast<ui64>(SHRT_MAX), static_cast<ui64>(USHRT_MAX - SHRT_MAX)}; + constexpr TBounds<ui64> sUBounds = {static_cast<ui64>(USHRT_MAX), 0}; + constexpr TBounds<ui64> iSBounds = {static_cast<ui64>(INT_MAX), static_cast<ui64>(UINT_MAX - INT_MAX)}; + constexpr TBounds<ui64> iUBounds = {static_cast<ui64>(UINT_MAX), 0}; + constexpr TBounds<ui64> lSBounds = {static_cast<ui64>(LONG_MAX), static_cast<ui64>(ULONG_MAX - LONG_MAX)}; + constexpr TBounds<ui64> lUBounds = {static_cast<ui64>(ULONG_MAX), 0}; + constexpr TBounds<ui64> llSBounds = {static_cast<ui64>(LLONG_MAX), static_cast<ui64>(ULLONG_MAX - LLONG_MAX)}; + constexpr TBounds<ui64> llUBounds = {static_cast<ui64>(ULLONG_MAX), 0}; +} + +#define DEF_INT_SPEC_II(TYPE, ITYPE, BASE) \ + template <> \ + size_t IntToString<BASE, TYPE>(TYPE value, char* buf, size_t len) { \ + return FormatInt<ITYPE, BASE, char>(value, buf, len); \ + } + +#define DEF_INT_SPEC_I(TYPE, ITYPE) \ + template <> \ + size_t ToStringImpl<TYPE>(TYPE value, char* buf, size_t len) { \ + return FormatInt<ITYPE, 10, char>(value, buf, len); \ + } \ + DEF_INT_SPEC_II(TYPE, ITYPE, 2) \ + DEF_INT_SPEC_II(TYPE, ITYPE, 8) \ + DEF_INT_SPEC_II(TYPE, ITYPE, 10) \ + DEF_INT_SPEC_II(TYPE, ITYPE, 16) + +#define DEF_INT_SPEC(TYPE) \ + DEF_INT_SPEC_I(signed TYPE, i64) \ + DEF_INT_SPEC_I(unsigned TYPE, ui64) + +DEF_INT_SPEC(char) +DEF_INT_SPEC(short) +DEF_INT_SPEC(int) +DEF_INT_SPEC(long) +DEF_INT_SPEC(long long) + +#ifdef __cpp_char8_t +template <> +size_t ToStringImpl<char8_t>(char8_t value, char* buf, size_t len) { + return FormatInt<ui64, 10, char>(value, buf, len); +} +#endif + +using TCharIType = std::conditional_t<std::is_signed<char>::value, i64, ui64>; +using TWCharIType = std::conditional_t<std::is_signed<wchar_t>::value, i64, ui64>; + 
+DEF_INT_SPEC_I(char, TCharIType) +DEF_INT_SPEC_I(wchar_t, TWCharIType) +DEF_INT_SPEC_I(wchar16, ui64) // wchar16 is always unsigned +DEF_INT_SPEC_I(wchar32, ui64) // wchar32 is always unsigned + +#undef DEF_INT_SPEC +#undef DEF_INT_SPEC_I +#undef DEF_INT_SPEC_II + +#define DEF_FLT_SPEC(type) \ + template <> \ + size_t ToStringImpl<type>(type t, char* buf, size_t len) { \ + return FormatFlt<type>(t, buf, len); \ + } + +DEF_FLT_SPEC(long double) + +#undef DEF_FLT_SPEC + +template <> +size_t ToStringImpl<bool>(bool t, char* buf, size_t len) { + Y_ENSURE(len, TStringBuf("zero length")); + *buf = t ? '1' : '0'; + return 1; +} + +/* + * ------------------------------ parsers ------------------------------ + */ + +template <> +bool TryFromStringImpl<bool>(const char* data, size_t len, bool& result) { + if (len == 1) { + if (data[0] == '0') { + result = false; + return true; + } else if (data[0] == '1') { + result = true; + return true; + } + } + TStringBuf buf(data, len); + if (IsTrue(buf)) { + result = true; + return true; + } else if (IsFalse(buf)) { + result = false; + return true; + } + return false; +} + +template <> +bool FromStringImpl<bool>(const char* data, size_t len) { + bool result; + + if (!TryFromStringImpl<bool>(data, len, result)) { + ythrow TFromStringException() << TStringBuf("Cannot parse bool(") << TStringBuf(data, len) << TStringBuf("). 
"); + } + + return result; +} + +template <> +TString FromStringImpl<TString>(const char* data, size_t len) { + return TString(data, len); +} + +template <> +TStringBuf FromStringImpl<TStringBuf>(const char* data, size_t len) { + return TStringBuf(data, len); +} + +template <> +std::string FromStringImpl<std::string>(const char* data, size_t len) { + return std::string(data, len); +} + +template <> +TUtf16String FromStringImpl<TUtf16String>(const wchar16* data, size_t len) { + return TUtf16String(data, len); +} + +template <> +TWtringBuf FromStringImpl<TWtringBuf>(const wchar16* data, size_t len) { + return TWtringBuf(data, len); +} + +// Try-versions +template <> +bool TryFromStringImpl<TStringBuf>(const char* data, size_t len, TStringBuf& result) { + result = {data, len}; + return true; +} + +template <> +bool TryFromStringImpl<TString>(const char* data, size_t len, TString& result) { + result = TString(data, len); + return true; +} + +template <> +bool TryFromStringImpl<std::string>(const char* data, size_t len, std::string& result) { + result.assign(data, len); + return true; +} + +template <> +bool TryFromStringImpl<TWtringBuf>(const wchar16* data, size_t len, TWtringBuf& result) { + result = {data, len}; + return true; +} + +template <> +bool TryFromStringImpl<TUtf16String>(const wchar16* data, size_t len, TUtf16String& result) { + result = TUtf16String(data, len); + return true; +} + +#define DEF_INT_SPEC_III(CHAR, TYPE, ITYPE, BOUNDS, BASE) \ + template <> \ + TYPE IntFromString<TYPE, BASE>(const CHAR* data, size_t len) { \ + return ParseInt<ITYPE, ui64, BASE>(data, len, BOUNDS); \ + } \ + template <> \ + bool TryIntFromString<BASE>(const CHAR* data, size_t len, TYPE& result) { \ + ITYPE tmp; \ + bool status = TryParseInt<ITYPE, ui64, BASE>(data, len, BOUNDS, &tmp); \ + if (status) { \ + result = tmp; \ + } \ + return status; \ + } + +#define DEF_INT_SPEC_II(CHAR, TYPE, ITYPE, BOUNDS) \ + template <> \ + TYPE FromStringImpl<TYPE>(const CHAR* data, size_t 
len) { \ + return ParseInt<ITYPE, ui64, 10>(data, len, BOUNDS); \ + } \ + template <> \ + bool TryFromStringImpl<TYPE>(const CHAR* data, size_t len, TYPE& result) { \ + ITYPE tmp; \ + bool status = TryParseInt<ITYPE, ui64, 10>(data, len, BOUNDS, &tmp); \ + if (status) { \ + result = tmp; \ + } \ + return status; \ + } \ + DEF_INT_SPEC_III(CHAR, TYPE, ITYPE, BOUNDS, 2) \ + DEF_INT_SPEC_III(CHAR, TYPE, ITYPE, BOUNDS, 8) \ + DEF_INT_SPEC_III(CHAR, TYPE, ITYPE, BOUNDS, 10) \ + DEF_INT_SPEC_III(CHAR, TYPE, ITYPE, BOUNDS, 16) + +#define DEF_INT_SPEC_I(TYPE, ITYPE, BOUNDS) \ + DEF_INT_SPEC_II(char, TYPE, ITYPE, BOUNDS) \ + DEF_INT_SPEC_II(wchar16, TYPE, ITYPE, BOUNDS) + +#define DEF_INT_SPEC(TYPE, ID) \ + DEF_INT_SPEC_I(signed TYPE, i64, ID##SBounds) \ + DEF_INT_SPEC_I(unsigned TYPE, ui64, ID##UBounds) + +#define DEF_INT_SPEC_FIXED_WIDTH(TYPE, ID) \ + DEF_INT_SPEC_I(TYPE, i64, ID##SBounds) \ + DEF_INT_SPEC_I(u##TYPE, ui64, ID##UBounds) + +DEF_INT_SPEC_FIXED_WIDTH(i8, b) +DEF_INT_SPEC(short, s) +DEF_INT_SPEC(int, i) +DEF_INT_SPEC(long, l) +DEF_INT_SPEC(long long, ll) + +#undef DEF_INT_SPEC_FIXED_WIDTH +#undef DEF_INT_SPEC +#undef DEF_INT_SPEC_I +#undef DEF_INT_SPEC_II +#undef DEF_INT_SPEC_III + +#define DEF_FLT_SPEC(type) \ + template <> \ + type FromStringImpl<type>(const char* data, size_t len) { \ + return ParseFlt<type>(data, len); \ + } + +DEF_FLT_SPEC(long double) + +#undef DEF_FLT_SPEC + +// Using StrToD for float and double because it is faster than sscanf. 
+// Exception-free, specialized for float types +template <> +bool TryFromStringImpl<double>(const char* data, size_t len, double& result) { + if (!len) { + return false; + } + + char* se = nullptr; + double d = StrToD(data, data + len, &se); + + if (se != data + len) { + return false; + } + result = d; + return true; +} + +template <> +bool TryFromStringImpl<float>(const char* data, size_t len, float& result) { + double d; + if (TryFromStringImpl<double>(data, len, d)) { + result = static_cast<float>(d); + return true; + } + return false; +} + +template <> +bool TryFromStringImpl<long double>(const char* data, size_t len, long double& result) { + double d; + if (TryFromStringImpl<double>(data, len, d)) { + result = static_cast<long double>(d); + return true; + } + return false; +} + +// Exception-throwing, specialized for float types +template <> +double FromStringImpl<double>(const char* data, size_t len) { + double d = 0.0; + if (!TryFromStringImpl(data, len, d)) { + ythrow TFromStringException() << TStringBuf("cannot parse float(") << TStringBuf(data, len) << TStringBuf(")"); + } + return d; +} + +template <> +float FromStringImpl<float>(const char* data, size_t len) { + return static_cast<float>(FromStringImpl<double>(data, len)); +} + +double StrToD(const char* b, const char* e, char** se) { + struct TCvt: public StringToDoubleConverter { + inline TCvt() + : StringToDoubleConverter(ALLOW_TRAILING_JUNK | ALLOW_HEX | ALLOW_LEADING_SPACES, 0.0, NAN, nullptr, nullptr) + { + } + }; + + int out = 0; + + const auto res = SingletonWithPriority<TCvt, 0>()->StringToDouble(b, e - b, &out); + + if (se) { + *se = (char*)(b + out); + } + + return res; +} + +double StrToD(const char* b, char** se) { + return StrToD(b, b + strlen(b), se); +} + +namespace { + static inline DoubleToStringConverter& ToStringConverterNoPad() noexcept { + struct TCvt: public DoubleToStringConverter { + inline TCvt() noexcept + : DoubleToStringConverter(EMIT_POSITIVE_EXPONENT_SIGN, "inf", "nan", 
'e', -10, 21, 4, 0) + { + } + }; + + return *SingletonWithPriority<TCvt, 0>(); + } + + struct TBuilder { + alignas(StringBuilder) char Store[sizeof(StringBuilder)]; + StringBuilder* SB; + + inline TBuilder(char* buf, size_t len) noexcept + : SB(new (Store) StringBuilder(buf, len)) + { + } + }; + + static inline size_t FixZeros(char* buf, size_t len) noexcept { + auto end = buf + len; + auto point = (char*)memchr(buf, '.', len); + + if (!point) { + return len; + } + + auto exp = (char*)memchr(point, 'e', end - point); + + if (!exp) { + exp = end; + } + + auto c = exp; + + c -= 1; + + while (point < c && *c == '0') { + --c; + } + + if (*c == '.') { + --c; + } + + memmove(c + 1, exp, end - exp); + + return c - buf + 1 + end - exp; + } + + static inline size_t FixEnd(char* buf, size_t len) noexcept { + if (len > 2) { + auto sign = buf[len - 2]; + + if (sign == '-' || sign == '+') { + buf[len] = buf[len - 1]; + buf[len - 1] = '0'; + ++len; + } + } + + buf[len] = 0; + + return len; + } + + static inline size_t DoDtoa(double d, char* buf, size_t len, int prec) noexcept { + TBuilder sb(buf, len); + + Y_VERIFY(ToStringConverterNoPad().ToPrecision(d, prec, sb.SB), "conversion failed"); + + return FixEnd(buf, FixZeros(buf, sb.SB->position())); + } +} + +template <> +size_t ToStringImpl<double>(double d, char* buf, size_t len) { + return DoDtoa(d, buf, len, 10); +} + +template <> +size_t ToStringImpl<float>(float f, char* buf, size_t len) { + return DoDtoa(f, buf, len, 6); +} + +size_t FloatToString(float t, char* buf, size_t len, EFloatToStringMode mode, int ndigits) { + if (mode == PREC_AUTO) { + TBuilder sb(buf, len); + + Y_VERIFY(ToStringConverterNoPad().ToShortestSingle(t, sb.SB), "conversion failed"); + + return FixEnd(buf, sb.SB->position()); + } + + return FloatToString((double)t, buf, len, mode, ndigits); +} + +size_t FloatToString(double t, char* buf, size_t len, EFloatToStringMode mode, int ndigits) { + if (mode == PREC_NDIGITS) { + auto minDigits = 
DoubleToStringConverter::kMinPrecisionDigits; + auto maxDigits = DoubleToStringConverter::kMaxPrecisionDigits; + + return DoDtoa(t, buf, len, ClampVal(ndigits, minDigits, maxDigits)); + } + + TBuilder sb(buf, len); + + if (mode == PREC_AUTO) { + Y_VERIFY(ToStringConverterNoPad().ToShortest(t, sb.SB), "conversion failed"); + + return FixEnd(buf, sb.SB->position()); + } + + if (!ToStringConverterNoPad().ToFixed(t, ndigits, sb.SB)) { + return FloatToString(t, buf, len, PREC_AUTO); + } + + if (mode == PREC_POINT_DIGITS_STRIP_ZEROES) { + return FixZeros(buf, sb.SB->position()); + } + + return sb.SB->position(); +} diff --git a/util/string/cast.h b/util/string/cast.h new file mode 100644 index 0000000000..90e925c194 --- /dev/null +++ b/util/string/cast.h @@ -0,0 +1,357 @@ +#pragma once + +#include <util/system/defaults.h> +#include <util/stream/str.h> +#include <util/generic/string.h> +#include <util/generic/strbuf.h> +#include <util/generic/typetraits.h> +#include <util/generic/yexception.h> + +/* + * specialized for all arithmetic types + */ + +template <class T> +size_t ToStringImpl(T t, char* buf, size_t len); + +/** + * Converts @c t to string writing not more than @c len bytes to output buffer @c buf. + * No NULL terminator appended! Throws exception on buffer overflow. + * @return number of bytes written + */ +template <class T> +inline size_t ToString(const T& t, char* buf, size_t len) { + using TParam = typename TTypeTraits<T>::TFuncParam; + + return ToStringImpl<TParam>(t, buf, len); +} + +/** + * Floating point to string conversion mode, values are enforced by `dtoa_impl.cpp`. + */ +enum EFloatToStringMode { + /** 0.1f -> "0.1", 0.12345678f -> "0.12345678", ignores ndigits. 
*/ + PREC_AUTO = 0, + + /** "%g" mode, writes up to the given number of significant digits: + * 0.1f -> "0.1", 0.12345678f -> "0.123457" for ndigits=6, 1.2e-06f -> "1.2e-06" */ + PREC_NDIGITS = 2, + + /** "%f" mode, writes the given number of digits after decimal point: + * 0.1f -> "0.100000", 1.2e-06f -> "0.000001" for ndigits=6 */ + PREC_POINT_DIGITS = 3, + + /** same as PREC_POINT_DIGITS, but stripping trailing zeroes: + * 0.1f for ndigits=6 -> "0.1" */ + PREC_POINT_DIGITS_STRIP_ZEROES = 4 +}; + +size_t FloatToString(float t, char* buf, size_t len, EFloatToStringMode mode = PREC_AUTO, int ndigits = 0); +size_t FloatToString(double t, char* buf, size_t len, EFloatToStringMode mode = PREC_AUTO, int ndigits = 0); + +template <typename T> +inline TString FloatToString(const T& t, EFloatToStringMode mode = PREC_AUTO, int ndigits = 0) { + char buf[512]; // Max<double>() with mode = PREC_POINT_DIGITS has 309 digits before the decimal point + size_t count = FloatToString(t, buf, sizeof(buf), mode, ndigits); + return TString(buf, count); +} + +namespace NPrivate { + template <class T, bool isSimple> + struct TToString { + static inline TString Cvt(const T& t) { + char buf[512]; + + return TString(buf, ToString<T>(t, buf, sizeof(buf))); + } + }; + + template <class T> + struct TToString<T, false> { + static inline TString Cvt(const T& t) { + TString s; + TStringOutput o(s); + o << t; + return s; + } + }; +} + +/* + * some clever implementations... + */ +template <class T> +inline TString ToString(const T& t) { + using TR = std::remove_cv_t<T>; + + return ::NPrivate::TToString<TR, std::is_arithmetic<TR>::value>::Cvt((const TR&)t); +} + +inline const TString& ToString(const TString& s) noexcept { + return s; +} + +inline const TString& ToString(TString& s) noexcept { + return s; +} + +inline TString ToString(const char* s) { + return s; +} + +inline TString ToString(char* s) { + return s; +} + +/* + * Wrapper for wide strings. 
+ */ +template <class T> +inline TUtf16String ToWtring(const T& t) { + return TUtf16String::FromAscii(ToString(t)); +} + +inline const TUtf16String& ToWtring(const TUtf16String& w) { + return w; +} + +inline const TUtf16String& ToWtring(TUtf16String& w) { + return w; +} + +struct TFromStringException: public TBadCastException { +}; + +/* + * specialized for: + * bool + * short + * unsigned short + * int + * unsigned int + * long + * unsigned long + * long long + * unsigned long long + * float + * double + * long double + */ +template <typename T, typename TChar> +T FromStringImpl(const TChar* data, size_t len); + +template <typename T, typename TChar> +inline T FromString(const TChar* data, size_t len) { + return ::FromStringImpl<T>(data, len); +} + +template <typename T, typename TChar> +inline T FromString(const TChar* data) { + return ::FromString<T>(data, std::char_traits<TChar>::length(data)); +} + +template <class T> +inline T FromString(const TStringBuf& s) { + return ::FromString<T>(s.data(), s.size()); +} + +template <class T> +inline T FromString(const TString& s) { + return ::FromString<T>(s.data(), s.size()); +} + +template <class T> +inline T FromString(const std::string& s) { + return ::FromString<T>(s.data(), s.size()); +} + +template <> +inline TString FromString<TString>(const TString& s) { + return s; +} + +template <class T> +inline T FromString(const TWtringBuf& s) { + return ::FromString<T, typename TWtringBuf::char_type>(s.data(), s.size()); +} + +template <class T> +inline T FromString(const TUtf16String& s) { + return ::FromString<T, wchar16>(s.data(), s.size()); +} + +namespace NPrivate { + template <typename TChar> + class TFromString { + const TChar* const Data; + const size_t Len; + + public: + inline TFromString(const TChar* data, size_t len) + : Data(data) + , Len(len) + { + } + + template <typename T> + inline operator T() const { + return FromString<T, TChar>(Data, Len); + } + }; +} + +template <typename TChar> +inline 
::NPrivate::TFromString<TChar> FromString(const TChar* data, size_t len) { + return ::NPrivate::TFromString<TChar>(data, len); +} + +template <typename TChar> +inline ::NPrivate::TFromString<TChar> FromString(const TChar* data) { + return ::NPrivate::TFromString<TChar>(data, std::char_traits<TChar>::length(data)); +} + +template <typename T> +inline ::NPrivate::TFromString<typename T::TChar> FromString(const T& s) { + return ::NPrivate::TFromString<typename T::TChar>(s.data(), s.size()); +} + +// Conversion exception free versions +template <typename T, typename TChar> +bool TryFromStringImpl(const TChar* data, size_t len, T& result); + +/** + * @param data Source string buffer pointer + * @param len Source string length, in characters + * @param result Place to store conversion result value. + * If conversion error occurs, no value stored in @c result + * @return @c true in case of successful conversion, @c false otherwise + **/ +template <typename T, typename TChar> +inline bool TryFromString(const TChar* data, size_t len, T& result) { + return TryFromStringImpl<T>(data, len, result); +} + +template <typename T, typename TChar> +inline bool TryFromString(const TChar* data, T& result) { + return TryFromString<T>(data, std::char_traits<TChar>::length(data), result); +} + +template <class T, class TChar> +inline bool TryFromString(const TChar* data, const size_t len, T& result, const T& def) { + if (TryFromString<T>(data, len, result)) { + return true; + } + result = def; + return false; +} + +template <class T> +inline bool TryFromString(const TStringBuf& s, T& result) { + return TryFromString<T>(s.data(), s.size(), result); +} + +template <class T> +inline bool TryFromString(const TString& s, T& result) { + return TryFromString<T>(s.data(), s.size(), result); +} + +template <class T> +inline bool TryFromString(const std::string& s, T& result) { + return TryFromString<T>(s.data(), s.size(), result); +} + +template <class T> +inline bool TryFromString(const 
TWtringBuf& s, T& result) { + return TryFromString<T>(s.data(), s.size(), result); +} + +template <class T> +inline bool TryFromString(const TUtf16String& s, T& result) { + return TryFromString<T>(s.data(), s.size(), result); +} + +template <class T, class TStringType> +inline bool TryFromStringWithDefault(const TStringType& s, T& result, const T& def) { + return TryFromString<T>(s.data(), s.size(), result, def); +} + +template <class T> +inline bool TryFromStringWithDefault(const char* s, T& result, const T& def) { + return TryFromStringWithDefault<T>(TStringBuf(s), result, def); +} + +template <class T, class TStringType> +inline bool TryFromStringWithDefault(const TStringType& s, T& result) { + return TryFromStringWithDefault<T>(s, result, T()); +} + +// FromString methods with default value if data is invalid +template <class T, class TChar> +inline T FromString(const TChar* data, const size_t len, const T& def) { + T result; + TryFromString<T>(data, len, result, def); + return result; +} + +template <class T, class TStringType> +inline T FromStringWithDefault(const TStringType& s, const T& def) { + return FromString<T>(s.data(), s.size(), def); +} + +template <class T> +inline T FromStringWithDefault(const char* s, const T& def) { + return FromStringWithDefault<T>(TStringBuf(s), def); +} + +template <class T, class TStringType> +inline T FromStringWithDefault(const TStringType& s) { + return FromStringWithDefault<T>(s, T()); +} + +double StrToD(const char* b, char** se); +double StrToD(const char* b, const char* e, char** se); + +template <int base, class T> +size_t IntToString(T t, char* buf, size_t len); + +template <int base, class T> +inline TString IntToString(T t) { + static_assert(std::is_arithmetic<std::remove_cv_t<T>>::value, "expect std::is_arithmetic<std::remove_cv_t<T>>::value"); + + char buf[256]; + + return TString(buf, IntToString<base>(t, buf, sizeof(buf))); +} + +template <int base, class TInt, class TChar> +bool TryIntFromString(const TChar* 
data, size_t len, TInt& result); + +template <int base, class TInt, class TStringType> +inline bool TryIntFromString(const TStringType& s, TInt& result) { + return TryIntFromString<base>(s.data(), s.size(), result); +} + +template <class TInt, int base, class TChar> +TInt IntFromString(const TChar* str, size_t len); + +template <class TInt, int base, class TChar> +inline TInt IntFromString(const TChar* str) { + return IntFromString<TInt, base>(str, std::char_traits<TChar>::length(str)); +} + +template <class TInt, int base, class TStringType> +inline TInt IntFromString(const TStringType& str) { + return IntFromString<TInt, base>(str.data(), str.size()); +} + +static inline TString ToString(const TStringBuf str) { + return TString(str); +} + +static inline TUtf16String ToWtring(const TWtringBuf wtr) { + return TUtf16String(wtr); +} + +static inline TUtf32String ToUtf32String(const TUtf32StringBuf wtr) { + return TUtf32String(wtr); +} diff --git a/util/string/cast.pxd b/util/string/cast.pxd new file mode 100644 index 0000000000..dc23619e1e --- /dev/null +++ b/util/string/cast.pxd @@ -0,0 +1,10 @@ +from util.generic.string cimport TString + +from libcpp cimport bool as bool_t + +cdef extern from "<util/string/cast.h>" nogil: + T FromString[T](const TString&) except + + bool_t TryFromString[T](const TString&, T&) except + + TString ToString[T](const T&) except + + + cdef double StrToD(const char* b, char** se) except + diff --git a/util/string/cast.py b/util/string/cast.py new file mode 100644 index 0000000000..4787f6ef44 --- /dev/null +++ b/util/string/cast.py @@ -0,0 +1,27 @@ +print 'static const ui8 SAFE_LENS[4][15] = {' + + +def nb(n, b): + if n == 0: + return [0] + + digits = [] + + while n: + digits.append(int(n % b)) + n /= b + + return digits[::-1] + + +for p in (1, 2, 4, 8): + + def it1(): + for base in range(2, 17): + m = 2 ** (8 * p) - 1 + + yield len(nb(m, base)) - 1 + + print ' {0, 0, ' + ', '.join(str(x) for x in it1()) + '},' + +print '};' diff --git 
a/util/string/cast_ut.cpp b/util/string/cast_ut.cpp new file mode 100644 index 0000000000..033450c38c --- /dev/null +++ b/util/string/cast_ut.cpp @@ -0,0 +1,602 @@ +#include "cast.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/charset/wide.h> +#include <util/system/defaults.h> + +#include <limits> + +// positive test (return true or no exception) +#define test1(t, v) \ + F<t>().CheckTryOK(v); \ + F<t>().CheckOK(v) + +// negative test (return false or exception) +#define test2(t, v) \ + F<t>().CheckTryFail(v); \ + F<t>().CheckExc(v) + +#define EPS 10E-7 + +#define HEX_MACROS_MAP(mac, type, val) mac(type, val, 2) mac(type, val, 8) mac(type, val, 10) mac(type, val, 16) + +#define OK_HEX_CHECK(type, val, base) UNIT_ASSERT_EQUAL((IntFromStringForCheck<base>(IntToString<base>(val))), val); +#define EXC_HEX_CHECK(type, val, base) UNIT_ASSERT_EXCEPTION((IntFromString<type, base>(IntToString<base>(val))), yexception); + +#define TRY_HEX_MACROS_MAP(mac, type, val, result, def) \ + mac(type, val, result, def, 2) \ + mac(type, val, result, def, 8) \ + mac(type, val, result, def, 10) \ + mac(type, val, result, def, 16) + +#define TRY_OK_HEX_CHECK(type, val, result, def, base) \ + result = def; \ + UNIT_ASSERT_EQUAL(TryIntFromStringForCheck<base>(IntToString<base>(val), result), true); \ + UNIT_ASSERT_EQUAL(result, val); + +#define TRY_FAIL_HEX_CHECK(type, val, result, def, base) \ + result = def; \ + UNIT_ASSERT_VALUES_EQUAL(TryIntFromStringForCheck<base>(IntToString<base>(val), result), false); \ + UNIT_ASSERT_VALUES_EQUAL(result, def); + +template <class A> +struct TRet { + template <int base> + inline A IntFromStringForCheck(const TString& str) { + return IntFromString<A, base>(str); + } + + template <int base> + inline bool TryIntFromStringForCheck(const TString& str, A& result) { + return TryIntFromString<base>(str, result); + } + + template <class B> + inline void CheckOK(B v) { + UNIT_ASSERT_VALUES_EQUAL(FromString<A>(ToString(v)), v); // char 
+ UNIT_ASSERT_VALUES_EQUAL(FromString<A>(ToWtring(v)), v); // wide char + HEX_MACROS_MAP(OK_HEX_CHECK, A, v); + } + + template <class B> + inline void CheckExc(B v) { + UNIT_ASSERT_EXCEPTION(FromString<A>(ToString(v)), yexception); // char + UNIT_ASSERT_EXCEPTION(FromString<A>(ToWtring(v)), yexception); // wide char + HEX_MACROS_MAP(EXC_HEX_CHECK, A, v); + } + + template <class B> + inline void CheckTryOK(B v) { + static const A defaultV = 42; + A convV; + UNIT_ASSERT_VALUES_EQUAL(TryFromString<A>(ToString(v), convV), true); // char + UNIT_ASSERT_VALUES_EQUAL(v, convV); + UNIT_ASSERT_VALUES_EQUAL(TryFromString<A>(ToWtring(v), convV), true); // wide char + UNIT_ASSERT_VALUES_EQUAL(v, convV); + + TRY_HEX_MACROS_MAP(TRY_OK_HEX_CHECK, A, v, convV, defaultV); + } + + template <class B> + inline void CheckTryFail(B v) { + static const A defaultV = 42; + A convV = defaultV; // to check that original value is not trashed on bad cast + UNIT_ASSERT_VALUES_EQUAL(TryFromString<A>(ToString(v), convV), false); // char + UNIT_ASSERT_VALUES_EQUAL(defaultV, convV); + UNIT_ASSERT_VALUES_EQUAL(TryFromString<A>(ToWtring(v), convV), false); // wide char + UNIT_ASSERT_VALUES_EQUAL(defaultV, convV); + + TRY_HEX_MACROS_MAP(TRY_FAIL_HEX_CHECK, A, v, convV, defaultV); + } +}; + +template <> +struct TRet<bool> { + template <class B> + inline void CheckOK(B v) { + UNIT_ASSERT_VALUES_EQUAL(FromString<bool>(ToString(v)), v); + } + + template <class B> + inline void CheckTryOK(B v) { + B convV; + UNIT_ASSERT_VALUES_EQUAL(TryFromString<bool>(ToString(v), convV), true); + UNIT_ASSERT_VALUES_EQUAL(v, convV); + } + + template <class B> + inline void CheckExc(B v) { + UNIT_ASSERT_EXCEPTION(FromString<bool>(ToString(v)), yexception); + } + + template <class B> + inline void CheckTryFail(B v) { + static const bool defaultV = false; + bool convV = defaultV; + UNIT_ASSERT_VALUES_EQUAL(TryFromString<bool>(ToString(v), convV), false); + UNIT_ASSERT_VALUES_EQUAL(defaultV, convV); + } +}; + +template <class 
A> +inline TRet<A> F() { + return TRet<A>(); +}; + +#if 0 +template <class T> +inline void CheckConvertToBuffer(const T& value, const size_t size, const TString& canonValue) { + const size_t maxSize = 256; + char buffer[maxSize]; + const char magic = 0x7F; + memset(buffer, magic, maxSize); + size_t length = 0; + if (canonValue.size() > size) { // overflow will occur + UNIT_ASSERT_EXCEPTION(length = ToString(value, buffer, size), yexception); + // check that no bytes after size was trashed + for (size_t i = size; i < maxSize; ++i) + UNIT_ASSERT_VALUES_EQUAL(buffer[i], magic); + } else { + length = ToString(value, buffer, size); + UNIT_ASSERT(length < maxSize); + // check that no bytes after length was trashed + for (size_t i = length; i < maxSize; ++i) + UNIT_ASSERT_VALUES_EQUAL(buffer[i], magic); + TStringBuf result(buffer, length); + UNIT_ASSERT_VALUES_EQUAL(result, TStringBuf(canonValue)); + } +} +#endif + +Y_UNIT_TEST_SUITE(TCastTest) { + template <class A> + inline TRet<A> F() { + return TRet<A>(); + }; + + template <class TFloat> + void GoodFloatTester(const char* str, const TFloat canonValue, const double eps) { + TFloat f = canonValue + 42.0; // shift value to make it far from proper + UNIT_ASSERT_VALUES_EQUAL(TryFromString<TFloat>(str, f), true); + UNIT_ASSERT_DOUBLES_EQUAL(f, canonValue, eps); + f = FromString<TFloat>(str); + UNIT_ASSERT_DOUBLES_EQUAL(f, canonValue, eps); + } + + template <class TFloat> + void BadFloatTester(const char* str) { + const double eps = 10E-5; + TFloat f = 42.0; // make it far from proper + auto res = TryFromString<TFloat>(str, f); + + UNIT_ASSERT_VALUES_EQUAL(res, false); + UNIT_ASSERT_DOUBLES_EQUAL(f, 42.0, eps); // check value was not trashed + UNIT_ASSERT_EXCEPTION(f = FromString<TFloat>(str), TFromStringException); + Y_UNUSED(f); // shut up compiler about 'assigned value that is not used' + } + + Y_UNIT_TEST(TestToFrom) { + test1(bool, true); + test1(bool, false); + test2(bool, ""); + test2(bool, "a"); + + test2(ui8, -1); + 
test1(i8, -1); + test1(i8, SCHAR_MAX); + test1(i8, SCHAR_MIN); + test1(i8, SCHAR_MAX - 1); + test1(i8, SCHAR_MIN + 1); + test2(i8, (int)SCHAR_MAX + 1); + test2(i8, (int)SCHAR_MIN - 1); + test1(ui8, UCHAR_MAX); + test1(ui8, UCHAR_MAX - 1); + test2(ui8, (int)UCHAR_MAX + 1); + test2(ui8, -1); + test1(int, -1); + test2(unsigned int, -1); + test1(short int, -1); + test2(unsigned short int, -1); + test1(long int, -1); + test2(unsigned long int, -1); + test1(int, INT_MAX); + test1(int, INT_MIN); + test1(int, INT_MAX - 1); + test1(int, INT_MIN + 1); + test2(int, (long long int)INT_MAX + 1); + test2(int, (long long int)INT_MIN - 1); + test1(unsigned int, UINT_MAX); + test1(unsigned int, UINT_MAX - 1); + test2(unsigned int, (long long int)UINT_MAX + 1); + test1(short int, SHRT_MAX); + test1(short int, SHRT_MIN); + test1(short int, SHRT_MAX - 1); + test1(short int, SHRT_MIN + 1); + test2(short int, (long long int)SHRT_MAX + 1); + test2(short int, (long long int)SHRT_MIN - 1); + test1(unsigned short int, USHRT_MAX); + test1(unsigned short int, USHRT_MAX - 1); + test2(unsigned short int, (long long int)USHRT_MAX + 1); + test1(long int, LONG_MAX); + test1(long int, LONG_MIN); + test1(long int, LONG_MAX - 1); + test1(long int, LONG_MIN + 1); + + test1(long long int, LLONG_MAX); + test1(long long int, LLONG_MIN); + test1(long long int, LLONG_MAX - 1); + test1(long long int, LLONG_MIN + 1); + } + + Y_UNIT_TEST(TestVolatile) { + volatile int x = 1; + UNIT_ASSERT_VALUES_EQUAL(ToString(x), "1"); + } + + Y_UNIT_TEST(TestStrToD) { + UNIT_ASSERT_DOUBLES_EQUAL(StrToD("1.1", nullptr), 1.1, EPS); + UNIT_ASSERT_DOUBLES_EQUAL(StrToD("1.12345678", nullptr), 1.12345678, EPS); + UNIT_ASSERT_DOUBLES_EQUAL(StrToD("10E-5", nullptr), 10E-5, EPS); + UNIT_ASSERT_DOUBLES_EQUAL(StrToD("1.1E+5", nullptr), 1.1E+5, EPS); + + char* ret = nullptr; + + UNIT_ASSERT_DOUBLES_EQUAL(StrToD("1.1y", &ret), 1.1, EPS); + UNIT_ASSERT_VALUES_EQUAL(*ret, 'y'); + UNIT_ASSERT_DOUBLES_EQUAL(StrToD("1.12345678z", &ret), 
1.12345678, EPS); + UNIT_ASSERT_VALUES_EQUAL(*ret, 'z'); + UNIT_ASSERT_DOUBLES_EQUAL(StrToD("10E-5y", &ret), 10E-5, EPS); + UNIT_ASSERT_VALUES_EQUAL(*ret, 'y'); + UNIT_ASSERT_DOUBLES_EQUAL(StrToD("1.1E+5z", &ret), 1.1E+5, EPS); + UNIT_ASSERT_VALUES_EQUAL(*ret, 'z'); + } + + Y_UNIT_TEST(TestFloats) { + // "%g" mode + UNIT_ASSERT_VALUES_EQUAL(FloatToString(0.1f, PREC_NDIGITS, 6), "0.1"); // drop trailing zeroes + UNIT_ASSERT_VALUES_EQUAL(FloatToString(0.12345678f, PREC_NDIGITS, 6), "0.123457"); + UNIT_ASSERT_VALUES_EQUAL(FloatToString(1e-20f, PREC_NDIGITS, 6), "1e-20"); + // "%f" mode + UNIT_ASSERT_VALUES_EQUAL(FloatToString(0.1f, PREC_POINT_DIGITS, 6), "0.100000"); + UNIT_ASSERT_VALUES_EQUAL(FloatToString(0.12345678f, PREC_POINT_DIGITS, 6), "0.123457"); + UNIT_ASSERT_VALUES_EQUAL(FloatToString(1e-20f, PREC_POINT_DIGITS, 6), "0.000000"); + UNIT_ASSERT_VALUES_EQUAL(FloatToString(12.34f, PREC_POINT_DIGITS, 0), "12"); // rounding to integers drops '.' + // strip trailing zeroes + UNIT_ASSERT_VALUES_EQUAL(FloatToString(0.1f, PREC_POINT_DIGITS_STRIP_ZEROES, 6), "0.1"); + UNIT_ASSERT_VALUES_EQUAL(FloatToString(0.12345678f, PREC_POINT_DIGITS_STRIP_ZEROES, 6), "0.123457"); + UNIT_ASSERT_VALUES_EQUAL(FloatToString(1e-20f, PREC_POINT_DIGITS_STRIP_ZEROES, 6), "0"); + UNIT_ASSERT_VALUES_EQUAL(FloatToString(12.34f, PREC_POINT_DIGITS_STRIP_ZEROES, 0), "12"); // rounding to integers drops '.' 
+ UNIT_ASSERT_VALUES_EQUAL(FloatToString(10000.0f, PREC_POINT_DIGITS_STRIP_ZEROES, 0), "10000"); + // automatic selection of ndigits + UNIT_ASSERT_VALUES_EQUAL(FloatToString(0.1f), "0.1"); // drop trailing zeroes + UNIT_ASSERT_VALUES_EQUAL(FloatToString(0.12345678f), "0.12345678"); // 8 valid digits + UNIT_ASSERT_VALUES_EQUAL(FloatToString(1000.00006f), "1000.00006"); // 9 valid digits + UNIT_ASSERT_VALUES_EQUAL(FloatToString(1e-45f), "1e-45"); // denormalized: 1 valid digit + UNIT_ASSERT_VALUES_EQUAL(FloatToString(-0.0f), "-0"); // sign must be preserved + // version for double + UNIT_ASSERT_VALUES_EQUAL(FloatToString(1.0 / 10000), "0.0001"); // trailing zeroes + UNIT_ASSERT_VALUES_EQUAL(FloatToString(1.2345678901234567), "1.2345678901234567"); // no truncation + UNIT_ASSERT_VALUES_EQUAL(FloatToString(5e-324), "5e-324"); // denormalized + UNIT_ASSERT_VALUES_EQUAL(FloatToString(-0.0), "-0"); // sign must be preserved + + UNIT_ASSERT_STRINGS_EQUAL(FloatToString(std::numeric_limits<double>::quiet_NaN()), "nan"); + UNIT_ASSERT_STRINGS_EQUAL(FloatToString(std::numeric_limits<double>::infinity()), "inf"); + UNIT_ASSERT_STRINGS_EQUAL(FloatToString(-std::numeric_limits<double>::infinity()), "-inf"); + + UNIT_ASSERT_STRINGS_EQUAL(FloatToString(std::numeric_limits<float>::quiet_NaN()), "nan"); + UNIT_ASSERT_STRINGS_EQUAL(FloatToString(std::numeric_limits<float>::infinity()), "inf"); + UNIT_ASSERT_STRINGS_EQUAL(FloatToString(-std::numeric_limits<float>::infinity()), "-inf"); + } + + Y_UNIT_TEST(TestReadFloats) { + GoodFloatTester<float>("0.0001", 0.0001f, EPS); + GoodFloatTester<double>("0.0001", 0.0001, EPS); + GoodFloatTester<long double>("0.0001", 0.0001, EPS); + GoodFloatTester<float>("10E-5", 10E-5f, EPS); + GoodFloatTester<double>("1.0001E5", 1.0001E5, EPS); + GoodFloatTester<long double>("1.0001e5", 1.0001e5, EPS); + GoodFloatTester<long double>(".0001e5", .0001e5, EPS); + BadFloatTester<float>("a10E-5"); + BadFloatTester<float>("10 "); + 
BadFloatTester<float>("10\t"); + //BadFloatTester<float>("10E"); + //BadFloatTester<float>("10.E"); + BadFloatTester<float>("..0"); + BadFloatTester<float>(""); // IGNIETFERRO-300 + BadFloatTester<double>("1.00.01"); + BadFloatTester<double>("1.0001E5b"); + BadFloatTester<double>("1.0001s"); + BadFloatTester<double>("1..01"); + BadFloatTester<double>(""); // IGNIETFERRO-300 + BadFloatTester<long double>(".1.00"); + BadFloatTester<long double>("1.00."); + BadFloatTester<long double>("1.0001e5-"); + BadFloatTester<long double>("10e 2"); + BadFloatTester<long double>(""); // IGNIETFERRO-300 + } + + Y_UNIT_TEST(TestLiteral) { + UNIT_ASSERT_VALUES_EQUAL(ToString("abc"), TString("abc")); + } + + Y_UNIT_TEST(TestFromStringStringBuf) { + TString a = "xyz"; + TStringBuf b = FromString<TStringBuf>(a); + UNIT_ASSERT_VALUES_EQUAL(a, b); + UNIT_ASSERT_VALUES_EQUAL((void*)a.data(), (void*)b.data()); + } + +#if 0 + Y_UNIT_TEST(TestBufferOverflow) { + CheckConvertToBuffer<float>(1.f, 5, "1"); + CheckConvertToBuffer<float>(1.005f, 3, "1.005"); + CheckConvertToBuffer<float>(1.00000000f, 3, "1"); + + CheckConvertToBuffer<double>(1.f, 5, "1"); + CheckConvertToBuffer<double>(1.005f, 3, "1.005"); + CheckConvertToBuffer<double>(1.00000000f, 3, "1"); + + CheckConvertToBuffer<int>(2, 5, "2"); + CheckConvertToBuffer<int>(1005, 3, "1005"); + + CheckConvertToBuffer<size_t>(2, 5, "2"); + CheckConvertToBuffer<ui64>(1005000000000000ull, 32, "1005000000000000"); + CheckConvertToBuffer<ui64>(1005000000000000ull, 3, "1005000000000000"); + + // TString longNumber = TString("1.") + TString(1 << 20, '1'); + // UNIT_ASSERT_EXCEPTION(FromString<double>(longNumber), yexception); + } +#endif + + Y_UNIT_TEST(TestWide) { + TUtf16String iw = u"-100500"; + int iv = 0; + UNIT_ASSERT_VALUES_EQUAL(TryFromString(iw, iv), true); + UNIT_ASSERT_VALUES_EQUAL(iv, -100500); + + ui64 uv = 0; + TUtf16String uw = u"21474836470"; + UNIT_ASSERT_VALUES_EQUAL(TryFromString(uw, uv), true); + UNIT_ASSERT_VALUES_EQUAL(uv, 
21474836470ull); + + TWtringBuf bw(uw.data(), uw.size()); + uv = 0; + UNIT_ASSERT_VALUES_EQUAL(TryFromString(uw, uv), true); + UNIT_ASSERT_VALUES_EQUAL(uv, 21474836470ull); + + const wchar16* beg = uw.data(); + uv = 0; + UNIT_ASSERT_VALUES_EQUAL(TryFromString(beg, uw.size(), uv), true); + UNIT_ASSERT_VALUES_EQUAL(uv, 21474836470ull); + } + + Y_UNIT_TEST(TestDefault) { + size_t res = 0; + const size_t def1 = 42; + + TString s1("100500"); + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault(s1, res, def1), true); + UNIT_ASSERT_VALUES_EQUAL(res, 100500); + + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault(s1, res), true); + UNIT_ASSERT_VALUES_EQUAL(res, 100500); + + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault("100500", res, def1), true); + UNIT_ASSERT_VALUES_EQUAL(res, 100500); + + UNIT_CHECK_GENERATED_NO_EXCEPTION(FromStringWithDefault(s1, def1), yexception); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault(s1, def1), 100500); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault<size_t>(s1), 100500); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault("100500", def1), 100500); + + TString s2("100q500"); + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault(s2, res), false); + UNIT_ASSERT_VALUES_EQUAL(res, size_t()); + + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault(s2, res, def1), false); + UNIT_ASSERT_VALUES_EQUAL(res, def1); + + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault("100q500", res), false); + UNIT_ASSERT_VALUES_EQUAL(res, size_t()); + + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault("100 500", res), false); + UNIT_ASSERT_VALUES_EQUAL(res, size_t()); + + UNIT_CHECK_GENERATED_NO_EXCEPTION(FromStringWithDefault(s2, def1), yexception); + UNIT_CHECK_GENERATED_NO_EXCEPTION(FromStringWithDefault("100q500", def1), yexception); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault(s2, def1), def1); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault<size_t>(s2), size_t()); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault<size_t>("100q500"), size_t()); + 
UNIT_CHECK_GENERATED_EXCEPTION(FromString<size_t>(s2), TFromStringException); + + int res2 = 0; + const int def2 = -6; + + TUtf16String s3 = u"-100500"; + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault(s3, res2, def2), true); + UNIT_ASSERT_VALUES_EQUAL(res2, -100500); + + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault(s3, res2), true); + UNIT_ASSERT_VALUES_EQUAL(res2, -100500); + + UNIT_CHECK_GENERATED_NO_EXCEPTION(FromStringWithDefault(s3, def1), yexception); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault(s3, def2), -100500); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault<size_t>(s3), size_t()); + + TUtf16String s4 = u"-f100500"; + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault(s4, res2, def2), false); + UNIT_ASSERT_VALUES_EQUAL(res2, def2); + + UNIT_ASSERT_VALUES_EQUAL(TryFromStringWithDefault(s4, res2), false); + UNIT_ASSERT_VALUES_EQUAL(res2, size_t()); + + UNIT_CHECK_GENERATED_NO_EXCEPTION(FromStringWithDefault(s4, def2), yexception); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault(s4, def2), def2); + UNIT_CHECK_GENERATED_EXCEPTION(FromString<size_t>(s4), yexception); + UNIT_ASSERT_VALUES_EQUAL(FromStringWithDefault<size_t>(s4), size_t()); + } + + Y_UNIT_TEST(TestBool) { + // True cases + UNIT_ASSERT_VALUES_EQUAL(FromString<bool>("yes"), true); + UNIT_ASSERT_VALUES_EQUAL(FromString<bool>("1"), true); + // False cases + UNIT_ASSERT_VALUES_EQUAL(FromString<bool>("no"), false); + UNIT_ASSERT_VALUES_EQUAL(FromString<bool>("0"), false); + // Strange cases + UNIT_ASSERT_EXCEPTION(FromString<bool>(""), yexception); + UNIT_ASSERT_EXCEPTION(FromString<bool>("something"), yexception); + } + + Y_UNIT_TEST(TestAutoDetectType) { + UNIT_ASSERT_DOUBLES_EQUAL((float)FromString("0.0001"), 0.0001, EPS); + UNIT_ASSERT_DOUBLES_EQUAL((double)FromString("0.0015", sizeof("0.0015") - 2), 0.001, EPS); + UNIT_ASSERT_DOUBLES_EQUAL((long double)FromString(TStringBuf("0.0001")), 0.0001, EPS); + UNIT_ASSERT_DOUBLES_EQUAL((float)FromString(TString("10E-5")), 10E-5, EPS); + 
UNIT_ASSERT_VALUES_EQUAL((bool)FromString("da"), true); + UNIT_ASSERT_VALUES_EQUAL((bool)FromString("no"), false); + UNIT_ASSERT_VALUES_EQUAL((short)FromString(u"9000"), 9000); + UNIT_ASSERT_VALUES_EQUAL((int)FromString(u"-100500"), -100500); + UNIT_ASSERT_VALUES_EQUAL((unsigned long long)FromString(TWtringBuf(u"42", 1)), 4); + int integer = FromString("125"); + ui16 wideCharacterCode = FromString(u"125"); + UNIT_ASSERT_VALUES_EQUAL(integer, wideCharacterCode); + } + + static void CheckMessage(TFromStringException& exc, const TString& phrase) { + TString message = exc.what(); + if (!message.Contains(phrase)) { + Cerr << message << Endl; + UNIT_ASSERT(false); + } + } + + Y_UNIT_TEST(ErrorMessages) { + try { + FromString<ui32>(""); + UNIT_ASSERT(false); + } catch (TFromStringException& e) { + CheckMessage(e, "empty string as number"); + } + + try { + FromString<ui32>("-"); + UNIT_ASSERT(false); + } catch (TFromStringException& e) { + // Unsigned should have no sign at all, so - is not expected + CheckMessage(e, "Unexpected symbol \"-\" at pos 0 in string \"-\""); + } + + try { + FromString<i32>("-"); + UNIT_ASSERT(false); + } catch (TFromStringException& e) { + CheckMessage(e, "Cannot parse string \"-\" as number"); + } + + try { + FromString<i32>("+"); + UNIT_ASSERT(false); + } catch (TFromStringException& e) { + CheckMessage(e, "Cannot parse string \"+\" as number"); + } + + try { + FromString<ui32>("0.328413745072"); + UNIT_ASSERT(false); + } catch (TFromStringException& e) { + CheckMessage(e, "Unexpected symbol \".\" at pos 1 in string \"0.328413745072\""); + } + } + + Y_UNIT_TEST(TryStringBuf) { + { + constexpr TStringBuf hello = "hello"; + TStringBuf out; + UNIT_ASSERT(TryFromString(hello, out)); + UNIT_ASSERT_VALUES_EQUAL(hello, out); + } + { + constexpr TStringBuf empty = ""; + TStringBuf out; + UNIT_ASSERT(TryFromString(empty, out)); + UNIT_ASSERT_VALUES_EQUAL(empty, out); + } + { + constexpr TStringBuf empty; + TStringBuf out; + 
UNIT_ASSERT(TryFromString(empty, out)); + UNIT_ASSERT_VALUES_EQUAL(empty, out); + } + { + const auto hello = u"hello"; + TWtringBuf out; + UNIT_ASSERT(TryFromString(hello, out)); + UNIT_ASSERT_VALUES_EQUAL(hello, out); + } + { + const TUtf16String empty; + TWtringBuf out; + UNIT_ASSERT(TryFromString(empty, out)); + UNIT_ASSERT_VALUES_EQUAL(empty, out); + } + { + constexpr TWtringBuf empty; + TWtringBuf out; + UNIT_ASSERT(TryFromString(empty, out)); + UNIT_ASSERT_VALUES_EQUAL(empty, out); + } + } + + Y_UNIT_TEST(Nan) { + double xx = 0; + + UNIT_ASSERT(!TryFromString("NaN", xx)); + UNIT_ASSERT(!TryFromString("NAN", xx)); + UNIT_ASSERT(!TryFromString("nan", xx)); + } + + Y_UNIT_TEST(Infinity) { + double xx = 0; + + UNIT_ASSERT(!TryFromString("Infinity", xx)); + UNIT_ASSERT(!TryFromString("INFINITY", xx)); + UNIT_ASSERT(!TryFromString("infinity", xx)); + } + + Y_UNIT_TEST(TestBorderCases) { + UNIT_ASSERT_VALUES_EQUAL(ToString(0.0), "0"); + UNIT_ASSERT_VALUES_EQUAL(ToString(1.0), "1"); + UNIT_ASSERT_VALUES_EQUAL(ToString(10.0), "10"); + UNIT_ASSERT_VALUES_EQUAL(ToString(NAN), "nan"); + UNIT_ASSERT_VALUES_EQUAL(ToString(-NAN), "nan"); + UNIT_ASSERT_VALUES_EQUAL(ToString(INFINITY), "inf"); + UNIT_ASSERT_VALUES_EQUAL(ToString(-INFINITY), "-inf"); + UNIT_ASSERT_VALUES_EQUAL(ToString(1.1e+100), "1.1e+100"); + UNIT_ASSERT_VALUES_EQUAL(ToString(1e+100), "1e+100"); + UNIT_ASSERT_VALUES_EQUAL(ToString(87423.2031250000001), "87423.20313"); + UNIT_ASSERT_VALUES_EQUAL(FloatToString(1.0e60, PREC_POINT_DIGITS_STRIP_ZEROES, 0), "1e+60"); + } + + Y_UNIT_TEST(TestChar) { + // Given a character ch, ToString(ch) returns + // the decimal representation of its integral value + + // char + UNIT_ASSERT_VALUES_EQUAL(ToString('\0'), "0"); + UNIT_ASSERT_VALUES_EQUAL(ToString('0'), "48"); + + // wchar16 + UNIT_ASSERT_VALUES_EQUAL(ToString(u'\0'), "0"); + UNIT_ASSERT_VALUES_EQUAL(ToString(u'0'), "48"); + UNIT_ASSERT_VALUES_EQUAL(ToString(u'я'), "1103"); + 
+ UNIT_ASSERT_VALUES_EQUAL(ToString(u'\uFFFF'), "65535"); + + // wchar32 + UNIT_ASSERT_VALUES_EQUAL(ToString(U'\0'), "0"); + UNIT_ASSERT_VALUES_EQUAL(ToString(U'0'), "48"); + UNIT_ASSERT_VALUES_EQUAL(ToString(U'я'), "1103"); + UNIT_ASSERT_VALUES_EQUAL(ToString(U'\U0001F600'), "128512"); // 'GRINNING FACE' (U+1F600) + } +}; diff --git a/util/string/cast_ut.pyx b/util/string/cast_ut.pyx new file mode 100644 index 0000000000..88e86ef961 --- /dev/null +++ b/util/string/cast_ut.pyx @@ -0,0 +1,13 @@ +# cython: c_string_type=str, c_string_encoding=utf8 + +from util.string.cast cimport FromString, ToString + +import unittest + +class TestFromString(unittest.TestCase): + def test_from_int(self): + self.assertEquals(FromString[int]("42"), 42) + +class TestToString(unittest.TestCase): + def test_from_int(self): + self.assertEquals(ToString(42), "42") diff --git a/util/string/cstriter.cpp b/util/string/cstriter.cpp new file mode 100644 index 0000000000..fd61359c3d --- /dev/null +++ b/util/string/cstriter.cpp @@ -0,0 +1 @@ +#include "cstriter.h" diff --git a/util/string/cstriter.h b/util/string/cstriter.h new file mode 100644 index 0000000000..ca57728c39 --- /dev/null +++ b/util/string/cstriter.h @@ -0,0 +1,14 @@ +#pragma once + +struct TCStringEndIterator { +}; + +template <class It> +static inline bool operator==(It b, TCStringEndIterator) { + return !*b; +} + +template <class It> +static inline bool operator!=(It b, TCStringEndIterator) { + return !!*b; +} diff --git a/util/string/escape.cpp b/util/string/escape.cpp new file mode 100644 index 0000000000..cd09a7dbd0 --- /dev/null +++ b/util/string/escape.cpp @@ -0,0 +1,433 @@ +#include "escape.h" +#include "cast.h" + +#include <util/system/defaults.h> +#include <util/charset/utf8.h> +#include <util/charset/wide.h> + +/// @todo: escape trigraphs (eg "??/" is "\") + +/* REFERENCES FOR ESCAPE SEQUENCE INTERPRETATION: + * C99 p. 6.4.3 Universal character names. + * C99 p. 6.4.4.4 Character constants. 
+ * + * <simple-escape-sequence> ::= { + * \' , \" , \? , \\ , + * \a , \b , \f , \n , \r , \t , \v + * } + * + * <octal-escape-sequence> ::= \ <octal-digit> {1, 3} + * <hexadecimal-escape-sequence> ::= \x <hexadecimal-digit> + + * <universal-character-name> ::= \u <hexadecimal-digit> {4} + * || \U <hexadecimal-digit> {8} + * + * NOTE (6.4.4.4.7): + * Each octal or hexadecimal escape sequence is the longest sequence of characters that can + * constitute the escape sequence. + * + * THEREFORE: + * - Octal escape sequence spans until the first non-octal-digit character. + * - Octal escape sequence always terminates after three octal digits. + * - Hexadecimal escape sequence spans until the first non-hexadecimal-digit character. + * - Universal character name consists of exactly 4 or 8 hexadecimal digits. + * + * by kerzum@ + * It is also required to escape trigraphs that are enabled in compilers by default and + * are also processed inside string literals + * The nine trigraphs and their replacements are + * + * Trigraph: ??( ??) ??< ??> ??= ??/ ??' ??! 
??- + * Replacement: [ ] { } # \ ^ | ~ + * + */ +namespace { + template <typename TChar> + static inline char HexDigit(TChar value) { + Y_ASSERT(value < 16); + if (value < 10) { + return '0' + value; + } else { + return 'A' + value - 10; + } + } + + template <typename TChar> + static inline char OctDigit(TChar value) { + Y_ASSERT(value < 8); + return '0' + value; + } + + template <typename TChar> + static inline bool IsPrintable(TChar c) { + return c >= 32 && c <= 126; + } + + template <typename TChar> + static inline bool IsHexDigit(TChar c) { + return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); + } + + template <typename TChar> + static inline bool IsOctDigit(TChar c) { + return c >= '0' && c <= '7'; + } + + template <typename TChar> + struct TEscapeUtil; + + template <> + struct TEscapeUtil<char> { + static const size_t ESCAPE_C_BUFFER_SIZE = 4; + + template <typename TNextChar, typename TBufferChar> + static inline size_t EscapeC(unsigned char c, TNextChar next, TBufferChar r[ESCAPE_C_BUFFER_SIZE]) { + // (1) Printable characters go as-is, except backslash and double quote. + // (2) Characters \r, \n, \t and \0 ... \7 replaced by their simple escape characters (if possible). + // (3) Otherwise, character is encoded using hexadecimal escape sequence (if possible), or octal. + if (c == '\"') { + r[0] = '\\'; + r[1] = '\"'; + return 2; + } else if (c == '\\') { + r[0] = '\\'; + r[1] = '\\'; + return 2; + } else if (IsPrintable(c) && (!(c == '?' 
&& next == '?'))) { + r[0] = c; + return 1; + } else if (c == '\r') { + r[0] = '\\'; + r[1] = 'r'; + return 2; + } else if (c == '\n') { + r[0] = '\\'; + r[1] = 'n'; + return 2; + } else if (c == '\t') { + r[0] = '\\'; + r[1] = 't'; + return 2; + } else if (c < 8 && !IsOctDigit(next)) { + r[0] = '\\'; + r[1] = OctDigit(c); + return 2; + } else if (!IsHexDigit(next)) { + r[0] = '\\'; + r[1] = 'x'; + r[2] = HexDigit((c & 0xF0) >> 4); + r[3] = HexDigit((c & 0x0F) >> 0); + return 4; + } else { + r[0] = '\\'; + r[1] = OctDigit((c & 0700) >> 6); + r[2] = OctDigit((c & 0070) >> 3); + r[3] = OctDigit((c & 0007) >> 0); + return 4; + } + } + }; + + template <> + struct TEscapeUtil<wchar16> { + static const size_t ESCAPE_C_BUFFER_SIZE = 6; + + template <typename TNextChar, typename TBufferChar> + static inline size_t EscapeC(wchar16 c, TNextChar next, TBufferChar r[ESCAPE_C_BUFFER_SIZE]) { + if (c < 0x100) { + return TEscapeUtil<char>::EscapeC(char(c), next, r); + } else { + r[0] = '\\'; + r[1] = 'u'; + r[2] = HexDigit((c & 0xF000) >> 12); + r[3] = HexDigit((c & 0x0F00) >> 8); + r[4] = HexDigit((c & 0x00F0) >> 4); + r[5] = HexDigit((c & 0x000F) >> 0); + return 6; + } + } + }; +} + +template <class TChar> +TBasicString<TChar>& EscapeCImpl(const TChar* str, size_t len, TBasicString<TChar>& r) { + using TEscapeUtil = ::TEscapeUtil<TChar>; + + TChar buffer[TEscapeUtil::ESCAPE_C_BUFFER_SIZE]; + + size_t i, j; + for (i = 0, j = 0; i < len; ++i) { + size_t rlen = TEscapeUtil::EscapeC(str[i], (i + 1 < len ? 
str[i + 1] : 0), buffer); + + if (rlen > 1) { + r.append(str + j, i - j); + j = i + 1; + r.append(buffer, rlen); + } + } + + if (j > 0) { + r.append(str + j, len - j); + } else { + r.append(str, len); + } + + return r; +} + +template TString& EscapeCImpl<TString::TChar>(const TString::TChar* str, size_t len, TString& r); +template TUtf16String& EscapeCImpl<TUtf16String::TChar>(const TUtf16String::TChar* str, size_t len, TUtf16String& r); + +namespace { + template <class TStr> + inline void AppendUnicode(TStr& s, wchar32 v) { + char buf[10]; + size_t sz = 0; + + WriteUTF8Char(v, sz, (ui8*)buf); + s.AppendNoAlias(buf, sz); + } + + inline void AppendUnicode(TUtf16String& s, wchar32 v) { + WriteSymbol(v, s); + } + + template <ui32 sz, typename TChar> + inline size_t CountHex(const TChar* p, const TChar* pe) { + auto b = p; + auto e = Min(p + sz, pe); + + while (b < e && IsHexDigit(*b)) { + ++b; + } + + return b - p; + } + + template <size_t sz, typename TChar, typename T> + inline bool ParseHex(const TChar* p, const TChar* pe, T& t) noexcept { + return (p + sz <= pe) && TryIntFromString<16>(p, sz, t); + } + + template <ui32 sz, typename TChar> + inline size_t CountOct(const TChar* p, const TChar* pe) { + ui32 maxsz = Min<size_t>(sz, pe - p); + + if (3 == sz && 3 == maxsz && !(*p >= '0' && *p <= '3')) { + maxsz = 2; + } + + for (ui32 i = 0; i < maxsz; ++i, ++p) { + if (!IsOctDigit(*p)) { + return i; + } + } + + return maxsz; + } +} + +template <class TChar, class TStr> +static TStr& DoUnescapeC(const TChar* p, size_t sz, TStr& res) { + const TChar* pe = p + sz; + + while (p != pe) { + if ('\\' == *p) { + ++p; + + if (p == pe) { + return res; + } + + switch (*p) { + default: + res.append(*p); + break; + case 'a': + res.append('\a'); + break; + case 'b': + res.append('\b'); + break; + case 'f': + res.append('\f'); + break; + case 'n': + res.append('\n'); + break; + case 'r': + res.append('\r'); + break; + case 't': + res.append('\t'); + break; + case 'v': + 
res.append('\v'); + break; + case 'u': { + ui16 cp[2]; + + if (ParseHex<4>(p + 1, pe, cp[0])) { + if (Y_UNLIKELY(cp[0] >= 0xD800 && cp[0] <= 0xDBFF && ParseHex<4>(p + 7, pe, cp[1]) && p[5] == '\\' && p[6] == 'u')) { + const wchar16 wbuf[] = {wchar16(cp[0]), wchar16(cp[1])}; + AppendUnicode(res, ReadSymbol(wbuf, wbuf + 2)); + p += 10; + } else { + AppendUnicode(res, (wchar32)cp[0]); + p += 4; + } + } else { + res.append(*p); + } + + break; + } + + case 'U': + if (CountHex<8>(p + 1, pe) != 8) { + res.append(*p); + } else { + AppendUnicode(res, IntFromString<ui32, 16>(p + 1, 8)); + p += 8; + } + break; + case 'x': + if (ui32 v = CountHex<2>(p + 1, pe)) { + res.append((TChar)IntFromString<ui32, 16>(p + 1, v)); + p += v; + } else { + res.append(*p); + } + + break; + case '0': + case '1': + case '2': + case '3': { + ui32 v = CountOct<3>(p, pe); // v is always positive + res.append((TChar)IntFromString<ui32, 8>(p, v)); + p += v - 1; + } break; + case '4': + case '5': + case '6': + case '7': { + ui32 v = CountOct<2>(p, pe); // v is always positive + res.append((TChar)IntFromString<ui32, 8>(p, v)); + p += v - 1; + } break; + } + + ++p; + } else { + const auto r = std::basic_string_view<TChar>(p, pe - p).find('\\'); + const auto n = r != std::string::npos ? 
p + r : pe; + + res.append(p, n); + p = n; + } + } + + return res; +} + +template <class TChar> +TBasicString<TChar>& UnescapeCImpl(const TChar* p, size_t sz, TBasicString<TChar>& res) { + return DoUnescapeC(p, sz, res); +} + +template <class TChar> +TChar* UnescapeC(const TChar* str, size_t len, TChar* buf) { + struct TUnboundedString { + void append(TChar ch) noexcept { + *P++ = ch; + } + + void append(const TChar* b, const TChar* e) noexcept { + while (b != e) { + append(*b++); + } + } + + void AppendNoAlias(const TChar* s, size_t l) noexcept { + append(s, s + l); + } + + TChar* P; + } bufbuf = {buf}; + + return DoUnescapeC(str, len, bufbuf).P; +} + +template TString& UnescapeCImpl<TString::TChar>(const TString::TChar* str, size_t len, TString& r); +template TUtf16String& UnescapeCImpl<TUtf16String::TChar>(const TUtf16String::TChar* str, size_t len, TUtf16String& r); + +template char* UnescapeC<char>(const char* str, size_t len, char* buf); + +template <class TChar> +size_t UnescapeCCharLen(const TChar* begin, const TChar* end) { + if (begin >= end) { + return 0; + } + if (*begin != '\\') { + return 1; + } + if (++begin == end) { + return 1; + } + + switch (*begin) { + default: + return 2; + case 'u': + return CountHex<4>(begin + 1, end) == 4 ? 6 : 2; + case 'U': + return CountHex<8>(begin + 1, end) == 8 ? 
10 : 2; + case 'x': + return 2 + CountHex<2>(begin + 1, end); + case '0': + case '1': + case '2': + case '3': + return 1 + CountOct<3>(begin, end); // >= 2 + case '4': + case '5': + case '6': + case '7': + return 1 + CountOct<2>(begin, end); // >= 2 + } +} + +template size_t UnescapeCCharLen<char>(const char* begin, const char* end); +template size_t UnescapeCCharLen<TUtf16String::TChar>(const TUtf16String::TChar* begin, const TUtf16String::TChar* end); + +TString& EscapeC(const TStringBuf str, TString& s) { + return EscapeC(str.data(), str.size(), s); +} + +TUtf16String& EscapeC(const TWtringBuf str, TUtf16String& w) { + return EscapeC(str.data(), str.size(), w); +} + +TString EscapeC(const TString& str) { + return EscapeC(str.data(), str.size()); +} + +TUtf16String EscapeC(const TUtf16String& str) { + return EscapeC(str.data(), str.size()); +} + +TString& UnescapeC(const TStringBuf str, TString& s) { + return UnescapeC(str.data(), str.size(), s); +} + +TUtf16String& UnescapeC(const TWtringBuf str, TUtf16String& w) { + return UnescapeC(str.data(), str.size(), w); +} + +TString UnescapeC(const TStringBuf str) { + return UnescapeC(str.data(), str.size()); +} + +TUtf16String UnescapeC(const TWtringBuf str) { + return UnescapeC(str.data(), str.size()); +} diff --git a/util/string/escape.h b/util/string/escape.h new file mode 100644 index 0000000000..b01be65b0e --- /dev/null +++ b/util/string/escape.h @@ -0,0 +1,70 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/generic/strbuf.h> + +template <class TChar> +TBasicString<TChar>& EscapeCImpl(const TChar* str, size_t len, TBasicString<TChar>&); + +template <class TChar> +TBasicString<TChar>& UnescapeCImpl(const TChar* str, size_t len, TBasicString<TChar>&); + +template <class TChar> +TChar* UnescapeC(const TChar* str, size_t len, TChar* buf); + +template <typename TChar> +static inline TBasicString<TChar>& EscapeC(const TChar* str, size_t len, TBasicString<TChar>& s) { + return EscapeCImpl(str, len, 
s); +} + +template <typename TChar> +static inline TBasicString<TChar> EscapeC(const TChar* str, size_t len) { + TBasicString<TChar> s; + return EscapeC(str, len, s); +} + +template <typename TChar> +static inline TBasicString<TChar> EscapeC(const TBasicStringBuf<TChar>& str) { + return EscapeC(str.data(), str.size()); +} + +template <typename TChar> +static inline TBasicString<TChar>& UnescapeC(const TChar* str, size_t len, TBasicString<TChar>& s) { + return UnescapeCImpl(str, len, s); +} + +template <typename TChar> +static inline TBasicString<TChar> UnescapeC(const TChar* str, size_t len) { + TBasicString<TChar> s; + return UnescapeCImpl(str, len, s); +} + +template <typename TChar> +static inline TBasicString<TChar> EscapeC(TChar ch) { + return EscapeC(&ch, 1); +} + +template <typename TChar> +static inline TBasicString<TChar> EscapeC(const TChar* str) { + return EscapeC(str, std::char_traits<TChar>::length(str)); +} + +TString& EscapeC(const TStringBuf str, TString& res); +TUtf16String& EscapeC(const TWtringBuf str, TUtf16String& res); + +// these two need to be methods, because of TBasicString::Quote implementation +TString EscapeC(const TString& str); +TUtf16String EscapeC(const TUtf16String& str); + +TString& UnescapeC(const TStringBuf str, TString& res); +TUtf16String& UnescapeC(const TWtringBuf str, TUtf16String& res); + +TString UnescapeC(const TStringBuf str); +TUtf16String UnescapeC(const TWtringBuf wtr); + +/// Returns number of chars in escape sequence. 
+/// - 0, if begin >= end +/// - 1, if [begin, end) starts with an unescaped char +/// - at least 2 (including '\'), if [begin, end) starts with an escaped symbol +template <class TChar> +size_t UnescapeCCharLen(const TChar* begin, const TChar* end); diff --git a/util/string/escape_ut.cpp b/util/string/escape_ut.cpp new file mode 100644 index 0000000000..cd38ecffd3 --- /dev/null +++ b/util/string/escape_ut.cpp @@ -0,0 +1,148 @@ +#include "escape.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/string.h> +#include <util/charset/wide.h> + +using namespace std::string_view_literals; + +namespace { + struct TExample { + TString Expected; + TString Source; + + TExample(const TStringBuf expected, const TStringBuf source) + : Expected{expected} + , Source{source} + { + } + }; +} + +static const TExample CommonTestData[] = { + // Should be valid UTF-8. + {"http://ya.ru/", "http://ya.ru/"}, + {"http://ya.ru/\\x17\\n", "http://ya.ru/\x17\n"}, + + {"http://ya.ru/\\0", "http://ya.ru/\0"sv}, + {"http://ya.ru/\\0\\0", "http://ya.ru/\0\0"sv}, + {"http://ya.ru/\\0\\0000", "http://ya.ru/\0\0" + "0"sv}, + {"http://ya.ru/\\0\\0001", "http://ya.ru/\0\x00" + "1"sv}, + + {R"(\2\4\00678)", "\2\4\6" + "78"sv}, // \6 -> \006 because next char '7' is "octal" + {R"(\2\4\689)", "\2\4\6" + "89"sv}, // \6 -> \6 because next char '8' is not "octal" + + {R"(\"Hello\", Alice said.)", "\"Hello\", Alice said."}, + {"Slash\\\\dash!", "Slash\\dash!"}, + {R"(There\nare\r\nnewlines.)", "There\nare\r\nnewlines."}, + {"There\\tare\\ttabs.", "There\tare\ttabs."}, + + {"There are questions \\x3F\\x3F?", "There are questions ???"}, + {"There are questions \\x3F?", "There are questions ??"}, +}; + +Y_UNIT_TEST_SUITE(TEscapeCTest) { + Y_UNIT_TEST(TestStrokaEscapeC) { + for (const auto& e : CommonTestData) { + TString expected(e.Expected); + TString source(e.Source); + TString actual(EscapeC(e.Source)); + TString actual2(UnescapeC(e.Expected)); + + 
UNIT_ASSERT_VALUES_EQUAL(e.Expected, actual); + UNIT_ASSERT_VALUES_EQUAL(e.Source, actual2); + } + + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\x17\\n\\xAB", EscapeC(TString("http://ya.ru/\x17\n\xab"))); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\x17\n\xab", UnescapeC(TString("http://ya.ru/\\x17\\n\\xAB"))); + UNIT_ASSERT_VALUES_EQUAL("h", EscapeC('h')); + UNIT_ASSERT_VALUES_EQUAL("h", UnescapeC(TString("h"))); + UNIT_ASSERT_VALUES_EQUAL("\\xFF", EscapeC('\xFF')); + UNIT_ASSERT_VALUES_EQUAL("\xFF", UnescapeC(TString("\\xFF"))); + + UNIT_ASSERT_VALUES_EQUAL("\\377f", EscapeC(TString("\xff" + "f"))); + UNIT_ASSERT_VALUES_EQUAL("\xff" + "f", + UnescapeC(TString("\\377f"))); + UNIT_ASSERT_VALUES_EQUAL("\\xFFg", EscapeC(TString("\xff" + "g"))); + UNIT_ASSERT_VALUES_EQUAL("\xff" + "g", + UnescapeC(TString("\\xFFg"))); + UNIT_ASSERT_VALUES_EQUAL("\xEA\x9A\x96", UnescapeC(TString("\\uA696"))); + UNIT_ASSERT_VALUES_EQUAL("Странный компроматтест", UnescapeC(TString("\\u0421\\u0442\\u0440\\u0430\\u043d\\u043d\\u044b\\u0439 \\u043a\\u043e\\u043c\\u043f\\u0440\\u043e\\u043c\\u0430\\u0442тест"))); + } + + Y_UNIT_TEST(TestWtrokaEscapeC) { + for (const auto& e : CommonTestData) { + TUtf16String expected(UTF8ToWide(e.Expected)); + TUtf16String source(UTF8ToWide(e.Source)); + TUtf16String actual(EscapeC(source)); + TUtf16String actual2(UnescapeC(expected)); + + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + UNIT_ASSERT_VALUES_EQUAL(source, actual2); + } + + UNIT_ASSERT_VALUES_EQUAL(u"http://ya.ru/\\x17\\n\\u1234", EscapeC(u"http://ya.ru/\x17\n\u1234")); + UNIT_ASSERT_VALUES_EQUAL(u"h", EscapeC(u'h')); + UNIT_ASSERT_VALUES_EQUAL(u"\\xFF", EscapeC(wchar16(255))); + } + + Y_UNIT_TEST(TestEscapeTrigraphs) { + UNIT_ASSERT_VALUES_EQUAL("?", EscapeC(TString("?"))); + UNIT_ASSERT_VALUES_EQUAL("\\x3F?", EscapeC(TString("??"))); + UNIT_ASSERT_VALUES_EQUAL("\\x3F\\x3F?", EscapeC(TString("???"))); + // ok but may cause warning about trigraphs + // UNIT_ASSERT_VALUES_EQUAL("[x]?z", 
EscapeC(TString("??(x??)?z"))); + UNIT_ASSERT_VALUES_EQUAL("\\x3F?x\\x3F\\x3F?z", EscapeC(TString("??x???z"))); + } + + Y_UNIT_TEST(TestUnescapeCCharLen) { + auto test = [](const char* str, size_t len) { + UNIT_ASSERT_EQUAL(UnescapeCCharLen(str, str + strlen(str)), len); + }; + + test("", 0); + test("abc", 1); + test("\\", 1); + test("\\\\", 2); + test("\\#", 2); + test("\\n10", 2); + test("\\r\\n", 2); + test("\\x05abc", 4); + test("\\u11117777", 6); + test("\\u123yyy", 2); + test("\\U11117777cccc", 10); + test("\\U111yyy", 2); + test("\\0\\1", 2); + test("\\01\\1", 3); + test("\\012\\1", 4); + test("\\0123\\1", 4); + test("\\4\\1", 2); + test("\\40\\1", 3); + test("\\400\\1", 3); + test("\\4xxx", 2); + } + + Y_UNIT_TEST(TestUnbounded) { + char buf[100000]; + + for (const auto& x : CommonTestData) { + char* end = UnescapeC(x.Expected.data(), x.Expected.size(), buf); + + UNIT_ASSERT_VALUES_EQUAL(x.Source, TStringBuf(buf, end)); + } + } + + Y_UNIT_TEST(TestCapitalUEscapes) { + UNIT_ASSERT_VALUES_EQUAL(UnescapeC("\\U00000020"), " "); + UNIT_ASSERT_VALUES_EQUAL(UnescapeC("\\Uxxx"), "Uxxx"); + } +} diff --git a/util/string/fuzzing/collapse/main.cpp b/util/string/fuzzing/collapse/main.cpp new file mode 100644 index 0000000000..e7b09f0f55 --- /dev/null +++ b/util/string/fuzzing/collapse/main.cpp @@ -0,0 +1,12 @@ +#include <util/string/strip.h> +#include <util/charset/wide.h> + +extern "C" int LLVMFuzzerTestOneInput(const ui8* data, size_t size) { + TUtf16String w((const wchar16*)data, size / 2); + Collapse(w); + + TString s((const char*)data, size); + CollapseInPlace(s); + + return 0; // Non-zero return values are reserved for future use. 
+} diff --git a/util/string/fuzzing/collapse/ya.make b/util/string/fuzzing/collapse/ya.make new file mode 100644 index 0000000000..b8614f6411 --- /dev/null +++ b/util/string/fuzzing/collapse/ya.make @@ -0,0 +1,13 @@ +FUZZ() + +OWNER( + pg + g:util +) +SUBSCRIBER(g:util-subscribers) + +SRCS( + main.cpp +) + +END() diff --git a/util/string/fuzzing/escape_c/main.cpp b/util/string/fuzzing/escape_c/main.cpp new file mode 100644 index 0000000000..742126416a --- /dev/null +++ b/util/string/fuzzing/escape_c/main.cpp @@ -0,0 +1,11 @@ +#include <util/generic/string.h> +#include <util/string/escape.h> + +extern "C" int LLVMFuzzerTestOneInput(const ui8* const data, const size_t size) { + const TString src(reinterpret_cast<const char*>(data), size); + const auto escaped = EscapeC(src); + const auto dst = UnescapeC(escaped); + + Y_VERIFY(src == dst); + return 0; +} diff --git a/util/string/fuzzing/escape_c/ya.make b/util/string/fuzzing/escape_c/ya.make new file mode 100644 index 0000000000..61e64ac9de --- /dev/null +++ b/util/string/fuzzing/escape_c/ya.make @@ -0,0 +1,13 @@ +OWNER( + yazevnul + g:util +) +SUBSCRIBER(g:util-subscribers) + +FUZZ() + +SRCS( + main.cpp +) + +END() diff --git a/util/string/fuzzing/strtod/main.cpp b/util/string/fuzzing/strtod/main.cpp new file mode 100644 index 0000000000..50ea2a6afc --- /dev/null +++ b/util/string/fuzzing/strtod/main.cpp @@ -0,0 +1,9 @@ +#include <util/string/cast.h> + +extern "C" int LLVMFuzzerTestOneInput(const ui8* data, size_t size) { + double res; + + TryFromString<double>((const char*)data, size, res); + + return 0; // Non-zero return values are reserved for future use. 
+} diff --git a/util/string/fuzzing/strtod/ya.make b/util/string/fuzzing/strtod/ya.make new file mode 100644 index 0000000000..b8614f6411 --- /dev/null +++ b/util/string/fuzzing/strtod/ya.make @@ -0,0 +1,13 @@ +FUZZ() + +OWNER( + pg + g:util +) +SUBSCRIBER(g:util-subscribers) + +SRCS( + main.cpp +) + +END() diff --git a/util/string/fuzzing/ya.make b/util/string/fuzzing/ya.make new file mode 100644 index 0000000000..617e0f2b1d --- /dev/null +++ b/util/string/fuzzing/ya.make @@ -0,0 +1,11 @@ +OWNER( + g:util + pg +) +SUBSCRIBER(g:util-subscribers) + +RECURSE( + collapse + escape_c + strtod +) diff --git a/util/string/hex.cpp b/util/string/hex.cpp new file mode 100644 index 0000000000..667397987f --- /dev/null +++ b/util/string/hex.cpp @@ -0,0 +1,63 @@ +#include "hex.h" + +const char* const Char2DigitTable = ("\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff" //0-9 + "\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" //A-Z + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" //a-z + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"); + +char* HexEncode(const void* in, size_t len, char* out) { 
+ const unsigned char* b = (const unsigned char*)in; + const unsigned char* e = b + len; + + while (b != e) { + *out++ = DigitToChar(*b / 16); + *out++ = DigitToChar(*b++ % 16); + } + + return out; +} + +void* HexDecode(const void* in, size_t len, void* ptr) { + const char* b = (const char*)in; + const char* e = b + len; + Y_ENSURE(!(len & 1), TStringBuf("Odd buffer length passed to HexDecode")); + + char* out = (char*)ptr; + + while (b != e) { + *out++ = (char)String2Byte(b); + b += 2; + } + + return out; +} + +TString HexEncode(const void* in, size_t len) { + TString ret; + + ret.ReserveAndResize(len << 1); + HexEncode(in, len, ret.begin()); + + return ret; +} + +TString HexDecode(const void* in, size_t len) { + TString ret; + + ret.ReserveAndResize(len >> 1); + HexDecode(in, len, ret.begin()); + + return ret; +} diff --git a/util/string/hex.h b/util/string/hex.h new file mode 100644 index 0000000000..af3d2d528f --- /dev/null +++ b/util/string/hex.h @@ -0,0 +1,59 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/generic/yexception.h> +#include <util/system/yassert.h> + +inline static char DigitToChar(unsigned char digit) { + if (digit < 10) { + return (char)digit + '0'; + } + + return (char)(digit - 10) + 'A'; +} + +extern const char* const Char2DigitTable; + +inline static int Char2Digit(char ch) { + char result = Char2DigitTable[(unsigned char)ch]; + Y_ENSURE(result != '\xff', "invalid hex character " << (int)ch); + return result; +} + +//! Convert a hex string of exactly 2 chars to int +/*! @example String2Byte("10") => 16 */ +inline static int String2Byte(const char* s) { + return Char2Digit(*s) * 16 + Char2Digit(*(s + 1)); +} + +char* HexEncode(const void* in, size_t len, char* out); + +TString HexEncode(const void* in, size_t len); + +inline TString HexEncode(const TStringBuf h) { + return HexEncode(h.data(), h.size()); +} + +//! 
Convert a hex string @c in of @c len chars (case-insensitive) to array of ints stored at @c ptr. +/*! @note len must be even (len % 2 == 0), otherwise an exception will be thrown. + * @return pointer just past the last byte written to @c ptr (i.e. (char*)ptr + len/2); each written char + * holds the numeric value equal to the corresponding 2 digits of the input stream. + * @warning You must ensure that @c ptr has (len/2) allocated bytes, otherwise SIGSEGV will happen. + * + * @example HexDecode("beef", 4, ptr) => {190, 239} + */ +void* HexDecode(const void* in, size_t len, void* ptr); + +//! Convert a hex string @c in of @c len chars (case-insensitive) to array of ints and return this array. +/*! @note len must be even (len % 2 == 0), otherwise an exception will be thrown. + * @return an array of chars, where each char holds the numeric value equal to the corresponding 2 digits + * of the input stream. + * + * @example HexDecode("beef", 4) => {190, 239} + */ +TString HexDecode(const void* in, size_t len); + +//! Convert an ASCII hex-string (case-insensitive) to the binary form. Note that h.size() must be even (h.size() % 2 == 0). 
+inline TString HexDecode(const TStringBuf h) { + return HexDecode(h.data(), h.size()); +} diff --git a/util/string/hex_ut.cpp b/util/string/hex_ut.cpp new file mode 100644 index 0000000000..39a83d5e62 --- /dev/null +++ b/util/string/hex_ut.cpp @@ -0,0 +1,19 @@ +#include "hex.h" + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(THexCodingTest) { + Y_UNIT_TEST(TestEncode) { + UNIT_ASSERT_EQUAL(HexEncode("i1634iqwbf,&msdb"), "693136333469717762662C266D736462"); + } + + Y_UNIT_TEST(TestDecode) { + UNIT_ASSERT_EQUAL(HexDecode("693136333469717762662C266D736462"), "i1634iqwbf,&msdb"); + } + + Y_UNIT_TEST(TestDecodeCase) { + UNIT_ASSERT_EQUAL(HexDecode("12ABCDEF"), HexDecode("12abcdef")); + UNIT_ASSERT_EXCEPTION(HexDecode("Hello"), yexception); //< incorrect chars + UNIT_ASSERT_EXCEPTION(HexDecode("123"), yexception); //< odd length + } +} diff --git a/util/string/join.cpp b/util/string/join.cpp new file mode 100644 index 0000000000..3f88e23128 --- /dev/null +++ b/util/string/join.cpp @@ -0,0 +1 @@ +#include "join.h" diff --git a/util/string/join.h b/util/string/join.h new file mode 100644 index 0000000000..b166fad1f3 --- /dev/null +++ b/util/string/join.h @@ -0,0 +1,265 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/generic/typetraits.h> +#include <util/string/cast.h> +#include "cast.h" + +/* + * Default implementation of AppendToString uses a temporary TString object which is inefficient. You can overload it + * for your type to speed up string joins. If you already have an Out() or operator<<() implementation you can simply + * do the following: + * + * inline void AppendToString(TString& dst, const TMyType& t) { + * TStringOutput o(dst); + * o << t; + * } + * + * Unfortunately we can't do this by default because for some types ToString() is defined while Out() is not. + * For standard types (strings of all kinds and arithmetic types) we don't use a temporary TString in AppendToString(). 
+ */ + +template <typename TCharType, typename T> +inline std::enable_if_t<!std::is_arithmetic<std::remove_cv_t<T>>::value, void> +AppendToString(TBasicString<TCharType>& dst, const T& t) { + dst.AppendNoAlias(ToString(t)); +} + +template <typename TCharType, typename T> +inline std::enable_if_t<std::is_arithmetic<std::remove_cv_t<T>>::value, void> +AppendToString(TBasicString<TCharType>& dst, const T& t) { + char buf[512]; + dst.append(buf, ToString<std::remove_cv_t<T>>(t, buf, sizeof(buf))); +} + +template <typename TCharType> +inline void AppendToString(TBasicString<TCharType>& dst, const TCharType* t) { + dst.append(t); +} + +template <typename TCharType> +inline void AppendToString(TBasicString<TCharType>& dst, TBasicStringBuf<TCharType> t) { + dst.append(t); +} + +namespace NPrivate { + template <typename T> + inline size_t GetLength(const T&) { + // By default don't pre-allocate space when joining and appending non-string types. + // This code can be extended by estimating stringified length for specific types (e.g. 10 for ui32). + return 0; + } + + template <> + inline size_t GetLength(const TString& s) { + return s.length(); + } + + template <> + inline size_t GetLength(const TStringBuf& s) { + return s.length(); + } + + template <> + inline size_t GetLength(const char* const& s) { + return (s ? std::char_traits<char>::length(s) : 0); + } + + inline size_t GetAppendLength(const TStringBuf /*delim*/) { + return 0; + } + + template <typename TFirst, typename... TRest> + size_t GetAppendLength(const TStringBuf delim, const TFirst& f, const TRest&... r) { + return delim.length() + ::NPrivate::GetLength(f) + ::NPrivate::GetAppendLength(delim, r...); + } +} + +template <typename TCharType> +inline void AppendJoinNoReserve(TBasicString<TCharType>&, TBasicStringBuf<TCharType>) { +} + +template <typename TCharType, typename TFirst, typename... 
TRest> +inline void AppendJoinNoReserve(TBasicString<TCharType>& dst, TBasicStringBuf<TCharType> delim, const TFirst& f, const TRest&... r) { + AppendToString(dst, delim); + AppendToString(dst, f); + AppendJoinNoReserve(dst, delim, r...); +} + +template <typename... TValues> +inline void AppendJoin(TString& dst, const TStringBuf delim, const TValues&... values) { + const size_t appendLength = ::NPrivate::GetAppendLength(delim, values...); + if (appendLength > 0) { + dst.reserve(dst.length() + appendLength); + } + AppendJoinNoReserve(dst, delim, values...); +} + +template <typename TFirst, typename... TRest> +inline TString Join(const TStringBuf delim, const TFirst& f, const TRest&... r) { + TString ret = ToString(f); + AppendJoin(ret, delim, r...); + return ret; +} + +// Note that char delimiter @cdelim will be printed as single char string, +// but any char value @v will be printed as corresponding numeric code. +// For example, Join('a', 'a', 'a') will print "97a97" (see unit-test). +template <typename... TValues> +inline TString Join(char cdelim, const TValues&... 
v) { + return Join(TStringBuf(&cdelim, 1), v...); +} + +namespace NPrivate { + template <typename TCharType, typename TIter> + inline TBasicString<TCharType> JoinRange(TBasicStringBuf<TCharType> delim, const TIter beg, const TIter end) { + TBasicString<TCharType> out; + if (beg != end) { + size_t total = ::NPrivate::GetLength(*beg); + for (TIter pos = beg; ++pos != end;) { + total += delim.length() + ::NPrivate::GetLength(*pos); + } + if (total > 0) { + out.reserve(total); + } + + AppendToString(out, *beg); + for (TIter pos = beg; ++pos != end;) { + AppendJoinNoReserve(out, delim, *pos); + } + } + + return out; + } + +} // namespace NPrivate + +template <typename TIter> +TString JoinRange(std::string_view delim, const TIter beg, const TIter end) { + return ::NPrivate::JoinRange<char>(delim, beg, end); +} + +template <typename TIter> +TString JoinRange(char delim, const TIter beg, const TIter end) { + TStringBuf delimBuf(&delim, 1); + return ::NPrivate::JoinRange<char>(delimBuf, beg, end); +} + +template <typename TIter> +TUtf16String JoinRange(std::u16string_view delim, const TIter beg, const TIter end) { + return ::NPrivate::JoinRange<wchar16>(delim, beg, end); +} + +template <typename TIter> +TUtf16String JoinRange(wchar16 delim, const TIter beg, const TIter end) { + TWtringBuf delimBuf(&delim, 1); + return ::NPrivate::JoinRange<wchar16>(delimBuf, beg, end); +} + +template <typename TIter> +TUtf32String JoinRange(std::u32string_view delim, const TIter beg, const TIter end) { + return ::NPrivate::JoinRange<wchar32>(delim, beg, end); +} + +template <typename TIter> +TUtf32String JoinRange(wchar32 delim, const TIter beg, const TIter end) { + TUtf32StringBuf delimBuf(&delim, 1); + return ::NPrivate::JoinRange<wchar32>(delimBuf, beg, end); +} + +template <typename TCharType, typename TContainer> +inline TBasicString<TCharType> JoinSeq(std::basic_string_view<TCharType> delim, const TContainer& data) { + using std::begin; + using std::end; + return JoinRange(delim, 
begin(data), end(data)); +} + +template <typename TCharType, typename TContainer> +inline TBasicString<TCharType> JoinSeq(const TCharType* delim, const TContainer& data) { + TBasicStringBuf<TCharType> delimBuf = delim; + return JoinSeq(delimBuf, data); +} + +template <typename TCharType, typename TContainer> +inline TBasicString<TCharType> JoinSeq(const TBasicString<TCharType>& delim, const TContainer& data) { + TBasicStringBuf<TCharType> delimBuf = delim; + return JoinSeq(delimBuf, data); +} + +template <typename TCharType, typename TContainer> +inline std::enable_if_t< + std::is_same_v<TCharType, char> || + std::is_same_v<TCharType, char16_t> || + std::is_same_v<TCharType, char32_t>, + TBasicString<TCharType>> +JoinSeq(TCharType delim, const TContainer& data) { + TBasicStringBuf<TCharType> delimBuf(&delim, 1); + return JoinSeq(delimBuf, data); +} + +/** \brief Functor for streaming the objects of a range from TIterB b to TIterE e, separated with delim. + * Unlike JoinSeq, JoinRange and Join, no TString object is built - everything depends on operator<< for the type and + * the implementation of IOutputStream + */ +template <class TIterB, class TIterE> +struct TRangeJoiner { + friend constexpr IOutputStream& operator<<(IOutputStream& stream, const TRangeJoiner<TIterB, TIterE>& rangeJoiner) { + if (rangeJoiner.b != rangeJoiner.e) { + stream << *rangeJoiner.b; + + for (auto it = std::next(rangeJoiner.b); it != rangeJoiner.e; ++it) + stream << rangeJoiner.delim << *it; + } + return stream; + } + + constexpr TRangeJoiner(TStringBuf delim, TIterB&& b, TIterE&& e) + : delim(delim) + , b(std::forward<TIterB>(b)) + , e(std::forward<TIterE>(e)) + { + } + +private: + const TStringBuf delim; + const TIterB b; + const TIterE e; +}; + +template <class TIterB, class TIterE = TIterB> +constexpr auto MakeRangeJoiner(TStringBuf delim, TIterB&& b, TIterE&& e) { + return TRangeJoiner<TIterB, TIterE>(delim, std::forward<TIterB>(b), std::forward<TIterE>(e)); +} + +template <class TContainer> 
+constexpr auto MakeRangeJoiner(TStringBuf delim, const TContainer& data) { + return MakeRangeJoiner(delim, std::cbegin(data), std::cend(data)); +} + +template <class TVal> +constexpr auto MakeRangeJoiner(TStringBuf delim, const std::initializer_list<TVal>& data) { + return MakeRangeJoiner(delim, std::cbegin(data), std::cend(data)); +} + +/* We force (std::initializer_list<TStringBuf>) input type for (TString) and (const char*) types because: + * # When (std::initializer_list<TString>) is used, TString objects are copied into the initializer_list object. + * Storing TStringBufs instead is faster, even with COW-enabled strings. + * # For (const char*) we calculate length only once and store it in TStringBuf. Otherwise strlen scan would be executed + * in both GetAppendLength and AppendToString. For string literals constant lengths get propagated in compile-time. + * + * This way JoinSeq(",", { s1, s2 }) always does the right thing whatever types s1 and s2 have. + * + * If someone needs to join std::initializer_list<TString> -- it still works because of the TContainer template above. 
+*/ + +template <typename T> +inline std::enable_if_t< + !std::is_same<std::decay_t<T>, TString>::value && !std::is_same<std::decay_t<T>, const char*>::value, + TString> +JoinSeq(const TStringBuf delim, const std::initializer_list<T>& data) { + return JoinRange(delim, data.begin(), data.end()); +} + +inline TString JoinSeq(const TStringBuf delim, const std::initializer_list<TStringBuf>& data) { + return JoinRange(delim, data.begin(), data.end()); +} diff --git a/util/string/join_ut.cpp b/util/string/join_ut.cpp new file mode 100644 index 0000000000..3ed2b2459c --- /dev/null +++ b/util/string/join_ut.cpp @@ -0,0 +1,163 @@ +#include "join.h" + +#include <library/cpp/testing/unittest/registar.h> +#include <util/generic/vector.h> + +#include <util/stream/output.h> + +struct TCustomData { + TVector<int> Ints; +}; + +TString ToString(const TCustomData& d) { + return JoinSeq("__", d.Ints); +} + +Y_UNIT_TEST_SUITE(JoinStringTest) { + Y_UNIT_TEST(ScalarItems) { + UNIT_ASSERT_EQUAL(Join(',', 10, 11.1, "foobar"), "10,11.1,foobar"); + UNIT_ASSERT_EQUAL(Join(", ", 10, 11.1, "foobar"), "10, 11.1, foobar"); + UNIT_ASSERT_EQUAL(Join(", ", 10, 11.1, TString("foobar")), "10, 11.1, foobar"); + + UNIT_ASSERT_EQUAL(Join('#', 0, "a", "foobar", -1.4, TStringBuf("aaa")), "0#a#foobar#-1.4#aaa"); + UNIT_ASSERT_EQUAL(Join("", "", ""), ""); + UNIT_ASSERT_EQUAL(Join("", "a", "b", "c"), "abc"); + UNIT_ASSERT_EQUAL(Join("", "a", "b", "", "c"), "abc"); + UNIT_ASSERT_EQUAL(Join(" ", "a", "b", "", "c"), "a b c"); + } + + Y_UNIT_TEST(IntContainerItems) { + int v[] = {1, 2, 3}; + TVector<int> vv(v, v + 3); + UNIT_ASSERT_EQUAL(JoinSeq(" ", vv), "1 2 3"); + UNIT_ASSERT_EQUAL(JoinSeq(" ", vv), JoinRange(" ", vv.begin(), vv.end())); + UNIT_ASSERT_EQUAL(JoinRange(" ", v, v + 2), "1 2"); + UNIT_ASSERT_EQUAL(JoinSeq(" ", {}), ""); + UNIT_ASSERT_EQUAL(JoinSeq(" ", {42}), "42"); + UNIT_ASSERT_EQUAL(JoinSeq(" ", {1, 2, 3}), "1 2 3"); + UNIT_ASSERT_VALUES_EQUAL(JoinSeq(" ", v), "1 2 3"); + } + + 
Y_UNIT_TEST(StrContainerItems) { + // try various overloads and template type arguments + static const char* const result = "1 22 333"; + static const char* const v[] = {"1", "22", "333"}; + TVector<const char*> vchar(v, v + sizeof(v) / sizeof(v[0])); + TVector<TStringBuf> vbuf(v, v + sizeof(v) / sizeof(v[0])); + TVector<TString> vstring(v, v + sizeof(v) / sizeof(v[0])); + + // ranges + UNIT_ASSERT_EQUAL(JoinRange(" ", v, v + 3), result); + UNIT_ASSERT_EQUAL(JoinRange(" ", vchar.begin(), vchar.end()), result); + UNIT_ASSERT_EQUAL(JoinRange(" ", vbuf.begin(), vbuf.end()), result); + UNIT_ASSERT_EQUAL(JoinRange(" ", vstring.begin(), vstring.end()), result); + { + TStringStream stream; + stream << MakeRangeJoiner(" ", v, v + 3); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", vchar.begin(), vchar.end()); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", vbuf.begin(), vbuf.end()); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", vstring.begin(), vstring.end()); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + + // vectors + UNIT_ASSERT_EQUAL(JoinSeq(" ", vchar), result); + UNIT_ASSERT_EQUAL(JoinSeq(" ", vbuf), result); + UNIT_ASSERT_EQUAL(JoinSeq(" ", vstring), result); + { + TStringStream stream; + stream << MakeRangeJoiner(" ", vchar); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", vbuf); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", vstring); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + + // initializer lists with type deduction + UNIT_ASSERT_EQUAL(JoinSeq(" ", {v[0], v[1], v[2]}), result); + UNIT_ASSERT_EQUAL(JoinSeq(" ", {vchar[0], vchar[1], vchar[2]}), result); + UNIT_ASSERT_EQUAL(JoinSeq(" ", {vbuf[0], vbuf[1], vbuf[2]}), result); + UNIT_ASSERT_EQUAL(JoinSeq(" ", 
{vstring[0], vstring[1], vstring[2]}), result); + { + TStringStream stream; + stream << MakeRangeJoiner(" ", {v[0], v[1], v[2]}); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", {vchar[0], vchar[1], vchar[2]}); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", {vbuf[0], vbuf[1], vbuf[2]}); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", {vstring[0], vstring[1], vstring[2]}); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + + // initializer lists with explicit types + UNIT_ASSERT_EQUAL(JoinSeq(" ", std::initializer_list<const char*>{v[0], v[1], v[2]}), result); + UNIT_ASSERT_EQUAL(JoinSeq(" ", std::initializer_list<const char*>{vchar[0], vchar[1], vchar[2]}), result); + UNIT_ASSERT_EQUAL(JoinSeq(" ", std::initializer_list<TStringBuf>{vbuf[0], vbuf[1], vbuf[2]}), result); + UNIT_ASSERT_EQUAL(JoinSeq(" ", std::initializer_list<TString>{vstring[0], vstring[1], vstring[2]}), result); + { + TStringStream stream; + stream << MakeRangeJoiner(" ", std::initializer_list<const char*>{v[0], v[1], v[2]}); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", std::initializer_list<const char*>{vchar[0], vchar[1], vchar[2]}); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", std::initializer_list<TStringBuf>{vbuf[0], vbuf[1], vbuf[2]}); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + { + TStringStream stream; + stream << MakeRangeJoiner(" ", std::initializer_list<TString>{vstring[0], vstring[1], vstring[2]}); + UNIT_ASSERT_EQUAL(stream.Str(), result); + } + + // c-style array + UNIT_ASSERT_VALUES_EQUAL(JoinSeq(" ", v), result); + } + + Y_UNIT_TEST(CustomToString) { + TCustomData d1{{1, 2, 3, 4, 5}}; + TCustomData d2{{0, -1, -2}}; + UNIT_ASSERT_EQUAL(Join(" ", d1, d2), "1__2__3__4__5 
0__-1__-2"); + } + + Y_UNIT_TEST(JoinChars) { + // Note that char delimeter is printed as single char string, + // but joined char values are printed as their numeric codes! O_o + UNIT_ASSERT_EQUAL(Join('a', 'a', 'a'), "97a97"); + UNIT_ASSERT_EQUAL(Join("a", "a", "a"), "aaa"); + } +} diff --git a/util/string/printf.cpp b/util/string/printf.cpp new file mode 100644 index 0000000000..5b7c34d4e1 --- /dev/null +++ b/util/string/printf.cpp @@ -0,0 +1,38 @@ +#include "printf.h" + +#include <util/stream/printf.h> +#include <util/stream/str.h> + +int vsprintf(TString& s, const char* c, va_list params) { + TStringOutput so(s.remove()); + + return Printf(so, c, params); +} + +int sprintf(TString& s, const char* c, ...) { + va_list params; + va_start(params, c); + const int k = vsprintf(s, c, params); + va_end(params); + return k; +} + +TString Sprintf(const char* c, ...) { + TString s; + va_list params; + va_start(params, c); + vsprintf(s, c, params); + va_end(params); + return s; +} + +int fcat(TString& s, const char* c, ...) { + TStringOutput so(s); + + va_list params; + va_start(params, c); + const size_t ret = Printf(so, c, params); + va_end(params); + + return ret; +} diff --git a/util/string/printf.h b/util/string/printf.h new file mode 100644 index 0000000000..925c6edaff --- /dev/null +++ b/util/string/printf.h @@ -0,0 +1,13 @@ +#pragma once + +#include <util/generic/fwd.h> +#include <util/system/compiler.h> + +#include <cstdarg> + +/// formatted print. return printed length: +int Y_PRINTF_FORMAT(2, 0) vsprintf(TString& s, const char* c, va_list params); +/// formatted print. 
return printed length: +int Y_PRINTF_FORMAT(2, 3) sprintf(TString& s, const char* c, ...); +TString Y_PRINTF_FORMAT(1, 2) Sprintf(const char* c, ...); +int Y_PRINTF_FORMAT(2, 3) fcat(TString& s, const char* c, ...); diff --git a/util/string/printf_ut.cpp b/util/string/printf_ut.cpp new file mode 100644 index 0000000000..2b2f980b70 --- /dev/null +++ b/util/string/printf_ut.cpp @@ -0,0 +1,30 @@ +#include "printf.h" + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(TStringPrintf) { + Y_UNIT_TEST(TestSprintf) { + TString s; + int len = sprintf(s, "Hello %s", "world"); + UNIT_ASSERT_EQUAL(s, TString("Hello world")); + UNIT_ASSERT_EQUAL(len, 11); + } + + Y_UNIT_TEST(TestFcat) { + TString s; + int len = sprintf(s, "Hello %s", "world"); + UNIT_ASSERT_EQUAL(s, TString("Hello world")); + UNIT_ASSERT_EQUAL(len, 11); + len = fcat(s, " qwqw%s", "as"); + UNIT_ASSERT_EQUAL(s, TString("Hello world qwqwas")); + UNIT_ASSERT_EQUAL(len, 7); + } + + Y_UNIT_TEST(TestSpecial) { + UNIT_ASSERT_EQUAL("4294967295", Sprintf("%" PRIu32, (ui32)(-1))); + } + + Y_UNIT_TEST(TestExplicitPositions) { + UNIT_ASSERT_EQUAL("abc xyz abc", Sprintf("%1$s %2$s %1$s", "abc", "xyz")); + } +} diff --git a/util/string/reverse.cpp b/util/string/reverse.cpp new file mode 100644 index 0000000000..167cd11f49 --- /dev/null +++ b/util/string/reverse.cpp @@ -0,0 +1,33 @@ +#include "reverse.h" + +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/charset/wide_specific.h> + +#include <algorithm> + +void ReverseInPlace(TString& string) { + auto* begin = string.begin(); + std::reverse(begin, begin + string.size()); +} + +void ReverseInPlace(TUtf16String& string) { + auto* begin = string.begin(); + const auto len = string.size(); + auto* end = begin + string.size(); + + TVector<wchar16> buffer(len); + wchar16* rbegin = buffer.data() + len; + for (wchar16* p = begin; p < end;) { + const size_t symbolSize = W16SymbolSize(p, end); + rbegin -= symbolSize; + std::copy(p, 
p + symbolSize, rbegin); + p += symbolSize; + } + std::copy(buffer.begin(), buffer.end(), begin); +} + +void ReverseInPlace(TUtf32String& string) { + auto* begin = string.begin(); + std::reverse(begin, begin + string.size()); +} diff --git a/util/string/reverse.h b/util/string/reverse.h new file mode 100644 index 0000000000..80f8b00887 --- /dev/null +++ b/util/string/reverse.h @@ -0,0 +1,16 @@ +#pragma once + +#include <util/generic/fwd.h> + +void ReverseInPlace(TString& string); + +/** NB. UTF-16 is variable-length encoding because of the surrogate pairs. + * This function takes this into account and treats a surrogate pair as a single symbol. + * Ex. if [C D] is a surrogate pair, + * A B [C D] E + * will become + * E [C D] B A + */ +void ReverseInPlace(TUtf16String& string); + +void ReverseInPlace(TUtf32String& string); diff --git a/util/string/split.cpp b/util/string/split.cpp new file mode 100644 index 0000000000..7d26857cc7 --- /dev/null +++ b/util/string/split.cpp @@ -0,0 +1,24 @@ +#include "split.h" + +template <class TValue> +inline size_t Split(const char* ptr, const char* delim, TVector<TValue>& values) { + values.erase(values.begin(), values.end()); + while (ptr && *ptr) { + ptr += strspn(ptr, delim); + if (ptr && *ptr) { + size_t epos = strcspn(ptr, delim); + assert(epos); + values.push_back(TValue(ptr, epos)); + ptr += epos; + } + } + return values.size(); +} + +size_t Split(const char* ptr, const char* delim, TVector<TString>& values) { + return Split<TString>(ptr, delim, values); +} + +size_t Split(const TString& in, const TString& delim, TVector<TString>& res) { + return Split(in.data(), delim.data(), res); +} diff --git a/util/string/split.h b/util/string/split.h new file mode 100644 index 0000000000..80f8c787dc --- /dev/null +++ b/util/string/split.h @@ -0,0 +1,1085 @@ +#pragma once + +#include "strspn.h" +#include "cast.h" + +#include <util/generic/algorithm.h> +#include <util/generic/fwd.h> +#include <util/generic/iterator.h> +#include 
/**
 * Searches the null-terminated string `str` for the first occurrence of the
 * `l`-character sequence starting at `f`.
 *
 * @param str  null-terminated haystack.
 * @param f    start of the needle; `*f` is always read, so it must point at a valid character.
 * @param l    number of needle characters to match.
 * @return pointer to the first match inside `str`, or a pointer to the terminating
 *         null of `str` when there is no match. With `l == 0` the result is the first
 *         position holding `*f` (or the terminator if `*f` does not occur).
 */
template <class I>
static inline I* FastStrStr(I* str, I* f, size_t l) noexcept {
    // std::char_traits is only specified for cv-unqualified character types, and this
    // function is instantiated with `const Char`. Build the views over the unqualified
    // type and return pointers derived from `str`, so const and non-const I both work.
    using TView = std::basic_string_view<std::remove_const_t<I>>;

    const TView haystack(str);

    // Cheap single-character probe: nothing to do if even the first needle character is absent.
    const auto first = haystack.find(*f);
    if (first == TView::npos) {
        return str + haystack.size();
    }

    // Full comparison starts at the first candidate position.
    const auto match = haystack.find(TView(f, l), first);
    return str + (match != TView::npos ? match : haystack.size());
}
+ { + } + + inline Iterator Find(Iterator& b, Iterator e) const noexcept { + if ((b = std::find_if(b, e, Fn)) != e) { + return b++; + } + + return b; + } + +private: + Condition Fn; +}; + +template <class Char> +struct TFindFirstOf { + inline TFindFirstOf(Char* set) + : Set(set) + { + } + + inline Char* FindFirstOf(Char* b, Char* e) const noexcept { + Char* ret = b; + for (; ret != e; ++ret) { + if (NStringSplitPrivate::Find(Set, *ret)) + break; + } + return ret; + } + + inline Char* FindFirstOf(Char* b) const noexcept { + const std::basic_string_view<Char> bView(b); + const auto ret = bView.find_first_of(Set); + return ret != std::string::npos ? b + ret : b + bView.size(); + } + + Char* Set; +}; + +template <> +struct TFindFirstOf<const char>: public TCompactStrSpn { + inline TFindFirstOf(const char* set, const char* e) + : TCompactStrSpn(set, e) + { + } + + inline TFindFirstOf(const char* set) + : TCompactStrSpn(set) + { + } +}; + +template <class Char> +struct TSetDelimiter: private TFindFirstOf<const Char> { + using TFindFirstOf<const Char>::TFindFirstOf; + + inline Char* Find(Char*& b, Char* e) const noexcept { + Char* ret = const_cast<Char*>(this->FindFirstOf(b, e)); + + if (ret != e) { + b = ret + 1; + return ret; + } + + return (b = e); + } + + inline Char* Find(Char*& b) const noexcept { + Char* ret = const_cast<Char*>(this->FindFirstOf(b)); + + if (*ret) { + b = ret + 1; + return ret; + } + + return (b = ret); + } +}; + +namespace NSplitTargetHasPushBack { + Y_HAS_MEMBER(push_back, PushBack); +} + +template <class T, class = void> +struct TConsumerBackInserter; + +template <class T> +struct TConsumerBackInserter<T, std::enable_if_t<NSplitTargetHasPushBack::TClassHasPushBack<T>::value>> { + static void DoInsert(T* C, const typename T::value_type& i) { + C->push_back(i); + } +}; + +template <class T> +struct TConsumerBackInserter<T, std::enable_if_t<!NSplitTargetHasPushBack::TClassHasPushBack<T>::value>> { + static void DoInsert(T* C, const typename 
/**
 * Consumer adaptor that reports delimiters as tokens of their own.
 * For every piece it receives, the wrapped consumer first gets the token [b, d)
 * and then, if the delimiter is non-empty, the delimiter [d, e).
 */
template <class S>
struct TKeepDelimiters {
    inline TKeepDelimiters(S* slave) noexcept
        : Slave(slave)
    {
    }

    /// @return false as soon as the wrapped consumer asks to stop, true otherwise.
    template <class I>
    inline bool Consume(I* b, I* d, I* e) {
        // Token part; a refusal here means the delimiter is not forwarded at all.
        if (!Slave->Consume(b, d, d)) {
            return false;
        }

        // Delimiter part is forwarded only when it is non-empty (d == e at the end of input).
        return d == e || Slave->Consume(d, e, e);
    }

    S* Slave;
};
pusher = {res}; + + SplitString(buf, delim, pusher); +} + +/// Split string into res vector. Res vector is cleared before split. +/// Old good slow split function. +/// Field delimter is any number of symbols specified in delim (no empty strings in res vector) +/// @return number of elements created +size_t Split(const char* in, const char* delim, TVector<TString>& res); +size_t Split(const TString& in, const TString& delim, TVector<TString>& res); + +/// Old split reimplemented for TStringBuf using the new code +/// Note that delim can be constructed from char* automatically (it is not cheap though) +inline size_t Split(const TStringBuf s, const TSetDelimiter<const char>& delim, TVector<TStringBuf>& res) { + res.clear(); + TContainerConsumer<TVector<TStringBuf>> res1(&res); + TSkipEmptyTokens<TContainerConsumer<TVector<TStringBuf>>> consumer(&res1); + SplitString(s.data(), s.data() + s.size(), delim, consumer); + return res.size(); +} + +template <class P, class D> +void GetNext(TStringBuf& s, D delim, P& param) { + TStringBuf next = s.NextTok(delim); + Y_ENSURE(next.IsInited(), TStringBuf("Split: number of fields less than number of Split output arguments")); + param = FromString<P>(next); +} + +template <class P, class D> +void GetNext(TStringBuf& s, D delim, TMaybe<P>& param) { + TStringBuf next = s.NextTok(delim); + if (next.IsInited()) { + param = FromString<P>(next); + } else { + param.Clear(); + } +} + +// example: +// Split(TStringBuf("Sherlock,2014,36.6"), ',', name, year, temperature); +template <class D, class P1, class P2> +void Split(TStringBuf s, D delim, P1& p1, P2& p2) { + GetNext(s, delim, p1); + GetNext(s, delim, p2); + Y_ENSURE(!s.IsInited(), TStringBuf("Split: number of fields more than number of Split output arguments")); +} + +template <class D, class P1, class P2, class... Other> +void Split(TStringBuf s, D delim, P1& p1, P2& p2, Other&... 
other) { + GetNext(s, delim, p1); + Split(s, delim, p2, other...); +} + +/** + * \fn auto StringSplitter(...) + * + * Creates a string splitter object. The only use for it is to call one of its + * `Split*` methods, and then do something with the resulting proxy range. + * + * Some examples: + * \code + * TVector<TStringBuf> values = StringSplitter("1\t2\t3").Split('\t'); + * + * for(TStringBuf part: StringSplitter("1::2::::3").SplitByString("::").SkipEmpty()) { + * Cerr << part; + * } + * + * TVector<TString> firstTwoValues = StringSplitter("1\t2\t3").Split('\t').Take(2); + * \endcode + * + * Use `Collect` or `AddTo` to store split results into an existing container: + * \code + * TVector<TStringBuf> values = {"0"}; + * StringSplitter("1\t2\t3").Split('\t').AddTo(&values); + * \endcode + * Note that `Collect` clears target container, while `AddTo` just inserts values. + * You can use these methods with any container that has `emplace` / `emplace_back`. + * + * Use `ParseInto` to also perform string conversions before inserting values + * into target container: + * \code + * TSet<int> values; + * StringSplitter("1\t2\t3").Split('\t').ParseInto(&values); + * \endcode + */ + +namespace NStringSplitPrivate { + Y_HAS_MEMBER(push_back, PushBack); + Y_HAS_MEMBER(insert, Insert); + Y_HAS_MEMBER(data, Data); + + /** + * This one is needed here so that `std::string_view -> std::string_view` + * conversion works. 
/**
 * Consumer that stores every received token in a container as
 * `Container::value_type`. Containers offering `emplace_back` get the value
 * appended; containers offering a no-position `emplace` get it inserted that
 * way. `emplace(iterator)` is deliberately not used, for efficiency.
 */
template <class Container>
struct TContainerConsumer {
    using value_type = typename Container::value_type;

    TContainerConsumer(Container* c)
        : C_(c)
    {
    }

    // TODO: return bool (continue)
    template <class StringBuf>
    void operator()(StringBuf e) const {
        Store(C_, value_type(e));
    }

private:
    // Selected when the container has emplace_back (sequence containers).
    template <class OtherContainer>
    static auto Store(OtherContainer* c, value_type&& item) -> decltype(void(c->emplace_back())) {
        c->emplace_back(std::move(item));
    }

    // Selected when the container has an argument-less emplace (set-like containers).
    template <class OtherContainer>
    static auto Store(OtherContainer* c, value_type&& item) -> decltype(void(c->emplace())) {
        c->emplace(std::move(item));
    }

    Container* C_;
};
+ */ + template <class Container> + struct TContainerConvertingConsumer { + using value_type = typename Container::value_type; + + TContainerConvertingConsumer(Container* c) + : C_(c) + { + } + + template <class StringBuf> + void operator()(StringBuf e) const { + this->operator()(C_, e); + } + + private: + template <class OtherContainer, class StringBuf> + auto operator()(OtherContainer* c, StringBuf e) const -> decltype(c->emplace_back()) { + value_type v; + DoFromString(e, &v); + return c->emplace_back(std::move(v)); + } + + template <class OtherContainer, class StringBuf> + auto operator()(OtherContainer* c, StringBuf e) const -> decltype(c->emplace()) { + value_type v; + DoFromString(e, &v); + return c->emplace(std::move(v)); + } + + Container* C_; + }; + + template <class String> + struct TStringBufOfImpl { + using type = std::conditional_t< + THasData<String>::value, + TBasicStringBuf<typename String::value_type>, + TIteratorRange<typename String::const_iterator>>; + }; + + template <class Char, class Traits, class Allocator> + struct TStringBufOfImpl<std::basic_string<Char, Traits, Allocator>> { + using type = std::basic_string_view<Char, Traits>; + }; + + template <class Char, class Traits> + struct TStringBufOfImpl<std::basic_string_view<Char, Traits>> { + using type = std::basic_string_view<Char, Traits>; + }; + + /** + * Metafunction that returns a string buffer for the given type. This is to + * make sure that splitting `std::string` returns `std::string_view`. 
/// Generic case: the buffer type is built directly from the iterator pair.
/// The pointer argument is only a tag used to select the overload; it is never dereferenced.
template <class StringBuf, class Iterator>
StringBuf DoMakeStringBuf(Iterator b, Iterator e, StringBuf* /*tag*/) {
    return StringBuf(b, e);
}

/// std::basic_string_view case: built from a start position and a length.
template <class Char, class Traits, class Iterator>
std::basic_string_view<Char, Traits> DoMakeStringBuf(Iterator b, Iterator e, std::basic_string_view<Char, Traits>* /*tag*/) {
    using TView = std::basic_string_view<Char, Traits>;
    const auto length = e - b;
    return TView(b, length);
}

/// Builds a `StringBuf` covering [b, e), dispatching on the requested buffer type.
template <class StringBuf, class Iterator>
StringBuf MakeStringBuf(Iterator b, Iterator e) {
    StringBuf* const tag = nullptr;
    return DoMakeStringBuf(b, e, tag);
}
return MakeStringBuf<TStringBufType>(TokenDelim(), TokenEnd()); + } + + TIterator B; + TIterator E; + + TIterator TokS; + TIterator TokD; + + private: + void Init(const String& string, std::true_type) { + B = string.data(); + E = string.data() + string.size(); + } + + void Init(const String& string, std::false_type) { + B = string.begin(); + E = string.end(); + } + }; + + template <class Base> + class TSplitRange: public Base, public TInputRangeAdaptor<TSplitRange<Base>> { + using TStringBufType = decltype(std::declval<Base>().Next()->Token()); + + public: + template <typename... Args> + inline TSplitRange(Args&&... args) + : Base(std::forward<Args>(args)...) + { + } + + template <class Consumer, std::enable_if_t<std::is_same<decltype(std::declval<Consumer>()(std::declval<TStringBufType>())), void>::value, int>* = nullptr> + inline void Consume(Consumer&& f) { + for (auto&& it : *this) { + f(it.Token()); + } + } + + template <class Consumer, std::enable_if_t<std::is_same<decltype(std::declval<Consumer>()(std::declval<TStringBufType>())), bool>::value, int>* = nullptr> + inline bool Consume(Consumer&& f) { + for (auto&& it : *this) { + if (!f(it.Token())) { + return false; + } + } + return true; + } + + template <class Container, class = std::enable_if_t<THasInsert<Container>::value || THasPushBack<Container>::value>> + operator Container() { + Container result; + AddTo(&result); + return result; + } + + template <class S> + inline TVector<S> ToList() { + TVector<S> result; + for (auto&& it : *this) { + result.push_back(S(it.Token())); + } + return result; + } + + template <class Container> + inline void Collect(Container* c) { + Y_ASSERT(c); + c->clear(); + AddTo(c); + } + + template <class Container> + inline void AddTo(Container* c) { + Y_ASSERT(c); + TContainerConsumer<Container> consumer(c); + Consume(consumer); + } + + template <class Container> + inline void ParseInto(Container* c) { + Y_ASSERT(c); + TContainerConvertingConsumer<Container> consumer(c); + 
Consume(consumer); + } + + // TODO: this is actually TryParseInto + /** + * Same as `CollectInto`, just doesn't throw. + * + * \param[out] args Output arguments. + * \returns Whether parsing was successful. + */ + template <typename... Args> + inline bool TryCollectInto(Args*... args) noexcept { + size_t successfullyFilled = 0; + auto it = this->begin(); + + //FIXME: actually, some kind of TryApplyToMany is needed in order to stop iteration upon first failure + ApplyToMany([&](auto&& arg) { + if (it != this->end()) { + if (TryDoFromString(it->Token(), arg)) { + ++successfullyFilled; + } + ++it; + } + }, args...); + + return successfullyFilled == sizeof...(args) && it == this->end(); + } + + // TODO: this is actually ParseInto + /** + * Splits and parses everything that's in this splitter into `args`. + * + * Example usage: + * \code + * int l, r; + * StringSplitter("100*200").Split('*').CollectInto(&l, &r); + * \endcode + * + * \param[out] args Output arguments. + * \throws If not all items were parsed, or + * if there were too many items in the split. + */ + template <typename... Args> + inline void CollectInto(Args*... args) { + Y_ENSURE(TryCollectInto<Args...>(args...)); + } + + inline size_t Count() const { + size_t cnt = 0; + for (auto&& it : *this) { + Y_UNUSED(it); + ++cnt; + } + return cnt; + } + }; + + template <class String> + class TStringSplitter { + using TStringType = String; + using TStringBufType = TStringBufOf<TStringType>; + using TChar = typename TStringType::value_type; + using TIterator = TIteratorOf<TStringType>; + using TIteratorState = TIterState<TStringType>; + + /** + * Base class for all split ranges that actually does the splitting. + */ + template <class DelimStorage> + struct TSplitRangeBase { + template <class OtherString, class... Args> + inline TSplitRangeBase(OtherString&& s, Args&&... args) + : String_(std::forward<OtherString>(s)) + , State_(String_) + , Delim_(std::forward<Args>(args)...) 
+ { + } + + inline TIteratorState* Next() { + if (State_.TokD == State_.B) { + return nullptr; + } + + State_.TokS = State_.B; + State_.TokD = Delim_.Ptr()->Find(State_.B, State_.E); + + return &State_; + } + + private: + TStringType String_; + TIteratorState State_; + DelimStorage Delim_; + }; + + template <class Base, class Filter> + struct TFilterRange: public Base { + template <class... Args> + inline TFilterRange(const Base& base, Args&&... args) + : Base(base) + , Filter_(std::forward<Args>(args)...) + { + } + + inline TIteratorState* Next() { + TIteratorState* ret; + + do { + ret = Base::Next(); + } while (ret && !Filter_.Accept(ret)); + + return ret; + } + + Filter Filter_; + }; + + struct TNonEmptyFilter { + template <class TToken> + inline bool Accept(const TToken* token) noexcept { + return !token->Empty(); + } + }; + + template <class TIter> + struct TStopIteration; + + template <class Base> + struct TFilters: public Base { + template <class TFilter> + using TIt = TSplitRange<TStopIteration<TFilters<TFilterRange<Base, TFilter>>>>; + + template <typename... Args> + inline TFilters(Args&&... args) + : Base(std::forward<Args>(args)...) + { + } + + inline TIt<TNonEmptyFilter> SkipEmpty() const { + return {*this}; + } + }; + + template <class Base, class Stopper> + struct TStopRange: public Base { + template <typename... Args> + inline TStopRange(const Base& base, Args&&... args) + : Base(base) + , Stopper_(std::forward<Args>(args)...) 
+ { + } + + inline TIteratorState* Next() { + TIteratorState* ret = Base::Next(); + if (!ret || Stopper_.Stop(ret)) { + return nullptr; + } + return ret; + } + + Stopper Stopper_; + }; + + struct TTake { + TTake() = default; + + TTake(size_t count) + : Count(count) + { + } + + template <class TToken> + inline bool Stop(TToken*) noexcept { + if (Count > 0) { + --Count; + return false; + } else { + return true; + } + } + + size_t Count = 0; + }; + + struct TLimit { + TLimit() = default; + + TLimit(size_t count) + : Count(count) + { + Y_ASSERT(Count > 0); + } + + template <class TToken> + inline bool Stop(TToken* token) noexcept { + if (Count > 1) { + --Count; + return false; + } else if (Count == 1) { + token->TokD = token->B = token->E; + return false; + } + return true; + } + + size_t Count = 0; + }; + + template <class Base> + struct TStopIteration: public Base { + template <class TStopper> + using TIt = TSplitRange<TStopIteration<TFilters<TStopRange<Base, TStopper>>>>; + + template <typename... Args> + inline TStopIteration(Args&&... args) + : Base(std::forward<Args>(args)...) 
+ { + } + + inline TIt<TTake> Take(size_t count) { + return {*this, count}; + } + + inline TIt<TLimit> Limit(size_t count) { + return {*this, count}; + } + }; + + template <class TPolicy> + using TIt = TSplitRange<TStopIteration<TFilters<TSplitRangeBase<TPolicy>>>>; + + public: + template <class OtherString> + explicit TStringSplitter(OtherString&& s) + : String_(std::forward<OtherString>(s)) + { + } + + //does not own TDelim + template <class TDelim> + inline TIt<TPtrPolicy<const TDelim>> Split(const TDelim& d) const noexcept { + return {String_, &d}; + } + + inline TIt<TEmbedPolicy<TCharDelimiter<const TChar>>> Split(TChar ch) const noexcept { + return {String_, ch}; + } + + inline TIt<TSimpleRefPolicy<TSetDelimiter<const TChar>>> SplitBySet(const TChar* set) const noexcept { + return {String_, set}; + } + + inline TIt<TEmbedPolicy<TStringDelimiter<const TChar>>> SplitByString(const TStringBufType& str) const noexcept { + return {String_, str.data(), str.size()}; + } + + template <class TFunc> + inline TIt<TEmbedPolicy<TFuncDelimiter<TIterator, TFunc>>> SplitByFunc(TFunc f) const noexcept { + return {String_, f}; + } + + private: + TStringType String_; + }; + + template <class String> + auto MakeStringSplitter(String&& s) { + return TStringSplitter<std::remove_reference_t<String>>(std::forward<String>(s)); + } +} + +template <class Iterator> +auto StringSplitter(Iterator begin, Iterator end) { + return ::NStringSplitPrivate::MakeStringSplitter(TIteratorRange<Iterator>(begin, end)); +} + +template <class Char> +auto StringSplitter(const Char* begin, const Char* end) { + return ::NStringSplitPrivate::MakeStringSplitter(TBasicStringBuf<Char>(begin, end)); +} + +template <class Char> +auto StringSplitter(const Char* begin, size_t len) { + return ::NStringSplitPrivate::MakeStringSplitter(TBasicStringBuf<Char>(begin, len)); +} + +template <class Char> +auto StringSplitter(const Char* str) { + return ::NStringSplitPrivate::MakeStringSplitter(TBasicStringBuf<Char>(str)); 
+} + +template <class String, std::enable_if_t<!std::is_pointer<std::remove_reference_t<String>>::value, int> = 0> +auto StringSplitter(String& s) { + return ::NStringSplitPrivate::MakeStringSplitter(::NStringSplitPrivate::TStringBufOf<String>(s.data(), s.size())); +} + +template <class String, std::enable_if_t<!std::is_pointer<std::remove_reference_t<String>>::value, int> = 0> +auto StringSplitter(String&& s) { + return ::NStringSplitPrivate::MakeStringSplitter(std::move(s)); +} diff --git a/util/string/split_ut.cpp b/util/string/split_ut.cpp new file mode 100644 index 0000000000..43e59f2d75 --- /dev/null +++ b/util/string/split_ut.cpp @@ -0,0 +1,831 @@ +#include "split.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/stream/output.h> +#include <util/charset/wide.h> +#include <util/datetime/cputimer.h> +#include <util/generic/maybe.h> + +#include <string> +#include <string_view> + +template <typename T> +static inline void OldSplit(char* pszBuf, T* pRes) { + pRes->resize(0); + pRes->push_back(pszBuf); + for (char* pszData = pszBuf; *pszData; ++pszData) { + if (*pszData == '\t') { + *pszData = 0; + pRes->push_back(pszData + 1); + } + } +} + +template <class T1, class T2> +inline void Cmp(const T1& t1, const T2& t2) { + try { + UNIT_ASSERT_EQUAL(t1.size(), t2.size()); + } catch (...) { + Print(t1); + Cerr << "---------------" << Endl; + Print(t2); + + throw; + } + + auto i = t1.begin(); + auto j = t2.begin(); + + for (; i != t1.end() && j != t2.end(); ++i, ++j) { + try { + UNIT_ASSERT_EQUAL(*i, *j); + } catch (...) 
{ + Cerr << "(" << *i << ")->(" << *j << ")" << Endl; + + throw; + } + } +} + +template <class T> +inline void Print(const T& t) { + for (typename T::const_iterator i = t.begin(); i != t.end(); ++i) { + Cerr << *i << Endl; + } +} + +template <template <typename> class TConsumer, typename TResult, typename I, typename TDelimiter> +void TestDelimiterOnString(TResult& good, I* str, const TDelimiter& delim) { + TResult test; + TConsumer<TResult> consumer(&test); + SplitString(str, delim, consumer); + Cmp(good, test); + UNIT_ASSERT_EQUAL(good, test); +} + +template <template <typename> class TConsumer, typename TResult, typename I, typename TDelimiter> +void TestDelimiterOnRange(TResult& good, I* b, I* e, const TDelimiter& delim) { + TResult test; + TConsumer<TResult> consumer(&test); + SplitString(b, e, delim, consumer); + Cmp(good, test); + UNIT_ASSERT_EQUAL(good, test); +} + +template <typename TConsumer, typename TResult, typename I> +void TestConsumerOnString(TResult& good, I* str, I* d) { + TResult test; + TContainerConsumer<TResult> consumer(&test); + TConsumer tested(&consumer); + TCharDelimiter<const I> delim(*d); + SplitString(str, delim, tested); + Cmp(good, test); + UNIT_ASSERT_EQUAL(good, test); +} + +template <typename TConsumer, typename TResult, typename I> +void TestConsumerOnRange(TResult& good, I* b, I* e, I* d) { + TResult test; + TContainerConsumer<TResult> consumer(&test); + TConsumer tested(&consumer); + TCharDelimiter<const I> delim(*d); + SplitString(b, e, delim, tested); + Cmp(good, test); + UNIT_ASSERT_EQUAL(good, test); +} + +using TStrokaConsumer = TContainerConsumer<TVector<TString>>; + +void TestLimitingConsumerOnString(TVector<TString>& good, const char* str, const char* d, size_t n, const char* last) { + TVector<TString> test; + TStrokaConsumer consumer(&test); + TLimitingConsumer<TStrokaConsumer, const char> limits(n, &consumer); + TCharDelimiter<const char> delim(*d); + SplitString(str, delim, limits); + Cmp(good, test); + 
UNIT_ASSERT_EQUAL(good, test); + UNIT_ASSERT_EQUAL(TString(limits.Last), TString(last)); // Quite unobvious behaviour. Why the last token is not added to slave consumer? +} + +void TestLimitingConsumerOnRange(TVector<TString>& good, const char* b, const char* e, const char* d, size_t n, const char* last) { + TVector<TString> test; + TStrokaConsumer consumer(&test); + TLimitingConsumer<TStrokaConsumer, const char> limits(n, &consumer); + TCharDelimiter<const char> delim(*d); + SplitString(b, e, delim, limits); + Cmp(good, test); + UNIT_ASSERT_EQUAL(good, test); + UNIT_ASSERT_EQUAL(TString(limits.Last), TString(last)); +} + +Y_UNIT_TEST_SUITE(SplitStringTest) { + Y_UNIT_TEST(TestCharSingleDelimiter) { + TString data("qw ab qwabcab"); + TString canonic[] = {"qw", "ab", "", "qwabcab"}; + TVector<TString> good(canonic, canonic + 4); + TCharDelimiter<const char> delim(' '); + + TestDelimiterOnString<TContainerConsumer>(good, data.data(), delim); + TestDelimiterOnRange<TContainerConsumer>(good, data.data(), data.end(), delim); + } + + Y_UNIT_TEST(TestWideSingleDelimiter) { + TUtf16String data(u"qw ab qwabcab"); + TUtf16String canonic[] = {u"qw", u"ab", TUtf16String(), u"qwabcab"}; + TVector<TUtf16String> good(canonic, canonic + 4); + TCharDelimiter<const wchar16> delim(' '); + + TestDelimiterOnString<TContainerConsumer>(good, data.data(), delim); + TestDelimiterOnRange<TContainerConsumer>(good, data.data(), data.end(), delim); + } + + Y_UNIT_TEST(TestConvertToIntCharSingleDelimiter) { + TString data("42 4242 -12345 0"); + i32 canonic[] = {42, 4242, -12345, 0}; + TVector<i32> good(canonic, canonic + 4); + TCharDelimiter<const char> delim(' '); + + TestDelimiterOnString<TContainerConvertingConsumer>(good, data.data(), delim); + TestDelimiterOnRange<TContainerConvertingConsumer>(good, data.data(), data.end(), delim); + } + + Y_UNIT_TEST(TestCharSkipEmpty) { + TString data("qw ab qwabcab "); + TString canonic[] = {"qw", "ab", "qwabcab"}; + TVector<TString> good(canonic, 
canonic + 3); + + TestConsumerOnString<TSkipEmptyTokens<TStrokaConsumer>>(good, data.data(), " "); + TestConsumerOnRange<TSkipEmptyTokens<TStrokaConsumer>>(good, data.data(), data.end(), " "); + } + + Y_UNIT_TEST(TestCharKeepDelimiters) { + TString data("qw ab qwabcab "); + TString canonic[] = {"qw", " ", "ab", " ", "", " ", "qwabcab", " ", ""}; + TVector<TString> good(canonic, canonic + 9); + + TestConsumerOnString<TKeepDelimiters<TStrokaConsumer>>(good, data.data(), " "); + TestConsumerOnRange<TKeepDelimiters<TStrokaConsumer>>(good, data.data(), data.end(), " "); + } + + Y_UNIT_TEST(TestCharLimit) { + TString data("qw ab qwabcab "); + TString canonic[] = {"qw", "ab"}; + TVector<TString> good(canonic, canonic + 2); + + TestLimitingConsumerOnString(good, data.data(), " ", 3, " qwabcab "); + TestLimitingConsumerOnRange(good, data.data(), data.end(), " ", 3, " qwabcab "); + } + + Y_UNIT_TEST(TestCharStringDelimiter) { + TString data("qw ab qwababcab"); + TString canonic[] = {"qw ", " qw", "", "c", ""}; + TVector<TString> good(canonic, canonic + 5); + TStringDelimiter<const char> delim("ab"); + + TestDelimiterOnString<TContainerConsumer>(good, data.data(), delim); + TestDelimiterOnRange<TContainerConsumer>(good, data.data(), data.end(), delim); + } + + Y_UNIT_TEST(TestWideStringDelimiter) { + TUtf16String data(u"qw ab qwababcab"); + TUtf16String canonic[] = {u"qw ", u" qw", TUtf16String(), u"c", TUtf16String()}; + TVector<TUtf16String> good(canonic, canonic + 5); + TUtf16String wideDelim(u"ab"); + TStringDelimiter<const wchar16> delim(wideDelim.data()); + + TestDelimiterOnString<TContainerConsumer>(good, data.data(), delim); + TestDelimiterOnRange<TContainerConsumer>(good, data.data(), data.end(), delim); + } + + Y_UNIT_TEST(TestCharSetDelimiter) { + TString data("qw ab qwababccab"); + TString canonic[] = {"q", " ab q", "abab", "", "ab"}; + TVector<TString> good(canonic, canonic + 5); + TSetDelimiter<const char> delim("wc"); + + 
TestDelimiterOnString<TContainerConsumer>(good, data.data(), delim); + TestDelimiterOnRange<TContainerConsumer>(good, data.data(), data.end(), delim); + } + + Y_UNIT_TEST(TestWideSetDelimiter) { + TUtf16String data(u"qw ab qwababccab"); + TUtf16String canonic[] = {u"q", u" ab q", u"abab", TUtf16String(), u"ab"}; + TVector<TUtf16String> good(canonic, canonic + 5); + TUtf16String wideDelim(u"wc"); + TSetDelimiter<const wchar16> delim(wideDelim.data()); + + TestDelimiterOnString<TContainerConsumer>(good, data.data(), delim); + } + + Y_UNIT_TEST(TestWideSetDelimiterRange) { + TUtf16String data(u"qw ab qwababccab"); + TUtf16String canonic[] = {u"q", u" ab q", u"abab", TUtf16String(), u"ab"}; + TVector<TUtf16String> good(1); + TUtf16String wideDelim(u"wc"); + TSetDelimiter<const wchar16> delim(wideDelim.data()); + + TVector<TUtf16String> test; + TContainerConsumer<TVector<TUtf16String>> consumer(&test); + SplitString(data.data(), data.data(), delim, consumer); // Empty string is still inserted into consumer + Cmp(good, test); + + good.assign(canonic, canonic + 4); + good.push_back(TUtf16String()); + test.clear(); + SplitString(data.data(), data.end() - 2, delim, consumer); + Cmp(good, test); + } + + Y_UNIT_TEST(TestSplit) { + TString data("qw ab qwababcba"); + TString canonic[] = {"qw ", " qw", "c"}; + TVector<TString> good(canonic, canonic + 3); + TString delim = "ab"; + TVector<TString> test; + Split(data, delim, test); + Cmp(good, test); + + TVector<TStringBuf> test1; + Split(data, delim.data(), test1); + Cmp(good, test1); + } + + Y_UNIT_TEST(ConvenientSplitTest) { + TString data("abc 22 33.5 xyz"); + TString str; + int num1 = 0; + double num2 = 0; + TStringBuf strBuf; + Split(data, ' ', str, num1, num2, strBuf); + UNIT_ASSERT_VALUES_EQUAL(str, "abc"); + UNIT_ASSERT_VALUES_EQUAL(num1, 22); + UNIT_ASSERT_VALUES_EQUAL(num2, 33.5); + UNIT_ASSERT_VALUES_EQUAL(strBuf, "xyz"); + } + + Y_UNIT_TEST(ConvenientSplitTestWithMaybe) { + TString data("abc 42"); + TString str; + 
TMaybe<double> num2 = 1; + TMaybe<double> maybe = 1; + + Split(data, ' ', str, num2, maybe); + + UNIT_ASSERT_VALUES_EQUAL(str, "abc"); + UNIT_ASSERT_VALUES_EQUAL(*num2, 42); + UNIT_ASSERT(!maybe); + } + + Y_UNIT_TEST(ConvenientSplitTestExceptions) { + TString data("abc 22 33"); + TString s1, s2, s3, s4; + + UNIT_ASSERT_EXCEPTION(Split(data, ' ', s1, s2), yexception); + UNIT_ASSERT_NO_EXCEPTION(Split(data, ' ', s1, s2, s3)); + UNIT_ASSERT_EXCEPTION(Split(data, ' ', s1, s2, s3, s4), yexception); + } + + Y_UNIT_TEST(ConvenientSplitTestMaybeExceptions) { + TString data("abc 22 33"); + TString s1, s2; + TMaybe<TString> m1, m2; + + UNIT_ASSERT_EXCEPTION(Split(data, ' ', s1, m1), yexception); + UNIT_ASSERT_EXCEPTION(Split(data, ' ', m1, m2), yexception); + UNIT_ASSERT_NO_EXCEPTION(Split(data, ' ', s1, s2, m1)); + + UNIT_ASSERT_NO_EXCEPTION(Split(data, ' ', s1, s2, m1, m2)); + UNIT_ASSERT_EXCEPTION(Split(data, ' ', m1, m2, s1, s2), yexception); + + UNIT_ASSERT_NO_EXCEPTION(Split(data, ' ', s1, s2, m1, m2, m1, m1, m1, m1)); + UNIT_ASSERT_EXCEPTION(Split(data, ' ', s1, s2, m1, m2, m1, m1, m1, m1, s1), yexception); + } +} + +template <typename I, typename C> +void TestStringSplitterCount(I* str, C delim, size_t good) { + size_t res = StringSplitter(str).Split(delim).Count(); + UNIT_ASSERT_VALUES_EQUAL(res, good); +} + +Y_UNIT_TEST_SUITE(StringSplitter) { + Y_UNIT_TEST(TestSplit) { + int sum = 0; + + for (const auto& it : StringSplitter("1,2,3").Split(',')) { + sum += FromString<int>(it.Token()); + } + + UNIT_ASSERT_VALUES_EQUAL(sum, 6); + } + + Y_UNIT_TEST(TestSplit1) { + int cnt = 0; + + for (const auto& it : StringSplitter(" ").Split(' ')) { + (void)it; + + ++cnt; + } + + UNIT_ASSERT_VALUES_EQUAL(cnt, 2); + } + + Y_UNIT_TEST(TestSplitLimited) { + TVector<TString> expected = {"1", "2", "3,4,5"}; + TVector<TString> actual = StringSplitter("1,2,3,4,5").Split(',').Limit(3).ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + 
Y_UNIT_TEST(TestSplitLimitedWithEmptySkip) { + TVector<TString> expected = {"1", "2", "3,4,5"}; + TVector<TString> actual = StringSplitter("1,,,2,,,,3,4,5").Split(',').SkipEmpty().Limit(3).ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + + expected = {"1", "2", ",,,3,4,5"}; + actual = StringSplitter("1,2,,,,3,4,5").Split(',').Limit(3).SkipEmpty().ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestSplitBySet) { + int sum = 0; + + for (const auto& it : StringSplitter("1,2:3").SplitBySet(",:")) { + sum += FromString<int>(it.Token()); + } + + UNIT_ASSERT_VALUES_EQUAL(sum, 6); + } + + Y_UNIT_TEST(TestSplitBySetLimited) { + TVector<TString> expected = {"1", "2", "3,4:5"}; + TVector<TString> actual = StringSplitter("1,2:3,4:5").SplitBySet(",:").Limit(3).ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestSplitBySetLimitedWithEmptySkip) { + TVector<TString> expected = {"1", "2", "3,4:5"}; + TVector<TString> actual = StringSplitter("1,:,2::::,3,4:5").SplitBySet(",:").SkipEmpty().Limit(3).ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + + expected = {"1", ",2::::,3,4:5"}; + actual = StringSplitter("1,:,2::::,3,4:5").SplitBySet(",:").Limit(3).SkipEmpty().ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestSplitByString) { + int sum = 0; + + for (const auto& it : StringSplitter("1ab2ab3").SplitByString("ab")) { + sum += FromString<int>(it.Token()); + } + + UNIT_ASSERT_VALUES_EQUAL(sum, 6); + } + + Y_UNIT_TEST(TestSplitByStringLimited) { + TVector<TString> expected = {"1", "2", "3ab4ab5"}; + TVector<TString> actual = StringSplitter("1ab2ab3ab4ab5").SplitByString("ab").Limit(3).ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestSplitByStringLimitedWithEmptySkip) { + TVector<TString> expected = {"1", "2", "3ab4ab5"}; + TVector<TString> actual = 
StringSplitter("1abab2ababababab3ab4ab5").SplitByString("ab").SkipEmpty().Limit(3).ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestSplitByFunc) { + TString s = "123 456 \t\n789\n10\t 20"; + TVector<TString> pattern = {"123", "456", "789", "10", "20"}; + + TVector<TString> tokens; + auto f = [](char a) { return a == ' ' || a == '\t' || a == '\n'; }; + for (auto v : StringSplitter(s).SplitByFunc(f)) { + if (v) { + tokens.emplace_back(v); + } + } + + UNIT_ASSERT(tokens == pattern); + } + + Y_UNIT_TEST(TestSplitByFuncLimited) { + TVector<TString> expected = {"1", "2", "3a4b5"}; + auto f = [](char a) { return a == 'a' || a == 'b'; }; + TVector<TString> actual = StringSplitter("1a2b3a4b5").SplitByFunc(f).Limit(3).ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestSplitByFuncLimitedWithEmptySkip) { + TVector<TString> expected = {"1", "2", "3a4b5"}; + auto f = [](char a) { return a == 'a' || a == 'b'; }; + TVector<TString> actual = StringSplitter("1aaba2bbababa3a4b5").SplitByFunc(f).SkipEmpty().Limit(3).Take(3).ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestSkipEmpty) { + int sum = 0; + + for (const auto& it : StringSplitter(" 1 2 3 ").Split(' ').SkipEmpty()) { + sum += FromString<int>(it.Token()); + } + + UNIT_ASSERT_VALUES_EQUAL(sum, 6); + + // double + sum = 0; + for (const auto& it : StringSplitter(" 1 2 3 ").Split(' ').SkipEmpty().SkipEmpty()) { + sum += FromString<int>(it.Token()); + } + UNIT_ASSERT_VALUES_EQUAL(sum, 6); + } + + Y_UNIT_TEST(TestTake) { + TVector<TString> expected = {"1", "2", "3"}; + UNIT_ASSERT_VALUES_EQUAL(expected, StringSplitter("1 2 3 4 5 6 7 8 9 10").Split(' ').Take(3).ToList<TString>()); + + expected = {"1", "2"}; + UNIT_ASSERT_VALUES_EQUAL(expected, StringSplitter(" 1 2 3 ").Split(' ').SkipEmpty().Take(2).ToList<TString>()); + + expected = {"1", "2", "3"}; + UNIT_ASSERT_VALUES_EQUAL(expected, StringSplitter("1 2 3 4 5 6 7 8 
9 10").Split(' ').Take(5).Take(3).ToList<TString>()); + UNIT_ASSERT_VALUES_EQUAL(expected, StringSplitter("1 2 3 4 5 6 7 8 9 10").Split(' ').Take(3).Take(5).ToList<TString>()); + + expected = {"1", "2"}; + UNIT_ASSERT_VALUES_EQUAL(expected, StringSplitter(" 1 2 3 ").Split(' ').Take(4).SkipEmpty().ToList<TString>()); + + expected = {"1"}; + UNIT_ASSERT_VALUES_EQUAL(expected, StringSplitter(" 1 2 3 ").Split(' ').Take(4).SkipEmpty().Take(1).ToList<TString>()); + } + + Y_UNIT_TEST(TestCompile) { + (void)StringSplitter(TString()); + (void)StringSplitter(TStringBuf()); + (void)StringSplitter("", 0); + } + + Y_UNIT_TEST(TestStringSplitterCountEmpty) { + TCharDelimiter<const char> delim(' '); + TestStringSplitterCount("", delim, 1); + } + + Y_UNIT_TEST(TestStringSplitterCountOne) { + TCharDelimiter<const char> delim(' '); + TestStringSplitterCount("one", delim, 1); + } + + Y_UNIT_TEST(TestStringSplitterCountWithOneDelimiter) { + TCharDelimiter<const char> delim(' '); + TestStringSplitterCount("one two", delim, 2); + } + + Y_UNIT_TEST(TestStringSplitterCountWithTrailing) { + TCharDelimiter<const char> delim(' '); + TestStringSplitterCount(" one ", delim, 3); + } + + Y_UNIT_TEST(TestStringSplitterConsume) { + TVector<TString> expected = {"1", "2", "3"}; + TVector<TString> actual; + auto func = [&actual](const TBasicStringBuf<char>& token) { + actual.push_back(TString(token)); + }; + StringSplitter("1 2 3").Split(' ').Consume(func); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestStringSplitterConsumeConditional) { + TVector<TString> expected = {"1", "2"}; + TVector<TString> actual; + auto func = [&actual](const TBasicStringBuf<char>& token) { + if (token == "3") { + return false; + } + actual.push_back(TString(token)); + return true; + }; + bool completed = StringSplitter("1 2 3 4 5").Split(' ').Consume(func); + UNIT_ASSERT(!completed); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestStringSplitterToList) { + TVector<TString> 
expected = {"1", "2", "3"}; + TVector<TString> actual = StringSplitter("1 2 3").Split(' ').ToList<TString>(); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestStringSplitterCollectPushBack) { + TVector<TString> expected = {"1", "2", "3"}; + TVector<TString> actual; + StringSplitter("1 2 3").Split(' ').Collect(&actual); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestStringSplitterCollectInsert) { + TSet<TString> expected = {"1", "2", "3"}; + TSet<TString> actual; + StringSplitter("1 2 3 1 2 3").Split(' ').Collect(&actual); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestStringSplitterCollectClears) { + TVector<TString> v; + StringSplitter("1 2 3").Split(' ').Collect(&v); + UNIT_ASSERT_VALUES_EQUAL(v.size(), 3); + StringSplitter("4 5").Split(' ').Collect(&v); + UNIT_ASSERT_VALUES_EQUAL(v.size(), 2); + } + + Y_UNIT_TEST(TestStringSplitterAddToDoesntClear) { + TVector<TString> v; + StringSplitter("1 2 3").Split(' ').AddTo(&v); + UNIT_ASSERT_VALUES_EQUAL(v.size(), 3); + StringSplitter("4 5").Split(' ').AddTo(&v); + UNIT_ASSERT_VALUES_EQUAL(v.size(), 5); + } + + Y_UNIT_TEST(TestSplitStringInto) { + int a = -1; + TStringBuf s; + double d = -1; + StringSplitter("2 substr 1.02").Split(' ').CollectInto(&a, &s, &d); + UNIT_ASSERT_VALUES_EQUAL(a, 2); + UNIT_ASSERT_VALUES_EQUAL(s, "substr"); + UNIT_ASSERT_DOUBLES_EQUAL(d, 1.02, 0.0001); + UNIT_ASSERT_EXCEPTION(StringSplitter("1").Split(' ').CollectInto(&a, &a), yexception); + UNIT_ASSERT_EXCEPTION(StringSplitter("1 2 3").Split(' ').CollectInto(&a, &a), yexception); + } + + Y_UNIT_TEST(TestSplitStringWithIgnore) { + TStringBuf s; + StringSplitter("x y z").Split(' ').CollectInto(&std::ignore, &s, &std::ignore); + UNIT_ASSERT_VALUES_EQUAL(s, "y"); + + UNIT_ASSERT_EXCEPTION(StringSplitter("ignored != non-requred").Split(':').CollectInto(&s, &std::ignore), yexception); + } + + Y_UNIT_TEST(TestTryCollectInto) { + int a, b, c; + bool parsingSucceeded; + parsingSucceeded 
= StringSplitter("100,500,3").Split(',').TryCollectInto(&a, &b, &c); + UNIT_ASSERT(parsingSucceeded); + UNIT_ASSERT_VALUES_EQUAL(a, 100); + UNIT_ASSERT_VALUES_EQUAL(b, 500); + UNIT_ASSERT_VALUES_EQUAL(c, 3); + + //not enough tokens + parsingSucceeded = StringSplitter("3,14").Split(',').TryCollectInto(&a, &b, &c); + UNIT_ASSERT(!parsingSucceeded); + + //too many tokens + parsingSucceeded = StringSplitter("3,14,15,92,6").Split(',').TryCollectInto(&a, &b, &c); + UNIT_ASSERT(!parsingSucceeded); + + //where single TryFromString fails + parsingSucceeded = StringSplitter("ot topota kopyt pyl po polu letit").Split(' ').TryCollectInto(&a, &b, &c); + UNIT_ASSERT(!parsingSucceeded); + } + + Y_UNIT_TEST(TestOwningSplit1) { + int sum = 0; + + for (const auto& it : StringSplitter(TString("1,2,3")).Split(',')) { + sum += FromString<int>(it.Token()); + } + + UNIT_ASSERT_VALUES_EQUAL(sum, 6); + } + + Y_UNIT_TEST(TestOwningSplit2) { + int sum = 0; + + TString str("1,2,3"); + for (const auto& it : StringSplitter(str).Split(',')) { + sum += FromString<int>(it.Token()); + } + + UNIT_ASSERT_VALUES_EQUAL(sum, 6); + } + + Y_UNIT_TEST(TestOwningSplit3) { + int sum = 0; + + const TString str("1,2,3"); + for (const auto& it : StringSplitter(str).Split(',')) { + sum += FromString<int>(it.Token()); + } + + UNIT_ASSERT_VALUES_EQUAL(sum, 6); + } + + Y_UNIT_TEST(TestAssigment) { + TVector<TString> expected0 = {"1", "2", "3", "4"}; + TVector<TString> actual0 = StringSplitter("1 2 3 4").Split(' '); + UNIT_ASSERT_VALUES_EQUAL(expected0, actual0); + + TSet<TString> expected1 = {"11", "22", "33", "44"}; + TSet<TString> actual1 = StringSplitter("11 22 33 44").Split(' '); + UNIT_ASSERT_VALUES_EQUAL(expected1, actual1); + + TSet<TString> expected2 = {"11", "aa"}; + auto actual2 = static_cast<TSet<TString>>(StringSplitter("11 aa 11 11 aa").Split(' ')); + UNIT_ASSERT_VALUES_EQUAL(expected2, actual2); + + TVector<TString> expected3 = {"dd", "bb"}; + auto actual3 = 
TVector<TString>(StringSplitter("dd\tbb").Split('\t')); + UNIT_ASSERT_VALUES_EQUAL(expected3, actual3); + } + + Y_UNIT_TEST(TestRangeBasedFor) { + TVector<TString> actual0 = {"11", "22", "33", "44"}; + size_t num = 0; + for (TStringBuf elem : StringSplitter("11 22 33 44").Split(' ')) { + UNIT_ASSERT_VALUES_EQUAL(elem, actual0[num++]); + } + + TVector<TString> actual1 = {"another", "one,", "and", "another", "one"}; + num = 0; + for (TStringBuf elem : StringSplitter(TStringBuf("another one, and \n\n another one")).SplitBySet(" \n").SkipEmpty()) { + UNIT_ASSERT_VALUES_EQUAL(elem, actual1[num++]); + } + + TVector<TUtf16String> actual2 = {u"привет,", u"как", u"дела"}; + num = 0; + for (TWtringBuf elem : StringSplitter(u"привет, как дела").Split(wchar16(' '))) { + UNIT_ASSERT_VALUES_EQUAL(elem, actual2[num++]); + } + + TVector<TString> copy(4); + auto v = StringSplitter("11 22 33 44").Split(' '); + Copy(v.begin(), v.end(), copy.begin()); + UNIT_ASSERT_VALUES_EQUAL(actual0, copy); + } + + Y_UNIT_TEST(TestParseInto) { + TVector<int> actual0 = {1, 2, 3, 4}; + TVector<int> answer0; + + StringSplitter("1 2 3 4").Split(' ').ParseInto(&answer0); + UNIT_ASSERT_VALUES_EQUAL(actual0, answer0); + + TVector<int> actual1 = {42, 1, 2, 3, 4}; + TVector<int> answer1 = {42}; + StringSplitter("1 2 3 4").Split(' ').ParseInto(&answer1); + UNIT_ASSERT_VALUES_EQUAL(actual1, answer1); + + answer1.clear(); + UNIT_ASSERT_EXCEPTION(StringSplitter("1 2 3 4").Split(' ').ParseInto(&answer1), yexception); + + answer1 = {42}; + StringSplitter(" 1 2 3 4").Split(' ').SkipEmpty().ParseInto(&answer1); + UNIT_ASSERT_VALUES_EQUAL(actual1, answer1); + + answer1.clear(); + StringSplitter(" \n 1 2 \n\n\n 3 4\n ").SplitBySet(" \n").SkipEmpty().ParseInto(&answer1); + UNIT_ASSERT_VALUES_EQUAL(actual0, answer1); + } + + Y_UNIT_TEST(TestStdString) { + std::vector<std::string_view> r0, r1, answer = {"lol", "zomg"}; + std::string s = "lol zomg"; + for (std::string_view ss : StringSplitter(s).Split(' ')) { + 
r0.push_back(ss); + } + StringSplitter(s).Split(' ').Collect(&r1); + + UNIT_ASSERT_VALUES_EQUAL(r0, answer); + UNIT_ASSERT_VALUES_EQUAL(r1, answer); + } + + Y_UNIT_TEST(TestStdStringView) { + std::string_view s = "aaacccbbb"; + std::vector<std::string_view> expected = {"aaa", "bbb"}; + std::vector<std::string_view> actual = StringSplitter(s).SplitByString("ccc"); + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + } + + Y_UNIT_TEST(TestStdSplitAfterSplit) { + std::string_view input = "a*b+a*b"; + for (std::string_view summand : StringSplitter(input).Split('+')) { + //FIXME: std::string is used to workaround MSVC ICE + UNIT_ASSERT_VALUES_EQUAL(std::string(summand), "a*b"); + std::string_view multiplier1, multiplier2; + bool splitResult = StringSplitter(summand).Split('*').TryCollectInto(&multiplier1, &multiplier2); + UNIT_ASSERT(splitResult); + UNIT_ASSERT_VALUES_EQUAL(std::string(multiplier1), "a"); + UNIT_ASSERT_VALUES_EQUAL(std::string(multiplier2), "b"); + } + } + + Y_UNIT_TEST(TestStdSplitWithParsing) { + std::string_view input = "1,2,3,4"; + TVector<ui64> numbers; + const TVector<ui64> expected{1, 2, 3, 4}; + StringSplitter(input).Split(',').ParseInto(&numbers); + UNIT_ASSERT_VALUES_EQUAL(numbers, expected); + } + + Y_UNIT_TEST(TestArcadiaStdInterop) { + TVector<TString> expected0 = {"a", "b"}; + TVector<TStringBuf> expected1 = {"a", "b"}; + std::string src1("a b"); + std::string_view src2("a b"); + TVector<TString> actual0 = StringSplitter(src1).Split(' ').SkipEmpty(); + TVector<TString> actual1 = StringSplitter(src2).Split(' ').SkipEmpty(); + TVector<TStringBuf> actual2 = StringSplitter(src1).Split(' ').SkipEmpty(); + TVector<TStringBuf> actual3 = StringSplitter(src2).Split(' ').SkipEmpty(); + UNIT_ASSERT_VALUES_EQUAL(expected0, actual0); + UNIT_ASSERT_VALUES_EQUAL(expected0, actual1); + UNIT_ASSERT_VALUES_EQUAL(expected1, actual2); + UNIT_ASSERT_VALUES_EQUAL(expected1, actual3); + } + + Y_UNIT_TEST(TestConstCString) { + const char* b = "a;b"; + const char* e = 
b + 3; + + std::vector<TStringBuf> v; + StringSplitter(b, e).Split(';').AddTo(&v); + + std::vector<TStringBuf> expected = {"a", "b"}; + UNIT_ASSERT_VALUES_EQUAL(v, expected); + } + + Y_UNIT_TEST(TestCStringRef) { + TString s = "lol"; + char* str = s.Detach(); + + std::vector<TStringBuf> v = StringSplitter(str).Split('o'); + std::vector<TStringBuf> expected = {"l", "l"}; + UNIT_ASSERT_VALUES_EQUAL(v, expected); + } + + Y_UNIT_TEST(TestSplitVector) { + std::vector<char> buffer = {'a', ';', 'b'}; + + std::vector<TStringBuf> v = StringSplitter(buffer).Split(';'); + + std::vector<TStringBuf> expected = {"a", "b"}; + UNIT_ASSERT_VALUES_EQUAL(v, expected); + } + + class TDoubleIterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = int; + using pointer = void; + using reference = int; + using const_reference = int; + using difference_type = ptrdiff_t; + + TDoubleIterator() = default; + + TDoubleIterator(const char* ptr) + : Ptr_(ptr) + { + } + + TDoubleIterator operator++() { + Ptr_ += 2; + return *this; + } + + TDoubleIterator operator++(int) { + TDoubleIterator tmp = *this; + ++*this; + return tmp; + } + + friend bool operator==(TDoubleIterator l, TDoubleIterator r) { + return l.Ptr_ == r.Ptr_; + } + + friend bool operator!=(TDoubleIterator l, TDoubleIterator r) { + return l.Ptr_ != r.Ptr_; + } + + int operator*() const { + return (*Ptr_ - '0') * 10 + *(Ptr_ + 1) - '0'; + } + + private: + const char* Ptr_ = nullptr; + }; + + Y_UNIT_TEST(TestInputIterator) { + const char* beg = "1213002233000011"; + const char* end = beg + strlen(beg); + + std::vector<std::vector<int>> expected = {{12, 13}, {22, 33}, {}, {11}}; + int i = 0; + + for (TIteratorRange<TDoubleIterator> part : StringSplitter(TDoubleIterator(beg), TDoubleIterator(end)).SplitByFunc([](int value) { return value == 0; })) { + UNIT_ASSERT(std::equal(part.begin(), part.end(), expected[i].begin(), expected[i].end())); + i++; + } + UNIT_ASSERT_VALUES_EQUAL(i, expected.size()); + 
} +} diff --git a/util/string/strip.cpp b/util/string/strip.cpp new file mode 100644 index 0000000000..c921571cf0 --- /dev/null +++ b/util/string/strip.cpp @@ -0,0 +1,23 @@ +#include "strip.h" +#include "ascii.h" + +#include <util/string/reverse.h> + +bool Collapse(const TString& from, TString& to, size_t maxLen) { + return CollapseImpl<TString, bool (*)(unsigned char)>(from, to, maxLen, IsAsciiSpace); +} + +void CollapseText(const TString& from, TString& to, size_t maxLen) { + Collapse(from, to, maxLen); + StripInPlace(to); + if (to.size() >= maxLen) { + to.remove(maxLen - 5); // " ..." + ReverseInPlace(to); + size_t pos = to.find_first_of(" .,;"); + if (pos != TString::npos && pos < 32) { + to.remove(0, pos + 1); + } + ReverseInPlace(to); + to.append(" ..."); + } +} diff --git a/util/string/strip.h b/util/string/strip.h new file mode 100644 index 0000000000..d5ef6da96d --- /dev/null +++ b/util/string/strip.h @@ -0,0 +1,257 @@ +#pragma once + +#include "ascii.h" + +#include <util/generic/string.h> +#include <util/generic/strbuf.h> +#include <utility> + +template <class It> +struct TIsAsciiSpaceAdapter { + bool operator()(const It& it) const noexcept { + return IsAsciiSpace(*it); + } +}; + +template <class It> +TIsAsciiSpaceAdapter<It> IsAsciiSpaceAdapter(It) { + return {}; +} + +template <class TChar> +struct TEqualsStripAdapter { + TEqualsStripAdapter(TChar ch) + : Ch(ch) + { + } + + template <class It> + bool operator()(const It& it) const noexcept { + return *it == Ch; + } + + const TChar Ch; +}; + +template <class TChar> +TEqualsStripAdapter<TChar> EqualsStripAdapter(TChar ch) { + return {ch}; +} + +template <class It, class TStripCriterion> +inline void StripRangeBegin(It& b, const It& e, TStripCriterion&& criterion) noexcept { + while (b < e && criterion(b)) { + ++b; + } +} + +template <class It> +inline void StripRangeBegin(It& b, const It& e) noexcept { + StripRangeBegin(b, e, IsAsciiSpaceAdapter(b)); +} + +template <class It, class TStripCriterion> 
+inline void StripRangeEnd(const It& b, It& e, TStripCriterion&& criterion) noexcept { + while (b < e && criterion(e - 1)) { + --e; + } +} + +template <class It> +inline void StripRangeEnd(const It& b, It& e) noexcept { + StripRangeEnd(b, e, IsAsciiSpaceAdapter(b)); +} + +template <bool stripBeg, bool stripEnd> +struct TStripImpl { + template <class It, class TStripCriterion> + static inline bool StripRange(It& b, It& e, TStripCriterion&& criterion) noexcept { + const size_t oldLen = e - b; + + if (stripBeg) { + StripRangeBegin(b, e, criterion); + } + + if (stripEnd) { + StripRangeEnd(b, e, criterion); + } + + const size_t newLen = e - b; + return newLen != oldLen; + } + + template <class T, class TStripCriterion> + static inline bool StripString(const T& from, T& to, TStripCriterion&& criterion) { + auto b = from.begin(); + auto e = from.end(); + + if (StripRange(b, e, criterion)) { + to = T(b, e - b); + + return true; + } + + to = from; + + return false; + } + + template <class T, class TStripCriterion> + static inline T StripString(const T& from, TStripCriterion&& criterion) { + T ret; + StripString(from, ret, criterion); + return ret; + } + + template <class T> + static inline T StripString(const T& from) { + return StripString(from, IsAsciiSpaceAdapter(from.begin())); + } +}; + +template <class It, class TStripCriterion> +inline bool StripRange(It& b, It& e, TStripCriterion&& criterion) noexcept { + return TStripImpl<true, true>::StripRange(b, e, criterion); +} + +template <class It> +inline bool StripRange(It& b, It& e) noexcept { + return StripRange(b, e, IsAsciiSpaceAdapter(b)); +} + +template <class It, class TStripCriterion> +inline bool Strip(It& b, size_t& len, TStripCriterion&& criterion) noexcept { + It e = b + len; + + if (StripRange(b, e, criterion)) { + len = e - b; + + return true; + } + + return false; +} + +template <class It> +inline bool Strip(It& b, size_t& len) noexcept { + return Strip(b, len, IsAsciiSpaceAdapter(b)); +} + +template <class 
T, class TStripCriterion> +static inline bool StripString(const T& from, T& to, TStripCriterion&& criterion) { + return TStripImpl<true, true>::StripString(from, to, criterion); +} + +template <class T> +static inline bool StripString(const T& from, T& to) { + return StripString(from, to, IsAsciiSpaceAdapter(from.begin())); +} + +template <class T, class TStripCriterion> +static inline T StripString(const T& from, TStripCriterion&& criterion) { + return TStripImpl<true, true>::StripString(from, criterion); +} + +template <class T> +static inline T StripString(const T& from) { + return TStripImpl<true, true>::StripString(from); +} + +template <class T> +static inline T StripStringLeft(const T& from) { + return TStripImpl<true, false>::StripString(from); +} + +template <class T> +static inline T StripStringRight(const T& from) { + return TStripImpl<false, true>::StripString(from); +} + +template <class T, class TStripCriterion> +static inline T StripStringLeft(const T& from, TStripCriterion&& criterion) { + return TStripImpl<true, false>::StripString(from, criterion); +} + +template <class T, class TStripCriterion> +static inline T StripStringRight(const T& from, TStripCriterion&& criterion) { + return TStripImpl<false, true>::StripString(from, criterion); +} + +/// Copies the given string removing leading and trailing spaces. +static inline bool Strip(const TString& from, TString& to) { + return StripString(from, to); +} + +/// Removes leading and trailing spaces from the string. +inline TString& StripInPlace(TString& s) { + Strip(s, s); + return s; +} + +/// Returns a copy of the given string with removed leading and trailing spaces. 
+inline TString Strip(const TString& s) Y_WARN_UNUSED_RESULT; +inline TString Strip(const TString& s) { + TString ret = s; + Strip(ret, ret); + return ret; +} + +template <class TChar, class TWhitespaceFunc> +size_t CollapseImpl(TChar* s, size_t n, const TWhitespaceFunc& isWhitespace) { + size_t newLen = 0; + for (size_t i = 0; i < n; ++i, ++newLen) { + size_t nextNonSpace = i; + while (nextNonSpace < n && isWhitespace(s[nextNonSpace])) { + ++nextNonSpace; + } + size_t numSpaces = nextNonSpace - i; + if (numSpaces > 1 || (numSpaces == 1 && s[i] != ' ')) { + s[newLen] = ' '; + i = nextNonSpace - 1; + } else { + s[newLen] = s[i]; + } + } + return newLen; +} + +template <class TStringType, class TWhitespaceFunc> +bool CollapseImpl(const TStringType& from, TStringType& to, size_t maxLen, const TWhitespaceFunc& isWhitespace) { + to = from; + maxLen = maxLen ? Min(maxLen, to.size()) : to.size(); + for (size_t i = 0; i < maxLen; ++i) { + if (isWhitespace(to[i]) && (to[i] != ' ' || isWhitespace(to[i + 1]))) { + size_t tailSize = maxLen - i; + size_t newTailSize = CollapseImpl(to.begin() + i, tailSize, isWhitespace); + to.remove(i + newTailSize, tailSize - newTailSize); + return true; + } + } + return false; +} + +bool Collapse(const TString& from, TString& to, size_t maxLen = 0); + +/// Replaces several consequtive space symbols with one (processing is limited to maxLen bytes) +inline TString& CollapseInPlace(TString& s, size_t maxLen = 0) { + Collapse(s, s, maxLen); + return s; +} + +/// Replaces several consequtive space symbols with one (processing is limited to maxLen bytes) +inline TString Collapse(const TString& s, size_t maxLen = 0) Y_WARN_UNUSED_RESULT; +inline TString Collapse(const TString& s, size_t maxLen) { + TString ret; + Collapse(s, ret, maxLen); + return ret; +} + +void CollapseText(const TString& from, TString& to, size_t maxLen); + +/// The same as Collapse() + truncates the string to maxLen. 
+/// @details An ellipsis is inserted at the end of the truncated line. +inline void CollapseText(TString& s, size_t maxLen) { + TString to; + CollapseText(s, to, maxLen); + s = to; +} diff --git a/util/string/strip_ut.cpp b/util/string/strip_ut.cpp new file mode 100644 index 0000000000..d1029d1498 --- /dev/null +++ b/util/string/strip_ut.cpp @@ -0,0 +1,138 @@ +#include "strip.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/charset/wide.h> + +Y_UNIT_TEST_SUITE(TStripStringTest) { + Y_UNIT_TEST(TestStrip) { + struct TTest { + const char* Str; + const char* StripLeftRes; + const char* StripRightRes; + const char* StripRes; + }; + static const TTest tests[] = { + {" 012 ", "012 ", " 012", "012"}, + {" 012", "012", " 012", "012"}, + {"012\t\t", "012\t\t", "012", "012"}, + {"\t012\t", "012\t", "\t012", "012"}, + {"012", "012", "012", "012"}, + {"012\r\n", "012\r\n", "012", "012"}, + {"\n012\r", "012\r", "\n012", "012"}, + {"\n \t\r", "", "", ""}, + {"", "", "", ""}, + {"abc", "abc", "abc", "abc"}, + {"a c", "a c", "a c", "a c"}, + }; + + for (const auto& test : tests) { + TString inputStr(test.Str); + + TString s; + Strip(inputStr, s); + UNIT_ASSERT_EQUAL(s, test.StripRes); + + UNIT_ASSERT_EQUAL(StripString(inputStr), test.StripRes); + UNIT_ASSERT_EQUAL(StripStringLeft(inputStr), test.StripLeftRes); + UNIT_ASSERT_EQUAL(StripStringRight(inputStr), test.StripRightRes); + + TStringBuf inputStrBuf(test.Str); + UNIT_ASSERT_EQUAL(StripString(inputStrBuf), test.StripRes); + UNIT_ASSERT_EQUAL(StripStringLeft(inputStrBuf), test.StripLeftRes); + UNIT_ASSERT_EQUAL(StripStringRight(inputStrBuf), test.StripRightRes); + }; + } + + Y_UNIT_TEST(TestCustomStrip) { + struct TTest { + const char* Str; + const char* Result; + }; + static const TTest tests[] = { + {"//012//", "012"}, + {"//012", "012"}, + {"012", "012"}, + {"012//", "012"}, + }; + + for (auto test : tests) { + UNIT_ASSERT_EQUAL( + StripString(TString(test.Str), EqualsStripAdapter('/')), + 
test.Result); + }; + } + + Y_UNIT_TEST(TestCustomStripLeftRight) { + struct TTest { + const char* Str; + const char* ResultLeft; + const char* ResultRight; + }; + static const TTest tests[] = { + {"//012//", "012//", "//012"}, + {"//012", "012", "//012"}, + {"012", "012", "012"}, + {"012//", "012//", "012"}, + }; + + for (const auto& test : tests) { + UNIT_ASSERT_EQUAL( + StripStringLeft(TString(test.Str), EqualsStripAdapter('/')), + test.ResultLeft); + UNIT_ASSERT_EQUAL( + StripStringRight(TString(test.Str), EqualsStripAdapter('/')), + test.ResultRight); + }; + } + + Y_UNIT_TEST(TestNullStringStrip) { + TStringBuf nullString(nullptr, nullptr); + UNIT_ASSERT_EQUAL( + StripString(nullString), + TString()); + } + + Y_UNIT_TEST(TestWtrokaStrip) { + UNIT_ASSERT_EQUAL(StripString(TWtringBuf(u" abc ")), u"abc"); + UNIT_ASSERT_EQUAL(StripStringLeft(TWtringBuf(u" abc ")), u"abc "); + UNIT_ASSERT_EQUAL(StripStringRight(TWtringBuf(u" abc ")), u" abc"); + } + + Y_UNIT_TEST(TestWtrokaCustomStrip) { + UNIT_ASSERT_EQUAL( + StripString( + TWtringBuf(u"/abc/"), + EqualsStripAdapter(u'/')), + u"abc"); + } + + Y_UNIT_TEST(TestCollapse) { + TString s; + Collapse(TString(" 123 456 "), s); + UNIT_ASSERT(s == " 123 456 "); + Collapse(TString(" 123 456 "), s, 10); + UNIT_ASSERT(s == " 123 456 "); + + s = TString(" a b c "); + TString s2 = s; + CollapseInPlace(s2); + + UNIT_ASSERT(s == s2); +#ifndef TSTRING_IS_STD_STRING + UNIT_ASSERT(s.c_str() == s2.c_str()); // Collapse() does not change the string at all +#endif + } + + Y_UNIT_TEST(TestCollapseText) { + TString abs1("Very long description string written in unknown language."); + TString abs2(abs1); + TString abs3(abs1); + CollapseText(abs1, 204); + CollapseText(abs2, 54); + CollapseText(abs3, 49); + UNIT_ASSERT_EQUAL(abs1 == "Very long description string written in unknown language.", true); + UNIT_ASSERT_EQUAL(abs2 == "Very long description string written in unknown ...", true); + UNIT_ASSERT_EQUAL(abs3 == "Very long description 
string written in ...", true); + } +} diff --git a/util/string/strspn.cpp b/util/string/strspn.cpp new file mode 100644 index 0000000000..cdb8d7ca9b --- /dev/null +++ b/util/string/strspn.cpp @@ -0,0 +1 @@ +#include "strspn.h" diff --git a/util/string/strspn.h b/util/string/strspn.h new file mode 100644 index 0000000000..8229e74a9c --- /dev/null +++ b/util/string/strspn.h @@ -0,0 +1,65 @@ +#pragma once + +#include "cstriter.h" + +#include <util/generic/bitmap.h> + +template <class TSetType> +class TStrSpnImpl { +public: + inline TStrSpnImpl(const char* b, const char* e) { + Init(b, e); + } + + inline TStrSpnImpl(const char* s) { + Init(s, TCStringEndIterator()); + } + + //FirstOf + template <class It> + inline It FindFirstOf(It b, const char* e) const noexcept { + return FindFirst<false>(b, e); + } + + template <class It> + inline It FindFirstOf(It s) const noexcept { + return FindFirst<false>(s, TCStringEndIterator()); + } + + //FirstNotOf + template <class It> + inline It FindFirstNotOf(It b, const char* e) const noexcept { + return FindFirst<true>(b, e); + } + + template <class It> + inline It FindFirstNotOf(It s) const noexcept { + return FindFirst<true>(s, TCStringEndIterator()); + } + + inline void Set(ui8 b) noexcept { + S_.Set(b); + } + +private: + template <bool Result, class It1, class It2> + inline It1 FindFirst(It1 b, It2 e) const noexcept { + while (b != e && (S_.Get((ui8)*b) == Result)) { + ++b; + } + + return b; + } + + template <class It1, class It2> + inline void Init(It1 b, It2 e) { + while (b != e) { + this->Set((ui8)*b++); + } + } + +private: + TSetType S_; +}; + +using TCompactStrSpn = TStrSpnImpl<TBitMap<256>>; diff --git a/util/string/subst.cpp b/util/string/subst.cpp new file mode 100644 index 0000000000..b2df328dc1 --- /dev/null +++ b/util/string/subst.cpp @@ -0,0 +1,201 @@ +#include "subst.h" + +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/system/compiler.h> + +#include <string> +#include <type_traits> 
+ +// a bit of template magic (to be fast and unreadable) +template <class TStringType, class TTo, bool Main> +static Y_FORCE_INLINE void MoveBlock(typename TStringType::value_type* ptr, size_t& srcPos, size_t& dstPos, const size_t off, const TTo to, const size_t toSize) { + const size_t unchangedSize = off - srcPos; + if (dstPos < srcPos) { + for (size_t i = 0; i < unchangedSize; ++i) { + ptr[dstPos++] = ptr[srcPos++]; + } + } else { + dstPos += unchangedSize; + srcPos += unchangedSize; + } + + if (Main) { + for (size_t i = 0; i < toSize; ++i) { + ptr[dstPos++] = to[i]; + } + } +} + +template <typename T, typename U> +static bool IsIntersect(const T& a, const U& b) noexcept { + if (b.data() < a.data()) { + return IsIntersect(b, a); + } + + return !a.empty() && !b.empty() && + ((a.data() <= b.data() && b.data() < a.data() + a.size()) || + (a.data() < b.data() + b.size() && b.data() + b.size() <= a.data() + a.size())); +} + +/** + * Replaces all occurences of substring @c from in string @c s to string @c to. 
+ * Uses two separate implementations (inplace for shrink and append for grow case) + * See IGNIETFERRO-394 + **/ +template <class TStringType, typename TStringViewType = TBasicStringBuf<typename TStringType::value_type>> +static inline size_t SubstGlobalImpl(TStringType& s, const TStringViewType from, const TStringViewType to, size_t fromPos = 0) { + if (from.empty()) { + return 0; + } + + Y_ASSERT(!IsIntersect(s, from)); + Y_ASSERT(!IsIntersect(s, to)); + + const size_t fromSize = from.size(); + const size_t toSize = to.size(); + size_t replacementsCount = 0; + size_t off = fromPos; + size_t srcPos = 0; + + if (toSize > fromSize) { + // string will grow: append to another string + TStringType result; + for (; (off = TStringViewType(s).find(from, off)) != TStringType::npos; off += fromSize) { + if (!replacementsCount) { + // first replacement occured, we can prepare result string + result.reserve(s.size() + s.size() / 3); + } + result.append(s.begin() + srcPos, s.begin() + off); + result.append(to.data(), to.size()); + srcPos = off + fromSize; + ++replacementsCount; + } + if (replacementsCount) { + // append tail + result.append(s.begin() + srcPos, s.end()); + s = std::move(result); + } + return replacementsCount; + } + + // string will not grow: use inplace algo + size_t dstPos = 0; + typename TStringType::value_type* ptr = &*s.begin(); + for (; (off = TStringViewType(s).find(from, off)) != TStringType::npos; off += fromSize) { + Y_ASSERT(dstPos <= srcPos); + MoveBlock<TStringType, TStringViewType, true>(ptr, srcPos, dstPos, off, to, toSize); + srcPos = off + fromSize; + ++replacementsCount; + } + + if (replacementsCount) { + // append tail + MoveBlock<TStringType, TStringViewType, false>(ptr, srcPos, dstPos, s.size(), to, toSize); + s.resize(dstPos); + } + return replacementsCount; +} + +/// Replaces all occurences of the 'from' symbol in a string to the 'to' symbol. 
+template <class TStringType> +inline size_t SubstCharGlobalImpl(TStringType& s, typename TStringType::value_type from, typename TStringType::value_type to, size_t fromPos = 0) { + if (fromPos >= s.size()) { + return 0; + } + + size_t result = 0; + fromPos = s.find(from, fromPos); + + // s.begin() might cause memory copying, so call it only if needed + if (fromPos != TStringType::npos) { + auto* it = &*s.begin() + fromPos; + *it = to; + ++result; + // at this point string is copied and it's safe to use constant s.end() to iterate + const auto* const sEnd = &*s.end(); + // unrolled loop goes first because it is more likely that `it` will be properly aligned + for (const auto* const end = sEnd - (sEnd - it) % 4; it < end;) { + if (*it == from) { + *it = to; + ++result; + } + ++it; + if (*it == from) { + *it = to; + ++result; + } + ++it; + if (*it == from) { + *it = to; + ++result; + } + ++it; + if (*it == from) { + *it = to; + ++result; + } + ++it; + } + for (; it < sEnd; ++it) { + if (*it == from) { + *it = to; + ++result; + } + } + } + + return result; +} + +/* Standard says that `char16_t` is a distinct type and has same size, signedness and alignment as + * `std::uint_least16_t`, so we check if `char16_t` has same signedness and size as `wchar16` to be + * sure that we can make safe casts between values of these types and pointers. 
+ */ +static_assert(sizeof(wchar16) == sizeof(char16_t), ""); +static_assert(sizeof(wchar32) == sizeof(char32_t), ""); +static_assert(std::is_unsigned<wchar16>::value == std::is_unsigned<char16_t>::value, ""); +static_assert(std::is_unsigned<wchar32>::value == std::is_unsigned<char32_t>::value, ""); + +size_t SubstGlobal(TString& text, const TStringBuf what, const TStringBuf with, size_t from) { + return SubstGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(std::string& text, const TStringBuf what, const TStringBuf with, size_t from) { + return SubstGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(TUtf16String& text, const TWtringBuf what, const TWtringBuf with, size_t from) { + return SubstGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(TUtf32String& text, const TUtf32StringBuf what, const TUtf32StringBuf with, size_t from) { + return SubstGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(std::u16string& text, const TWtringBuf what, const TWtringBuf with, size_t from) { + return SubstGlobalImpl(text, + std::u16string_view(reinterpret_cast<const char16_t*>(what.data()), what.size()), + std::u16string_view(reinterpret_cast<const char16_t*>(with.data()), with.size()), + from); +} + +size_t SubstGlobal(TString& text, char what, char with, size_t from) { + return SubstCharGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(std::string& text, char what, char with, size_t from) { + return SubstCharGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(TUtf16String& text, wchar16 what, wchar16 with, size_t from) { + return SubstCharGlobalImpl(text, (char16_t)what, (char16_t)with, from); +} + +size_t SubstGlobal(std::u16string& text, wchar16 what, wchar16 with, size_t from) { + return SubstCharGlobalImpl(text, (char16_t)what, (char16_t)with, from); +} + +size_t SubstGlobal(TUtf32String& text, wchar32 what, wchar32 with, size_t from) { + return SubstCharGlobalImpl(text, (char32_t)what, (char32_t)with, from); +} 
diff --git a/util/string/subst.h b/util/string/subst.h new file mode 100644 index 0000000000..45b622fbef --- /dev/null +++ b/util/string/subst.h @@ -0,0 +1,56 @@ +#pragma once + +#include <util/generic/fwd.h> + +#include <stlfwd> + +/* Replace all occurrences of substring `what` with string `with` starting from position `from`. + * + * @param text String to modify. + * @param what Substring to replace. + * @param with Substring to use as replacement. + * @param from Position at which to start replacement. + * + * @return Number of replacements made. + */ +size_t SubstGlobal(TString& text, TStringBuf what, TStringBuf with, size_t from = 0); +size_t SubstGlobal(std::string& text, TStringBuf what, TStringBuf with, size_t from = 0); +size_t SubstGlobal(TUtf16String& text, TWtringBuf what, TWtringBuf with, size_t from = 0); +size_t SubstGlobal(std::u16string& text, TWtringBuf what, TWtringBuf with, size_t from = 0); +size_t SubstGlobal(TUtf32String& text, TUtf32StringBuf what, TUtf32StringBuf with, size_t from = 0); + +/* Replace all occurrences of character `what` with character `with` starting from position `from`. + * + * @param text String to modify. + * @param what Character to replace. + * @param with Character to use as replacement. + * @param from Position at which to start replacement. + * + * @return Number of replacements made. 
+ */ +size_t SubstGlobal(TString& text, char what, char with, size_t from = 0); +size_t SubstGlobal(std::string& text, char what, char with, size_t from = 0); +size_t SubstGlobal(TUtf16String& text, wchar16 what, wchar16 with, size_t from = 0); +size_t SubstGlobal(std::u16string& text, wchar16 what, wchar16 with, size_t from = 0); +size_t SubstGlobal(TUtf32String& text, wchar32 what, wchar32 with, size_t from = 0); + +// TODO(yazevnul): +// - rename `SubstGlobal` to `ReplaceAll` for convenience +// - add `SubstGlobalCopy(TStringBuf)` for convenience +// - add `RemoveAll(text, what, from)` as a shortcut for `SubstGlobal(text, what, "", from)` +// - rename file to `replace.h` + +/* Replace all occurrences of substring or character `what` with string or character `with` starting from position `from`, and return the resulting string. + * + * @param text String to modify. + * @param what Substring/character to replace. + * @param with Substring/character to use as replacement. + * @param from Position at which to start replacement. 
+ * + * @return Result string + */ +template <class TStringType, class TPatternType> +Y_WARN_UNUSED_RESULT TStringType SubstGlobalCopy(TStringType result, TPatternType what, TPatternType with, size_t from = 0) { + SubstGlobal(result, what, with, from); + return result; +} diff --git a/util/string/subst_ut.cpp b/util/string/subst_ut.cpp new file mode 100644 index 0000000000..21eccef779 --- /dev/null +++ b/util/string/subst_ut.cpp @@ -0,0 +1,253 @@ +#include "join.h" +#include "subst.h" +#include <string> + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(TStringSubst) { + static const size_t MIN_FROM_CTX = 4; + static const TVector<TString> ALL_FROM{TString("F"), TString("FF")}; + static const TVector<TString> ALL_TO{TString(""), TString("T"), TString("TT"), TString("TTT")}; + + static void AssertSubstGlobal(const TString& sFrom, const TString& sTo, const TString& from, const TString& to, const size_t fromPos, const size_t numSubst) { + TString s = sFrom; + size_t res = SubstGlobal(s, from, to, fromPos); + UNIT_ASSERT_VALUES_EQUAL_C(res, numSubst, + TStringBuilder() << "numSubst=" << numSubst << ", fromPos=" << fromPos << ", " << sFrom << " -> " << sTo); + if (numSubst) { + UNIT_ASSERT_STRINGS_EQUAL_C(s, sTo, + TStringBuilder() << "numSubst=" << numSubst << ", fromPos=" << fromPos << ", " << sFrom << " -> " << sTo); + } else { + // ensure s didn't trigger copy-on-write + UNIT_ASSERT_VALUES_EQUAL_C(s.c_str(), sFrom.c_str(), + TStringBuilder() << "numSubst=" << numSubst << ", fromPos=" << fromPos << ", " << sFrom << " -> " << sTo); + } + } + + Y_UNIT_TEST(TestSubstGlobalNoSubstA) { + for (const auto& from : ALL_FROM) { + const size_t fromSz = from.size(); + const size_t minSz = fromSz; + const size_t maxSz = fromSz + MIN_FROM_CTX; + for (size_t sz = minSz; sz <= maxSz; ++sz) { + for (size_t fromPos = 0; fromPos < sz; ++fromPos) { + TString s{sz, '.'}; + for (const auto& to : ALL_TO) { + AssertSubstGlobal(s, s, from, to, fromPos, 0); + } + } + } 
+ } + } + + Y_UNIT_TEST(TestSubstGlobalNoSubstB) { + for (const auto& from : ALL_FROM) { + const size_t fromSz = from.size(); + const size_t minSz = fromSz; + const size_t maxSz = fromSz + MIN_FROM_CTX; + for (size_t sz = minSz; sz <= maxSz; ++sz) { + for (size_t fromPos = 0; fromPos <= sz - fromSz; ++fromPos) { + for (size_t fromBeg = 0; fromBeg < fromPos; ++fromBeg) { + const auto parts = { + TString{fromBeg, '.'}, + TString{sz - fromSz - fromBeg, '.'}}; + TString s = JoinSeq(from, parts); + for (const auto& to : ALL_TO) { + AssertSubstGlobal(s, s, from, to, fromPos, 0); + } + } + } + } + } + } + + static void DoTestSubstGlobal(TVector<TString>& parts, const size_t minBeg, const size_t sz, + const TString& from, const size_t fromPos, const size_t numSubst) { + const size_t numLeft = numSubst - parts.size(); + for (size_t fromBeg = minBeg; fromBeg <= sz - numLeft * from.size(); ++fromBeg) { + if (parts.empty()) { + parts.emplace_back(fromBeg, '.'); + } else { + parts.emplace_back(fromBeg - minBeg, '.'); + } + + if (numLeft == 1) { + parts.emplace_back(sz - fromBeg - from.size(), '.'); + TString sFrom = JoinSeq(from, parts); + UNIT_ASSERT_VALUES_EQUAL_C(sFrom.size(), sz, sFrom); + for (const auto& to : ALL_TO) { + TString sTo = JoinSeq(to, parts); + AssertSubstGlobal(sFrom, sTo, from, to, fromPos, numSubst); + } + parts.pop_back(); + } else { + DoTestSubstGlobal(parts, fromBeg + from.size(), sz, from, fromPos, numSubst); + } + + parts.pop_back(); + } + } + + static void DoTestSubstGlobal(size_t numSubst) { + TVector<TString> parts; + for (const auto& from : ALL_FROM) { + const size_t fromSz = from.size(); + const size_t minSz = numSubst * fromSz; + const size_t maxSz = numSubst * (fromSz + MIN_FROM_CTX); + for (size_t sz = minSz; sz <= maxSz; ++sz) { + const size_t maxPos = sz - numSubst * fromSz; + for (size_t fromPos = 0; fromPos <= maxPos; ++fromPos) { + DoTestSubstGlobal(parts, fromPos, sz, from, fromPos, numSubst); + } + } + } + } + + 
Y_UNIT_TEST(TestSubstGlobalSubst1) { + DoTestSubstGlobal(1); + } + + Y_UNIT_TEST(TestSubstGlobalSubst2) { + DoTestSubstGlobal(2); + } + + Y_UNIT_TEST(TestSubstGlobalSubst3) { + DoTestSubstGlobal(3); + } + + Y_UNIT_TEST(TestSubstGlobalSubst4) { + DoTestSubstGlobal(4); + } + + Y_UNIT_TEST(TestSubstGlobalOld) { + TString s; + s = "aaa"; + SubstGlobal(s, "a", "bb"); + UNIT_ASSERT_EQUAL(s, TString("bbbbbb")); + s = "aaa"; + SubstGlobal(s, "a", "b"); + UNIT_ASSERT_EQUAL(s, TString("bbb")); + s = "aaa"; + SubstGlobal(s, "a", ""); + UNIT_ASSERT_EQUAL(s, TString("")); + s = "abcdefbcbcdfb"; + SubstGlobal(s, "bc", "bbc", 2); + UNIT_ASSERT_EQUAL(s, TString("abcdefbbcbbcdfb")); + s = "Москва ~ Париж"; + SubstGlobal(s, " ~ ", " "); + UNIT_ASSERT_EQUAL(s, TString("Москва Париж")); + } + + Y_UNIT_TEST(TestSubstGlobalOldRet) { + const TString s1 = "aaa"; + const TString s2 = SubstGlobalCopy(s1, "a", "bb"); + UNIT_ASSERT_EQUAL(s2, TString("bbbbbb")); + + const TString s3 = "aaa"; + const TString s4 = SubstGlobalCopy(s3, "a", "b"); + UNIT_ASSERT_EQUAL(s4, TString("bbb")); + + const TString s5 = "aaa"; + const TString s6 = SubstGlobalCopy(s5, "a", ""); + UNIT_ASSERT_EQUAL(s6, TString("")); + + const TString s7 = "abcdefbcbcdfb"; + const TString s8 = SubstGlobalCopy(s7, "bc", "bbc", 2); + UNIT_ASSERT_EQUAL(s8, TString("abcdefbbcbbcdfb")); + + const TString s9 = "Москва ~ Париж"; + const TString s10 = SubstGlobalCopy(s9, " ~ ", " "); + UNIT_ASSERT_EQUAL(s10, TString("Москва Париж")); + } + + Y_UNIT_TEST(TestSubstCharGlobal) { + TUtf16String w = u"abcdabcd"; + SubstGlobal(w, wchar16('b'), wchar16('B'), 3); + UNIT_ASSERT_EQUAL(w, u"abcdaBcd"); + + TString s = "aaa"; + SubstGlobal(s, 'a', 'b', 1); + UNIT_ASSERT_EQUAL(s, TString("abb")); + } + + Y_UNIT_TEST(TestSubstCharGlobalRet) { + const TUtf16String w1 = u"abcdabcd"; + const TUtf16String w2 = SubstGlobalCopy(w1, wchar16('b'), wchar16('B'), 3); + UNIT_ASSERT_EQUAL(w2, u"abcdaBcd"); + + const TString s1 = "aaa"; + const TString s2 = 
SubstGlobalCopy(s1, 'a', 'b', 1); + UNIT_ASSERT_EQUAL(s2, TString("abb")); + } + + Y_UNIT_TEST(TestSubstStdString) { + std::string s = "aaa"; + SubstGlobal(s, "a", "b", 1); + UNIT_ASSERT_EQUAL(s, "abb"); + } + + Y_UNIT_TEST(TestSubstStdStringRet) { + const std::string s1 = "aaa"; + const std::string s2 = SubstGlobalCopy(s1, "a", "b", 1); + UNIT_ASSERT_EQUAL(s2, "abb"); + } + + Y_UNIT_TEST(TestSubstGlobalChar) { + { + const TString s = "a"; + const TString st = "b"; + TString ss = s; + UNIT_ASSERT_VALUES_EQUAL(s.size(), SubstGlobal(ss, 'a', 'b')); + UNIT_ASSERT_VALUES_EQUAL(st, ss); + } + { + const TString s = "aa"; + const TString st = "bb"; + TString ss = s; + UNIT_ASSERT_VALUES_EQUAL(s.size(), SubstGlobal(ss, 'a', 'b')); + UNIT_ASSERT_VALUES_EQUAL(st, ss); + } + { + const TString s = "aaa"; + const TString st = "bbb"; + TString ss = s; + UNIT_ASSERT_VALUES_EQUAL(s.size(), SubstGlobal(ss, 'a', 'b')); + UNIT_ASSERT_VALUES_EQUAL(st, ss); + } + { + const TString s = "aaaa"; + const TString st = "bbbb"; + TString ss = s; + UNIT_ASSERT_VALUES_EQUAL(s.size(), SubstGlobal(ss, 'a', 'b')); + UNIT_ASSERT_VALUES_EQUAL(st, ss); + } + { + const TString s = "aaaaa"; + const TString st = "bbbbb"; + TString ss = s; + UNIT_ASSERT_VALUES_EQUAL(s.size(), SubstGlobal(ss, 'a', 'b')); + UNIT_ASSERT_VALUES_EQUAL(st, ss); + } + { + const TString s = "aaaaaa"; + const TString st = "bbbbbb"; + TString ss = s; + UNIT_ASSERT_VALUES_EQUAL(s.size(), SubstGlobal(ss, 'a', 'b')); + UNIT_ASSERT_VALUES_EQUAL(st, ss); + } + { + const TString s = "aaaaaaa"; + const TString st = "bbbbbbb"; + TString ss = s; + UNIT_ASSERT_VALUES_EQUAL(s.size(), SubstGlobal(ss, 'a', 'b')); + UNIT_ASSERT_VALUES_EQUAL(st, ss); + } + { + const TString s = "aaaaaaaa"; + const TString st = "bbbbbbbb"; + TString ss = s; + UNIT_ASSERT_VALUES_EQUAL(s.size(), SubstGlobal(ss, 'a', 'b')); + UNIT_ASSERT_VALUES_EQUAL(st, ss); + } + } +} diff --git a/util/string/type.cpp b/util/string/type.cpp new file mode 100644 index 
0000000000..49671c02c2 --- /dev/null +++ b/util/string/type.cpp @@ -0,0 +1,86 @@ +#include "type.h" +#include "ascii.h" + +#include <array> + +bool IsSpace(const char* s, size_t len) noexcept { + if (len == 0) { + return false; + } + for (const char* p = s; p < s + len; ++p) { + if (!IsAsciiSpace(*p)) { + return false; + } + } + return true; +} + +template <typename TStringType> +static bool IsNumberT(const TStringType& s) noexcept { + if (s.empty()) { + return false; + } + + return std::all_of(s.begin(), s.end(), IsAsciiDigit<typename TStringType::value_type>); +} + +bool IsNumber(const TStringBuf s) noexcept { + return IsNumberT(s); +} + +bool IsNumber(const TWtringBuf s) noexcept { + return IsNumberT(s); +} + +template <typename TStringType> +static bool IsHexNumberT(const TStringType& s) noexcept { + if (s.empty()) { + return false; + } + + return std::all_of(s.begin(), s.end(), IsAsciiHex<typename TStringType::value_type>); +} + +bool IsHexNumber(const TStringBuf s) noexcept { + return IsHexNumberT(s); +} + +bool IsHexNumber(const TWtringBuf s) noexcept { + return IsHexNumberT(s); +} + +namespace { + template <size_t N> + bool IsCaseInsensitiveAnyOf(TStringBuf str, const std::array<TStringBuf, N>& options) { + for (auto option : options) { + if (str.size() == option.size() && ::strnicmp(str.data(), option.data(), str.size()) == 0) { + return true; + } + } + return false; + } +} //anonymous namespace + +bool IsTrue(const TStringBuf v) noexcept { + static constexpr std::array<TStringBuf, 7> trueOptions{ + "true", + "t", + "yes", + "y", + "on", + "1", + "da"}; + return IsCaseInsensitiveAnyOf(v, trueOptions); +} + +bool IsFalse(const TStringBuf v) noexcept { + static constexpr std::array<TStringBuf, 7> falseOptions{ + "false", + "f", + "no", + "n", + "off", + "0", + "net"}; + return IsCaseInsensitiveAnyOf(v, falseOptions); +} diff --git a/util/string/type.h b/util/string/type.h new file mode 100644 index 0000000000..d6cb29ea58 --- /dev/null +++ 
b/util/string/type.h @@ -0,0 +1,42 @@ +#pragma once + +#include <util/generic/strbuf.h> + +Y_PURE_FUNCTION bool IsSpace(const char* s, size_t len) noexcept; + +/// Checks if a string is a set of only space symbols. +Y_PURE_FUNCTION static inline bool IsSpace(const TStringBuf s) noexcept { + return IsSpace(s.data(), s.size()); +} + +/// Returns "true" if the given string is an arabic number ([0-9]+) +Y_PURE_FUNCTION bool IsNumber(const TStringBuf s) noexcept; + +Y_PURE_FUNCTION bool IsNumber(const TWtringBuf s) noexcept; + +/// Returns "true" if the given string is a hex number ([0-9a-fA-F]+) +Y_PURE_FUNCTION bool IsHexNumber(const TStringBuf s) noexcept; + +Y_PURE_FUNCTION bool IsHexNumber(const TWtringBuf s) noexcept; + +/* Tests if the given string is case insensitive equal to one of: + * - "true", + * - "t", + * - "yes", + * - "y", + * - "on", + * - "1", + * - "da". + */ +Y_PURE_FUNCTION bool IsTrue(const TStringBuf value) noexcept; + +/* Tests if the given string is case insensitive equal to one of: + * - "false", + * - "f", + * - "no", + * - "n", + * - "off", + * - "0", + * - "net". 
+ */ +Y_PURE_FUNCTION bool IsFalse(const TStringBuf value) noexcept; diff --git a/util/string/type_ut.cpp b/util/string/type_ut.cpp new file mode 100644 index 0000000000..03e7af62bd --- /dev/null +++ b/util/string/type_ut.cpp @@ -0,0 +1,76 @@ +#include "type.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/charset/wide.h> + +Y_UNIT_TEST_SUITE(TStringClassify) { + Y_UNIT_TEST(TestIsSpace) { + UNIT_ASSERT_EQUAL(IsSpace(" "), true); + UNIT_ASSERT_EQUAL(IsSpace("\t\r\n"), true); + UNIT_ASSERT_EQUAL(IsSpace(""), false); + UNIT_ASSERT_EQUAL(IsSpace(" a"), false); + } + + Y_UNIT_TEST(TestIsTrue) { + UNIT_ASSERT(IsTrue("1")); + UNIT_ASSERT(IsTrue("yes")); + UNIT_ASSERT(IsTrue("YeS")); + UNIT_ASSERT(IsTrue("on")); + UNIT_ASSERT(IsTrue("true")); + UNIT_ASSERT(IsTrue("t")); + UNIT_ASSERT(IsTrue("da")); + + UNIT_ASSERT(!IsTrue("")); + UNIT_ASSERT(!IsTrue("tr")); + UNIT_ASSERT(!IsTrue("foobar")); + } + + Y_UNIT_TEST(TestIsFalse) { + UNIT_ASSERT(IsFalse("0")); + UNIT_ASSERT(IsFalse("no")); + UNIT_ASSERT(IsFalse("off")); + UNIT_ASSERT(IsFalse("false")); + UNIT_ASSERT(IsFalse("f")); + UNIT_ASSERT(IsFalse("net")); + + UNIT_ASSERT(!IsFalse("")); + UNIT_ASSERT(!IsFalse("fa")); + UNIT_ASSERT(!IsFalse("foobar")); + } + + Y_UNIT_TEST(TestIsNumber) { + UNIT_ASSERT(IsNumber("0")); + UNIT_ASSERT(IsNumber("12345678901234567890")); + UNIT_ASSERT(!IsNumber("1234567890a")); + UNIT_ASSERT(!IsNumber("12345xx67890a")); + UNIT_ASSERT(!IsNumber("foobar")); + UNIT_ASSERT(!IsNumber("")); + + UNIT_ASSERT(IsNumber(u"0")); + UNIT_ASSERT(IsNumber(u"12345678901234567890")); + UNIT_ASSERT(!IsNumber(u"1234567890a")); + UNIT_ASSERT(!IsNumber(u"12345xx67890a")); + UNIT_ASSERT(!IsNumber(u"foobar")); + } + + Y_UNIT_TEST(TestIsHexNumber) { + UNIT_ASSERT(IsHexNumber("0")); + UNIT_ASSERT(IsHexNumber("aaaadddAAAAA")); + UNIT_ASSERT(IsHexNumber("0123456789ABCDEFabcdef")); + UNIT_ASSERT(IsHexNumber("12345678901234567890")); + UNIT_ASSERT(IsHexNumber("1234567890a")); + 
UNIT_ASSERT(!IsHexNumber("12345xx67890a")); + UNIT_ASSERT(!IsHexNumber("foobar")); + UNIT_ASSERT(!IsHexNumber(TString())); + + UNIT_ASSERT(IsHexNumber(u"0")); + UNIT_ASSERT(IsHexNumber(u"aaaadddAAAAA")); + UNIT_ASSERT(IsHexNumber(u"0123456789ABCDEFabcdef")); + UNIT_ASSERT(IsHexNumber(u"12345678901234567890")); + UNIT_ASSERT(IsHexNumber(u"1234567890a")); + UNIT_ASSERT(!IsHexNumber(u"12345xx67890a")); + UNIT_ASSERT(!IsHexNumber(u"foobar")); + UNIT_ASSERT(!IsHexNumber(TUtf16String())); + } +} diff --git a/util/string/ut/ya.make b/util/string/ut/ya.make new file mode 100644 index 0000000000..6e80812825 --- /dev/null +++ b/util/string/ut/ya.make @@ -0,0 +1,24 @@ +UNITTEST_FOR(util) + +OWNER(g:util) +SUBSCRIBER(g:util-subscribers) + +SRCS( + string/builder_ut.cpp + string/cast_ut.cpp + string/escape_ut.cpp + string/join_ut.cpp + string/hex_ut.cpp + string/printf_ut.cpp + string/split_ut.cpp + string/strip_ut.cpp + string/subst_ut.cpp + string/type_ut.cpp + string/util_ut.cpp + string/vector_ut.cpp + string/ascii_ut.cpp +) + +INCLUDE(${ARCADIA_ROOT}/util/tests/ya_util_tests.inc) + +END() diff --git a/util/string/util.cpp b/util/string/util.cpp new file mode 100644 index 0000000000..b14f20bf75 --- /dev/null +++ b/util/string/util.cpp @@ -0,0 +1,72 @@ +#include "util.h" + +#include <util/generic/utility.h> + +#include <cstdio> +#include <cstdarg> +#include <cstdlib> + +int a2i(const TString& s) { + return atoi(s.c_str()); +} + +//============================== span ===================================== + +void str_spn::init(const char* charset, bool extended) { + // chars_table_1 is necessary to avoid some unexpected + // multi-threading issues + ui8 chars_table_1[256]; + memset(chars_table_1, 0, sizeof(chars_table_1)); + if (extended) { + for (const char* cs = charset; *cs; cs++) { + if (cs[1] == '-' && cs[2] != 0) { + for (int c = (ui8)*cs; c <= (ui8)cs[2]; c++) { + chars_table_1[c] = 1; + } + cs += 2; + continue; + } + chars_table_1[(ui8)*cs] = 1; + } + } else { + for (; 
*charset; charset++) { + chars_table_1[(ui8)*charset] = 1; + } + } + memcpy(chars_table, chars_table_1, 256); + chars_table_1[0] = 1; + for (int n = 0; n < 256; n++) { + c_chars_table[n] = !chars_table_1[n]; + } +} + +Tr::Tr(const char* from, const char* to) { + for (size_t n = 0; n < 256; n++) { + Map[n] = (char)n; + } + for (; *from && *to; from++, to++) { + Map[(ui8)*from] = *to; + } +} + +size_t Tr::FindFirstChangePosition(const TString& str) const { + for (auto it = str.begin(); it != str.end(); ++it) { + if (ConvertChar(*it) != *it) { + return it - str.begin(); + } + } + + return TString::npos; +} + +void Tr::Do(TString& str) const { + const size_t changePosition = FindFirstChangePosition(str); + + if (changePosition == TString::npos) { + return; + } + + for (auto it = str.begin() + changePosition; it != str.end(); ++it) { + *it = ConvertChar(*it); + } +} diff --git a/util/string/util.h b/util/string/util.h new file mode 100644 index 0000000000..0d77a5042b --- /dev/null +++ b/util/string/util.h @@ -0,0 +1,195 @@ +#pragma once + +//THIS FILE A COMPAT STUB HEADER + +#include <cstring> +#include <cstdarg> +#include <algorithm> + +#include <util/system/defaults.h> +#include <util/generic/string.h> +#include <util/generic/strbuf.h> + +/// @addtogroup Strings_Miscellaneous +/// @{ +int a2i(const TString& s); + +/// Removes the last character if it is equal to c. +template <class T> +inline void RemoveIfLast(T& s, int c) { + const size_t length = s.length(); + if (length && s[length - 1] == c) + s.remove(length - 1); +} + +/// Adds lastCh symbol to the the of the string if it is not already there. +inline void addIfNotLast(TString& s, int lastCh) { + size_t len = s.length(); + if (!len || s[len - 1] != lastCh) { + s.append(char(lastCh)); + } +} + +/// @details Finishes the string with lastCh1 if lastCh2 is not present in the string and lastCh1 is not already at the end of the string. 
+/// Else, if lastCh2 is not equal to the symbol before the last, it finishes the string with lastCh2. +/// @todo ?? Define, when to apply the function. Is in use several times for URLs parsing. +inline void addIfAbsent(TString& s, char lastCh1, char lastCh2) { + size_t pos = s.find(lastCh2); + if (pos == TString::npos) { + //s.append((char)lastCh1); + addIfNotLast(s, lastCh1); + } else if (pos < s.length() - 1) { + addIfNotLast(s, lastCh2); + } +} + +/// @} + +/* + * ------------------------------------------------------------------ + * + * A fast implementation of glibc's functions; + * strspn, strcspn and strpbrk. + * + * ------------------------------------------------------------------ + */ +struct ui8_256 { + // forward chars table + ui8 chars_table[256]; + // reverse (for c* functions) chars table + ui8 c_chars_table[256]; +}; + +class str_spn: public ui8_256 { +public: + explicit str_spn(const char* charset, bool extended = false) { + // exteneded: if true, treat charset string more like + // interior of brackets [ ], e.g. "a-z0-9" + init(charset, extended); + } + + /// Return first character in table, like strpbrk() + /// That is, skip all characters not in table + /// [DIFFERENCE FOR NOT_FOUND CASE: Returns end of string, not NULL] + const char* brk(const char* s) const { + while (c_chars_table[(ui8)*s]) + ++s; + return s; + } + + const char* brk(const char* s, const char* e) const { + while (s < e && c_chars_table[(ui8)*s]) + ++s; + return s; + } + + /// Return first character not in table, like strpbrk() for inverted table. + /// That is, skip all characters in table + const char* cbrk(const char* s) const { + while (chars_table[(ui8)*s]) + ++s; + return s; + } + + const char* cbrk(const char* s, const char* e) const { + while (s < e && chars_table[(ui8)*s]) + ++s; + return s; + } + + /// Offset of the first character not in table, like strspn(). 
+ size_t spn(const char* s) const { + return cbrk(s) - s; + } + + size_t spn(const char* s, const char* e) const { + return cbrk(s, e) - s; + } + + /// Offset of the first character in table, like strcspn(). + size_t cspn(const char* s) const { + return brk(s) - s; + } + + size_t cspn(const char* s, const char* e) const { + return brk(s, e) - s; + } + + char* brk(char* s) const { + return const_cast<char*>(brk((const char*)s)); + } + + char* cbrk(char* s) const { + return const_cast<char*>(cbrk((const char*)s)); + } + + /// See strsep [BUT argument is *&, not **] + char* sep(char*& s) const { + char sep_char; // unused; + return sep(s, sep_char); + } + + /// strsep + remember character that was destroyed + char* sep(char*& s, char& sep_char) const { + if (!s) + return nullptr; + char* ret = s; + char* next = brk(ret); + if (*next) { + sep_char = *next; + *next = 0; + s = next + 1; + } else { + sep_char = 0; + s = nullptr; + } + return ret; + } + +protected: + void init(const char* charset, bool extended); + str_spn() = default; +}; + +// an analogue of tr/$from/$to/ +class Tr { +public: + Tr(const char* from, const char* to); + + char ConvertChar(char ch) const { + return Map[(ui8)ch]; + } + + void Do(char* s) const { + for (; *s; s++) + *s = ConvertChar(*s); + } + void Do(const char* src, char* dst) const { + for (; *src; src++) + *dst++ = ConvertChar(*src); + *dst = 0; + } + void Do(char* s, size_t l) const { + for (size_t i = 0; i < l && s[i]; i++) + s[i] = ConvertChar(s[i]); + } + void Do(TString& str) const; + +private: + char Map[256]; + + size_t FindFirstChangePosition(const TString& str) const; +}; + +// Removes all occurrences of given character from string +template <typename TStringType> +void RemoveAll(TStringType& str, typename TStringType::char_type ch) { + size_t pos = str.find(ch); // 'find' to avoid cloning of string in 'TString.begin()' + if (pos == TStringType::npos) + return; + + typename TStringType::iterator begin = str.begin(); + typename 
TStringType::iterator end = begin + str.length(); + typename TStringType::iterator it = std::remove(begin + pos, end, ch); + str.erase(it, end); +} diff --git a/util/string/util_ut.cpp b/util/string/util_ut.cpp new file mode 100644 index 0000000000..18a2d8e195 --- /dev/null +++ b/util/string/util_ut.cpp @@ -0,0 +1,46 @@ +#include "util.h" + +#include <library/cpp/testing/unittest/registar.h> + +class TStrUtilTest: public TTestBase { + UNIT_TEST_SUITE(TStrUtilTest); + UNIT_TEST(TestSpn); + UNIT_TEST(TestRemoveAll); + UNIT_TEST_SUITE_END(); + +public: + void TestSpn() { + str_spn rul("a-z", true); + char s[] = "!@#$ab%^&c+-"; + UNIT_ASSERT_EQUAL(rul.brk(s), s + 4); + UNIT_ASSERT_EQUAL(rul.brk(s + 4), s + 4); + UNIT_ASSERT_EQUAL(rul.brk(s + 10), s + 12); + char* s1 = s; + UNIT_ASSERT_EQUAL(strcmp(rul.sep(s1), "!@#$"), 0); + UNIT_ASSERT_EQUAL(strcmp(rul.sep(s1), ""), 0); + UNIT_ASSERT_EQUAL(strcmp(rul.sep(s1), "%^&"), 0); + UNIT_ASSERT_EQUAL(strcmp(rul.sep(s1), "+-"), 0); + UNIT_ASSERT_EQUAL(rul.sep(s1), nullptr); + } + + void TestRemoveAll() { + static const struct T { + const char* Str; + char Ch; + const char* Result; + } tests[] = { + {"", 'x', ""}, + {"hello world", 'h', "ello world"}, + {"hello world", 'l', "heo word"}, + {"hello world", 'x', "hello world"}, + }; + + for (const T* t = tests; t != std::end(tests); ++t) { + TString str(t->Str); + RemoveAll(str, t->Ch); + UNIT_ASSERT_EQUAL(t->Result, str); + } + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStrUtilTest); diff --git a/util/string/vector.cpp b/util/string/vector.cpp new file mode 100644 index 0000000000..9ba401f0a2 --- /dev/null +++ b/util/string/vector.cpp @@ -0,0 +1,91 @@ +#include "util.h" +#include "split.h" +#include "vector.h" + +#include <util/system/defaults.h> + +template <class TConsumer, class TDelim, typename TChr> +static inline void DoSplit2(TConsumer& c, TDelim& d, const TBasicStringBuf<TChr> str, int) { + SplitString(str.data(), str.data() + str.size(), d, c); +} + +template <class TConsumer, 
class TDelim, typename TChr> +static inline void DoSplit1(TConsumer& cc, TDelim& d, const TBasicStringBuf<TChr> str, int opts) { + if (opts & KEEP_EMPTY_TOKENS) { + DoSplit2(cc, d, str, opts); + } else { + TSkipEmptyTokens<TConsumer> sc(&cc); + + DoSplit2(sc, d, str, opts); + } +} + +template <class C, class TDelim, typename TChr> +static inline void DoSplit0(C* res, const TBasicStringBuf<TChr> str, TDelim& d, size_t maxFields, int options) { + using TStringType = std::conditional_t<std::is_same<TChr, wchar16>::value, TUtf16String, TString>; + res->clear(); + + if (!str.data()) { + return; + } + + using TConsumer = TContainerConsumer<C>; + TConsumer cc(res); + + if (maxFields) { + TLimitingConsumer<TConsumer, const TChr> lc(maxFields, &cc); + + DoSplit1(lc, d, str, options); + + if (lc.Last) { + res->push_back(TStringType(lc.Last, str.data() + str.size() - lc.Last)); + } + } else { + DoSplit1(cc, d, str, options); + } +} + +template <typename TChr> +static void SplitStringImplT(TVector<std::conditional_t<std::is_same<TChr, wchar16>::value, TUtf16String, TString>>* res, + const TBasicStringBuf<TChr> str, const TChr* delim, size_t maxFields, int options) { + if (!*delim) { + return; + } + + if (*(delim + 1)) { + TStringDelimiter<const TChr> d(delim, std::char_traits<TChr>::length(delim)); + + DoSplit0(res, str, d, maxFields, options); + } else { + TCharDelimiter<const TChr> d(*delim); + + DoSplit0(res, str, d, maxFields, options); + } +} + +void ::NPrivate::SplitStringImpl(TVector<TString>* res, const char* ptr, const char* delim, size_t maxFields, int options) { + return SplitStringImplT<char>(res, TStringBuf(ptr), delim, maxFields, options); +} + +void ::NPrivate::SplitStringImpl(TVector<TString>* res, const char* ptr, size_t len, const char* delim, size_t maxFields, int options) { + return SplitStringImplT<char>(res, TStringBuf(ptr, len), delim, maxFields, options); +} + +void ::NPrivate::SplitStringImpl(TVector<TUtf16String>* res, const wchar16* ptr, const 
wchar16* delimiter, size_t maxFields, int options) { + return SplitStringImplT<wchar16>(res, TWtringBuf(ptr), delimiter, maxFields, options); +} + +void ::NPrivate::SplitStringImpl(TVector<TUtf16String>* res, const wchar16* ptr, size_t len, const wchar16* delimiter, size_t maxFields, int options) { + return SplitStringImplT<wchar16>(res, TWtringBuf(ptr, len), delimiter, maxFields, options); +} + +TUtf16String JoinStrings(const TVector<TUtf16String>& v, const TWtringBuf delim) { + return JoinStrings(v.begin(), v.end(), delim); +} + +TUtf16String JoinStrings(const TVector<TUtf16String>& v, size_t index, size_t count, const TWtringBuf delim) { + const size_t f = Min(index, v.size()); + const size_t l = f + Min(count, v.size() - f); + + return JoinStrings(v.begin() + f, v.begin() + l, delim); +} diff --git a/util/string/vector.h b/util/string/vector.h new file mode 100644 index 0000000000..e36c348bbe --- /dev/null +++ b/util/string/vector.h @@ -0,0 +1,132 @@ +#pragma once + +#include "cast.h" +#include "split.h" + +#include <util/generic/map.h> +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/string/cast.h> +#include <util/system/yassert.h> + +#define KEEP_EMPTY_TOKENS 0x01 + +// +// NOTE: Check StringSplitter below to get more convenient split string interface. 
+ +namespace NPrivate { + + void SplitStringImpl(TVector<TString>* res, const char* ptr, + const char* delimiter, size_t maxFields, int options); + void SplitStringImpl(TVector<TString>* res, const char* ptr, size_t len, + const char* delimiter, size_t maxFields, int options); + + void SplitStringImpl(TVector<TUtf16String>* res, const wchar16* ptr, + const wchar16* delimiter, size_t maxFields, int options); + void SplitStringImpl(TVector<TUtf16String>* res, const wchar16* ptr, size_t len, + const wchar16* delimiter, size_t maxFields, int options); + + template <typename C> + struct TStringDeducer; + + template <> + struct TStringDeducer<char> { + using type = TString; + }; + + template <> + struct TStringDeducer<wchar16> { + using type = TUtf16String; + }; +} + +template <typename C> +TVector<typename ::NPrivate::TStringDeducer<C>::type> +SplitString(const C* ptr, const C* delimiter, + size_t maxFields = 0, int options = 0) { + TVector<typename ::NPrivate::TStringDeducer<C>::type> res; + ::NPrivate::SplitStringImpl(&res, ptr, delimiter, maxFields, options); + return res; +} + +template <typename C> +TVector<typename ::NPrivate::TStringDeducer<C>::type> +SplitString(const C* ptr, size_t len, const C* delimiter, + size_t maxFields = 0, int options = 0) { + TVector<typename ::NPrivate::TStringDeducer<C>::type> res; + ::NPrivate::SplitStringImpl(&res, ptr, len, delimiter, maxFields, options); + return res; +} + +template <typename C> +TVector<typename ::NPrivate::TStringDeducer<C>::type> +SplitString(const typename ::NPrivate::TStringDeducer<C>::type& str, const C* delimiter, + size_t maxFields = 0, int options = 0) { + return SplitString(str.data(), str.size(), delimiter, maxFields, options); +} + +template <class TIter> +inline TString JoinStrings(TIter begin, TIter end, const TStringBuf delim) { + if (begin == end) + return TString(); + + TString result = ToString(*begin); + + for (++begin; begin != end; ++begin) { + result.append(delim); + 
result.append(ToString(*begin)); + } + + return result; +} + +template <class TIter> +inline TUtf16String JoinStrings(TIter begin, TIter end, const TWtringBuf delim) { + if (begin == end) + return TUtf16String(); + + TUtf16String result = ToWtring(*begin); + + for (++begin; begin != end; ++begin) { + result.append(delim); + result.append(ToWtring(*begin)); + } + + return result; +} + +/// Concatenates elements of given TVector<TString>. +inline TString JoinStrings(const TVector<TString>& v, const TStringBuf delim) { + return JoinStrings(v.begin(), v.end(), delim); +} + +inline TString JoinStrings(const TVector<TString>& v, size_t index, size_t count, const TStringBuf delim) { + Y_ASSERT(index + count <= v.size() && "JoinStrings(): index or count out of range"); + return JoinStrings(v.begin() + index, v.begin() + index + count, delim); +} + +template <typename T> +inline TString JoinVectorIntoString(const TVector<T>& v, const TStringBuf delim) { + return JoinStrings(v.begin(), v.end(), delim); +} + +template <typename T> +inline TString JoinVectorIntoString(const TVector<T>& v, size_t index, size_t count, const TStringBuf delim) { + Y_ASSERT(index + count <= v.size() && "JoinVectorIntoString(): index or count out of range"); + return JoinStrings(v.begin() + index, v.begin() + index + count, delim); +} + +TUtf16String JoinStrings(const TVector<TUtf16String>& v, const TWtringBuf delim); +TUtf16String JoinStrings(const TVector<TUtf16String>& v, size_t index, size_t count, const TWtringBuf delim); + +//! 
Converts vector of strings to vector of type T variables +template <typename T, typename TStringType> +TVector<T> Scan(const TVector<TStringType>& input) { + TVector<T> output; + output.reserve(input.size()); + for (int i = 0; i < input.ysize(); ++i) { + output.push_back(FromString<T>(input[i])); + } + return output; +} diff --git a/util/string/vector_ut.cpp b/util/string/vector_ut.cpp new file mode 100644 index 0000000000..817120f268 --- /dev/null +++ b/util/string/vector_ut.cpp @@ -0,0 +1,38 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <util/charset/wide.h> + +#include "cast.h" +#include "vector.h" + +Y_UNIT_TEST_SUITE(TStringJoinTest) { + Y_UNIT_TEST(Test1) { + TVector<TUtf16String> v; + + UNIT_ASSERT_EQUAL(JoinStrings(v, ToWtring("")), ToWtring("")); + } + + Y_UNIT_TEST(Test2) { + TVector<TUtf16String> v; + + v.push_back(ToWtring("1")); + v.push_back(ToWtring("2")); + + UNIT_ASSERT_EQUAL(JoinStrings(v, ToWtring(" ")), ToWtring("1 2")); + } + + Y_UNIT_TEST(Test3) { + TVector<TUtf16String> v; + + v.push_back(ToWtring("1")); + v.push_back(ToWtring("2")); + + UNIT_ASSERT_EQUAL(JoinStrings(v, 1, 10, ToWtring(" ")), ToWtring("2")); + } + + Y_UNIT_TEST(TestJoinWStrings) { + const TUtf16String str = u"Яндекс"; + const TVector<TUtf16String> v(1, str); + + UNIT_ASSERT_EQUAL(JoinStrings(v, TUtf16String()), str); + } +} diff --git a/util/string/ya.make b/util/string/ya.make new file mode 100644 index 0000000000..79c9498ddd --- /dev/null +++ b/util/string/ya.make @@ -0,0 +1,6 @@ +OWNER(g:util) +SUBSCRIBER(g:util-subscribers) + +RECURSE_FOR_TESTS( + ut +) |