diff options
| author | babenko <[email protected]> | 2025-12-24 23:29:35 +0300 |
|---|---|---|
| committer | babenko <[email protected]> | 2025-12-24 23:45:58 +0300 |
| commit | 4dbf62fd2f8cc5ece53cc1446561cf71476bdd12 (patch) | |
| tree | 9aab73c043bf8f6dc177b06f69dab4336dadfcba | |
| parent | dba8986f6b1a5fc7c4f230bee510113995a48970 (diff) | |
Explicitly use TCowString in TYsonString
Для ревьюеров: изменения вне `library/cpp/yt` убирают `using namespace NYT` из хедера. Эта конструкция приводила к конфликту имён между глобальным неймспейсом и `NYT` и, как следствие, к ошибкам сборки.
commit_hash:f598da488a6dd8671af9f1f02870ab5612ae46eb
| -rw-r--r-- | library/cpp/containers/cow_string/cow_string.cpp | 280 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/cow_string.h | 1047 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/cow_string_ut.cpp | 1268 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/output.cpp | 46 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/reverse.cpp | 32 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/reverse.h | 16 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/str_stl.h | 67 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/subst.cpp | 182 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/subst.h | 31 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/ut/ya.make | 7 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/ut_medium/cow_string_medium_ut.cpp | 55 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/ut_medium/ya.make | 9 | ||||
| -rw-r--r-- | library/cpp/containers/cow_string/ya.make | 15 | ||||
| -rw-r--r-- | library/cpp/yt/yson_string/string.cpp | 25 | ||||
| -rw-r--r-- | library/cpp/yt/yson_string/string.h | 4 | ||||
| -rw-r--r-- | library/cpp/yt/yson_string/ya.make | 1 |
16 files changed, 3066 insertions, 19 deletions
diff --git a/library/cpp/containers/cow_string/cow_string.cpp b/library/cpp/containers/cow_string/cow_string.cpp
new file mode 100644
index 00000000000..87dc0ad99ec
--- /dev/null
+++ b/library/cpp/containers/cow_string/cow_string.cpp
@@ -0,0 +1,335 @@
+#include "cow_string.h"
+
+#include <util/string/ascii.h>
+#include <util/system/sanitizers.h>
+#include <util/system/sys_alloc.h>
+#include <util/charset/wide.h>
+
+#include <iostream>
+
+// Applies `f` to every symbol of [p, pe), reading symbols with `ReadSymbol`.
+// With `stopOnFirstModification` the scan is read-only: it returns true as soon as `f` would
+// change a symbol and leaves `p` pointing at that symbol. Otherwise every changed symbol is
+// written back with `WriteSymbol` and the function returns false once `p` reaches `pe`.
+template <bool stopOnFirstModification, typename TCharType, typename F>
+static bool ModifySequence(TCharType*& p, const TCharType* const pe, F&& f) {
+    while (p != pe) {
+        const auto symbol = ReadSymbol(p, pe);
+        const auto modified = f(symbol);
+        if (symbol != modified) {
+            if (stopOnFirstModification) {
+                return true;
+            }
+
+            WriteSymbol(modified, p); // also moves `p` forward
+        } else {
+            p = SkipSymbol(p, pe);
+        }
+    }
+
+    return false;
+}
+
+// Same traversal, but the mapped symbols are written to a separate cursor `out`.
+// With `stopOnFirstModification` unchanged symbols are copied to `out` and the function returns
+// true (without writing) at the first symbol `f` would change; otherwise all symbols are mapped
+// and written, and the result is false.
+template <bool stopOnFirstModification, typename TCharType, typename F>
+static bool ModifySequence(const TCharType*& p, const TCharType* const pe, TCharType*& out, F&& f) {
+    while (p != pe) {
+        const auto symbol = stopOnFirstModification ? ReadSymbol(p, pe) : ReadSymbolAndAdvance(p, pe);
+        const auto modified = f(symbol);
+
+        if (stopOnFirstModification) {
+            if (symbol != modified) {
+                return true;
+            }
+
+            p = SkipSymbol(p, pe);
+        }
+
+        WriteSymbol(modified, out);
+    }
+
+    return false;
+}
+
+// Calls `text.Detach()` and re-bases `p` and `pe` onto the returned buffer, keeping the offset of
+// `p` from the start of the string and the distance between `p` and `pe`.
+template <class TStringType>
+static void DetachAndFixPointers(TStringType& text, typename TStringType::value_type*& p, const typename TStringType::value_type*& pe) {
+    const auto pos = p - text.data();
+    const auto count = pe - p;
+    p = text.Detach() + pos;
+    pe = p + count;
+}
+
+// Applies `f` to the symbols of `text` in [pos, pos + count), with both values clamped to the
+// string size. The string is scanned read-only first and detached only if some symbol changes.
+// Returns true if the string was modified.
+template <class TStringType, typename F>
+static bool ModifyStringSymbolwise(TStringType& text, size_t pos, size_t count, F&& f) {
+    // TODO(yazevnul): this is done for consistency with `TUtf16String::to_lower` and friends
+    // at r2914050, maybe worth replacing them with asserts. Also see the same code in `ToTitle`.
+    pos = pos < text.size() ? pos : text.size();
+    count = count < text.size() - pos ? count : text.size() - pos;
+
+    // TUtf16String is refcounted and its `data` method returns a pointer to constant memory.
+    // To simplify the code we do a `const_cast`, though the first write to the memory is done only
+    // after we call `Detach()` and get a pointer to a writable piece of memory.
+    auto* p = const_cast<typename TStringType::value_type*>(text.data() + pos);
+    const auto* pe = text.data() + pos + count;
+
+    if (ModifySequence<true>(p, pe, f)) {
+        DetachAndFixPointers(text, p, pe);
+        ModifySequence<false>(p, pe, f);
+        return true;
+    }
+
+    return false;
+}
+
+// Writes the `s.size()` characters starting at `s.data()` to `os`.
+std::ostream& operator<<(std::ostream& os, const TCowString& s) {
+    return os.write(s.data(), s.size());
+}
+
+// Reads into the underlying string obtained from `s.MutRef()`.
+std::istream& operator>>(std::istream& is, TCowString& s) {
+    return is >> s.MutRef();
+}
+
+// Lower-cases characters via `AsciiToLower` through `Transform`; returns the result of `Transform`.
+template <>
+bool TBasicCowString<char, std::char_traits<char>>::to_lower(size_t pos, size_t n) {
+    return Transform([](size_t, char c) { return AsciiToLower(c); }, pos, n);
+}
+
+// Upper-cases characters via `AsciiToUpper` through `Transform`; returns the result of `Transform`.
+template <>
+bool TBasicCowString<char, std::char_traits<char>>::to_upper(size_t pos, size_t n) {
+    return Transform([](size_t, char c) { return AsciiToUpper(c); }, pos, n);
+}
+
+// Upper-cases the character at `pos` and lower-cases the following `n - 1` characters.
+// Returns true if either step reported a change; returns false immediately when `n == 0`.
+template <>
+bool TBasicCowString<char, std::char_traits<char>>::to_title(size_t pos, size_t n) {
+    if (n == 0) {
+        return false;
+    }
+    bool changed = to_upper(pos, 1);
+    return to_lower(pos + 1, n - 1) || changed;
+}
+
+// Grows the string by `s.size()` and fills the new tail with the characters of `s`, each
+// converted with `static_cast<wchar16>`.
+template <>
+TUtf16CowString&
+TBasicCowString<wchar16, std::char_traits<wchar16>>::AppendAscii(const ::TStringBuf& s) {
+    ReserveAndResize(size() + s.size());
+
+    auto dst = begin() + size() - s.size();
+
+    for (const char* src = s.data(); dst != end(); ++dst, ++src) {
+        *dst = static_cast<wchar16>(*src);
+    }
+
+    return *this;
+}
+
+// Appends `s` decoded with `UTF8ToWideImpl`. The string is first grown by `s.size() * 4` elements
+// and then trimmed to the number of elements actually written.
+// Throws yexception if decoding stops before the end of `s`.
+// NOTE(review): on that failure path the string is left at the enlarged size - confirm whether
+// callers need the original contents restored.
+template <>
+TUtf16CowString&
+TBasicCowString<wchar16, std::char_traits<wchar16>>::AppendUtf8(const ::TStringBuf& s) {
+    size_t oldSize = size();
+    ReserveAndResize(size() + s.size() * 4);
+    size_t written = 0;
+    size_t pos = UTF8ToWideImpl(s.data(), s.size(), begin() + oldSize, written);
+    if (pos != s.size()) {
+        ythrow yexception() << "failed to decode UTF-8 string at pos " << pos << ::NDetail::InStringMsg(s.data(), s.size());
+    }
+    resize(oldSize + written);
+
+    return *this;
+}
+
+// Lower-cases symbols in the clamped range [pos, pos + n) using `ToLower`; returns true if changed.
+template <>
+bool TBasicCowString<wchar16, std::char_traits<wchar16>>::to_lower(size_t pos, size_t n) {
+    const auto f = [](const wchar32 s) { return ToLower(s); };
+    return ModifyStringSymbolwise(*this, pos, n, f);
+}
+
+// Upper-cases symbols in the clamped range [pos, pos + n) using `ToUpper`; returns true if changed.
+template <>
+bool TBasicCowString<wchar16, std::char_traits<wchar16>>::to_upper(size_t pos, size_t n) {
+    const auto f = [](const wchar32 s) { return ToUpper(s); };
+    return ModifyStringSymbolwise(*this, pos, n, f);
+}
+
+// Converts the first symbol of the clamped range [pos, pos + nn) with `ToTitle` and the rest with
+// `ToLower`. Detaches only when something actually changes; returns true in that case.
+template <>
+bool TBasicCowString<wchar16, std::char_traits<wchar16>>::to_title(size_t pos, size_t nn) {
+    if (!*this) {
+        return false;
+    }
+
+    pos = pos < this->size() ? pos : this->size();
+    nn = nn < this->size() - pos ? nn : this->size() - pos;
+
+    // Empty range: nothing to convert. Mirrors the `n == 0` check of the `char` specialization
+    // and keeps `ReadSymbol`/`SkipSymbol` from being called on an empty [p, pe).
+    if (nn == 0) {
+        return false;
+    }
+
+    const auto toLower = [](const wchar32 s) { return ToLower(s); };
+
+    auto* p = const_cast<wchar16*>(this->data() + pos);
+    const auto* pe = this->data() + pos + nn;
+
+    const auto firstSymbol = ReadSymbol(p, pe);
+    if (firstSymbol == ToTitle(firstSymbol)) {
+        p = SkipSymbol(p, pe);
+        if (ModifySequence<true>(p, pe, toLower)) {
+            DetachAndFixPointers(*this, p, pe);
+            ModifySequence<false>(p, pe, toLower);
+            return true;
+        }
+    } else {
+        DetachAndFixPointers(*this, p, pe);
+        WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
+        ModifySequence<false>(p, pe, toLower);
+        return true;
+    }
+
+    return false;
+}
+
+// Grows the string by `s.size()` and fills the new tail with the characters of `s`, each
+// converted with `static_cast<wchar32>`.
+template <>
+TUtf32CowString&
+TBasicCowString<wchar32, std::char_traits<wchar32>>::AppendAscii(const ::TStringBuf& s) {
+    ReserveAndResize(size() + s.size());
+
+    auto dst = begin() + size() - s.size();
+
+    for (const char* src = s.data(); dst != end(); ++dst, ++src) {
+        *dst = static_cast<wchar32>(*src);
+    }
+
+    return *this;
+}
+
+// Appends `s` encoded with `WideToUTF8`: grows the string by `WideToUTF8BufferSize(s.size())`,
+// converts into the new tail and trims to the number of bytes written.
+template <>
+TBasicCowString<char, std::char_traits<char>>&
+TBasicCowString<char, std::char_traits<char>>::AppendUtf16(const ::TWtringBuf& s) {
+    const size_t oldSize = size();
+    ReserveAndResize(size() + WideToUTF8BufferSize(s.size()));
+
+    size_t written = 0;
+    WideToUTF8(s.data(), s.size(), begin() + oldSize, written);
+
+    resize(oldSize + written);
+
+    return *this;
+}
+
+// Appends `s` decoded with `UTF8ToWideImpl`; grows by `s.size() * 4` elements first and trims to
+// the number written. Throws yexception if decoding stops before the end of `s`.
+template <>
+TUtf32CowString&
+TBasicCowString<wchar32, std::char_traits<wchar32>>::AppendUtf8(const ::TStringBuf& s) {
+    size_t oldSize = size();
+    ReserveAndResize(size() + s.size() * 4);
+    size_t written = 0;
+    size_t pos = UTF8ToWideImpl(s.data(), s.size(), begin() + oldSize, written);
+    if (pos != s.size()) {
+        ythrow yexception() << "failed to decode UTF-8 string at pos " << pos << ::NDetail::InStringMsg(s.data(), s.size());
+    }
+    resize(oldSize + written);
+
+    return *this;
+}
+
+// Appends `s` converted with `NDetail::UTF16ToUTF32ImplScalar`; grows by `s.size() * 2` elements
+// first and trims to the number of elements written.
+template <>
+TUtf32CowString&
+TBasicCowString<wchar32, std::char_traits<wchar32>>::AppendUtf16(const ::TWtringBuf& s) {
+    size_t oldSize = size();
+    ReserveAndResize(size() + s.size() * 2);
+
+    wchar32* oldEnd = begin() + oldSize;
+    wchar32* end = oldEnd;
+    NDetail::UTF16ToUTF32ImplScalar(s.data(), s.data() + s.size(), end);
+    size_t written = end - oldEnd;
+
+    resize(oldSize + written);
+
+    return *this;
+}
+
+// Lower-cases symbols in the clamped range [pos, pos + n) using `ToLower`; returns true if changed.
+template <>
+bool TBasicCowString<wchar32, std::char_traits<wchar32>>::to_lower(size_t pos, size_t n) {
+    const auto f = [](const wchar32 s) { return ToLower(s); };
+    return ModifyStringSymbolwise(*this, pos, n, f);
+}
+
+// Upper-cases symbols in the clamped range [pos, pos + n) using `ToUpper`; returns true if changed.
+template <>
+bool TBasicCowString<wchar32, std::char_traits<wchar32>>::to_upper(size_t pos, size_t n) {
+    const auto f = [](const wchar32 s) { return ToUpper(s); };
+    return ModifyStringSymbolwise(*this, pos, n, f);
+}
+
+// Converts the first symbol of the clamped range [pos, pos + n) with `ToTitle` and the rest with
+// `ToLower`. Detaches only when something actually changes; returns true in that case.
+template <>
+bool TBasicCowString<wchar32, std::char_traits<wchar32>>::to_title(size_t pos, size_t n) {
+    if (!*this) {
+        return false;
+    }
+
+    pos = pos < this->size() ? pos : this->size();
+    n = n < this->size() - pos ? n : this->size() - pos;
+
+    // Empty range: nothing to convert. Otherwise `p += 1` below would step past `pe` and the
+    // `p != pe` loop in ModifySequence would leave the range, or the symbol at `pos` would be
+    // rewritten although `n == 0`.
+    if (n == 0) {
+        return false;
+    }
+
+    const auto toLower = [](const wchar32 s) { return ToLower(s); };
+
+    auto* p = const_cast<wchar32*>(this->data() + pos);
+    const auto* pe = this->data() + pos + n;
+
+    const auto firstSymbol = *p;
+    if (firstSymbol == ToTitle(firstSymbol)) {
+        p += 1;
+        if (ModifySequence<true>(p, pe, toLower)) {
+            DetachAndFixPointers(*this, p, pe);
+            ModifySequence<false>(p, pe, toLower);
+            return true;
+        }
+    } else {
+        DetachAndFixPointers(*this, p, pe);
+        WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
+        ModifySequence<false>(p, pe, toLower);
+        return true;
+    }
+
+    return false;
+}
diff --git a/library/cpp/containers/cow_string/cow_string.h b/library/cpp/containers/cow_string/cow_string.h
new file mode 100644
index 00000000000..597d4c2f06b
--- /dev/null
+++ b/library/cpp/containers/cow_string/cow_string.h
@@ -0,0 +1,1047 @@
+#pragma once
+
+#include <util/generic/string.h>
+
+template <typename TCharType, typename TTraits = std::char_traits<TCharType>>
+class TBasicCowString: public TStringBase<TBasicCowString<TCharType, TTraits>, TCharType, TTraits> {
+public:
+    // TODO: Move to private section
+    using TBase = TStringBase<TBasicCowString, TCharType, TTraits>;
+    using TStringType = std::basic_string<TCharType, TTraits>;
+    using TStdStr = TStdString<TStringType>;
+    using TStorage = TIntrusivePtr<TStdStr, TStringPtrOps<TStdStr>>;
+    using reference = TBasicCharRef<TBasicCowString>;
+    using char_type = TCharType; // TODO: DROP
+    using value_type = TCharType;
+    using traits_type = TTraits;
+
+    using iterator = TCharType*;
+    using reverse_iterator = std::reverse_iterator<iterator>;
+    using typename TBase::const_iterator;
+    using typename TBase::const_reference;
+    using typename TBase::const_reverse_iterator;
+
+    struct TUninitialized {
+        explicit TUninitialized(size_t size)
+            : Size(size)
+        {
+        }
+
+        size_t Size;
+    };
+
+    size_t max_size() noexcept {
+        static size_t res = TStringType().max_size();
+
+        return res;
+    }
+
+protected:
+
TStorage S_; + + template <typename... A> + static TStorage Construct(A&&... a) { + return {new TStdStr(std::forward<A>(a)...), typename TStorage::TNoIncrement()}; + } + + static TStorage Construct() noexcept { + return TStdStr::NullStr(); + } + + TStdStr& StdStr() noexcept { + return *S_; + } + + const TStdStr& StdStr() const noexcept { + return *S_; + } + + /** + * Makes a distinct copy of this string. `IsDetached()` is always true after this call. + * + * @throw std::length_error + */ + void Clone() { + Construct(StdStr()).Swap(S_); + } + + size_t RefCount() const noexcept { + return S_.RefCount(); + } + +public: + inline const TStringType& ConstRef() const Y_LIFETIME_BOUND { + return StdStr(); + } + + inline TStringType& MutRef() Y_LIFETIME_BOUND { + Detach(); + + return StdStr(); + } + + inline const_reference operator[](size_t pos) const noexcept Y_LIFETIME_BOUND { + Y_ASSERT(pos <= length()); + + return this->data()[pos]; + } + + inline reference operator[](size_t pos) noexcept Y_LIFETIME_BOUND { + Y_ASSERT(pos <= length()); + + return reference(*this, pos); + } + + using TBase::back; + + inline reference back() noexcept Y_LIFETIME_BOUND { + Y_ASSERT(!this->empty()); + + if (Y_UNLIKELY(this->empty())) { + return reference(*this, 0); + } + + return reference(*this, length() - 1); + } + + using TBase::front; + + inline reference front() noexcept Y_LIFETIME_BOUND { + Y_ASSERT(!this->empty()); + + return reference(*this, 0); + } + + inline size_t length() const noexcept { + return ConstRef().length(); + } + + inline const TCharType* data() const noexcept Y_LIFETIME_BOUND { + return ConstRef().data(); + } + + inline const TCharType* c_str() const noexcept Y_LIFETIME_BOUND { + return ConstRef().c_str(); + } + + // ~~~ STL compatible method to obtain data pointer ~~~ + iterator begin() Y_LIFETIME_BOUND { + return &*MutRef().begin(); + } + + iterator end() Y_LIFETIME_BOUND { + return &*MutRef().end(); + } + + reverse_iterator rbegin() Y_LIFETIME_BOUND { + return 
reverse_iterator(end()); + } + + reverse_iterator rend() Y_LIFETIME_BOUND { + return reverse_iterator(begin()); + } + + const_iterator begin() const noexcept Y_LIFETIME_BOUND { + return TBase::begin(); + } + const_iterator cbegin() const noexcept Y_LIFETIME_BOUND { + return TBase::cbegin(); + } + + const_iterator cend() const noexcept Y_LIFETIME_BOUND { + return TBase::cend(); + } + + const_reverse_iterator crbegin() const noexcept Y_LIFETIME_BOUND { + return TBase::crbegin(); + } + + const_reverse_iterator crend() const noexcept Y_LIFETIME_BOUND { + return TBase::crend(); + } + + const_iterator end() const noexcept Y_LIFETIME_BOUND { + return TBase::end(); + } + + const_reverse_iterator rbegin() const noexcept Y_LIFETIME_BOUND { + return TBase::rbegin(); + } + + const_reverse_iterator rend() const noexcept Y_LIFETIME_BOUND { + return TBase::rend(); + } + + inline size_t capacity() const noexcept { + if (S_->IsNull()) { + return 0; + } + + return S_->capacity(); + } + + TCharType* Detach() Y_LIFETIME_BOUND { + if (Y_UNLIKELY(!IsDetached())) { + Clone(); + } + + return (TCharType*)S_->data(); + } + + bool IsDetached() const { + return 1 == RefCount(); + } + + // ~~~ Size and capacity ~~~ + TBasicCowString& resize(size_t n, TCharType c = ' ') Y_LIFETIME_BOUND { // remove or append + MutRef().resize(n, c); + + return *this; + } + + // ~~~ Constructor ~~~ : FAMILY0(,TBasicCowString) + TBasicCowString() noexcept + : S_(Construct()) + { + } + + inline explicit TBasicCowString(::NDetail::TReserveTag rt) + : S_(Construct<>()) + { + reserve(rt.Capacity); + } + + inline TBasicCowString(const TBasicCowString& s) + : S_(s.S_) + { + } + + inline TBasicCowString(TBasicCowString&& s) noexcept + : S_(Construct()) + { + s.swap(*this); + } + + template <typename T, typename A> + explicit inline TBasicCowString(const std::basic_string<TCharType, T, A>& s) + : TBasicCowString(s.data(), s.size()) + { + } + + template <typename T, typename A> + inline 
TBasicCowString(std::basic_string<TCharType, T, A>&& s) + : S_(s.empty() ? Construct() : Construct(std::move(s))) + { + } + + TBasicCowString(const TBasicCowString& s, size_t pos, size_t n) + : S_(n ? Construct(s, pos, n) : Construct()) + { + } + + TBasicCowString(const TCharType* pc) + : TBasicCowString(pc, TBase::StrLen(pc)) + { + } + TBasicCowString(std::nullptr_t) = delete; + + TBasicCowString(const TCharType* pc, size_t n) + : S_(n ? Construct(pc, n) : Construct()) + { + } + TBasicCowString(std::nullptr_t, size_t) = delete; + + TBasicCowString(const TCharType* pc, size_t pos, size_t n) + : TBasicCowString(pc + pos, n) + { + } + + explicit TBasicCowString(TExplicitType<TCharType> c) + : TBasicCowString(&c.Value(), 1) + { + } + explicit TBasicCowString(const reference& c) + : TBasicCowString(&c, 1) + { + } + + TBasicCowString(size_t n, TCharType c) + : S_(Construct(n, c)) + { + } + + /** + * Constructs an uninitialized string of size `uninitialized.Size`. The proper + * way to use this ctor is via `TBasicCowString::Uninitialized` factory function. + * + * @throw std::length_error + */ + TBasicCowString(TUninitialized uninitialized) + : S_(Construct<>()) + { + ReserveAndResize(uninitialized.Size); + } + + TBasicCowString(const TCharType* b, const TCharType* e) + : TBasicCowString(b, NonNegativeDistance(b, e)) + { + } + + explicit TBasicCowString(const TBasicStringBuf<TCharType, TTraits> s) + : TBasicCowString(s.data(), s.size()) + { + } + + template <typename Traits> + explicit inline TBasicCowString(const std::basic_string_view<TCharType, Traits>& s) + : TBasicCowString(s.data(), s.size()) + { + } + + /** + * WARN: + * Certain invocations of this method will result in link-time error. + * You are free to implement corresponding methods in string.cpp if you need them. 
+ */ + static TBasicCowString FromAscii(const ::TStringBuf& s) { + return TBasicCowString().AppendAscii(s); + } + + static TBasicCowString FromUtf8(const ::TStringBuf& s) { + return TBasicCowString().AppendUtf8(s); + } + + static TBasicCowString FromUtf16(const ::TWtringBuf& s) { + return TBasicCowString().AppendUtf16(s); + } + + static TBasicCowString Uninitialized(size_t n) { + return TBasicCowString(TUninitialized(n)); + } + +private: + template <typename T> + using TJoinParam = std::conditional_t<std::is_same_v<T, TCharType>, TCharType, TBasicStringBuf<TCharType, TTraits>>; + + template <typename... R> + static size_t SumLength(const TBasicStringBuf<TCharType, TTraits> s1, const R&... r) noexcept { + return s1.size() + SumLength(r...); + } + + template <typename... R> + static size_t SumLength(const TCharType /*s1*/, const R&... r) noexcept { + return 1 + SumLength(r...); + } + + static constexpr size_t SumLength() noexcept { + return 0; + } + + template <typename... R> + static void CopyAll(TCharType* p, const TBasicStringBuf<TCharType, TTraits> s, const R&... r) { + TTraits::copy(p, s.data(), s.size()); + CopyAll(p + s.size(), r...); + } + + template <typename... R, class TNextCharType, typename = std::enable_if_t<std::is_same<TCharType, TNextCharType>::value>> + static void CopyAll(TCharType* p, const TNextCharType s, const R&... r) { + p[0] = s; + CopyAll(p + 1, r...); + } + + static void CopyAll(TCharType*) noexcept { + } + + template <typename... R> + static inline TBasicCowString JoinImpl(const R&... r) { + TBasicCowString s{TUninitialized{SumLength(r...)}}; + + TBasicCowString::CopyAll((TCharType*)s.data(), r...); + + return s; + } + +public: + Y_REINITIALIZES_OBJECT inline void clear() noexcept { + if (IsDetached()) { + S_->clear(); + + return; + } + + Construct().Swap(S_); + } + + template <typename... R> + static inline TBasicCowString Join(const R&... 
r) { + return JoinImpl(TJoinParam<R>(r)...); + } + + // ~~~ Assignment ~~~ : FAMILY0(TBasicCowString&, assign); + TBasicCowString& assign(size_t size, TCharType ch) Y_LIFETIME_BOUND { + ReserveAndResize(size); + std::fill(begin(), end(), ch); + return *this; + } + + TBasicCowString& assign(const TBasicCowString& s) Y_LIFETIME_BOUND { + TBasicCowString(s).swap(*this); + + return *this; + } + + TBasicCowString& assign(const TBasicCowString& s, size_t pos, size_t n) Y_LIFETIME_BOUND { + return assign(TBasicCowString(s, pos, n)); + } + + TBasicCowString& assign(const TCharType* pc) Y_LIFETIME_BOUND { + return assign(pc, TBase::StrLen(pc)); + } + + TBasicCowString& assign(TCharType ch) Y_LIFETIME_BOUND { + return assign(&ch, 1); + } + + TBasicCowString& assign(const TCharType* pc, size_t len) Y_LIFETIME_BOUND { +#if defined(address_sanitizer_enabled) || defined(thread_sanitizer_enabled) + pc = (const TCharType*)HidePointerOrigin((void*)pc); +#endif + if (IsDetached()) { + MutRef().assign(pc, len); + } else { + TBasicCowString(pc, len).swap(*this); + } + + return *this; + } + + TBasicCowString& assign(const TCharType* first, const TCharType* last) Y_LIFETIME_BOUND { + return assign(first, NonNegativeDistance(first, last)); + } + + TBasicCowString& assign(const TCharType* pc, size_t pos, size_t n) Y_LIFETIME_BOUND { + return assign(pc + pos, n); + } + + TBasicCowString& assign(const TBasicStringBuf<TCharType, TTraits> s) Y_LIFETIME_BOUND { + return assign(s.data(), s.size()); + } + + TBasicCowString& assign(const TBasicStringBuf<TCharType, TTraits> s, size_t spos, size_t sn = TBase::npos) Y_LIFETIME_BOUND { + return assign(s.SubString(spos, sn)); + } + + inline TBasicCowString& AssignNoAlias(const TCharType* pc, size_t len) Y_LIFETIME_BOUND { + return assign(pc, len); + } + + inline TBasicCowString& AssignNoAlias(const TCharType* b, const TCharType* e) Y_LIFETIME_BOUND { + return AssignNoAlias(b, e - b); + } + + TBasicCowString& AssignNoAlias(const 
TBasicStringBuf<TCharType, TTraits> s) Y_LIFETIME_BOUND { + return AssignNoAlias(s.data(), s.size()); + } + + TBasicCowString& AssignNoAlias(const TBasicStringBuf<TCharType, TTraits> s, size_t spos, size_t sn = TBase::npos) Y_LIFETIME_BOUND { + return AssignNoAlias(s.SubString(spos, sn)); + } + + /** + * WARN: + * Certain invocations of this method will result in link-time error. + * You are free to implement corresponding methods in string.cpp if you need them. + */ + auto AssignAscii(const ::TStringBuf& s) { + clear(); + return AppendAscii(s); + } + + auto AssignUtf8(const ::TStringBuf& s) { + clear(); + return AppendUtf8(s); + } + + auto AssignUtf16(const ::TWtringBuf& s) { + clear(); + return AppendUtf16(s); + } + + TBasicCowString& operator=(const TBasicCowString& s) Y_LIFETIME_BOUND { + return assign(s); + } + + TBasicCowString& operator=(TBasicCowString&& s) noexcept Y_LIFETIME_BOUND { + swap(s); + return *this; + } + + template <typename T, typename A> + TBasicCowString& operator=(std::basic_string<TCharType, T, A>&& s) noexcept Y_LIFETIME_BOUND { + TBasicCowString(std::move(s)).swap(*this); + + return *this; + } + + TBasicCowString& operator=(const TBasicStringBuf<TCharType, TTraits> s) Y_LIFETIME_BOUND { + return assign(s); + } + + TBasicCowString& operator=(std::initializer_list<TCharType> il) Y_LIFETIME_BOUND { + return assign(il.begin(), il.end()); + } + + TBasicCowString& operator=(const TCharType* s) Y_LIFETIME_BOUND { + return assign(s); + } + TBasicCowString& operator=(std::nullptr_t) Y_LIFETIME_BOUND = delete; + + TBasicCowString& operator=(TExplicitType<TCharType> ch) Y_LIFETIME_BOUND { + return assign(ch); + } + + inline void reserve(size_t len) { + MutRef().reserve(len); + } + + // ~~~ Appending ~~~ : FAMILY0(TBasicCowString&, append); + inline TBasicCowString& append(size_t count, TCharType ch) Y_LIFETIME_BOUND { + MutRef().append(count, ch); + + return *this; + } + + inline TBasicCowString& append(const TBasicCowString& s) Y_LIFETIME_BOUND { 
+ MutRef().append(s.ConstRef()); + + return *this; + } + + inline TBasicCowString& append(const TBasicCowString& s, size_t pos, size_t n) Y_LIFETIME_BOUND { + MutRef().append(s.ConstRef(), pos, n); + + return *this; + } + + inline TBasicCowString& append(const TCharType* pc) Y_LIFETIME_BOUND { + MutRef().append(pc); + + return *this; + } + + inline TBasicCowString& append(TCharType c) Y_LIFETIME_BOUND { + MutRef().push_back(c); + + return *this; + } + + inline TBasicCowString& append(const TCharType* first, const TCharType* last) Y_LIFETIME_BOUND { + MutRef().append(first, last); + + return *this; + } + + inline TBasicCowString& append(const TCharType* pc, size_t len) Y_LIFETIME_BOUND { + MutRef().append(pc, len); + + return *this; + } + + inline void ReserveAndResize(size_t len) { + ::ResizeUninitialized(MutRef(), len); + } + + TBasicCowString& AppendNoAlias(const TCharType* pc, size_t len) Y_LIFETIME_BOUND { + if (len) { + auto s = this->size(); + + ReserveAndResize(s + len); + memcpy(&*(begin() + s), pc, len * sizeof(*pc)); + } + + return *this; + } + + TBasicCowString& AppendNoAlias(const TBasicStringBuf<TCharType, TTraits> s) Y_LIFETIME_BOUND { + return AppendNoAlias(s.data(), s.size()); + } + + TBasicCowString& AppendNoAlias(const TBasicStringBuf<TCharType, TTraits> s, size_t spos, size_t sn = TBase::npos) Y_LIFETIME_BOUND { + return AppendNoAlias(s.SubString(spos, sn)); + } + + TBasicCowString& append(const TBasicStringBuf<TCharType, TTraits> s) Y_LIFETIME_BOUND { + return append(s.data(), s.size()); + } + + TBasicCowString& append(const TBasicStringBuf<TCharType, TTraits> s, size_t spos, size_t sn = TBase::npos) Y_LIFETIME_BOUND { + return append(s.SubString(spos, sn)); + } + + TBasicCowString& append(const TCharType* pc, size_t pos, size_t n, size_t pc_len = TBase::npos) Y_LIFETIME_BOUND { + return append(pc + pos, Min(n, pc_len - pos)); + } + + /** + * WARN: + * Certain invocations of this method will result in link-time error. 
+ * You are free to implement corresponding methods in string.cpp if you need them. + */ + TBasicCowString& AppendAscii(const ::TStringBuf& s) Y_LIFETIME_BOUND; + + TBasicCowString& AppendUtf8(const ::TStringBuf& s) Y_LIFETIME_BOUND; + + TBasicCowString& AppendUtf16(const ::TWtringBuf& s) Y_LIFETIME_BOUND; + + inline void push_back(TCharType c) { + // TODO + append(c); + } + + template <class T> + TBasicCowString& operator+=(const T& s) Y_LIFETIME_BOUND { + return append(s); + } + + template <class T> + friend TBasicCowString operator*(const TBasicCowString& s, T count) { + static_assert(std::is_integral<T>::value, "Integral type required."); + + TBasicCowString result; + + if (count > 0) { + result.reserve(s.length() * count); + } + + for (T i = 0; i < count; ++i) { + result += s; + } + + return result; + } + + template <class T> + TBasicCowString& operator*=(T count) Y_LIFETIME_BOUND { + static_assert(std::is_integral<T>::value, "Integral type required."); + + TBasicCowString temp; + + if (count > 0) { + temp.reserve(length() * count); + } + + for (T i = 0; i < count; ++i) { + temp += *this; + } + + swap(temp); + + return *this; + } + + operator const TStringType&() const noexcept Y_LIFETIME_BOUND { + return this->ConstRef(); + } + + /* We have operator casting TString to `const std::string&` but we explicitly don't support + * casting TString to `std::string&` since such casting requires detaching TString and therefore + * modifies TString object. Sometimes compiler might call `operator std::string&` + * implicitly and it might lead to problems. Check IGNIETFERRO-2155 for details. + */ + template <typename T, typename = std::enable_if_t<std::is_same_v<T, TStringType>>> + operator T&() & Y_LIFETIME_BOUND requires false { + return this->MutRef(); + } + + /* + * Following overloads of "operator+" aim to choose the cheapest implementation depending on + * summand types: lvalues, detached rvalues, shared rvalues. 
+ * + * General idea is to use the detached-rvalue argument (left of right) to store the result + * wherever possible. If a buffer in rvalue is large enough this saves a re-allocation. If + * both arguments are rvalues we check which one is detached. If both of them are detached then + * the left argument is obviously preferrable because you won't need to shift the data. + * + * If an rvalue is shared then it's basically the same as lvalue because you cannot use its + * buffer to store the sum. However, we rely on the fact that append() and prepend() are already + * optimized for the shared case and detach the string into the buffer large enough to store + * the sum (compared to the detach+reallocation). This way, if we have only one rvalue argument + * (left or right) then we simply append/prepend into it, without checking if it's detached or + * not. This will be checked inside ReserveAndResize anyway. + * + * If both arguments cannot be used to store the sum (e.g. two lvalues) then we fall back to the + * Join function that constructs a resulting string in the new buffer with the minimum overhead: + * malloc + memcpy + memcpy. 
+ */ + + friend TBasicCowString operator+(TBasicCowString&& s1, const TBasicCowString& s2) Y_WARN_UNUSED_RESULT { + s1 += s2; + return std::move(s1); + } + + friend TBasicCowString operator+(const TBasicCowString& s1, TBasicCowString&& s2) Y_WARN_UNUSED_RESULT { + s2.prepend(s1); + return std::move(s2); + } + + friend TBasicCowString operator+(TBasicCowString&& s1, TBasicCowString&& s2) Y_WARN_UNUSED_RESULT { +#if 0 + if (!s1.IsDetached() && s2.IsDetached()) { + s2.prepend(s1); + return std::move(s2); + } +#endif + s1 += s2; + return std::move(s1); + } + + friend TBasicCowString operator+(TBasicCowString&& s1, const TBasicStringBuf<TCharType, TTraits> s2) Y_WARN_UNUSED_RESULT { + s1 += s2; + return std::move(s1); + } + + friend TBasicCowString operator+(TBasicCowString&& s1, const TCharType* s2) Y_WARN_UNUSED_RESULT { + s1 += s2; + return std::move(s1); + } + + friend TBasicCowString operator+(TBasicCowString&& s1, TCharType s2) Y_WARN_UNUSED_RESULT { + s1 += s2; + return std::move(s1); + } + + friend TBasicCowString operator+(TExplicitType<TCharType> ch, const TBasicCowString& s) Y_WARN_UNUSED_RESULT { + return Join(TCharType(ch), s); + } + + friend TBasicCowString operator+(TExplicitType<TCharType> ch, TBasicCowString&& s) Y_WARN_UNUSED_RESULT { + s.prepend(ch); + return std::move(s); + } + + friend TBasicCowString operator+(const TBasicCowString& s1, const TBasicCowString& s2) Y_WARN_UNUSED_RESULT { + return Join(s1, s2); + } + + friend TBasicCowString operator+(const TBasicCowString& s1, const TBasicStringBuf<TCharType, TTraits> s2) Y_WARN_UNUSED_RESULT { + return Join(s1, s2); + } + + friend TBasicCowString operator+(const TBasicCowString& s1, const TCharType* s2) Y_WARN_UNUSED_RESULT { + return Join(s1, s2); + } + + friend TBasicCowString operator+(const TBasicCowString& s1, TCharType s2) Y_WARN_UNUSED_RESULT { + return Join(s1, TBasicStringBuf<TCharType, TTraits>(&s2, 1)); + } + + friend TBasicCowString operator+(const TCharType* s1, TBasicCowString&& s2) 
Y_WARN_UNUSED_RESULT { + s2.prepend(s1); + return std::move(s2); + } + + friend TBasicCowString operator+(const TBasicStringBuf<TCharType, TTraits> s1, TBasicCowString&& s2) Y_WARN_UNUSED_RESULT { + s2.prepend(s1); + return std::move(s2); + } + + friend TBasicCowString operator+(const TBasicStringBuf<TCharType, TTraits> s1, const TBasicCowString& s2) Y_WARN_UNUSED_RESULT { + return Join(s1, s2); + } + + friend TBasicCowString operator+(const TCharType* s1, const TBasicCowString& s2) Y_WARN_UNUSED_RESULT { + return Join(s1, s2); + } + + friend TBasicCowString operator+(std::basic_string<TCharType, TTraits> l, TBasicCowString r) { + return std::move(l) + r.ConstRef(); + } + + friend TBasicCowString operator+(TBasicCowString l, std::basic_string<TCharType, TTraits> r) { + return l.ConstRef() + std::move(r); + } + + // ~~~ Prepending ~~~ : FAMILY0(TBasicCowString&, prepend); + TBasicCowString& prepend(const TBasicCowString& s) Y_LIFETIME_BOUND { + MutRef().insert(0, s.ConstRef()); + + return *this; + } + + TBasicCowString& prepend(const TBasicCowString& s, size_t pos, size_t n) Y_LIFETIME_BOUND { + MutRef().insert(0, s.ConstRef(), pos, n); + + return *this; + } + + TBasicCowString& prepend(const TCharType* pc) Y_LIFETIME_BOUND { + MutRef().insert(0, pc); + + return *this; + } + + TBasicCowString& prepend(size_t n, TCharType c) Y_LIFETIME_BOUND { + MutRef().insert(size_t(0), n, c); + + return *this; + } + + TBasicCowString& prepend(TCharType c) Y_LIFETIME_BOUND { + MutRef().insert(size_t(0), 1, c); + + return *this; + } + + TBasicCowString& prepend(const TBasicStringBuf<TCharType, TTraits> s, size_t spos = 0, size_t sn = TBase::npos) Y_LIFETIME_BOUND { + return insert(0, s, spos, sn); + } + + // ~~~ Insertion ~~~ : FAMILY1(TBasicCowString&, insert, size_t pos); + TBasicCowString& insert(size_t pos, const TBasicCowString& s) Y_LIFETIME_BOUND { + MutRef().insert(pos, s.ConstRef()); + + return *this; + } + + TBasicCowString& insert(size_t pos, const TBasicCowString& s, 
size_t pos1, size_t n1) Y_LIFETIME_BOUND { + MutRef().insert(pos, s.ConstRef(), pos1, n1); + + return *this; + } + + TBasicCowString& insert(size_t pos, const TCharType* pc) Y_LIFETIME_BOUND { + MutRef().insert(pos, pc); + + return *this; + } + + TBasicCowString& insert(size_t pos, const TCharType* pc, size_t len) Y_LIFETIME_BOUND { + MutRef().insert(pos, pc, len); + + return *this; + } + /* Iterator-position overloads: off() turns the iterator into an index, then an index-based overload is called. */ + TBasicCowString& insert(const_iterator pos, const_iterator b, const_iterator e) Y_LIFETIME_BOUND { + return insert(this->off(pos), b, e - b); + } + + TBasicCowString& insert(size_t pos, size_t n, TCharType c) Y_LIFETIME_BOUND { + MutRef().insert(pos, n, c); + + return *this; + } + + TBasicCowString& insert(const_iterator pos, size_t len, TCharType ch) Y_LIFETIME_BOUND { + return this->insert(this->off(pos), len, ch); + } + /* Single-character form: forwards to the (const_iterator, count, char) overload with a count of 1. */ + TBasicCowString& insert(const_iterator pos, TCharType ch) Y_LIFETIME_BOUND { + return this->insert(pos, 1, ch); + } + + TBasicCowString& insert(size_t pos, const TBasicStringBuf<TCharType, TTraits> s, size_t spos = 0, size_t sn = TBase::npos) Y_LIFETIME_BOUND { + MutRef().insert(pos, s, spos, sn); + + return *this; + } + + // ~~~ Removing ~~~ : remove() does nothing when pos >= length(); erase() passes pos to MutRef().erase() without that check + TBasicCowString& remove(size_t pos, size_t n) Y_LIFETIME_BOUND { + if (pos < length()) { + MutRef().erase(pos, n); + } + + return *this; + } + + TBasicCowString& remove(size_t pos = 0) Y_LIFETIME_BOUND { + if (pos < length()) { + MutRef().erase(pos); + } + + return *this; + } + + TBasicCowString& erase(size_t pos = 0, size_t n = TBase::npos) Y_LIFETIME_BOUND { + MutRef().erase(pos, n); + + return *this; + } + /* Iterator forms: the range [b, e) is converted to (index, count) via off(); the single-iterator form erases [i, i + 1). */ + TBasicCowString& erase(const_iterator b, const_iterator e) Y_LIFETIME_BOUND { + return erase(this->off(b), e - b); + } + + TBasicCowString& erase(const_iterator i) Y_LIFETIME_BOUND { + return erase(i, i + 1); + } + /* Calling this on an empty string trips the Y_ASSERT below. */ + TBasicCowString& pop_back() Y_LIFETIME_BOUND { + Y_ASSERT(!this->empty()); + + MutRef().pop_back(); + + return *this; + } + + // ~~~ replacement ~~~ : FAMILY2(TBasicCowString&, replace, size_t pos, 
size_t n); + TBasicCowString& replace(size_t pos, size_t n, const TBasicCowString& s) Y_LIFETIME_BOUND { + MutRef().replace(pos, n, s.ConstRef()); + + return *this; + } + + TBasicCowString& replace(size_t pos, size_t n, const TBasicCowString& s, size_t pos1, size_t n1) Y_LIFETIME_BOUND { + MutRef().replace(pos, n, s.ConstRef(), pos1, n1); + + return *this; + } + + TBasicCowString& replace(size_t pos, size_t n, const TCharType* pc) Y_LIFETIME_BOUND { + MutRef().replace(pos, n, pc); + + return *this; + } + + TBasicCowString& replace(size_t pos, size_t n, const TCharType* s, size_t len) Y_LIFETIME_BOUND { + MutRef().replace(pos, n, s, len); + + return *this; + } + /* NOTE(review): forwards (s + spos, sn - spos), so sn acts as an end offset within s rather than a length, and sn < spos wraps around as size_t. Confirm this is the intended contract. */ + TBasicCowString& replace(size_t pos, size_t n, const TCharType* s, size_t spos, size_t sn) Y_LIFETIME_BOUND { + MutRef().replace(pos, n, s + spos, sn - spos); + + return *this; + } + /* Fill form: n1 is the length of the replaced range, n2 the number of copies of c, as passed on to MutRef().replace(). */ + TBasicCowString& replace(size_t pos, size_t n1, size_t n2, TCharType c) Y_LIFETIME_BOUND { + MutRef().replace(pos, n1, n2, c); + + return *this; + } + + TBasicCowString& replace(size_t pos, size_t n, const TBasicStringBuf<TCharType, TTraits> s, size_t spos = 0, size_t sn = TBase::npos) Y_LIFETIME_BOUND { + MutRef().replace(pos, n, s, spos, sn); + + return *this; + } + /* Swaps the S_ members of the two strings. */ + void swap(TBasicCowString& s) noexcept { + S_.Swap(s.S_); + } + + /** + * @returns String suitable for debug printing (like Python's `repr()`). + * Format of the string is unspecified and may be changed over time. + * Currently: the result of EscapeC(*this) wrapped in double quotes. + */ + TBasicCowString Quote() const { + extern TBasicCowString EscapeC(const TBasicCowString&); /* declared locally; the definition is not part of this class body */ + + return TBasicCowString() + '"' + EscapeC(*this) + '"'; + } + + /** + * Modifies the case of the string, depending on the operation. + * @return false if no changes have been made. + * + * @warning when the value_type is char, these methods will not work with non-ASCII letters. 
+ */ + bool to_lower(size_t pos = 0, size_t n = TBase::npos); + bool to_upper(size_t pos = 0, size_t n = TBase::npos); + bool to_title(size_t pos = 0, size_t n = TBase::npos); + /* Data(), Size() and Empty() are declared as deleted, so a call that resolves to them does not compile. NOTE(review): Size() is the only one without a const qualifier - confirm that is intentional. */ + constexpr const TCharType* Data() const noexcept = delete; + constexpr size_t Size() noexcept = delete; + Y_PURE_FUNCTION constexpr bool Empty() const noexcept = delete; + +public: + /** + * Modifies the substring of length `n` starting from `pos`, applying `f` to each position and symbol. + * `f` is called as `f(i, data()[i])`; its result is stored only when it differs from the current symbol. + * + * @return false if no changes have been made. + */ + template <typename T> + bool Transform(T&& f, size_t pos = 0, size_t n = TBase::npos) { + size_t len = length(); + /* Clamp pos and n so that [pos, pos + n) stays within the current length. */ + if (pos > len) { + pos = len; + } + + if (n > len - pos) { + n = len - pos; + } + + bool changed = false; + + for (size_t i = pos; i != pos + n; ++i) { + auto c = f(i, data()[i]); + if (c != data()[i]) { + if (!changed) { /* first difference: Detach() is called once, before the first write */ + Detach(); + changed = true; + } + + begin()[i] = c; + } + } + + return changed; + } +}; + +using TCowString = TBasicCowString<char>; +using TUtf16CowString = TBasicCowString<wchar16>; +using TUtf32CowString = TBasicCowString<wchar32>; + +std::ostream& operator<<(std::ostream&, const TCowString&); +std::istream& operator>>(std::istream&, TCowString&); + /* Free-function variants: copy s, change the case of the copy and return it. NOTE(review): the return type is TBasicCowString<TCharType>, so the TTraits of the argument is not carried over - confirm for non-default traits. */ +template <typename TCharType, typename TTraits> +TBasicCowString<TCharType> to_lower(const TBasicCowString<TCharType, TTraits>& s) { + TBasicCowString<TCharType> ret(s); + ret.to_lower(); + return ret; +} + +template <typename TCharType, typename TTraits> +TBasicCowString<TCharType> to_upper(const TBasicCowString<TCharType, TTraits>& s) { + TBasicCowString<TCharType> ret(s); + ret.to_upper(); + return ret; +} + +template <typename TCharType, typename TTraits> +TBasicCowString<TCharType> to_title(const TBasicCowString<TCharType, TTraits>& s) { + TBasicCowString<TCharType> ret(s); + ret.to_title(); + return ret; +} + /* std::hash is specialized here for TCowString only. */ +namespace std { + template <> + struct hash<TCowString> { + using argument_type = TCowString; + using result_type = size_t; + inline result_type 
operator()(argument_type const& s) const noexcept { + return NHashPrivate::ComputeStringHash(s.data(), s.size()); + } + }; +} // namespace std + +// interop: free functions that forward to the MutRef(), ConstRef() and ReserveAndResize() members +template <class TCharType, class TTraits> +auto& MutRef(TBasicCowString<TCharType, TTraits>& s Y_LIFETIME_BOUND) { + return s.MutRef(); +} + +template <class TCharType, class TTraits> +const auto& ConstRef(const TBasicCowString<TCharType, TTraits>& s Y_LIFETIME_BOUND) noexcept { + return s.ConstRef(); +} + +template <class TCharType, class TTraits> +void ResizeUninitialized(TBasicCowString<TCharType, TTraits>& s, size_t len) { + s.ReserveAndResize(len); +} diff --git a/library/cpp/containers/cow_string/cow_string_ut.cpp b/library/cpp/containers/cow_string/cow_string_ut.cpp new file mode 100644 index 00000000000..6de74b5c4b7 --- /dev/null +++ b/library/cpp/containers/cow_string/cow_string_ut.cpp @@ -0,0 +1,1268 @@ +#include <cow_string.h> + +#include <library/cpp/containers/cow_string/str_stl.h> +#include <library/cpp/containers/cow_string/subst.h> +#include <library/cpp/containers/cow_string/reverse.h> + +#include <util/charset/wide.h> +#include "util/generic/deque.h" +#include "util/generic/strbuf.h" +#include "util/generic/string_ut.h" +#include "util/generic/vector.h" +#include "util/generic/yexception.h" +#include <util/stream/output.h> +#include <util/string/subst.h> + +#include <string> +#include <sstream> +#include <algorithm> +#include <stdexcept> + +static_assert(sizeof(TCowString) == sizeof(const char*), "expect sizeof(TCowString) == sizeof(const char*)"); + /* Checks that strings containing embedded NUL characters are handled by size, not by C-string termination. */ +class TStringTestZero: public TTestBase { + UNIT_TEST_SUITE(TStringTestZero); + UNIT_TEST(TestZero); + UNIT_TEST_SUITE_END(); + +public: + void TestZero() { + const char data[] = "abc\0def\0"; + TCowString s(data, sizeof(data)); /* sizeof(data) also counts the literal's implicit terminator, so s holds every byte of the array */ + UNIT_ASSERT(s.size() == sizeof(data)); + UNIT_ASSERT(s.StartsWith(s)); + UNIT_ASSERT(s.EndsWith(s)); + UNIT_ASSERT(s.Contains('\0')); + + const char raw_def[] = "def"; + const char raw_zero[] = "\0"; + TCowString 
def(raw_def, sizeof(raw_def) - 1); + TCowString zero(raw_zero, sizeof(raw_zero) - 1); + UNIT_ASSERT_EQUAL(4, s.find(raw_def)); + UNIT_ASSERT_EQUAL(4, s.find(def)); + UNIT_ASSERT_EQUAL(4, s.find_first_of(raw_def)); + UNIT_ASSERT_EQUAL(3, s.find_first_of(zero)); + UNIT_ASSERT_EQUAL(7, s.find_first_not_of(def, 4)); + + const char nonSubstring[] = "def\0ghi"; + UNIT_ASSERT_EQUAL(TCowString::npos, s.find(TCowString(nonSubstring, sizeof(nonSubstring)))); + + TCowString copy = s; + copy.replace(copy.size() - 1, 1, "z"); + UNIT_ASSERT(s != copy); + copy.replace(copy.size() - 1, 1, "\0", 0, 1); /* five-argument form: restores the final NUL, making copy equal to s again */ + UNIT_ASSERT(s == copy); + + TCowString prefix(data, 5); + UNIT_ASSERT(s.StartsWith(prefix)); + UNIT_ASSERT(s != prefix); + UNIT_ASSERT(s > prefix); + UNIT_ASSERT(s > s.data()); + UNIT_ASSERT(s == TCowString(s.data(), s.size())); + UNIT_ASSERT(data < s); + + s.remove(5); + UNIT_ASSERT(s == prefix); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStringTestZero); + +template <typename TStringType, typename TTestData> +class TStringStdTestImpl { + using TChar = typename TStringType::char_type; + using TTraits = typename TStringType::traits_type; + using TView = std::basic_string_view<TChar, TTraits>; + + TTestData Data_; + +protected: + void Constructor() { + UNIT_ASSERT_EXCEPTION(TStringType((size_t)-1, *Data_.a()), std::length_error); + } + + void reserve() { /* NOTE: the entire body is inside #if 0, so this test currently performs no checks */ +#if 0 + TStringType s; + UNIT_ASSERT_EXCEPTION(s.reserve(s.max_size() + 1), std::length_error); + + // Non-shared behaviour - never shrink + + s.reserve(256); + const auto* data = s.data(); + + UNIT_ASSERT(s.capacity() >= 256); + + s.reserve(128); + + UNIT_ASSERT(s.capacity() >= 256 && s.data() == data); + + s.resize(64, 'x'); + s.reserve(10); + + UNIT_ASSERT(s.capacity() >= 256 && s.data() == data); + + // Shared behaviour - always reallocate, just as much as requested + + TStringType holder = s; + + UNIT_ASSERT(s.capacity() >= 256); + + s.reserve(128); + + UNIT_ASSERT(s.capacity() >= 128 && s.capacity() < 256 && s.data() != data); 
+ UNIT_ASSERT(s.IsDetached()); + + s.resize(64, 'x'); + data = s.data(); + holder = s; + + s.reserve(10); + + UNIT_ASSERT(s.capacity() >= 64 && s.capacity() < 128 && s.data() != data); + UNIT_ASSERT(s.IsDetached()); +#endif + } + + void short_string() { + TStringType const ref_short_str1(Data_.str1()), ref_short_str2(Data_.str2()); + TStringType short_str1(ref_short_str1), short_str2(ref_short_str2); + TStringType const ref_long_str1(Data_.str__________________________________________________1()); + TStringType const ref_long_str2(Data_.str__________________________________________________2()); + TStringType long_str1(ref_long_str1), long_str2(ref_long_str2); + + UNIT_ASSERT(short_str1 == ref_short_str1); + UNIT_ASSERT(long_str1 == ref_long_str1); + + { + TStringType str1(short_str1); + str1 = long_str1; + UNIT_ASSERT(str1 == ref_long_str1); + } + + { + TStringType str1(long_str1); + str1 = short_str1; + UNIT_ASSERT(str1 == ref_short_str1); + } + + { + short_str1.swap(short_str2); + UNIT_ASSERT((short_str1 == ref_short_str2) && (short_str2 == ref_short_str1)); + short_str1.swap(short_str2); + } + + { + long_str1.swap(long_str2); + UNIT_ASSERT((long_str1 == ref_long_str2) && (long_str2 == ref_long_str1)); + long_str1.swap(long_str2); + } + + { + short_str1.swap(long_str1); + UNIT_ASSERT((short_str1 == ref_long_str1) && (long_str1 == ref_short_str1)); + short_str1.swap(long_str1); + } + + { + long_str1.swap(short_str1); + UNIT_ASSERT((short_str1 == ref_long_str1) && (long_str1 == ref_short_str1)); + long_str1.swap(short_str1); + } + + { + // This is to test move constructor + TVector<TStringType> str_vect; + + str_vect.push_back(short_str1); + str_vect.push_back(long_str1); + str_vect.push_back(short_str2); + str_vect.push_back(long_str2); + + UNIT_ASSERT(str_vect[0] == ref_short_str1); + UNIT_ASSERT(str_vect[1] == ref_long_str1); + UNIT_ASSERT(str_vect[2] == ref_short_str2); + UNIT_ASSERT(str_vect[3] == ref_long_str2); + } + } + + void erase() { + TChar const* c_str 
= Data_.Hello_World(); + TStringType str(c_str); + UNIT_ASSERT(str == c_str); + + str.erase(str.begin() + 1, str.end() - 1); // Erase all but first and last. + + size_t i; + for (i = 0; i < str.size(); ++i) { + switch (i) { + case 0: + UNIT_ASSERT(str[i] == *Data_.H()); + break; + + case 1: + UNIT_ASSERT(str[i] == *Data_.d()); + break; + + default: + UNIT_ASSERT(false); + } + } + + str.insert(1, c_str); + str.erase(str.begin()); // Erase first element. + str.erase(str.end() - 1); // Erase last element. + UNIT_ASSERT(str == c_str); + str.clear(); // Erase all. + UNIT_ASSERT(str.empty()); + + str = c_str; + UNIT_ASSERT(str == c_str); + + str.erase(1, str.size() - 1); // Erase all but first and last. + for (i = 0; i < str.size(); i++) { + switch (i) { + case 0: + UNIT_ASSERT(str[i] == *Data_.H()); + break; + + case 1: + UNIT_ASSERT(str[i] == *Data_.d()); + break; + + default: + UNIT_ASSERT(false); + } + } + + str.erase(1); + UNIT_ASSERT(str == Data_.H()); + } + + void data() { + TStringType xx; + + // ISO-IEC-14882:1998(E), 21.3.6, paragraph 3 + UNIT_ASSERT(xx.data() != nullptr); + } + + void c_str() { + TStringType low(Data_._2004_01_01()); + TStringType xx; + TStringType yy; + + // ISO-IEC-14882:1998(E), 21.3.6, paragraph 1 + UNIT_ASSERT(*(yy.c_str()) == 0); + + // Blocks A and B should follow each other. + // Block A: + xx = Data_._123456(); + xx += low; + UNIT_ASSERT(xx.c_str() == TView(Data_._1234562004_01_01())); + // End of block A + + // Block B: + xx = Data_._1234(); + xx += Data_._5(); + UNIT_ASSERT(xx.c_str() == TView(Data_._12345())); + // End of block B + } + + void null_char_of_empty() { + const TStringType s; + + // NOTE: https://a.yandex-team.ru/arcadia/junk/grechnik/test_string?rev=r12602052 + i64 i = s[s.size()]; + UNIT_ASSERT_VALUES_EQUAL(i, 0); + } + + void null_char() { + // ISO/IEC 14882:1998(E), ISO/IEC 14882:2003(E), 21.3.4 ('... 
the const version') + const TStringType s(Data_._123456()); + + UNIT_ASSERT(s[s.size()] == 0); + } + + // Allowed since C++17, see http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2475 + void null_char_assignment_to_subscript_of_empty() { + TStringType s; + + using reference = typename TStringType::reference; + reference trailing_zero = s[s.size()]; + trailing_zero = 0; + UNIT_ASSERT(trailing_zero == 0); + } + + // Allowed since C++17, see http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2475 + void null_char_assignment_to_subscript_of_nonempty() { + TStringType s(Data_._123456()); + + using reference = typename TStringType::reference; + reference trailing_zero = s[s.size()]; + trailing_zero = 0; + UNIT_ASSERT(trailing_zero == 0); + } + + // Dereferencing string end() is not allowed by C++ standard as of C++20, avoid using in real code. + void null_char_assignment_to_end_of_empty() { + TStringType s; + + volatile auto& trailing_zero = *(s.begin() + s.size()); + trailing_zero = 0; + UNIT_ASSERT(trailing_zero == 0); + } + + // Dereferencing string end() is not allowed by C++ standard as of C++20, avoid using in real code. + void null_char_assignment_to_end_of_nonempty() { + TStringType s(Data_._123456()); + + volatile auto& trailing_zero = *(s.begin() + s.size()); + trailing_zero = 0; + UNIT_ASSERT(trailing_zero == 0); + } + + void insert() { + TStringType strorg = Data_.This_is_test_string_for_string_calls(); + TStringType str; + + // In case of reallocation there is no auto reference problem + // so we reserve a big enough TStringType to be sure to test this + // particular point. 
+ + str.reserve(100); + str = strorg; + + // test self insertion: + str.insert(10, str.c_str() + 5, 15); + UNIT_ASSERT(str == Data_.This_is_teis_test_string_st_string_for_string_calls()); + + str = strorg; + str.insert(15, str.c_str() + 5, 25); + UNIT_ASSERT(str == Data_.This_is_test_stis_test_string_for_stringring_for_string_calls()); + + str = strorg; + str.insert(0, str.c_str() + str.size() - 4, 4); + UNIT_ASSERT(str == Data_.allsThis_is_test_string_for_string_calls()); + + str = strorg; + str.insert(0, str.c_str() + str.size() / 2 - 1, str.size() / 2 + 1); + UNIT_ASSERT(str == Data_.ng_for_string_callsThis_is_test_string_for_string_calls()); + + str = strorg; + typename TStringType::iterator b = str.begin(); + typename TStringType::const_iterator s = str.begin() + str.size() / 2 - 1; + typename TStringType::const_iterator e = str.end(); + str.insert(b, s, e); + UNIT_ASSERT(str == Data_.ng_for_string_callsThis_is_test_string_for_string_calls()); + +#if 0 + // AV + str = strorg; + str.insert(str.begin(), str.begin() + str.size() / 2 - 1, str.end()); + UNIT_ASSERT(str == Data.ng_for_string_callsThis_is_test_string_for_string_calls()); +#endif + + TStringType str0; + str0.insert(str0.begin(), 5, *Data_._0()); + UNIT_ASSERT(str0 == Data_._00000()); + + TStringType str1; + { + typename TStringType::size_type pos = 0, nb = 2; + str1.insert(pos, nb, *Data_._1()); + } + UNIT_ASSERT(str1 == Data_._11()); + + str0.insert(0, str1); + UNIT_ASSERT(str0 == Data_._1100000()); + + TStringType str2(Data_._2345()); + str0.insert(str0.size(), str2, 1, 2); + UNIT_ASSERT(str0 == Data_._110000034()); + + str1.insert(str1.begin() + 1, 2, *Data_._2()); + UNIT_ASSERT(str1 == Data_._1221()); + + str1.insert(2, Data_._333333(), 3); + UNIT_ASSERT(str1 == Data_._1233321()); + + str1.insert(4, Data_._4444()); + UNIT_ASSERT(str1 == Data_._12334444321()); + + str1.insert(str1.begin() + 6, *Data_._5()); + UNIT_ASSERT(str1 == Data_._123344544321()); + } + + void resize() { + TStringType s; + + 
s.resize(0); + + UNIT_ASSERT(*s.c_str() == 0); + + s = Data_._1234567(); + + s.resize(0); + UNIT_ASSERT(*s.c_str() == 0); + + s = Data_._1234567(); + s.resize(1); + UNIT_ASSERT(s.size() == 1); + UNIT_ASSERT(*s.c_str() == *Data_._1()); + UNIT_ASSERT(*(s.c_str() + 1) == 0); + + s = Data_._1234567(); +#if 0 + s.resize(10); +#else + s.resize(10, 0); +#endif + UNIT_ASSERT(s.size() == 10); + UNIT_ASSERT(s[6] == *Data_._7()); + UNIT_ASSERT(s[7] == 0); + UNIT_ASSERT(s[8] == 0); + UNIT_ASSERT(s[9] == 0); + } + + void find() { + TStringType s(Data_.one_two_three_one_two_three()); + + UNIT_ASSERT(s.find(Data_.one()) == 0); + UNIT_ASSERT(s.find(*Data_.t()) == 4); + UNIT_ASSERT(s.find(*Data_.t(), 5) == 8); + + UNIT_ASSERT(s.find(Data_.four()) == TStringType::npos); + UNIT_ASSERT(s.find(Data_.one(), TStringType::npos) == TStringType::npos); + UNIT_ASSERT(s.find_first_of(Data_.abcde()) == 2); + UNIT_ASSERT(s.find_first_not_of(Data_.enotw_()) == 9); + } + + void capacity() { + TStringType s; + + UNIT_ASSERT(s.capacity() < s.max_size()); + UNIT_ASSERT(s.capacity() >= s.size()); + + for (int i = 0; i < 18; ++i) { + s += ' '; + + UNIT_ASSERT(s.capacity() > 0); + UNIT_ASSERT(s.capacity() < s.max_size()); + UNIT_ASSERT(s.capacity() >= s.size()); + } + } + + void assign() { + TStringType s; + TChar const* cstr = Data_.test_string_for_assign(); + + s.assign(cstr, cstr + 22); + UNIT_ASSERT(s == Data_.test_string_for_assign()); + + TStringType s2(Data_.other_test_string()); + s.assign(s2); + UNIT_ASSERT(s == s2); + + static TStringType str1; + static TStringType str2; + + // short TStringType optim: + str1 = Data_._123456(); + // longer than short TStringType: + str2 = Data_._1234567890123456789012345678901234567890(); + + UNIT_ASSERT(str1[5] == *Data_._6()); + UNIT_ASSERT(str2[29] == *Data_._0()); + } + + void copy() { + TStringType s(Data_.foo()); + TChar dest[4]; + dest[0] = dest[1] = dest[2] = dest[3] = 1; + s.copy(dest, 4); + int pos = 0; + UNIT_ASSERT(dest[pos++] == *Data_.f()); + 
UNIT_ASSERT(dest[pos++] == *Data_.o()); + UNIT_ASSERT(dest[pos++] == *Data_.o()); + UNIT_ASSERT(dest[pos++] == 1); + + dest[0] = dest[1] = dest[2] = dest[3] = 1; + s.copy(dest, 4, 2); + pos = 0; + UNIT_ASSERT(dest[pos++] == *Data_.o()); + UNIT_ASSERT(dest[pos++] == 1); + + UNIT_ASSERT_EXCEPTION(s.copy(dest, 4, 5), std::out_of_range); + } + + void cbegin_cend() { + const char helloThere[] = "Hello there"; + TCowString s = helloThere; + size_t index = 0; + for (auto it = s.cbegin(); s.cend() != it; ++it, ++index) { + UNIT_ASSERT_VALUES_EQUAL(helloThere[index], *it); + } + } + + void compare() { + TStringType str1(Data_.abcdef()); + TStringType str2; + + str2 = Data_.abcdef(); + UNIT_ASSERT(str1.compare(str2) == 0); + UNIT_ASSERT(str1.compare(str2.data(), str2.size()) == 0); + str2 = Data_.abcde(); + UNIT_ASSERT(str1.compare(str2) > 0); + UNIT_ASSERT(str1.compare(str2.data(), str2.size()) > 0); + str2 = Data_.abcdefg(); + UNIT_ASSERT(str1.compare(str2) < 0); + UNIT_ASSERT(str1.compare(str2.data(), str2.size()) < 0); + + UNIT_ASSERT(str1.compare(Data_.abcdef()) == 0); + UNIT_ASSERT(str1.compare(Data_.abcde()) > 0); + UNIT_ASSERT(str1.compare(Data_.abcdefg()) < 0); + + str2 = Data_.cde(); + UNIT_ASSERT(str1.compare(2, 3, str2) == 0); + str2 = Data_.cd(); + UNIT_ASSERT(str1.compare(2, 3, str2) > 0); + str2 = Data_.cdef(); + UNIT_ASSERT(str1.compare(2, 3, str2) < 0); + + str2 = Data_.abcdef(); + UNIT_ASSERT(str1.compare(2, 3, str2, 2, 3) == 0); + UNIT_ASSERT(str1.compare(2, 3, str2, 2, 2) > 0); + UNIT_ASSERT(str1.compare(2, 3, str2, 2, 4) < 0); + + UNIT_ASSERT(str1.compare(2, 3, Data_.cdefgh(), 3) == 0); + UNIT_ASSERT(str1.compare(2, 3, Data_.cdefgh(), 2) > 0); + UNIT_ASSERT(str1.compare(2, 3, Data_.cdefgh(), 4) < 0); + } + + void find_last_of() { + // 21.3.6.4 + TStringType s(Data_.one_two_three_one_two_three()); + + UNIT_ASSERT(s.find_last_of(Data_.abcde()) == 26); + UNIT_ASSERT(s.find_last_of(TStringType(Data_.abcde())) == 26); + + TStringType test(Data_.aba()); + + 
UNIT_ASSERT(test.find_last_of(Data_.a(), 2, 1) == 2); + UNIT_ASSERT(test.find_last_of(Data_.a(), 1, 1) == 0); + UNIT_ASSERT(test.find_last_of(Data_.a(), 0, 1) == 0); + + UNIT_ASSERT(test.find_last_of(*Data_.a(), 2) == 2); + UNIT_ASSERT(test.find_last_of(*Data_.a(), 1) == 0); + UNIT_ASSERT(test.find_last_of(*Data_.a(), 0) == 0); + } +#if 0 + void rfind() { + // 21.3.6.2 + TStringType s(Data.one_two_three_one_two_three()); + + UNIT_ASSERT(s.rfind(Data.two()) == 18); + UNIT_ASSERT(s.rfind(Data.two(), 0) == TStringType::npos); + UNIT_ASSERT(s.rfind(Data.two(), 11) == 4); + UNIT_ASSERT(s.rfind(*Data.w()) == 19); + + TStringType test(Data.aba()); + + UNIT_ASSERT(test.rfind(Data.a(), 2, 1) == 2); + UNIT_ASSERT(test.rfind(Data.a(), 1, 1) == 0); + UNIT_ASSERT(test.rfind(Data.a(), 0, 1) == 0); + + UNIT_ASSERT(test.rfind(*Data.a(), 2) == 2); + UNIT_ASSERT(test.rfind(*Data.a(), 1) == 0); + UNIT_ASSERT(test.rfind(*Data.a(), 0) == 0); + } +#endif + void find_last_not_of() { + // 21.3.6.6 + TStringType s(Data_.one_two_three_one_two_three()); + + UNIT_ASSERT(s.find_last_not_of(Data_.ehortw_()) == 15); + + TStringType test(Data_.aba()); + + UNIT_ASSERT(test.find_last_not_of(Data_.a(), 2, 1) == 1); + UNIT_ASSERT(test.find_last_not_of(Data_.b(), 2, 1) == 2); + UNIT_ASSERT(test.find_last_not_of(Data_.a(), 1, 1) == 1); + UNIT_ASSERT(test.find_last_not_of(Data_.b(), 1, 1) == 0); + UNIT_ASSERT(test.find_last_not_of(Data_.a(), 0, 1) == TStringType::npos); + UNIT_ASSERT(test.find_last_not_of(Data_.b(), 0, 1) == 0); + + UNIT_ASSERT(test.find_last_not_of(*Data_.a(), 2) == 1); + UNIT_ASSERT(test.find_last_not_of(*Data_.b(), 2) == 2); + UNIT_ASSERT(test.find_last_not_of(*Data_.a(), 1) == 1); + UNIT_ASSERT(test.find_last_not_of(*Data_.b(), 1) == 0); + UNIT_ASSERT(test.find_last_not_of(*Data_.a(), 0) == TStringType::npos); + UNIT_ASSERT(test.find_last_not_of(*Data_.b(), 0) == 0); + } +#if 0 + void replace() { + // This test case is for the non template basic_TString::replace method, + // this is 
why we play with the const iterators and reference to guaranty + // that the right method is called. + + const TStringType v(Data._78()); + TStringType s(Data._123456()); + TStringType const& cs = s; + + typename TStringType::iterator i = s.begin() + 1; + s.replace(i, i + 3, v.begin(), v.end()); + UNIT_ASSERT(s == Data._17856()); + + s = Data._123456(); + i = s.begin() + 1; + s.replace(i, i + 1, v.begin(), v.end()); + UNIT_ASSERT(s == Data._1783456()); + + s = Data._123456(); + i = s.begin() + 1; + typename TStringType::const_iterator ci = s.begin() + 1; + s.replace(i, i + 3, ci + 3, cs.end()); + UNIT_ASSERT(s == Data._15656()); + + s = Data._123456(); + i = s.begin() + 1; + ci = s.begin() + 1; + s.replace(i, i + 3, ci, ci + 2); + UNIT_ASSERT(s == Data._12356()); + + s = Data._123456(); + i = s.begin() + 1; + ci = s.begin() + 1; + s.replace(i, i + 3, ci + 1, cs.end()); + UNIT_ASSERT(s == Data._1345656()); + + s = Data._123456(); + i = s.begin(); + ci = s.begin() + 1; + s.replace(i, i, ci, ci + 1); + UNIT_ASSERT(s == Data._2123456()); + + s = Data._123456(); + s.replace(s.begin() + 4, s.end(), cs.begin(), cs.end()); + UNIT_ASSERT(s == Data._1234123456()); + + // This is the test for the template replace method. 
+ + s = Data._123456(); + typename TStringType::iterator b = s.begin() + 4; + typename TStringType::iterator e = s.end(); + typename TStringType::const_iterator rb = s.begin(); + typename TStringType::const_iterator re = s.end(); + s.replace(b, e, rb, re); + UNIT_ASSERT(s == Data._1234123456()); + + s = Data._123456(); + s.replace(s.begin() + 4, s.end(), s.begin(), s.end()); + UNIT_ASSERT(s == Data._1234123456()); + + TStringType strorg(Data.This_is_test_StringT_for_StringT_calls()); + TStringType str = strorg; + str.replace(5, 15, str.c_str(), 10); + UNIT_ASSERT(str == Data.This_This_is_tefor_StringT_calls()); + + str = strorg; + str.replace(5, 5, str.c_str(), 10); + UNIT_ASSERT(str == Data.This_This_is_test_StringT_for_StringT_calls()); + + #if !defined(STLPORT) || defined(_STLP_MEMBER_TEMPLATES) + deque<TChar> cdeque; + cdeque.push_back(*Data.I()); + str.replace(str.begin(), str.begin() + 11, cdeque.begin(), cdeque.end()); + UNIT_ASSERT(str == Data.Is_test_StringT_for_StringT_calls()); + #endif + } +#endif +}; // TStringStdTestImpl + +class TStringTest: public TTestBase, private TStringTestImpl<TCowString, TTestData<char>> { +public: + UNIT_TEST_SUITE(TStringTest); + UNIT_TEST(TestMaxSize); + UNIT_TEST(TestConstructors); + UNIT_TEST(TestReplace); + UNIT_TEST(TestRefCount); + UNIT_TEST(TestFind); + UNIT_TEST(TestContains); + UNIT_TEST(TestOperators); + UNIT_TEST(TestMulOperators); + UNIT_TEST(TestFuncs); + UNIT_TEST(TestUtils); + UNIT_TEST(TestEmpty); + UNIT_TEST(TestJoin); + UNIT_TEST(TestCopy); + UNIT_TEST(TestStrCpy); + UNIT_TEST(TestPrefixSuffix); + UNIT_TEST(TestCharRef); + UNIT_TEST(TestBack) + UNIT_TEST(TestFront) + UNIT_TEST(TestIterators); + UNIT_TEST(TestReverseIterators); + UNIT_TEST(TestAppendUtf16) + UNIT_TEST(TestFillingAssign) + UNIT_TEST(TestStdStreamApi) + // UNIT_TEST(TestOperatorsCI); must fail + UNIT_TEST_SUITE_END(); + + void TestAppendUtf16() { + TCowString appended = TCowString("А роза упала").AppendUtf16(u" на лапу Азора"); + 
UNIT_ASSERT(appended == "А роза упала на лапу Азора"); + } + + void TestFillingAssign() { + TCowString s("abc"); + s.assign(5, 'a'); + UNIT_ASSERT_VALUES_EQUAL(s, "aaaaa"); + } + + void TestStdStreamApi() { + const TCowString data = "abracadabra"; + std::stringstream ss; + ss << data; + + UNIT_ASSERT_VALUES_EQUAL(data, ss.str()); + + ss << '\n' + << data << std::endl; + + TCowString read = "xxx"; /* non-empty on purpose: the assertion below only holds if operator>> replaces the old contents */ + ss >> read; + UNIT_ASSERT_VALUES_EQUAL(read, data); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStringTest); + +class TWideStringTest: public TTestBase, private TStringTestImpl<TUtf16CowString, TTestData<wchar16>> { +public: + UNIT_TEST_SUITE(TWideStringTest); + UNIT_TEST(TestConstructors); + UNIT_TEST(TestReplace); + UNIT_TEST(TestRefCount); + UNIT_TEST(TestFind); + UNIT_TEST(TestContains); + UNIT_TEST(TestOperators); + UNIT_TEST(TestLetOperator) + UNIT_TEST(TestMulOperators); + UNIT_TEST(TestFuncs); + UNIT_TEST(TestUtils); + UNIT_TEST(TestEmpty); + UNIT_TEST(TestJoin); + UNIT_TEST(TestCopy); + UNIT_TEST(TestStrCpy); + UNIT_TEST(TestPrefixSuffix); + UNIT_TEST(TestCharRef); + UNIT_TEST(TestBack); + UNIT_TEST(TestFront) + UNIT_TEST(TestDecodingMethods); + UNIT_TEST(TestIterators); + UNIT_TEST(TestReverseIterators); + UNIT_TEST(TestStringLiterals); + UNIT_TEST_SUITE_END(); + +private: + void TestDecodingMethods() { + UNIT_ASSERT(TUtf16CowString::FromAscii("").empty()); + UNIT_ASSERT(TUtf16CowString::FromAscii("abc") == ASCIIToWide("abc")); + +#if 0 // no wide conversions support + const char* text = "123kx83abcd ej)#$%ddja&%J&"; + TUtf16CowString wtext = ASCIIToWide(text); + + UNIT_ASSERT(wtext == TUtf16CowString::FromAscii(text)); + + TCowString strtext(text); + UNIT_ASSERT(wtext == TUtf16CowString::FromAscii(strtext)); + + TStringBuf strbuftext(text); + UNIT_ASSERT(wtext == TUtf16CowString::FromAscii(strbuftext)); + + UNIT_ASSERT(wtext.substr(5) == TUtf16CowString::FromAscii(text + 5)); + + const wchar16 wideCyrillicAlphabet[] = { + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 
0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + 0x00}; + + TUtf16CowString strWide(wideCyrillicAlphabet); + TCowString strUtf8 = WideToUTF8(strWide); + + UNIT_ASSERT(strWide == TUtf16CowString::FromUtf8(strUtf8.c_str())); + UNIT_ASSERT(strWide == TUtf16CowString::FromUtf8(strUtf8)); + UNIT_ASSERT(strWide == TUtf16CowString::FromUtf8(TStringBuf(strUtf8))); + + // assign + + TUtf16CowString s1; + s1.AssignAscii("1234"); + UNIT_ASSERT(s1 == ASCIIToWide("1234")); + + s1.AssignUtf8(strUtf8); + UNIT_ASSERT(s1 == strWide); + + s1.AssignAscii(text); + UNIT_ASSERT(s1 == wtext); + + // append + + TUtf16CowString s2; + TUtf16CowString testAppend = strWide; + s2.AppendUtf8(strUtf8); + UNIT_ASSERT(testAppend == s2); + + testAppend += ' '; + s2.AppendAscii(" "); + UNIT_ASSERT(testAppend == s2); + + testAppend += '_'; + s2.AppendUtf8("_"); + UNIT_ASSERT(testAppend == s2); + + testAppend += wtext; + s2.AppendAscii(text); + UNIT_ASSERT(testAppend == s2); + + testAppend += wtext; + s2.AppendUtf8(text); + UNIT_ASSERT(testAppend == s2); +#endif + } + + void TestLetOperator() { + TUtf16CowString str; + + str = wchar16('X'); + UNIT_ASSERT(str == TUtf16CowString::FromAscii("X")); + + const TUtf16CowString hello = TUtf16CowString::FromAscii("hello"); + str = hello.data(); + UNIT_ASSERT(str == hello); + + str = hello; + UNIT_ASSERT(str == hello); + } + + void TestStringLiterals() { + TUtf16CowString s1 = u"hello"; + UNIT_ASSERT_VALUES_EQUAL(s1, TUtf16CowString::FromAscii("hello")); + + TUtf16CowString s2 = u"привет"; + UNIT_ASSERT_VALUES_EQUAL(s2, 
TUtf16CowString::FromUtf8("привет")); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TWideStringTest); + +class TUtf32StringTest: public TTestBase, private TStringTestImpl<TUtf32CowString, TTestData<wchar32>> { +public: + UNIT_TEST_SUITE(TUtf32StringTest); + UNIT_TEST(TestConstructors); + UNIT_TEST(TestReplace); + UNIT_TEST(TestRefCount); + UNIT_TEST(TestFind); + UNIT_TEST(TestContains); + UNIT_TEST(TestOperators); + UNIT_TEST(TestLetOperator) + UNIT_TEST(TestMulOperators); + UNIT_TEST(TestFuncs); + UNIT_TEST(TestUtils); + UNIT_TEST(TestEmpty); + UNIT_TEST(TestJoin); + UNIT_TEST(TestCopy); + UNIT_TEST(TestStrCpy); + UNIT_TEST(TestPrefixSuffix); + UNIT_TEST(TestCharRef); + UNIT_TEST(TestBack); + UNIT_TEST(TestFront) + UNIT_TEST(TestDecodingMethods); + UNIT_TEST(TestDecodingMethodsMixedStr); + UNIT_TEST(TestIterators); + UNIT_TEST(TestReverseIterators); + UNIT_TEST(TestStringLiterals); + UNIT_TEST_SUITE_END(); + +private: + void TestDecodingMethods() { + UNIT_ASSERT(TUtf32CowString::FromAscii("").empty()); + UNIT_ASSERT(TUtf32CowString::FromAscii("abc") == ASCIIToUTF32("abc")); + +#if 0 // no wide conversions support + const char* text = "123kx83abcd ej)#$%ddja&%J&"; + TUtf32CowString wtext = ASCIIToUTF32(text); + + UNIT_ASSERT(wtext == TUtf32CowString::FromAscii(text)); + + TCowString strtext(text); + UNIT_ASSERT(wtext == TUtf32CowString::FromAscii(strtext)); + + TStringBuf strbuftext(text); + UNIT_ASSERT(wtext == TUtf32CowString::FromAscii(strbuftext)); + + UNIT_ASSERT(wtext.substr(5) == TUtf32CowString::FromAscii(text + 5)); + + const wchar32 wideCyrillicAlphabet[] = { + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + 0x0440, 0x0441, 
0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + 0x00}; + + TUtf32CowString strWide(wideCyrillicAlphabet); + TCowString strUtf8 = WideToUTF8(strWide); + + UNIT_ASSERT(strWide == TUtf32CowString::FromUtf8(strUtf8.c_str())); + UNIT_ASSERT(strWide == TUtf32CowString::FromUtf8(strUtf8)); + UNIT_ASSERT(strWide == TUtf32CowString::FromUtf8(TStringBuf(strUtf8))); + + // assign + + TUtf32CowString s1; + s1.AssignAscii("1234"); + UNIT_ASSERT(s1 == ASCIIToUTF32("1234")); + + s1.AssignUtf8(strUtf8); + UNIT_ASSERT(s1 == strWide); + + s1.AssignAscii(text); + UNIT_ASSERT(s1 == wtext); + + // append + + TUtf32CowString s2; + TUtf32CowString testAppend = strWide; + s2.AppendUtf8(strUtf8); + UNIT_ASSERT(testAppend == s2); + + testAppend += ' '; + s2.AppendAscii(" "); + UNIT_ASSERT(testAppend == s2); + + testAppend += '_'; + s2.AppendUtf8("_"); + UNIT_ASSERT(testAppend == s2); + + testAppend += wtext; + s2.AppendAscii(text); + UNIT_ASSERT(testAppend == s2); + + testAppend += wtext; + s2.AppendUtf8(text); + + UNIT_ASSERT(testAppend == s2); +#endif + } + + void TestDecodingMethodsMixedStr() { + UNIT_ASSERT(TUtf32CowString::FromAscii("").empty()); + UNIT_ASSERT(TUtf32CowString::FromAscii("abc") == ASCIIToUTF32("abc")); + +#if 0 // no wide convertions support + const char* text = "123kx83abcd ej)#$%ddja&%J&"; + TUtf32CowString wtext = ASCIIToUTF32(text); + + UNIT_ASSERT(wtext == TUtf32CowString::FromAscii(text)); + + TCowString strtext(text); + UNIT_ASSERT(wtext == TUtf32CowString::FromAscii(strtext)); + + TStringBuf strbuftext(text); + UNIT_ASSERT(wtext == TUtf32CowString::FromAscii(strbuftext)); + + UNIT_ASSERT(wtext.substr(5) == TUtf32CowString::FromAscii(text + 5)); + + const wchar32 cyrilicAndLatinWide[] = { + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 
0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, + 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + wchar32('z'), + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + wchar32('z'), + 0x00}; + + TUtf32CowString strWide(cyrilicAndLatinWide); + TCowString strUtf8 = WideToUTF8(strWide); + + UNIT_ASSERT(strWide == TUtf32CowString::FromUtf8(strUtf8.c_str())); + UNIT_ASSERT(strWide == TUtf32CowString::FromUtf8(strUtf8)); + UNIT_ASSERT(strWide == UTF8ToUTF32<true>(strUtf8)); + UNIT_ASSERT(strWide == UTF8ToUTF32<false>(strUtf8)); + UNIT_ASSERT(strWide == TUtf32CowString::FromUtf8(TStringBuf(strUtf8))); + + // assign + + TUtf32CowString s1; + s1.AssignAscii("1234"); + UNIT_ASSERT(s1 == ASCIIToUTF32("1234")); + + s1.AssignUtf8(strUtf8); + UNIT_ASSERT(s1 == strWide); + + s1.AssignAscii(text); + UNIT_ASSERT(s1 == wtext); + + // append + + TUtf32CowString s2; + TUtf32CowString testAppend = strWide; + s2.AppendUtf16(UTF8ToWide(strUtf8)); + UNIT_ASSERT(testAppend == s2); + + testAppend += ' '; + s2.AppendAscii(" "); + UNIT_ASSERT(testAppend == s2); + + testAppend += '_'; + s2.AppendUtf8("_"); + UNIT_ASSERT(testAppend == s2); + + testAppend += wtext; + s2.AppendAscii(text); + UNIT_ASSERT(testAppend == s2); + + testAppend += wtext; + s2.AppendUtf8(text); + + UNIT_ASSERT(testAppend == s2); +#endif + } + + void TestLetOperator() { + TUtf32CowString str; + + str = wchar32('X'); + UNIT_ASSERT(str == TUtf32CowString::FromAscii("X")); + + const TUtf32CowString hello = TUtf32CowString::FromAscii("hello"); + str = hello.data(); + UNIT_ASSERT(str == hello); + + str = hello; + UNIT_ASSERT(str == hello); + } + + void TestStringLiterals() { + TUtf32CowString s1 = U"hello"; + UNIT_ASSERT_VALUES_EQUAL(s1, TUtf32CowString::FromAscii("hello")); + + TUtf32CowString s2 = U"привет"; + UNIT_ASSERT_VALUES_EQUAL(s2, 
TUtf32CowString::FromUtf8("привет")); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TUtf32StringTest); + +class TStringStdTest: public TTestBase, private TStringStdTestImpl<TCowString, TTestData<char>> { +public: + UNIT_TEST_SUITE(TStringStdTest); + UNIT_TEST(Constructor); + UNIT_TEST(reserve); + UNIT_TEST(short_string); + UNIT_TEST(erase); + UNIT_TEST(data); + UNIT_TEST(c_str); + UNIT_TEST(null_char_of_empty); + UNIT_TEST(null_char); + UNIT_TEST(null_char_assignment_to_subscript_of_empty); + UNIT_TEST(null_char_assignment_to_subscript_of_nonempty); + UNIT_TEST(null_char_assignment_to_end_of_empty); + UNIT_TEST(null_char_assignment_to_end_of_nonempty); + UNIT_TEST(insert); + UNIT_TEST(resize); + UNIT_TEST(find); + UNIT_TEST(capacity); + UNIT_TEST(assign); + UNIT_TEST(copy); + UNIT_TEST(cbegin_cend); + UNIT_TEST(compare); + UNIT_TEST(find_last_of); +#if 0 + UNIT_TEST(rfind); + UNIT_TEST(replace); +#endif + UNIT_TEST(find_last_not_of); + UNIT_TEST_SUITE_END(); +}; + +UNIT_TEST_SUITE_REGISTRATION(TStringStdTest); + +class TWideStringStdTest: public TTestBase, private TStringStdTestImpl<TUtf16CowString, TTestData<wchar16>> { +public: + UNIT_TEST_SUITE(TWideStringStdTest); + UNIT_TEST(Constructor); + UNIT_TEST(reserve); + UNIT_TEST(short_string); + UNIT_TEST(erase); + UNIT_TEST(data); + UNIT_TEST(c_str); + UNIT_TEST(null_char_of_empty); + UNIT_TEST(null_char); + UNIT_TEST(null_char_assignment_to_subscript_of_empty); + UNIT_TEST(null_char_assignment_to_subscript_of_nonempty); + UNIT_TEST(null_char_assignment_to_end_of_empty); + UNIT_TEST(null_char_assignment_to_end_of_nonempty); + UNIT_TEST(insert); + UNIT_TEST(resize); + UNIT_TEST(find); + UNIT_TEST(capacity); + UNIT_TEST(assign); + UNIT_TEST(copy); + UNIT_TEST(cbegin_cend); + UNIT_TEST(compare); + UNIT_TEST(find_last_of); +#if 0 + UNIT_TEST(rfind); + UNIT_TEST(replace); +#endif + UNIT_TEST(find_last_not_of); + UNIT_TEST_SUITE_END(); +}; + +UNIT_TEST_SUITE_REGISTRATION(TWideStringStdTest); + 
+Y_UNIT_TEST_SUITE(TStringConversionTest) { + Y_UNIT_TEST(ConversionToStdStringTest) { + TCowString abra = "cadabra"; + std::string stdAbra = abra; + UNIT_ASSERT_VALUES_EQUAL(stdAbra, "cadabra"); + } + + Y_UNIT_TEST(ConversionToStdStringViewTest) { + TCowString abra = "cadabra"; + std::string_view stdAbra = abra; + UNIT_ASSERT_VALUES_EQUAL(stdAbra, "cadabra"); + } +} // Y_UNIT_TEST_SUITE(TStringConversionTest) + +Y_UNIT_TEST_SUITE(HashFunctorTests) { + Y_UNIT_TEST(TestTransparency) { + THash<TCowString> h; + const char* ptr = "a"; + const TStringBuf strbuf = ptr; + const TCowString str = ptr; + const std::string stdStr = ptr; + UNIT_ASSERT_VALUES_EQUAL(h(ptr), h(strbuf)); + UNIT_ASSERT_VALUES_EQUAL(h(ptr), h(str)); + UNIT_ASSERT_VALUES_EQUAL(h(ptr), h(stdStr)); + } +} // Y_UNIT_TEST_SUITE(HashFunctorTests) + +Y_UNIT_TEST_SUITE(StdNonConformant) { + Y_UNIT_TEST(TestEraseNoThrow) { + TCowString x; + + LegacyErase(x, 10); + } + + Y_UNIT_TEST(TestReplaceNoThrow) { + TCowString x; + + LegacyReplace(x, 0, 0, "1"); + + UNIT_ASSERT_VALUES_EQUAL(x, "1"); + + LegacyReplace(x, 10, 0, "1"); + + UNIT_ASSERT_VALUES_EQUAL(x, "1"); + } + + Y_UNIT_TEST(TestNoAlias) { + TCowString s = "x"; + + s.AppendNoAlias("abc", 3); + + UNIT_ASSERT_VALUES_EQUAL(s, "xabc"); + UNIT_ASSERT_VALUES_EQUAL(TCowString(s.c_str()), "xabc"); + } +} // Y_UNIT_TEST_SUITE(StdNonConformant) + +Y_UNIT_TEST_SUITE(Interop) { + static void Mutate(std::string& s) { + s += "y"; + } + + static void Mutate(TCowString& s) { + Mutate(MutRef(s)); + } + + Y_UNIT_TEST(TestMutate) { + TCowString x = "x"; + + Mutate(x); + + UNIT_ASSERT_VALUES_EQUAL(x, "xy"); + } + + static std::string TransformStd(const std::string& s) { + return s + "y"; + } + + static TCowString Transform(const TCowString& s) { + return TransformStd(s); + } + + Y_UNIT_TEST(TestTransform) { + UNIT_ASSERT_VALUES_EQUAL(Transform(TCowString("x")), "xy"); + } + + Y_UNIT_TEST(TestTemp) { + UNIT_ASSERT_VALUES_EQUAL("x" + ConstRef(TCowString("y")), "xy"); + } + + 
static void ComparePointers(const std::string& s, const void* expected, TStringBuf descr) { + UNIT_ASSERT_VALUES_EQUAL_C(static_cast<const void*>(s.c_str()), expected, descr); + } + + Y_UNIT_TEST(TestConstShared) { + TCowString s(600, 'a'); + const void* stringStart = s.c_str(); + ComparePointers(s, stringStart, "unique"); + TCowString shared{s}; + ComparePointers(s, stringStart, "shared"); // converting a TCowString to a `const std::string&` should not cause data cloning + } +} // Y_UNIT_TEST_SUITE(Interop) + +Y_UNIT_TEST_SUITE(CowPitfalls) { + template<class T> + static TString CopyStringViaBeginEndIterators(T& string, bool reverse) { + decltype(string.begin()) b; + decltype(string.end()) e; + if (!reverse) { + b = string.begin(); + e = string.end(); + } else { + e = string.end(); + b = string.begin(); + } + return TString{b, e}; + } + + Y_UNIT_TEST(IteratorCallOrder) { + const TString ref(600, 'a'); + for (const bool reverse : {false, true}) { + TCowString s = {ref.begin(), ref.end()}; + // sanity check + UNIT_ASSERT_VALUES_EQUAL_C(CopyStringViaBeginEndIterators<const TCowString>(s, reverse), TStringBuf(ref), LabeledOutput(reverse)); + UNIT_ASSERT_VALUES_EQUAL_C(CopyStringViaBeginEndIterators<TCowString>(s, reverse), TStringBuf(ref), LabeledOutput(reverse)); + // test + TCowString copy = s; + UNIT_ASSERT_VALUES_EQUAL_C(CopyStringViaBeginEndIterators<const TCowString>(s, reverse), TStringBuf(ref), LabeledOutput(reverse)); + UNIT_ASSERT_VALUES_EQUAL_C(CopyStringViaBeginEndIterators<TCowString>(s, reverse), TStringBuf(ref), LabeledOutput(reverse)); + } + } + + Y_UNIT_TEST(RangeFor) { + TCowString str; + str.resize(200); + TCowString copy = str; + for (auto& c : str) { + c = 'x'; + } + UNIT_ASSERT_VALUES_EQUAL(str, TString(200, 'x')); + } +} // Y_UNIT_TEST_SUITE(CowPitfalls) diff --git a/library/cpp/containers/cow_string/output.cpp b/library/cpp/containers/cow_string/output.cpp new file mode 100644 index 00000000000..e0b4924ad31 --- /dev/null +++ 
b/library/cpp/containers/cow_string/output.cpp @@ -0,0 +1,46 @@ +#include "cow_string.h" + +#include <util/charset/wide.h> +#include <util/stream/input.h> +#include <util/string/cast.h> + +constexpr size_t MAX_UTF8_BYTES = 4; // UTF-8-encoded code point takes between 1 and 4 bytes + +template <typename TCharType> +static void WriteString(IOutputStream& o, const TCharType* w, size_t n) { + const size_t buflen = (n * MAX_UTF8_BYTES); // * 4 because the conversion functions can convert unicode character into maximum 4 bytes of UTF8 + TTempBuf buffer(buflen + 1); + size_t written = 0; + WideToUTF8(w, n, buffer.Data(), written); + o.Write(buffer.Data(), written); +} + +template <> +void Out<TCowString>(IOutputStream& o, const TCowString& p) { + o.Write(p.data(), p.size()); +} + +template <> +void Out<TUtf16CowString>(IOutputStream& o, const TUtf16CowString& w) { + WriteString(o, w.c_str(), w.size()); +} + +template <> +void Out<TUtf32CowString>(IOutputStream& o, const TUtf32CowString& w) { + WriteString(o, w.c_str(), w.size()); +} + +template <> +void Out<TBasicCharRef<TCowString>>(IOutputStream& o, const TBasicCharRef<TCowString>& c) { + o << static_cast<char>(c); +} + +template <> +void Out<TBasicCharRef<TUtf16CowString>>(IOutputStream& o, const TBasicCharRef<TUtf16CowString>& c) { + o << static_cast<wchar16>(c); +} + +template <> +void Out<TBasicCharRef<TUtf32CowString>>(IOutputStream& o, const TBasicCharRef<TUtf32CowString>& c) { + o << static_cast<wchar32>(c); +} diff --git a/library/cpp/containers/cow_string/reverse.cpp b/library/cpp/containers/cow_string/reverse.cpp new file mode 100644 index 00000000000..b5bd10d250a --- /dev/null +++ b/library/cpp/containers/cow_string/reverse.cpp @@ -0,0 +1,32 @@ +#include "reverse.h" + +#include <util/generic/vector.h> +#include <util/charset/wide_specific.h> + +#include <algorithm> + +void ReverseInPlace(TCowString& string) { + auto* begin = string.begin(); + std::reverse(begin, begin + string.size()); +} + +void 
ReverseInPlace(TUtf16CowString& string) { + auto* begin = string.begin(); + const auto len = string.size(); + auto* end = begin + string.size(); + + TVector<wchar16> buffer(len); + wchar16* rbegin = buffer.data() + len; + for (wchar16* p = begin; p < end;) { + const size_t symbolSize = W16SymbolSize(p, end); + rbegin -= symbolSize; + std::copy(p, p + symbolSize, rbegin); + p += symbolSize; + } + std::copy(buffer.begin(), buffer.end(), begin); +} + +void ReverseInPlace(TUtf32CowString& string) { + auto* begin = string.begin(); + std::reverse(begin, begin + string.size()); +} diff --git a/library/cpp/containers/cow_string/reverse.h b/library/cpp/containers/cow_string/reverse.h new file mode 100644 index 00000000000..d27b0b4fed6 --- /dev/null +++ b/library/cpp/containers/cow_string/reverse.h @@ -0,0 +1,16 @@ +#pragma once + +#include <library/cpp/containers/cow_string/cow_string.h> + +void ReverseInPlace(TCowString& string); + +/** NB. UTF-16 is variable-length encoding because of the surrogate pairs. + * This function takes this into account and treats a surrogate pair as a single symbol. + * Ex. 
if [C D] is a surrogate pair, + * A B [C D] E + * will become + * E [C D] B A + */ +void ReverseInPlace(TUtf16CowString& string); + +void ReverseInPlace(TUtf32CowString& string); diff --git a/library/cpp/containers/cow_string/str_stl.h b/library/cpp/containers/cow_string/str_stl.h new file mode 100644 index 00000000000..d8256a6e10b --- /dev/null +++ b/library/cpp/containers/cow_string/str_stl.h @@ -0,0 +1,67 @@ +#pragma once + +#include <util/str_stl.h> + +template <> +struct hash<TCowString>: ::NHashPrivate::TStringHash<char> { +}; + +template <> +struct hash<TUtf16CowString>: ::NHashPrivate::TStringHash<wchar16> { +}; + +template <> +struct hash<TUtf32CowString>: ::NHashPrivate::TStringHash<wchar32> { +}; + +template <> +struct TEqualTo<TCowString>: public TEqualTo<TStringBuf> { + using is_transparent = void; +}; + +template <> +struct TEqualTo<TUtf16CowString>: public TEqualTo<TWtringBuf> { + using is_transparent = void; +}; + +template <> +struct TEqualTo<TUtf32CowString>: public TEqualTo<TUtf32StringBuf> { + using is_transparent = void; +}; + +template <> +struct TCIEqualTo<TCowString> { + inline bool operator()(const TCowString& a, const TCowString& b) const { + return a.size() == b.size() && strnicmp(a.data(), b.data(), a.size()) == 0; + } +}; + +template <> +struct TLess<TCowString>: public TLess<TStringBuf> { + using is_transparent = void; +}; + +template <> +struct TLess<TUtf16CowString>: public TLess<TWtringBuf> { + using is_transparent = void; +}; + +template <> +struct TLess<TUtf32CowString>: public TLess<TUtf32StringBuf> { + using is_transparent = void; +}; + +template <> +struct TGreater<TCowString>: public TGreater<TStringBuf> { + using is_transparent = void; +}; + +template <> +struct TGreater<TUtf16CowString>: public TGreater<TWtringBuf> { + using is_transparent = void; +}; + +template <> +struct TGreater<TUtf32CowString>: public TGreater<TUtf32StringBuf> { + using is_transparent = void; +}; diff --git a/library/cpp/containers/cow_string/subst.cpp 
b/library/cpp/containers/cow_string/subst.cpp new file mode 100644 index 00000000000..d4e9ff3395d --- /dev/null +++ b/library/cpp/containers/cow_string/subst.cpp @@ -0,0 +1,182 @@ +#include "subst.h" + +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/system/compiler.h> + +#include <string> +#include <type_traits> + +// a bit of template magic (to be fast and unreadable) +template <class TStringType, class TTo, bool Main> +static Y_FORCE_INLINE void MoveBlock(typename TStringType::value_type* ptr, size_t& srcPos, size_t& dstPos, const size_t off, const TTo to, const size_t toSize) { + const size_t unchangedSize = off - srcPos; + if (dstPos < srcPos) { + for (size_t i = 0; i < unchangedSize; ++i) { + ptr[dstPos++] = ptr[srcPos++]; + } + } else { + dstPos += unchangedSize; + srcPos += unchangedSize; + } + + if (Main) { + for (size_t i = 0; i < toSize; ++i) { + ptr[dstPos++] = to[i]; + } + } +} + +template <typename T, typename U> +static bool IsIntersect(const T& a, const U& b) noexcept { + if (b.data() < a.data()) { + return IsIntersect(b, a); + } + + return !a.empty() && !b.empty() && + ((a.data() <= b.data() && b.data() < a.data() + a.size()) || + (a.data() < b.data() + b.size() && b.data() + b.size() <= a.data() + a.size())); +} + +/** + * Replaces all occurences of substring @c from in string @c s to string @c to. 
+ * Uses two separate implementations (inplace for shrink and append for grow case) + * See IGNIETFERRO-394 + **/ +template <class TStringType, typename TStringViewType = TBasicStringBuf<typename TStringType::value_type>> +static inline size_t SubstGlobalImpl(TStringType& s, const TStringViewType from, const TStringViewType to, size_t fromPos = 0) { + if (from.empty()) { + return 0; + } + + Y_ASSERT(!IsIntersect(s, from)); + Y_ASSERT(!IsIntersect(s, to)); + + const size_t fromSize = from.size(); + const size_t toSize = to.size(); + size_t replacementsCount = 0; + size_t off = fromPos; + size_t srcPos = 0; + + if (toSize > fromSize) { + // string will grow: append to another string + TStringType result; + for (; (off = TStringViewType(s).find(from, off)) != TStringType::npos; off += fromSize) { + if (!replacementsCount) { + // first replacement occured, we can prepare result string + result.reserve(s.size() + s.size() / 3); + } + result.append(s.begin() + srcPos, s.begin() + off); + result.append(to.data(), to.size()); + srcPos = off + fromSize; + ++replacementsCount; + } + if (replacementsCount) { + // append tail + result.append(s.begin() + srcPos, s.end()); + s = std::move(result); + } + return replacementsCount; + } + + // string will not grow: use inplace algo + size_t dstPos = 0; + typename TStringType::value_type* ptr = &*s.begin(); + for (; (off = TStringViewType(s).find(from, off)) != TStringType::npos; off += fromSize) { + Y_ASSERT(dstPos <= srcPos); + MoveBlock<TStringType, TStringViewType, true>(ptr, srcPos, dstPos, off, to, toSize); + srcPos = off + fromSize; + ++replacementsCount; + } + + if (replacementsCount) { + // append tail + MoveBlock<TStringType, TStringViewType, false>(ptr, srcPos, dstPos, s.size(), to, toSize); + s.resize(dstPos); + } + return replacementsCount; +} + +/// Replaces all occurences of the 'from' symbol in a string to the 'to' symbol. 
+template <class TStringType> +inline size_t SubstCharGlobalImpl(TStringType& s, typename TStringType::value_type from, typename TStringType::value_type to, size_t fromPos = 0) { + if (fromPos >= s.size()) { + return 0; + } + + size_t result = 0; + fromPos = s.find(from, fromPos); + + // s.begin() might cause memory copying, so call it only if needed + if (fromPos != TStringType::npos) { + auto* it = &*s.begin() + fromPos; + *it = to; + ++result; + // at this point string is copied and it's safe to use constant s.end() to iterate + const auto* const sEnd = &*s.end(); + // unrolled loop goes first because it is more likely that `it` will be properly aligned + for (const auto* const end = sEnd - (sEnd - it) % 4; it < end;) { + if (*it == from) { + *it = to; + ++result; + } + ++it; + if (*it == from) { + *it = to; + ++result; + } + ++it; + if (*it == from) { + *it = to; + ++result; + } + ++it; + if (*it == from) { + *it = to; + ++result; + } + ++it; + } + for (; it < sEnd; ++it) { + if (*it == from) { + *it = to; + ++result; + } + } + } + + return result; +} + +/* Standard says that `char16_t` is a distinct type and has same size, signedness and alignment as + * `std::uint_least16_t`, so we check if `char16_t` has same signedness and size as `wchar16` to be + * sure that we can make safe casts between values of these types and pointers. 
+ */ +static_assert(sizeof(wchar16) == sizeof(char16_t), ""); +static_assert(sizeof(wchar32) == sizeof(char32_t), ""); +static_assert(std::is_unsigned<wchar16>::value == std::is_unsigned<char16_t>::value, ""); +static_assert(std::is_unsigned<wchar32>::value == std::is_unsigned<char32_t>::value, ""); + +size_t SubstGlobal(TCowString& text, const TStringBuf what, const TStringBuf with, size_t from) { + return SubstGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(TUtf16CowString& text, const TWtringBuf what, const TWtringBuf with, size_t from) { + return SubstGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(TUtf32CowString& text, const TUtf32StringBuf what, const TUtf32StringBuf with, size_t from) { + return SubstGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(TCowString& text, char what, char with, size_t from) { + return SubstCharGlobalImpl(text, what, with, from); +} + +size_t SubstGlobal(TUtf16CowString& text, wchar16 what, wchar16 with, size_t from) { + return SubstCharGlobalImpl(text, (char16_t)what, (char16_t)with, from); +} + +size_t SubstGlobal(TUtf32CowString& text, wchar32 what, wchar32 with, size_t from) { + return SubstCharGlobalImpl(text, (char32_t)what, (char32_t)with, from); +} diff --git a/library/cpp/containers/cow_string/subst.h b/library/cpp/containers/cow_string/subst.h new file mode 100644 index 00000000000..6090ba54b25 --- /dev/null +++ b/library/cpp/containers/cow_string/subst.h @@ -0,0 +1,31 @@ +#pragma once + +#include <library/cpp/containers/cow_string/cow_string.h> + +#include <util/string/subst.h> + +/* Replace all occurences of substring `what` with string `with` starting from position `from`. + * + * @param text String to modify. + * @param what Substring to replace. + * @param with Substring to use as replacement. + * @param from Position at with to start replacement. + * + * @return Number of replacements occured. 
+ */ +size_t SubstGlobal(TCowString& text, TStringBuf what, TStringBuf with, size_t from = 0); +size_t SubstGlobal(TUtf16CowString& text, TWtringBuf what, TWtringBuf with, size_t from = 0); +size_t SubstGlobal(TUtf32CowString& text, TUtf32StringBuf what, TUtf32StringBuf with, size_t from = 0); + +/* Replace all occurences of character `what` with character `with` starting from position `from`. + * + * @param text String to modify. + * @param what Character to replace. + * @param with Character to use as replacement. + * @param from Position at with to start replacement. + * + * @return Number of replacements occured. + */ +size_t SubstGlobal(TCowString& text, char what, char with, size_t from = 0); +size_t SubstGlobal(TUtf16CowString& text, wchar16 what, wchar16 with, size_t from = 0); +size_t SubstGlobal(TUtf32CowString& text, wchar32 what, wchar32 with, size_t from = 0); diff --git a/library/cpp/containers/cow_string/ut/ya.make b/library/cpp/containers/cow_string/ut/ya.make new file mode 100644 index 00000000000..1a54646dc77 --- /dev/null +++ b/library/cpp/containers/cow_string/ut/ya.make @@ -0,0 +1,7 @@ +UNITTEST_FOR(library/cpp/containers/cow_string) + +SRCS( + cow_string_ut.cpp +) + +END() diff --git a/library/cpp/containers/cow_string/ut_medium/cow_string_medium_ut.cpp b/library/cpp/containers/cow_string/ut_medium/cow_string_medium_ut.cpp new file mode 100644 index 00000000000..a9a37db776d --- /dev/null +++ b/library/cpp/containers/cow_string/ut_medium/cow_string_medium_ut.cpp @@ -0,0 +1,55 @@ +#include <library/cpp/containers/cow_string/cow_string.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/strbuf.h> +#include <util/generic/yexception.h> +#include <util/stream/output.h> +#include <util/system/thread.h> + +#include <string> +#include <barrier> + +static_assert(sizeof(TCowString) == sizeof(const char*), "expect sizeof(TCowString) == sizeof(const char*)"); + +Y_UNIT_TEST_SUITE(CowPitfalls) { + Y_UNIT_TEST(ParallelDetach) { 
+ // best results with thread-sanitizer + std::vector<std::unique_ptr<TThread>> threads; + TCowString a = "the string"; + TCowString b = a; + auto makeRefToA = [&a, &b]() { + b = a; // make second reference to the same string + }; + constexpr int nThreads = 8; +#ifdef _tsan_enabled_ + constexpr i64 retries = 1'000; +#else + constexpr i64 retries = 1'000'000; +#endif + std::barrier iterationSyncPoint(nThreads, makeRefToA); + std::atomic<i64> totalLen = 0; + auto addLen = [](std::string a, std::atomic<i64>& len) { + len += a.length(); + }; + auto workload = [&a, &addLen, &totalLen, &iterationSyncPoint]() { + std::atomic<i64> len = 0; + for (i64 j = 0; j < retries; ++j) { + addLen(a, len); // possibility of bad implicit conversion + iterationSyncPoint.arrive_and_wait(); + } + totalLen += len.load(); + }; + for (int i = 0; i < nThreads; ++i) { + threads.push_back(std::make_unique<TThread>(workload)); + } + for (auto& t : threads) { + t->Start(); + } + for (auto& t : threads) { + t->Join(); + } + UNIT_ASSERT_VALUES_EQUAL(totalLen.load(), b.size() * nThreads * retries); + } + +} // Y_UNIT_TEST_SUITE(CowPitfalls) diff --git a/library/cpp/containers/cow_string/ut_medium/ya.make b/library/cpp/containers/cow_string/ut_medium/ya.make new file mode 100644 index 00000000000..f8420a8c688 --- /dev/null +++ b/library/cpp/containers/cow_string/ut_medium/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/containers/cow_string) + +SRCS( + cow_string_medium_ut.cpp +) + +SIZE(medium) + +END() diff --git a/library/cpp/containers/cow_string/ya.make b/library/cpp/containers/cow_string/ya.make new file mode 100644 index 00000000000..5df2d2de830 --- /dev/null +++ b/library/cpp/containers/cow_string/ya.make @@ -0,0 +1,15 @@ +LIBRARY() + +SRCS( + cow_string.cpp + output.cpp + reverse.cpp + subst.cpp +) + +END() + +RECURSE_FOR_TESTS( + ut + ut_medium +) diff --git a/library/cpp/yt/yson_string/string.cpp b/library/cpp/yt/yson_string/string.cpp index 45a6aa7f099..c21783b204a 100644 --- 
a/library/cpp/yt/yson_string/string.cpp +++ b/library/cpp/yt/yson_string/string.cpp @@ -91,25 +91,14 @@ TYsonString::TYsonString( : TYsonString(TYsonStringBuf(data, type)) { } -#ifdef TSTRING_IS_STD_STRING TYsonString::TYsonString( const TString& data, EYsonType type) - : TYsonString(TYsonStringBuf(data, type)) + : Payload_(TCowString(data)) + , Begin_(std::get<TCowString>(Payload_).data()) + , Size_(data.length()) + , Type_(type) { } -#else -TYsonString::TYsonString( - const TString& data, - EYsonType type) -{ - // NOTE: CoW TString implementation is assumed - // Moving the payload MUST NOT invalidate its internal pointers - Payload_ = data; - Begin_ = data.data(); - Size_ = data.length(); - Type_ = type; -} -#endif TYsonString::TYsonString( const TSharedRef& data, @@ -148,8 +137,8 @@ TString TYsonString::ToString() const [&] (const TSharedRangeHolderPtr&) { return TString(AsStringBuf()); }, - [] (const TString& payload) { - return payload; + [] (const TCowString& payload) { + return TString(payload); }); } @@ -163,7 +152,7 @@ TSharedRef TYsonString::ToSharedRef() const [&] (const TSharedRangeHolderPtr& holder) { return TSharedRef(Begin_, Size_, holder); }, - [] (const TString& payload) { + [] (const TCowString& payload) { return TSharedRef::FromString(payload); }); } diff --git a/library/cpp/yt/yson_string/string.h b/library/cpp/yt/yson_string/string.h index 8fbe415e0fd..b9f44e6dccd 100644 --- a/library/cpp/yt/yson_string/string.h +++ b/library/cpp/yt/yson_string/string.h @@ -6,6 +6,8 @@ #include <library/cpp/yt/string/format.h> +#include <library/cpp/containers/cow_string/cow_string.h> + #include <variant> namespace NYT::NYson { @@ -117,7 +119,7 @@ private: struct TNullPayload { }; - std::variant<TNullPayload, TSharedRangeHolderPtr, TString> Payload_; + std::variant<TNullPayload, TSharedRangeHolderPtr, TCowString> Payload_; const char* Begin_; ui64 Size_ : 56; diff --git a/library/cpp/yt/yson_string/ya.make b/library/cpp/yt/yson_string/ya.make index 
ba693760f10..da180de5471 100644 --- a/library/cpp/yt/yson_string/ya.make +++ b/library/cpp/yt/yson_string/ya.make @@ -8,6 +8,7 @@ SRCS( ) PEERDIR( + library/cpp/containers/cow_string library/cpp/yt/assert library/cpp/yt/coding library/cpp/yt/exception |
