diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/deprecated/split | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/deprecated/split')
-rw-r--r-- | library/cpp/deprecated/split/delim_string_iter.cpp | 45 | ||||
-rw-r--r-- | library/cpp/deprecated/split/delim_string_iter.h | 185 | ||||
-rw-r--r-- | library/cpp/deprecated/split/delim_string_iter_ut.cpp | 99 | ||||
-rw-r--r-- | library/cpp/deprecated/split/split_iterator.cpp | 318 | ||||
-rw-r--r-- | library/cpp/deprecated/split/split_iterator.h | 317 | ||||
-rw-r--r-- | library/cpp/deprecated/split/split_iterator_ut.cpp | 152 | ||||
-rw-r--r-- | library/cpp/deprecated/split/ya.make | 14 |
7 files changed, 1130 insertions, 0 deletions
diff --git a/library/cpp/deprecated/split/delim_string_iter.cpp b/library/cpp/deprecated/split/delim_string_iter.cpp new file mode 100644 index 0000000000..af418c5bfb --- /dev/null +++ b/library/cpp/deprecated/split/delim_string_iter.cpp @@ -0,0 +1,45 @@ +#include "delim_string_iter.h" + +// +// TKeyValueDelimStringIter +// + +void TKeyValueDelimStringIter::ReadKeyAndValue() { + TStringBuf currentToken(*DelimIter); + + size_t pos = currentToken.find('='); + if (pos == TString::npos) { + ChunkValue.Clear(); + ChunkKey = currentToken; + } else { + ChunkKey = currentToken.SubStr(0, pos); + ChunkValue = currentToken.SubStr(pos + 1); + } +} + +TKeyValueDelimStringIter::TKeyValueDelimStringIter(const TStringBuf str, const TStringBuf delim) + : DelimIter(str, delim) +{ + if (DelimIter.Valid()) + ReadKeyAndValue(); +} + +bool TKeyValueDelimStringIter::Valid() const { + return DelimIter.Valid(); +} + +TKeyValueDelimStringIter& TKeyValueDelimStringIter::operator++() { + ++DelimIter; + if (DelimIter.Valid()) + ReadKeyAndValue(); + + return *this; +} + +const TStringBuf& TKeyValueDelimStringIter::Key() const { + return ChunkKey; +} + +const TStringBuf& TKeyValueDelimStringIter::Value() const { + return ChunkValue; +} diff --git a/library/cpp/deprecated/split/delim_string_iter.h b/library/cpp/deprecated/split/delim_string_iter.h new file mode 100644 index 0000000000..8e4ca171a0 --- /dev/null +++ b/library/cpp/deprecated/split/delim_string_iter.h @@ -0,0 +1,185 @@ +#pragma once + +#include <util/generic/algorithm.h> +#include <util/generic/strbuf.h> +#include <util/generic/yexception.h> +#include <util/string/cast.h> +#include <util/system/yassert.h> + +#include <iterator> + +class TDelimStringIter { +public: + using value_type = TStringBuf; + using difference_type = ptrdiff_t; + using pointer = const TStringBuf*; + using reference = const TStringBuf&; + using iterator_category = std::forward_iterator_tag; + + inline TDelimStringIter(const char* begin, const char* strEnd, TStringBuf delim) + : TDelimStringIter(TStringBuf(begin, strEnd), delim) + { + } + + inline TDelimStringIter(TStringBuf str, TStringBuf delim) + : IsValid(true) + , Str(str) + , Delim(delim) + { + UpdateCurrent(); + } + + inline TDelimStringIter() + : IsValid(false) + { + } + + inline explicit operator bool() const { + return IsValid; + } + + // NOTE: this is a potentially unsafe operation (no overrun check) + inline TDelimStringIter& operator++() { + if (Current.end() != Str.end()) { + Str.Skip(Current.length() + Delim.length()); + UpdateCurrent(); + } else { + Str.Clear(); + Current.Clear(); + IsValid = false; + } + return *this; + } + + inline void operator+=(size_t n) { + for (; n > 0; --n) { + ++(*this); + } + } + + inline bool operator==(const TDelimStringIter& rhs) const { + return (IsValid == rhs.IsValid) && (!IsValid || (Current.begin() == rhs.Current.begin())); + } + + inline bool operator!=(const TDelimStringIter& rhs) const { + return !(*this == rhs); + } + + inline TStringBuf operator*() const { + return Current; + } + + inline const TStringBuf* operator->() const { + return &Current; + } + + // Get & advance + template <class T> + inline bool TryNext(T& t) { + if (IsValid) { + t = FromString<T>(Current); + operator++(); + return true; + } else { + return false; + } + } + + template <class T> + inline TDelimStringIter& Next(T& t) // Get & advance + { + if (!TryNext(t)) + ythrow yexception() << "No valid field"; + return *this; + } + + template <class T> + inline T GetNext() { + T res; + Next(res); + return res; + } + + inline const char* GetBegin() const { + return Current.begin(); + } + + inline const char* GetEnd() const { + return Current.end(); + } + + inline bool Valid() const { + return IsValid; + } + + // contents from next token to the end of string + inline TStringBuf Cdr() const { + return Str.SubStr(Current.length() + Delim.length()); + } + + inline TDelimStringIter IterEnd() const { + return TDelimStringIter(); + } + +private: + inline void UpdateCurrent() { + // it is much faster than TStringBuf::find + size_t pos = std::search(Str.begin(), Str.end(), Delim.begin(), Delim.end()) - Str.begin(); + Current = Str.Head(pos); + } + +private: + bool IsValid; + + TStringBuf Str; + TStringBuf Current; + TStringBuf Delim; +}; + +//example: for (TStringBuf field: TDelimStroka(line, "@@")) { ... } +struct TDelimStroka { + TStringBuf S; + TStringBuf Delim; + + inline TDelimStroka(TStringBuf s, TStringBuf delim) + : S(s) + , Delim(delim) + { + } + + inline TDelimStringIter begin() const { + return TDelimStringIter(S, Delim); + } + + inline TDelimStringIter end() const { + return TDelimStringIter(); + } +}; + +inline TDelimStringIter begin_delim(const TString& str, TStringBuf delim) { + return TDelimStringIter(str, delim); +} + +inline TDelimStringIter begin_delim(TStringBuf str, TStringBuf delim) { + return TDelimStringIter(str.begin(), str.end(), delim); +} + +inline TDelimStringIter end_delim(const TString& /*str*/, TStringBuf /*delim*/) { + return TDelimStringIter(); +} + +class TKeyValueDelimStringIter { +public: + TKeyValueDelimStringIter(const TStringBuf str, const TStringBuf delim); + bool Valid() const; + TKeyValueDelimStringIter& operator++(); + const TStringBuf& Key() const; + const TStringBuf& Value() const; + +private: + TDelimStringIter DelimIter; + TStringBuf ChunkKey, ChunkValue; + +private: + void ReadKeyAndValue(); +}; diff --git a/library/cpp/deprecated/split/delim_string_iter_ut.cpp b/library/cpp/deprecated/split/delim_string_iter_ut.cpp new file mode 100644 index 0000000000..18a8b2a160 --- /dev/null +++ b/library/cpp/deprecated/split/delim_string_iter_ut.cpp @@ -0,0 +1,99 @@ +#include "delim_string_iter.h" +#include <util/generic/vector.h> +#include <library/cpp/testing/unittest/registar.h> + +/// Test that TDelimStringIter build on top of given string and delimeter will produce expected sequence +static void AssertStringSplit(const TString& str, const TString& delim, const TVector<TString>& expected) { + TDelimStringIter it(str, delim); + + // test iterator invariants + for (const auto& expectedString : expected) { + UNIT_ASSERT(it.Valid()); + UNIT_ASSERT(bool(it)); + UNIT_ASSERT_STRINGS_EQUAL(it->ToString(), expectedString); + ++it; + } + UNIT_ASSERT(!it.Valid()); +}; + +Y_UNIT_TEST_SUITE(TDelimStrokaIterTestSuite) { + Y_UNIT_TEST(SingleCharacterAsDelimiter) { + AssertStringSplit( + "Hello words!", " ", {"Hello", "words!"}); + } + + Y_UNIT_TEST(MultipleCharactersAsDelimiter) { + AssertStringSplit( + "0, 1, 1, 2, 3, 5, 8, 13, 21, 34", "1, ", {"0, ", "", "2, 3, 5, 8, 13, 2", "34"}); + } + + Y_UNIT_TEST(NoDelimitersPresent) { + AssertStringSplit("This string could be yours", "\t", {"This string could be yours"}); + } + + Y_UNIT_TEST(Cdr) { + TDelimStringIter it("a\tc\t", "\t"); + UNIT_ASSERT_STRINGS_EQUAL(*it, "a"); + UNIT_ASSERT_STRINGS_EQUAL(it.Cdr(), "c\t"); + ++it; + UNIT_ASSERT_STRINGS_EQUAL(it.Cdr(), ""); + } + + Y_UNIT_TEST(ForIter) { + TVector<TStringBuf> expected = {"1", "", "3@4", ""}; + TVector<TStringBuf> got; + + for (TStringBuf x : TDelimStroka("1@@@@3@4@@", "@@")) { + got.push_back(x); + } + + UNIT_ASSERT_EQUAL(got, expected); + } +} + +static void AssertKeyValueStringSplit( + const TStringBuf str, + const TStringBuf delim, + const TVector<std::pair<TStringBuf, TStringBuf>>& expected) { + TKeyValueDelimStringIter it(str, delim); + + for (const auto& expectedKeyValue : expected) { + UNIT_ASSERT(it.Valid()); + UNIT_ASSERT_STRINGS_EQUAL(it.Key(), expectedKeyValue.first); + UNIT_ASSERT_STRINGS_EQUAL(it.Value(), expectedKeyValue.second); + ++it; + } + UNIT_ASSERT(!it.Valid()); +} + +Y_UNIT_TEST_SUITE(TKeyValueDelimStringIterTestSuite) { + Y_UNIT_TEST(SingleCharacterAsDelimiter) { + AssertKeyValueStringSplit( + "abc=123,cde=qwer", ",", + {{"abc", "123"}, + {"cde", "qwer"}}); + } + + Y_UNIT_TEST(MultipleCharactersAsDelimiter) { + AssertKeyValueStringSplit( + "abc=xyz@@qwerty=zxcv", "@@", + {{"abc", "xyz"}, + {"qwerty", "zxcv"}}); + } + + Y_UNIT_TEST(NoDelimiters) { + AssertKeyValueStringSplit( + "abc=zz", ",", + {{"abc", "zz"}}); + } + + Y_UNIT_TEST(EmptyElements) { + AssertKeyValueStringSplit( + "@@abc=zxy@@@@qwerty=y@@", "@@", + {{"", ""}, + {"abc", "zxy"}, + {"", ""}, + {"qwerty", "y"}, + {"", ""}}); + } +} diff --git a/library/cpp/deprecated/split/split_iterator.cpp b/library/cpp/deprecated/split/split_iterator.cpp new file mode 100644 index 0000000000..32262d25bd --- /dev/null +++ b/library/cpp/deprecated/split/split_iterator.cpp @@ -0,0 +1,318 @@ +#include "split_iterator.h" + +#include <util/system/yassert.h> + +#include <cctype> +#include <cstring> +#include <cstdlib> + +/****************** TSplitDelimiters2 ******************/ + +TSplitDelimiters::TSplitDelimiters(const char* s) { + memset(Delims, 0, sizeof(Delims)); + while (*s) + Delims[(ui8) * (s++)] = true; +} + +/****************** TSplitBase ******************/ +TSplitBase::TSplitBase(const char* str, size_t length) + : Str(str) + , Len(length) +{ +} + +TSplitBase::TSplitBase(const TString& s) + : Str(s.data()) + , Len(s.size()) +{ +} + +/****************** TDelimitersSplit ******************/ + +TDelimitersSplit::TDelimitersSplit(const char* str, size_t length, const TSplitDelimiters& delimiters) + : TSplitBase(str, length) + , Delimiters(delimiters) +{ +} + +TDelimitersSplit::TDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters) + : TSplitBase(s) + , Delimiters(delimiters) +{ +} + +size_t TDelimitersSplit::Begin() const { + size_t pos = 0; + while ((pos < Len) && Delimiters.IsDelimiter(Str[pos])) + ++pos; + return pos; +} + +TSizeTRegion TDelimitersSplit::Next(size_t& pos) const { + size_t begin = pos; + while ((pos < Len) && !Delimiters.IsDelimiter(Str[pos])) + ++pos; + TSizeTRegion result(begin, pos); + + while ((pos < Len) && Delimiters.IsDelimiter(Str[pos])) + ++pos; + + return result; +} + +TDelimitersSplit::TIterator TDelimitersSplit::Iterator() const { + return TIterator(*this); +} + +/****************** TDelimitersStrictSplit ******************/ + +TDelimitersStrictSplit::TDelimitersStrictSplit(const char* str, size_t length, const TSplitDelimiters& delimiters) + : TSplitBase(str, length) + , Delimiters(delimiters) +{ +} + +TDelimitersStrictSplit::TDelimitersStrictSplit(const TString& s, const TSplitDelimiters& delimiters) + : TSplitBase(s) + , Delimiters(delimiters) +{ +} + +TDelimitersStrictSplit::TIterator TDelimitersStrictSplit::Iterator() const { + return TIterator(*this); +} + +TSizeTRegion TDelimitersStrictSplit::Next(size_t& pos) const { + size_t begin = pos; + while ((pos < Len) && !Delimiters.IsDelimiter(Str[pos])) + ++pos; + TSizeTRegion result(begin, pos); + + if (pos < Len) + ++pos; + + return result; +} + +size_t TDelimitersStrictSplit::Begin() const { + return 0; +} + +/****************** TScreenedDelimitersSplit ******************/ + +TScreenedDelimitersSplit::TScreenedDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens) + : TSplitBase(s) + , Delimiters(delimiters) + , Screens(screens) +{ +} + +TScreenedDelimitersSplit::TScreenedDelimitersSplit(const char* str, size_t length, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens) + : TSplitBase(str, length) + , Delimiters(delimiters) + , Screens(screens) +{ +} + +TScreenedDelimitersSplit::TIterator TScreenedDelimitersSplit::Iterator() const { + return TIterator(*this); +} + +TSizeTRegion TScreenedDelimitersSplit::Next(size_t& pos) const { + size_t begin = pos; + bool screened = false; + while (pos < Len) { + if (Screens.IsDelimiter(Str[pos])) + screened = !screened; + if (Delimiters.IsDelimiter(Str[pos]) && !screened) + break; + ++pos; + } + TSizeTRegion result(begin, pos); + + if (pos < Len) + ++pos; + + return result; +} + +size_t TScreenedDelimitersSplit::Begin() const { + return 0; +} + +/****************** TDelimitersSplitWithoutTags ******************/ + +TDelimitersSplitWithoutTags::TDelimitersSplitWithoutTags(const char* str, size_t length, const TSplitDelimiters& delimiters) + : TSplitBase(str, length) + , Delimiters(delimiters) +{ +} + +TDelimitersSplitWithoutTags::TDelimitersSplitWithoutTags(const TString& s, const TSplitDelimiters& delimiters) + : TSplitBase(s) + , Delimiters(delimiters) +{ +} + +size_t TDelimitersSplitWithoutTags::SkipTag(size_t pos) const { + Y_ASSERT('<' == Str[pos]); + while ((pos < Len) && ('>' != Str[pos])) + ++pos; + return pos + 1; +} + +size_t TDelimitersSplitWithoutTags::SkipDelimiters(size_t pos) const { + while (true) { + while ((pos < Len) && Delimiters.IsDelimiter(Str[pos]) && ('<' != Str[pos])) + ++pos; + if (pos < Len) { + if ('<' != Str[pos]) + break; + else + pos = SkipTag(pos); + } else + break; + } + return pos; +} + +size_t TDelimitersSplitWithoutTags::Begin() const { + size_t pos = 0; + pos = SkipDelimiters(pos); + return pos; +} + +TSizeTRegion TDelimitersSplitWithoutTags::Next(size_t& pos) const { + size_t begin = pos; + while ((pos < Len) && !Delimiters.IsDelimiter(Str[pos]) && ('<' != Str[pos])) + ++pos; + TSizeTRegion result(begin, pos); + + pos = SkipDelimiters(pos); + + return result; +} + +TDelimitersSplitWithoutTags::TIterator TDelimitersSplitWithoutTags::Iterator() const { + return TIterator(*this); +} + +/****************** TCharSplit ******************/ + +TCharSplit::TCharSplit(const char* str, size_t length) + : TSplitBase(str, length) +{ +} + +TCharSplit::TCharSplit(const TString& s) + : TSplitBase(s) +{ +} + +TCharSplit::TIterator TCharSplit::Iterator() const { + return TIterator(*this); +} + +TSizeTRegion TCharSplit::Next(size_t& pos) const { + TSizeTRegion result(pos, pos + 1); + ++pos; + return result; +} + +size_t TCharSplit::Begin() const { + return 0; +} + +/****************** TCharSplitWithoutTags ******************/ + +TCharSplitWithoutTags::TCharSplitWithoutTags(const char* str, size_t length) + : TSplitBase(str, length) +{ +} + +TCharSplitWithoutTags::TCharSplitWithoutTags(const TString& s) + : TSplitBase(s) +{ +} + +size_t TCharSplitWithoutTags::SkipTag(size_t pos) const { + Y_ASSERT('<' == Str[pos]); + while ((pos < Len) && ('>' != Str[pos])) + ++pos; + return pos + 1; +} + +size_t TCharSplitWithoutTags::SkipDelimiters(size_t pos) const { + while (true) { + if (pos < Len) { + if ('<' != Str[pos]) + break; + else + pos = SkipTag(pos); + } else + break; + } + return pos; +} + +size_t TCharSplitWithoutTags::Begin() const { + size_t pos = 0; + pos = SkipDelimiters(pos); + return pos; +} + +TSizeTRegion TCharSplitWithoutTags::Next(size_t& pos) const { + size_t begin = pos++; + TSizeTRegion result(begin, pos); + + pos = SkipDelimiters(pos); + + return result; +} + +TCharSplitWithoutTags::TIterator TCharSplitWithoutTags::Iterator() const { + return TIterator(*this); +} + +TSubstringSplitDelimiter::TSubstringSplitDelimiter(const TString& s) + : Matcher(s) + , Len(s.size()) +{ +} + +/****************** TSubstringSplit ******************/ + +TSubstringSplit::TSubstringSplit(const char* str, size_t length, const TSubstringSplitDelimiter& delimiter) + : TSplitBase(str, length) + , Delimiter(delimiter) +{ +} + +TSubstringSplit::TSubstringSplit(const TString& str, const TSubstringSplitDelimiter& delimiter) + : TSplitBase(str) + , Delimiter(delimiter) +{ +} + +TSubstringSplit::TIterator TSubstringSplit::Iterator() const { + return TIterator(*this); +} + +TSizeTRegion TSubstringSplit::Next(size_t& pos) const { + const char* begin = Str + pos; + const char* end = Str + Len; + const char* delim; + if (Delimiter.Matcher.SubStr(begin, end, delim)) { + TSizeTRegion result(pos, delim - begin + pos); + pos += delim - begin + Delimiter.Len; + return result; + } else { + TSizeTRegion result(pos, end - begin + pos); + pos += end - begin; + return result; + } +} + +size_t TSubstringSplit::Begin() const { + return 0; +} diff --git a/library/cpp/deprecated/split/split_iterator.h b/library/cpp/deprecated/split/split_iterator.h new file mode 100644 index 0000000000..0eacc29228 --- /dev/null +++ b/library/cpp/deprecated/split/split_iterator.h @@ -0,0 +1,317 @@ +#pragma once + +#include <library/cpp/deprecated/kmp/kmp.h> +#include <util/string/cast.h> +#include <util/string/util.h> +#include <util/string/builder.h> + +#include <util/system/yassert.h> +#include <util/system/defaults.h> +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> + +#include <cstdio> + +template <typename T> +struct TNumPair { + T Begin; + T End; + + TNumPair() = default; + + TNumPair(T begin, T end) + : Begin(begin) + , End(end) + { + Y_ASSERT(begin <= end); + } + + T Length() const { + return End - Begin + 1; + } + + bool operator==(const TNumPair& r) const { + return (Begin == r.Begin) && (End == r.End); + } + + bool operator!=(const TNumPair& r) const { + return (Begin != r.Begin) || (End != r.End); + } +}; + +using TSizeTRegion = TNumPair<size_t>; +using TUi32Region = TNumPair<ui32>; + +template <> +inline TString ToString(const TUi32Region& r) { + return TStringBuilder() << "(" << r.Begin << ", " << r.End << ")"; +} + +template <> +inline TUi32Region FromString(const TString& s) { + TUi32Region result; + sscanf(s.data(), "(%" PRIu32 ", %" PRIu32 ")", &result.Begin, &result.End); + return result; +} + +class TSplitDelimiters { +private: + bool Delims[256]; + +public: + explicit TSplitDelimiters(const char* s); + + Y_FORCE_INLINE bool IsDelimiter(ui8 ch) const { + return Delims[ch]; + } +}; + +template <class Split> +class TSplitIterator; + +class TSplitBase { +protected: + const char* Str; + size_t Len; + +public: + TSplitBase(const char* str, size_t length); + TSplitBase(const TString& s); + + Y_FORCE_INLINE const char* GetString() const { + return Str; + } + + Y_FORCE_INLINE size_t GetLength() const { + return Len; + } + +private: + // we don't own Str, make sure that no one calls us with temporary object + TSplitBase(TString&&) = delete; +}; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4512) +#endif + +class TDelimitersSplit: public TSplitBase { +private: + const TSplitDelimiters& Delimiters; + +public: + using TIterator = TSplitIterator<TDelimitersSplit>; + friend class TSplitIterator<TDelimitersSplit>; + + TDelimitersSplit(const char* str, size_t length, const TSplitDelimiters& delimiters); + TDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters); + TIterator Iterator() const; + TSizeTRegion Next(size_t& pos) const; + size_t Begin() const; + +private: + // we don't own Delimiters, make sure that no one calls us with temporary object + TDelimitersSplit(const char*, size_t, TSplitDelimiters&&) = delete; + TDelimitersSplit(const TString&, TSplitDelimiters&&) = delete; + TDelimitersSplit(TString&&, const TSplitDelimiters&) = delete; +}; + +class TDelimitersStrictSplit: public TSplitBase { +private: + const TSplitDelimiters& Delimiters; + +public: + using TIterator = TSplitIterator<TDelimitersStrictSplit>; + friend class TSplitIterator<TDelimitersStrictSplit>; + + TDelimitersStrictSplit(const char* str, size_t length, const TSplitDelimiters& delimiters); + TDelimitersStrictSplit(const TString& s, const TSplitDelimiters& delimiters); + TIterator Iterator() const; + TSizeTRegion Next(size_t& pos) const; + size_t Begin() const; + +private: + // we don't own Delimiters, make sure that no one calls us with temporary object + TDelimitersStrictSplit(const char*, size_t, TSplitDelimiters&&) = delete; + TDelimitersStrictSplit(const TString&, TSplitDelimiters&&) = delete; + TDelimitersStrictSplit(TString&&, const TSplitDelimiters&) = delete; +}; + +class TScreenedDelimitersSplit: public TSplitBase { +private: + const TSplitDelimiters& Delimiters; + const TSplitDelimiters& Screens; + +public: + using TIterator = TSplitIterator<TScreenedDelimitersSplit>; + friend class TSplitIterator<TScreenedDelimitersSplit>; + + TScreenedDelimitersSplit(const char*, size_t, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens); + TScreenedDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens); + TIterator Iterator() const; + TSizeTRegion Next(size_t& pos) const; + size_t Begin() const; + +private: + // we don't own Delimiters and Screens, make sure that no one calls us with temporary object + TScreenedDelimitersSplit(TString&&, const TSplitDelimiters&, const TSplitDelimiters&) = delete; + TScreenedDelimitersSplit(const TString&, TSplitDelimiters&&, const TSplitDelimiters&) = delete; + TScreenedDelimitersSplit(const TString&, const TSplitDelimiters&, TSplitDelimiters&&) = delete; +}; + +class TDelimitersSplitWithoutTags: public TSplitBase { +private: + const TSplitDelimiters& Delimiters; + size_t SkipTag(size_t pos) const; + size_t SkipDelimiters(size_t pos) const; + +public: + using TIterator = TSplitIterator<TDelimitersSplitWithoutTags>; + friend class TSplitIterator<TDelimitersSplitWithoutTags>; + + TDelimitersSplitWithoutTags(const char* str, size_t length, const TSplitDelimiters& delimiters); + TDelimitersSplitWithoutTags(const TString& s, const TSplitDelimiters& delimiters); + TIterator Iterator() const; + TSizeTRegion Next(size_t& pos) const; + size_t Begin() const; + +private: + // we don't own Delimiters, make sure that no one calls us with temporary object + TDelimitersSplitWithoutTags(const char*, size_t, TSplitDelimiters&&) = delete; + TDelimitersSplitWithoutTags(const TString&, TSplitDelimiters&&) = delete; + TDelimitersSplitWithoutTags(TString&&, const TSplitDelimiters&) = delete; +}; + +class TCharSplit: public TSplitBase { +public: + using TIterator = TSplitIterator<TCharSplit>; + friend class TSplitIterator<TCharSplit>; + + TCharSplit(const char* str, size_t length); + TCharSplit(const TString& s); + TIterator Iterator() const; + TSizeTRegion Next(size_t& pos) const; + size_t Begin() const; + +private: + // we don't own Str, make sure that no one calls us with temporary object + TCharSplit(TString&&) = delete; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +class TCharSplitWithoutTags: public TSplitBase { +private: + size_t SkipTag(size_t pos) const; + size_t SkipDelimiters(size_t pos) const; + +public: + using TIterator = TSplitIterator<TCharSplitWithoutTags>; + friend class TSplitIterator<TCharSplitWithoutTags>; + + TCharSplitWithoutTags(const char* str, size_t length); + TCharSplitWithoutTags(const TString& s); + TIterator Iterator() const; + TSizeTRegion Next(size_t& pos) const; + size_t Begin() const; + +private: + // we don't own Str, make sure that no one calls us with temporary object + TCharSplitWithoutTags(TString&&) = delete; +}; + +class TSubstringSplitDelimiter { +public: + TKMPMatcher Matcher; + size_t Len; + + TSubstringSplitDelimiter(const TString& s); +}; + +class TSubstringSplit: public TSplitBase { +private: + const TSubstringSplitDelimiter& Delimiter; + +public: + using TIterator = TSplitIterator<TSubstringSplit>; + friend class TSplitIterator<TSubstringSplit>; + + TSubstringSplit(const char* str, size_t length, const TSubstringSplitDelimiter& delimiter); + TSubstringSplit(const TString& str, const TSubstringSplitDelimiter& delimiter); + TIterator Iterator() const; + TSizeTRegion Next(size_t& pos) const; + size_t Begin() const; + +private: + // we don't own Delimiters, make sure that no one calls us with temporary object + TSubstringSplit(TString&&, const TSubstringSplitDelimiter&) = delete; + TSubstringSplit(const TString&, TSubstringSplitDelimiter&&) = delete; +}; + +template <class TSplit> +class TSplitIterator { +protected: + const TSplit& Split; + size_t Pos; + TString* CurrentStroka; + +public: + TSplitIterator(const TSplit& split) + : Split(split) + , Pos(Split.Begin()) + , CurrentStroka(nullptr) + { + } + + virtual ~TSplitIterator() { + delete CurrentStroka; + } + + inline TSizeTRegion Next() { + Y_ENSURE(!Eof(), TStringBuf("eof reached")); + return Split.Next(Pos); + } + + TStringBuf NextTok() { + if (Eof()) + return TStringBuf(); + TSizeTRegion region = Next(); + return TStringBuf(Split.Str + region.Begin, region.End - region.Begin); + } + + const TString& NextString() { + if (!CurrentStroka) + CurrentStroka = new TString(); + TSizeTRegion region = Next(); + CurrentStroka->assign(Split.Str, region.Begin, region.Length() - 1); + return *CurrentStroka; + } + + inline bool Eof() const { + return Pos >= Split.Len; + } + + TString GetTail() const { + return TString(Split.Str + Pos); + } + + void Skip(size_t count) { + for (size_t i = 0; i < count; ++i) + Next(); + } +}; + +using TSplitTokens = TVector<TString>; + +template <typename TSplit> +void Split(const TSplit& split, TSplitTokens* words) { + words->clear(); + TSplitIterator<TSplit> it(split); + while (!it.Eof()) + words->push_back(it.NextString()); +} diff --git a/library/cpp/deprecated/split/split_iterator_ut.cpp b/library/cpp/deprecated/split/split_iterator_ut.cpp new file mode 100644 index 0000000000..be5069c4be --- /dev/null +++ b/library/cpp/deprecated/split/split_iterator_ut.cpp @@ -0,0 +1,152 @@ +#include "split_iterator.h" + +#include <library/cpp/testing/unittest/registar.h> + +class TSplitIteratorTest: public TTestBase { + UNIT_TEST_SUITE(TSplitIteratorTest); + UNIT_TEST(TestDelimiters); + UNIT_TEST(TestDelimitersSplit); + UNIT_TEST(TestDelimitersStrictSplit); + UNIT_TEST(TestTail); + UNIT_TEST(TestScreenedDelimitersSplit); + UNIT_TEST(TestSubstringDelimiter); + UNIT_TEST_SUITE_END(); + +public: + void TestDelimiters(); + void TestDelimitersSplit(); + void TestDelimitersStrictSplit(); + void TestTail(); + void TestScreenedDelimitersSplit(); + void TestSubstringDelimiter(); +}; + +void TSplitIteratorTest::TestDelimiters() { + TSplitDelimiters delims("@"); + for (int i = 0; i < 256; ++i) + if ('@' != i) { + UNIT_ASSERT(!delims.IsDelimiter((ui8)i)); + } else { + UNIT_ASSERT(delims.IsDelimiter((ui8)i)); + } +} + +void TSplitIteratorTest::TestDelimitersSplit() { + { + TString s = "1a3b45cd"; + TSplitDelimiters delims("abcd"); + TDelimitersSplit split(s, delims); + TSplitTokens tokens; + Split(split, &tokens); + TSplitTokens pattern = {"1", "3", "45"}; + UNIT_ASSERT(tokens == pattern); + } + { + TString s = "aaaaaa"; + TSplitDelimiters delims("abcd"); + TDelimitersSplit split(s, delims); + TSplitTokens tokens; + Split(split, &tokens); + TSplitTokens pattern = {}; + UNIT_ASSERT(tokens == pattern); + } +} + +void TSplitIteratorTest::TestDelimitersStrictSplit() { + { + TString s = "grp@2"; + TSplitDelimiters delims("@"); + TDelimitersStrictSplit split(s, delims); + TSplitTokens tokens; + Split(split, &tokens); + TSplitTokens pattern = {"grp", "2"}; + UNIT_ASSERT(tokens == pattern); + } + + { + TString s = "@grp@2@@"; + TSplitDelimiters delims("@"); + TDelimitersStrictSplit split(s, delims); + TSplitTokens tokens; + Split(split, &tokens); + TSplitTokens pattern = {"", "grp", "2", ""}; + UNIT_ASSERT(tokens == pattern); + } +} + +void TSplitIteratorTest::TestTail() { + TString s = "grp@2@4"; + TSplitDelimiters delims("@"); + TDelimitersSplit split(s, delims); + TDelimitersSplit::TIterator it = split.Iterator(); + UNIT_ASSERT_EQUAL(it.GetTail(), "grp@2@4"); + it.Next(); + UNIT_ASSERT_EQUAL(it.GetTail(), "2@4"); + it.Next(); + UNIT_ASSERT_EQUAL(it.GetTail(), "4"); + it.Next(); + UNIT_ASSERT_EQUAL(it.GetTail(), ""); +} + +void TSplitIteratorTest::TestScreenedDelimitersSplit() { + { + const TString s = "77.88.58.91 - - [28/Aug/2008:00:08:07 +0400] \"GET /export/mordashka.tgz HTTP/1.1\" 304 - \"-\" \"libwww-perl/5.805\" \"news.yandex.ru,80\" \"-\" \"-\" 1219867687 \"0\" 3283 2"; + const TSplitDelimiters delims(" "); + const TSplitDelimiters screens("\"[]"); + const TScreenedDelimitersSplit splitter(s, delims, screens); + TScreenedDelimitersSplit::TIterator it = splitter.Iterator(); + UNIT_ASSERT_EQUAL(it.NextString(), "77.88.58.91"); + UNIT_ASSERT_EQUAL(it.NextString(), "-"); + UNIT_ASSERT_EQUAL(it.NextString(), "-"); + UNIT_ASSERT_EQUAL(it.NextString(), "[28/Aug/2008:00:08:07 +0400]"); + UNIT_ASSERT_EQUAL(it.NextString(), "\"GET /export/mordashka.tgz HTTP/1.1\""); + UNIT_ASSERT_EQUAL(it.NextString(), "304"); + UNIT_ASSERT_EQUAL(it.NextString(), "-"); + UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); + UNIT_ASSERT_EQUAL(it.NextString(), "\"libwww-perl/5.805\""); + UNIT_ASSERT_EQUAL(it.NextString(), "\"news.yandex.ru,80\""); + UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); + UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); + UNIT_ASSERT_EQUAL(it.NextString(), "1219867687"); + UNIT_ASSERT_EQUAL(it.NextString(), "\"0\""); + UNIT_ASSERT_EQUAL(it.NextString(), "3283"); + UNIT_ASSERT_EQUAL(it.NextString(), "2"); + } + { + const TString s = "77.88.58.91 - - [28/Aug/2008:00:08:07 +0400] \"GET /export/mordashka.tgz HTTP/1.1\" 304 - \"-\" \"libwww-perl/5.805\" \"news.yandex.ru,80\" \"-\" \"-\" 1219867687 \"0\" 3283 2"; + const TSplitDelimiters delims(" "); + const TSplitDelimiters screens("\"[]"); + const TScreenedDelimitersSplit splitter(s.Data(), s.Size(), delims, screens); + TScreenedDelimitersSplit::TIterator it = splitter.Iterator(); + UNIT_ASSERT_EQUAL(it.NextString(), "77.88.58.91"); + UNIT_ASSERT_EQUAL(it.NextString(), "-"); + UNIT_ASSERT_EQUAL(it.NextString(), "-"); + UNIT_ASSERT_EQUAL(it.NextString(), "[28/Aug/2008:00:08:07 +0400]"); + UNIT_ASSERT_EQUAL(it.NextString(), "\"GET /export/mordashka.tgz HTTP/1.1\""); + UNIT_ASSERT_EQUAL(it.NextString(), "304"); + UNIT_ASSERT_EQUAL(it.NextString(), "-"); + UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); + UNIT_ASSERT_EQUAL(it.NextString(), "\"libwww-perl/5.805\""); + UNIT_ASSERT_EQUAL(it.NextString(), "\"news.yandex.ru,80\""); + UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); + UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); + UNIT_ASSERT_EQUAL(it.NextString(), "1219867687"); + UNIT_ASSERT_EQUAL(it.NextString(), "\"0\""); + UNIT_ASSERT_EQUAL(it.NextString(), "3283"); + UNIT_ASSERT_EQUAL(it.NextString(), "2"); + } +} + +void TSplitIteratorTest::TestSubstringDelimiter() { + const TString s = "a@@bb@@cc@c.d@@r"; + static const TSubstringSplitDelimiter delimiter("@@"); + const TSubstringSplit splitter(s, delimiter); + TSubstringSplit::TIterator it = splitter.Iterator(); + UNIT_ASSERT_EQUAL(it.NextString(), "a"); + UNIT_ASSERT_EQUAL(it.NextString(), "bb"); + UNIT_ASSERT_EQUAL(it.NextString(), "cc@c.d"); + UNIT_ASSERT_EQUAL(it.NextString(), "r"); + UNIT_ASSERT(it.Eof()); +} + +UNIT_TEST_SUITE_REGISTRATION(TSplitIteratorTest); diff --git a/library/cpp/deprecated/split/ya.make b/library/cpp/deprecated/split/ya.make new file mode 100644 index 0000000000..946e685ac8 --- /dev/null +++ b/library/cpp/deprecated/split/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +OWNER(wrg0ababd) + +SRCS( + delim_string_iter.cpp + split_iterator.cpp +) + +PEERDIR( + library/cpp/deprecated/kmp +) + +END() |