diff options
| author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
|---|---|---|
| committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
| commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
| tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/deprecated/split | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/deprecated/split')
| -rw-r--r-- | library/cpp/deprecated/split/delim_string_iter.cpp | 45 | ||||
| -rw-r--r-- | library/cpp/deprecated/split/delim_string_iter.h | 185 | ||||
| -rw-r--r-- | library/cpp/deprecated/split/delim_string_iter_ut.cpp | 99 | ||||
| -rw-r--r-- | library/cpp/deprecated/split/split_iterator.cpp | 318 | ||||
| -rw-r--r-- | library/cpp/deprecated/split/split_iterator.h | 317 | ||||
| -rw-r--r-- | library/cpp/deprecated/split/split_iterator_ut.cpp | 152 | ||||
| -rw-r--r-- | library/cpp/deprecated/split/ya.make | 14 | 
7 files changed, 1130 insertions, 0 deletions
diff --git a/library/cpp/deprecated/split/delim_string_iter.cpp b/library/cpp/deprecated/split/delim_string_iter.cpp new file mode 100644 index 00000000000..af418c5bfb9 --- /dev/null +++ b/library/cpp/deprecated/split/delim_string_iter.cpp @@ -0,0 +1,45 @@ +#include "delim_string_iter.h" + +// +// TKeyValueDelimStringIter +// + +void TKeyValueDelimStringIter::ReadKeyAndValue() { +    TStringBuf currentToken(*DelimIter); + +    size_t pos = currentToken.find('='); +    if (pos == TString::npos) { +        ChunkValue.Clear(); +        ChunkKey = currentToken; +    } else { +        ChunkKey = currentToken.SubStr(0, pos); +        ChunkValue = currentToken.SubStr(pos + 1); +    } +} + +TKeyValueDelimStringIter::TKeyValueDelimStringIter(const TStringBuf str, const TStringBuf delim) +    : DelimIter(str, delim) +{ +    if (DelimIter.Valid()) +        ReadKeyAndValue(); +} + +bool TKeyValueDelimStringIter::Valid() const { +    return DelimIter.Valid(); +} + +TKeyValueDelimStringIter& TKeyValueDelimStringIter::operator++() { +    ++DelimIter; +    if (DelimIter.Valid()) +        ReadKeyAndValue(); + +    return *this; +} + +const TStringBuf& TKeyValueDelimStringIter::Key() const { +    return ChunkKey; +} + +const TStringBuf& TKeyValueDelimStringIter::Value() const { +    return ChunkValue; +} diff --git a/library/cpp/deprecated/split/delim_string_iter.h b/library/cpp/deprecated/split/delim_string_iter.h new file mode 100644 index 00000000000..8e4ca171a09 --- /dev/null +++ b/library/cpp/deprecated/split/delim_string_iter.h @@ -0,0 +1,185 @@ +#pragma once + +#include <util/generic/algorithm.h> +#include <util/generic/strbuf.h> +#include <util/generic/yexception.h> +#include <util/string/cast.h> +#include <util/system/yassert.h> + +#include <iterator> + +class TDelimStringIter { +public: +    using value_type = TStringBuf; +    using difference_type = ptrdiff_t; +    using pointer = const TStringBuf*; +    using reference = const TStringBuf&; +    using iterator_category = std::forward_iterator_tag; + +    inline TDelimStringIter(const char* begin, const char* strEnd, TStringBuf delim) +        : TDelimStringIter(TStringBuf(begin, strEnd), delim) +    { +    } + +    inline TDelimStringIter(TStringBuf str, TStringBuf delim) +        : IsValid(true) +        , Str(str) +        , Delim(delim) +    { +        UpdateCurrent(); +    } + +    inline TDelimStringIter() +        : IsValid(false) +    { +    } + +    inline explicit operator bool() const { +        return IsValid; +    } + +    // NOTE: this is a potentially unsafe operation (no overrun check) +    inline TDelimStringIter& operator++() { +        if (Current.end() != Str.end()) { +            Str.Skip(Current.length() + Delim.length()); +            UpdateCurrent(); +        } else { +            Str.Clear(); +            Current.Clear(); +            IsValid = false; +        } +        return *this; +    } + +    inline void operator+=(size_t n) { +        for (; n > 0; --n) { +            ++(*this); +        } +    } + +    inline bool operator==(const TDelimStringIter& rhs) const { +        return (IsValid == rhs.IsValid) && (!IsValid || (Current.begin() == rhs.Current.begin())); +    } + +    inline bool operator!=(const TDelimStringIter& rhs) const { +        return !(*this == rhs); +    } + +    inline TStringBuf operator*() const { +        return Current; +    } + +    inline const TStringBuf* operator->() const { +        return &Current; +    } + +    // Get & advance +    template <class T> +    inline bool TryNext(T& t) { +        if (IsValid) { +            t = FromString<T>(Current); +            operator++(); +            return true; +        } else { +            return false; +        } +    } + +    template <class T> +    inline TDelimStringIter& Next(T& t) // Get & advance +    { +        if (!TryNext(t)) +            ythrow yexception() << "No valid field"; +        return *this; +    } + +    template <class T> +    inline T GetNext() { +        T res; +        Next(res); +        return res; +    } + +    inline const char* GetBegin() const { +        return Current.begin(); +    } + +    inline const char* GetEnd() const { +        return Current.end(); +    } + +    inline bool Valid() const { +        return IsValid; +    } + +    // contents from next token to the end of string +    inline TStringBuf Cdr() const { +        return Str.SubStr(Current.length() + Delim.length()); +    } + +    inline TDelimStringIter IterEnd() const { +        return TDelimStringIter(); +    } + +private: +    inline void UpdateCurrent() { +        // it is much faster than TStringBuf::find +        size_t pos = std::search(Str.begin(), Str.end(), Delim.begin(), Delim.end()) - Str.begin(); +        Current = Str.Head(pos); +    } + +private: +    bool IsValid; + +    TStringBuf Str; +    TStringBuf Current; +    TStringBuf Delim; +}; + +//example: for (TStringBuf field: TDelimStroka(line, "@@")) { ... } +struct TDelimStroka { +    TStringBuf S; +    TStringBuf Delim; + +    inline TDelimStroka(TStringBuf s, TStringBuf delim) +        : S(s) +        , Delim(delim) +    { +    } + +    inline TDelimStringIter begin() const { +        return TDelimStringIter(S, Delim); +    } + +    inline TDelimStringIter end() const { +        return TDelimStringIter(); +    } +}; + +inline TDelimStringIter begin_delim(const TString& str, TStringBuf delim) { +    return TDelimStringIter(str, delim); +} + +inline TDelimStringIter begin_delim(TStringBuf str, TStringBuf delim) { +    return TDelimStringIter(str.begin(), str.end(), delim); +} + +inline TDelimStringIter end_delim(const TString& /*str*/, TStringBuf /*delim*/) { +    return TDelimStringIter(); +} + +class TKeyValueDelimStringIter { +public: +    TKeyValueDelimStringIter(const TStringBuf str, const TStringBuf delim); +    bool Valid() const; +    TKeyValueDelimStringIter& operator++(); +    const TStringBuf& Key() const; +    const TStringBuf& Value() const; + +private: +    TDelimStringIter DelimIter; +    TStringBuf ChunkKey, ChunkValue; + +private: +    void ReadKeyAndValue(); +}; diff --git a/library/cpp/deprecated/split/delim_string_iter_ut.cpp b/library/cpp/deprecated/split/delim_string_iter_ut.cpp new file mode 100644 index 00000000000..18a8b2a1604 --- /dev/null +++ b/library/cpp/deprecated/split/delim_string_iter_ut.cpp @@ -0,0 +1,99 @@ +#include "delim_string_iter.h" +#include <util/generic/vector.h> +#include <library/cpp/testing/unittest/registar.h> + +/// Test that TDelimStringIter build on top of given string and delimeter will produce expected sequence +static void AssertStringSplit(const TString& str, const TString& delim, const TVector<TString>& expected) { +    TDelimStringIter it(str, delim); + +    // test iterator invariants +    for (const auto& expectedString : expected) { +        UNIT_ASSERT(it.Valid()); +        UNIT_ASSERT(bool(it)); +        UNIT_ASSERT_STRINGS_EQUAL(it->ToString(), expectedString); +        ++it; +    } +    UNIT_ASSERT(!it.Valid()); +}; + +Y_UNIT_TEST_SUITE(TDelimStrokaIterTestSuite) { +    Y_UNIT_TEST(SingleCharacterAsDelimiter) { +        AssertStringSplit( +            "Hello words!", " ", {"Hello", "words!"}); +    } + +    Y_UNIT_TEST(MultipleCharactersAsDelimiter) { +        AssertStringSplit( +            "0, 1, 1, 2, 3, 5, 8, 13, 21, 34", "1, ", {"0, ", "", "2, 3, 5, 8, 13, 2", "34"}); +    } + +    Y_UNIT_TEST(NoDelimitersPresent) { +        AssertStringSplit("This string could be yours", "\t", {"This string could be yours"}); +    } + +    Y_UNIT_TEST(Cdr) { +        TDelimStringIter it("a\tc\t", "\t"); +        UNIT_ASSERT_STRINGS_EQUAL(*it, "a"); +        UNIT_ASSERT_STRINGS_EQUAL(it.Cdr(), "c\t"); +        ++it; +        UNIT_ASSERT_STRINGS_EQUAL(it.Cdr(), ""); +    } + +    Y_UNIT_TEST(ForIter) { +        TVector<TStringBuf> expected = {"1", "", "3@4", ""}; +        TVector<TStringBuf> got; + +        for (TStringBuf x : TDelimStroka("1@@@@3@4@@", "@@")) { +            got.push_back(x); +        } + +        UNIT_ASSERT_EQUAL(got, expected); +    } +} + +static void AssertKeyValueStringSplit( +    const TStringBuf str, +    const TStringBuf delim, +    const TVector<std::pair<TStringBuf, TStringBuf>>& expected) { +    TKeyValueDelimStringIter it(str, delim); + +    for (const auto& expectedKeyValue : expected) { +        UNIT_ASSERT(it.Valid()); +        UNIT_ASSERT_STRINGS_EQUAL(it.Key(), expectedKeyValue.first); +        UNIT_ASSERT_STRINGS_EQUAL(it.Value(), expectedKeyValue.second); +        ++it; +    } +    UNIT_ASSERT(!it.Valid()); +} + +Y_UNIT_TEST_SUITE(TKeyValueDelimStringIterTestSuite) { +    Y_UNIT_TEST(SingleCharacterAsDelimiter) { +        AssertKeyValueStringSplit( +            "abc=123,cde=qwer", ",", +            {{"abc", "123"}, +             {"cde", "qwer"}}); +    } + +    Y_UNIT_TEST(MultipleCharactersAsDelimiter) { +        AssertKeyValueStringSplit( +            "abc=xyz@@qwerty=zxcv", "@@", +            {{"abc", "xyz"}, +             {"qwerty", "zxcv"}}); +    } + +    Y_UNIT_TEST(NoDelimiters) { +        AssertKeyValueStringSplit( +            "abc=zz", ",", +            {{"abc", "zz"}}); +    } + +    Y_UNIT_TEST(EmptyElements) { +        AssertKeyValueStringSplit( +            "@@abc=zxy@@@@qwerty=y@@", "@@", +            {{"", ""}, +             {"abc", "zxy"}, +             {"", ""}, +             {"qwerty", "y"}, +             {"", ""}}); +    } +} diff --git a/library/cpp/deprecated/split/split_iterator.cpp b/library/cpp/deprecated/split/split_iterator.cpp new file mode 100644 index 00000000000..32262d25bd1 --- /dev/null +++ b/library/cpp/deprecated/split/split_iterator.cpp @@ -0,0 +1,318 @@ +#include "split_iterator.h" + +#include <util/system/yassert.h> + +#include <cctype> +#include <cstring> +#include <cstdlib> + +/****************** TSplitDelimiters2 ******************/ + +TSplitDelimiters::TSplitDelimiters(const char* s) { +    memset(Delims, 0, sizeof(Delims)); +    while (*s) +        Delims[(ui8) * (s++)] = true; +} + +/****************** TSplitBase ******************/ +TSplitBase::TSplitBase(const char* str, size_t length) +    : Str(str) +    , Len(length) +{ +} + +TSplitBase::TSplitBase(const TString& s) +    : Str(s.data()) +    , Len(s.size()) +{ +} + +/****************** TDelimitersSplit ******************/ + +TDelimitersSplit::TDelimitersSplit(const char* str, size_t length, const TSplitDelimiters& delimiters) +    : TSplitBase(str, length) +    , Delimiters(delimiters) +{ +} + +TDelimitersSplit::TDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters) +    : TSplitBase(s) +    , Delimiters(delimiters) +{ +} + +size_t TDelimitersSplit::Begin() const { +    size_t pos = 0; +    while ((pos < Len) && Delimiters.IsDelimiter(Str[pos])) +        ++pos; +    return pos; +} + +TSizeTRegion TDelimitersSplit::Next(size_t& pos) const { +    size_t begin = pos; +    while ((pos < Len) && !Delimiters.IsDelimiter(Str[pos])) +        ++pos; +    TSizeTRegion result(begin, pos); + +    while ((pos < Len) && Delimiters.IsDelimiter(Str[pos])) +        ++pos; + +    return result; +} + +TDelimitersSplit::TIterator TDelimitersSplit::Iterator() const { +    return TIterator(*this); +} + +/****************** TDelimitersStrictSplit ******************/ + +TDelimitersStrictSplit::TDelimitersStrictSplit(const char* str, size_t length, const TSplitDelimiters& delimiters) +    : TSplitBase(str, length) +    , Delimiters(delimiters) +{ +} + +TDelimitersStrictSplit::TDelimitersStrictSplit(const TString& s, const TSplitDelimiters& delimiters) +    : TSplitBase(s) +    , Delimiters(delimiters) +{ +} + +TDelimitersStrictSplit::TIterator TDelimitersStrictSplit::Iterator() const { +    return TIterator(*this); +} + +TSizeTRegion TDelimitersStrictSplit::Next(size_t& pos) const { +    size_t begin = pos; +    while ((pos < Len) && !Delimiters.IsDelimiter(Str[pos])) +        ++pos; +    TSizeTRegion result(begin, pos); + +    if (pos < Len) +        ++pos; + +    return result; +} + +size_t TDelimitersStrictSplit::Begin() const { +    return 0; +} + +/****************** TScreenedDelimitersSplit ******************/ + +TScreenedDelimitersSplit::TScreenedDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens) +    : TSplitBase(s) +    , Delimiters(delimiters) +    , Screens(screens) +{ +} + +TScreenedDelimitersSplit::TScreenedDelimitersSplit(const char* str, size_t length, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens) +    : TSplitBase(str, length) +    , Delimiters(delimiters) +    , Screens(screens) +{ +} + +TScreenedDelimitersSplit::TIterator TScreenedDelimitersSplit::Iterator() const { +    return TIterator(*this); +} + +TSizeTRegion TScreenedDelimitersSplit::Next(size_t& pos) const { +    size_t begin = pos; +    bool screened = false; +    while (pos < Len) { +        if (Screens.IsDelimiter(Str[pos])) +            screened = !screened; +        if (Delimiters.IsDelimiter(Str[pos]) && !screened) +            break; +        ++pos; +    } +    TSizeTRegion result(begin, pos); + +    if (pos < Len) +        ++pos; + +    return result; +} + +size_t TScreenedDelimitersSplit::Begin() const { +    return 0; +} + +/****************** TDelimitersSplitWithoutTags ******************/ + +TDelimitersSplitWithoutTags::TDelimitersSplitWithoutTags(const char* str, size_t length, const TSplitDelimiters& delimiters) +    : TSplitBase(str, length) +    , Delimiters(delimiters) +{ +} + +TDelimitersSplitWithoutTags::TDelimitersSplitWithoutTags(const TString& s, const TSplitDelimiters& delimiters) +    : TSplitBase(s) +    , Delimiters(delimiters) +{ +} + +size_t TDelimitersSplitWithoutTags::SkipTag(size_t pos) const { +    Y_ASSERT('<' == Str[pos]); +    while ((pos < Len) && ('>' != Str[pos])) +        ++pos; +    return pos + 1; +} + +size_t TDelimitersSplitWithoutTags::SkipDelimiters(size_t pos) const { +    while (true) { +        while ((pos < Len) && Delimiters.IsDelimiter(Str[pos]) && ('<' != Str[pos])) +            ++pos; +        if (pos < Len) { +            if ('<' != Str[pos]) +                break; +            else +                pos = SkipTag(pos); +        } else +            break; +    } +    return pos; +} + +size_t TDelimitersSplitWithoutTags::Begin() const { +    size_t pos = 0; +    pos = SkipDelimiters(pos); +    return pos; +} + +TSizeTRegion TDelimitersSplitWithoutTags::Next(size_t& pos) const { +    size_t begin = pos; +    while ((pos < Len) && !Delimiters.IsDelimiter(Str[pos]) && ('<' != Str[pos])) +        ++pos; +    TSizeTRegion result(begin, pos); + +    pos = SkipDelimiters(pos); + +    return result; +} + +TDelimitersSplitWithoutTags::TIterator TDelimitersSplitWithoutTags::Iterator() const { +    return TIterator(*this); +} + +/****************** TCharSplit ******************/ + +TCharSplit::TCharSplit(const char* str, size_t length) +    : TSplitBase(str, length) +{ +} + +TCharSplit::TCharSplit(const TString& s) +    : TSplitBase(s) +{ +} + +TCharSplit::TIterator TCharSplit::Iterator() const { +    return TIterator(*this); +} + +TSizeTRegion TCharSplit::Next(size_t& pos) const { +    TSizeTRegion result(pos, pos + 1); +    ++pos; +    return result; +} + +size_t TCharSplit::Begin() const { +    return 0; +} + +/****************** TCharSplitWithoutTags ******************/ + +TCharSplitWithoutTags::TCharSplitWithoutTags(const char* str, size_t length) +    : TSplitBase(str, length) +{ +} + +TCharSplitWithoutTags::TCharSplitWithoutTags(const TString& s) +    : TSplitBase(s) +{ +} + +size_t TCharSplitWithoutTags::SkipTag(size_t pos) const { +    Y_ASSERT('<' == Str[pos]); +    while ((pos < Len) && ('>' != Str[pos])) +        ++pos; +    return pos + 1; +} + +size_t TCharSplitWithoutTags::SkipDelimiters(size_t pos) const { +    while (true) { +        if (pos < Len) { +            if ('<' != Str[pos]) +                break; +            else +                pos = SkipTag(pos); +        } else +            break; +    } +    return pos; +} + +size_t TCharSplitWithoutTags::Begin() const { +    size_t pos = 0; +    pos = SkipDelimiters(pos); +    return pos; +} + +TSizeTRegion TCharSplitWithoutTags::Next(size_t& pos) const { +    size_t begin = pos++; +    TSizeTRegion result(begin, pos); + +    pos = SkipDelimiters(pos); + +    return result; +} + +TCharSplitWithoutTags::TIterator TCharSplitWithoutTags::Iterator() const { +    return TIterator(*this); +} + +TSubstringSplitDelimiter::TSubstringSplitDelimiter(const TString& s) +    : Matcher(s) +    , Len(s.size()) +{ +} + +/****************** TSubstringSplit ******************/ + +TSubstringSplit::TSubstringSplit(const char* str, size_t length, const TSubstringSplitDelimiter& delimiter) +    : TSplitBase(str, length) +    , Delimiter(delimiter) +{ +} + +TSubstringSplit::TSubstringSplit(const TString& str, const TSubstringSplitDelimiter& delimiter) +    : TSplitBase(str) +    , Delimiter(delimiter) +{ +} + +TSubstringSplit::TIterator TSubstringSplit::Iterator() const { +    return TIterator(*this); +} + +TSizeTRegion TSubstringSplit::Next(size_t& pos) const { +    const char* begin = Str + pos; +    const char* end = Str + Len; +    const char* delim; +    if (Delimiter.Matcher.SubStr(begin, end, delim)) { +        TSizeTRegion result(pos, delim - begin + pos); +        pos += delim - begin + Delimiter.Len; +        return result; +    } else { +        TSizeTRegion result(pos, end - begin + pos); +        pos += end - begin; +        return result; +    } +} + +size_t TSubstringSplit::Begin() const { +    return 0; +} diff --git a/library/cpp/deprecated/split/split_iterator.h b/library/cpp/deprecated/split/split_iterator.h new file mode 100644 index 00000000000..0eacc29228e --- /dev/null +++ b/library/cpp/deprecated/split/split_iterator.h @@ -0,0 +1,317 @@ +#pragma once + +#include <library/cpp/deprecated/kmp/kmp.h> +#include <util/string/cast.h> +#include <util/string/util.h> +#include <util/string/builder.h> + +#include <util/system/yassert.h> +#include <util/system/defaults.h> +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> + +#include <cstdio> + +template <typename T> +struct TNumPair { +    T Begin; +    T End; + +    TNumPair() = default; + +    TNumPair(T begin, T end) +        : Begin(begin) +        , End(end) +    { +        Y_ASSERT(begin <= end); +    } + +    T Length() const { +        return End - Begin + 1; +    } + +    bool operator==(const TNumPair& r) const { +        return (Begin == r.Begin) && (End == r.End); +    } + +    bool operator!=(const TNumPair& r) const { +        return (Begin != r.Begin) || (End != r.End); +    } +}; + +using TSizeTRegion = TNumPair<size_t>; +using TUi32Region = TNumPair<ui32>; + +template <> +inline TString ToString(const TUi32Region& r) { +    return TStringBuilder() << "(" << r.Begin << ", " << r.End << ")"; +} + +template <> +inline TUi32Region FromString(const TString& s) { +    TUi32Region result; +    sscanf(s.data(), "(%" PRIu32 ", %" PRIu32 ")", &result.Begin, &result.End); +    return result; +} + +class TSplitDelimiters { +private: +    bool Delims[256]; + +public: +    explicit TSplitDelimiters(const char* s); + +    Y_FORCE_INLINE bool IsDelimiter(ui8 ch) const { +        return Delims[ch]; +    } +}; + +template <class Split> +class TSplitIterator; + +class TSplitBase { +protected: +    const char* Str; +    size_t Len; + +public: +    TSplitBase(const char* str, size_t length); +    TSplitBase(const TString& s); + +    Y_FORCE_INLINE const char* GetString() const { +        return Str; +    } + +    Y_FORCE_INLINE size_t GetLength() const { +        return Len; +    } + +private: +    // we don't own Str, make sure that no one calls us with temporary object +    TSplitBase(TString&&) = delete; +}; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4512) +#endif + +class TDelimitersSplit: public TSplitBase { +private: +    const TSplitDelimiters& Delimiters; + +public: +    using TIterator = TSplitIterator<TDelimitersSplit>; +    friend class TSplitIterator<TDelimitersSplit>; + +    TDelimitersSplit(const char* str, size_t length, const TSplitDelimiters& delimiters); +    TDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters); +    TIterator Iterator() const; +    TSizeTRegion Next(size_t& pos) const; +    size_t Begin() const; + +private: +    // we don't own Delimiters, make sure that no one calls us with temporary object +    TDelimitersSplit(const char*, size_t, TSplitDelimiters&&) = delete; +    TDelimitersSplit(const TString&, TSplitDelimiters&&) = delete; +    TDelimitersSplit(TString&&, const TSplitDelimiters&) = delete; +}; + +class TDelimitersStrictSplit: public TSplitBase { +private: +    const TSplitDelimiters& Delimiters; + +public: +    using TIterator = TSplitIterator<TDelimitersStrictSplit>; +    friend class TSplitIterator<TDelimitersStrictSplit>; + +    TDelimitersStrictSplit(const char* str, size_t length, const TSplitDelimiters& delimiters); +    TDelimitersStrictSplit(const TString& s, const TSplitDelimiters& delimiters); +    TIterator Iterator() const; +    TSizeTRegion Next(size_t& pos) const; +    size_t Begin() const; + +private: +    // we don't own Delimiters, make sure that no one calls us with temporary object +    TDelimitersStrictSplit(const char*, size_t, TSplitDelimiters&&) = delete; +    TDelimitersStrictSplit(const TString&, TSplitDelimiters&&) = delete; +    TDelimitersStrictSplit(TString&&, const TSplitDelimiters&) = delete; +}; + +class TScreenedDelimitersSplit: public TSplitBase { +private: +    const TSplitDelimiters& Delimiters; +    const TSplitDelimiters& Screens; + +public: +    using TIterator = TSplitIterator<TScreenedDelimitersSplit>; +    friend class TSplitIterator<TScreenedDelimitersSplit>; + +    TScreenedDelimitersSplit(const char*, size_t, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens); +    TScreenedDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens); +    TIterator Iterator() const; +    TSizeTRegion Next(size_t& pos) const; +    size_t Begin() const; + +private: +    // we don't own Delimiters and Screens, make sure that no one calls us with temporary object +    TScreenedDelimitersSplit(TString&&, const TSplitDelimiters&, const TSplitDelimiters&) = delete; +    TScreenedDelimitersSplit(const TString&, TSplitDelimiters&&, const TSplitDelimiters&) = delete; +    TScreenedDelimitersSplit(const TString&, const TSplitDelimiters&, TSplitDelimiters&&) = delete; +}; + +class TDelimitersSplitWithoutTags: public TSplitBase { +private: +    const TSplitDelimiters& Delimiters; +    size_t SkipTag(size_t pos) const; +    size_t SkipDelimiters(size_t pos) const; + +public: +    using TIterator = TSplitIterator<TDelimitersSplitWithoutTags>; +    friend class TSplitIterator<TDelimitersSplitWithoutTags>; + +    TDelimitersSplitWithoutTags(const char* str, size_t length, const TSplitDelimiters& delimiters); +    TDelimitersSplitWithoutTags(const TString& s, const TSplitDelimiters& delimiters); +    TIterator Iterator() const; +    TSizeTRegion Next(size_t& pos) const; +    size_t Begin() const; + +private: +    // we don't own Delimiters, make sure that no one calls us with temporary object +    TDelimitersSplitWithoutTags(const char*, size_t, TSplitDelimiters&&) = delete; +    TDelimitersSplitWithoutTags(const TString&, TSplitDelimiters&&) = delete; +    TDelimitersSplitWithoutTags(TString&&, const TSplitDelimiters&) = delete; +}; + +class TCharSplit: public TSplitBase { +public: +    using TIterator = TSplitIterator<TCharSplit>; +    friend class TSplitIterator<TCharSplit>; + +    TCharSplit(const char* str, size_t length); +    TCharSplit(const TString& s); +    TIterator Iterator() const; +    TSizeTRegion Next(size_t& pos) const; +    size_t Begin() const; + +private: +    // we don't own Str, make sure that no one calls us with temporary object +    TCharSplit(TString&&) = delete; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +class TCharSplitWithoutTags: public TSplitBase { +private: +    size_t SkipTag(size_t pos) const; +    size_t SkipDelimiters(size_t pos) const; + +public: +    using TIterator = TSplitIterator<TCharSplitWithoutTags>; +    friend class TSplitIterator<TCharSplitWithoutTags>; + +    TCharSplitWithoutTags(const char* str, size_t length); +    TCharSplitWithoutTags(const TString& s); +    TIterator Iterator() const; +    TSizeTRegion Next(size_t& pos) const; +    size_t Begin() const; + +private: +    // we don't own Str, make sure that no one calls us with temporary object +    TCharSplitWithoutTags(TString&&) = delete; +}; + +class TSubstringSplitDelimiter { +public: +    TKMPMatcher Matcher; +    size_t Len; + +    TSubstringSplitDelimiter(const TString& s); +}; + +class TSubstringSplit: public TSplitBase { +private: +    const TSubstringSplitDelimiter& Delimiter; + +public: +    using TIterator = TSplitIterator<TSubstringSplit>; +    friend class TSplitIterator<TSubstringSplit>; + +    TSubstringSplit(const char* str, size_t length, const TSubstringSplitDelimiter& delimiter); +    TSubstringSplit(const TString& str, const TSubstringSplitDelimiter& delimiter); +    TIterator Iterator() const; +    TSizeTRegion Next(size_t& pos) const; +    size_t Begin() const; + +private: +    // we don't own Delimiters, make sure that no one calls us with temporary object +    TSubstringSplit(TString&&, const TSubstringSplitDelimiter&) = delete; +    TSubstringSplit(const TString&, TSubstringSplitDelimiter&&) = delete; +}; + +template <class TSplit> +class TSplitIterator { +protected: +    const TSplit& Split; +    size_t Pos; +    TString* CurrentStroka; + +public: +    TSplitIterator(const TSplit& split) +        : Split(split) +        , Pos(Split.Begin()) +        , CurrentStroka(nullptr) +    { +    } + +    virtual ~TSplitIterator() { +        delete CurrentStroka; +    } + +    inline TSizeTRegion Next() { +        Y_ENSURE(!Eof(), TStringBuf("eof reached")); +        return Split.Next(Pos); +    } + +    TStringBuf NextTok() { +        if (Eof()) +            return TStringBuf(); +        TSizeTRegion region = Next(); +        return TStringBuf(Split.Str + region.Begin, region.End - region.Begin); +    } + +    const TString& NextString() { +        if (!CurrentStroka) +            CurrentStroka = new TString(); +        TSizeTRegion region = Next(); +        CurrentStroka->assign(Split.Str, region.Begin, region.Length() - 1); +        return *CurrentStroka; +    } + +    inline bool Eof() const { +        return Pos >= Split.Len; +    } + +    TString GetTail() const { +        return TString(Split.Str + Pos); +    } + +    void Skip(size_t count) { +        for (size_t i = 0; i < count; ++i) +            Next(); +    } +}; + +using TSplitTokens = TVector<TString>; + +template <typename TSplit> +void Split(const TSplit& split, TSplitTokens* words) { +    words->clear(); +    TSplitIterator<TSplit> it(split); +    while (!it.Eof()) +        words->push_back(it.NextString()); +} diff --git a/library/cpp/deprecated/split/split_iterator_ut.cpp b/library/cpp/deprecated/split/split_iterator_ut.cpp new file mode 100644 index 00000000000..be5069c4be4 --- /dev/null +++ b/library/cpp/deprecated/split/split_iterator_ut.cpp @@ -0,0 +1,152 @@ +#include "split_iterator.h" + +#include <library/cpp/testing/unittest/registar.h> + +class TSplitIteratorTest: public TTestBase { +    UNIT_TEST_SUITE(TSplitIteratorTest); +    UNIT_TEST(TestDelimiters); +    UNIT_TEST(TestDelimitersSplit); +    UNIT_TEST(TestDelimitersStrictSplit); +    UNIT_TEST(TestTail); +    UNIT_TEST(TestScreenedDelimitersSplit); +    UNIT_TEST(TestSubstringDelimiter); +    UNIT_TEST_SUITE_END(); + +public: +    void TestDelimiters(); +    void TestDelimitersSplit(); +    void TestDelimitersStrictSplit(); +    void TestTail(); +    void TestScreenedDelimitersSplit(); +    void TestSubstringDelimiter(); +}; + +void TSplitIteratorTest::TestDelimiters() { +    TSplitDelimiters delims("@"); +    for (int i = 0; i < 256; ++i) +        if ('@' != i) { +            UNIT_ASSERT(!delims.IsDelimiter((ui8)i)); +        } else { +            UNIT_ASSERT(delims.IsDelimiter((ui8)i)); +        } +} + +void TSplitIteratorTest::TestDelimitersSplit() { +    { +        TString s = "1a3b45cd"; +        TSplitDelimiters delims("abcd"); +        TDelimitersSplit split(s, delims); +        TSplitTokens tokens; +        Split(split, &tokens); +        TSplitTokens pattern = {"1", "3", "45"}; +        UNIT_ASSERT(tokens == pattern); +    } +    { +        TString s = "aaaaaa"; +        TSplitDelimiters delims("abcd"); +        TDelimitersSplit split(s, delims); +        TSplitTokens tokens; +        Split(split, &tokens); +        TSplitTokens pattern = {}; +        UNIT_ASSERT(tokens == pattern); +    } +} + +void TSplitIteratorTest::TestDelimitersStrictSplit() { +    { +        TString s = "grp@2"; +        TSplitDelimiters delims("@"); +        TDelimitersStrictSplit split(s, delims); +        TSplitTokens tokens; +        Split(split, &tokens); +        TSplitTokens pattern = {"grp", "2"}; +        UNIT_ASSERT(tokens == pattern); +    } + +    { +        TString s = "@grp@2@@"; +        TSplitDelimiters delims("@"); +        TDelimitersStrictSplit split(s, delims); +        TSplitTokens tokens; +        Split(split, &tokens); +        TSplitTokens pattern = {"", "grp", "2", ""}; +        UNIT_ASSERT(tokens == pattern); +    } +} + +void TSplitIteratorTest::TestTail() { +    TString s = "grp@2@4"; +    TSplitDelimiters delims("@"); +    TDelimitersSplit split(s, delims); +    TDelimitersSplit::TIterator it = split.Iterator(); +    UNIT_ASSERT_EQUAL(it.GetTail(), "grp@2@4"); +    it.Next(); +    UNIT_ASSERT_EQUAL(it.GetTail(), "2@4"); +    it.Next(); +    UNIT_ASSERT_EQUAL(it.GetTail(), "4"); +    it.Next(); +    UNIT_ASSERT_EQUAL(it.GetTail(), ""); +} + +void TSplitIteratorTest::TestScreenedDelimitersSplit() { +    { +        const TString s = "77.88.58.91 - - [28/Aug/2008:00:08:07 +0400] \"GET /export/mordashka.tgz HTTP/1.1\" 304 - \"-\" \"libwww-perl/5.805\" \"news.yandex.ru,80\" \"-\" \"-\" 1219867687 \"0\" 3283 2"; +        const TSplitDelimiters delims(" "); +        const TSplitDelimiters screens("\"[]"); +        const TScreenedDelimitersSplit splitter(s, delims, screens); +        TScreenedDelimitersSplit::TIterator it = splitter.Iterator(); +        UNIT_ASSERT_EQUAL(it.NextString(), "77.88.58.91"); +        UNIT_ASSERT_EQUAL(it.NextString(), "-"); +        UNIT_ASSERT_EQUAL(it.NextString(), "-"); +        UNIT_ASSERT_EQUAL(it.NextString(), "[28/Aug/2008:00:08:07 +0400]"); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"GET /export/mordashka.tgz HTTP/1.1\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "304"); +        UNIT_ASSERT_EQUAL(it.NextString(), "-"); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"libwww-perl/5.805\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"news.yandex.ru,80\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "1219867687"); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"0\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "3283"); +        UNIT_ASSERT_EQUAL(it.NextString(), "2"); +    } +    { +        const TString s = "77.88.58.91 - - [28/Aug/2008:00:08:07 +0400] \"GET /export/mordashka.tgz HTTP/1.1\" 304 - \"-\" \"libwww-perl/5.805\" \"news.yandex.ru,80\" \"-\" \"-\" 1219867687 \"0\" 3283 2"; +        const TSplitDelimiters delims(" "); +        const TSplitDelimiters screens("\"[]"); +        const TScreenedDelimitersSplit splitter(s.Data(), s.Size(), delims, screens); +        TScreenedDelimitersSplit::TIterator it = splitter.Iterator(); +        UNIT_ASSERT_EQUAL(it.NextString(), "77.88.58.91"); +        UNIT_ASSERT_EQUAL(it.NextString(), "-"); +        UNIT_ASSERT_EQUAL(it.NextString(), "-"); +        UNIT_ASSERT_EQUAL(it.NextString(), "[28/Aug/2008:00:08:07 +0400]"); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"GET /export/mordashka.tgz HTTP/1.1\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "304"); +        UNIT_ASSERT_EQUAL(it.NextString(), "-"); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"libwww-perl/5.805\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"news.yandex.ru,80\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"-\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "1219867687"); +        UNIT_ASSERT_EQUAL(it.NextString(), "\"0\""); +        UNIT_ASSERT_EQUAL(it.NextString(), "3283"); +        UNIT_ASSERT_EQUAL(it.NextString(), "2"); +    } +} + +void TSplitIteratorTest::TestSubstringDelimiter() { +    const TString s = "a@@bb@@[email protected]@@r"; +    static const TSubstringSplitDelimiter delimiter("@@"); +    const TSubstringSplit splitter(s, delimiter); +    TSubstringSplit::TIterator it = splitter.Iterator(); +    UNIT_ASSERT_EQUAL(it.NextString(), "a"); +    UNIT_ASSERT_EQUAL(it.NextString(), "bb"); +    UNIT_ASSERT_EQUAL(it.NextString(), "[email protected]"); +    UNIT_ASSERT_EQUAL(it.NextString(), "r"); +    UNIT_ASSERT(it.Eof()); +} + +UNIT_TEST_SUITE_REGISTRATION(TSplitIteratorTest); diff --git a/library/cpp/deprecated/split/ya.make b/library/cpp/deprecated/split/ya.make new file mode 100644 index 00000000000..946e685ac82 --- /dev/null +++ b/library/cpp/deprecated/split/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +OWNER(wrg0ababd) + +SRCS( +    delim_string_iter.cpp +    split_iterator.cpp +) + +PEERDIR( +    library/cpp/deprecated/kmp +) + +END()  | 
