author    Devtools Arcadia <[email protected]> 2022-02-07 18:08:42 +0300
committer Devtools Arcadia <[email protected]> 2022-02-07 18:08:42 +0300
commit    1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree      e26c9fed0de5d9873cce7e00bc214573dc2195b7 /util/stream/tokenizer_ut.cpp

intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029

Diffstat (limited to 'util/stream/tokenizer_ut.cpp')
 -rw-r--r-- util/stream/tokenizer_ut.cpp | 264
 1 file changed, 264 insertions(+), 0 deletions(-)
diff --git a/util/stream/tokenizer_ut.cpp b/util/stream/tokenizer_ut.cpp
new file mode 100644
index 00000000000..afc566da86e
--- /dev/null
+++ b/util/stream/tokenizer_ut.cpp
@@ -0,0 +1,264 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/array_size.h>
+#include <util/generic/strbuf.h>
+
+#include "mem.h"
+#include "null.h"
+#include "tokenizer.h"
+
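+// TStreamTokenizer is expected to null-terminate every token in its internal
+// buffer; this helper reads the byte just past the token's data to verify it.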
+static inline void CheckIfNullTerminated(const TStringBuf str) {
+    UNIT_ASSERT_VALUES_EQUAL('\0', *(str.data() + str.size()));
+}
+
+Y_UNIT_TEST_SUITE(TStreamTokenizerTests) {
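+    // An empty input stream must yield no tokens at all.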
+    Y_UNIT_TEST(EmptyStreamTest) {
+        auto&& input = TNullInput{};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(0, tokensCount);
+    }
+
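+    // Every delimiter terminates a token, so "\n\n" yields two empty tokens.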
+    Y_UNIT_TEST(EmptyTokensTest) {
+        const char data[] = "\n\n";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
+            UNIT_ASSERT_VALUES_EQUAL(0, it->Length());
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(2, tokensCount);
+    }
+
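+    // A trailing token that is not closed by a delimiter is still emitted.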
+    Y_UNIT_TEST(LastTokenEndDoesntSatisfyPredicateTest) {
+        const char data[] = "abc\ndef\nxxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
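+    // A delimiter at the very start produces an empty first token.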
+    Y_UNIT_TEST(FirstTokenIsEmptyTest) {
+        const char data[] = "\ndef\nxxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf(), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
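+    // Input that contains no delimiter at all comes back as a single token.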
+    Y_UNIT_TEST(PredicateDoesntMatchTest) {
+        const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(data, token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
+    }
+
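+    // A trailing delimiter closes the last token but does not add an empty one after it.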
+    Y_UNIT_TEST(SimpleTest) {
+        const char data[] = "qwerty\n1234567890\n";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("qwerty"), TStringBuf("1234567890")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
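+    // The delimiter is chosen by the predicate template parameter; any functor over char works.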
+    Y_UNIT_TEST(CustomPredicateTest) {
+        struct TIsVerticalBar {
+            inline bool operator()(const char ch) const noexcept {
+                return '|' == ch;
+            }
+        };
+
+        const char data[] = "abc|def|xxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TIsVerticalBar>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
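+    // A predicate may treat several characters as delimiters at once.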
+    Y_UNIT_TEST(CustomPredicateSecondTest) {
+        struct TIsVerticalBarOrComma {
+            inline bool operator()(const char ch) const noexcept {
+                return '|' == ch || ',' == ch;
+            }
+        };
+
+        const char data[] = "abc|def|xxxxxx,abc|def|xxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx"),
+                                     TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TIsVerticalBarOrComma>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
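+    // A predicate that never matches turns the whole input into one token.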
+    Y_UNIT_TEST(FalsePredicateTest) {
+        struct TAlwaysFalse {
+            inline bool operator()(const char) const noexcept {
+                return false;
+            }
+        };
+
+        const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TAlwaysFalse>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(data, token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
+    }
+
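+    // A predicate that always matches makes every byte a delimiter, so the
+    // stream yields dataSize tokens, all of them empty.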
+    Y_UNIT_TEST(TruePredicateTest) {
+        struct TAlwaysTrue {
+            inline bool operator()(const char) const noexcept {
+                return true;
+            }
+        };
+
+        const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TAlwaysTrue>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
+            UNIT_ASSERT_VALUES_EQUAL(0, it->Length());
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(dataSize, tokensCount);
+    }
+
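+    // The initial buffer size (third constructor argument) exactly matches the
+    // first token, a boundary case for the internal buffer.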
+    Y_UNIT_TEST(FirstTokenHasSizeOfTheBufferTest) {
+        const char data[] = "xxxxx\nxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("xxxxx"), TStringBuf("xx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, tokens[0].size()};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
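+    // The single token exactly fills the initial buffer and has no delimiter after it.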
+    Y_UNIT_TEST(OnlyTokenHasSizeOfTheBufferTest) {
+        const char data[] = "xxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, dataSize};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(data, token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
+    }
+
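+    // A one-byte initial buffer is smaller than every token, so the internal
+    // buffer has to grow while reading.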
+    Y_UNIT_TEST(InitialBufferSizeSmallerThanTokenTest) {
+        const char data[] = "xxxxx\nxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("xxxxx"), TStringBuf("xx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, 1};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
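+    // The tokenizer supports range-based for; each token converts to TStringBuf.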
+    Y_UNIT_TEST(RangeBasedForTest) {
+        const char data[] = "abc\ndef\nxxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (const auto& token : tokenizer) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+}