author    Devtools Arcadia <[email protected]> 2022-02-07 18:08:42 +0300
committer Devtools Arcadia <[email protected]> 2022-02-07 18:08:42 +0300
commit    1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree      e26c9fed0de5d9873cce7e00bc214573dc2195b7 /util/stream/tokenizer_ut.cpp

intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029

Diffstat (limited to 'util/stream/tokenizer_ut.cpp')
 -rw-r--r-- util/stream/tokenizer_ut.cpp | 264
 1 file changed, 264 insertions(+), 0 deletions(-)
diff --git a/util/stream/tokenizer_ut.cpp b/util/stream/tokenizer_ut.cpp
new file mode 100644
index 00000000000..afc566da86e
--- /dev/null
+++ b/util/stream/tokenizer_ut.cpp
@@ -0,0 +1,264 @@
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/array_size.h>
+#include <util/generic/strbuf.h>
+
+#include "mem.h"
+#include "null.h"
+#include "tokenizer.h"
+
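+// TStreamTokenizer is expected to null-terminate every token in its internal
+// buffer; this helper reads the byte just past the token's data to verify it.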
+static inline void CheckIfNullTerminated(const TStringBuf str) {
+    UNIT_ASSERT_VALUES_EQUAL('\0', *(str.data() + str.size()));
+}
+
+Y_UNIT_TEST_SUITE(TStreamTokenizerTests) {
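+    // An empty input stream must yield no tokens at all.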
+    Y_UNIT_TEST(EmptyStreamTest) {
+        auto&& input = TNullInput{};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(0, tokensCount);
+    }
+
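+    // Every delimiter terminates a token, so "\n\n" yields two empty tokens.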
+    Y_UNIT_TEST(EmptyTokensTest) {
+        const char data[] = "\n\n";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
+            UNIT_ASSERT_VALUES_EQUAL(0, it->Length());
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(2, tokensCount);
+    }
+
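+    // A trailing token that is not closed by a delimiter is still emitted.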
+    Y_UNIT_TEST(LastTokenEndDoesntSatisfyPredicateTest) {
+        const char data[] = "abc\ndef\nxxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
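+    // A delimiter at the very start produces an empty first token.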
+    Y_UNIT_TEST(FirstTokenIsEmptyTest) {
+        const char data[] = "\ndef\nxxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf(), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
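+    // Input that contains no delimiter at all comes back as a single token.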
+    Y_UNIT_TEST(PredicateDoesntMatchTest) {
+        const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(data, token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
+    }
+
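+    // A trailing delimiter closes the last token but does not add an empty one after it.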
+    Y_UNIT_TEST(SimpleTest) {
+        const char data[] = "qwerty\n1234567890\n";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("qwerty"), TStringBuf("1234567890")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
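+    // The delimiter is chosen by the predicate template parameter; any functor over char works.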
+    Y_UNIT_TEST(CustomPredicateTest) {
+        struct TIsVerticalBar {
+            inline bool operator()(const char ch) const noexcept {
+                return '|' == ch;
+            }
+        };
+
+        const char data[] = "abc|def|xxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TIsVerticalBar>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
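+    // A predicate may treat several characters as delimiters at once.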
+    Y_UNIT_TEST(CustomPredicateSecondTest) {
+        struct TIsVerticalBarOrComma {
+            inline bool operator()(const char ch) const noexcept {
+                return '|' == ch || ',' == ch;
+            }
+        };
+
+        const char data[] = "abc|def|xxxxxx,abc|def|xxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx"),
+                                     TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TIsVerticalBarOrComma>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
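+    // A predicate that never matches turns the whole input into one token.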
+    Y_UNIT_TEST(FalsePredicateTest) {
+        struct TAlwaysFalse {
+            inline bool operator()(const char) const noexcept {
+                return false;
+            }
+        };
+
+        const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TAlwaysFalse>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(data, token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
+    }
+
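+    // A predicate that always matches makes every byte a delimiter, so the
+    // stream yields dataSize tokens, all of them empty.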
+    Y_UNIT_TEST(TruePredicateTest) {
+        struct TAlwaysTrue {
+            inline bool operator()(const char) const noexcept {
+                return true;
+            }
+        };
+
+        const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TAlwaysTrue>{&input};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
+            UNIT_ASSERT_VALUES_EQUAL(0, it->Length());
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(dataSize, tokensCount);
+    }
+
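+    // The initial buffer size (third constructor argument) exactly matches the
+    // first token, a boundary case for the internal buffer.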
+    Y_UNIT_TEST(FirstTokenHasSizeOfTheBufferTest) {
+        const char data[] = "xxxxx\nxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("xxxxx"), TStringBuf("xx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, tokens[0].size()};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
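+    // The single token exactly fills the initial buffer and has no delimiter after it.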
+    Y_UNIT_TEST(OnlyTokenHasSizeOfTheBufferTest) {
+        const char data[] = "xxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, dataSize};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(data, token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
+    }
+
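+    // A one-byte initial buffer is smaller than every token, so the internal
+    // buffer has to grow while reading.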
+    Y_UNIT_TEST(InitialBufferSizeSmallerThanTokenTest) {
+        const char data[] = "xxxxx\nxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("xxxxx"), TStringBuf("xx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, 1};
+        auto tokensCount = size_t{};
+        for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            const auto token = TStringBuf{it->Data(), it->Length()};
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+
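+    // The tokenizer supports range-based for; each token converts to TStringBuf.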
+    Y_UNIT_TEST(RangeBasedForTest) {
+        const char data[] = "abc\ndef\nxxxxxx";
+        const auto dataSize = Y_ARRAY_SIZE(data) - 1;
+        const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
+        const auto tokensSize = Y_ARRAY_SIZE(tokens);
+        auto&& input = TMemoryInput{data, dataSize};
+        auto&& tokenizer = TStreamTokenizer<TEol>{&input};
+        auto tokensCount = size_t{};
+        for (const auto& token : tokenizer) {
+            UNIT_ASSERT(tokensCount < tokensSize);
+            CheckIfNullTerminated(token);
+            UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
+            ++tokensCount;
+        }
+        UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
+    }
+}