aboutsummaryrefslogtreecommitdiffstats
path: root/util/stream/tokenizer.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /util/stream/tokenizer.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'util/stream/tokenizer.h')
-rw-r--r--util/stream/tokenizer.h214
1 files changed, 214 insertions, 0 deletions
diff --git a/util/stream/tokenizer.h b/util/stream/tokenizer.h
new file mode 100644
index 0000000000..b2398efdd1
--- /dev/null
+++ b/util/stream/tokenizer.h
@@ -0,0 +1,214 @@
+#pragma once
+
+#include "input.h"
+
+#include <util/generic/buffer.h>
+#include <util/generic/mem_copy.h>
+#include <util/generic/strbuf.h>
+#include <util/system/compiler.h>
+#include <util/system/yassert.h>
+
+/**
+ * @addtogroup Streams
+ * @{
+ */
+
+/**
+ * Simple stream tokenizer. Splits the stream into tokens that are available
+ * via iterator interface.
+ *
+ * @tparam TEndOfToken Predicate for token delimiter characters.
+ * @see TEol
+ */
+template <typename TEndOfToken>
+class TStreamTokenizer {
+public:
+ class TIterator {
+ public:
+ inline TIterator(TStreamTokenizer* const parent)
+ : Parent_(parent)
+ , AtEnd_(!Parent_->Next(Data_, Len_))
+ {
+ }
+
+ inline TIterator() noexcept
+ : Parent_(nullptr)
+ , Data_(nullptr)
+ , Len_(0)
+ , AtEnd_(true)
+ {
+ }
+
+ inline ~TIterator() = default;
+
+ inline void operator++() {
+ Next();
+ }
+
+ inline bool operator==(const TIterator& l) const noexcept {
+ return AtEnd_ == l.AtEnd_;
+ }
+
+ inline bool operator!=(const TIterator& l) const noexcept {
+ return !(*this == l);
+ }
+
+ /**
+ * @return Return null-terminated character array with current token.
+ * The pointer may be invalid after iterator increment.
+ */
+ inline const char* Data() const noexcept {
+ Y_ASSERT(!AtEnd_);
+
+ return Data_;
+ }
+
+ /**
+ * @return Length of current token.
+ */
+ inline size_t Length() const noexcept {
+ Y_ASSERT(!AtEnd_);
+
+ return Len_;
+ }
+
+ inline TIterator* operator->() noexcept {
+ return this;
+ }
+
+ inline TStringBuf operator*() noexcept {
+ return TStringBuf{Data_, Len_};
+ }
+
+ private:
+ inline void Next() {
+ Y_ASSERT(Parent_);
+
+ AtEnd_ = !Parent_->Next(Data_, Len_);
+ }
+
+ private:
+ TStreamTokenizer* const Parent_;
+ char* Data_;
+ size_t Len_;
+ bool AtEnd_;
+ };
+
+ inline TStreamTokenizer(IInputStream* const input, const TEndOfToken& eot = TEndOfToken(),
+ const size_t initial = 1024)
+ : Input_(input)
+ , Buf_(initial)
+ , Cur_(BufBegin())
+ , End_(BufBegin())
+ , Eot_(eot)
+ {
+ CheckBuf();
+ }
+
+ inline bool Next(char*& buf, size_t& len) {
+ char* it = Cur_;
+
+ while (true) {
+ do {
+ while (it != End_) {
+ if (Eot_(*it)) {
+ *it = '\0';
+
+ buf = Cur_;
+ len = it - Cur_;
+ Cur_ = it + 1;
+
+ return true;
+ } else {
+ ++it;
+ }
+ }
+
+ if (Fill() == 0 && End_ != BufEnd()) {
+ *it = '\0';
+
+ buf = Cur_;
+ len = it - Cur_;
+ Cur_ = End_;
+
+ return len;
+ }
+ } while (it != BufEnd());
+
+ Y_ASSERT(it == BufEnd());
+ Y_ASSERT(End_ == BufEnd());
+
+ const size_t blen = End_ - Cur_;
+ if (Cur_ == BufBegin()) {
+ Y_ASSERT(blen == Buf_.Capacity());
+
+ /*
+ * do reallocate
+ */
+
+ Buf_.Reserve(Buf_.Capacity() * 4);
+ CheckBuf();
+ } else {
+ /*
+ * do move
+ */
+
+ MemMove(BufBegin(), Cur_, blen);
+ }
+
+ Cur_ = BufBegin();
+ End_ = Cur_ + blen;
+ it = End_;
+ }
+ }
+
+ inline TIterator begin() {
+ return TIterator{this};
+ }
+
+ inline TIterator end() noexcept {
+ return {};
+ }
+
+private:
+ inline size_t Fill() {
+ const size_t avail = BufEnd() - End_;
+ const size_t bytesRead = Input_->Read(End_, avail);
+
+ End_ += bytesRead;
+
+ return bytesRead;
+ }
+
+ inline char* BufBegin() noexcept {
+ return Buf_.Data();
+ }
+
+ inline char* BufEnd() noexcept {
+ return Buf_.Data() + Buf_.Capacity();
+ }
+
+ inline void CheckBuf() const {
+ if (!Buf_.Data()) {
+ throw std::bad_alloc();
+ }
+ }
+
+private:
+ IInputStream* const Input_;
+ TBuffer Buf_;
+ char* Cur_;
+ char* End_;
+ TEndOfToken Eot_;
+};
+
+/**
+ * Predicate for `TStreamTokenizer` that uses '\\n' as a delimiter.
+ */
+struct TEol {
+ inline bool operator()(char ch) const noexcept {
+ return ch == '\n';
+ }
+};
+
+/** @} */