aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/yson_pull/detail
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/yson_pull/detail
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/yson_pull/detail')
-rw-r--r--library/cpp/yson_pull/detail/byte_reader.h74
-rw-r--r--library/cpp/yson_pull/detail/byte_writer.h77
-rw-r--r--library/cpp/yson_pull/detail/cescape.h143
-rw-r--r--library/cpp/yson_pull/detail/cescape_decode.h154
-rw-r--r--library/cpp/yson_pull/detail/cescape_encode.h114
-rw-r--r--library/cpp/yson_pull/detail/fail.h20
-rw-r--r--library/cpp/yson_pull/detail/format_string.h26
-rw-r--r--library/cpp/yson_pull/detail/input/buffered.h35
-rw-r--r--library/cpp/yson_pull/detail/input/stdio_file.h42
-rw-r--r--library/cpp/yson_pull/detail/input/stream.h69
-rw-r--r--library/cpp/yson_pull/detail/lexer_base.h343
-rw-r--r--library/cpp/yson_pull/detail/macros.h24
-rw-r--r--library/cpp/yson_pull/detail/number.h37
-rw-r--r--library/cpp/yson_pull/detail/output/buffered.h51
-rw-r--r--library/cpp/yson_pull/detail/output/stdio_file.h33
-rw-r--r--library/cpp/yson_pull/detail/output/stream.h56
-rw-r--r--library/cpp/yson_pull/detail/percent_scalar.h36
-rw-r--r--library/cpp/yson_pull/detail/reader.h677
-rw-r--r--library/cpp/yson_pull/detail/stream_counter.h51
-rw-r--r--library/cpp/yson_pull/detail/symbols.h55
-rw-r--r--library/cpp/yson_pull/detail/traits.h29
-rw-r--r--library/cpp/yson_pull/detail/varint.h260
-rw-r--r--library/cpp/yson_pull/detail/writer.h566
-rw-r--r--library/cpp/yson_pull/detail/zigzag.h24
24 files changed, 2996 insertions, 0 deletions
diff --git a/library/cpp/yson_pull/detail/byte_reader.h b/library/cpp/yson_pull/detail/byte_reader.h
new file mode 100644
index 0000000000..7cea50d323
--- /dev/null
+++ b/library/cpp/yson_pull/detail/byte_reader.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include "cescape.h"
+#include "fail.h"
+#include "stream_counter.h"
+
+#include <library/cpp/yson_pull/input.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+        /// Byte-level reading facade over an NInput::IStream.
+        /// Every consumed range is reported to the StreamCounter so that
+        /// failures can be raised with the counter's position info.
+        template <class StreamCounter>
+        class byte_reader {
+            NYsonPull::NInput::IStream& stream_;
+            StreamCounter stream_counter_;
+
+        public:
+            byte_reader(NYsonPull::NInput::IStream& stream)
+                : stream_(stream)
+            {
+            }
+
+            // Handed out as const so that callers cannot mutate the stream directly.
+            const NYsonPull::NInput::IStream& stream() {
+                return stream_;
+            }
+
+            /// Raises a failure tagged with the counter's current position info.
+            /// Never returns.
+            template <typename... Args>
+            ATTRIBUTE(noinline, cold)
+            void fail[[noreturn]](const char* msg, Args&&... args) {
+                NYsonPull::NDetail::fail(stream_counter_.info(), msg, std::forward<Args>(args)...);
+            }
+
+            /// Asks the stream to fill its buffer. With AllowFinish == false an
+            /// empty buffer on a finished stream is reported as a failure.
+            template <bool AllowFinish>
+            void fill_buffer() {
+                stream_.fill_buffer();
+                if (AllowFinish) {
+                    return;
+                }
+
+                auto& buffer = stream_.buffer();
+                const bool exhausted = buffer.is_empty() && stream_.at_end();
+                if (Y_UNLIKELY(exhausted)) {
+                    fail("Premature end of stream");
+                }
+            }
+
+            void fill_buffer() {
+                fill_buffer<true>();
+            }
+
+            /// Peeks at the current byte without consuming it.
+            /// Yields '\0' when the buffer is still empty after filling.
+            template <bool AllowFinish>
+            ui8 get_byte() {
+                fill_buffer<AllowFinish>();
+                auto& buffer = stream_.buffer();
+                if (buffer.is_empty()) {
+                    return ui8{'\0'};
+                }
+                return *buffer.pos();
+            }
+
+            ui8 get_byte() {
+                return get_byte<true>();
+            }
+
+            /// Consumes `bytes` bytes of the buffer, feeding them to the counter first.
+            void advance(size_t bytes) {
+                auto& buffer = stream_.buffer();
+                const auto first = buffer.pos();
+                stream_counter_.update(first, first + bytes);
+                buffer.advance(bytes);
+            }
+        };
+ }
+}
diff --git a/library/cpp/yson_pull/detail/byte_writer.h b/library/cpp/yson_pull/detail/byte_writer.h
new file mode 100644
index 0000000000..dc1d4b4b96
--- /dev/null
+++ b/library/cpp/yson_pull/detail/byte_writer.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "macros.h"
+
+#include <library/cpp/yson_pull/output.h>
+
+#include <util/system/types.h>
+
+#include <cstddef>
+#include <cstring>
+
+namespace NYsonPull {
+ namespace NDetail {
+        /// Byte-level writing facade over an NOutput::IStream.
+        /// Every written range is reported to the StreamCounter.
+        template <class StreamCounter>
+        class byte_writer {
+            NYsonPull::NOutput::IStream& stream_;
+            StreamCounter stream_counter_;
+
+        public:
+            byte_writer(NYsonPull::NOutput::IStream& stream)
+                : stream_(stream)
+            {
+            }
+
+            // Handed out as const so that callers cannot mutate the stream directly.
+            const NYsonPull::NOutput::IStream& stream() {
+                return stream_;
+            }
+            const StreamCounter& counter() {
+                return stream_counter_;
+            }
+
+            void flush_buffer() {
+                stream_.flush_buffer();
+            }
+
+            /// Marks `bytes` bytes at the current buffer position as written,
+            /// feeding them to the counter first.
+            void advance(size_t bytes) {
+                auto& buffer = stream_.buffer();
+                const auto first = buffer.pos();
+                stream_counter_.update(first, first + bytes);
+                buffer.advance(bytes);
+            }
+
+            /// Writes a single byte; a full buffer is flushed together with the byte.
+            void write(ui8 c) {
+                auto& buffer = stream_.buffer();
+                if (Y_UNLIKELY(buffer.is_full())) {
+                    stream_counter_.update(&c, &c + 1);
+                    stream_.flush_buffer({reinterpret_cast<char*>(&c), 1});
+                    return;
+                }
+                *buffer.pos() = c;
+                advance(1);
+            }
+
+            /// Writes `size` bytes from `data`. Whatever does not fit strictly inside
+            /// the free part of the buffer is handed to flush_buffer as extra data.
+            void write(const ui8* data, size_t size) {
+                auto& buffer = stream_.buffer();
+                const auto room = buffer.available();
+                if (Y_LIKELY(size < room)) {
+                    ::memcpy(buffer.pos(), data, size);
+                    advance(size);
+                    return;
+                }
+
+                // Top the buffer up first so that the flush carries a full buffer.
+                if (!buffer.is_full()) {
+                    ::memcpy(buffer.pos(), data, room);
+                    advance(room);
+                    data += room;
+                    size -= room;
+                }
+                stream_counter_.update(data, data + size);
+                stream_.flush_buffer({reinterpret_cast<const char*>(data), size});
+            }
+        };
+ }
+}
diff --git a/library/cpp/yson_pull/detail/cescape.h b/library/cpp/yson_pull/detail/cescape.h
new file mode 100644
index 0000000000..1ea150e69a
--- /dev/null
+++ b/library/cpp/yson_pull/detail/cescape.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "byte_writer.h"
+#include "cescape_decode.h"
+#include "cescape_encode.h"
+#include "macros.h"
+
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+
+/* REFERENCES FOR ESCAPE SEQUENCE INTERPRETATION:
+ * C99 p. 6.4.3 Universal character names.
+ * C99 p. 6.4.4.4 Character constants.
+ *
+ * <simple-escape-sequence> ::= {
+ * \' , \" , \? , \\ ,
+ * \a , \b , \f , \n , \r , \t , \v
+ * }
+ *
+ * <octal-escape-sequence> ::= \ <octal-digit> {1, 3}
+ * <hexadecimal-escape-sequence> ::= \x <hexadecimal-digit> +
+ * <universal-character-name> ::= \u <hexadecimal-digit> {4}
+ * || \U <hexadecimal-digit> {8}
+ *
+ * NOTE (6.4.4.4.7):
+ * Each octal or hexadecimal escape sequence is the longest sequence of characters that can
+ * constitute the escape sequence.
+ *
+ * THEREFORE:
+ * - Octal escape sequence spans up to (not including) the first non-octal-digit character.
+ * - Octal escape sequence always terminates after three octal digits.
+ * - Hexadecimal escape sequence spans up to (not including) the first non-hexadecimal-digit character.
+ * - Universal character name consists of exactly 4 or 8 hexadecimal digits.
+ *
+ */
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NCEscape {
+            /// Appends the escaped form of `data` to `dest`.
+            inline void encode(TString& dest, TStringBuf data) {
+                const auto* bytes = reinterpret_cast<const ui8*>(data.data());
+                auto append_chunk = [&dest](const ui8* chunk, size_t chunk_size) {
+                    dest.append(reinterpret_cast<const char*>(chunk), chunk_size);
+                };
+                NImpl::escape_impl(bytes, data.size(), append_chunk);
+            }
+
+            /// Writes the escaped form of `data` into raw memory at `dest` and
+            /// returns the number of bytes written.
+            /// dest must have at least 4*data.size() bytes available
+            inline size_t encode(ui8* dest, TStringBuf data) {
+                ui8* cursor = dest;
+                auto copy_chunk = [&cursor](const ui8* chunk, size_t chunk_size) {
+                    ::memcpy(cursor, chunk, chunk_size);
+                    cursor += chunk_size;
+                };
+                NImpl::escape_impl(
+                    reinterpret_cast<const ui8*>(data.data()),
+                    data.size(),
+                    copy_chunk);
+                return cursor - dest;
+            }
+
+            /// Writes the escaped form of `data` through a byte_writer.
+            /// When the writer's buffer has room for the worst case (4 bytes per
+            /// input byte) the data is escaped straight into the buffer; otherwise
+            /// each escaped chunk goes through byte_writer::write.
+            template <typename U>
+            void encode(byte_writer<U>& dest, TStringBuf data) {
+                auto& buffer = dest.stream().buffer();
+                if (Y_UNLIKELY(buffer.available() < data.size() * 4)) {
+                    auto write_chunk = [&dest](const ui8* chunk, size_t chunk_size) {
+                        dest.write(chunk, chunk_size);
+                    };
+                    NImpl::escape_impl(
+                        reinterpret_cast<const ui8*>(data.data()),
+                        data.size(),
+                        write_chunk);
+                    return;
+                }
+
+                auto size = encode(buffer.pos(), data);
+                dest.advance(size);
+            }
+
+            /// Returns the escaped form of `data` as a new string.
+            inline TString encode(TStringBuf data) {
+                TString escaped;
+                // The output is at least as long as the input.
+                escaped.reserve(data.size());
+                encode(escaped, data);
+                return escaped;
+            }
+
+            /// Appends the unescaped form of `data` to `dest`.
+            inline void decode(TString& dest, TStringBuf data) {
+                auto append_byte = [&dest](ui8 c) {
+                    dest += c;
+                };
+                auto append_span = [&dest](const ui8* p, size_t len) {
+                    dest.append(reinterpret_cast<const char*>(p), len);
+                };
+                NImpl::unescape_impl(
+                    reinterpret_cast<const ui8*>(data.begin()),
+                    reinterpret_cast<const ui8*>(data.end()),
+                    append_byte,
+                    append_span);
+            }
+
+            /// Unescapes `data` in place and shrinks it to the decoded size.
+            /// Nothing is touched when the data contains no backslash.
+            inline void decode_inplace(TVector<ui8>& data) {
+                void* first_backslash = ::memchr(data.data(), '\\', data.size());
+                if (first_backslash == nullptr) {
+                    return;
+                }
+
+                // Everything before the first backslash is already in its final place;
+                // decoded output is written starting from the backslash itself.
+                auto* write_pos = static_cast<ui8*>(first_backslash);
+                auto put_byte = [&write_pos](ui8 c) {
+                    *write_pos++ = c;
+                };
+                auto put_span = [&write_pos](const ui8* p, size_t len) {
+                    // Source and destination may overlap, hence memmove.
+                    ::memmove(write_pos, p, len);
+                    write_pos += len;
+                };
+                NImpl::unescape_impl(
+                    write_pos,
+                    data.data() + data.size(),
+                    put_byte,
+                    put_span);
+                data.resize(write_pos - &data[0]);
+            }
+
+            /// Returns the unescaped form of `data` as a new string.
+            inline TString decode(TStringBuf data) {
+                TString unescaped;
+                unescaped.reserve(data.size());
+                decode(unescaped, data);
+                return unescaped;
+            }
+
+            /// Returns `str` escaped and wrapped in double quotes.
+            ATTRIBUTE(noinline, cold)
+            inline TString quote(TStringBuf str) {
+                TString quoted;
+                // Room for the two quotes plus a little slack for escapes.
+                quoted.reserve(str.size() + 16);
+                quoted += '"';
+                encode(quoted, str);
+                quoted += '"';
+                return quoted;
+            }
+
+            /// Returns the single byte `ch` escaped and wrapped in double quotes.
+            ATTRIBUTE(noinline, cold)
+            inline TString quote(ui8 ch) {
+                const char as_char = ch;
+                return quote(TStringBuf(&as_char, 1));
+            }
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/cescape_decode.h b/library/cpp/yson_pull/detail/cescape_decode.h
new file mode 100644
index 0000000000..2ee5dd9500
--- /dev/null
+++ b/library/cpp/yson_pull/detail/cescape_decode.h
@@ -0,0 +1,154 @@
+#pragma once
+
+#include <util/system/types.h>
+
+#include <algorithm>
+#include <cstring>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NCEscape {
+ namespace NImpl {
+                /// Distance of `c` from '0', wrapped to ui8; characters below '0'
+                /// therefore map to large values rather than to digits.
+                inline ui8 as_digit(ui8 c) {
+                    return static_cast<ui8>(c - '0');
+                }
+
+                /// Value of the hexadecimal digit `c` (either case), or 255 when
+                /// `c` is not a hexadecimal digit.
+                inline ui8 as_hexdigit(ui8 c) {
+                    static constexpr ui8 X = 255; // "not a hex digit"
+                    static constexpr ui8 table[256] = {
+                        /* 0x00 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0x10 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0x20 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0x30 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, X, X, X, X, X, X,
+                        /* 0x40 */ X, 10, 11, 12, 13, 14, 15, X, X, X, X, X, X, X, X, X,
+                        /* 0x50 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0x60 */ X, 10, 11, 12, 13, 14, 15, X, X, X, X, X, X, X, X, X,
+                        /* 0x70 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0x80 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0x90 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0xA0 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0xB0 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0xC0 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0xD0 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0xE0 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
+                        /* 0xF0 */ X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X};
+
+                    return table[c];
+                }
+
+                /// Accumulates up to `n` octal digits starting at `p` into `result`
+                /// and returns the position just past the last digit consumed.
+                inline const ui8* read_oct(ui8& result, const ui8* p, ui8 n) {
+                    for (; n != 0; --n) {
+                        const ui8 digit = as_digit(*p);
+                        if (digit >= 8) {
+                            break;
+                        }
+                        result = result * 8 + digit;
+                        ++p;
+                    }
+                    return p;
+                }
+
+                /// Accumulates up to `n` hexadecimal digits starting at `p` into
+                /// `result` and returns the position just past the last digit consumed.
+                inline const ui8* read_hex(ui8& result, const ui8* p, ui8 n) {
+                    for (; n != 0; --n) {
+                        const ui8 digit = as_hexdigit(*p);
+                        if (digit >= 16) {
+                            break;
+                        }
+                        result = result * 16 + digit;
+                        ++p;
+                    }
+                    return p;
+                }
+
+                /// Decodes one escape body. `p` points at the character right after
+                /// the backslash and must be below `end`. The decoded byte is stored
+                /// in `result`; the return value is the position after the escape.
+                inline const ui8* unescape_char_and_advance(
+                    ui8& result,
+                    const ui8* p,
+                    const ui8* end) {
+                    const ui8 selector = *p;
+                    switch (selector) {
+                        case 'b':
+                            result = '\b';
+                            return p + 1;
+                        case 'f':
+                            result = '\f';
+                            return p + 1;
+                        case 'n':
+                            result = '\n';
+                            return p + 1;
+                        case 'r':
+                            result = '\r';
+                            return p + 1;
+                        case 't':
+                            result = '\t';
+                            return p + 1;
+
+                        case 'x': {
+                            // At most two hex digits; with none, the escape yields a plain 'x'.
+                            const ui8* digits = p + 1;
+                            result = 0;
+                            const ui8* after = read_hex(
+                                result,
+                                digits, std::min<ptrdiff_t>(2, end - digits));
+                            if (after > digits) {
+                                return after;
+                            }
+                            result = 'x';
+                            return digits;
+                        }
+
+                        // Leading digit 0..3: up to three octal digits fit in a byte.
+                        case '0':
+                        case '1':
+                        case '2':
+                        case '3':
+                            result = 0;
+                            return read_oct(
+                                result,
+                                p, std::min<ptrdiff_t>(3, end - p));
+
+                        // Leading digit 4..7: only two octal digits are read.
+                        case '4':
+                        case '5':
+                        case '6':
+                        case '7':
+                            result = 0;
+                            return read_oct(
+                                result,
+                                p, std::min<ptrdiff_t>(2, end - p));
+
+                        // Any other character stands for itself.
+                        default:
+                            result = selector;
+                            return p + 1;
+                    }
+                }
+
+                /// Walks [p, end), passing unescaped runs to `consume_span(ptr, len)`
+                /// and each decoded escape to `consume_one(byte)`.
+                /// A backslash that is the very last byte is emitted as a backslash.
+                template <typename T, typename U>
+                inline void unescape_impl(
+                    const ui8* p,
+                    const ui8* end,
+                    T&& consume_one,
+                    U&& consume_span) {
+                    while (p < end) {
+                        const void* hit = ::memchr(p, '\\', end - p);
+                        if (hit == nullptr) {
+                            // No more escapes: the rest is one plain run.
+                            consume_span(p, end - p);
+                            return;
+                        }
+
+                        auto* backslash = static_cast<const ui8*>(hit);
+                        consume_span(p, backslash - p);
+                        p = backslash + 1;
+
+                        auto decoded = ui8{'\\'};
+                        if (p < end) {
+                            p = unescape_char_and_advance(decoded, p, end);
+                        }
+                        consume_one(decoded);
+                    }
+                }
+ }
+ } // namespace NCEscape
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/cescape_encode.h b/library/cpp/yson_pull/detail/cescape_encode.h
new file mode 100644
index 0000000000..bf5765f1d9
--- /dev/null
+++ b/library/cpp/yson_pull/detail/cescape_encode.h
@@ -0,0 +1,114 @@
+#pragma once
+
+#include <util/system/types.h>
+
+// Whether to ensure strict ASCII compatibility
+// Turns UTF-8 strings into unreadable garbage for no known reason
+//#define CESCAPE_STRICT_ASCII
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NCEscape {
+ namespace NImpl {
+                /// Upper-case hexadecimal character for `value`, which is used as an
+                /// index into the 16-digit alphabet.
+                inline ui8 hex_digit(ui8 value) {
+                    constexpr ui8 alphabet[] = "0123456789ABCDEF";
+                    const ui8 digit = alphabet[value];
+                    return digit;
+                }
+
+                /// Character obtained by offsetting '0' by `value`.
+                inline ui8 oct_digit(ui8 value) {
+                    return static_cast<ui8>('0' + value);
+                }
+
+                /// Whether `c` is emitted as-is by the encoder. Without
+                /// CESCAPE_STRICT_ASCII every byte from 32 upwards qualifies.
+                inline bool is_printable(ui8 c) {
+#ifdef CESCAPE_STRICT_ASCII
+                    return 32 <= c && c <= 126;
+#else
+                    return !(c < 32);
+#endif
+                }
+
+                /// Whether `c` is one of 0-9, A-F or a-f.
+                inline bool is_hex_digit(ui8 c) {
+                    const bool decimal = '0' <= c && c <= '9';
+                    const bool upper = 'A' <= c && c <= 'F';
+                    const bool lower = 'a' <= c && c <= 'f';
+                    return decimal || upper || lower;
+                }
+
+                /// Whether `c` is one of 0-7.
+                inline bool is_oct_digit(ui8 c) {
+                    return '0' <= c && c <= '7';
+                }
+
+ // Size of the scratch buffer escape_char writes into; the longest sequence
+ // it produces (\xHH or \ooo) is 4 bytes.
+ constexpr size_t ESCAPE_C_BUFFER_SIZE = 4;
+
+                /// Escapes a single byte `c` into `r` and returns how many bytes were
+                /// written (1, 2 or 4). `next` is the byte following `c` in the input
+                /// (0 at the end); it decides which numeric form is unambiguous.
+                inline size_t escape_char(
+                    ui8 c,
+                    ui8 next,
+                    ui8 r[ESCAPE_C_BUFFER_SIZE]) {
+                    // Backslash followed by one symbol.
+                    auto emit_pair = [r](ui8 symbol) -> size_t {
+                        r[0] = '\\';
+                        r[1] = symbol;
+                        return 2;
+                    };
+
+                    // Characters with a dedicated two-byte escape.
+                    switch (c) {
+                        case '\"':
+                            return emit_pair('\"');
+                        case '\\':
+                            return emit_pair('\\');
+                        case '\r':
+                            return emit_pair('r');
+                        case '\n':
+                            return emit_pair('n');
+                        case '\t':
+                            return emit_pair('t');
+                        default:
+                            break;
+                    }
+
+                    // Printable characters go as-is.
+                    if (is_printable(c)) {
+                        r[0] = c;
+                        return 1;
+                    }
+
+                    // Short octal form, usable only when the next byte cannot be
+                    // mistaken for a continuation of the octal sequence.
+                    if (c < 8 && !is_oct_digit(next)) {
+                        return emit_pair(oct_digit(c));
+                    }
+
+                    r[0] = '\\';
+                    if (!is_hex_digit(next)) {
+                        // Hexadecimal form: safe because the next byte is not a hex digit.
+                        r[1] = 'x';
+                        r[2] = hex_digit((c & 0xF0) >> 4);
+                        r[3] = hex_digit((c & 0x0F) >> 0);
+                        return 4;
+                    }
+
+                    // Three-digit octal form.
+                    r[1] = oct_digit((c & 0700) >> 6);
+                    r[2] = oct_digit((c & 0070) >> 3);
+                    r[3] = oct_digit((c & 0007) >> 0);
+                    return 4;
+                }
+
+                /// Escapes `len` bytes at `str`, handing the output to
+                /// `consume(ptr, size)` chunk by chunk: runs of bytes that need no
+                /// escaping are passed straight from the input, escape sequences come
+                /// from a small local buffer. Chunks may be empty.
+                template <typename T>
+                inline void escape_impl(const ui8* str, size_t len, T&& consume) {
+                    ui8 escaped[ESCAPE_C_BUFFER_SIZE];
+
+                    // Start of the run of input bytes not yet handed to `consume`.
+                    size_t pending = 0;
+                    for (size_t i = 0; i < len; ++i) {
+                        const ui8 lookahead = (i + 1 < len) ? str[i + 1] : 0;
+                        const size_t escaped_len = escape_char(str[i], lookahead, escaped);
+                        if (escaped_len <= 1) {
+                            // Byte goes out unchanged as part of the pending run.
+                            continue;
+                        }
+
+                        consume(str + pending, i - pending);
+                        pending = i + 1;
+                        consume(escaped, escaped_len);
+                    }
+
+                    // Tail run (the whole input when nothing needed escaping).
+                    consume(str + pending, len - pending);
+                }
+ }
+ } // namespace NCEscape
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/fail.h b/library/cpp/yson_pull/detail/fail.h
new file mode 100644
index 0000000000..6937612d0b
--- /dev/null
+++ b/library/cpp/yson_pull/detail/fail.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "format_string.h"
+#include "macros.h"
+
+#include <library/cpp/yson_pull/exceptions.h>
+#include <library/cpp/yson_pull/position_info.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+        /// Formats `args` into a single message and throws
+        /// NException::TBadInput carrying that message and `info`.
+        template <typename... Args>
+        ATTRIBUTE(noreturn, noinline, cold)
+        void fail(
+            const TPositionInfo& info,
+            Args&&... args) {
+            TString message = format_string(std::forward<Args>(args)...);
+            throw NException::TBadInput(message, info);
+        }
+ }
+}
diff --git a/library/cpp/yson_pull/detail/format_string.h b/library/cpp/yson_pull/detail/format_string.h
new file mode 100644
index 0000000000..683fd1bf36
--- /dev/null
+++ b/library/cpp/yson_pull/detail/format_string.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/string/builder.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NImpl {
+            // Recursion terminator: nothing left to append.
+            inline void apply_args(TStringBuilder&) {
+            }
+
+            /// Streams `arg` and then every remaining argument into `builder`,
+            /// left to right.
+            template <typename T, typename... Args>
+            inline void apply_args(TStringBuilder& builder, T&& arg, Args&&... args) {
+                TStringBuilder& chained = builder << arg;
+                apply_args(chained, std::forward<Args>(args)...);
+            }
+ }
+
+        /// Concatenates the streamed representations of all arguments into one string.
+        template <typename... Args>
+        TString format_string(Args&&... args) {
+            TStringBuilder pieces;
+            NImpl::apply_args(pieces, std::forward<Args>(args)...);
+            return TString(std::move(pieces));
+        }
+ }
+}
diff --git a/library/cpp/yson_pull/detail/input/buffered.h b/library/cpp/yson_pull/detail/input/buffered.h
new file mode 100644
index 0000000000..9b1482577f
--- /dev/null
+++ b/library/cpp/yson_pull/detail/input/buffered.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <library/cpp/yson_pull/detail/macros.h>
+
+#include <library/cpp/yson_pull/exceptions.h>
+#include <library/cpp/yson_pull/input.h>
+
+#include <cstdio>
+#include <memory>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NInput {
+            /// Input stream base that owns a fixed-size byte array for derived
+            /// classes to read into.
+            class TBuffered: public NYsonPull::NInput::IStream {
+                TArrayHolder<ui8> buffer_;
+                size_t size_;
+
+            public:
+                explicit TBuffered(size_t buffer_size)
+                    : buffer_(new ui8[buffer_size])
+                    , size_(buffer_size)
+                {
+                }
+
+            protected:
+                // Size of the owned array, in bytes.
+                size_t buffer_size() const {
+                    return size_;
+                }
+
+                // Start of the owned array.
+                ui8* buffer_data() const {
+                    return buffer_.Get();
+                }
+            };
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/input/stdio_file.h b/library/cpp/yson_pull/detail/input/stdio_file.h
new file mode 100644
index 0000000000..c412b7e59b
--- /dev/null
+++ b/library/cpp/yson_pull/detail/input/stdio_file.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "buffered.h"
+
+#include <library/cpp/yson_pull/detail/macros.h>
+
+#include <library/cpp/yson_pull/exceptions.h>
+#include <library/cpp/yson_pull/input.h>
+
+#include <cstdio>
+#include <memory>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NInput {
+            /// Input stream that reads from a C stdio FILE* into the buffer owned
+            /// by TBuffered. The FILE* is not closed by this class.
+            class TStdioFile: public TBuffered {
+                FILE* file_;
+
+            public:
+                TStdioFile(FILE* file, size_t buffer_size)
+                    : TBuffered(buffer_size)
+                    , file_(file)
+                {
+                }
+
+            protected:
+                /// Reads up to buffer_size() bytes. Throws TSystemError when a
+                /// zero-byte read is accompanied by a stream error, reports at_end on
+                /// a zero-byte read at end of file.
+                result do_fill_buffer() override {
+                    ui8* const storage = buffer_data();
+                    const auto nread = ::fread(storage, 1, buffer_size(), file_);
+                    if (Y_UNLIKELY(nread == 0)) {
+                        if (ferror(file_)) {
+                            throw NException::TSystemError();
+                        }
+                        if (feof(file_)) {
+                            return result::at_end;
+                        }
+                    }
+                    buffer().reset(storage, storage + nread);
+                    return result::have_more_data;
+                }
+            };
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/input/stream.h b/library/cpp/yson_pull/detail/input/stream.h
new file mode 100644
index 0000000000..791cd5a3f5
--- /dev/null
+++ b/library/cpp/yson_pull/detail/input/stream.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <library/cpp/yson_pull/detail/macros.h>
+
+#include <library/cpp/yson_pull/input.h>
+
+#include <util/stream/buffered.h>
+#include <util/stream/file.h>
+#include <util/stream/zerocopy.h>
+#include <util/system/file.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NInput {
+            /// Shared helper for input streams backed by an IZeroCopyInput.
+            class TStreamBase: public NYsonPull::NInput::IStream {
+            protected:
+                /// Points the stream buffer at the next chunk exposed by `input`;
+                /// a zero-sized chunk is reported as end of stream.
+                result DoFillBufferFrom(IZeroCopyInput& input) {
+                    void* chunk = nullptr;
+                    const size_t chunk_size = input.Next(&chunk);
+                    if (Y_UNLIKELY(chunk_size == 0)) {
+                        return result::at_end;
+                    }
+
+                    auto* first = static_cast<ui8*>(chunk);
+                    buffer().reset(first, first + chunk_size);
+                    return result::have_more_data;
+                }
+            };
+
+            /// Input stream over an externally provided IZeroCopyInput, which is
+            /// held by pointer.
+            class TZeroCopy: public TStreamBase {
+                IZeroCopyInput* Input;
+
+            public:
+                explicit TZeroCopy(IZeroCopyInput* input)
+                    : Input{input}
+                {
+                }
+
+            protected:
+                result do_fill_buffer() override {
+                    IZeroCopyInput& source = *Input;
+                    return DoFillBufferFrom(source);
+                }
+            };
+
+            /// Input stream that holds its underlying input object by value.
+            /// Constructor arguments are forwarded to that object's constructor.
+            template <typename TBuffered>
+            class TOwned: public TStreamBase {
+                TBuffered Input;
+
+            public:
+                template <typename... Args>
+                explicit TOwned(Args&&... args)
+                    : Input(std::forward<Args>(args)...)
+                {
+                }
+
+            protected:
+                result do_fill_buffer() override {
+                    return this->DoFillBufferFrom(Input);
+                }
+            };
+
+ // Input stream reading from a file descriptor through an owned TFileInput.
+ // The TFileInput is constructed from Duplicate(fd) and buffer_size.
+ // NOTE(review): Duplicate presumably clones the descriptor so that the
+ // caller keeps ownership of `fd` -- confirm against util/system/file.h.
+ class TFHandle: public TOwned<TFileInput> {
+ public:
+ TFHandle(int fd, size_t buffer_size)
+ : TOwned<TFileInput>(Duplicate(fd), buffer_size)
+ {
+ }
+ };
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/lexer_base.h b/library/cpp/yson_pull/detail/lexer_base.h
new file mode 100644
index 0000000000..572bdb3d18
--- /dev/null
+++ b/library/cpp/yson_pull/detail/lexer_base.h
@@ -0,0 +1,343 @@
+#pragma once
+
+#include "byte_reader.h"
+#include "cescape.h"
+#include "macros.h"
+#include "number.h"
+#include "percent_scalar.h"
+#include "stream_counter.h"
+#include "varint.h"
+
+#include <util/generic/maybe.h>
+#include <util/generic/vector.h>
+#include <util/string/cast.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+        /// Token-level reading primitives for the YSON lexer, built on byte_reader.
+        ///
+        /// Textual tokens are accumulated in an internal token buffer. When a
+        /// memory limit is set, the capacity of that buffer is checked against it
+        /// and exceeding it is reported through Base::fail.
+        /// String views returned by the read_* methods refer either to the token
+        /// buffer or directly to the stream buffer, so they do not own their data.
+        template <bool EnableLinePositionInfo>
+        class lexer_base: public byte_reader<stream_counter<EnableLinePositionInfo>> {
+            using Base = byte_reader<
+                stream_counter<EnableLinePositionInfo>>;
+
+            TVector<ui8> token_buffer_;
+            TMaybe<size_t> memory_limit_;
+
+        public:
+            lexer_base(
+                NYsonPull::NInput::IStream& buffer,
+                TMaybe<size_t> memory_limit)
+                : Base(buffer)
+                , memory_limit_{memory_limit} {
+            }
+
+            /// Returns the next non-space byte without consuming it.
+            /// The common case (buffer non-empty, current byte not a space) is
+            /// handled inline; everything else goes to the cold fallback.
+            ATTRIBUTE(noinline, hot)
+            ui8 skip_space_and_get_byte() {
+                auto& buf = Base::stream().buffer();
+                if (Y_LIKELY(!buf.is_empty())) {
+                    auto ch = *buf.pos();
+                    if (Y_LIKELY(!is_space(ch))) {
+                        return ch;
+                    }
+                }
+                return skip_space_and_get_byte_fallback();
+            }
+
+            /// Peeks at the current byte, refilling the buffer only when it is empty.
+            ATTRIBUTE(hot)
+            ui8 get_byte() {
+                auto& buf = Base::stream().buffer();
+                if (Y_LIKELY(!buf.is_empty())) {
+                    return *buf.pos();
+                }
+                return Base::get_byte();
+            }
+
+            /// Reads a textual numeric literal. The literal is int64 unless it
+            /// contains '.', 'e' or 'E' (float64) or 'u' (uint64, 'u' being the suffix).
+            /// Any other alphabetic character inside the literal is a failure, and so
+            /// is a conversion error from FromString.
+            number read_numeric() {
+                token_buffer_.clear();
+                auto type = number_type::int64;
+                while (true) {
+                    auto ch = this->Base::template get_byte<true>();
+                    if (isdigit(ch) || ch == '+' || ch == '-') {
+                        token_buffer_.push_back(ch);
+                    } else if (ch == '.' || ch == 'e' || ch == 'E') {
+                        token_buffer_.push_back(ch);
+                        type = number_type::float64;
+                    } else if (ch == 'u') {
+                        token_buffer_.push_back(ch);
+                        type = number_type::uint64;
+                    } else if (Y_UNLIKELY(isalpha(ch))) {
+                        COLD_BLOCK_BYVALUE
+                        Base::fail("Unexpected ", NCEscape::quote(ch), " in numeric literal");
+                        COLD_BLOCK_END
+                    } else {
+                        break;
+                    }
+                    check_memory_limit();
+                    Base::advance(1);
+                }
+
+                auto str = token_buffer();
+                try {
+                    switch (type) {
+                        case number_type::float64:
+                            return FromString<double>(str);
+                        case number_type::int64:
+                            return FromString<i64>(str);
+                        case number_type::uint64:
+                            str.Chop(1); // 'u' suffix
+                            return FromString<ui64>(str);
+                    }
+                    Y_UNREACHABLE();
+                } catch (const std::exception& err) {
+                    Base::fail(err.what());
+                }
+            }
+
+            /// Reads the body of a double-quoted string up to and including its
+            /// closing quote, then unescapes it in place. The returned view refers
+            /// to the token buffer. Running out of input before the closing quote
+            /// is a failure.
+            TStringBuf read_quoted_string() {
+                // Number of consecutive backslashes at the end of [begin, end).
+                // Walks `end` backwards and never forms a pointer below `begin`.
+                auto count_trailing_slashes = [](const ui8* begin, const ui8* end) {
+                    auto count = size_t{0};
+                    while (end != begin && *(end - 1) == '\\') {
+                        --end;
+                        ++count;
+                    }
+                    return count;
+                };
+
+                token_buffer_.clear();
+                auto& buf = Base::stream().buffer();
+                while (true) {
+                    this->Base::template fill_buffer<false>();
+                    auto* quote = reinterpret_cast<const ui8*>(
+                        ::memchr(buf.pos(), '"', buf.available()));
+                    if (quote == nullptr) {
+                        // No quote in this chunk: take it whole and keep looking.
+                        token_buffer_.insert(
+                            token_buffer_.end(),
+                            buf.pos(),
+                            buf.end());
+                        // Enforce the limit here too, otherwise a long string without
+                        // any quote character would grow the buffer unchecked.
+                        check_memory_limit();
+                        Base::advance(buf.available());
+                        continue;
+                    }
+
+                    token_buffer_.insert(
+                        token_buffer_.end(),
+                        buf.pos(),
+                        quote);
+                    // Checked before the parity test below so that the final chunk of
+                    // the string is subject to the limit as well.
+                    check_memory_limit();
+                    Base::advance(quote - buf.pos() + 1); // +1 for the quote itself
+
+                    // We must count the number of '\' at the end of StringValue
+                    // to check if it's not \"
+                    const size_t slash_count = count_trailing_slashes(
+                        token_buffer_.data(),
+                        token_buffer_.data() + token_buffer_.size());
+                    if (slash_count % 2 == 0) {
+                        break;
+                    } else {
+                        // Odd number of backslashes: the quote is escaped and
+                        // belongs to the string.
+                        token_buffer_.push_back('"');
+                    }
+                    check_memory_limit();
+                }
+
+                NCEscape::decode_inplace(token_buffer_);
+                return token_buffer();
+            }
+
+            /// Reads a run of letters, digits, '_', '-', '%' and '.' into the token
+            /// buffer and returns a view of it.
+            TStringBuf read_unquoted_string() {
+                token_buffer_.clear();
+                while (true) {
+                    auto ch = this->Base::template get_byte<true>();
+                    if (isalpha(ch) || isdigit(ch) ||
+                        ch == '_' || ch == '-' || ch == '%' || ch == '.') {
+                        token_buffer_.push_back(ch);
+                    } else {
+                        break;
+                    }
+                    check_memory_limit();
+                    Base::advance(1);
+                }
+                return token_buffer();
+            }
+
+            /// Reads a length-prefixed binary string. The length is a varint-encoded
+            /// i32; a negative value is a failure. When the whole string is already
+            /// in the stream buffer, the returned view points straight into it;
+            /// otherwise the bytes are gathered in the token buffer.
+            ATTRIBUTE(noinline, hot)
+            TStringBuf read_binary_string() {
+                auto slength = NVarInt::read<i32>(*this);
+                if (Y_UNLIKELY(slength < 0)) {
+                    COLD_BLOCK_BYVALUE
+                    Base::fail("Negative binary string literal length ", slength);
+                    COLD_BLOCK_END
+                }
+                auto length = static_cast<ui32>(slength);
+
+                auto& buf = Base::stream().buffer();
+                if (Y_LIKELY(buf.available() >= length)) {
+                    auto result = TStringBuf{
+                        reinterpret_cast<const char*>(buf.pos()),
+                        length};
+                    Base::advance(length);
+                    return result;
+                } else { // reading in Buffer
+                    return read_binary_string_fallback(length);
+                }
+            }
+
+            /// Gathers `length` bytes from as many stream buffers as needed into
+            /// the token buffer. Running out of input is a failure.
+            ATTRIBUTE(noinline)
+            TStringBuf read_binary_string_fallback(size_t length) {
+                auto& buf = Base::stream().buffer();
+                auto needToRead = length;
+                token_buffer_.clear();
+                while (needToRead) {
+                    this->Base::template fill_buffer<false>();
+                    auto chunk_size = std::min(needToRead, buf.available());
+
+                    token_buffer_.insert(
+                        token_buffer_.end(),
+                        buf.pos(),
+                        buf.pos() + chunk_size);
+                    check_memory_limit();
+                    needToRead -= chunk_size;
+                    Base::advance(chunk_size);
+                }
+                return token_buffer();
+            }
+
+            /// Reads the remainder of a %-literal. The first byte selects the
+            /// literal, the rest must match it exactly; anything else is a failure.
+            percent_scalar read_percent_scalar() {
+                auto throw_incorrect_percent_scalar = [&]() {
+                    Base::fail("Incorrect %-literal prefix ", NCEscape::quote(token_buffer()));
+                };
+
+                // Index 1 of the literal is compared by the switch below;
+                // verification therefore starts at index 2.
+                auto assert_literal = [&](TStringBuf literal) -> void {
+                    for (size_t i = 2; i < literal.size(); ++i) {
+                        token_buffer_.push_back(this->Base::template get_byte<false>());
+                        Base::advance(1);
+                        if (Y_UNLIKELY(token_buffer_.back() != literal[i])) {
+                            throw_incorrect_percent_scalar();
+                        }
+                    }
+                };
+
+                token_buffer_.clear();
+                token_buffer_.push_back(this->Base::template get_byte<false>());
+                Base::advance(1);
+
+                switch (token_buffer_[0]) {
+                    case 't':
+                        assert_literal(percent_scalar::true_literal);
+                        return percent_scalar(true);
+                    case 'f':
+                        assert_literal(percent_scalar::false_literal);
+                        return percent_scalar(false);
+                    case 'n':
+                        assert_literal(percent_scalar::nan_literal);
+                        return percent_scalar(std::numeric_limits<double>::quiet_NaN());
+                    case 'i':
+                        assert_literal(percent_scalar::positive_inf_literal);
+                        return percent_scalar(std::numeric_limits<double>::infinity());
+                    case '-':
+                        assert_literal(percent_scalar::negative_inf_literal);
+                        return percent_scalar(-std::numeric_limits<double>::infinity());
+                    default:
+                        throw_incorrect_percent_scalar();
+                }
+
+                Y_UNREACHABLE();
+            }
+
+            i64 read_binary_int64() {
+                return NVarInt::read<i64>(*this);
+            }
+
+            ui64 read_binary_uint64() {
+                return NVarInt::read<ui64>(*this);
+            }
+
+            /// Reads sizeof(double) raw bytes, possibly spanning several stream
+            /// buffers, and returns them reinterpreted as a double in memory order.
+            /// Running out of input is a failure.
+            double read_binary_double() {
+                union {
+                    double as_double;
+                    ui8 as_bytes[sizeof(double)];
+                } data;
+                static_assert(sizeof(data) == sizeof(double), "bad union size");
+
+                auto needToRead = sizeof(double);
+
+                auto& buf = Base::stream().buffer();
+                while (needToRead != 0) {
+                    Base::fill_buffer();
+
+                    auto chunk_size = std::min(needToRead, buf.available());
+                    if (chunk_size == 0) {
+                        Base::fail("Error parsing binary double literal");
+                    }
+                    std::copy(
+                        buf.pos(),
+                        buf.pos() + chunk_size,
+                        data.as_bytes + (sizeof(double) - needToRead));
+                    needToRead -= chunk_size;
+                    Base::advance(chunk_size);
+                }
+                return data.as_double;
+            }
+
+        private:
+            // Table-driven whitespace test: bytes 9..13 and 32 count as space.
+            static bool is_space(ui8 ch) {
+                static const ui8 lookupTable[] =
+                    {
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+                return lookupTable[ch];
+            }
+
+            // Slow path of skip_space_and_get_byte: consumes spaces, refilling the
+            // buffer as needed. Yields '\0' once the stream is at its end.
+            ATTRIBUTE(noinline, cold)
+            ui8 skip_space_and_get_byte_fallback() {
+                auto& buf = Base::stream().buffer();
+                while (true) {
+                    // FIXME
+                    if (buf.is_empty()) {
+                        if (Base::stream().at_end()) {
+                            return '\0';
+                        }
+                        Base::fill_buffer();
+                    } else {
+                        if (!is_space(*buf.pos())) {
+                            break;
+                        }
+                        Base::advance(1);
+                    }
+                }
+                return Base::get_byte();
+            }
+
+            // Fails when a memory limit is set and the token buffer's capacity
+            // (not its size) exceeds it.
+            void check_memory_limit() {
+                if (Y_UNLIKELY(memory_limit_ && token_buffer_.capacity() > *memory_limit_)) {
+                    COLD_BLOCK_BYVALUE
+                    Base::fail(
+                        "Memory limit exceeded while parsing YSON stream: "
+                        "allocated ",
+                        token_buffer_.capacity(),
+                        ", limit ", *memory_limit_);
+                    COLD_BLOCK_END
+                }
+            }
+
+            // Non-owning view of the current token buffer contents.
+            TStringBuf token_buffer() const {
+                auto* begin = reinterpret_cast<const char*>(token_buffer_.data());
+                return {begin, token_buffer_.size()};
+            }
+        };
+ }
+}
diff --git a/library/cpp/yson_pull/detail/macros.h b/library/cpp/yson_pull/detail/macros.h
new file mode 100644
index 0000000000..7243f9cfe1
--- /dev/null
+++ b/library/cpp/yson_pull/detail/macros.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <util/system/compiler.h>
+
+// ATTRIBUTE(a, b, ...) expands to a GNU-style attribute list on compilers that
+// define __GNUC__ and to nothing elsewhere.
+// The standard __VA_ARGS__ form is used instead of the GNU-only named
+// variadic parameter (`args...`), which pedantic builds warn about.
+#if defined(__GNUC__)
+#define ATTRIBUTE(...) __attribute__((__VA_ARGS__))
+#else
+#define ATTRIBUTE(...)
+#endif
+
+// COLD_BLOCK_BYVALUE / COLD_BLOCK_BYREF ... COLD_BLOCK_END wrap the enclosed
+// statements into an immediately invoked lambda (capturing by value or by
+// reference), marked noinline/cold where the compiler accepts that.
+#if defined(__GNUC__) && !defined(__clang__)
+#define COLD_BLOCK_BYVALUE [=]() ATTRIBUTE(noinline, cold) {
+#define COLD_BLOCK_BYREF [&]() ATTRIBUTE(noinline, cold) {
+#else
+// Clang does not support gnu-style attributes on lambda functions yet
+#define COLD_BLOCK_BYVALUE [=]() {
+#define COLD_BLOCK_BYREF [&]() {
+#endif
+
+// Closes the lambda body and invokes it; identical for every compiler.
+#define COLD_BLOCK_END \
+    }                  \
+    ();
diff --git a/library/cpp/yson_pull/detail/number.h b/library/cpp/yson_pull/detail/number.h
new file mode 100644
index 0000000000..5595f55e05
--- /dev/null
+++ b/library/cpp/yson_pull/detail/number.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <util/system/types.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+ // Tag identifying which representation a parsed number uses.
+ enum class number_type {
+ float64,
+ uint64,
+ int64
+ };
+
+        /// Tagged numeric value: `type` tells which member of `value` is set.
+        /// Each constructor is implicit and sets the tag matching its argument type.
+        struct number {
+            number_type type;
+            union {
+                double as_float64;
+                ui64 as_uint64;
+                i64 as_int64;
+            } value;
+
+            number(double v)
+                : type{number_type::float64}
+            {
+                value.as_float64 = v;
+            }
+
+            number(i64 v)
+                : type{number_type::int64}
+            {
+                value.as_int64 = v;
+            }
+
+            number(ui64 v)
+                : type{number_type::uint64}
+            {
+                value.as_uint64 = v;
+            }
+        };
+ }
+}
diff --git a/library/cpp/yson_pull/detail/output/buffered.h b/library/cpp/yson_pull/detail/output/buffered.h
new file mode 100644
index 0000000000..475cf34785
--- /dev/null
+++ b/library/cpp/yson_pull/detail/output/buffered.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <library/cpp/yson_pull/detail/macros.h>
+
+#include <library/cpp/yson_pull/output.h>
+
+#include <util/generic/strbuf.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NOutput {
+            /// Output stream base owning a fixed-size byte buffer.
+            /// The derived class T supplies `write(TStringBuf)`, which is reached
+            /// through a static_cast of `this` to T.
+            template <typename T>
+            class TBuffered: public NYsonPull::NOutput::IStream {
+                TArrayHolder<ui8> buffer_;
+                size_t size_;
+
+            public:
+                TBuffered(size_t buffer_size)
+                    : buffer_(new ui8[buffer_size])
+                    , size_(buffer_size)
+                {
+                    reset_buffer();
+                }
+
+            protected:
+                /// Writes out whatever is buffered, then deals with `extra`: it is
+                /// written directly when it is at least as large as the free space,
+                /// otherwise (if non-empty) it is copied into the buffer.
+                void do_flush_buffer(TStringBuf extra) override {
+                    auto& out = buffer();
+                    if (!out.is_empty()) {
+                        do_write({reinterpret_cast<const char*>(out.begin()), out.used()});
+                        reset_buffer();
+                    }
+
+                    const auto extra_size = extra.size();
+                    if (extra_size >= out.available()) {
+                        do_write(extra);
+                        return;
+                    }
+                    if (extra_size > 0) {
+                        ::memcpy(out.pos(), extra.data(), extra_size);
+                        out.advance(extra_size);
+                    }
+                }
+
+            private:
+                // Points the stream buffer back at the whole owned array.
+                void reset_buffer() {
+                    ui8* const first = buffer_.Get();
+                    buffer().reset(first, first + size_);
+                }
+
+                void do_write(TStringBuf data) {
+                    // CRTP dispatch
+                    static_cast<T*>(this)->write(data);
+                }
+            };
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/output/stdio_file.h b/library/cpp/yson_pull/detail/output/stdio_file.h
new file mode 100644
index 0000000000..03f2b40dc5
--- /dev/null
+++ b/library/cpp/yson_pull/detail/output/stdio_file.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "buffered.h"
+
+#include <library/cpp/yson_pull/detail/macros.h>
+
+#include <library/cpp/yson_pull/exceptions.h>
+
+#include <cstdio>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NOutput {
+ //! Buffered output whose sink is a C stdio FILE*.
+ class TStdioFile: public TBuffered<TStdioFile> {
+     FILE* file_;
+
+ public:
+     TStdioFile(FILE* file, size_t buffer_size)
+         : TBuffered<TStdioFile>(buffer_size)
+         , file_(file)
+     {
+     }
+
+     //! Writes `data` with fwrite; throws NException::TSystemError when
+     //! fewer bytes than requested were written.
+     void write(TStringBuf data) {
+         const size_t expected = data.size();
+         const size_t actual = ::fwrite(data.data(), 1, expected, file_);
+         if (Y_UNLIKELY(actual != expected)) {
+             throw NException::TSystemError();
+         }
+     }
+ };
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/output/stream.h b/library/cpp/yson_pull/detail/output/stream.h
new file mode 100644
index 0000000000..d4810f3353
--- /dev/null
+++ b/library/cpp/yson_pull/detail/output/stream.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "buffered.h"
+
+#include <library/cpp/yson_pull/detail/macros.h>
+#include <library/cpp/yson_pull/exceptions.h>
+
+#include <util/stream/output.h>
+#include <util/stream/file.h>
+#include <util/system/file.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NOutput {
+ class TStream: public TBuffered<TStream> { // Buffered output that forwards flushed bytes to an IOutputStream
+ IOutputStream* Output; // destination stream, held as a raw pointer
+
+ public:
+ TStream(IOutputStream* output, size_t buffer_size) // output: destination stream; buffer_size: bytes given to TBuffered
+ : TBuffered<TStream>(buffer_size)
+ , Output(output)
+ {
+ }
+
+ void write(TStringBuf data) { // sink hook invoked by TBuffered through CRTP
+ Output->Write(data);
+ }
+ };
+
+ template <typename TOutput> // TOutput: stream type stored by value; must provide Write(TStringBuf)
+ class TOwned: public TBuffered<TOwned<TOutput>> {
+ TOutput Output; // the wrapped stream, constructed in place from the constructor arguments
+
+ public:
+ template <typename... Args>
+ TOwned(size_t buffer_size, Args&&... args) // args are perfectly forwarded to TOutput's constructor
+ : TBuffered<TOwned>(buffer_size)
+ , Output(std::forward<Args>(args)...)
+ {
+ }
+
+ void write(TStringBuf data) { // sink hook invoked by TBuffered through CRTP
+ Output.Write(data);
+ }
+ };
+
+ class TFHandle: public TOwned<TUnbufferedFileOutput> { // Buffered output over a TUnbufferedFileOutput built from a file descriptor
+ public:
+ TFHandle(int fd, size_t buffer_size)
+ : TOwned<TUnbufferedFileOutput>(buffer_size, Duplicate(fd)) // the output is built from the result of Duplicate(fd), not from fd itself
+ {
+ }
+ };
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/percent_scalar.h b/library/cpp/yson_pull/detail/percent_scalar.h
new file mode 100644
index 0000000000..ff4571842e
--- /dev/null
+++ b/library/cpp/yson_pull/detail/percent_scalar.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+
+namespace NYsonPull::NDetail {
+ enum class percent_scalar_type { // Discriminator naming the member of percent_scalar::value that is populated
+ boolean, // value.as_boolean is set
+ float64 // value.as_float64 is set
+ };
+
+ //! Tagged union for a '%'-prefixed scalar: either a boolean or a float64.
+ //! `type` names the member of `value` that the constructor wrote.
+ struct percent_scalar {
+     //! Text boolean literals
+     static constexpr TStringBuf true_literal = "%true";
+     static constexpr TStringBuf false_literal = "%false";
+     //! Text floating-point literals
+     static constexpr TStringBuf nan_literal = "%nan";
+     static constexpr TStringBuf positive_inf_literal = "%inf";
+     static constexpr TStringBuf negative_inf_literal = "%-inf";
+
+     percent_scalar_type type;
+     union {
+         double as_float64;
+         bool as_boolean;
+     } value;
+
+     //! Wraps a floating-point value.
+     percent_scalar(double v)
+         : type{percent_scalar_type::float64}
+     {
+         value.as_float64 = v;
+     }
+
+     //! Wraps a boolean value.
+     percent_scalar(bool v)
+         : type{percent_scalar_type::boolean}
+     {
+         value.as_boolean = v;
+     }
+ };
+}
diff --git a/library/cpp/yson_pull/detail/reader.h b/library/cpp/yson_pull/detail/reader.h
new file mode 100644
index 0000000000..0e02396358
--- /dev/null
+++ b/library/cpp/yson_pull/detail/reader.h
@@ -0,0 +1,677 @@
+#pragma once
+
+#include "lexer_base.h"
+#include "symbols.h"
+
+#include <library/cpp/yson_pull/reader.h>
+
+#include <util/generic/maybe.h>
+#include <util/generic/vector.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+ /*! \internal */
+ ////////////////////////////////////////////////////////////////////////////////
+
+ enum class special_token : ui8 {
+ // Single-character YSON structural tokens.
+ // Each value equals the matching char_class entry shifted right by two bits.
+ semicolon = 0, // ;
+ equals = 1, // =
+ hash = 2, // #
+ left_bracket = 3, // [
+ right_bracket = 4, // ]
+ left_brace = 5, // {
+ right_brace = 6, // }
+ left_angle = 7, // <
+ right_angle = 8, // >
+ };
+
+ // char_class tree representation (the low bits select the branch):
+ // Root = xb
+ // BinaryStringOrOtherSpecialToken = x0b
+ // BinaryString = 00b
+ // OtherSpecialToken = 10b
+ // Other = x1b
+ // BinaryScalar = xxx01b
+ // BinaryInt64 = 00001b
+ // BinaryDouble = 00101b
+ // BinaryFalse = 01001b
+ // BinaryTrue = 01101b
+ // BinaryUint64 = 10001b
+ // Other = xxx11b
+ // Quote = 00011b
+ // Number = 00111b (digits, '+' and '-')
+ // String = 01011b
+ // Space = 01111b (not assigned: whitespace is looked up as None)
+ // Plus = 10011b (not assigned: '+' is looked up as Number)
+ // None = 10111b
+ // Percent = 11011b
+ enum class char_class : ui8 { // Class of an input byte; the two low bits select the group, the remaining bits the member
+ binary_string = 0, // = 00b
+
+ special_token_mask = 2, // = 10b, low two bits shared by every special token below
+ semicolon = 2 + (0 << 2), // upper bits hold the special_token value
+ equals = 2 + (1 << 2),
+ hash = 2 + (2 << 2),
+ left_bracket = 2 + (3 << 2),
+ right_bracket = 2 + (4 << 2),
+ left_brace = 2 + (5 << 2),
+ right_brace = 2 + (6 << 2),
+ left_angle = 2 + (7 << 2),
+ right_angle = 2 + (8 << 2),
+
+ binary_scalar_mask = 1, // = 01b, low two bits shared by every binary scalar below
+ binary_int64 = 1 + (0 << 2), // = 00001b
+ binary_double = 1 + (1 << 2), // = 00101b
+ binary_false = 1 + (2 << 2), // = 01001b
+ binary_true = 1 + (3 << 2), // = 01101b
+ binary_uint64 = 1 + (4 << 2), // = 10001b
+
+ other_mask = 3, // = 11b, low two bits shared by the text classes below
+ quote = 3 + (0 << 2), // = 00011b
+ number = 3 + (1 << 2), // = 00111b
+ string = 3 + (2 << 2), // = 01011b
+ percent = 3 + (6 << 2), // = 11011b
+ none = 3 + (5 << 2), // = 10111b
+ };
+
+#define CHAR_SUBCLASS(x) (static_cast<ui8>(x) >> 2)
+
+ //! Classifies an input byte with a 256-entry lookup table.
+ //!
+ //! Whitespace, NUL and every byte that cannot start a token map to
+ //! char_class::none. The short helper macros exist only to keep the table
+ //! readable; all of them are undefined again before returning so that
+ //! none leaks out of this header.
+ inline char_class get_char_class(ui8 ch) {
+ #define NN char_class::none
+ #define BS char_class::binary_string
+ #define BI char_class::binary_int64
+ #define BD char_class::binary_double
+ #define BF char_class::binary_false
+ #define BT char_class::binary_true
+ #define BU char_class::binary_uint64
+ #define SP NN // char_class::space
+ #define NB char_class::number
+ #define ST char_class::string
+ #define QU char_class::quote
+ #define PC char_class::percent
+ #define TT(name) (static_cast<char_class>( \
+     (static_cast<ui8>(special_token::name) << 2) | static_cast<ui8>(char_class::special_token_mask)))
+
+     static constexpr char_class lookup[256] =
+         {
+             NN, BS, BI, BD, BF, BT, BU, NN, NN, SP, SP, SP, SP, SP, NN, NN,
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
+
+             // 32
+             SP, // ' '
+             NN, // '!'
+             QU, // '"'
+             TT(hash), // '#'
+             NN, // '$'
+             PC, // '%'
+             NN, // '&'
+             NN, // "'"
+             NN, // '('
+             NN, // ')'
+             NN, // '*'
+             NB, // '+'
+             NN, // ','
+             NB, // '-'
+             NN, // '.'
+             NN, // '/'
+
+             // 48
+             NB, NB, NB, NB, NB, NB, NB, NB, NB, NB, // '0' - '9'
+             NN, // ':'
+             TT(semicolon), // ';'
+             TT(left_angle), // '<'
+             TT(equals), // '='
+             TT(right_angle), // '>'
+             NN, // '?'
+
+             // 64
+             NN, // '@'
+             ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'A' - 'M'
+             ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'N' - 'Z'
+             TT(left_bracket), // '['
+             NN, // '\'
+             TT(right_bracket), // ']'
+             NN, // '^'
+             ST, // '_'
+
+             // 96
+             NN, // '`'
+
+             ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'a' - 'm'
+             ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, ST, // 'n' - 'z'
+             TT(left_brace), // '{'
+             NN, // '|'
+             TT(right_brace), // '}'
+             NN, // '~'
+             NN, // '^?' non-printable
+             // 128
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
+
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN,
+             NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN, NN};
+
+ // Undefine every helper defined above (BF, BT, BU and PC used to leak).
+ #undef NN
+ #undef BS
+ #undef BI
+ #undef BD
+ #undef BF
+ #undef BT
+ #undef BU
+ #undef SP
+ #undef NB
+ #undef ST
+ #undef QU
+ #undef PC
+ #undef TT
+     return lookup[ch];
+ }
+
+ template <bool EnableLinePositionInfo>
+ class gen_reader_impl {
+ enum class state { // Parser states; values below first_rare_state are served by next_event_hot()
+ delimiter = 0, //! expecting ';' or closing-char ('>', ']', '}')
+ maybe_value = 1, //! expecting a value or closing-char
+ maybe_key = 2, //! expecting a key or closing-char
+ equals = 3, //! expecting '=' (followed by value)
+ value = 4, //! expecting a value
+ value_noattr = 5, //! expecting a value w/o attrs (after attrs)
+
+ // by design, rare states have numbers starting from first_rare_state,
+ // so a single comparison in next_event() separates hot from cold states
+ first_rare_state = 6,
+ before_begin = first_rare_state, //! before started reading the stream
+ before_end = first_rare_state + 1, //! Expecting end of stream
+ after_end = first_rare_state + 2, //! after end of stream
+ };
+
+ lexer_base<EnableLinePositionInfo> lexer_; // byte-level reader used for all input access and error reporting
+ state state_; // current parser state
+ TEvent event_; // most recently produced event, returned by last_event()/next_event()
+ TVector<EEventType> stack_; // opening events of the constructs that are currently open
+ EStreamType mode_; // stream flavour selected at construction
+
+ public:
+ gen_reader_impl( // Builds a reader positioned before the start of the stream
+ NYsonPull::NInput::IStream& buffer, // input handed to the lexer
+ EStreamType mode, // decides which state follows BeginStream
+ TMaybe<size_t> memoryLimit = {}) // forwarded to the lexer unchanged
+ : lexer_(buffer, memoryLimit)
+ , state_{state::before_begin}
+ , mode_{mode} {
+ }
+
+ const TEvent& last_event() const { // Returns the stored event without reading any input
+ return event_;
+ }
+
+ //! Advances the reader by one event and returns a reference to it.
+ ATTRIBUTE(hot)
+ const TEvent& next_event() {
+     if (Y_LIKELY(state_ < state::first_rare_state)) {
+         // 'hot' handler for in-stream events
+         next_event_hot();
+         return event_;
+     }
+     // the remaining states occur no more than once per stream
+     next_event_cold();
+     return event_;
+ }
+
+ private:
+ //! Handles the frequent in-stream states: classifies the next byte
+ //! (skipping whitespace and detecting end of input when needed) and
+ //! runs the handler that belongs to the current state.
+ ATTRIBUTE(hot)
+ void next_event_hot() {
+     auto ch = lexer_.get_byte();
+     auto cls = get_char_class(ch);
+     if (Y_UNLIKELY(cls == char_class::none)) {
+         // Whitespace and the eof symbol are both classified as `none`.
+         ch = lexer_.skip_space_and_get_byte();
+         if (Y_UNLIKELY(ch == NSymbol::eof)) {
+             handle_eof();
+             return;
+         }
+         cls = get_char_class(ch);
+     }
+
+     switch (state_) {
+         // maybe_value/value/value_noattr share one handler and are
+         // distinguished later in state_value_special
+         case state::maybe_value:
+         case state::value:
+         case state::value_noattr:
+             state_value(ch, cls);
+             break;
+         case state::maybe_key:
+             state_maybe_key(ch, cls);
+             break;
+         case state::equals:
+             state_equals(ch);
+             break;
+         case state::delimiter:
+             state_delimiter(ch, cls);
+             break;
+         default:
+             Y_UNREACHABLE();
+     }
+ }
+
+ //! Handles the states that occur at most once per stream
+ //! (before_begin, before_end) and rejects reads issued after the end.
+ ATTRIBUTE(noinline, cold)
+ void next_event_cold() {
+     switch (state_) {
+         case state::before_begin:
+             state_before_begin();
+             break;
+         case state::after_end:
+             lexer_.fail("Attempted read past stream end");
+             // Terminate the branch explicitly so that it can never
+             // continue into the before_end handler below.
+             break;
+         case state::before_end:
+             state_before_end();
+             break;
+         default:
+             Y_UNREACHABLE();
+     }
+ }
+
+ //! Stores a scalar event built from \p value as the current event
+ template <typename T>
+ void yield(T value) {
+ event_ = TEvent{TScalar{value}};
+ }
+
+ //! Stores an event of the given non-scalar \p type (i.e. key) that carries the scalar \p value
+ template <typename T>
+ void yield(EEventType type, T value) {
+ event_ = TEvent{type, TScalar{value}};
+ }
+
+ //! Unpacks a number variant and presents the member selected by its tag
+ void yield(const number& value) {
+     const auto& payload = value.value;
+     if (value.type == number_type::int64) {
+         yield(payload.as_int64);
+     } else if (value.type == number_type::uint64) {
+         yield(payload.as_uint64);
+     } else if (value.type == number_type::float64) {
+         yield(payload.as_float64);
+     }
+ }
+
+ //! Unpacks a %-literal variant and presents the member selected by its tag
+ void yield(const percent_scalar& value) {
+     const auto& payload = value.value;
+     if (value.type == percent_scalar_type::boolean) {
+         yield(payload.as_boolean);
+     } else if (value.type == percent_scalar_type::float64) {
+         yield(payload.as_float64);
+     }
+ }
+
+ //! Stores an event that has only a type and no attached value
+ void yield(EEventType type) {
+ event_ = TEvent{type};
+ }
+
+ //! Records the opening event of a paired construct so that pop() can verify its closing counterpart
+ void push(EEventType type) {
+ stack_.push_back(type);
+ }
+
+ //! Closes a paired construct: checks that `first` is the innermost open
+ //! event, emits `last` and selects the state that follows the closer.
+ void pop(EEventType first, EEventType last) {
+     const bool matches = !stack_.empty() && stack_.back() == first;
+     if (Y_UNLIKELY(!matches)) {
+         pop_fail(first, last);
+         return;
+     }
+     stack_.pop_back();
+
+     yield(last);
+     switch (first) {
+         case EEventType::BeginList:
+         case EEventType::BeginMap:
+             next(state::delimiter);
+             break;
+
+         case EEventType::BeginAttributes:
+             next(state::value_noattr);
+             break;
+
+         case EEventType::BeginStream:
+             next(state::after_end);
+             break;
+
+         default:
+             Y_UNREACHABLE();
+     }
+
+     // In Node mode, once only BeginStream remains on the stack and a
+     // delimiter would be expected, nothing but end of stream may follow.
+     const bool root_closed = mode_ == EStreamType::Node && stack_.size() == 1 && state_ == state::delimiter;
+     if (Y_UNLIKELY(root_closed)) {
+         next(state::before_end);
+     }
+ }
+
+ //! Reports a closing event that does not match the innermost open one.
+ ATTRIBUTE(noinline, cold)
+ void pop_fail(EEventType first, EEventType last) {
+     if (!stack_.empty()) {
+         lexer_.fail("Unpaired events: expected opening '", first, "' for '", last, "', but '", stack_.back(), "' is found.");
+     } else {
+         lexer_.fail("Unpaired events: expected opening '", first, "' for '", last, "', but event stack is empty");
+     }
+ }
+
+ //! Makes \p new_state the current parser state; performs no validation
+ void next(state new_state) {
+ state_ = new_state;
+ }
+
+ //! Tells whether the innermost open construct takes key/value pairs:
+ //! a map, an attribute map, or the stream itself in MapFragment mode.
+ //! The event stack must not be empty when this is called.
+ bool in_map() {
+     switch (stack_.back()) {
+         case EEventType::BeginMap:
+         case EEventType::BeginAttributes:
+             return true;
+         case EEventType::BeginStream:
+             return mode_ == EStreamType::MapFragment;
+         default:
+             return false;
+     }
+ }
+
+ //! Reacts to end of input: closes the stream when the current state
+ //! permits it and fails otherwise.
+ ATTRIBUTE(noinline, cold)
+ void handle_eof() {
+     const bool eof_allowed =
+         state_ == state::maybe_value ||
+         state_ == state::maybe_key ||
+         state_ == state::delimiter ||
+         state_ == state::before_end;
+     if (!eof_allowed) {
+         lexer_.fail("Unexpected end of stream");
+         return;
+     }
+     pop(EEventType::BeginStream, EEventType::EndStream);
+ }
+
+ ATTRIBUTE(noinline, cold) // runs once, on the first next_event() call
+ void state_before_begin() { // Emits BeginStream and selects the first in-stream state from the stream mode
+ push(EEventType::BeginStream); // stays on the stack until handle_eof() pops it
+ yield(EEventType::BeginStream);
+ switch (mode_) {
+ case EStreamType::Node:
+ next(state::value); // a value is required
+ break;
+ case EStreamType::ListFragment:
+ next(state::maybe_value); // a value may follow
+ break;
+ case EStreamType::MapFragment:
+ next(state::maybe_key); // a key may follow
+ break;
+ default:
+ Y_UNREACHABLE();
+ }
+ }
+
+ //! Accepts only whitespace followed by end of input; any other byte
+ //! is reported as an error.
+ ATTRIBUTE(noinline, cold)
+ void state_before_end() {
+     const auto ch = lexer_.skip_space_and_get_byte();
+     if (ch != NSymbol::eof) {
+         lexer_.fail("Expected stream end, but found ", NCEscape::quote(ch));
+     } else {
+         handle_eof();
+     }
+ }
+
+ //! Fast path for the item separator; every other byte goes to
+ //! state_delimiter_fallback.
+ ATTRIBUTE(hot)
+ void state_delimiter(ui8 ch, char_class cls) {
+     if (Y_LIKELY(ch == NSymbol::item_separator)) {
+         lexer_.advance(1);
+         const auto following = in_map() ? state::maybe_key : state::maybe_value;
+         next(following);
+         // the separator yields no event: immediately read next value
+         next_event_hot();
+     } else {
+         state_delimiter_fallback(ch, cls);
+     }
+ }
+
+ ATTRIBUTE(noinline, hot)
+ void state_delimiter_fallback(ui8 ch, char_class cls) { // Handles closing tokens in the delimiter state; anything else is an error
+ auto cls_bits = static_cast<ui8>(cls);
+ if ((cls_bits & 3) == static_cast<ui8>(char_class::special_token_mask)) { // low two bits mark a special token
+ auto token = static_cast<special_token>(cls_bits >> 2); // remaining bits hold the token value
+ lexer_.advance(1); // the byte is consumed before the token is inspected
+ switch (token) {
+ /* // handled in the fast track
+ case special_token::semicolon:
+ next(in_map()? state::maybe_key : state::maybe_value);
+ // immediately read next value
+ return next_event();
+ */
+
+ case special_token::right_bracket:
+ pop(EEventType::BeginList, EEventType::EndList);
+ return;
+
+ case special_token::right_brace:
+ pop(EEventType::BeginMap, EEventType::EndMap);
+ return;
+
+ case special_token::right_angle:
+ pop(EEventType::BeginAttributes, EEventType::EndAttributes);
+ return;
+
+ default:
+ break; // any other special token falls through to the error below
+ }
+ }
+
+ COLD_BLOCK_BYVALUE
+ lexer_.fail(
+ "Unexpected ", NCEscape::quote(ch), ", expected one of ",
+ NCEscape::quote(NSymbol::item_separator), ", ",
+ NCEscape::quote(NSymbol::end_list), ", ",
+ NCEscape::quote(NSymbol::end_map), ", ",
+ NCEscape::quote(NSymbol::end_attributes));
+ COLD_BLOCK_END
+ }
+
+ ATTRIBUTE(noinline, hot)
+ void state_maybe_key(ui8 ch, char_class cls) { // Reads a map key, or closes the enclosing map / attribute map
+ auto key = TStringBuf{};
+ // Keys are always strings; the binary-string marker is tested first as the expected common case
+ if (Y_LIKELY(ch == NSymbol::string_marker)) {
+ lexer_.advance(1); // skip the marker byte
+ key = lexer_.read_binary_string();
+ } else {
+ switch (cls) {
+ case char_class::quote:
+ lexer_.advance(1); // skip the opening quote
+ key = lexer_.read_quoted_string();
+ break;
+
+ case char_class::string:
+ key = lexer_.read_unquoted_string(); // the first byte is left for the lexer to read
+ break;
+
+ case char_class::right_brace:
+ lexer_.advance(1);
+ pop(EEventType::BeginMap, EEventType::EndMap);
+ return;
+
+ case char_class::right_angle:
+ lexer_.advance(1);
+ pop(EEventType::BeginAttributes, EEventType::EndAttributes);
+ return;
+
+ default:
+ COLD_BLOCK_BYVALUE
+ lexer_.fail("Unexpected ", NCEscape::quote(ch), ", expected key string");
+ COLD_BLOCK_END
+ }
+ }
+
+ yield(EEventType::Key, key);
+ next(state::equals); // a key must be followed by '='
+ }
+
+ ATTRIBUTE(hot)
+ void state_equals(ui8 ch) { // Consumes the key/value separator and continues straight into the value
+ // fail unless the byte is '='
+ if (Y_UNLIKELY(ch != NSymbol::key_value_separator)) {
+ COLD_BLOCK_BYVALUE
+ lexer_.fail("Unexpected ", NCEscape::quote(ch), ", expected ", NCEscape::quote(NSymbol::key_value_separator));
+ COLD_BLOCK_END
+ }
+ lexer_.advance(1); // skip '='
+ next(state::value);
+ // immediately read the following value
+ // (this symbol yields no result)
+ next_event_hot();
+ }
+
+ //! Dispatches on the two low bits of the character class:
+ //!   x0b - binary string (00b) or special token (10b);
+ //!   x1b - binary scalar (01b) or text scalar (11b).
+ ATTRIBUTE(noinline, hot)
+ void state_value(ui8 ch, char_class cls) {
+     const auto bits = static_cast<ui8>(cls);
+     const bool second_bit = (bits & (1 << 1)) != 0;
+
+     if ((bits & 1) == 0) { // BinaryStringOrOtherSpecialToken = x0b
+         lexer_.advance(1);
+         if (second_bit) {
+             // special token
+             const auto token = static_cast<special_token>(bits >> 2);
+             state_value_special(token, ch);
+         } else {
+             // binary string
+             yield(lexer_.read_binary_string());
+             next(state::delimiter);
+         }
+         return;
+     }
+
+     // Other = x1b
+     if (second_bit) { // Other = xxx11b
+         state_value_text_scalar(cls);
+     } else { // BinaryScalar = x01b
+         state_value_binary_scalar(cls);
+     }
+     next(state::delimiter);
+ }
+
+ ATTRIBUTE(noinline)
+ void state_value_special(special_token token, ui8 ch) { // Handles a special token met where a value is expected; the caller has already consumed the byte
+ // Value starters are always accepted values
+ switch (token) {
+ case special_token::hash:
+ yield(TScalar{}); // '#' yields a default-constructed scalar
+ next(state::delimiter);
+ return;
+
+ case special_token::left_bracket:
+ push(EEventType::BeginList);
+ yield(EEventType::BeginList);
+ next(state::maybe_value);
+ return;
+
+ case special_token::left_brace:
+ push(EEventType::BeginMap);
+ yield(EEventType::BeginMap);
+ next(state::maybe_key);
+ return;
+
+ default:
+ break; // remaining tokens depend on the current state
+ }
+
+ // ...closing-chars are only allowed in maybe_value state
+ if (state_ == state::maybe_value) {
+ switch (token) {
+ case special_token::right_bracket:
+ pop(EEventType::BeginList, EEventType::EndList);
+ return;
+
+ case special_token::right_brace:
+ pop(EEventType::BeginMap, EEventType::EndMap);
+ return;
+
+ // right_angle is impossible in maybe_value state
+ // (only in delimiter, maybe_key)
+
+ default:
+ break;
+ }
+ }
+
+ // attributes are not allowed after attributes (thus, value_noattr state)
+ if (state_ != state::value_noattr && token == special_token::left_angle) {
+ push(EEventType::BeginAttributes);
+ yield(EEventType::BeginAttributes);
+ next(state::maybe_key);
+ return;
+ }
+
+ COLD_BLOCK_BYVALUE
+ lexer_.fail("Unexpected ", NCEscape::quote(ch)); // ch is used only for this message
+ COLD_BLOCK_END
+ }
+
+ //! Consumes the marker byte of a binary scalar and presents its value.
+ //! Booleans are fully described by the marker; numbers are decoded by
+ //! the lexer from the bytes that follow.
+ ATTRIBUTE(hot)
+ void state_value_binary_scalar(char_class cls) {
+     lexer_.advance(1);
+     switch (cls) {
+         case char_class::binary_int64:
+             yield(lexer_.read_binary_int64());
+             break;
+
+         case char_class::binary_uint64:
+             yield(lexer_.read_binary_uint64());
+             break;
+
+         case char_class::binary_double:
+             yield(lexer_.read_binary_double());
+             break;
+
+         case char_class::binary_true:
+             yield(true);
+             break;
+
+         case char_class::binary_false:
+             yield(false);
+             break;
+
+         default:
+             Y_UNREACHABLE();
+     }
+ }
+
+ ATTRIBUTE(noinline)
+ void state_value_text_scalar(char_class cls) { // Reads a text-form scalar; the caller moves on to the delimiter state afterwards
+ switch (cls) {
+ case char_class::quote:
+ lexer_.advance(1); // skip the opening quote
+ yield(lexer_.read_quoted_string());
+ break;
+
+ case char_class::number:
+ yield(lexer_.read_numeric()); // the first byte is left for the lexer to read
+ break;
+
+ case char_class::string:
+ yield(lexer_.read_unquoted_string()); // the first byte is left for the lexer to read
+ break;
+
+ case char_class::percent:
+ lexer_.advance(1); // skip '%'
+ yield(lexer_.read_percent_scalar());
+ break;
+
+ case char_class::none: // a byte that cannot start any value
+ COLD_BLOCK_BYVALUE
+ lexer_.fail("Invalid yson value.");
+ COLD_BLOCK_END
+ break;
+
+ default:
+ Y_UNREACHABLE();
+ }
+ }
+ };
+
+ class reader_impl: public gen_reader_impl<false> { // Reader instantiated with EnableLinePositionInfo = false
+ public:
+ using gen_reader_impl<false>::gen_reader_impl; // inherit the base constructors unchanged
+ };
+ }
+}
diff --git a/library/cpp/yson_pull/detail/stream_counter.h b/library/cpp/yson_pull/detail/stream_counter.h
new file mode 100644
index 0000000000..3b41b27eb6
--- /dev/null
+++ b/library/cpp/yson_pull/detail/stream_counter.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <library/cpp/yson_pull/position_info.h>
+
+#include <cstddef>
+
+namespace NYsonPull {
+ namespace NDetail {
+ template <bool EnableLinePositionInfo>
+ class stream_counter;
+
+ //! Position tracker that maintains byte offset, line and column.
+ template <>
+ class stream_counter<true> {
+ private:
+     size_t offset_ = 0;
+     size_t line_ = 1;
+     size_t column_ = 1;
+
+ public:
+     //! Returns the position accumulated so far.
+     TPositionInfo info() const {
+         return {offset_, line_, column_};
+     }
+
+     //! Accounts for the bytes in [begin, end): a '\n' starts a new line
+     //! at column 1, any other byte moves one column forward.
+     void update(const ui8* begin, const ui8* end) {
+         offset_ += end - begin;
+         for (; begin != end; ++begin) { //TODO: memchr
+             if (*begin == '\n') {
+                 ++line_;
+                 column_ = 1;
+             } else {
+                 ++column_;
+             }
+         }
+     }
+ };
+
+ template <> // specialization that tracks the byte offset only
+ class stream_counter<false> {
+ private:
+ size_t offset_ = 0; // number of bytes passed to update() so far
+
+ public:
+ TPositionInfo info() const {
+ return {offset_, {}, {}}; // the two remaining fields are value-initialized from empty braces
+ }
+
+ void update(const ui8* begin, const ui8* end) { // accounts for the bytes in [begin, end) without inspecting them
+ offset_ += end - begin;
+ }
+ };
+ }
+}
diff --git a/library/cpp/yson_pull/detail/symbols.h b/library/cpp/yson_pull/detail/symbols.h
new file mode 100644
index 0000000000..fe94bb9c41
--- /dev/null
+++ b/library/cpp/yson_pull/detail/symbols.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+#include <util/system/types.h>
+
+namespace NYsonPull {
+ namespace NDetail {
+ //! Byte values with a fixed meaning in YSON text and binary encodings.
+ namespace NSymbol {
+     //! Indicates the beginning of a list.
+     constexpr ui8 begin_list = '[';
+     //! Indicates the end of a list.
+     constexpr ui8 end_list = ']';
+
+     //! Indicates the beginning of a map.
+     constexpr ui8 begin_map = '{';
+     //! Indicates the end of a map.
+     constexpr ui8 end_map = '}';
+
+     //! Indicates the beginning of an attribute map.
+     constexpr ui8 begin_attributes = '<';
+     //! Indicates the end of an attribute map.
+     constexpr ui8 end_attributes = '>';
+
+     //! Separates items in lists and pairs in maps or attribute maps.
+     constexpr ui8 item_separator = ';';
+     //! Separates keys from values in maps and attribute maps.
+     constexpr ui8 key_value_separator = '=';
+
+     //! Indicates an entity.
+     constexpr ui8 entity = '#';
+     //! Indicates end of stream.
+     constexpr ui8 eof = '\0';
+
+     //! Marks the beginning of a binary string literal.
+     constexpr ui8 string_marker = '\x01';
+     //! Marks the beginning of a binary int64 literal.
+     constexpr ui8 int64_marker = '\x02';
+     //! Marks the beginning of a binary uint64 literal.
+     constexpr ui8 uint64_marker = '\x06';
+     //! Marks the beginning of a binary double literal.
+     constexpr ui8 double_marker = '\x03';
+     //! Marks a binary `false' boolean value.
+     constexpr ui8 false_marker = '\x04';
+     //! Marks a binary `true' boolean value.
+     constexpr ui8 true_marker = '\x05';
+
+     //! Text string quote symbol
+     constexpr ui8 quote = '"';
+ }
+ }
+}
diff --git a/library/cpp/yson_pull/detail/traits.h b/library/cpp/yson_pull/detail/traits.h
new file mode 100644
index 0000000000..869a3b9c44
--- /dev/null
+++ b/library/cpp/yson_pull/detail/traits.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <type_traits>
+
+namespace NYsonPull {
+ namespace NDetail {
+ //! SFINAE helpers that select overloads by the signedness of T.
+ namespace NTraits {
+     //! Names U when T is signed; otherwise substitution fails.
+     template <typename T, typename U>
+     using if_signed = std::enable_if_t<std::is_signed_v<T>, U>;
+
+     //! Names U when T is unsigned; otherwise substitution fails.
+     template <typename T, typename U>
+     using if_unsigned = std::enable_if_t<std::is_unsigned_v<T>, U>;
+
+     //! Unsigned counterpart of a signed T; fails for other types.
+     template <typename T>
+     using to_unsigned = std::enable_if_t<std::is_signed_v<T>, std::make_unsigned_t<T>>;
+
+     //! Signed counterpart of an unsigned T; fails for other types.
+     template <typename T>
+     using to_signed = std::enable_if_t<std::is_unsigned_v<T>, std::make_signed_t<T>>;
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/varint.h b/library/cpp/yson_pull/detail/varint.h
new file mode 100644
index 0000000000..38bf45d925
--- /dev/null
+++ b/library/cpp/yson_pull/detail/varint.h
@@ -0,0 +1,260 @@
+#pragma once
+
+#include "byte_reader.h"
+#include "byte_writer.h"
+#include "traits.h"
+#include "zigzag.h"
+
+#include <util/system/types.h>
+
+ #include <cstddef>
+ #include <limits>
+ #include <type_traits>
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NVarInt {
+ namespace NImpl {
+ //! Largest number of bytes a varint of type T can occupy:
+ //! the bit width of T divided by 7 payload bits per byte, rounded up.
+ template <typename T>
+ constexpr inline size_t max_size() {
+     return (8 * sizeof(T) + 6) / 7;
+ }
+
+ //! Encodes `value` as a base-128 varint, least significant group first,
+ //! passing every produced byte to `consume`. All bytes except the last
+ //! carry the continuation bit 0x80. Returns the number of bytes produced
+ //! (always at least one).
+ template <typename T>
+ inline size_t write(ui64 value, T&& consume) {
+     size_t nwritten = 0;
+     for (;;) {
+         ++nwritten;
+         const auto group = static_cast<ui8>(value & 0x7F);
+         value >>= 7;
+         if (value == 0) {
+             // last group: no continuation bit
+             consume(group);
+             return nwritten;
+         }
+         consume(static_cast<ui8>(group | 0x80));
+     }
+ }
+
+ template <typename U>
+ inline bool read_fast(byte_reader<U>& reader, ui64* value) { // Unrolled 64-bit varint decode straight from the buffer; performs no bounds checks itself
+ auto& buf = reader.stream().buffer();
+ auto* ptr = buf.pos(); // local cursor; the reader is advanced only on success
+ ui32 b;
+
+ // Splitting into 32-bit pieces gives better performance on 32-bit
+ // processors.
+ ui32 part0 = 0, part1 = 0, part2 = 0; // payload bits 0-27, 28-55 and 56 upwards
+
+ b = *(ptr++);
+ part0 = (b & 0x7F);
+ if (!(b & 0x80)) // a clear high bit marks the last byte
+ goto done;
+ b = *(ptr++);
+ part0 |= (b & 0x7F) << 7;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ part0 |= (b & 0x7F) << 14;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ part0 |= (b & 0x7F) << 21;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ part1 = (b & 0x7F);
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ part1 |= (b & 0x7F) << 7;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ part1 |= (b & 0x7F) << 14;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ part1 |= (b & 0x7F) << 21;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ part2 = (b & 0x7F);
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ part2 |= (b & 0x7F) << 7;
+ if (!(b & 0x80))
+ goto done;
+
+ // We have overrun the maximum size of a Varint (10 bytes). The data
+ // must be corrupt.
+ return false; // the reader position and *value are left untouched
+
+ done:
+ reader.advance(ptr - buf.pos()); // consume exactly the bytes that were decoded
+ *value = (static_cast<ui64>(part0)) | (static_cast<ui64>(part1) << 28) | (static_cast<ui64>(part2) << 56);
+ return true;
+ }
+
+ template <typename U>
+ inline bool read_fast(byte_reader<U>& reader, ui32* value) { // Unrolled 32-bit varint decode; longer encodings are consumed and their extra bits dropped
+ // Fast path: We have enough bytes left in the buffer to guarantee that
+ // this read won't cross the end, so we can skip the checks.
+ auto& buf = reader.stream().buffer();
+ auto* ptr = buf.pos(); // local cursor; the reader is advanced only on success
+ ui32 b;
+ ui32 result;
+
+ b = *(ptr++);
+ result = (b & 0x7F);
+ if (!(b & 0x80)) // a clear high bit marks the last byte
+ goto done;
+ b = *(ptr++);
+ result |= (b & 0x7F) << 7;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ result |= (b & 0x7F) << 14;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ result |= (b & 0x7F) << 21;
+ if (!(b & 0x80))
+ goto done;
+ b = *(ptr++);
+ result |= b << 28; // the byte is not masked here; bits shifted past the width of ui32 are lost
+ if (!(b & 0x80))
+ goto done;
+
+ // FIXME
+ // If the input is larger than 32 bits, we still need to read it all
+ // and discard the high-order bits.
+
+ for (size_t i = 0; i < max_size<ui64>() - max_size<ui32>(); i++) { // skip the remaining bytes of a 64-bit encoding
+ b = *(ptr++);
+ if (!(b & 0x80))
+ goto done;
+ }
+
+ // We have overrun the maximum size of a Varint (10 bytes). Assume
+ // the data is corrupt.
+ return false; // the reader position and *value are left untouched
+
+ done:
+ reader.advance(ptr - buf.pos()); // consume exactly the bytes that were read
+ *value = result;
+ return true;
+ }
+
+ //! Byte-at-a-time 64-bit varint decode that refills the buffer before
+ //! every byte.
+ //!
+ //! Returns false when the input ends inside the varint or when the
+ //! encoding is longer than max_size<ui64>() bytes; bytes read up to that
+ //! point stay consumed and `*value` is not written. On success stores
+ //! the decoded number in `*value` and returns true.
+ template <typename U>
+ inline bool read_slow(byte_reader<U>& reader, ui64* value) {
+     // Slow path: This read might cross the end of the buffer, so we
+     // need to check and refresh the buffer if and when it does.
+
+     auto& buf = reader.stream().buffer();
+     ui64 result = 0;
+     // size_t keeps the comparison with max_size<ui64>() free of
+     // signed/unsigned mixing
+     size_t count = 0;
+     ui32 b;
+
+     do {
+         if (count == max_size<ui64>()) {
+             return false;
+         }
+         reader.fill_buffer();
+         if (reader.stream().at_end()) {
+             return false;
+         }
+         b = *buf.pos();
+         result |= static_cast<ui64>(b & 0x7F) << (7 * count);
+         reader.advance(1);
+         ++count;
+     } while (b & 0x80);
+
+     *value = result;
+     return true;
+ }
+
+ //! 32-bit slow path: decodes a 64-bit varint and accepts it only when
+ //! the result fits into ui32. `*value` is written on success only.
+ template <typename U>
+ inline bool read_slow(byte_reader<U>& reader, ui32* value) {
+     ui64 wide;
+     // fallback to 64-bit reading
+     if (!read_slow(reader, &wide)) {
+         return false;
+     }
+     if (wide > std::numeric_limits<ui32>::max()) {
+         return false;
+     }
+     *value = static_cast<ui32>(wide);
+     return true;
+ }
+
+ // The following function is an adaptation
+ // of Protobuf code from coded_stream.cc
+ //! Chooses between the unchecked fast decoder and the buffer-refilling
+ //! slow decoder, depending on what the current buffer holds.
+ template <typename T, typename U>
+ inline bool read_dispatch(byte_reader<U>& reader, T* value) {
+     auto& buf = reader.stream().buffer();
+     // NOTE: checking for 64-bit max_size(), since 32-bit
+     // read_fast() might fallback to 64-bit reading
+     const bool fast_is_safe =
+         buf.available() >= max_size<ui64>() ||
+         // Optimization: If the Varint ends at exactly the end of the buffer,
+         // we can detect that and still use the fast path.
+         (!buf.is_empty() && !(buf.end()[-1] & 0x80));
+     if (!fast_is_safe) {
+         // Really slow case: we will incur the cost of an extra function call here,
+         // but moving this out of line reduces the size of this function, which
+         // improves the common case. In micro benchmarks, this is worth about 10-15%
+         return read_slow(reader, value);
+     }
+     return read_fast(reader, value);
+ }
+
+ }
+
+ // Various functions to read/write varints.
+
+ // Encodes an unsigned value as a varint into `data`.
+ // Returns the number of bytes written.
+ template <typename T>
+ inline NTraits::if_unsigned<T, size_t> write(ui8* data, T value) {
+     const auto widened = static_cast<ui64>(value);
+     return NImpl::write(
+         widened,
+         [out = data](ui8 byte) mutable { *out++ = byte; });
+ }
+
+ // Encodes a signed value into `data`: the value is zigzag-encoded
+ // first and the result is written as a varint.
+ // Returns the number of bytes written.
+ template <typename T>
+ inline NTraits::if_signed<T, size_t> write(ui8* data, T value) {
+     const auto zigzagged = static_cast<ui64>(NZigZag::encode(value));
+     return NImpl::write(
+         zigzagged,
+         [out = data](ui8 byte) mutable { *out++ = byte; });
+ }
+
+ template <typename T, typename U>
+ inline void write(byte_writer<U>& stream, T value) { // Encodes value into a stack buffer, then writes the bytes to stream
+ ui8 data[NImpl::max_size<T>()]; // sized for the longest varint a T can produce
+ auto size = write(data, value); // resolves to the signed or unsigned pointer overload according to T
+ stream.write(data, size);
+ }
+
+ //! Reads an unsigned varint from `reader`.
+ //! A value that fits in one byte already present in the buffer is
+ //! returned directly; everything else goes through read_dispatch, and
+ //! a decoding failure is reported through reader.fail().
+ template <typename T, typename U>
+ inline NTraits::if_unsigned<T, T> read(byte_reader<U>& reader) {
+     auto& buf = reader.stream().buffer();
+     if (!buf.is_empty()) {
+         const auto first = *buf.pos();
+         if (first < 0x80) {
+             // single-byte varint: no continuation bit
+             reader.advance(1);
+             return static_cast<T>(first);
+         }
+     }
+
+     auto decoded = T{};
+     if (Y_UNLIKELY(!NImpl::read_dispatch(reader, &decoded))) {
+         reader.fail("Error parsing varint value");
+     }
+     return decoded;
+ }
+
+ template <typename T, typename U>
+ inline NTraits::if_signed<T, T> read(byte_reader<U>& reader) { // Reads a signed value: unsigned varint first, zigzag decoding second
+ return NZigZag::decode(
+ read<NTraits::to_unsigned<T>>(reader)); // delegates to the unsigned overload with the matching unsigned type
+ }
+ }
+ } // namespace NDetail
+}
diff --git a/library/cpp/yson_pull/detail/writer.h b/library/cpp/yson_pull/detail/writer.h
new file mode 100644
index 0000000000..b24b994292
--- /dev/null
+++ b/library/cpp/yson_pull/detail/writer.h
@@ -0,0 +1,566 @@
+#pragma once
+
+#include "byte_writer.h"
+#include "cescape.h"
+#include "percent_scalar.h"
+#include "stream_counter.h"
+#include "symbols.h"
+#include "varint.h"
+
+#include <library/cpp/yson_pull/consumer.h>
+#include <library/cpp/yson_pull/event.h>
+#include <library/cpp/yson_pull/output.h>
+#include <library/cpp/yson_pull/stream_type.h>
+#include <library/cpp/yson_pull/writer.h>
+
+#include <util/generic/vector.h>
+#include <util/system/yassert.h>
+
+#include <cmath>
+
+namespace NYsonPull {
+ namespace NDetail {
+ class writer: public IConsumer { // Base YSON writer: validates the event sequence and emits structural symbols; scalar and key encoding is left to subclasses.
+ enum class state { // Position of the writer within the event grammar; driven by update_state().
+ maybe_key, // inside a map or attributes: a key or a closing event (EndMap / EndAttributes / EndStream) may follow
+ maybe_value, // inside a list: a value or a closing event (EndList / EndStream) may follow
+ value, // a value is required; it may be preceded by attributes
+ value_noattr, // a value is required and attributes are no longer accepted
+ before_begin, // nothing seen yet; only BeginStream is accepted
+ before_end, // innermost open event is BeginStream; only EndStream is accepted
+ after_end, // stream finished; every further event fails
+ };
+ // Output sink; fail() attaches stream_.counter().info() to the exception it throws.
+ byte_writer<stream_counter<false>> stream_;
+ TVector<EEventType> stack_; // currently open Begin* events, innermost last
+ bool need_item_separator_ = false; // when true, begin_node() emits an item separator first
+ EStreamType mode_ = EStreamType::ListFragment;
+ state state_ = state::before_begin;
+ // IConsumer events that are encoded identically by every subclass.
+ public:
+ void OnBeginStream() override { // Accepted only as the very first event; see begin_stream().
+ update_state(EEventType::BeginStream);
+ }
+ // Validates that the stream may end here, then calls stream_.flush_buffer().
+ void OnEndStream() override {
+ update_state(EEventType::EndStream);
+ stream_.flush_buffer();
+ }
+ // Opens a list. begin_node() and the opening symbol run before update_state() pushes the new level.
+ void OnBeginList() override {
+ begin_node();
+ write(NSymbol::begin_list);
+ update_state(EEventType::BeginList);
+ begin_collection(collection_type::list);
+ }
+ // Closes a list. update_state() pops the level before end_collection() and the closing symbol run.
+ void OnEndList() override {
+ update_state(EEventType::EndList);
+ end_collection(collection_type::list);
+ write(NSymbol::end_list);
+ end_node();
+ }
+ // Opens a map; same call order as OnBeginList().
+ void OnBeginMap() override {
+ begin_node();
+ write(NSymbol::begin_map);
+ update_state(EEventType::BeginMap);
+ begin_collection(collection_type::map);
+ }
+ // Closes a map; same call order as OnEndList().
+ void OnEndMap() override {
+ update_state(EEventType::EndMap);
+ end_collection(collection_type::map);
+ write(NSymbol::end_map);
+ end_node();
+ }
+ // Opens an attribute collection; same call order as OnBeginList().
+ void OnBeginAttributes() override {
+ begin_node();
+ write(NSymbol::begin_attributes);
+ update_state(EEventType::BeginAttributes);
+ begin_collection(collection_type::attributes);
+ }
+ // Closes an attribute collection; the state becomes value_noattr, so a value must follow.
+ void OnEndAttributes() override {
+ update_state(EEventType::EndAttributes);
+ end_collection(collection_type::attributes);
+ write(NSymbol::end_attributes);
+ // no end_node: the value that follows the attributes calls end_node() itself
+ }
+ // Writes the entity symbol; validated as a Scalar event.
+ void OnEntity() override {
+ begin_node();
+ update_state(EEventType::Scalar);
+ write(NSymbol::entity);
+ end_node();
+ }
+ // Helpers and formatting hooks available to the concrete writers.
+ protected:
+ enum class collection_type { // Kind of collection passed to begin_collection() / end_collection().
+ list,
+ map,
+ attributes,
+ };
+ // Binds the writer to an output stream and a stream type; protected, so only subclasses construct it.
+ writer(NYsonPull::NOutput::IStream& stream, EStreamType mode)
+ : stream_(stream)
+ , mode_{mode} {
+ }
+ // Accessors for the pending-separator flag (private member, so subclasses go through these).
+ bool need_item_separator() const {
+ return need_item_separator_;
+ }
+ void need_item_separator(bool value) {
+ need_item_separator_ = value;
+ }
+ // Number of open collections, not counting BeginStream nor the implicit top-level list/map of fragment modes.
+ size_t depth() const {
+ Y_ASSERT(!stack_.empty());
+ if (mode_ == EStreamType::Node) {
+ return stack_.size() - 1; // only the BeginStream entry is excluded
+ } else {
+ return stack_.size() - 2; // BeginStream plus the implicit collection pushed by begin_stream()
+ }
+ }
+ EStreamType mode() const {
+ return mode_;
+ }
+ // Writes a single byte.
+ void write(ui8 c) {
+ stream_.write(c);
+ }
+ // Writes the bytes of `value` verbatim (no quoting or escaping).
+ void write(TStringBuf value) {
+ write_raw(value.data(), value.size());
+ }
+ // Writes `len` bytes starting at `ptr`.
+ void write_raw(const void* ptr, size_t len) {
+ stream_.write(static_cast<const ui8*>(ptr), len);
+ }
+ // Writes `value` through NVarInt::write (signed types are zigzag-encoded there).
+ template <typename T>
+ void write_varint(T value) {
+ NVarInt::write(stream_, value);
+ }
+ // Writes `value` between two quote symbols, with the content passed through NCEscape::encode.
+ void write_escaped_string(TStringBuf value) {
+ write(NSymbol::quote);
+ NCEscape::encode(stream_, value);
+ write(NSymbol::quote);
+ }
+ // Records a newly opened Begin* event.
+ void push(EEventType type) {
+ stack_.push_back(type);
+ }
+ // Removes the innermost open event; fails if the stack is empty or its top differs from `type`.
+ void pop(EEventType type) {
+ if (stack_.empty()) {
+ fail("Unpaired events: empty event stack");
+ }
+ if (stack_.back() != type) {
+ fail("Unpaired events: expected ", type, ", got ", stack_.back());
+ }
+ stack_.pop_back();
+ }
+ // Validates `event` against state_ and performs the transition, including stack push/pop; illegal events end in fail().
+ void update_state(EEventType event) {
+ switch (state_) {
+ case state::before_begin:
+ if (event != EEventType::BeginStream) {
+ fail("Expected begin_stream, got ", event);
+ }
+ begin_stream();
+ return;
+
+ case state::before_end:
+ if (event != EEventType::EndStream) {
+ fail("Expected end_stream, got ", event);
+ }
+ end_stream();
+ return;
+
+ case state::after_end:
+ fail("Attempted write past stream end"); // fail() is [[noreturn]], so control never reaches the next case
+
+ case state::maybe_key:
+ if (event == EEventType::Key) {
+ state_ = state::value; // a key must be followed by its value
+ return;
+ }
+
+ switch (event) {
+ case EEventType::EndStream:
+ end_stream();
+ return;
+
+ case EEventType::EndMap:
+ pop(EEventType::BeginMap);
+ next_state();
+ return;
+
+ case EEventType::EndAttributes:
+ pop(EEventType::BeginAttributes);
+ state_ = state::value_noattr; // the annotated value must follow; attributes are not accepted again
+ return;
+
+ default:
+ fail("Unexpected event ", event, " in maybe_key");
+ }
+ break;
+
+ case state::maybe_value:
+ switch (event) {
+ case EEventType::EndList:
+ pop(EEventType::BeginList);
+ next_state();
+ return;
+
+ case EEventType::EndStream:
+ end_stream();
+ return;
+
+ default:
+ break; // not a closing event: handle it as a value below
+ }
+ [[fallthrough]];
+ case state::value:
+ if (event == EEventType::BeginAttributes) {
+ push(EEventType::BeginAttributes);
+ next_state();
+ return;
+ }
+ [[fallthrough]];
+ case state::value_noattr:
+ switch (event) {
+ case EEventType::Scalar:
+ next_state();
+ return;
+
+ case EEventType::BeginList:
+ push(EEventType::BeginList);
+ next_state();
+ return;
+
+ case EEventType::BeginMap:
+ push(EEventType::BeginMap);
+ next_state();
+ return;
+
+ default:
+ fail("Unexpected event ", event, " (in value_*)");
+ }
+ break;
+ }
+ }
+ // Selects the next state from the innermost open event on the stack.
+ void next_state() {
+ Y_ASSERT(!stack_.empty());
+ switch (stack_.back()) {
+ case EEventType::BeginMap:
+ case EEventType::BeginAttributes:
+ state_ = state::maybe_key;
+ break;
+
+ case EEventType::BeginList:
+ state_ = state::maybe_value;
+ break;
+
+ case EEventType::BeginStream:
+ state_ = state::before_end;
+ break;
+
+ default:
+ Y_UNREACHABLE(); // push() is only ever called with the four Begin* events handled above
+ }
+ }
+ // Pushes BeginStream plus, for fragment modes, an implicit top-level list/map, and selects the initial state.
+ void begin_stream() {
+ push(EEventType::BeginStream);
+ switch (mode_) {
+ case EStreamType::ListFragment:
+ push(EEventType::BeginList);
+ state_ = state::maybe_value;
+ break;
+
+ case EStreamType::MapFragment:
+ push(EEventType::BeginMap);
+ state_ = state::maybe_key;
+ break;
+
+ case EStreamType::Node:
+ state_ = state::value;
+ break;
+ }
+ }
+ // Pops the implicit fragment collection (if any) and BeginStream, then marks the stream finished.
+ void end_stream() {
+ switch (mode_) {
+ case EStreamType::ListFragment:
+ pop(EEventType::BeginList);
+ break;
+
+ case EStreamType::MapFragment:
+ pop(EEventType::BeginMap);
+ break;
+
+ case EStreamType::Node:
+ break;
+ }
+ pop(EEventType::BeginStream);
+ state_ = state::after_end;
+ }
+ // Hook run before a node is written: emits the pending item separator, if any.
+ virtual void begin_node() {
+ if (need_item_separator_) {
+ write(NSymbol::item_separator);
+ }
+ }
+ // Hook run after a node is written: whatever follows must be preceded by a separator.
+ virtual void end_node() {
+ need_item_separator_ = true;
+ }
+ // Hook run before a key is written; by default keys are separated from the previous item like nodes.
+ virtual void begin_key() {
+ begin_node();
+ }
+ // Hook run after a key is written: clears the separator flag for the value and emits the key/value separator.
+ virtual void end_key() {
+ need_item_separator_ = false;
+ write(NSymbol::key_value_separator);
+ }
+ // Hook run right after a collection is opened: its first item needs no separator.
+ virtual void begin_collection(collection_type type) {
+ Y_UNUSED(type);
+ need_item_separator_ = false;
+ }
+ // Hook run just before a collection's closing symbol is written.
+ virtual void end_collection(collection_type type) {
+ need_item_separator_ = (type != collection_type::attributes); // no separator between attributes and the value they annotate
+ }
+ // Formats `msg` and `args` via format_string and throws NException::TBadOutput together with stream_.counter().info(); never returns.
+ template <typename... Args>
+ ATTRIBUTE(noinline, cold)
+ void fail[[noreturn]](const char* msg, Args&&... args) {
+ auto formatted_message = format_string(
+ msg,
+ std::forward<Args>(args)...);
+ throw NException::TBadOutput(
+ formatted_message,
+ stream_.counter().info());
+ }
+ };
+
+ // Writer that emits the binary YSON representation: every scalar is a
+ // one-byte type marker followed by its payload (varint for integers, raw
+ // bytes for doubles, varint length + raw bytes for strings and keys).
+ class TBinaryWriterImpl final: public writer {
+ public:
+     TBinaryWriterImpl(NYsonPull::NOutput::IStream& stream, EStreamType mode)
+         : writer(stream, mode)
+     {
+     }
+
+     // Writes the boolean as a single marker byte.
+     void OnScalarBoolean(bool value) override {
+         update_state(EEventType::Scalar);
+
+         begin_node();
+         write(value ? NSymbol::true_marker : NSymbol::false_marker);
+         end_node();
+     }
+
+     // Writes the int64 marker followed by the value as a signed varint.
+     void OnScalarInt64(i64 value) override {
+         update_state(EEventType::Scalar);
+
+         begin_node();
+         write(NSymbol::int64_marker);
+         write_varint(value);
+         end_node();
+     }
+
+     // Writes the uint64 marker followed by the value as an unsigned varint.
+     void OnScalarUInt64(ui64 value) override {
+         update_state(EEventType::Scalar);
+
+         begin_node();
+         write(NSymbol::uint64_marker);
+         write_varint(value);
+         end_node();
+     }
+
+     // Writes the double marker followed by the in-memory bytes of the value.
+     void OnScalarFloat64(double value) override {
+         update_state(EEventType::Scalar);
+
+         begin_node();
+         write(NSymbol::double_marker);
+         write_raw(&value, sizeof value);
+         end_node();
+     }
+
+     // Writes a length-prefixed string scalar.
+     // Throws (via fail()) if the string is too long for the i32 length prefix.
+     void OnScalarString(TStringBuf value) override {
+         ensure_length_fits(value);
+         update_state(EEventType::Scalar);
+
+         begin_node();
+         write_string_literal(value);
+         end_node();
+     }
+
+     // Writes a map/attribute key using the same encoding as a string scalar.
+     // Throws (via fail()) if the key is too long for the i32 length prefix.
+     void OnKey(TStringBuf name) override {
+         ensure_length_fits(name);
+         update_state(EEventType::Key);
+
+         begin_key();
+         write_string_literal(name);
+         end_key();
+     }
+
+ private:
+     // The length prefix is written as an i32. A longer string would be cast to
+     // a truncated or negative length and corrupt the output, so it is rejected
+     // up front, before any state change or output happens.
+     void ensure_length_fits(TStringBuf value) {
+         constexpr size_t max_length = 0x7fffffff; // largest value an i32 can hold
+         if (Y_UNLIKELY(value.size() > max_length)) {
+             fail("String of ", value.size(), " bytes is too long for binary YSON");
+         }
+     }
+
+     // Emits the string marker, the varint-encoded i32 length and the raw bytes.
+     // The caller must have checked the length with ensure_length_fits().
+     void write_string_literal(TStringBuf value) {
+         write(NSymbol::string_marker);
+         write_varint(static_cast<i32>(value.size()));
+         write_raw(value.data(), value.size());
+     }
+ };
+
+ // Writer that emits the compact text YSON representation. Numbers are
+ // formatted with snprintf, strings and keys are quoted and escaped, and
+ // special values (booleans, NaN, infinities) use percent-literals.
+ class TTextWriterImpl: public writer {
+ public:
+     TTextWriterImpl(NYsonPull::NOutput::IStream& stream, EStreamType mode)
+         : writer(stream, mode)
+     {
+     }
+
+     // Writes the true/false percent-literal.
+     void OnScalarBoolean(bool value) override {
+         update_state(EEventType::Scalar);
+
+         begin_node();
+         write(value ? percent_scalar::true_literal : percent_scalar::false_literal);
+         end_node();
+     }
+
+     // Writes the value as a decimal number.
+     void OnScalarInt64(i64 value) override {
+         update_state(EEventType::Scalar);
+
+         char digits[32];
+         const auto length = ::snprintf(digits, sizeof(digits), "%" PRIi64, value);
+
+         begin_node();
+         write_raw(digits, length);
+         end_node();
+     }
+
+     // Writes the value as a decimal number followed by the 'u' suffix.
+     void OnScalarUInt64(ui64 value) override {
+         update_state(EEventType::Scalar);
+
+         char digits[32];
+         const auto length = ::snprintf(digits, sizeof(digits), "%" PRIu64, value);
+
+         begin_node();
+         write_raw(digits, length);
+         write('u');
+         end_node();
+     }
+
+     // Writes finite values with 17 significant digits; NaN and the two
+     // infinities are written as percent-literals.
+     void OnScalarFloat64(double value) override {
+         update_state(EEventType::Scalar);
+
+         begin_node();
+
+         if (!std::isfinite(value)) {
+             if (std::isnan(value)) {
+                 write(percent_scalar::nan_literal);
+             } else if (value > 0) {
+                 write(percent_scalar::positive_inf_literal);
+             } else {
+                 write(percent_scalar::negative_inf_literal);
+             }
+         } else {
+             char text[32];
+             const auto length = ::snprintf(text, sizeof(text), "%#.17lg", value);
+             write_raw(text, length);
+         }
+
+         end_node();
+     }
+
+     // Writes the string quoted and escaped.
+     void OnScalarString(TStringBuf value) override {
+         update_state(EEventType::Scalar);
+
+         begin_node();
+         write_escaped_string(value);
+         end_node();
+     }
+
+     // Writes the key quoted and escaped, followed by the key/value separator.
+     void OnKey(TStringBuf name) override {
+         update_state(EEventType::Key);
+
+         begin_key();
+         write_escaped_string(name);
+         end_key();
+     }
+
+ protected:
+     // Separates sibling nodes with the item separator and a space.
+     void begin_node() override {
+         if (!need_item_separator()) {
+             return;
+         }
+         write(NSymbol::item_separator);
+         write(' ');
+     }
+
+     // Top-level items of a fragment stream are terminated immediately with a
+     // separator and a line break; everything else defers to the base class.
+     void end_node() override {
+         if (mode() == EStreamType::Node || depth() != 0) {
+             writer::end_node();
+             return;
+         }
+         write(NSymbol::item_separator);
+         write('\n');
+         need_item_separator(false);
+     }
+
+     // Surrounds the key/value separator with single spaces.
+     void end_key() override {
+         write(' ');
+         writer::end_key();
+         write(' ');
+     }
+ };
+
+ // Text writer that additionally places every item on its own line and
+ // indents it by `indent_size` spaces per nesting level.
+ class TPrettyWriterImpl final: public TTextWriterImpl {
+     size_t indent_size_;
+
+ public:
+     TPrettyWriterImpl(
+         NYsonPull::NOutput::IStream& stream,
+         EStreamType mode,
+         size_t indent_size)
+         : TTextWriterImpl(stream, mode)
+         , indent_size_{indent_size} {
+     }
+
+ protected:
+     // Separates sibling nodes with the item separator and a fresh indented line.
+     void begin_node() override {
+         if (!need_item_separator()) {
+             return;
+         }
+         write(NSymbol::item_separator);
+         newline();
+     }
+
+     // Starts a new indented line after a collection has been opened.
+     void begin_collection(collection_type type) override {
+         TTextWriterImpl::begin_collection(type);
+         newline();
+     }
+
+     // Starts a new indented line before the closing symbol of a collection.
+     void end_collection(collection_type type) override {
+         TTextWriterImpl::end_collection(type);
+         newline();
+     }
+
+     // Emits a line break followed by the indentation for the current depth.
+     void newline() {
+         write('\n');
+         const size_t level = depth();
+         indent(level);
+     }
+
+     // Emits `count` indentation steps of indent_size_ spaces each.
+     void indent(size_t count) {
+         const size_t spaces = count * indent_size_;
+         for (size_t written = 0; written != spaces; ++written) {
+             write(' ');
+         }
+     }
+ };
+
+ // Builds a writer implementation of type T on top of `stream` and wraps both
+ // into a TWriter, which receives ownership of the stream and of the
+ // implementation. The implementation is constructed from a reference to the
+ // stream plus the forwarded `args`.
+ template <typename T, typename... Args>
+ NYsonPull::TWriter make_writer(
+     THolder<NYsonPull::NOutput::IStream> stream,
+     Args&&... args) {
+     auto& output = *stream;
+     auto consumer = MakeHolder<T>(output, std::forward<Args>(args)...);
+     return NYsonPull::TWriter(std::move(stream), std::move(consumer));
+ }
+ }
+}
diff --git a/library/cpp/yson_pull/detail/zigzag.h b/library/cpp/yson_pull/detail/zigzag.h
new file mode 100644
index 0000000000..98fcac0e9f
--- /dev/null
+++ b/library/cpp/yson_pull/detail/zigzag.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "traits.h"
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NZigZag {
+ //! Functions that provide coding of integers with property: 0 <= f(x) <= 2 * |x|
+
+ // Zigzag-encodes a signed integer: the value is shifted left by one bit and
+ // XOR-ed with its sign, replicated across all bits by the right shift.
+ template <typename TSigned>
+ inline NTraits::to_unsigned<TSigned> encode(TSigned x) {
+     using TUnsigned = NTraits::to_unsigned<TSigned>;
+     constexpr auto sign_shift = sizeof(TSigned) * 8 - 1;
+     const auto doubled = static_cast<TUnsigned>(x) << 1;
+     const auto sign_mask = static_cast<TUnsigned>(x >> sign_shift);
+     return doubled ^ sign_mask;
+ }
+
+ // Reverses encode(): the lowest bit selects the sign mask (0 or all ones),
+ // which is XOR-ed with the remaining bits shifted right by one.
+ template <typename TUnsigned>
+ inline NTraits::to_signed<TUnsigned> decode(TUnsigned x) {
+     using TSigned = NTraits::to_signed<TUnsigned>;
+     const auto magnitude = static_cast<TSigned>(x >> 1);
+     const auto sign_mask = -static_cast<TSigned>(x & 1);
+     return magnitude ^ sign_mask;
+ }
+ }
+ } // namespace NDetail
+}