aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/yson_pull/detail/cescape.h
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/yson_pull/detail/cescape.h
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/yson_pull/detail/cescape.h')
-rw-r--r--library/cpp/yson_pull/detail/cescape.h143
1 files changed, 143 insertions, 0 deletions
diff --git a/library/cpp/yson_pull/detail/cescape.h b/library/cpp/yson_pull/detail/cescape.h
new file mode 100644
index 0000000000..1ea150e69a
--- /dev/null
+++ b/library/cpp/yson_pull/detail/cescape.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "byte_writer.h"
+#include "cescape_decode.h"
+#include "cescape_encode.h"
+#include "macros.h"
+
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+
+/* REFERENCES FOR ESCAPE SEQUENCE INTERPRETATION:
+ * C99 p. 6.4.3 Universal character names.
+ * C99 p. 6.4.4.4 Character constants.
+ *
+ * <simple-escape-sequence> ::= {
+ * \' , \" , \? , \\ ,
+ * \a , \b , \f , \n , \r , \t , \v
+ * }
+ *
+ * <octal-escape-sequence> ::= \ <octal-digit> {1, 3}
+ * <hexadecimal-escape-sequence> ::= \x <hexadecimal-digit> +
+ * <universal-character-name> ::= \u <hexadecimal-digit> {4}
+ * || \U <hexadecimal-digit> {8}
+ *
+ * NOTE (6.4.4.4.7):
+ * Each octal or hexadecimal escape sequence is the longest sequence of characters that can
+ * constitute the escape sequence.
+ *
+ * THEREFORE:
+ * - Octal escape sequence extends up to (but not including) the first non-octal-digit character.
+ * - Octal escape sequence always terminates after three octal digits.
+ * - Hexadecimal escape sequence extends up to (but not including) the first non-hexadecimal-digit character.
+ * - Universal character name consists of exactly 4 or 8 hexadecimal digits.
+ *
+ */
+
+namespace NYsonPull {
+ namespace NDetail {
+ namespace NCEscape {
+ // Runs NImpl::escape_impl over the bytes of `data` and appends every chunk
+ // it emits to the end of `dest`. Existing contents of `dest` are kept.
+ inline void encode(TString& dest, TStringBuf data) {
+     const auto* input = reinterpret_cast<const ui8*>(data.data());
+     auto append_chunk = [&dest](const ui8* chunk, size_t chunk_size) {
+         dest.append(reinterpret_cast<const char*>(chunk), chunk_size);
+     };
+     NImpl::escape_impl(input, data.size(), append_chunk);
+ }
+
+ // Writes the output of NImpl::escape_impl for `data` into the raw buffer
+ // starting at `dest` and returns the number of bytes written.
+ // Precondition: `dest` must have at least 4*data.size() bytes available;
+ // no bounds checking is performed here.
+ inline size_t encode(ui8* dest, TStringBuf data) {
+     ui8* cursor = dest;
+     auto copy_chunk = [&cursor](const ui8* chunk, size_t chunk_size) {
+         ::memcpy(cursor, chunk, chunk_size);
+         cursor += chunk_size;
+     };
+     NImpl::escape_impl(
+         reinterpret_cast<const ui8*>(data.data()),
+         data.size(),
+         copy_chunk);
+     // Distance travelled by the write cursor == bytes produced.
+     return static_cast<size_t>(cursor - dest);
+ }
+
+ // Escapes `data` into the byte_writer `dest`.
+ //
+ // Fast path: when the writer's buffer has room for the worst case required
+ // by the raw-pointer encode() overload (4 bytes per input byte), the
+ // escaped bytes are produced directly into the buffer and the writer is
+ // advanced by the number of bytes written.
+ // Slow path: otherwise every chunk goes through dest.write().
+ template <typename U>
+ void encode(byte_writer<U>& dest, TStringBuf data) {
+     auto& buffer = dest.stream().buffer();
+     // Compare as `available / 4 >= size` instead of `available >= size * 4`:
+     // the two are equivalent for all non-overflowing sizes, but the
+     // multiplication can wrap around for very large inputs and would then
+     // wrongly select the unchecked fast path.
+     if (Y_LIKELY(buffer.available() / 4 >= data.size())) {
+         auto size = encode(buffer.pos(), data);
+         dest.advance(size);
+     } else {
+         NImpl::escape_impl(
+             reinterpret_cast<const ui8*>(data.data()),
+             data.size(),
+             [&](const ui8* str, size_t size) {
+                 dest.write(str, size);
+             });
+     }
+ }
+
+ // Convenience overload: returns the escaped form of `data` as a new TString.
+ inline TString encode(TStringBuf data) {
+     TString escaped;
+     // The output is at least as long as the input, so pre-size for that.
+     escaped.reserve(data.size());
+     encode(escaped, data);
+     return escaped;
+ }
+
+ // Runs NImpl::unescape_impl over `data` and appends everything it reports
+ // (single bytes as well as contiguous byte runs) to the end of `dest`.
+ inline void decode(TString& dest, TStringBuf data) {
+     const auto* first = reinterpret_cast<const ui8*>(data.begin());
+     const auto* last = reinterpret_cast<const ui8*>(data.end());
+     auto append_byte = [&dest](ui8 byte) {
+         dest += byte;
+     };
+     auto append_run = [&dest](const ui8* run, size_t run_len) {
+         dest.append(reinterpret_cast<const char*>(run), run_len);
+     };
+     NImpl::unescape_impl(first, last, append_byte, append_run);
+ }
+
+ // Unescapes the contents of `data` in place and shrinks the vector to the
+ // decoded length.
+ //
+ // Everything before the first backslash is left untouched; decoding starts
+ // at that backslash and the decoded bytes are written back over the same
+ // storage, beginning at the backslash position. If there is no backslash
+ // (or the vector is empty) the vector is left unchanged.
+ inline void decode_inplace(TVector<ui8>& data) {
+     // An empty vector may report a null data() pointer, and passing null to
+     // memchr is undefined even with a zero length, so bail out first.
+     if (data.empty()) {
+         return;
+     }
+     ui8* const begin = data.data();
+     auto* out = static_cast<ui8*>(
+         ::memchr(begin, '\\', data.size()));
+     if (out == nullptr) {
+         return;
+     }
+     NImpl::unescape_impl(
+         out,
+         begin + data.size(),
+         [&](ui8 c) {
+             *out++ = c;
+         },
+         [&](const ui8* p, size_t len) {
+             // Source and destination live in the same buffer and may
+             // overlap, hence memmove rather than memcpy.
+             ::memmove(out, p, len);
+             out += len;
+         });
+     // `out` now points one past the last decoded byte.
+     data.resize(out - begin);
+ }
+
+ // Convenience overload: returns the unescaped form of `data` as a new TString.
+ inline TString decode(TStringBuf data) {
+     TString unescaped;
+     // Pre-size to the input length before appending the decoded bytes.
+     unescaped.reserve(data.size());
+     decode(unescaped, data);
+     return unescaped;
+ }
+
+ // Returns `str` escaped via encode() and wrapped in double quotes.
+ ATTRIBUTE(noinline, cold)
+ inline TString quote(TStringBuf str) {
+     constexpr char quote_mark = '"';
+     // Headroom beyond the raw length: covers both quote characters plus a
+     // little escape expansion before the string has to grow.
+     constexpr size_t headroom = 16;
+     TString quoted;
+     quoted.reserve(str.size() + headroom);
+     quoted += quote_mark;
+     encode(quoted, str);
+     quoted += quote_mark;
+     return quoted;
+ }
+
+ // Single-byte overload: quotes `ch` by treating it as a one-character string
+ // and delegating to quote(TStringBuf).
+ ATTRIBUTE(noinline, cold)
+ inline TString quote(ui8 ch) {
+     const char as_char = static_cast<char>(ch);
+     return quote(TStringBuf(&as_char, 1));
+ }
+ }
+ } // namespace NDetail
+}