aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/string_utils/relaxed_escaper
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/relaxed_escaper
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils/relaxed_escaper')
-rw-r--r--library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp1
-rw-r--r--library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h208
-rw-r--r--library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp66
-rw-r--r--library/cpp/string_utils/relaxed_escaper/ut/ya.make9
-rw-r--r--library/cpp/string_utils/relaxed_escaper/ya.make9
5 files changed, 293 insertions, 0 deletions
diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp
new file mode 100644
index 0000000000..ac624dca85
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp
@@ -0,0 +1 @@
+#include "relaxed_escaper.h"
diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h
new file mode 100644
index 0000000000..d7ea7c1259
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h
@@ -0,0 +1,208 @@
+#pragma once
+
+#include <util/stream/output.h>
+#include <util/string/escape.h>
+#include <util/memory/tempbuf.h>
+#include <util/generic/strbuf.h>
+
+#include <cstring>
+
+namespace NEscJ {
+ // almost copypaste from util/string/escape.h
+ // todo: move there (note difference in IsPrintable and handling of string)
+
+ // Maps a small number to an uppercase hexadecimal digit character.
+ // Values below 10 become '0' + value; anything else becomes 'A' + value - 10,
+ // so 10..15 yield 'A'..'F'.
+ inline char HexDigit(char value) {
+     return value < 10 ? '0' + value : 'A' + value - 10;
+ }
+
+ // Returns the character '0' + value, which is the octal digit for values 0..7.
+ inline char OctDigit(char value) {
+     const char digit = '0' + value;
+     return digit;
+ }
+
+ // Byte-level filter used by the escaper: 0xC0, 0xC1 and every byte from 0xF5
+ // upwards are rejected, any other byte is accepted.
+ inline bool IsUTF8(ui8 c) {
+     const bool rejected = c >= 0xf5 || c == 0xC0 || c == 0xC1;
+     return !rejected;
+ }
+
+ // True for bytes 0x00..0x1F and for 0x7F.
+ inline bool IsControl(ui8 c) {
+     if (c == 0x7f)
+         return true;
+     return c < 0x20;
+ }
+
+ // A byte counts as printable when IsControl() rejects it and IsUTF8() accepts it.
+ inline bool IsPrintable(ui8 c) {
+     if (IsControl(c))
+         return false;
+     return IsUTF8(c);
+ }
+
+ // True for '0'..'9', 'A'..'F' and 'a'..'f'.
+ inline bool IsHexDigit(ui8 c) {
+     const bool decimal = '0' <= c && c <= '9';
+     const bool upper = 'A' <= c && c <= 'F';
+     const bool lower = 'a' <= c && c <= 'f';
+     return decimal || upper || lower;
+ }
+
+ // True for '0'..'7'.
+ inline bool IsOctDigit(ui8 c) {
+     const bool outside = c < '0' || c > '7';
+     return !outside;
+ }
+
+ struct TEscapeUtil {
+     // Upper bound on the bytes produced for one input byte; the longest form is \u00XX.
+     static const size_t ESCAPE_C_BUFFER_SIZE = 6;
+
+     // Writes the escaped form of byte c into r and returns the number of bytes written.
+     // next is the byte following c in the input; it only influences which numeric
+     // escape is chosen (presumably so that a following digit cannot be read as part
+     // of the escape sequence).
+     //
+     // Rules, in order of precedence:
+     //  (1) A byte listed in safe is copied unchanged.
+     //  (2) '"', '\\', '\b', '\f', '\r', '\n' and '\t' become a backslash plus one character.
+     //  (3) Any other printable byte that is not listed in unsafe is copied unchanged.
+     //  (4) With asunicode set, a remaining byte accepted by IsUTF8() becomes \u00XX.
+     //  (5) A byte below 8 becomes a single-digit octal escape, unless next is an octal digit.
+     //  (6) Otherwise \xXX is used, unless next is a hex digit; then three octal digits are written.
+     template <bool asunicode>
+     static inline size_t EscapeJ(ui8 c, ui8 next, char r[ESCAPE_C_BUFFER_SIZE], TStringBuf safe, TStringBuf unsafe) {
+         if (safe.find(c) != TStringBuf::npos) {
+             r[0] = c;
+             return 1;
+         }
+
+         // Two-byte escapes. The control characters among them are never printable,
+         // so their position relative to the printable check below is irrelevant.
+         char mnemonic = 0;
+         switch (c) {
+             case '\"':
+                 mnemonic = '\"';
+                 break;
+             case '\\':
+                 mnemonic = '\\';
+                 break;
+             case '\b':
+                 mnemonic = 'b';
+                 break;
+             case '\f':
+                 mnemonic = 'f';
+                 break;
+             case '\r':
+                 mnemonic = 'r';
+                 break;
+             case '\n':
+                 mnemonic = 'n';
+                 break;
+             case '\t':
+                 mnemonic = 't';
+                 break;
+             default:
+                 break;
+         }
+         if (mnemonic != 0) {
+             r[0] = '\\';
+             r[1] = mnemonic;
+             return 2;
+         }
+
+         if (IsPrintable(c) && unsafe.find(c) == TStringBuf::npos) {
+             r[0] = c;
+             return 1;
+         }
+
+         // Everything below is a numeric escape and starts with a backslash.
+         const char hi = HexDigit((c & 0xF0) >> 4);
+         const char lo = HexDigit((c & 0x0F) >> 0);
+         r[0] = '\\';
+
+         if (asunicode && IsUTF8(c)) { // utf8 controls escape for json
+             r[1] = 'u';
+             r[2] = '0';
+             r[3] = '0';
+             r[4] = hi;
+             r[5] = lo;
+             return 6;
+         }
+
+         if (c < 8 && !IsOctDigit(next)) {
+             r[1] = OctDigit(c);
+             return 2;
+         }
+
+         if (!IsHexDigit(next)) {
+             r[1] = 'x';
+             r[2] = hi;
+             r[3] = lo;
+             return 4;
+         }
+
+         // A hex digit follows, so fall back to a fixed-width three-digit octal escape.
+         r[1] = OctDigit((c & 0700) >> 6);
+         r[2] = OctDigit((c & 0070) >> 3);
+         r[3] = OctDigit((c & 0007) >> 0);
+         return 4;
+     }
+
+     // Same as above with asunicode fixed to false.
+     static inline size_t EscapeJ(ui8 c, ui8 next, char r[ESCAPE_C_BUFFER_SIZE], TStringBuf safe, TStringBuf unsafe) {
+         return EscapeJ<false>(c, next, r, safe, unsafe);
+     }
+ };
+
+ // Returns a buffer size large enough for the escaped form of len input bytes:
+ // every byte may expand to TEscapeUtil::ESCAPE_C_BUFFER_SIZE bytes.
+ inline size_t SuggestBuffer(size_t len) {
+     const size_t perByte = TEscapeUtil::ESCAPE_C_BUFFER_SIZE;
+     return len * perByte;
+ }
+
+ // Escapes the len bytes starting at str into out and returns the number of bytes written.
+ // The output is not NUL-terminated. Each input byte expands to at most
+ // TEscapeUtil::ESCAPE_C_BUFFER_SIZE bytes, so a buffer of SuggestBuffer(len) bytes is enough.
+ // safe, unsafe and tounicode are forwarded to TEscapeUtil::EscapeJ for every byte.
+ template <bool tounicode>
+ inline size_t EscapeJ(const char* str, size_t len, char* out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+     char* const out0 = out;
+     char buffer[TEscapeUtil::ESCAPE_C_BUFFER_SIZE];
+
+     // Bytes in [j, i) pass through unchanged and have not been copied to out yet.
+     size_t j = 0;
+     for (size_t i = 0; i < len; ++i) {
+         const size_t rlen = TEscapeUtil::EscapeJ<tounicode>(str[i], (i + 1 < len ? str[i + 1] : 0), buffer, safe, unsafe);
+         if (rlen <= 1) {
+             continue; // byte goes as-is; it is copied together with its run
+         }
+
+         // The runs are raw bytes, not C strings: a NUL listed in safe may occur inside
+         // a run. memcpy copies it verbatim, whereas strncpy would stop at the NUL and
+         // zero-fill the rest of the run.
+         if (i > j) {
+             std::memcpy(out, str + j, i - j);
+             out += i - j;
+         }
+         j = i + 1;
+
+         std::memcpy(out, buffer, rlen);
+         out += rlen;
+     }
+
+     // Flush the unescaped tail (the whole input when nothing needed escaping).
+     if (len > j) {
+         std::memcpy(out, str + j, len - j);
+         out += len - j;
+     }
+
+     return out - out0;
+ }
+
+ // Escapes in and writes the result to out with a single Write() call.
+ // When quote is true the escaped text is wrapped in double quotes.
+ template <bool quote, bool tounicode>
+ inline void EscapeJ(TStringBuf in, IOutputStream& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+     // Worst-case escaped size plus room for the two optional quote characters.
+     const size_t capacity = SuggestBuffer(in.size()) + 2;
+     TTempBuf b(capacity);
+
+     if (quote) {
+         b.Append("\"", 1);
+     }
+
+     // Escape straight into the free part of the buffer, then mark those bytes as used.
+     const size_t escapedLen = EscapeJ<tounicode>(in.data(), in.size(), b.Current(), safe, unsafe);
+     b.Proceed(escapedLen);
+
+     if (quote) {
+         b.Append("\"", 1);
+     }
+
+     out.Write(b.Data(), b.Filled());
+ }
+
+ // Escapes in and appends the result to the string out.
+ // When quote is true the escaped text is wrapped in double quotes.
+ template <bool quote, bool tounicode>
+ inline void EscapeJ(TStringBuf in, TString& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+     // Worst-case escaped size plus room for the two optional quote characters.
+     const size_t capacity = SuggestBuffer(in.size()) + 2;
+     TTempBuf buf(capacity);
+
+     if (quote) {
+         buf.Append("\"", 1);
+     }
+
+     // Escape straight into the free part of the buffer, then mark those bytes as used.
+     const size_t escapedLen = EscapeJ<tounicode>(in.data(), in.size(), buf.Current(), safe, unsafe);
+     buf.Proceed(escapedLen);
+
+     if (quote) {
+         buf.Append("\"", 1);
+     }
+
+     out.append(buf.Data(), buf.Filled());
+ }
+
+ // Convenience form: returns the escaped (and optionally quoted) text as a new TString.
+ template <bool quote, bool tounicode>
+ inline TString EscapeJ(TStringBuf in, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+     TString escaped;
+     EscapeJ<quote, tounicode>(in, escaped, safe, unsafe);
+     return escaped;
+ }
+
+ // If the template parameter "tounicode" is omitted, the default value false is used.
+ // Raw-buffer form: escapes len bytes of str into out and returns the number of bytes written.
+ inline size_t EscapeJ(const char* str, size_t len, char* out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+     const bool tounicode = false;
+     return EscapeJ<tounicode>(str, len, out, safe, unsafe);
+ }
+
+ // Stream form without "tounicode": forwards with tounicode set to false.
+ template <bool quote>
+ inline void EscapeJ(TStringBuf in, IOutputStream& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+     const bool tounicode = false;
+     EscapeJ<quote, tounicode>(in, out, safe, unsafe);
+ }
+
+ // String-append form without "tounicode": forwards with tounicode set to false.
+ template <bool quote>
+ inline void EscapeJ(TStringBuf in, TString& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+     const bool tounicode = false;
+     EscapeJ<quote, tounicode>(in, out, safe, unsafe);
+ }
+
+ // Value-returning form without "tounicode": forwards with tounicode set to false.
+ template <bool quote>
+ inline TString EscapeJ(TStringBuf in, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+     const bool tounicode = false;
+     return EscapeJ<quote, tounicode>(in, safe, unsafe);
+ }
+}
diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp
new file mode 100644
index 0000000000..768555ea3a
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp
@@ -0,0 +1,66 @@
+#include "relaxed_escaper.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+// Wraps a string literal in a TStringBuf whose length comes from sizeof, so embedded
+// NUL bytes are kept and only the literal's terminating NUL is dropped.
+#define RESC_FIXED_STR(s) TStringBuf(s, sizeof(s) - 1)
+// Test vectors stored as consecutive pairs: first the expected escaped text,
+// then the raw source string it is produced from.
+static const TStringBuf CommonTestData[] = {
+ // Should be valid UTF-8.
+ RESC_FIXED_STR("http://ya.ru/"), RESC_FIXED_STR("http://ya.ru/"),
+ RESC_FIXED_STR("http://ya.ru/\\x17\\n"), RESC_FIXED_STR("http://ya.ru/\x17\n"),
+
+ // NUL bytes, including ones followed by a digit character.
+ RESC_FIXED_STR("http://ya.ru/\\0"), RESC_FIXED_STR("http://ya.ru/\0"),
+ RESC_FIXED_STR("http://ya.ru/\\0\\0"), RESC_FIXED_STR("http://ya.ru/\0\0"),
+ RESC_FIXED_STR("http://ya.ru/\\0\\0000"), RESC_FIXED_STR("http://ya.ru/\0\0"
+ "0"),
+ RESC_FIXED_STR("http://ya.ru/\\0\\0001"), RESC_FIXED_STR("http://ya.ru/\0\x00"
+ "1"),
+
+ // Small control bytes followed by octal and non-octal digits.
+ RESC_FIXED_STR("\\2\\4\\00678"), RESC_FIXED_STR("\2\4\6"
+ "78"),
+ RESC_FIXED_STR("\\2\\4\\689"), RESC_FIXED_STR("\2\4\689"),
+
+ // Quotes, backslashes, newlines and tabs.
+ RESC_FIXED_STR("\\\"Hello\\\", Alice said."), RESC_FIXED_STR("\"Hello\", Alice said."),
+ RESC_FIXED_STR("Slash\\\\dash!"), RESC_FIXED_STR("Slash\\dash!"),
+ RESC_FIXED_STR("There\\nare\\r\\nnewlines."), RESC_FIXED_STR("There\nare\r\nnewlines."),
+ RESC_FIXED_STR("There\\tare\\ttabs."), RESC_FIXED_STR("There\tare\ttabs.")};
+#undef RESC_FIXED_STR
+
+// Unit tests for NEscJ::EscapeJ, cross-checked against UnescapeC.
+Y_UNIT_TEST_SUITE(TRelaxedEscaperTest) {
+ Y_UNIT_TEST(TestEscaper) {
+ using namespace NEscJ;
+ // Round-trip every (expected, source) pair of the table: escaping the source must
+ // give the expected text, and unescaping the expected text must give the source.
+ for (size_t i = 0; i < Y_ARRAY_SIZE(CommonTestData); i += 2) {
+ TString expected(CommonTestData[i].data(), CommonTestData[i].size());
+ TString source(CommonTestData[i + 1].data(), CommonTestData[i + 1].size());
+ TString actual(EscapeJ<false>(source));
+ TString actual2(UnescapeC(expected));
+
+ UNIT_ASSERT_VALUES_EQUAL(expected, actual);
+ UNIT_ASSERT_VALUES_EQUAL(source, actual2);
+ }
+
+ // Byte 0xAB is expected to pass through unescaped.
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\x17\\n\xAB", EscapeJ<false>("http://ya.ru/\x17\n\xab"));
+ // With tounicode the control byte 0x17 is written as \u0017; 0xFF still uses \xFF.
+ TString s = EscapeJ<false, true>("http://ya.ru/\x17\n\xab\xff");
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\u0017\\n\xAB\\xFF", s);
+ // Characters listed in safe ("\n", "'") stay as-is; those in unsafe ("/") are hex-escaped.
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\x17\n\xAB", EscapeJ<false>("http://ya.ru/\x17\n\xab", "\n"));
+ UNIT_ASSERT_VALUES_EQUAL("http:\\x2F\\x2Fya.ru\\x2F\\x17\n\xAB'", EscapeJ<false>("http://ya.ru/\x17\n\xab'", "\n'", "/"));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\x17\n\xab", UnescapeC("http:\\x2F\\x2Fya.ru\\x2F\\x17\n\xAB"));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\x17\n\xab", UnescapeC("http://ya.ru/\\x17\\n\xAB"));
+ // The quote template parameter wraps the result in double quotes.
+ UNIT_ASSERT_VALUES_EQUAL("h", EscapeJ<false>("h"));
+ UNIT_ASSERT_VALUES_EQUAL("\"h\"", EscapeJ<true>("h"));
+ UNIT_ASSERT_VALUES_EQUAL("h", UnescapeC("h"));
+ UNIT_ASSERT_VALUES_EQUAL("\\xFF", EscapeJ<false>("\xFF"));
+ UNIT_ASSERT_VALUES_EQUAL("\"\\xFF\"", EscapeJ<true>("\xFF"));
+ UNIT_ASSERT_VALUES_EQUAL("\xFF", UnescapeC("\\xFF"));
+
+ // 0xFF followed by the hex digit 'f' is expected in octal form (\377).
+ UNIT_ASSERT_VALUES_EQUAL("\\377f", EscapeJ<false>("\xff"
+ "f"));
+ UNIT_ASSERT_VALUES_EQUAL("\xff"
+ "f",
+ UnescapeC("\\377f"));
+ // 'g' is not a hex digit, so the \xFF form is expected.
+ UNIT_ASSERT_VALUES_EQUAL("\\xFFg", EscapeJ<false>("\xff"
+ "g"));
+ UNIT_ASSERT_VALUES_EQUAL("\xff"
+ "g",
+ UnescapeC("\\xFFg"));
+ }
+}
diff --git a/library/cpp/string_utils/relaxed_escaper/ut/ya.make b/library/cpp/string_utils/relaxed_escaper/ut/ya.make
new file mode 100644
index 0000000000..7ebd393c48
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/ut/ya.make
@@ -0,0 +1,9 @@
+UNITTEST_FOR(library/cpp/string_utils/relaxed_escaper)
+
+OWNER(velavokr)
+
+SRCS(
+ relaxed_escaper_ut.cpp
+)
+
+END()
diff --git a/library/cpp/string_utils/relaxed_escaper/ya.make b/library/cpp/string_utils/relaxed_escaper/ya.make
new file mode 100644
index 0000000000..3f0fa5bc07
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/ya.make
@@ -0,0 +1,9 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ relaxed_escaper.cpp
+)
+
+END()