diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/relaxed_escaper | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils/relaxed_escaper')
5 files changed, 293 insertions, 0 deletions
diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp
new file mode 100644
index 0000000000..ac624dca85
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp
@@ -0,0 +1 @@
+#include "relaxed_escaper.h"
diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h
new file mode 100644
index 0000000000..d7ea7c1259
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h
@@ -0,0 +1,228 @@
+#pragma once
+
+#include <util/stream/output.h>
+#include <util/string/escape.h>
+#include <util/memory/tempbuf.h>
+#include <util/generic/strbuf.h>
+
+#include <cstring>
+
+namespace NEscJ {
+    // almost copypaste from util/string/escape.h
+    // todo: move there (note difference in IsPrintable and handling of string)
+
+    // Maps a value in [0, 15] to its uppercase hexadecimal digit.
+    inline char HexDigit(char value) {
+        if (value < 10)
+            return '0' + value;
+        else
+            return 'A' + value - 10;
+    }
+
+    // Maps a value in [0, 7] to its octal digit.
+    inline char OctDigit(char value) {
+        return '0' + value;
+    }
+
+    // False for bytes that can never appear in UTF-8 text: 0xC0, 0xC1 and 0xF5..0xFF.
+    inline bool IsUTF8(ui8 c) {
+        return c < 0xf5 && c != 0xC0 && c != 0xC1;
+    }
+
+    // True for 0x00..0x1F and 0x7F.
+    inline bool IsControl(ui8 c) {
+        return c < 0x20 || c == 0x7f;
+    }
+
+    // Printable bytes are the ones EscapeJ may pass through unchanged.
+    inline bool IsPrintable(ui8 c) {
+        return IsUTF8(c) && !IsControl(c);
+    }
+
+    inline bool IsHexDigit(ui8 c) {
+        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
+    }
+
+    inline bool IsOctDigit(ui8 c) {
+        return c >= '0' && c <= '7';
+    }
+
+    struct TEscapeUtil {
+        // Longest sequence EscapeJ writes for one input byte (the six characters of a \u00XX escape).
+        static const size_t ESCAPE_C_BUFFER_SIZE = 6;
+
+        // Escapes one byte c into r and returns the number of bytes written (1, 2, 4 or 6).
+        // next is the input byte following c (0 at the end of input); it selects an escape
+        // form that stays unambiguous when the following character is a hex or octal digit.
+        // Bytes found in safe are always copied as-is; printable bytes found in unsafe are
+        // escaped instead of copied. asunicode switches to \u00XX escapes (for json).
+        template <bool asunicode>
+        static inline size_t EscapeJ(ui8 c, ui8 next, char r[ESCAPE_C_BUFFER_SIZE], TStringBuf safe, TStringBuf unsafe) {
+            // (1) Printable characters go as-is, except backslash and double quote.
+            // (2) Characters \r, \n, \t and \0 ... \7 replaced by their simple escape characters (if possible).
+            // (3) Otherwise, character is encoded using hexadecimal escape sequence (if possible), or octal.
+            if (safe.find(c) != TStringBuf::npos) {
+                r[0] = c;
+                return 1;
+            }
+            if (c == '\"') {
+                r[0] = '\\';
+                r[1] = '\"';
+                return 2;
+            } else if (c == '\\') {
+                r[0] = '\\';
+                r[1] = '\\';
+                return 2;
+            } else if (IsPrintable(c) && unsafe.find(c) == TStringBuf::npos) {
+                r[0] = c;
+                return 1;
+            } else if (c == '\b') {
+                r[0] = '\\';
+                r[1] = 'b';
+                return 2;
+            } else if (c == '\f') {
+                r[0] = '\\';
+                r[1] = 'f';
+                return 2;
+            } else if (c == '\r') {
+                r[0] = '\\';
+                r[1] = 'r';
+                return 2;
+            } else if (c == '\n') {
+                r[0] = '\\';
+                r[1] = 'n';
+                return 2;
+            } else if (c == '\t') {
+                r[0] = '\\';
+                r[1] = 't';
+                return 2;
+            } else if (asunicode && IsUTF8(c)) { // utf8 controls escape for json
+                r[0] = '\\';
+                r[1] = 'u';
+                r[2] = '0';
+                r[3] = '0';
+                r[4] = HexDigit((c & 0xF0) >> 4);
+                r[5] = HexDigit((c & 0x0F) >> 0);
+                return 6;
+            } else if (c < 8 && !IsOctDigit(next)) {
+                r[0] = '\\';
+                r[1] = OctDigit(c);
+                return 2;
+            } else if (!IsHexDigit(next)) {
+                r[0] = '\\';
+                r[1] = 'x';
+                r[2] = HexDigit((c & 0xF0) >> 4);
+                r[3] = HexDigit((c & 0x0F) >> 0);
+                return 4;
+            } else {
+                r[0] = '\\';
+                r[1] = OctDigit((c & 0700) >> 6);
+                r[2] = OctDigit((c & 0070) >> 3);
+                r[3] = OctDigit((c & 0007) >> 0);
+                return 4;
+            }
+        }
+
+        // Non-template shortcut for EscapeJ<false>.
+        static inline size_t EscapeJ(ui8 c, ui8 next, char r[ESCAPE_C_BUFFER_SIZE], TStringBuf safe, TStringBuf unsafe) {
+            return EscapeJ<false>(c, next, r, safe, unsafe);
+        }
+    };
+
+    // Worst-case escaped size for len input bytes.
+    inline size_t SuggestBuffer(size_t len) {
+        return len * TEscapeUtil::ESCAPE_C_BUFFER_SIZE;
+    }
+
+    // Escapes len bytes of str into out and returns the number of bytes written.
+    // SuggestBuffer(len) bytes of room in out are always enough; no terminating zero is appended.
+    // Input and output are treated as raw bytes, so zero bytes let through by safe are kept.
+    template <bool tounicode>
+    inline size_t EscapeJ(const char* str, size_t len, char* out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+        char* const out0 = out;
+        char buffer[TEscapeUtil::ESCAPE_C_BUFFER_SIZE];
+
+        // memcpy instead of strncpy: strncpy stops at the first zero byte and zero-fills the rest.
+        const auto append = [&out](const char* src, size_t n) {
+            if (n != 0) {
+                std::memcpy(out, src, n);
+                out += n;
+            }
+        };
+
+        size_t pending = 0; // start of the not yet copied run of bytes that need no escaping
+        for (size_t i = 0; i < len; ++i) {
+            const ui8 next = static_cast<ui8>(i + 1 < len ? str[i + 1] : 0);
+            const size_t rlen = TEscapeUtil::EscapeJ<tounicode>(str[i], next, buffer, safe, unsafe);
+
+            if (rlen > 1) {
+                append(str + pending, i - pending);
+                append(buffer, rlen);
+                pending = i + 1;
+            }
+        }
+        append(str + pending, len - pending);
+
+        return out - out0;
+    }
+
+    // Escapes in and writes the result to out; with quote the result is wrapped in double quotes.
+    template <bool quote, bool tounicode>
+    inline void EscapeJ(TStringBuf in, IOutputStream& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+        TTempBuf b(SuggestBuffer(in.size()) + 2); // + 2 for the optional quotes
+
+        if (quote)
+            b.Append("\"", 1);
+
+        b.Proceed(EscapeJ<tounicode>(in.data(), in.size(), b.Current(), safe, unsafe));
+
+        if (quote)
+            b.Append("\"", 1);
+
+        out.Write(b.Data(), b.Filled());
+    }
+
+    // Same as above, but appends the result to the string out.
+    template <bool quote, bool tounicode>
+    inline void EscapeJ(TStringBuf in, TString& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+        TTempBuf b(SuggestBuffer(in.size()) + 2); // + 2 for the optional quotes
+
+        if (quote)
+            b.Append("\"", 1);
+
+        b.Proceed(EscapeJ<tounicode>(in.data(), in.size(), b.Current(), safe, unsafe));
+
+        if (quote)
+            b.Append("\"", 1);
+
+        out.append(b.Data(), b.Filled());
+    }
+
+    // Returns the escaped copy of in.
+    template <bool quote, bool tounicode>
+    inline TString EscapeJ(TStringBuf in, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+        TString s;
+        EscapeJ<quote, tounicode>(in, s, safe, unsafe);
+        return s;
+    }
+
+    // If the template parameter "tounicode" is omitted, then use the default value false
+    inline size_t EscapeJ(const char* str, size_t len, char* out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+        return EscapeJ<false>(str, len, out, safe, unsafe);
+    }
+
+    template <bool quote>
+    inline void EscapeJ(TStringBuf in, IOutputStream& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+        EscapeJ<quote, false>(in, out, safe, unsafe);
+    }
+
+    template <bool quote>
+    inline void EscapeJ(TStringBuf in, TString& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+        EscapeJ<quote, false>(in, out, safe, unsafe);
+    }
+
+    template <bool quote>
+    inline TString EscapeJ(TStringBuf in, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) {
+        return EscapeJ<quote, false>(in, safe, unsafe);
+    }
+}
diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp
new file mode 100644
index 0000000000..768555ea3a
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp
@@ -0,0 +1,69 @@
+#include "relaxed_escaper.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#define RESC_FIXED_STR(s) TStringBuf(s, sizeof(s) - 1)
+static const TStringBuf CommonTestData[] = {
+    // Should be valid UTF-8.
+    RESC_FIXED_STR("http://ya.ru/"), RESC_FIXED_STR("http://ya.ru/"),
+    RESC_FIXED_STR("http://ya.ru/\\x17\\n"), RESC_FIXED_STR("http://ya.ru/\x17\n"),
+
+    RESC_FIXED_STR("http://ya.ru/\\0"), RESC_FIXED_STR("http://ya.ru/\0"),
+    RESC_FIXED_STR("http://ya.ru/\\0\\0"), RESC_FIXED_STR("http://ya.ru/\0\0"),
+    RESC_FIXED_STR("http://ya.ru/\\0\\0000"), RESC_FIXED_STR("http://ya.ru/\0\0"
+                                                              "0"),
+    RESC_FIXED_STR("http://ya.ru/\\0\\0001"), RESC_FIXED_STR("http://ya.ru/\0\x00"
+                                                              "1"),
+
+    RESC_FIXED_STR("\\2\\4\\00678"), RESC_FIXED_STR("\2\4\6"
+                                                     "78"),
+    RESC_FIXED_STR("\\2\\4\\689"), RESC_FIXED_STR("\2\4\689"),
+
+    RESC_FIXED_STR("\\\"Hello\\\", Alice said."), RESC_FIXED_STR("\"Hello\", Alice said."),
+    RESC_FIXED_STR("Slash\\\\dash!"), RESC_FIXED_STR("Slash\\dash!"),
+    RESC_FIXED_STR("There\\nare\\r\\nnewlines."), RESC_FIXED_STR("There\nare\r\nnewlines."),
+    RESC_FIXED_STR("There\\tare\\ttabs."), RESC_FIXED_STR("There\tare\ttabs.")};
+#undef RESC_FIXED_STR
+
+Y_UNIT_TEST_SUITE(TRelaxedEscaperTest) {
+    Y_UNIT_TEST(TestEscaper) {
+        using namespace NEscJ;
+        for (size_t i = 0; i < Y_ARRAY_SIZE(CommonTestData); i += 2) {
+            TString expected(CommonTestData[i].data(), CommonTestData[i].size());
+            TString source(CommonTestData[i + 1].data(), CommonTestData[i + 1].size());
+            TString actual(EscapeJ<false>(source));
+            TString actual2(UnescapeC(expected));
+
+            UNIT_ASSERT_VALUES_EQUAL(expected, actual);
+            UNIT_ASSERT_VALUES_EQUAL(source, actual2);
+        }
+
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\x17\\n\xAB", EscapeJ<false>("http://ya.ru/\x17\n\xab"));
+        TString s = EscapeJ<false, true>("http://ya.ru/\x17\n\xab\xff");
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\u0017\\n\xAB\\xFF", s);
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\x17\n\xAB", EscapeJ<false>("http://ya.ru/\x17\n\xab", "\n"));
+        UNIT_ASSERT_VALUES_EQUAL("http:\\x2F\\x2Fya.ru\\x2F\\x17\n\xAB'", EscapeJ<false>("http://ya.ru/\x17\n\xab'", "\n'", "/"));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\x17\n\xab", UnescapeC("http:\\x2F\\x2Fya.ru\\x2F\\x17\n\xAB"));
+        UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\x17\n\xab", UnescapeC("http://ya.ru/\\x17\\n\xAB"));
+        UNIT_ASSERT_VALUES_EQUAL("h", EscapeJ<false>("h"));
+        UNIT_ASSERT_VALUES_EQUAL("\"h\"", EscapeJ<true>("h"));
+        UNIT_ASSERT_VALUES_EQUAL("h", UnescapeC("h"));
+        UNIT_ASSERT_VALUES_EQUAL("\\xFF", EscapeJ<false>("\xFF"));
+        UNIT_ASSERT_VALUES_EQUAL("\"\\xFF\"", EscapeJ<true>("\xFF"));
+        UNIT_ASSERT_VALUES_EQUAL("\xFF", UnescapeC("\\xFF"));
+
+        UNIT_ASSERT_VALUES_EQUAL("\\377f", EscapeJ<false>("\xff"
+                                                            "f"));
+        UNIT_ASSERT_VALUES_EQUAL("\xff"
+                                 "f",
+                                 UnescapeC("\\377f"));
+        UNIT_ASSERT_VALUES_EQUAL("\\xFFg", EscapeJ<false>("\xff"
+                                                            "g"));
+        UNIT_ASSERT_VALUES_EQUAL("\xff"
+                                 "g",
+                                 UnescapeC("\\xFFg"));
+
+        // A zero byte listed in safe is passed through and must not truncate the output.
+        UNIT_ASSERT_VALUES_EQUAL(TString("a\0b", 3), EscapeJ<false>(TStringBuf("a\0b", 3), TStringBuf("\0", 1)));
+    }
+}
diff --git a/library/cpp/string_utils/relaxed_escaper/ut/ya.make b/library/cpp/string_utils/relaxed_escaper/ut/ya.make
new file mode 100644
index 0000000000..7ebd393c48
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/ut/ya.make
@@ -0,0 +1,9 @@
+UNITTEST_FOR(library/cpp/string_utils/relaxed_escaper)
+
+OWNER(velavokr)
+
+SRCS(
+    relaxed_escaper_ut.cpp
+)
+
+END()
diff --git a/library/cpp/string_utils/relaxed_escaper/ya.make b/library/cpp/string_utils/relaxed_escaper/ya.make
new file mode 100644
index 0000000000..3f0fa5bc07
--- /dev/null
+++ b/library/cpp/string_utils/relaxed_escaper/ya.make
@@ -0,0 +1,9 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+    relaxed_escaper.cpp
+)
+
+END()