diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils')
51 files changed, 4113 insertions, 0 deletions
diff --git a/library/cpp/string_utils/base64/base64.cpp b/library/cpp/string_utils/base64/base64.cpp new file mode 100644 index 0000000000..05c201f0de --- /dev/null +++ b/library/cpp/string_utils/base64/base64.cpp @@ -0,0 +1,268 @@ +#include "base64.h" + +#include <contrib/libs/base64/avx2/libbase64.h> +#include <contrib/libs/base64/ssse3/libbase64.h> +#include <contrib/libs/base64/neon32/libbase64.h> +#include <contrib/libs/base64/neon64/libbase64.h> +#include <contrib/libs/base64/plain32/libbase64.h> +#include <contrib/libs/base64/plain64/libbase64.h> + +#include <util/generic/yexception.h> +#include <util/system/cpu_id.h> +#include <util/system/platform.h> + +#include <cstdlib> + +namespace { + struct TImpl { + void (*Encode)(const char* src, size_t srclen, char* out, size_t* outlen); + int (*Decode)(const char* src, size_t srclen, char* out, size_t* outlen); + + TImpl() { +#if defined(_arm32_) + const bool haveNEON32 = true; +#else + const bool haveNEON32 = false; +#endif + +#if defined(_arm64_) + const bool haveNEON64 = true; +#else + const bool haveNEON64 = false; +#endif + +# ifdef _windows_ + // msvc does something wrong in release-build, so we temporarily disable this branch on windows + // https://developercommunity.visualstudio.com/content/problem/334085/release-build-has-made-wrong-optimizaion-in-base64.html + const bool isWin = true; +# else + const bool isWin = false; +# endif + if (!isWin && NX86::HaveAVX() && NX86::HaveAVX2()) { + Encode = avx2_base64_encode; + Decode = avx2_base64_decode; + } else if (NX86::HaveSSSE3()) { + Encode = ssse3_base64_encode; + Decode = ssse3_base64_decode; + } else if (haveNEON64) { + Encode = neon64_base64_encode; + Decode = neon64_base64_decode; + } else if (haveNEON32) { + Encode = neon32_base64_encode; + Decode = neon32_base64_decode; + } else if (sizeof(void*) == 8) { + // running on a 64 bit platform + Encode = plain64_base64_encode; + Decode = plain64_base64_decode; + } else if (sizeof(void*) == 4) { + // running 
on a 32 bit platform (actually impossible in Arcadia) + Encode = plain32_base64_encode; + Decode = plain32_base64_decode; + } else { + // failed to find appropriate implementation + std::abort(); + } + } + }; + + const TImpl GetImpl() { + static const TImpl IMPL; + return IMPL; + } +} + +static const char base64_etab_std[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char base64_bkw[] = { + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', // 0..15 + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', // 16..31 + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\76', '\0', '\76', '\0', '\77', // 32.47 + '\64', '\65', '\66', '\67', '\70', '\71', '\72', '\73', '\74', '\75', '\0', '\0', '\0', '\0', '\0', '\0', // 48..63 + '\0', '\0', '\1', '\2', '\3', '\4', '\5', '\6', '\7', '\10', '\11', '\12', '\13', '\14', '\15', '\16', // 64..79 + '\17', '\20', '\21', '\22', '\23', '\24', '\25', '\26', '\27', '\30', '\31', '\0', '\0', '\0', '\0', '\77', // 80..95 + '\0', '\32', '\33', '\34', '\35', '\36', '\37', '\40', '\41', '\42', '\43', '\44', '\45', '\46', '\47', '\50', // 96..111 + '\51', '\52', '\53', '\54', '\55', '\56', '\57', '\60', '\61', '\62', '\63', '\0', '\0', '\0', '\0', '\0', // 112..127 + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', // 128..143 + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', '\0', '\0', '\0', 
'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', + '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'}; + +static_assert(Y_ARRAY_SIZE(base64_bkw) == 256, "wrong size"); + +// Base64 for url encoding, RFC3548 +static const char base64_etab_url[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + +static inline unsigned char GetBase64EncodedIndex0(unsigned char octet0) { + return (octet0 >> 2); +} + +static inline unsigned char GetBase64EncodedIndex1(unsigned char octet0, unsigned char octet1) { + return (((octet0 << 4) & 0x30) | ((octet1 >> 4) & 0x0f)); +} + +static inline unsigned char GetBase64EncodedIndex2(unsigned char octet1, unsigned char octet2) { + return (((octet1 << 2) & 0x3c) | ((octet2 >> 6) & 0x03)); +} + +static inline unsigned char GetBase64EncodedIndex3(unsigned char octet2) { + return (octet2 & 0x3f); +} + +template <bool urlVersion> +static inline char* Base64EncodeImpl(char* outstr, const unsigned char* instr, size_t len) { + const char* const base64_etab = (urlVersion ? base64_etab_url : base64_etab_std); + const char pad = (urlVersion ? 
',' : '='); + + size_t idx = 0; + + while (idx + 2 < len) { + *outstr++ = base64_etab[GetBase64EncodedIndex0(instr[idx])]; + *outstr++ = base64_etab[GetBase64EncodedIndex1(instr[idx], instr[idx + 1])]; + *outstr++ = base64_etab[GetBase64EncodedIndex2(instr[idx + 1], instr[idx + 2])]; + *outstr++ = base64_etab[GetBase64EncodedIndex3(instr[idx + 2])]; + idx += 3; + } + if (idx < len) { + *outstr++ = base64_etab[GetBase64EncodedIndex0(instr[idx])]; + if (idx + 1 < len) { + *outstr++ = base64_etab[GetBase64EncodedIndex1(instr[idx], instr[idx + 1])]; + *outstr++ = base64_etab[GetBase64EncodedIndex2(instr[idx + 1], '\0')]; + } else { + *outstr++ = base64_etab[GetBase64EncodedIndex1(instr[idx], '\0')]; + *outstr++ = pad; + } + *outstr++ = pad; + } + *outstr = 0; + + return outstr; +} + +static char* Base64EncodePlain(char* outstr, const unsigned char* instr, size_t len) { + return Base64EncodeImpl<false>(outstr, instr, len); +} + +char* Base64EncodeUrl(char* outstr, const unsigned char* instr, size_t len) { + return Base64EncodeImpl<true>(outstr, instr, len); +} + +inline void uudecode_1(char* dst, unsigned char* src) { + dst[0] = char((base64_bkw[src[0]] << 2) | (base64_bkw[src[1]] >> 4)); + dst[1] = char((base64_bkw[src[1]] << 4) | (base64_bkw[src[2]] >> 2)); + dst[2] = char((base64_bkw[src[2]] << 6) | base64_bkw[src[3]]); +} + +static size_t Base64DecodePlain(void* dst, const char* b, const char* e) { + size_t n = 0; + while (b < e) { + uudecode_1((char*)dst + n, (unsigned char*)b); + + b += 4; + n += 3; + } + + if (n > 0) { + if (b[-1] == ',' || b[-1] == '=') { + n--; + + if (b[-2] == ',' || b[-2] == '=') { + n--; + } + } + } + + return n; +} + +// Table for Base64StrictDecode +static const char base64_bkw_strict[] = + "\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100" + "\100\100\100\100\100\100\100\100\100\100\100\76\101\76\100\77\64\65\66\67\70\71\72\73\74\75\100\100\100\101\100\100" + 
"\100\0\1\2\3\4\5\6\7\10\11\12\13\14\15\16\17\20\21\22\23\24\25\26\27\30\31\100\100\100\100\77" + "\100\32\33\34\35\36\37\40\41\42\43\44\45\46\47\50\51\52\53\54\55\56\57\60\61\62\63\100\100\100\100\100" + "\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100" + "\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100" + "\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100" + "\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100\100"; + +size_t Base64StrictDecode(void* out, const char* b, const char* e) { + char* dst = (char*)out; + const unsigned char* src = (unsigned char*)b; + const unsigned char* const end = (unsigned char*)e; + + Y_ENSURE(!((e - b) % 4), "incorrect input length for base64 decode"); + + while (src < end) { + const char zeroth = base64_bkw_strict[src[0]]; + const char first = base64_bkw_strict[src[1]]; + const char second = base64_bkw_strict[src[2]]; + const char third = base64_bkw_strict[src[3]]; + + constexpr char invalid = 64; + constexpr char padding = 65; + if (Y_UNLIKELY(zeroth == invalid || first == invalid || + second == invalid || third == invalid || + zeroth == padding || first == padding)) + { + ythrow yexception() << "invalid character in input"; + } + + dst[0] = char((zeroth << 2) | (first >> 4)); + dst[1] = char((first << 4) | (second >> 2)); + dst[2] = char((second << 6) | third); + + src += 4; + dst += 3; + + if (src[-1] == ',' || src[-1] == '=') { + --dst; + + if (src[-2] == ',' || src[-2] == '=') { + --dst; + } + } else if (Y_UNLIKELY(src[-2] == ',' || src[-2] == '=')) { + ythrow yexception() << "incorrect padding"; + } + } + + return dst - (char*)out; +} + +size_t Base64Decode(void* dst, const char* b, const char* e) { + static const 
TImpl IMPL = GetImpl(); + const auto size = e - b; + Y_ENSURE(!(size % 4), "incorrect input length for base64 decode"); + if (Y_LIKELY(size < 8)) { + return Base64DecodePlain(dst, b, e); + } + + size_t outLen; + IMPL.Decode(b, size, (char*)dst, &outLen); + + return outLen; +} + +TString Base64DecodeUneven(const TStringBuf s) { + if (s.length() % 4 == 0) { + return Base64Decode(s); + } + + // padding to 4 + return Base64Decode(TString(s) + TString(4 - (s.length() % 4), '=')); +} + +char* Base64Encode(char* outstr, const unsigned char* instr, size_t len) { + static const TImpl IMPL = GetImpl(); + if (Y_LIKELY(len < 8)) { + return Base64EncodePlain(outstr, instr, len); + } + + size_t outLen; + IMPL.Encode((char*)instr, len, outstr, &outLen); + + *(outstr + outLen) = '\0'; + return outstr + outLen; +} diff --git a/library/cpp/string_utils/base64/base64.h b/library/cpp/string_utils/base64/base64.h new file mode 100644 index 0000000000..f778a6425a --- /dev/null +++ b/library/cpp/string_utils/base64/base64.h @@ -0,0 +1,130 @@ +#pragma once + +#include <util/system/defaults.h> +#include <util/generic/strbuf.h> +#include <util/generic/string.h> + +/* @return Size of the buffer required to decode Base64 encoded data of size `len`. + */ +constexpr size_t Base64DecodeBufSize(const size_t len) noexcept { + return (len + 3) / 4 * 3; +} + +/* Decode Base64 encoded data. Can decode both regular Base64 and Base64URL encoded data. Can decode + * only valid Base64[URL] data, behaviour for invalid data is unspecified. + * + * @throws Throws exception in case of incorrect padding. + * + * @param dst memory for writing output. + * @param b pointer to the beginning of base64 encoded string. + * @param e pointer to the end of base64 encoded string + * + * @return Return number of bytes decoded. 
+ */ +size_t Base64Decode(void* dst, const char* b, const char* e); + +inline TStringBuf Base64Decode(const TStringBuf src, void* dst) { + return TStringBuf((const char*)dst, Base64Decode(dst, src.begin(), src.end())); +} + +inline void Base64Decode(const TStringBuf src, TString& dst) { + dst.ReserveAndResize(Base64DecodeBufSize(src.size())); + dst.resize(Base64Decode(src, dst.begin()).size()); +} + +//WARNING: may silently process only part of the input; use Base64StrictDecode instead of this function +inline TString Base64Decode(const TStringBuf s) { + TString ret; + Base64Decode(s, ret); + return ret; +} + +/// +/// @brief Decodes Base64 string with strict verification +/// of invalid symbols, also tries to decode Base64 string with padding +/// inside. +// +/// @throws Throws exceptions on inputs which contain invalid symbols +/// or incorrect padding. +/// @{ +/// +/// @param b a pointer to the beginning of base64 encoded string. +/// @param e a pointer to the end of base64 encoded string. +/// @param dst memory for writing output. +/// +/// @return Returns number of bytes decoded. +/// +size_t Base64StrictDecode(void* dst, const char* b, const char* e); + +/// +/// @param src a base64 encoded string. +/// @param dst a pointer to allocated memory +/// for writing result. +/// +/// @return Returns dst wrapped into TStringBuf. +/// +inline TStringBuf Base64StrictDecode(const TStringBuf src, void* dst) { + return TStringBuf((const char*)dst, Base64StrictDecode(dst, src.begin(), src.end())); +} + +/// +/// @param src a base64 encoded string. +/// @param dst a decoded string. +/// +inline void Base64StrictDecode(const TStringBuf src, TString& dst) { + dst.ReserveAndResize(Base64DecodeBufSize(src.size())); + dst.resize(Base64StrictDecode(src, dst.begin()).size()); +} + +/// +/// @param src a base64 encoded string. +/// +/// @returns a decoded string. 
+/// +inline TString Base64StrictDecode(const TStringBuf src) { + TString ret; + Base64StrictDecode(src, ret); + return ret; +} +/// @} + +/// Works with strings which length is not divisible by 4. +TString Base64DecodeUneven(const TStringBuf s); + +//encode +constexpr size_t Base64EncodeBufSize(const size_t len) noexcept { + return (len + 2) / 3 * 4 + 1; +} + +char* Base64Encode(char* outstr, const unsigned char* instr, size_t len); +char* Base64EncodeUrl(char* outstr, const unsigned char* instr, size_t len); + +inline TStringBuf Base64Encode(const TStringBuf src, void* tmp) { + return TStringBuf((const char*)tmp, Base64Encode((char*)tmp, (const unsigned char*)src.data(), src.size())); +} + +inline TStringBuf Base64EncodeUrl(const TStringBuf src, void* tmp) { + return TStringBuf((const char*)tmp, Base64EncodeUrl((char*)tmp, (const unsigned char*)src.data(), src.size())); +} + +inline void Base64Encode(const TStringBuf src, TString& dst) { + dst.ReserveAndResize(Base64EncodeBufSize(src.size())); + dst.resize(Base64Encode(src, dst.begin()).size()); +} + +inline void Base64EncodeUrl(const TStringBuf src, TString& dst) { + dst.ReserveAndResize(Base64EncodeBufSize(src.size())); + dst.resize(Base64EncodeUrl(src, dst.begin()).size()); +} + +inline TString Base64Encode(const TStringBuf s) { + TString ret; + Base64Encode(s, ret); + return ret; +} + +inline TString Base64EncodeUrl(const TStringBuf s) { + TString ret; + Base64EncodeUrl(s, ret); + return ret; +} diff --git a/library/cpp/string_utils/base64/base64_decode_uneven_ut.cpp b/library/cpp/string_utils/base64/base64_decode_uneven_ut.cpp new file mode 100644 index 0000000000..c3ed068a37 --- /dev/null +++ b/library/cpp/string_utils/base64/base64_decode_uneven_ut.cpp @@ -0,0 +1,46 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <library/cpp/string_utils/base64/base64.h> + +Y_UNIT_TEST_SUITE(TBase64DecodeUneven) { + Y_UNIT_TEST(Base64DecodeUneven) { + const TString wikipedia_slogan = + "Man is 
distinguished, not only by his reason, " + "but by this singular passion from other animals, which is a lust of the " + "mind, that by a perseverance of delight in the continued and " + "indefatigable generation of knowledge, exceeds the short " + "vehemence of any carnal pleasure."; + const TString encoded = + "TWFuIGlzIGRpc3Rpbmd1aXNoZWQsIG5vdCBvbmx5IGJ5IGhpcyByZWFzb24sIGJ1dCBieSB0" + "aGlzIHNpbmd1bGFyIHBhc3Npb24gZnJvbSBvdGhlciBhbmltYWxzLCB3aGljaCBpcyBhIGx1" + "c3Qgb2YgdGhlIG1pbmQsIHRoYXQgYnkgYSBwZXJzZXZlcmFuY2Ugb2YgZGVsaWdodCBpbiB0" + "aGUgY29udGludWVkIGFuZCBpbmRlZmF0aWdhYmxlIGdlbmVyYXRpb24gb2Yga25vd2xlZGdl" + "LCBleGNlZWRzIHRoZSBzaG9ydCB2ZWhlbWVuY2Ugb2YgYW55IGNhcm5hbCBwbGVhc3VyZS4="; + + UNIT_ASSERT_VALUES_EQUAL(encoded, Base64Encode(wikipedia_slogan)); + UNIT_ASSERT_VALUES_EQUAL(wikipedia_slogan, Base64DecodeUneven(encoded)); + + const TString encoded_url1 = + "TWFuIGlzIGRpc3Rpbmd1aXNoZWQsIG5vdCBvbmx5IGJ5IGhpcyByZWFzb24sIGJ1dCBieSB0" + "aGlzIHNpbmd1bGFyIHBhc3Npb24gZnJvbSBvdGhlciBhbmltYWxzLCB3aGljaCBpcyBhIGx1" + "c3Qgb2YgdGhlIG1pbmQsIHRoYXQgYnkgYSBwZXJzZXZlcmFuY2Ugb2YgZGVsaWdodCBpbiB0" + "aGUgY29udGludWVkIGFuZCBpbmRlZmF0aWdhYmxlIGdlbmVyYXRpb24gb2Yga25vd2xlZGdl" + "LCBleGNlZWRzIHRoZSBzaG9ydCB2ZWhlbWVuY2Ugb2YgYW55IGNhcm5hbCBwbGVhc3VyZS4,"; + const TString encoded_url2 = + "TWFuIGlzIGRpc3Rpbmd1aXNoZWQsIG5vdCBvbmx5IGJ5IGhpcyByZWFzb24sIGJ1dCBieSB0" + "aGlzIHNpbmd1bGFyIHBhc3Npb24gZnJvbSBvdGhlciBhbmltYWxzLCB3aGljaCBpcyBhIGx1" + "c3Qgb2YgdGhlIG1pbmQsIHRoYXQgYnkgYSBwZXJzZXZlcmFuY2Ugb2YgZGVsaWdodCBpbiB0" + "aGUgY29udGludWVkIGFuZCBpbmRlZmF0aWdhYmxlIGdlbmVyYXRpb24gb2Yga25vd2xlZGdl" + "LCBleGNlZWRzIHRoZSBzaG9ydCB2ZWhlbWVuY2Ugb2YgYW55IGNhcm5hbCBwbGVhc3VyZS4"; + UNIT_ASSERT_VALUES_EQUAL(wikipedia_slogan, Base64DecodeUneven(encoded_url1)); + UNIT_ASSERT_VALUES_EQUAL(wikipedia_slogan, Base64DecodeUneven(encoded_url2)); + + const TString lp = "Linkin Park"; + UNIT_ASSERT_VALUES_EQUAL(lp, Base64DecodeUneven(Base64Encode(lp))); + UNIT_ASSERT_VALUES_EQUAL(lp, 
Base64DecodeUneven(Base64EncodeUrl(lp))); + + const TString dp = "ADP GmbH\nAnalyse Design & Programmierung\nGesellschaft mit beschränkter Haftung"; + UNIT_ASSERT_VALUES_EQUAL(dp, Base64DecodeUneven(Base64Encode(dp))); + UNIT_ASSERT_VALUES_EQUAL(dp, Base64DecodeUneven(Base64EncodeUrl(dp))); + } +} diff --git a/library/cpp/string_utils/base64/base64_ut.cpp b/library/cpp/string_utils/base64/base64_ut.cpp new file mode 100644 index 0000000000..bcc1e65879 --- /dev/null +++ b/library/cpp/string_utils/base64/base64_ut.cpp @@ -0,0 +1,497 @@ +#include "base64.h" + +#include <contrib/libs/base64/avx2/libbase64.h> +#include <contrib/libs/base64/neon32/libbase64.h> +#include <contrib/libs/base64/neon64/libbase64.h> +#include <contrib/libs/base64/plain32/libbase64.h> +#include <contrib/libs/base64/plain64/libbase64.h> +#include <contrib/libs/base64/ssse3/libbase64.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/vector.h> +#include <util/random/fast.h> +#include <util/system/cpu_id.h> +#include <util/system/platform.h> + +#include <array> + +using namespace std::string_view_literals; + +#define BASE64_UT_DECLARE_BASE64_IMPL(prefix, encFunction, decFunction) \ + Y_DECLARE_UNUSED \ + static size_t prefix##Base64Decode(void* dst, const char* b, const char* e) { \ + const auto size = e - b; \ + Y_ENSURE(!(size % 4), "incorrect input length for base64 decode"); \ + \ + size_t outLen; \ + decFunction(b, size, (char*)dst, &outLen); \ + return outLen; \ + } \ + \ + Y_DECLARE_UNUSED \ + static inline TStringBuf prefix##Base64Decode(const TStringBuf& src, void* dst) { \ + return TStringBuf((const char*)dst, ::NB64Etalon::prefix##Base64Decode(dst, src.begin(), src.end())); \ + } \ + \ + Y_DECLARE_UNUSED \ + static inline void prefix##Base64Decode(const TStringBuf& src, TString& dst) { \ + dst.ReserveAndResize(Base64DecodeBufSize(src.size())); \ + dst.resize(::NB64Etalon::prefix##Base64Decode(src, dst.begin()).size()); \ + } \ + \ + Y_DECLARE_UNUSED \ + 
static inline TString prefix##Base64Decode(const TStringBuf& s) { \ + TString ret; \ + prefix##Base64Decode(s, ret); \ + return ret; \ + } \ + \ + Y_DECLARE_UNUSED \ + static char* prefix##Base64Encode(char* outstr, const unsigned char* instr, size_t len) { \ + size_t outLen; \ + encFunction((char*)instr, len, outstr, &outLen); \ + *(outstr + outLen) = '\0'; \ + return outstr + outLen; \ + } \ + \ + Y_DECLARE_UNUSED \ + static inline TStringBuf prefix##Base64Encode(const TStringBuf& src, void* tmp) { \ + return TStringBuf((const char*)tmp, ::NB64Etalon::prefix##Base64Encode((char*)tmp, (const unsigned char*)src.data(), src.size())); \ + } \ + \ + Y_DECLARE_UNUSED \ + static inline void prefix##Base64Encode(const TStringBuf& src, TString& dst) { \ + dst.ReserveAndResize(Base64EncodeBufSize(src.size())); \ + dst.resize(::NB64Etalon::prefix##Base64Encode(src, dst.begin()).size()); \ + } \ + \ + Y_DECLARE_UNUSED \ + static inline TString prefix##Base64Encode(const TStringBuf& s) { \ + TString ret; \ + prefix##Base64Encode(s, ret); \ + return ret; \ + } + +namespace NB64Etalon { + BASE64_UT_DECLARE_BASE64_IMPL(PLAIN32, plain32_base64_encode, plain32_base64_decode); + BASE64_UT_DECLARE_BASE64_IMPL(PLAIN64, plain64_base64_encode, plain64_base64_decode); + BASE64_UT_DECLARE_BASE64_IMPL(NEON32, neon32_base64_encode, neon32_base64_decode); + BASE64_UT_DECLARE_BASE64_IMPL(NEON64, neon64_base64_encode, neon64_base64_decode); + BASE64_UT_DECLARE_BASE64_IMPL(AVX2, avx2_base64_encode, avx2_base64_decode); + BASE64_UT_DECLARE_BASE64_IMPL(SSSE3, ssse3_base64_encode, ssse3_base64_decode); + +#undef BASE64_UT_DECLARE_BASE64_IMPL + + struct TImpls { + enum EImpl : size_t { + PLAIN32_IMPL, + PLAIN64_IMPL, + NEON32_IMPL, + NEON64_IMPL, + AVX2_IMPL, + SSSE3_IMPL, + MAX_IMPL + }; + + using TEncodeF = void (*)(const TStringBuf&, TString&); + using TDecodeF = void (*)(const TStringBuf&, TString&); + + struct TImpl { + TEncodeF Encode = nullptr; + TDecodeF Decode = nullptr; + }; + + 
std::array<TImpl, MAX_IMPL> Impl; + + TImpls() { + Impl[PLAIN32_IMPL].Encode = PLAIN32Base64Encode; + Impl[PLAIN32_IMPL].Decode = PLAIN32Base64Decode; + Impl[PLAIN64_IMPL].Encode = PLAIN64Base64Encode; + Impl[PLAIN64_IMPL].Decode = PLAIN64Base64Decode; +#if defined(_arm32_) + Impl[NEON32_IMPL].Encode = NEON32Base64Encode; + Impl[NEON32_IMPL].Decode = NEON32Base64Decode; +#elif defined(_arm64_) + Impl[NEON64_IMPL].Encode = NEON64Base64Encode; + Impl[NEON64_IMPL].Decode = NEON64Base64Decode; +#elif defined(_x86_64_) + if (NX86::HaveSSSE3()) { + Impl[SSSE3_IMPL].Encode = SSSE3Base64Encode; + Impl[SSSE3_IMPL].Decode = SSSE3Base64Decode; + } + + if (NX86::HaveAVX2()) { + Impl[AVX2_IMPL].Encode = AVX2Base64Encode; + Impl[AVX2_IMPL].Decode = AVX2Base64Decode; + } +#else + ythrow yexception() << "Failed to identify the platform"; +#endif + } + }; + + TImpls GetImpls() { + static const TImpls IMPLS; + return IMPLS; + } +} + +template <> +void Out<NB64Etalon::TImpls::EImpl>(IOutputStream& o, typename TTypeTraits<NB64Etalon::TImpls::EImpl>::TFuncParam v) { + switch (v) { + case NB64Etalon::TImpls::PLAIN32_IMPL: + o << TStringBuf{"PLAIN32"}; + return; + case NB64Etalon::TImpls::PLAIN64_IMPL: + o << TStringBuf{"PLAIN64"}; + return; + case NB64Etalon::TImpls::NEON64_IMPL: + o << TStringBuf{"NEON64"}; + return; + case NB64Etalon::TImpls::NEON32_IMPL: + o << TStringBuf{"NEON32"}; + return; + case NB64Etalon::TImpls::SSSE3_IMPL: + o << TStringBuf{"SSSE3"}; + return; + case NB64Etalon::TImpls::AVX2_IMPL: + o << TStringBuf{"AVX2"}; + return; + default: + ythrow yexception() << "invalid"; + } +} + +static void TestEncodeDecodeIntoString(const TString& plain, const TString& encoded, const TString& encodedUrl) { + TString a, b; + + Base64Encode(plain, a); + UNIT_ASSERT_VALUES_EQUAL(a, encoded); + + Base64Decode(a, b); + UNIT_ASSERT_VALUES_EQUAL(b, plain); + + Base64EncodeUrl(plain, a); + UNIT_ASSERT_VALUES_EQUAL(a, encodedUrl); + + Base64Decode(a, b); + UNIT_ASSERT_VALUES_EQUAL(b, 
plain); +} + +static void TestEncodeStrictDecodeIntoString(const TString& plain, const TString& encoded, const TString& encodedUrl) { + TString a, b; + + Base64Encode(plain, a); + UNIT_ASSERT_VALUES_EQUAL(a, encoded); + + Base64StrictDecode(a, b); + UNIT_ASSERT_VALUES_EQUAL(b, plain); + + Base64EncodeUrl(plain, a); + UNIT_ASSERT_VALUES_EQUAL(a, encodedUrl); + + Base64StrictDecode(a, b); + UNIT_ASSERT_VALUES_EQUAL(b, plain); +} + +Y_UNIT_TEST_SUITE(TBase64) { + Y_UNIT_TEST(TestEncode) { + UNIT_ASSERT_VALUES_EQUAL(Base64Encode("12z"), "MTJ6"); + UNIT_ASSERT_VALUES_EQUAL(Base64Encode("123"), "MTIz"); + UNIT_ASSERT_VALUES_EQUAL(Base64Encode("12"), "MTI="); + UNIT_ASSERT_VALUES_EQUAL(Base64Encode("1"), "MQ=="); + } + + Y_UNIT_TEST(TestIntoString) { + { + TString str; + for (size_t i = 0; i < 256; ++i) + str += char(i); + + const TString base64 = + "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJy" + "gpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9Q" + "UVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eH" + "l6e3x9fn+AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6Ch" + "oqOkpaanqKmqq6ytrq+wsbKztLW2t7i5uru8vb6/wMHCw8TFxsfIyc" + "rLzM3Oz9DR0tPU1dbX2Nna29zd3t/g4eLj5OXm5+jp6uvs7e7v8PHy" + "8/T19vf4+fr7/P3+/w=="; + const TString base64Url = + "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJy" + "gpKissLS4vMDEyMzQ1Njc4OTo7PD0-P0BBQkNERUZHSElKS0xNTk9Q" + "UVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eH" + "l6e3x9fn-AgYKDhIWGh4iJiouMjY6PkJGSk5SVlpeYmZqbnJ2en6Ch" + "oqOkpaanqKmqq6ytrq-wsbKztLW2t7i5uru8vb6_wMHCw8TFxsfIyc" + "rLzM3Oz9DR0tPU1dbX2Nna29zd3t_g4eLj5OXm5-jp6uvs7e7v8PHy" + "8_T19vf4-fr7_P3-_w,,"; + + TestEncodeDecodeIntoString(str, base64, base64Url); + TestEncodeStrictDecodeIntoString(str, base64, base64Url); + } + + { + const TString str = "http://yandex.ru:1234/request?param=value&lll=fff#fragment"; + + const TString base64 = "aHR0cDovL3lhbmRleC5ydToxMjM0L3JlcXVlc3Q/cGFyYW09dmFsdWUmbGxsPWZmZiNmcmFnbWVudA=="; + const TString base64Url = 
"aHR0cDovL3lhbmRleC5ydToxMjM0L3JlcXVlc3Q_cGFyYW09dmFsdWUmbGxsPWZmZiNmcmFnbWVudA,,"; + + TestEncodeDecodeIntoString(str, base64, base64Url); + TestEncodeStrictDecodeIntoString(str, base64, base64Url); + } + } + + Y_UNIT_TEST(TestDecode) { + UNIT_ASSERT_EXCEPTION(Base64Decode("a"), yexception); + UNIT_ASSERT_EXCEPTION(Base64StrictDecode("a"), yexception); + + UNIT_ASSERT_VALUES_EQUAL(Base64Decode(""), ""); + UNIT_ASSERT_VALUES_EQUAL(Base64StrictDecode(""), ""); + + UNIT_ASSERT_VALUES_EQUAL(Base64Decode("MTI="), "12"); + UNIT_ASSERT_VALUES_EQUAL(Base64StrictDecode("MTI="), "12"); + + UNIT_ASSERT_VALUES_EQUAL(Base64Decode("QQ=="), "A"); + UNIT_ASSERT_VALUES_EQUAL(Base64StrictDecode("QQ=="), "A"); + + UNIT_ASSERT_EXCEPTION(Base64StrictDecode("M=I="), yexception); + + UNIT_ASSERT_VALUES_EQUAL(Base64Decode("dnluZHg="), "vyndx"); + UNIT_ASSERT_VALUES_EQUAL(Base64StrictDecode("dnluZHg="), "vyndx"); + + UNIT_ASSERT_VALUES_EQUAL(Base64StrictDecode("dnluZHg=dmlkZW8="), "vyndxvideo"); + + UNIT_ASSERT_EXCEPTION(Base64StrictDecode("aHR0cDovL2ltZy5tZWdhLXBvcm5vLnJ1Lw=a"), yexception); + + UNIT_ASSERT_EXCEPTION(Base64StrictDecode("aHh=="), yexception); + UNIT_ASSERT_EXCEPTION(Base64StrictDecode("\1\1\1\2"), yexception); + } + + Y_UNIT_TEST(TestDecodeUneven) { + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven(""), ""); + + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("YWFh"), "aaa"); + + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("MTI="), "12"); + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("MTI,"), "12"); + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("MTI"), "12"); + + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("QQ=="), "A"); + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("QQ,,"), "A"); + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("QQ"), "A"); + + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("dnluZHg="), "vyndx"); + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("dnluZHg,"), "vyndx"); + UNIT_ASSERT_VALUES_EQUAL(Base64DecodeUneven("dnluZHg"), "vyndx"); + } + + Y_UNIT_TEST(TestDecodeRandom) 
{ + TString input; + constexpr size_t testSize = 240000; + for (size_t i = 0; i < testSize; ++i) { + input.push_back(rand() % 256); + } + TString output; + TString encoded = Base64Encode(input); + UNIT_ASSERT_VALUES_EQUAL(Base64Decode(encoded), input); + UNIT_ASSERT_VALUES_EQUAL(Base64StrictDecode(encoded), input); + } + + Y_UNIT_TEST(TestAllPossibleOctets) { + const TString x("\0\x01\x02\x03\x04\x05\x06\x07\b\t\n\x0B\f\r\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F"sv); + const TString xEnc = "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3x9fn8="; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + Y_UNIT_TEST(TestTwoPaddingCharacters) { + const TString x("a"); + const TString xEnc = "YQ=="; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + Y_UNIT_TEST(TestOnePaddingCharacter) { + const TString x("aa"); + const TString xEnc = "YWE="; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + Y_UNIT_TEST(TestNoPaddingCharacters) { + const TString x("aaa"); + const TString xEnc = "YWFh"; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + Y_UNIT_TEST(TestTrailingZero) { + const TString x("foo\0"sv); + const TString xEnc = "Zm9vAA=="; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + 
Y_UNIT_TEST(TestTwoTrailingZeroes) { + const TString x("foo\0\0"sv); + const TString xEnc = "Zm9vAAA="; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + Y_UNIT_TEST(TestZero) { + const TString x("\0"sv); + const TString xEnc = "AA=="; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + Y_UNIT_TEST(TestSymbolsAfterZero) { + const TString x("\0a"sv); + const TString xEnc = "AGE="; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + Y_UNIT_TEST(TestEmptyString) { + const TString x = ""; + const TString xEnc = ""; + const TString y = Base64Decode(xEnc); + const TString yEnc = Base64Encode(x); + UNIT_ASSERT_VALUES_EQUAL(x, y); + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + } + + Y_UNIT_TEST(TestBackendsConsistencyOnRandomData) { + constexpr size_t TEST_CASES_COUNT = 1000; + constexpr size_t MAX_DATA_SIZE = 1000; + TFastRng<ui32> prng{42}; + TVector<TString> xs{TEST_CASES_COUNT}; + TString xEnc; + TString xDec; + TString yEnc; + TString yDec; + + for (auto& x : xs) { + const size_t size = prng() % MAX_DATA_SIZE; + for (size_t j = 0; j < size; ++j) { + x += static_cast<char>(prng() % 256); + } + } + + static const auto IMPLS = NB64Etalon::GetImpls(); + for (size_t i = 0; i < static_cast<size_t>(NB64Etalon::TImpls::MAX_IMPL); ++i) { + for (size_t j = 0; j < static_cast<size_t>(NB64Etalon::TImpls::MAX_IMPL); ++j) { + const auto ei = static_cast<NB64Etalon::TImpls::EImpl>(i); + const auto ej = static_cast<NB64Etalon::TImpls::EImpl>(j); + const auto impl = IMPLS.Impl[i]; + const auto otherImpl = IMPLS.Impl[j]; + if (!impl.Encode && !impl.Decode || !otherImpl.Encode && !otherImpl.Decode) { + continue; + } + + for (const auto& x : xs) { + impl.Encode(x, 
xEnc); + impl.Decode(xEnc, xDec); + Y_ENSURE(x == xDec, "something is wrong with " << ei << " implementation"); + + otherImpl.Encode(x, yEnc); + otherImpl.Decode(xEnc, yDec); + Y_ENSURE(x == yDec, "something is wrong with " << ej << " implementation"); + + UNIT_ASSERT_VALUES_EQUAL(xEnc, yEnc); + UNIT_ASSERT_VALUES_EQUAL(xDec, yDec); + } + } + } + } + + Y_UNIT_TEST(TestIfEncodedDataIsZeroTerminatedOnRandomData) { + constexpr size_t TEST_CASES_COUNT = 1000; + constexpr size_t MAX_DATA_SIZE = 1000; + TFastRng<ui32> prng{42}; + TString x; + TVector<char> buf; + for (size_t i = 0; i < TEST_CASES_COUNT; ++i) { + const size_t size = prng() % MAX_DATA_SIZE; + x.clear(); + for (size_t j = 0; j < size; ++j) { + x += static_cast<char>(prng() % 256); + } + + buf.assign(Base64EncodeBufSize(x.size()), Max<char>()); + const auto* const xEncEnd = Base64Encode(buf.data(), (const unsigned char*)x.data(), x.size()); + UNIT_ASSERT_VALUES_EQUAL(*xEncEnd, '\0'); + } + } + + Y_UNIT_TEST(TestDecodeURLEncodedNoPadding) { + const auto x = "123"; + const auto xDec = Base64Decode("MTIz"); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } + + Y_UNIT_TEST(TestDecodeURLEncodedOnePadding) { + const auto x = "12"; + const auto xDec = Base64Decode("MTI,"); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } + + Y_UNIT_TEST(TestDecodeURLEncodedTwoPadding) { + const auto x = "1"; + const auto xDec = Base64Decode("MQ,,"); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } + + Y_UNIT_TEST(TestDecodeNoPaddingLongString) { + const auto x = "How do I convert between big-endian and little-endian values in C++?a"; + const auto xDec = Base64Decode("SG93IGRvIEkgY29udmVydCBiZXR3ZWVuIGJpZy1lbmRpYW4gYW5kIGxpdHRsZS1lbmRpYW4gdmFsdWVzIGluIEMrKz9h"); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } + + Y_UNIT_TEST(TestDecodeOnePaddingLongString) { + const auto x = "How do I convert between big-endian and little-endian values in C++?"; + const auto xDec = 
Base64Decode("SG93IGRvIEkgY29udmVydCBiZXR3ZWVuIGJpZy1lbmRpYW4gYW5kIGxpdHRsZS1lbmRpYW4gdmFsdWVzIGluIEMrKz8="); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } + + Y_UNIT_TEST(TestDecodeTwoPaddingLongString) { + const auto x = "How do I convert between big-endian and little-endian values in C++?aa"; + const auto xDec = Base64Decode("SG93IGRvIEkgY29udmVydCBiZXR3ZWVuIGJpZy1lbmRpYW4gYW5kIGxpdHRsZS1lbmRpYW4gdmFsdWVzIGluIEMrKz9hYQ=="); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } + + Y_UNIT_TEST(TestDecodeURLEncodedNoPaddingLongString) { + const auto x = "How do I convert between big-endian and little-endian values in C++?a"; + const auto xDec = Base64Decode("SG93IGRvIEkgY29udmVydCBiZXR3ZWVuIGJpZy1lbmRpYW4gYW5kIGxpdHRsZS1lbmRpYW4gdmFsdWVzIGluIEMrKz9h"); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } + + Y_UNIT_TEST(TestDecodeURLEncodedOnePaddingLongString) { + const auto x = "How do I convert between big-endian and little-endian values in C++?"; + const auto xDec = Base64Decode("SG93IGRvIEkgY29udmVydCBiZXR3ZWVuIGJpZy1lbmRpYW4gYW5kIGxpdHRsZS1lbmRpYW4gdmFsdWVzIGluIEMrKz8,"); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } + + Y_UNIT_TEST(TestDecodeURLEncodedTwoPaddingLongString) { + const auto x = "How do I convert between big-endian and little-endian values in C++?aa"; + const auto xDec = Base64Decode("SG93IGRvIEkgY29udmVydCBiZXR3ZWVuIGJpZy1lbmRpYW4gYW5kIGxpdHRsZS1lbmRpYW4gdmFsdWVzIGluIEMrKz9hYQ,,"); + UNIT_ASSERT_VALUES_EQUAL(x, xDec); + } +} diff --git a/library/cpp/string_utils/base64/bench/main.cpp b/library/cpp/string_utils/base64/bench/main.cpp new file mode 100644 index 0000000000..10e09bc1c7 --- /dev/null +++ b/library/cpp/string_utils/base64/bench/main.cpp @@ -0,0 +1,326 @@ +#include <library/cpp/string_utils/base64/base64.h> + +#include <library/cpp/testing/benchmark/bench.h> + +#include <util/generic/buffer.h> +#include <util/generic/singleton.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/xrange.h> +#include <util/generic/yexception.h> 
+#include <util/random/random.h> + +#include <array> + +static TString GenerateRandomData(const size_t minSize, const size_t maxSize) { + Y_ENSURE(minSize <= maxSize, "wow"); + TString r; + for (size_t i = 0; i < minSize; ++i) { + r.push_back(RandomNumber<char>()); + } + + if (minSize == maxSize) { + return r; + } + + const size_t size = RandomNumber<size_t>() % (maxSize - minSize + 1); + for (size_t i = 0; i < size; ++i) { + r.push_back(RandomNumber<char>()); + } + + return r; +} + +template <size_t N> +static std::array<TString, N> GenerateRandomDataVector(const size_t minSize, const size_t maxSize) { + std::array<TString, N> r; + for (size_t i = 0; i < N; ++i) { + r[i] = GenerateRandomData(minSize, maxSize); + } + + return r; +} + +template <size_t N> +static std::array<TString, N> Encode(const std::array<TString, N>& d) { + std::array<TString, N> r; + for (size_t i = 0, iEnd = d.size(); i < iEnd; ++i) { + r[i] = Base64Encode(d[i]); + } + + return r; +} + +namespace { + template <size_t N, size_t MinSize, size_t MaxSize> + struct TRandomDataHolder { + TRandomDataHolder() + : Data(GenerateRandomDataVector<N>(MinSize, MaxSize)) + , DataEncoded(Encode<N>(Data)) + { + for (size_t i = 0; i < N; ++i) { + const size_t size = Data[i].size(); + const size_t sizeEnc = DataEncoded[i].size(); + PlaceToEncode[i].Resize(Base64EncodeBufSize(size)); + PlaceToDecode[i].Resize(Base64DecodeBufSize(sizeEnc)); + } + } + + static constexpr size_t Size = N; + const std::array<TString, N> Data; + const std::array<TString, N> DataEncoded; + std::array<TBuffer, N> PlaceToEncode; + std::array<TBuffer, N> PlaceToDecode; + }; + + template <size_t N, size_t Size> + using TFixedSizeRandomDataHolder = TRandomDataHolder<N, Size, Size>; + + using FSRDH_1 = TFixedSizeRandomDataHolder<10, 1>; + using FSRDH_2 = TFixedSizeRandomDataHolder<10, 2>; + using FSRDH_4 = TFixedSizeRandomDataHolder<10, 4>; + using FSRDH_8 = TFixedSizeRandomDataHolder<10, 8>; + using FSRDH_16 = TFixedSizeRandomDataHolder<10, 
16>; + using FSRDH_32 = TFixedSizeRandomDataHolder<10, 32>; + using FSRDH_64 = TFixedSizeRandomDataHolder<10, 64>; + using FSRDH_128 = TFixedSizeRandomDataHolder<10, 128>; + using FSRDH_1024 = TFixedSizeRandomDataHolder<10, 1024>; + using FSRDH_10240 = TFixedSizeRandomDataHolder<10, 10240>; + using FSRDH_102400 = TFixedSizeRandomDataHolder<10, 102400>; + using FSRDH_1048576 = TFixedSizeRandomDataHolder<10, 1048576>; + using FSRDH_10485760 = TFixedSizeRandomDataHolder<10, 10485760>; +} + +template <typename T> +static inline void BenchEncode(T& d, const NBench::NCpu::TParams& iface) { + for (const auto it : xrange(iface.Iterations())) { + Y_UNUSED(it); + for (size_t i = 0; i < d.Size; ++i) { + NBench::Escape(d.PlaceToEncode[i].data()); + Y_DO_NOT_OPTIMIZE_AWAY( + Base64Encode(d.PlaceToEncode[i].data(), (const unsigned char*)d.Data[i].data(), d.Data[i].size())); + NBench::Clobber(); + } + } +} + +template <typename T> +static inline void BenchEncodeUrl(T& d, const NBench::NCpu::TParams& iface) { + for (const auto it : xrange(iface.Iterations())) { + Y_UNUSED(it); + for (size_t i = 0; i < d.Size; ++i) { + NBench::Escape(d.PlaceToEncode[i].data()); + Y_DO_NOT_OPTIMIZE_AWAY( + Base64EncodeUrl(d.PlaceToEncode[i].data(), (const unsigned char*)d.Data[i].data(), d.Data[i].size())); + NBench::Clobber(); + } + } +} + +template <typename T> +static inline void BenchDecode(T& d, const NBench::NCpu::TParams& iface) { + for (const auto it : xrange(iface.Iterations())) { + Y_UNUSED(it); + for (size_t i = 0; i < d.Size; ++i) { + NBench::Escape(d.PlaceToDecode[i].data()); + Y_DO_NOT_OPTIMIZE_AWAY( + Base64Decode(d.PlaceToDecode[i].data(), (const char*)d.DataEncoded[i].data(), (const char*)(d.DataEncoded[i].data() + d.DataEncoded[i].size()))); + NBench::Clobber(); + } + } +} + +Y_CPU_BENCHMARK(EncodeF1, iface) { + auto& d = *Singleton<FSRDH_1>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF1, iface) { + auto& d = *Singleton<FSRDH_1>(); + BenchDecode(d, iface); +} + 
+Y_CPU_BENCHMARK(EncodeF2, iface) { + auto& d = *Singleton<FSRDH_2>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF2, iface) { + auto& d = *Singleton<FSRDH_2>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF4, iface) { + auto& d = *Singleton<FSRDH_4>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF4, iface) { + auto& d = *Singleton<FSRDH_4>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF8, iface) { + auto& d = *Singleton<FSRDH_8>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF8, iface) { + auto& d = *Singleton<FSRDH_8>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF16, iface) { + auto& d = *Singleton<FSRDH_16>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF16, iface) { + auto& d = *Singleton<FSRDH_16>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF32, iface) { + auto& d = *Singleton<FSRDH_32>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF32, iface) { + auto& d = *Singleton<FSRDH_32>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF64, iface) { + auto& d = *Singleton<FSRDH_64>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF64, iface) { + auto& d = *Singleton<FSRDH_64>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF128, iface) { + auto& d = *Singleton<FSRDH_128>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF128, iface) { + auto& d = *Singleton<FSRDH_128>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF1024, iface) { + auto& d = *Singleton<FSRDH_1024>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF1024, iface) { + auto& d = *Singleton<FSRDH_1024>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF10240, iface) { + auto& d = *Singleton<FSRDH_10240>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF10240, iface) { + auto& d = *Singleton<FSRDH_10240>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF102400, iface) { + auto& d = *Singleton<FSRDH_102400>(); + BenchEncode(d, iface); +} + 
+Y_CPU_BENCHMARK(DecodeF102400, iface) { + auto& d = *Singleton<FSRDH_102400>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF1048576, iface) { + auto& d = *Singleton<FSRDH_1048576>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF1048576, iface) { + auto& d = *Singleton<FSRDH_1048576>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeF10485760, iface) { + auto& d = *Singleton<FSRDH_10485760>(); + BenchEncode(d, iface); +} + +Y_CPU_BENCHMARK(DecodeF10485760, iface) { + auto& d = *Singleton<FSRDH_10485760>(); + BenchDecode(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF1, iface) { + auto& d = *Singleton<FSRDH_1>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF2, iface) { + auto& d = *Singleton<FSRDH_2>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF4, iface) { + auto& d = *Singleton<FSRDH_4>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF8, iface) { + auto& d = *Singleton<FSRDH_8>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF16, iface) { + auto& d = *Singleton<FSRDH_16>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF32, iface) { + auto& d = *Singleton<FSRDH_32>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF64, iface) { + auto& d = *Singleton<FSRDH_64>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF128, iface) { + auto& d = *Singleton<FSRDH_128>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF1024, iface) { + auto& d = *Singleton<FSRDH_1024>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF10240, iface) { + auto& d = *Singleton<FSRDH_10240>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF102400, iface) { + auto& d = *Singleton<FSRDH_102400>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF1048576, iface) { + auto& d = *Singleton<FSRDH_1048576>(); + BenchEncodeUrl(d, iface); +} + +Y_CPU_BENCHMARK(EncodeUrlF10485760, iface) { + auto& d = *Singleton<FSRDH_10485760>(); + 
BenchEncodeUrl(d, iface); +} diff --git a/library/cpp/string_utils/base64/bench/metrics/main.py b/library/cpp/string_utils/base64/bench/metrics/main.py new file mode 100644 index 0000000000..c35fd6d8cd --- /dev/null +++ b/library/cpp/string_utils/base64/bench/metrics/main.py @@ -0,0 +1,5 @@ +import yatest.common as yc + + +def test_export_metrics(metrics): + metrics.set_benchmark(yc.execute_benchmark('library/cpp/string_utils/base64/bench/bench')) diff --git a/library/cpp/string_utils/base64/bench/metrics/ya.make b/library/cpp/string_utils/base64/bench/metrics/ya.make new file mode 100644 index 0000000000..b0406516c3 --- /dev/null +++ b/library/cpp/string_utils/base64/bench/metrics/ya.make @@ -0,0 +1,20 @@ +OWNER( + yazevnul + g:util +) + +PY2TEST() + +SIZE(LARGE) + +TAG( + ya:force_sandbox + sb:intel_e5_2660v1 + ya:fat +) + +TEST_SRCS(main.py) + +DEPENDS(library/cpp/string_utils/base64/bench) + +END() diff --git a/library/cpp/string_utils/base64/bench/ya.make b/library/cpp/string_utils/base64/bench/ya.make new file mode 100644 index 0000000000..5ac5f3d6ce --- /dev/null +++ b/library/cpp/string_utils/base64/bench/ya.make @@ -0,0 +1,16 @@ +OWNER( + yazevnul + g:util +) + +Y_BENCHMARK() + +SRCS( + main.cpp +) + +PEERDIR( + library/cpp/string_utils/base64 +) + +END() diff --git a/library/cpp/string_utils/base64/fuzz/generic/ya.make b/library/cpp/string_utils/base64/fuzz/generic/ya.make new file mode 100644 index 0000000000..d155e2b0a0 --- /dev/null +++ b/library/cpp/string_utils/base64/fuzz/generic/ya.make @@ -0,0 +1,12 @@ +OWNER( + yazevnul + g:util +) + +FUZZ() + +PEERDIR( + library/cpp/string_utils/base64/fuzz/lib +) + +END() diff --git a/library/cpp/string_utils/base64/fuzz/lib/main.cpp b/library/cpp/string_utils/base64/fuzz/lib/main.cpp new file mode 100644 index 0000000000..28547ae7a5 --- /dev/null +++ b/library/cpp/string_utils/base64/fuzz/lib/main.cpp @@ -0,0 +1,13 @@ +#include <library/cpp/string_utils/base64/base64.h> + +#include <util/system/types.h> 
+#include <util/system/yassert.h> + +extern "C" int LLVMFuzzerTestOneInput(const ui8* data, size_t size) { + const TStringBuf example{reinterpret_cast<const char*>(data), size}; + const auto converted = Base64Decode(Base64Encode(example)); + + Y_VERIFY(example == converted); + + return 0; +} diff --git a/library/cpp/string_utils/base64/fuzz/lib/ya.make b/library/cpp/string_utils/base64/fuzz/lib/ya.make new file mode 100644 index 0000000000..7b981b86a3 --- /dev/null +++ b/library/cpp/string_utils/base64/fuzz/lib/ya.make @@ -0,0 +1,16 @@ +OWNER( + yazevnul + g:util +) + +LIBRARY() + +SRCS( + main.cpp +) + +PEERDIR( + library/cpp/string_utils/base64 +) + +END() diff --git a/library/cpp/string_utils/base64/fuzz/uneven/main.cpp b/library/cpp/string_utils/base64/fuzz/uneven/main.cpp new file mode 100644 index 0000000000..915e81a7e5 --- /dev/null +++ b/library/cpp/string_utils/base64/fuzz/uneven/main.cpp @@ -0,0 +1,10 @@ +#include <library/cpp/string_utils/base64/base64.h> + +#include <util/system/types.h> +#include <util/system/yassert.h> + +extern "C" int LLVMFuzzerTestOneInput(const ui8* data, size_t size) { + const TStringBuf example{reinterpret_cast<const char*>(data), size}; + Y_UNUSED(Base64DecodeUneven(example)); + return 0; +} diff --git a/library/cpp/string_utils/base64/fuzz/uneven/ya.make b/library/cpp/string_utils/base64/fuzz/uneven/ya.make new file mode 100644 index 0000000000..18cb18ef52 --- /dev/null +++ b/library/cpp/string_utils/base64/fuzz/uneven/ya.make @@ -0,0 +1,15 @@ +FUZZ() + +OWNER( + g:util +) + +SRCS( + main.cpp +) + +PEERDIR( + library/cpp/string_utils/base64 +) + +END() diff --git a/library/cpp/string_utils/base64/fuzz/ya.make b/library/cpp/string_utils/base64/fuzz/ya.make new file mode 100644 index 0000000000..bef82061c4 --- /dev/null +++ b/library/cpp/string_utils/base64/fuzz/ya.make @@ -0,0 +1,10 @@ +OWNER( + yazevnul + g:util +) + +RECURSE( + generic + lib + uneven +) diff --git a/library/cpp/string_utils/base64/ut/ya.make 
b/library/cpp/string_utils/base64/ut/ya.make new file mode 100644 index 0000000000..9b61241f0e --- /dev/null +++ b/library/cpp/string_utils/base64/ut/ya.make @@ -0,0 +1,22 @@ +OWNER( + g:util + yazevnul +) + +UNITTEST_FOR(library/cpp/string_utils/base64) + +SRCS( + base64_ut.cpp + base64_decode_uneven_ut.cpp +) + +PEERDIR( + contrib/libs/base64/avx2 + contrib/libs/base64/ssse3 + contrib/libs/base64/neon32 + contrib/libs/base64/neon64 + contrib/libs/base64/plain32 + contrib/libs/base64/plain64 +) + +END() diff --git a/library/cpp/string_utils/base64/ya.make b/library/cpp/string_utils/base64/ya.make new file mode 100644 index 0000000000..f5258c446c --- /dev/null +++ b/library/cpp/string_utils/base64/ya.make @@ -0,0 +1,23 @@ +OWNER( + g:util + yazevnul +) + +LIBRARY() + +SRCS( + base64.cpp +) + +PEERDIR( + contrib/libs/base64/avx2 + contrib/libs/base64/ssse3 + contrib/libs/base64/neon32 + contrib/libs/base64/neon64 + contrib/libs/base64/plain32 + contrib/libs/base64/plain64 +) + +END() + +RECURSE_FOR_TESTS(ut) diff --git a/library/cpp/string_utils/indent_text/indent_text.cpp b/library/cpp/string_utils/indent_text/indent_text.cpp new file mode 100644 index 0000000000..09a4f6bca8 --- /dev/null +++ b/library/cpp/string_utils/indent_text/indent_text.cpp @@ -0,0 +1,25 @@ +#include "indent_text.h" + +#include <util/stream/str.h> + +TString IndentText(TStringBuf text, TStringBuf indent) { + if (text.empty()) + return TString(); + + TStringStream ss; + ss.Reserve(text.size() + 20); + + char pc = 0; + for (size_t i = 0; i < text.size(); ++i) { + if (i == 0 || pc == '\n') + ss << indent; + + char c = text.at(i); + ss << c; + pc = c; + } + if (pc != '\n') + ss << '\n'; + + return ss.Str(); +} diff --git a/library/cpp/string_utils/indent_text/indent_text.h b/library/cpp/string_utils/indent_text/indent_text.h new file mode 100644 index 0000000000..7117d6c0ee --- /dev/null +++ b/library/cpp/string_utils/indent_text/indent_text.h @@ -0,0 +1,6 @@ +#pragma once + +#include 
<util/generic/string.h> +#include <util/generic/strbuf.h> + +TString IndentText(TStringBuf text, TStringBuf indent = TStringBuf(" ")); diff --git a/library/cpp/string_utils/indent_text/ya.make b/library/cpp/string_utils/indent_text/ya.make new file mode 100644 index 0000000000..cd0ed9ec61 --- /dev/null +++ b/library/cpp/string_utils/indent_text/ya.make @@ -0,0 +1,9 @@ +LIBRARY() + +OWNER(nga) + +SRCS( + indent_text.cpp +) + +END() diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp new file mode 100644 index 0000000000..8883d7df07 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.cpp @@ -0,0 +1 @@ +#include "levenshtein_diff.h" diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h new file mode 100644 index 0000000000..8a240bfed8 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h @@ -0,0 +1,192 @@ +#pragma once + +#include <util/draft/matrix.h> +#include <util/generic/algorithm.h> +#include <util/generic/vector.h> +#include <util/system/yassert.h> + +#include <type_traits> +#include <utility> + +namespace NLevenshtein { + enum EEditMoveType { + EMT_SPECIAL, + EMT_PRESERVE, + EMT_REPLACE, + EMT_DELETE, + EMT_INSERT + }; + + inline bool IsImportantEditMove(EEditMoveType p) { + return (p != EMT_SPECIAL && p != EMT_PRESERVE); + } + + inline void MakeMove(EEditMoveType t, int& p1, int& p2) { + switch (t) { + case EMT_PRESERVE: + case EMT_REPLACE: + p1++; + p2++; + break; + case EMT_DELETE: + p1++; + break; + case EMT_INSERT: + p2++; + break; + default: + break; + } + } + + using TEditChain = TVector<EEditMoveType>; + + template <typename TArgType> + struct TWeightOneUnaryGetter { + int operator()(const TArgType&) const { + return 1; + } + }; + + template <typename TArgType> + struct TWeightOneBinaryGetter { + int operator()(const TArgType&, 
const TArgType&) const { + return 1; + } + }; + + template <typename TStringType> + using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>; + + /// Finds sequence of "edit moves" for two strings + template <class TStringType, class TWeightType = int, + class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>, + class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>, + class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>> + > + void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr, + const TReplaceWeigher& replaceWeigher = TReplaceWeigher(), + const TDeleteWeigher& deleteWeigher = TDeleteWeigher(), + const TInsertWeigher& insertWeigher = TInsertWeigher()) + { + int l1 = (int)str1.size(); + int l2 = (int)str2.size(); + + TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1]) + ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point + for (int i = 1; i <= l1; i++) { + ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE); + } + for (int i = 1; i <= l2; i++) { + ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT); + } + // Here goes basic Levestein's algorithm + for (int i = 1; i <= l1; i++) { + for (int j = 1; j <= l2; j++) { + if (str1[i - 1] == str2[j - 1]) { + ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE); + } else { + const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]); + Y_ASSERT(replaceWeight >= 0); + ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE); + } + + if (ma[i][j].first > ma[i - 1][j].first) { + const TWeightType deleteWeight = deleteWeigher(str1[i - 1]); + Y_ASSERT(deleteWeight >= 0); + const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight; + if (deletePathWeight <= ma[i][j].first) { + ma[i][j] = 
std::make_pair(deletePathWeight, EMT_DELETE); + } + } + + if (ma[i][j].first > ma[i][j - 1].first) { + const TWeightType insertWeight = insertWeigher(str2[j - 1]); + Y_ASSERT(insertWeight >= 0); + const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight; + if (insertPathWeight <= ma[i][j].first) { + ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT); + } + } + } + } + // Tracing the path from final point + res.clear(); + res.reserve(Max<size_t>(l1, l2)); + for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) { + res.push_back(ma[i][j].second); + switch (ma[i][j].second) { + case EMT_PRESERVE: + case EMT_REPLACE: + --i; + --j; + break; + case EMT_DELETE: + --i; + break; + case EMT_INSERT: + --j; + break; + default: + // TODO: throw exception + break; + } + } + std::reverse(res.begin(), res.end()); + + if (weight != nullptr) { + *weight = ma[l1][l2].first; + } + } + + template <class TStringType> + size_t Distance(const TStringType& str1, const TStringType& str2) { + TEditChain editChain; + GetEditChain(str1, str2, editChain); + size_t result = 0; + for (auto edit : editChain) { + if (IsImportantEditMove(edit)) + result++; + } + return result; + } + + /// Calculates substrings to be replaced for str1->str2 transformation + struct TReplacement { + int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength; + TReplacement() + : CorrectOffset(0) + , CorrectLength(0) + , MisspelledOffset(0) + , MisspelledLength(0) + { + } + TReplacement(int correctOffset, int correctLength, int misspelledOffset, int misspelledLength) + : CorrectOffset(correctOffset) + , CorrectLength(correctLength) + , MisspelledOffset(misspelledOffset) + , MisspelledLength(misspelledLength) + { + } + }; + + template <class TStringType> + void GetStringReplacements(const TStringType& str1, const TStringType& str2, TVector<TReplacement>& res) { + TEditChain editChain; + GetEditChain(str1, str2, editChain); + editChain.push_back(EMT_SPECIAL); + int c1 = 0, c2 = 0; + 
res.clear(); + for (TEditChain::const_iterator it = editChain.begin(); it != editChain.end(); it++) { + if (IsImportantEditMove(*it)) { + int sc1 = c1, sc2 = c2; + do { + MakeMove(*it, c1, c2); + ++it; + } while (IsImportantEditMove(*it)); + res.push_back(TReplacement(sc1, c1 - sc1, sc2, c2 - sc2)); + } + MakeMove(*it, c1, c2); + } + } +} diff --git a/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp new file mode 100644 index 0000000000..cf0f78637f --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp @@ -0,0 +1,190 @@ +#include "levenshtein_diff.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/string.h> + +namespace { + + float unaryZeroWeigher(const char&) { + return 0.0f; + }; + + float unaryMaxWeigher(const char&) { + return 1.0f; + }; + + float binaryZeroWeigher(const char&, const char&) { + return 0.0f; + }; + + float binaryMaxWeigher(const char&, const char&) { + return 1.0f; + }; + +} + +Y_UNIT_TEST_SUITE(Levenstein) { + Y_UNIT_TEST(Distance) { + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3); + UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2); + } +} + +Y_UNIT_TEST_SUITE(WeightedLevenstein) { + Y_UNIT_TEST(EqualStrings) { + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("12345"), TString("12345"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(EmptyStrings) { + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString(""), TString(""), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 0); + } + + 
Y_UNIT_TEST(InsertsOnly) { + auto unaryWeigher = [](const char&) { + return 2.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString(""), TString("12345"), chain, &distance, binaryZeroWeigher, unaryZeroWeigher, unaryWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 10.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(DeletionsOnly) { + auto unaryWeigher = [](const char&) { + return 3.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("54321"), TString(""), chain, &distance, binaryZeroWeigher, unaryWeigher, unaryZeroWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 15.0f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(SymmetryCheck) { + const TString str1 = "123x5"; + const TString str2 = "x2345"; + const float trgDistance = 2.0f; + const size_t trgChainLen = 5; + + NLevenshtein::TEditChain chainLeftRight; + float distanceLeftRight = 0.0f; + NLevenshtein::GetEditChain(str1, str2, chainLeftRight, &distanceLeftRight, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distanceLeftRight, trgDistance); + UNIT_ASSERT_VALUES_EQUAL(chainLeftRight.size(), trgChainLen); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[0]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[2]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE)); + + NLevenshtein::TEditChain chainRightLeft; + float distanceRightLeft = 0.0f; + NLevenshtein::GetEditChain(str2, str1, chainRightLeft, &distanceRightLeft, binaryMaxWeigher, unaryMaxWeigher, 
unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distanceRightLeft, trgDistance); + UNIT_ASSERT_VALUES_EQUAL(chainRightLeft.size(), trgChainLen); + UNIT_ASSERT(chainRightLeft == chainLeftRight); + } + + Y_UNIT_TEST(PreferReplacements) { + auto binaryWeigher = [](const char&, const char&) { + return 0.0625f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("54321"), TString("43210"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 0.3125f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + } + + Y_UNIT_TEST(PreferInsertDeletions) { + auto unaryWeigher = [](const char&) { + return 0.0625f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("54321"), TString("98765"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryWeigher); + UNIT_ASSERT_VALUES_EQUAL(distance, 0.5f); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 9); + } + + Y_UNIT_TEST(NoXDeletions) { + auto unaryWeigher = [](const char& c) { + return c == 'x' ? 100.0f : 1.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("543x1"), TString("5431"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE)); + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoXInsertions) { + auto unaryWeigher = [](const char& c) { + return c == 'x' ? 
100.0f : 1.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("5431"), TString("543x1"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_INSERT)); + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoReplacementsOfX) { + auto binaryWeigher = [](const char& l, const char&) { + return l == 'x' ? 100.0f : 1.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("5432x"), TString("5432y"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_INSERT)); + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(NoReplacementsForX) { + auto binaryWeigher = [](const char&, const char& r) { + return r == 'x' ? 
100.0f : 1.0f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("y4321"), TString("x4321"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_DELETE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_INSERT)); + UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f); + } + + Y_UNIT_TEST(SimilarOperationPriorities) { + auto replaceWeigher = [](const char&, const char&) { + return 0.5f; + }; + auto deleteWeigher = [](const char&) { + return 0.2f; + }; + auto insertWeigher = [](const char&) { + return 0.9f; + }; + NLevenshtein::TEditChain chain; + float distance = 0.0f; + NLevenshtein::GetEditChain(TString("y0"), TString("0x"), chain, &distance, replaceWeigher, deleteWeigher, insertWeigher); + UNIT_ASSERT_VALUES_EQUAL(chain.size(), 2); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_REPLACE)); + UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f); + } +} diff --git a/library/cpp/string_utils/levenshtein_diff/ut/ya.make b/library/cpp/string_utils/levenshtein_diff/ut/ya.make new file mode 100644 index 0000000000..a3b9b8fea5 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/string_utils/levenshtein_diff) + +OWNER(myltsev) + +SRCS( + levenshtein_diff_ut.cpp +) + +END() diff --git a/library/cpp/string_utils/levenshtein_diff/ya.make b/library/cpp/string_utils/levenshtein_diff/ya.make new file mode 100644 index 0000000000..bafefe5365 --- /dev/null +++ b/library/cpp/string_utils/levenshtein_diff/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +OWNER(g:mt) + +SRCS( + levenshtein_diff.cpp +) + +PEERDIR( + util/draft +) + +END() diff --git 
a/library/cpp/string_utils/parse_size/parse_size.cpp b/library/cpp/string_utils/parse_size/parse_size.cpp new file mode 100644 index 0000000000..39188d560b --- /dev/null +++ b/library/cpp/string_utils/parse_size/parse_size.cpp @@ -0,0 +1,95 @@ +#include "parse_size.h" + +#include <util/generic/yexception.h> +#include <util/generic/ylimits.h> +#include <util/string/cast.h> +#include <util/stream/output.h> + +namespace { + enum ESuffixShifts { + ESS_KILO_BYTES = 10, + ESS_MEGA_BYTES = 20, + ESS_GIGA_BYTES = 30, + ESS_TERA_BYTES = 40, + }; + + bool TryShiftValue(ui64& value, ui64 shift) { + if (value > (Max<ui64>() >> shift)) { + return false; + } + + value <<= shift; + return true; + } + + ui64 ShiftValue(ui64 value, ui64 shift) { + if (!TryShiftValue(value, shift)) { + ythrow yexception() << "value overflow '" << value << " << " << shift << "'"; + } else { + return value; + } + } + +} + +namespace NSize { + ui64 ParseSize(TStringBuf str) { + if (! str.size()) + ythrow yexception() << "Wrong size " << str; + char suff = tolower(str[str.size() - 1]); + if (isdigit(suff)) + return FromString<ui64>(str); + ui64 shift = 1; + switch (suff) { + case 'k': + shift = ESS_KILO_BYTES; + break; + case 'm': + shift = ESS_MEGA_BYTES; + break; + case 'g': + shift = ESS_GIGA_BYTES; + break; + case 't': + shift = ESS_TERA_BYTES; + break; + default: + ythrow yexception() << "Unknown suffix " << str; + } + + ui64 value = FromString<ui64>(str.substr(0, str.size() - 1)); + + if (!TryShiftValue(value, shift)) { + ythrow yexception() << "Value overflow " << str; + } else { + return value; + } + } + + TSize FromKiloBytes(ui64 value) { + return TSize(ShiftValue(value, ESS_KILO_BYTES)); + } + + TSize FromMegaBytes(ui64 value) { + return TSize(ShiftValue(value, ESS_MEGA_BYTES)); + } + + TSize FromGigaBytes(ui64 value) { + return TSize(ShiftValue(value, ESS_GIGA_BYTES)); + } + + TSize FromTeraBytes(ui64 value) { + return TSize(ShiftValue(value, ESS_TERA_BYTES)); + } + +} + +template <> 
+NSize::TSize FromStringImpl<NSize::TSize>(const char* data, size_t len) { + return NSize::TSize(NSize::ParseSize(TStringBuf(data, len))); +} + +template <> +void Out<NSize::TSize>(IOutputStream& os, const NSize::TSize& size) { + os << size.GetValue(); +} diff --git a/library/cpp/string_utils/parse_size/parse_size.h b/library/cpp/string_utils/parse_size/parse_size.h new file mode 100644 index 0000000000..ad235ef02f --- /dev/null +++ b/library/cpp/string_utils/parse_size/parse_size.h @@ -0,0 +1,33 @@ +#pragma once + +#include <util/generic/strbuf.h> + +namespace NSize { + ui64 ParseSize(TStringBuf size); + + // Convenient disk size representation with string parsing and integer comparison + class TSize { + public: + TSize(ui64 value = 0) + : Value(value) + { + } + + ui64 GetValue() const { + return Value; + } + + operator ui64() const { + return Value; + } + + private: + ui64 Value; + }; + + TSize FromKiloBytes(ui64 value); + TSize FromMegaBytes(ui64 value); + TSize FromGigaBytes(ui64 value); + TSize FromTeraBytes(ui64 value); + +} diff --git a/library/cpp/string_utils/parse_size/parse_size_ut.cpp b/library/cpp/string_utils/parse_size/parse_size_ut.cpp new file mode 100644 index 0000000000..8fff4f56b2 --- /dev/null +++ b/library/cpp/string_utils/parse_size/parse_size_ut.cpp @@ -0,0 +1,63 @@ +#include "parse_size.h" + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NSize; + +class TParseSizeTest: public TTestBase { + UNIT_TEST_SUITE(TParseSizeTest); + + UNIT_TEST(TestPlain); + UNIT_TEST(TestKiloBytes); + UNIT_TEST(TestMegaBytes); + UNIT_TEST(TestGigaBytes); + UNIT_TEST(TestTeraBytes); + UNIT_TEST(TestOverflow); + UNIT_TEST(TestStaticCreators); + UNIT_TEST(TestToString); + + UNIT_TEST_SUITE_END(); + +private: + void TestPlain() { + UNIT_ASSERT(ParseSize("1024") == 1024); + } + + void TestKiloBytes() { + UNIT_ASSERT(ParseSize("10K") == 1024 * 10); + UNIT_ASSERT(ParseSize("10k") == 1024 * 10); + } + + void TestMegaBytes() { + 
UNIT_ASSERT(ParseSize("10M") == 1024 * 1024 * 10); + UNIT_ASSERT(ParseSize("10m") == 1024 * 1024 * 10); + } + + void TestGigaBytes() { + UNIT_ASSERT(ParseSize("10G") == 1024ul * 1024ul * 1024ul * 10ul); + UNIT_ASSERT(ParseSize("10g") == 1024ul * 1024ul * 1024ul * 10ul); + } + + void TestTeraBytes() { + UNIT_ASSERT(ParseSize("10T") == 1024ul * 1024ul * 1024ul * 1024ul * 10ul); + UNIT_ASSERT(ParseSize("10t") == 1024ul * 1024ul * 1024ul * 1024ul * 10ul); + } + + void TestStaticCreators() { + UNIT_ASSERT_EQUAL(FromKiloBytes(10), 1024ul * 10ul); + UNIT_ASSERT_EQUAL(FromMegaBytes(10), 1024ul * 1024ul * 10ul); + UNIT_ASSERT_EQUAL(FromGigaBytes(10), 1024ul * 1024ul * 1024ul * 10ul); + UNIT_ASSERT_EQUAL(FromTeraBytes(10), 1024ul * 1024ul * 1024ul * 1024ul * 10ul); + } + + void TestOverflow() { + UNIT_ASSERT_EXCEPTION(ParseSize("20000000000G"), yexception); + UNIT_ASSERT_EXCEPTION(FromGigaBytes(20000000000ull), yexception); + } + + void TestToString() { + UNIT_ASSERT_VALUES_EQUAL(ToString(FromKiloBytes(1)), TString("1024")); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TParseSizeTest); diff --git a/library/cpp/string_utils/parse_size/ut/ya.make b/library/cpp/string_utils/parse_size/ut/ya.make new file mode 100644 index 0000000000..da19cf025b --- /dev/null +++ b/library/cpp/string_utils/parse_size/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/string_utils/parse_size) + +OWNER(g:images-robot) + +SRCS( + parse_size_ut.cpp +) + +END() diff --git a/library/cpp/string_utils/parse_size/ya.make b/library/cpp/string_utils/parse_size/ya.make new file mode 100644 index 0000000000..4a62abcac2 --- /dev/null +++ b/library/cpp/string_utils/parse_size/ya.make @@ -0,0 +1,10 @@ +LIBRARY() + +OWNER(g:images-robot) + +SRCS( + parse_size.cpp + parse_size.h +) + +END() diff --git a/library/cpp/string_utils/quote/quote.cpp b/library/cpp/string_utils/quote/quote.cpp new file mode 100644 index 0000000000..e523350b80 --- /dev/null +++ b/library/cpp/string_utils/quote/quote.cpp @@ -0,0 +1,311 @@ 
+#include "quote.h" + +#include <util/memory/tempbuf.h> +#include <util/string/ascii.h> +#include <util/string/cstriter.h> + +#include <cctype> + +/* note: (x & 0xdf) makes x upper case */ +#define GETXC \ + do { \ + c *= 16; \ + c += (x[0] >= 'A' ? ((x[0] & 0xdf) - 'A') + 10 : (x[0] - '0')); \ + ++x; \ + } while (0) + +#define GETSBXC \ + do { \ + c *= 16; \ + c += (x[0] >= 'A' ? ((x[0] & 0xdf) - 'A') + 10 : (x[0] - '0')); \ + x.Skip(1); \ + } while (0) + + +namespace { + class TFromHexZeroTerm { + public: + static inline char x2c(const char*& x) { + if (!IsAsciiHex((ui8)x[0]) || !IsAsciiHex((ui8)x[1])) + return '%'; + ui8 c = 0; + + GETXC; + GETXC; + return c; + } + + static inline char x2c(TStringBuf& x) { + if (!IsAsciiHex((ui8)x[0]) || !IsAsciiHex((ui8)x[1])) + return '%'; + ui8 c = 0; + + GETSBXC; + GETSBXC; + return c; + } + }; + + class TFromHexLenLimited { + public: + TFromHexLenLimited(const char* end) + : End(end) + { + } + + inline char x2c(const char*& x) { + if (x + 2 > End) + return '%'; + return TFromHexZeroTerm::x2c(x); + } + + private: + const char* End; + }; +} + +static inline char d2x(unsigned x) { + return (char)((x < 10) ? ('0' + x) : ('A' + x - 10)); +} + +static inline const char* FixZero(const char* s) noexcept { + return s ? s : ""; +} + +// we escape: +// '\"', '|', '(', ')', +// '%', '&', '+', ',', +// '#', '<', '=', '>', +// '[', '\\',']', '?', +// ':', '{', '}', +// all below ' ' (0x20) and above '~' (0x7E). 
+// ' ' converted to '+' +static const bool chars_to_url_escape[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //1 + 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, //2 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, //3 + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //4 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, //5 + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //6 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, //7 + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //B + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, //F +}; + +template <class It1, class It2, class It3> +static inline It1 Escape(It1 to, It2 from, It3 end, const bool* escape_map = chars_to_url_escape) { + while (from != end) { + if (escape_map[(unsigned char)*from]) { + *to++ = '%'; + *to++ = d2x((unsigned char)*from >> 4); + *to++ = d2x((unsigned char)*from & 0xF); + } else { + *to++ = (*from == ' ' ? '+' : *from); + } + + ++from; + } + + *to = 0; + + return to; +} + +template <class It1, class It2, class It3, class FromHex> +static inline It1 Unescape(It1 to, It2 from, It3 end, FromHex fromHex) { + (void)fromHex; + + while (from != end) { + switch (*from) { + case '%': + ++from; + *to++ = fromHex.x2c(from); + break; + case '+': + *to++ = ' '; + ++from; + break; + default: + *to++ = *from++; + } + } + *to = 0; + return to; +} + +// CGIEscape returns pointer to the end of the result string +// so as it could be possible to populate single long buffer +// with several calls to CGIEscape in a row. 
+char* CGIEscape(char* to, const char* from) { + return Escape(to, FixZero(from), TCStringEndIterator()); +} + +char* CGIEscape(char* to, const char* from, size_t len) { + return Escape(to, from, from + len); +} + +void CGIEscape(TString& url) { + TTempBuf tempBuf(CgiEscapeBufLen(url.size())); + char* to = tempBuf.Data(); + + url.AssignNoAlias(to, CGIEscape(to, url.data(), url.size())); +} + +TString CGIEscapeRet(const TStringBuf url) { + TString to; + to.ReserveAndResize(CgiEscapeBufLen(url.size())); + to.resize(CGIEscape(to.begin(), url.data(), url.size()) - to.data()); + return to; +} + +TString& AppendCgiEscaped(const TStringBuf value, TString& to) { + const size_t origLength = to.length(); + to.ReserveAndResize(origLength + CgiEscapeBufLen(value.size())); + to.resize(CGIEscape(to.begin() + origLength, value.data(), value.size()) - to.data()); + return to; +} + +// More general version of CGIEscape. The optional safe parameter specifies +// additional characters that should not be quoted — its default value is '/'. + +// Also returns pointer to the end of result string. 
+ +template <class It1, class It2, class It3> +static inline It1 Quote(It1 to, It2 from, It3 end, const char* safe) { + bool escape_map[256]; + memcpy(escape_map, chars_to_url_escape, 256); + // RFC 3986 Uniform Resource Identifiers (URI): Generic Syntax + // lists following reserved characters: + const char* reserved = ":/?#[]@!$&\'()*+,;="; + for (const char* p = reserved; *p; ++p) { + escape_map[(unsigned char)*p] = 1; + } + // characters we think are safe at the moment + for (const char* p = safe; *p; ++p) { + escape_map[(unsigned char)*p] = 0; + } + + return Escape(to, from, end, escape_map); +} + +char* Quote(char* to, const char* from, const char* safe) { + return Quote(to, FixZero(from), TCStringEndIterator(), safe); +} + +char* Quote(char* to, const TStringBuf s, const char* safe) { + return Quote(to, s.data(), s.data() + s.size(), safe); +} + +void Quote(TString& url, const char* safe) { + TTempBuf tempBuf(CgiEscapeBufLen(url.size())); + char* to = tempBuf.Data(); + + url.AssignNoAlias(to, Quote(to, url, safe)); +} + +char* CGIUnescape(char* to, const char* from) { + return Unescape(to, FixZero(from), TCStringEndIterator(), TFromHexZeroTerm()); +} + +char* CGIUnescape(char* to, const char* from, size_t len) { + return Unescape(to, from, from + len, TFromHexLenLimited(from + len)); +} + +void CGIUnescape(TString& url) { + if (url.empty()) { + return; + } + if (url.IsDetached()) { // in-place when refcount == 1 + char* resBegin = url.begin(); + const char* resEnd = CGIUnescape(resBegin, resBegin, url.size()); + url.resize(resEnd - resBegin); + } else { + url = CGIUnescapeRet(url); + } +} + +TString CGIUnescapeRet(const TStringBuf from) { + TString to; + to.ReserveAndResize(CgiUnescapeBufLen(from.size())); + to.resize(CGIUnescape(to.begin(), from.data(), from.size()) - to.data()); + return to; +} + +char* UrlUnescape(char* to, TStringBuf from) { + while (!from.empty()) { + char ch = from[0]; + from.Skip(1); + if ('%' == ch && 2 <= from.length()) + ch = 
TFromHexZeroTerm::x2c(from); + *to++ = ch; + } + + *to = 0; + + return to; +} + +void UrlUnescape(TString& url) { + if (url.empty()) { + return; + } + if (url.IsDetached()) { // in-place when refcount == 1 + char* resBegin = url.begin(); + const char* resEnd = UrlUnescape(resBegin, url); + url.resize(resEnd - resBegin); + } else { + url = UrlUnescapeRet(url); + } +} + +TString UrlUnescapeRet(const TStringBuf from) { + TString to; + to.ReserveAndResize(CgiUnescapeBufLen(from.size())); + to.resize(UrlUnescape(to.begin(), from) - to.data()); + return to; +} + +char* UrlEscape(char* to, const char* from, bool forceEscape) { + from = FixZero(from); + + while (*from) { + const bool escapePercent = (*from == '%') && + (forceEscape || !((*(from + 1) && IsAsciiHex(*(from + 1)) && *(from + 2) && IsAsciiHex(*(from + 2))))); + + if (escapePercent || (unsigned char)*from <= ' ' || (unsigned char)*from > '~') { + *to++ = '%'; + *to++ = d2x((unsigned char)*from >> 4); + *to++ = d2x((unsigned char)*from & 0xF); + } else + *to++ = *from; + ++from; + } + + *to = 0; + + return to; +} + +void UrlEscape(TString& url, bool forceEscape) { + TTempBuf tempBuf(CgiEscapeBufLen(url.size())); + char* to = tempBuf.Data(); + url.AssignNoAlias(to, UrlEscape(to, url.data(), forceEscape)); +} + +TString UrlEscapeRet(const TStringBuf from, bool forceEscape) { + TString to; + to.ReserveAndResize(CgiEscapeBufLen(from.size())); + to.resize(UrlEscape(to.begin(), from.begin(), forceEscape) - to.data()); + return to; +} diff --git a/library/cpp/string_utils/quote/quote.h b/library/cpp/string_utils/quote/quote.h new file mode 100644 index 0000000000..3b7221154e --- /dev/null +++ b/library/cpp/string_utils/quote/quote.h @@ -0,0 +1,72 @@ +#pragma once + +#include <util/generic/strbuf.h> +#include <util/generic/string.h> + +//CGIEscape*: +// ' ' converted to '+', +// Some punctuation and chars outside [32, 126] range are converted to %xx +// Use function CgiEscapeBufLen to determine number of characters 
needed for 'char* to' parameter. +// Returns pointer to the end of the result string +char* CGIEscape(char* to, const char* from); +char* CGIEscape(char* to, const char* from, size_t len); +inline char* CGIEscape(char* to, const TStringBuf from) { + return CGIEscape(to, from.data(), from.size()); +} +void CGIEscape(TString& url); +TString CGIEscapeRet(const TStringBuf url); +TString& AppendCgiEscaped(const TStringBuf value, TString& to); + +inline TStringBuf CgiEscapeBuf(char* to, const TStringBuf from) { + return TStringBuf(to, CGIEscape(to, from.data(), from.size())); +} +inline TStringBuf CgiEscape(void* tmp, const TStringBuf s) { + return CgiEscapeBuf(static_cast<char*>(tmp), s); +} + +//CgiUnescape*: +// Decodes '%xx' to bytes, '+' to space. +// Use function CgiUnescapeBufLen to determine number of characters needed for 'char* to' parameter. +// If pointer returned, then this is pointer to the end of the result string. +char* CGIUnescape(char* to, const char* from); +char* CGIUnescape(char* to, const char* from, size_t len); +void CGIUnescape(TString& url); +TString CGIUnescapeRet(const TStringBuf from); + +inline TStringBuf CgiUnescapeBuf(char* to, const TStringBuf from) { + return TStringBuf(to, CGIUnescape(to, from.data(), from.size())); +} +inline TStringBuf CgiUnescape(void* tmp, const TStringBuf s) { + return CgiUnescapeBuf(static_cast<char*>(tmp), s); +} + +//Quote: +// Is like CGIEscape, also skips encoding of user-supplied 'safe' characters. +char* Quote(char* to, const char* from, const char* safe = "/"); +char* Quote(char* to, const TStringBuf s, const char* safe = "/"); +void Quote(TString& url, const char* safe = "/"); + +//UrlEscape: +// Can't be used for cgi parameters ('&' character is not escaped)! +// escapes only '%' not followed by two hex-digits or if forceEscape set to ture, +// and chars outside [32, 126] range. +// Can't handle '\0'-chars in TString. 
+char* UrlEscape(char* to, const char* from, bool forceEscape = false); +void UrlEscape(TString& url, bool forceEscape = false); +TString UrlEscapeRet(const TStringBuf from, bool forceEscape = false); + +//UrlUnescape: +// '+' is NOT converted to space! +// %xx converted to bytes, other characters are copied unchanged. +char* UrlUnescape(char* to, TStringBuf from); +void UrlUnescape(TString& url); +TString UrlUnescapeRet(const TStringBuf from); + +//*BufLen: how much characters you should allocate for 'char* to' buffers. +constexpr size_t CgiEscapeBufLen(const size_t len) noexcept { + return 3 * len + 1; +} + +constexpr size_t CgiUnescapeBufLen(const size_t len) noexcept { + return len + 1; +} diff --git a/library/cpp/string_utils/quote/quote_ut.cpp b/library/cpp/string_utils/quote/quote_ut.cpp new file mode 100644 index 0000000000..6c552b279e --- /dev/null +++ b/library/cpp/string_utils/quote/quote_ut.cpp @@ -0,0 +1,319 @@ +#include "quote.h" + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(TCGIEscapeTest) { + Y_UNIT_TEST(ReturnsEndOfTo) { + char r[10]; + const char* returned = CGIEscape(r, "123"); + UNIT_ASSERT_VALUES_EQUAL(r + strlen("123"), returned); + UNIT_ASSERT_VALUES_EQUAL('\0', *returned); + } + + Y_UNIT_TEST(NotZeroTerminated) { + char r[] = {'1', '2', '3', '4'}; + char buf[sizeof(r) * 3 + 2]; + + TString ret(buf, CGIEscape(buf, r, sizeof(r))); + + UNIT_ASSERT_EQUAL(ret, "1234"); + } + + Y_UNIT_TEST(StringBuf) { + char tmp[100]; + + UNIT_ASSERT_VALUES_EQUAL(CgiEscape(tmp, "!@#$%^&*(){}[]\" "), TStringBuf("!@%23$%25^%26*%28%29%7B%7D%5B%5D%22+")); + } + + Y_UNIT_TEST(StrokaRet) { + UNIT_ASSERT_VALUES_EQUAL(CGIEscapeRet("!@#$%^&*(){}[]\" "), TString("!@%23$%25^%26*%28%29%7B%7D%5B%5D%22+")); + } + + Y_UNIT_TEST(StrokaAppendRet) { + TString param; + AppendCgiEscaped("!@#$%^&*(){}[]\" ", param); + UNIT_ASSERT_VALUES_EQUAL(param, TString("!@%23$%25^%26*%28%29%7B%7D%5B%5D%22+")); + + TString param2 = "¶m="; + 
AppendCgiEscaped("!@#$%^&*(){}[]\" ", param2); + UNIT_ASSERT_VALUES_EQUAL(param2, + TString("¶m=!@%23$%25^%26*%28%29%7B%7D%5B%5D%22+")); + + param2.append("¶m_param="); + AppendCgiEscaped("!@#$%^&*(){}[]\" ", param2); + UNIT_ASSERT_VALUES_EQUAL(param2, + TString("¶m=!@%23$%25^%26*%28%29%7B%7D%5B%5D%22+¶m_param=!@%23$%25^%26*%28%29%7B%7D%5B%5D%22+")); + } + +} + +Y_UNIT_TEST_SUITE(TCGIUnescapeTest) { + Y_UNIT_TEST(StringBuf) { + char tmp[100]; + + UNIT_ASSERT_VALUES_EQUAL(CgiUnescape(tmp, "!@%23$%25^%26*%28%29"), TStringBuf("!@#$%^&*()")); + } + + Y_UNIT_TEST(TestValidZeroTerm) { + char r[10]; + + CGIUnescape(r, "1234"); + UNIT_ASSERT_VALUES_EQUAL(r, "1234"); + + CGIUnescape(r, "%3d"); + UNIT_ASSERT_VALUES_EQUAL(r, "="); + + CGIUnescape(r, "12%3D34"); + UNIT_ASSERT_VALUES_EQUAL(r, "12=34"); + } + + Y_UNIT_TEST(TestInvalidZeroTerm) { + char r[10]; + + CGIUnescape(r, "%"); + UNIT_ASSERT_VALUES_EQUAL(r, "%"); + + CGIUnescape(r, "%3"); + UNIT_ASSERT_VALUES_EQUAL(r, "%3"); + + CGIUnescape(r, "%3g"); + UNIT_ASSERT_VALUES_EQUAL(r, "%3g"); + + CGIUnescape(r, "12%3g34"); + UNIT_ASSERT_VALUES_EQUAL(r, "12%3g34"); + + CGIUnescape(r, "%3u123"); + UNIT_ASSERT_VALUES_EQUAL(r, "%3u123"); + } + + Y_UNIT_TEST(TestValidNotZeroTerm) { + char r[10]; + + CGIUnescape(r, "123456789", 4); + UNIT_ASSERT_VALUES_EQUAL(r, "1234"); + + CGIUnescape(r, "%3d1234", 3); + UNIT_ASSERT_VALUES_EQUAL(r, "="); + + CGIUnescape(r, "12%3D345678", 7); + UNIT_ASSERT_VALUES_EQUAL(r, "12=34"); + } + + Y_UNIT_TEST(TestInvalidNotZeroTerm) { + char r[10]; + + CGIUnescape(r, "%3d", 1); + UNIT_ASSERT_VALUES_EQUAL(r, "%"); + + CGIUnescape(r, "%3d", 2); + UNIT_ASSERT_VALUES_EQUAL(r, "%3"); + + CGIUnescape(r, "%3g1234", 3); + UNIT_ASSERT_VALUES_EQUAL(r, "%3g"); + + CGIUnescape(r, "12%3g345678", 7); + UNIT_ASSERT_VALUES_EQUAL(r, "12%3g34"); + + CGIUnescape(r, "%3u1234", 2); + UNIT_ASSERT_VALUES_EQUAL(r, "%3"); + + CGIUnescape(r, "%3u1234", 3); + UNIT_ASSERT_VALUES_EQUAL(r, "%3u"); + + CGIUnescape(r, "%3u1234", 4); + 
UNIT_ASSERT_VALUES_EQUAL(r, "%3u1"); + } + + Y_UNIT_TEST(StrokaOutParameterInplace) { + TString s; + + s = "hello%3dworld"; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello=world"); + + s = "+%23+"; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, " # "); + + s = "hello%3u"; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello%3u"); + + s = "0123456789012345"; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "0123456789012345"); + + s = ""; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + } + + Y_UNIT_TEST(StrokaOutParameterNotInplace) { + TString s, sCopy; + + s = "hello%3dworld"; + sCopy = s; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello=world"); + + s = "+%23+"; + sCopy = s; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, " # "); + + s = "hello%3u"; + sCopy = s; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello%3u"); + + s = "0123456789012345"; + sCopy = s; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "0123456789012345"); + + s = ""; + sCopy = s; + CGIUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + } +} + +Y_UNIT_TEST_SUITE(TUrlEscapeTest) { + Y_UNIT_TEST(EscapeEscaped) { + TString s; + + s = "hello%3dworld"; + UNIT_ASSERT_VALUES_EQUAL(UrlEscapeRet(s), "hello%3dworld"); + UrlEscape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello%3dworld"); + } + + Y_UNIT_TEST(EscapeUnescape) { + TString s; + + s = "hello%3dworld"; + UrlEscape(s); + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello=world"); + } + + Y_UNIT_TEST(EscapeUnescapeRet) { + TString s; + + s = "hello%3dworld"; + UNIT_ASSERT_VALUES_EQUAL(UrlUnescapeRet(UrlEscapeRet(s)), "hello=world"); + } + + Y_UNIT_TEST(EscapeEscapedForce) { + TString s; + + s = "hello%3dworld"; + UNIT_ASSERT_VALUES_EQUAL(UrlEscapeRet(s, true), "hello%253dworld"); + UrlEscape(s, true); + UNIT_ASSERT_VALUES_EQUAL(s, "hello%253dworld"); + } + + Y_UNIT_TEST(EscapeUnescapeForce) { + TString s; + + s = "hello%3dworld"; + UrlEscape(s, true); + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello%3dworld"); + } + + 
Y_UNIT_TEST(EscapeUnescapeForceRet) { + TString s; + + s = "hello%3dworld"; + UNIT_ASSERT_VALUES_EQUAL(UrlUnescapeRet(UrlEscapeRet(s, true)), "hello%3dworld"); + } +} + +Y_UNIT_TEST_SUITE(TUrlUnescapeTest) { + Y_UNIT_TEST(StrokaOutParameterInplace) { + TString s; + + s = "hello%3dworld"; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello=world"); + + s = "+%23+"; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "+#+"); + + s = "hello%3u"; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello%3u"); + + s = "0123456789012345"; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "0123456789012345"); + + s = ""; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + } + + Y_UNIT_TEST(StrokaOutParameterNotInplace) { + TString s, sCopy; + + s = "hello%3dworld"; + sCopy = s; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello=world"); + + s = "+%23+"; + sCopy = s; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "+#+"); + + s = "hello%3u"; + sCopy = s; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "hello%3u"); + + s = "0123456789012345"; + sCopy = s; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, "0123456789012345"); + + s = ""; + sCopy = s; + UrlUnescape(s); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + } +} + +Y_UNIT_TEST_SUITE(TQuoteTest) { + Y_UNIT_TEST(ReturnsEndOfTo) { + char r[10]; + const char* returned = Quote(r, "123"); + UNIT_ASSERT_VALUES_EQUAL(r + strlen("123"), returned); + UNIT_ASSERT_VALUES_EQUAL('\0', *returned); + } + + Y_UNIT_TEST(SlashIsSafeByDefault) { + char r[100]; + Quote(r, "/path;tail/path,tail/"); + UNIT_ASSERT_VALUES_EQUAL("/path%3Btail/path%2Ctail/", r); + TString s("/path;tail/path,tail/"); + Quote(s); + UNIT_ASSERT_VALUES_EQUAL("/path%3Btail/path%2Ctail/", s.c_str()); + } + + Y_UNIT_TEST(SafeColons) { + char r[100]; + Quote(r, "/path;tail/path,tail/", ";,"); + UNIT_ASSERT_VALUES_EQUAL("%2Fpath;tail%2Fpath,tail%2F", r); + TString s("/path;tail/path,tail/"); + Quote(s, ";,"); + UNIT_ASSERT_VALUES_EQUAL("%2Fpath;tail%2Fpath,tail%2F", s.c_str()); + 
} + + Y_UNIT_TEST(StringBuf) { + char r[100]; + char* end = Quote(r, "abc\0/path", ""); + UNIT_ASSERT_VALUES_EQUAL("abc\0%2Fpath", TStringBuf(r, end)); + } +} diff --git a/library/cpp/string_utils/quote/ut/ya.make b/library/cpp/string_utils/quote/ut/ya.make new file mode 100644 index 0000000000..eca955144f --- /dev/null +++ b/library/cpp/string_utils/quote/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/string_utils/quote) + +OWNER(vladon) + +SRCS( + quote_ut.cpp +) + +END() diff --git a/library/cpp/string_utils/quote/ya.make b/library/cpp/string_utils/quote/ya.make new file mode 100644 index 0000000000..55bb3cf939 --- /dev/null +++ b/library/cpp/string_utils/quote/ya.make @@ -0,0 +1,10 @@ +LIBRARY() + +OWNER(g:util) + +SRCS( + quote.cpp + quote.h +) + +END() diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp new file mode 100644 index 0000000000..ac624dca85 --- /dev/null +++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.cpp @@ -0,0 +1 @@ +#include "relaxed_escaper.h" diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h new file mode 100644 index 0000000000..d7ea7c1259 --- /dev/null +++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h @@ -0,0 +1,208 @@ +#pragma once + +#include <util/stream/output.h> +#include <util/string/escape.h> +#include <util/memory/tempbuf.h> +#include <util/generic/strbuf.h> + +namespace NEscJ { + // almost copypaste from util/string/escape.h + // todo: move there (note difference in IsPrintable and handling of string) + + inline char HexDigit(char value) { + if (value < 10) + return '0' + value; + else + return 'A' + value - 10; + } + + inline char OctDigit(char value) { + return '0' + value; + } + + inline bool IsUTF8(ui8 c) { + return c < 0xf5 && c != 0xC0 && c != 0xC1; + } + + inline bool IsControl(ui8 c) { + return c < 0x20 || c == 0x7f; + } + 
+ inline bool IsPrintable(ui8 c) { + return IsUTF8(c) && !IsControl(c); + } + + inline bool IsHexDigit(ui8 c) { + return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); + } + + inline bool IsOctDigit(ui8 c) { + return c >= '0' && c <= '7'; + } + + struct TEscapeUtil { + static const size_t ESCAPE_C_BUFFER_SIZE = 6; + + template <bool asunicode> + static inline size_t EscapeJ(ui8 c, ui8 next, char r[ESCAPE_C_BUFFER_SIZE], TStringBuf safe, TStringBuf unsafe) { + // (1) Printable characters go as-is, except backslash and double quote. + // (2) Characters \r, \n, \t and \0 ... \7 replaced by their simple escape characters (if possible). + // (3) Otherwise, character is encoded using hexadecimal escape sequence (if possible), or octal. + if (safe.find(c) != TStringBuf::npos) { + r[0] = c; + return 1; + } + if (c == '\"') { + r[0] = '\\'; + r[1] = '\"'; + return 2; + } else if (c == '\\') { + r[0] = '\\'; + r[1] = '\\'; + return 2; + } else if (IsPrintable(c) && unsafe.find(c) == TStringBuf::npos) { + r[0] = c; + return 1; + } else if (c == '\b') { + r[0] = '\\'; + r[1] = 'b'; + return 2; + } else if (c == '\f') { + r[0] = '\\'; + r[1] = 'f'; + return 2; + } else if (c == '\r') { + r[0] = '\\'; + r[1] = 'r'; + return 2; + } else if (c == '\n') { + r[0] = '\\'; + r[1] = 'n'; + return 2; + } else if (c == '\t') { + r[0] = '\\'; + r[1] = 't'; + return 2; + } else if (asunicode && IsUTF8(c)) { // utf8 controls escape for json + r[0] = '\\'; + r[1] = 'u'; + r[2] = '0'; + r[3] = '0'; + r[4] = HexDigit((c & 0xF0) >> 4); + r[5] = HexDigit((c & 0x0F) >> 0); + return 6; + } else if (c < 8 && !IsOctDigit(next)) { + r[0] = '\\'; + r[1] = OctDigit(c); + return 2; + } else if (!IsHexDigit(next)) { + r[0] = '\\'; + r[1] = 'x'; + r[2] = HexDigit((c & 0xF0) >> 4); + r[3] = HexDigit((c & 0x0F) >> 0); + return 4; + } else { + r[0] = '\\'; + r[1] = OctDigit((c & 0700) >> 6); + r[2] = OctDigit((c & 0070) >> 3); + r[3] = OctDigit((c & 0007) >> 0); + return 4; + } + 
} + + static inline size_t EscapeJ(ui8 c, ui8 next, char r[ESCAPE_C_BUFFER_SIZE], TStringBuf safe, TStringBuf unsafe) { + return EscapeJ<false>(c, next, r, safe, unsafe); + } + }; + + inline size_t SuggestBuffer(size_t len) { + return len * TEscapeUtil::ESCAPE_C_BUFFER_SIZE; + } + + template <bool tounicode> + inline size_t EscapeJ(const char* str, size_t len, char* out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) { + char* out0 = out; + char buffer[TEscapeUtil::ESCAPE_C_BUFFER_SIZE]; + + size_t i, j; + for (i = 0, j = 0; i < len; ++i) { + size_t rlen = TEscapeUtil::EscapeJ<tounicode>(str[i], (i + 1 < len ? str[i + 1] : 0), buffer, safe, unsafe); + + if (rlen > 1) { + strncpy(out, str + j, i - j); + out += i - j; + j = i + 1; + + strncpy(out, buffer, rlen); + out += rlen; + } + } + + if (j > 0) { + strncpy(out, str + j, len - j); + out += len - j; + } else { + strncpy(out, str, len); + out += len; + } + + return out - out0; + } + + template <bool quote, bool tounicode> + inline void EscapeJ(TStringBuf in, IOutputStream& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) { + TTempBuf b(SuggestBuffer(in.size()) + 2); + + if (quote) + b.Append("\"", 1); + + b.Proceed(EscapeJ<tounicode>(in.data(), in.size(), b.Current(), safe, unsafe)); + + if (quote) + b.Append("\"", 1); + + out.Write(b.Data(), b.Filled()); + } + + template <bool quote, bool tounicode> + inline void EscapeJ(TStringBuf in, TString& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) { + TTempBuf b(SuggestBuffer(in.size()) + 2); + + if (quote) + b.Append("\"", 1); + + b.Proceed(EscapeJ<tounicode>(in.data(), in.size(), b.Current(), safe, unsafe)); + + if (quote) + b.Append("\"", 1); + + out.append(b.Data(), b.Filled()); + } + + template <bool quote, bool tounicode> + inline TString EscapeJ(TStringBuf in, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) { + TString s; + EscapeJ<quote, tounicode>(in, s, safe, unsafe); + return s; 
+ } + + // If the template parameter "tounicode" is ommited, then use the default value false + inline size_t EscapeJ(const char* str, size_t len, char* out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) { + return EscapeJ<false>(str, len, out, safe, unsafe); + } + + template <bool quote> + inline void EscapeJ(TStringBuf in, IOutputStream& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) { + EscapeJ<quote, false>(in, out, safe, unsafe); + } + + template <bool quote> + inline void EscapeJ(TStringBuf in, TString& out, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) { + EscapeJ<quote, false>(in, out, safe, unsafe); + } + + template <bool quote> + inline TString EscapeJ(TStringBuf in, TStringBuf safe = TStringBuf(), TStringBuf unsafe = TStringBuf()) { + return EscapeJ<quote, false>(in, safe, unsafe); + } +} diff --git a/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp new file mode 100644 index 0000000000..768555ea3a --- /dev/null +++ b/library/cpp/string_utils/relaxed_escaper/relaxed_escaper_ut.cpp @@ -0,0 +1,66 @@ +#include "relaxed_escaper.h" + +#include <library/cpp/testing/unittest/registar.h> + +#define RESC_FIXED_STR(s) TStringBuf(s, sizeof(s) - 1) +static const TStringBuf CommonTestData[] = { + // Should be valid UTF-8. 
+ RESC_FIXED_STR("http://ya.ru/"), RESC_FIXED_STR("http://ya.ru/"), + RESC_FIXED_STR("http://ya.ru/\\x17\\n"), RESC_FIXED_STR("http://ya.ru/\x17\n"), + + RESC_FIXED_STR("http://ya.ru/\\0"), RESC_FIXED_STR("http://ya.ru/\0"), + RESC_FIXED_STR("http://ya.ru/\\0\\0"), RESC_FIXED_STR("http://ya.ru/\0\0"), + RESC_FIXED_STR("http://ya.ru/\\0\\0000"), RESC_FIXED_STR("http://ya.ru/\0\0" + "0"), + RESC_FIXED_STR("http://ya.ru/\\0\\0001"), RESC_FIXED_STR("http://ya.ru/\0\x00" + "1"), + + RESC_FIXED_STR("\\2\\4\\00678"), RESC_FIXED_STR("\2\4\6" + "78"), + RESC_FIXED_STR("\\2\\4\\689"), RESC_FIXED_STR("\2\4\689"), + + RESC_FIXED_STR("\\\"Hello\\\", Alice said."), RESC_FIXED_STR("\"Hello\", Alice said."), + RESC_FIXED_STR("Slash\\\\dash!"), RESC_FIXED_STR("Slash\\dash!"), + RESC_FIXED_STR("There\\nare\\r\\nnewlines."), RESC_FIXED_STR("There\nare\r\nnewlines."), + RESC_FIXED_STR("There\\tare\\ttabs."), RESC_FIXED_STR("There\tare\ttabs.")}; +#undef RESC_FIXED_STR + +Y_UNIT_TEST_SUITE(TRelaxedEscaperTest) { + Y_UNIT_TEST(TestEscaper) { + using namespace NEscJ; + for (size_t i = 0; i < Y_ARRAY_SIZE(CommonTestData); i += 2) { + TString expected(CommonTestData[i].data(), CommonTestData[i].size()); + TString source(CommonTestData[i + 1].data(), CommonTestData[i + 1].size()); + TString actual(EscapeJ<false>(source)); + TString actual2(UnescapeC(expected)); + + UNIT_ASSERT_VALUES_EQUAL(expected, actual); + UNIT_ASSERT_VALUES_EQUAL(source, actual2); + } + + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\x17\\n\xAB", EscapeJ<false>("http://ya.ru/\x17\n\xab")); + TString s = EscapeJ<false, true>("http://ya.ru/\x17\n\xab\xff"); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\u0017\\n\xAB\\xFF", s); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\\x17\n\xAB", EscapeJ<false>("http://ya.ru/\x17\n\xab", "\n")); + UNIT_ASSERT_VALUES_EQUAL("http:\\x2F\\x2Fya.ru\\x2F\\x17\n\xAB'", EscapeJ<false>("http://ya.ru/\x17\n\xab'", "\n'", "/")); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\x17\n\xab", 
UnescapeC("http:\\x2F\\x2Fya.ru\\x2F\\x17\n\xAB")); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru/\x17\n\xab", UnescapeC("http://ya.ru/\\x17\\n\xAB")); + UNIT_ASSERT_VALUES_EQUAL("h", EscapeJ<false>("h")); + UNIT_ASSERT_VALUES_EQUAL("\"h\"", EscapeJ<true>("h")); + UNIT_ASSERT_VALUES_EQUAL("h", UnescapeC("h")); + UNIT_ASSERT_VALUES_EQUAL("\\xFF", EscapeJ<false>("\xFF")); + UNIT_ASSERT_VALUES_EQUAL("\"\\xFF\"", EscapeJ<true>("\xFF")); + UNIT_ASSERT_VALUES_EQUAL("\xFF", UnescapeC("\\xFF")); + + UNIT_ASSERT_VALUES_EQUAL("\\377f", EscapeJ<false>("\xff" + "f")); + UNIT_ASSERT_VALUES_EQUAL("\xff" + "f", + UnescapeC("\\377f")); + UNIT_ASSERT_VALUES_EQUAL("\\xFFg", EscapeJ<false>("\xff" + "g")); + UNIT_ASSERT_VALUES_EQUAL("\xff" + "g", + UnescapeC("\\xFFg")); + } +} diff --git a/library/cpp/string_utils/relaxed_escaper/ut/ya.make b/library/cpp/string_utils/relaxed_escaper/ut/ya.make new file mode 100644 index 0000000000..7ebd393c48 --- /dev/null +++ b/library/cpp/string_utils/relaxed_escaper/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/string_utils/relaxed_escaper) + +OWNER(velavokr) + +SRCS( + relaxed_escaper_ut.cpp +) + +END() diff --git a/library/cpp/string_utils/relaxed_escaper/ya.make b/library/cpp/string_utils/relaxed_escaper/ya.make new file mode 100644 index 0000000000..3f0fa5bc07 --- /dev/null +++ b/library/cpp/string_utils/relaxed_escaper/ya.make @@ -0,0 +1,9 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( + relaxed_escaper.cpp +) + +END() diff --git a/library/cpp/string_utils/scan/scan.cpp b/library/cpp/string_utils/scan/scan.cpp new file mode 100644 index 0000000000..fbc1fdf08f --- /dev/null +++ b/library/cpp/string_utils/scan/scan.cpp @@ -0,0 +1 @@ +#include "scan.h" diff --git a/library/cpp/string_utils/scan/scan.h b/library/cpp/string_utils/scan/scan.h new file mode 100644 index 0000000000..703db54321 --- /dev/null +++ b/library/cpp/string_utils/scan/scan.h @@ -0,0 +1,22 @@ +#pragma once + +#include <util/generic/strbuf.h> + +template <bool addAll, char sep, 
char sepKeyVal, class F> +static inline void ScanKeyValue(TStringBuf s, F&& f) { + TStringBuf key, val; + + while (!s.empty()) { + val = s.NextTok(sep); + + if (val.empty()) { + continue; // && case + } + + key = val.NextTok(sepKeyVal); + + if (addAll || val.IsInited()) { + f(key, val); // includes empty keys + } + } +} diff --git a/library/cpp/string_utils/scan/ya.make b/library/cpp/string_utils/scan/ya.make new file mode 100644 index 0000000000..2faae86b09 --- /dev/null +++ b/library/cpp/string_utils/scan/ya.make @@ -0,0 +1,11 @@ +OWNER( + g:util +) + +LIBRARY() + +SRCS( + scan.cpp +) + +END() diff --git a/library/cpp/string_utils/url/url.cpp b/library/cpp/string_utils/url/url.cpp new file mode 100644 index 0000000000..85f4ac5d69 --- /dev/null +++ b/library/cpp/string_utils/url/url.cpp @@ -0,0 +1,421 @@ +#include "url.h" + +#include <util/string/cast.h> +#include <util/string/util.h> +#include <util/string/cstriter.h> +#include <util/string/ascii.h> +#include <util/string/strip.h> + +#include <util/charset/unidata.h> // for ToLower +#include <util/system/defaults.h> +#include <util/generic/algorithm.h> +#include <util/generic/hash_set.h> +#include <util/generic/yexception.h> +#include <util/generic/singleton.h> + +#include <cstdlib> + +namespace { + struct TUncheckedSize { + static bool Has(size_t) { + return true; + } + }; + + struct TKnownSize { + size_t MySize; + explicit TKnownSize(size_t sz) + : MySize(sz) + { + } + bool Has(size_t sz) const { + return sz <= MySize; + } + }; + + template <typename TChar1, typename TChar2> + int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) { + for (size_t i = 0; i < n; ++i) { + if ((TChar1)ToLower(s1[i]) != s2[i]) + return (TChar1)ToLower(s1[i]) < s2[i] ? 
-1 : 1; + } + return 0; + } + + template <typename TChar, typename TBounds> + inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) { + const TChar httpPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0}; + const TChar httpsPrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0}; + if (urlSize.Has(7) && Compare1Case2(url, httpPrefix, 7) == 0) + return 7; + if (!ignorehttps && urlSize.Has(8) && Compare1Case2(url, httpsPrefix, 8) == 0) + return 8; + return 0; + } + + template <typename T> + inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) { + size_t prefixSize = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), TKnownSize(url.size()), ignorehttps); + if (prefixSize) + return url.substr(prefixSize); + return url; + } +} + +namespace NUrl { + + TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) { + TStringBuf host = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false); + TStringBuf path = url; + path.SkipPrefix(host); + return {host, path}; + } + +} // namespace NUrl + +size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept { + return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps); +} + +size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept { + return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps); +} + +size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept { + return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps); +} + +size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept { + return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps); +} + +TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept { + return CutHttpPrefixImpl(url, ignorehttps); +} + +TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept { + return CutHttpPrefixImpl(url, ignorehttps); +} + +size_t 
GetSchemePrefixSize(const TStringBuf url) noexcept { + struct TDelim: public str_spn { + inline TDelim() + : str_spn("!-/:-@[-`{|}", true) + { + } + }; + + const auto& delim = *Singleton<TDelim>(); + const char* n = delim.brk(url.data(), url.end()); + + if (n + 2 >= url.end() || *n != ':' || n[1] != '/' || n[2] != '/') { + return 0; + } + + return n + 3 - url.begin(); +} + +TStringBuf GetSchemePrefix(const TStringBuf url) noexcept { + return url.Head(GetSchemePrefixSize(url)); +} + +TStringBuf CutSchemePrefix(const TStringBuf url) noexcept { + return url.Tail(GetSchemePrefixSize(url)); +} + +template <bool KeepPort> +static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) { + TStringBuf urlNoScheme = url; + + urlNoScheme.Skip(GetHttpPrefixSize(url)); + + struct TDelim: public str_spn { + inline TDelim() + : str_spn(KeepPort ? "/;?#" : "/:;?#") + { + } + }; + + const auto& nonHostCharacters = *Singleton<TDelim>(); + const char* firstNonHostCharacter = nonHostCharacters.brk(urlNoScheme.begin(), urlNoScheme.end()); + + if (firstNonHostCharacter != urlNoScheme.end()) { + return urlNoScheme.substr(0, firstNonHostCharacter - urlNoScheme.data()); + } + + return urlNoScheme; +} + +TStringBuf GetHost(const TStringBuf url) noexcept { + return GetHostAndPortImpl<false>(url); +} + +TStringBuf GetHostAndPort(const TStringBuf url) noexcept { + return GetHostAndPortImpl<true>(url); +} + +TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept { + const size_t schemeSize = GetSchemePrefixSize(url); + const TStringBuf scheme = url.Head(schemeSize); + + const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://")); + + TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize)); + + if (trimDefaultPort) { + const size_t pos = hostAndPort.find(':'); + if (pos != TStringBuf::npos) { + const bool isHttps = (scheme == TStringBuf("https://")); + + const TStringBuf port = hostAndPort.Tail(pos + 1); + if ((isHttp && port == 
TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) { + // trimming default port + hostAndPort = hostAndPort.Head(pos); + } + } + } + + if (isHttp && trimHttp) { + return hostAndPort; + } else { + return TStringBuf(scheme.begin(), hostAndPort.end()); + } +} + +void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) { + auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url); + host = hostBuf; + path = pathBuf; +} + +void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) { + auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url); + host = hostBuf; + path = pathBuf; +} + +void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) { + TStringBuf urlWithoutFragment; + if (!url.TrySplit('#', urlWithoutFragment, fragment)) { + fragment = ""; + urlWithoutFragment = url; + } + if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) { + query = ""; + sanitizedUrl = urlWithoutFragment; + } +} + +bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) { + const size_t schemeSize = GetSchemePrefixSize(url); + if (schemeSize != 0) { + scheme = url.Head(schemeSize); + } + + TStringBuf portStr; + TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize)); + if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) { + // URL has port + if (!TryFromString(portStr, port)) { + return false; + } + } else { + host = hostAndPort; + if (scheme == TStringBuf("https://")) { + port = 443; + } else if (scheme == TStringBuf("http://")) { + port = 80; + } + } + return true; +} + +void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) { + bool isOk = TryGetSchemeHostAndPort(url, scheme, host, port); + Y_ENSURE(isOk, "cannot parse port number from URL: " << url); +} + +TStringBuf GetOnlyHost(const TStringBuf url) noexcept { + return 
GetHost(CutSchemePrefix(url)); +} + +TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept { + const size_t off = url.find('/', GetHttpPrefixSize(url)); + TStringBuf hostUnused, path; + if (!url.TrySplitAt(off, hostUnused, path)) + return "/"; + + return trimFragment ? path.Before('#') : path; +} + +// this strange creature returns 2nd level domain, possibly with port +TStringBuf GetDomain(const TStringBuf host) noexcept { + const char* c = !host ? host.data() : host.end() - 1; + for (bool wasPoint = false; c != host.data(); --c) { + if (*c == '.') { + if (wasPoint) { + ++c; + break; + } + wasPoint = true; + } + } + return TStringBuf(c, host.end()); +} + +TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept { + size_t pos = host.size(); + for (size_t i = 0; i < level; ++i) { + pos = host.rfind('.', pos); + if (pos == TString::npos) + return host; + } + return host.SubStr(pos + 1); +} + +TStringBuf GetZone(const TStringBuf host) noexcept { + return GetParentDomain(host, 1); +} + +TStringBuf CutWWWPrefix(const TStringBuf url) noexcept { + if (url.size() >= 4 && url[3] == '.' && !strnicmp(url.data(), "www", 3)) + return url.substr(4); + return url; +} + +TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept { + auto it = url.begin(); + + StripRangeBegin(it, url.end(), [](auto& it){ return *it == 'w' || *it == 'W'; }); + if (it == url.begin()) { + return url; + } + + StripRangeBegin(it, url.end(), [](auto& it){ return IsAsciiDigit(*it); }); + if (it == url.end()) { + return url; + } + + if (*it++ == '.') { + return url.Tail(it - url.begin()); + } + + return url; +} + +TStringBuf CutMPrefix(const TStringBuf url) noexcept { + if (url.size() >= 2 && url[1] == '.' && (url[0] == 'm' || url[0] == 'M')) { + return url.substr(2); + } + return url; +} + +static inline bool IsSchemeChar(char c) noexcept { + return IsAsciiAlnum(c); //what about '+' ?.. 
+} + +static bool HasPrefix(const TStringBuf url) noexcept { + TStringBuf scheme, unused; + if (!url.TrySplit(TStringBuf("://"), scheme, unused)) + return false; + + return AllOf(scheme, IsSchemeChar); +} + +TString AddSchemePrefix(const TString& url) { + return AddSchemePrefix(url, TStringBuf("http")); +} + +TString AddSchemePrefix(const TString& url, TStringBuf scheme) { + if (HasPrefix(url)) { + return url; + } + + return TString::Join(scheme, TStringBuf("://"), url); +} + +#define X(c) (c >= 'A' ? ((c & 0xdf) - 'A') + 10 : (c - '0')) + +static inline int x2c(unsigned char* x) { + if (!IsAsciiHex(x[0]) || !IsAsciiHex(x[1])) + return -1; + return X(x[0]) * 16 + X(x[1]); +} + +#undef X + +static inline int Unescape(char* str) { + char *to, *from; + int dlen = 0; + if ((str = strchr(str, '%')) == nullptr) + return dlen; + for (to = str, from = str; *from; from++, to++) { + if ((*to = *from) == '%') { + int c = x2c((unsigned char*)from + 1); + *to = char((c > 0) ? c : '0'); + from += 2; + dlen += 2; + } + } + *to = 0; /* terminate it at the new length */ + return dlen; +} + +size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) { + if (source.empty() || source[0] == '?') + return strlcpy(dest, "/", dest_size); + size_t len = Min(dest_size - 1, source.length()); + memcpy(dest, source.data(), len); + dest[len] = 0; + len -= Unescape(dest); + strlwr(dest); + return len; +} + +size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) { + size_t len = Min(dest_size - 1, source.length()); + memcpy(dest, source.data(), len); + dest[len] = 0; + char buf[8] = ":"; + size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2); + buf[buflen] = '\0'; + char* ptr = strstr(dest, buf); + if (ptr && ptr[buflen] == 0) { + len -= buflen; + *ptr = 0; + } + strlwr(dest); + return len; +} + +TStringBuf RemoveFinalSlash(TStringBuf str) noexcept { + if (str.EndsWith('/')) { + str.Chop(1); + } + return str; +} + +TStringBuf 
CutUrlPrefixes(TStringBuf url) noexcept { + url = CutSchemePrefix(url); + url = CutWWWPrefix(url); + return url; +} + +bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept { + url = CutSchemePrefix(url); + const TStringBuf noHostSuffix = url.After('/'); + if (noHostSuffix == url) { + // no slash => no suffix with token info + return false; + } + const bool suffixHasPrefix = noHostSuffix.StartsWith(token); + if (!suffixHasPrefix) { + return false; + } + const bool slashAfterPrefix = noHostSuffix.find("/", token.length()) == token.length(); + const bool qMarkAfterPrefix = noHostSuffix.find("?", token.length()) == token.length(); + const bool nothingAfterPrefix = noHostSuffix.length() <= token.length(); + const bool prefixIsToken = slashAfterPrefix || qMarkAfterPrefix || nothingAfterPrefix; + return prefixIsToken; +} + diff --git a/library/cpp/string_utils/url/url.h b/library/cpp/string_utils/url/url.h new file mode 100644 index 0000000000..84137ccc57 --- /dev/null +++ b/library/cpp/string_utils/url/url.h @@ -0,0 +1,170 @@ +#pragma once + +#include <util/generic/fwd.h> +#include <util/generic/strbuf.h> + +namespace NUrl { + + /** + * Splits URL to host and path + * Example: + * auto [host, path] = SplitUrlToHostAndPath(url); + * + * @param[in] url any URL + * @param[out] <host, path> parsed host and path + */ + struct TSplitUrlToHostAndPathResult { + TStringBuf host; + TStringBuf path; + }; + + Y_PURE_FUNCTION + TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url); + +} // namespace NUrl + +Y_PURE_FUNCTION +size_t GetHttpPrefixSize(const char* url, bool ignorehttps = false) noexcept; +Y_PURE_FUNCTION +size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps = false) noexcept; + +Y_PURE_FUNCTION +size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps = false) noexcept; + +Y_PURE_FUNCTION +size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps = false) noexcept; + +/** BEWARE of TStringBuf! 
You can not use operator ~ or c_str() like in TString + !!!!!!!!!!!! */ +Y_PURE_FUNCTION +size_t GetSchemePrefixSize(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetSchemePrefix(const TStringBuf url) noexcept; + +//! removes protocol prefixes 'http://' and 'https://' from given URL +//! @note if URL has no prefix or some other prefix the function does nothing +//! @param url URL from which the prefix should be removed +//! @param ignorehttps if true, leaves https:// +//! @return a new URL without protocol prefix +Y_PURE_FUNCTION +TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps = false) noexcept; + +Y_PURE_FUNCTION +TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps = false) noexcept; + +Y_PURE_FUNCTION +TStringBuf CutSchemePrefix(const TStringBuf url) noexcept; + +//! adds specified scheme prefix if URL has no scheme +//! @note if URL has scheme prefix already the function returns unchanged URL +TString AddSchemePrefix(const TString& url, const TStringBuf scheme); + +//! Same as `AddSchemePrefix(url, "http")`. 
+TString AddSchemePrefix(const TString& url); + +Y_PURE_FUNCTION +TStringBuf GetHost(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetHostAndPort(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp = true, bool trimDefaultPort = true) noexcept; + +/** + * Splits URL to host and path + * + * @param[in] url any URL + * @param[out] host parsed host + * @param[out] path parsed path + */ +void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path); +void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path); + +/** + * Separates URL into url prefix, query (aka cgi params list), and fragment (aka part after #) + * + * @param[in] url any URL + * @param[out] sanitizedUrl parsed URL without query and fragment parts + * @param[out] query parsed query + * @param[out] fragment parsed fragment + */ +void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment); + +/** + * Extracts scheme, host and port from URL. + * + * Port will be parsed from URL with checks against ui16 overflow. If URL doesn't + * contain port it will be determined by one of the known schemes (currently + * https:// and http:// only). + * Given parameters will not be modified if URL has no appropriate components. + * + * @param[in] url any URL + * @param[out] scheme URL scheme + * @param[out] host host name + * @param[out] port parsed port number + * @return false if present port number cannot be parsed into ui16 + * true otherwise. + */ +bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port); + +/** + * Extracts scheme, host and port from URL. + * + * This function perform the same actions as TryGetSchemeHostAndPort(), but in + * case of impossibility to parse port number throws yexception. 
+ * + * @param[in] url any URL + * @param[out] scheme URL scheme + * @param[out] host host name + * @param[out] port parsed port number + * @throws yexception if present port number cannot be parsed into ui16. + */ +void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port); + +Y_PURE_FUNCTION +TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment = true) noexcept; +/** + * Extracts host from url and cuts http(https) protocol prefix and port if any. + * @param[in] url any URL + * @return host without port and http(https) prefix. + */ +Y_PURE_FUNCTION +TStringBuf GetOnlyHost(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept; // ("www.ya.ru", 2) -> "ya.ru" + +Y_PURE_FUNCTION +TStringBuf GetZone(const TStringBuf host) noexcept; + +Y_PURE_FUNCTION +TStringBuf CutWWWPrefix(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept; + +/** + * Cuts 'm.' prefix from url if and only if the url starts with it + * Example: 'm.some-domain.com' -> 'some-domain.com'. + * 'http://m.some-domain.com' is not changed + * + * @param[in] url any URL + * @return url without 'm.' or 'M.' prefix. 
+ */ +Y_PURE_FUNCTION +TStringBuf CutMPrefix(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetDomain(const TStringBuf host) noexcept; // should not be used + +size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size); +size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport = 80); + +Y_PURE_FUNCTION +TStringBuf RemoveFinalSlash(TStringBuf str) noexcept; + +TStringBuf CutUrlPrefixes(TStringBuf url) noexcept; +bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept; + diff --git a/library/cpp/string_utils/url/url_ut.cpp b/library/cpp/string_utils/url/url_ut.cpp new file mode 100644 index 0000000000..1588013893 --- /dev/null +++ b/library/cpp/string_utils/url/url_ut.cpp @@ -0,0 +1,281 @@ +#include "url.h" + +#include <util/string/cast.h> + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(TUtilUrlTest) { + Y_UNIT_TEST(TestGetHostAndGetHostAndPort) { + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru:8080")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru:8080", GetHostAndPort("ya.ru:8080")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru/bebe:8080")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru/bebe:8080")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru:8080/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("https://ya.ru:8080/bebe")); + UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetHost("www.ya.ru:8080/bebe")); + UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetHost("https://www.ya.ru:8080/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru:8080", GetHostAndPort("ya.ru:8080/bebe")); + // irl RFC3986 sometimes gets ignored + UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetHost("pravda-kmv.ru?page=news&id=6973")); + 
UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetHostAndPort("pravda-kmv.ru?page=news&id=6973")); + // check simple string + UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetHost("some_blender_url")); + UNIT_ASSERT_VALUES_EQUAL("", GetHost("")); + } + + Y_UNIT_TEST(TestGetPathAndQuery) { + UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org")); + UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/")); + UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org:8080")); + UNIT_ASSERT_VALUES_EQUAL("/index.php?123/", GetPathAndQuery("ru.wikipedia.org/index.php?123/")); + UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("http://ru.wikipedia.org:8080")); + UNIT_ASSERT_VALUES_EQUAL("/index.php?123/", GetPathAndQuery("https://ru.wikipedia.org/index.php?123/")); + UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/#comment")); + UNIT_ASSERT_VALUES_EQUAL("/?1", GetPathAndQuery("ru.wikipedia.org/?1#comment")); + UNIT_ASSERT_VALUES_EQUAL("/?1#comment", GetPathAndQuery("ru.wikipedia.org/?1#comment", false)); + } + + Y_UNIT_TEST(TestGetDomain) { + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("www.ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("a.b.ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya", GetDomain("ya")); + UNIT_ASSERT_VALUES_EQUAL("", GetDomain("")); + } + + Y_UNIT_TEST(TestGetParentDomain) { + UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("www.ya.ru", 0)); + UNIT_ASSERT_VALUES_EQUAL("ru", GetParentDomain("www.ya.ru", 1)); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetParentDomain("www.ya.ru", 2)); + UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetParentDomain("www.ya.ru", 3)); + UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetParentDomain("www.ya.ru", 4)); + UNIT_ASSERT_VALUES_EQUAL("com", GetParentDomain("ya.com", 1)); + UNIT_ASSERT_VALUES_EQUAL("ya.com", GetParentDomain("ya.com", 2)); + UNIT_ASSERT_VALUES_EQUAL("RU", GetParentDomain("RU", 
1)); + UNIT_ASSERT_VALUES_EQUAL("RU", GetParentDomain("RU", 2)); + UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("", 0)); + UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("", 1)); + } + + Y_UNIT_TEST(TestGetZone) { + UNIT_ASSERT_VALUES_EQUAL("ru", GetZone("www.ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("com", GetZone("ya.com")); + UNIT_ASSERT_VALUES_EQUAL("RU", GetZone("RU")); + UNIT_ASSERT_VALUES_EQUAL("FHFBN", GetZone("ya.FHFBN")); + UNIT_ASSERT_VALUES_EQUAL("", GetZone("")); + } + + Y_UNIT_TEST(TestAddSchemePrefix) { + UNIT_ASSERT_VALUES_EQUAL("http://yandex.ru", AddSchemePrefix("yandex.ru")); + UNIT_ASSERT_VALUES_EQUAL("http://yandex.ru", AddSchemePrefix("http://yandex.ru")); + UNIT_ASSERT_VALUES_EQUAL("https://yandex.ru", AddSchemePrefix("https://yandex.ru")); + UNIT_ASSERT_VALUES_EQUAL("file://yandex.ru", AddSchemePrefix("file://yandex.ru")); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", AddSchemePrefix("ya.ru", "ftp")); + } + + Y_UNIT_TEST(TestSchemeGet) { + UNIT_ASSERT_VALUES_EQUAL("http://", GetSchemePrefix("http://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("", GetSchemePrefix("yaru")); + UNIT_ASSERT_VALUES_EQUAL("yaru://", GetSchemePrefix("yaru://ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("", GetSchemePrefix("ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("ftp://", GetSchemePrefix("ftp://ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("https://", GetSchemePrefix("https://")); // is that right? + } + + Y_UNIT_TEST(TestSchemeCut) { + UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutSchemePrefix("http://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("yaru", CutSchemePrefix("yaru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("yaru://ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("ftp://ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("", CutSchemePrefix("https://")); // is that right? 
+ + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", CutHttpPrefix("ftp://ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("http://ya.ru/zzz")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("http://ya.ru/zzz", true)); + UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("https://ya.ru/zzz")); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru/zzz", CutHttpPrefix("https://ya.ru/zzz", true)); + UNIT_ASSERT_VALUES_EQUAL("", CutHttpPrefix("https://")); // is that right? + UNIT_ASSERT_VALUES_EQUAL("https://", CutHttpPrefix("https://", true)); // is that right? + } + + Y_UNIT_TEST(TestMisc) { + UNIT_ASSERT_VALUES_EQUAL("", CutWWWPrefix("www.")); + UNIT_ASSERT_VALUES_EQUAL("", CutWWWPrefix("WwW.")); + UNIT_ASSERT_VALUES_EQUAL("www", CutWWWPrefix("www")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWPrefix("www.ya.ru")); + + UNIT_ASSERT_VALUES_EQUAL("", CutWWWNumberedPrefix("www.")); + UNIT_ASSERT_VALUES_EQUAL("www", CutWWWNumberedPrefix("www")); + UNIT_ASSERT_VALUES_EQUAL("www27", CutWWWNumberedPrefix("www27")); + UNIT_ASSERT_VALUES_EQUAL("", CutWWWNumberedPrefix("www27.")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www.ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www2.ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www12.ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("ww2.ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("w1w2w3.ya.ru", CutWWWNumberedPrefix("w1w2w3.ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("123.ya.ru", CutWWWNumberedPrefix("123.ya.ru")); + + UNIT_ASSERT_VALUES_EQUAL("", CutMPrefix("m.")); + UNIT_ASSERT_VALUES_EQUAL("", CutMPrefix("M.")); + UNIT_ASSERT_VALUES_EQUAL("m", CutMPrefix("m")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutMPrefix("m.ya.ru")); + } + + Y_UNIT_TEST(TestSplitUrlToHostAndPath) { + TStringBuf host, path; + + SplitUrlToHostAndPath("https://yandex.ru/yandsearch", host, path); + UNIT_ASSERT_STRINGS_EQUAL(host, "https://yandex.ru"); + UNIT_ASSERT_STRINGS_EQUAL(path, 
"/yandsearch"); + + SplitUrlToHostAndPath("yandex.ru/yandsearch", host, path); + UNIT_ASSERT_STRINGS_EQUAL(host, "yandex.ru"); + UNIT_ASSERT_STRINGS_EQUAL(path, "/yandsearch"); + + SplitUrlToHostAndPath("https://yandex.ru", host, path); + UNIT_ASSERT_STRINGS_EQUAL(host, "https://yandex.ru"); + UNIT_ASSERT_STRINGS_EQUAL(path, ""); + + SplitUrlToHostAndPath("invalid url /", host, path); + UNIT_ASSERT_STRINGS_EQUAL(host, "invalid url "); + UNIT_ASSERT_STRINGS_EQUAL(path, "/"); + + SplitUrlToHostAndPath("some_blender_url", host, path); + UNIT_ASSERT_STRINGS_EQUAL(host, "some_blender_url"); + UNIT_ASSERT_STRINGS_EQUAL(path, ""); + } + + Y_UNIT_TEST(TestSeparateUrlFromQueryAndFragment) { + TStringBuf sanitizedUrl, query, fragment; + + SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch", sanitizedUrl, query, fragment); + UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch"); + UNIT_ASSERT_STRINGS_EQUAL(query, ""); + UNIT_ASSERT_STRINGS_EQUAL(fragment, ""); + + SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch?param1=val1¶m2=val2", sanitizedUrl, query, fragment); + UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch"); + UNIT_ASSERT_STRINGS_EQUAL(query, "param1=val1¶m2=val2"); + UNIT_ASSERT_STRINGS_EQUAL(fragment, ""); + + SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch#fragment", sanitizedUrl, query, fragment); + UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch"); + UNIT_ASSERT_STRINGS_EQUAL(query, ""); + UNIT_ASSERT_STRINGS_EQUAL(fragment, "fragment"); + + SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch?param1=val1¶m2=val2#fragment", sanitizedUrl, query, fragment); + UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch"); + UNIT_ASSERT_STRINGS_EQUAL(query, "param1=val1¶m2=val2"); + UNIT_ASSERT_STRINGS_EQUAL(fragment, "fragment"); + } + + Y_UNIT_TEST(TestGetSchemeHostAndPort) { + { // all components are present + TStringBuf scheme("unknown"), 
host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("https://ya.ru:8080/bebe", scheme, host, port); + UNIT_ASSERT_VALUES_EQUAL(scheme, "https://"); + UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); + UNIT_ASSERT_VALUES_EQUAL(port, 8080); + } + { // scheme is abset + TStringBuf scheme("unknown"), host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("ya.ru:8080/bebe", scheme, host, port); + UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown"); + UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); + UNIT_ASSERT_VALUES_EQUAL(port, 8080); + } + { // scheme and port are absent + TStringBuf scheme("unknown"), host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("ya.ru/bebe", scheme, host, port); + UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown"); + UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); + UNIT_ASSERT_VALUES_EQUAL(port, 0); + } + { // port is absent, but returned its default value for HTTP + TStringBuf scheme("unknown"), host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("http://ya.ru/bebe", scheme, host, port); + UNIT_ASSERT_VALUES_EQUAL(scheme, "http://"); + UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); + UNIT_ASSERT_VALUES_EQUAL(port, 80); + } + { // port is absent, but returned its default value for HTTPS + TStringBuf scheme("unknown"), host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("https://ya.ru/bebe", scheme, host, port); + UNIT_ASSERT_VALUES_EQUAL(scheme, "https://"); + UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); + UNIT_ASSERT_VALUES_EQUAL(port, 443); + } + { // ipv6 + TStringBuf scheme("unknown"), host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("https://[1080:0:0:0:8:800:200C:417A]:443/bebe", scheme, host, port); + UNIT_ASSERT_VALUES_EQUAL(scheme, "https://"); + UNIT_ASSERT_VALUES_EQUAL(host, "[1080:0:0:0:8:800:200C:417A]"); + UNIT_ASSERT_VALUES_EQUAL(port, 443); + } + { // ipv6 + TStringBuf scheme("unknown"), host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("[::1]/bebe", scheme, host, port); + UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown"); + 
UNIT_ASSERT_VALUES_EQUAL(host, "[::1]"); + UNIT_ASSERT_VALUES_EQUAL(port, 0); + } + { // ipv6 + TStringBuf scheme("unknown"), host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("unknown:///bebe", scheme, host, port); + UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown://"); + UNIT_ASSERT_VALUES_EQUAL(host, ""); + UNIT_ASSERT_VALUES_EQUAL(port, 0); + } + // port overflow + auto testCase = []() { + TStringBuf scheme("unknown"), host("unknown"); + ui16 port = 0; + GetSchemeHostAndPort("https://ya.ru:65536/bebe", scheme, host, port); + }; + UNIT_ASSERT_EXCEPTION(testCase(), yexception); + } + + Y_UNIT_TEST(TestCutUrlPrefixes) { + UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutUrlPrefixes("http://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("yaru", CutUrlPrefixes("yaru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("yaru://ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ftp://ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("", CutUrlPrefixes("https://")); + + UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutUrlPrefixes("https://www.ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("yaru", CutUrlPrefixes("www.yaru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("yaru://www.ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("www.ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ftp://www.ya.ru://zzz")); + UNIT_ASSERT_VALUES_EQUAL("", CutUrlPrefixes("http://www.")); + } + + Y_UNIT_TEST(TestUrlPathStartWithToken) { + UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe/zzz", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe?zzz", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe/", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe?", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(true, 
DoesUrlPathStartWithToken("https://ya.ru/bebe", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/bebezzz", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/bebe.zzz", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://bebe", "bebe")); + UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("https://bebe/", "bebe")); + } +} diff --git a/library/cpp/string_utils/url/ut/ya.make b/library/cpp/string_utils/url/ut/ya.make new file mode 100644 index 0000000000..0efa30e4d2 --- /dev/null +++ b/library/cpp/string_utils/url/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/string_utils/url) + +OWNER(g:util) + +SRCS( + url_ut.cpp +) + +END() diff --git a/library/cpp/string_utils/url/ya.make b/library/cpp/string_utils/url/ya.make new file mode 100644 index 0000000000..b08d69ec83 --- /dev/null +++ b/library/cpp/string_utils/url/ya.make @@ -0,0 +1,10 @@ +LIBRARY() + +OWNER(g:util) + +SRCS( + url.cpp + url.h +) + +END() diff --git a/library/cpp/string_utils/ya.make b/library/cpp/string_utils/ya.make new file mode 100644 index 0000000000..cd731bda95 --- /dev/null +++ b/library/cpp/string_utils/ya.make @@ -0,0 +1,37 @@ +RECURSE( + ascii_encode + ascii_encode/ut + base64 + base64/bench + base64/bench/metrics + base64/ut + base64/fuzz + csv + csv/bench + csv/ut + col_diff + col_diff/ut + indent_text + levenshtein_diff + levenshtein_diff/ut + old_url_normalize + old_url_normalize/ut + parse_size + parse_size/ut + parse_vector + parse_vector/ut + secret_string + quote + quote/ut + relaxed_escaper + relaxed_escaper/ut + scan + subst_buf + subst_buf/ut + tskv_format + tskv_format/ut + tskv_format/fuzz + url + url/ut + ztstrbuf +) diff --git a/library/cpp/string_utils/ztstrbuf/ya.make 
b/library/cpp/string_utils/ztstrbuf/ya.make new file mode 100644 index 0000000000..28b3f32f58 --- /dev/null +++ b/library/cpp/string_utils/ztstrbuf/ya.make @@ -0,0 +1,9 @@ +LIBRARY() + +OWNER(myltsev) + +SRCS( + ztstrbuf.cpp +) + +END() diff --git a/library/cpp/string_utils/ztstrbuf/ztstrbuf.cpp b/library/cpp/string_utils/ztstrbuf/ztstrbuf.cpp new file mode 100644 index 0000000000..4a7269ff4a --- /dev/null +++ b/library/cpp/string_utils/ztstrbuf/ztstrbuf.cpp @@ -0,0 +1,8 @@ +#include "ztstrbuf.h" + +#include <util/stream/output.h> + +template <> +void Out<TZtStringBuf>(IOutputStream& os, const TZtStringBuf& sb) { + os << static_cast<const TStringBuf&>(sb); +} diff --git a/library/cpp/string_utils/ztstrbuf/ztstrbuf.h b/library/cpp/string_utils/ztstrbuf/ztstrbuf.h new file mode 100644 index 0000000000..5fab768d8c --- /dev/null +++ b/library/cpp/string_utils/ztstrbuf/ztstrbuf.h @@ -0,0 +1,36 @@ +#pragma once + +#include <util/generic/strbuf.h> +#include <util/generic/string.h> + +/* + * Zero-terminated string view. + * + * Has a c_str() for use with system/cstdlib calls (like TString) + * but can be constructed from a string literal or command-line arg + * without memory allocation (like TStringBuf). + * + * Use it to reference filenames, thread names, string formats etc. + */ + +class TZtStringBuf: public TStringBuf { +public: + TZtStringBuf(const char* s) + : TStringBuf(s) + { + } + + TZtStringBuf(const TString& s) + : TStringBuf(s) + { + } + + TZtStringBuf() + : TZtStringBuf(TString{}) + { + } + + const char* c_str() const { + return data(); + } +}; |