aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/uri
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/uri
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/uri')
-rw-r--r--library/cpp/uri/assign.cpp426
-rw-r--r--library/cpp/uri/benchmark/main.cpp46
-rw-r--r--library/cpp/uri/benchmark/ya.make17
-rw-r--r--library/cpp/uri/common.cpp115
-rw-r--r--library/cpp/uri/common.h511
-rw-r--r--library/cpp/uri/encode.cpp221
-rw-r--r--library/cpp/uri/encode.h282
-rw-r--r--library/cpp/uri/encodefsm.rl651
-rw-r--r--library/cpp/uri/http_url.h77
-rw-r--r--library/cpp/uri/location.cpp31
-rw-r--r--library/cpp/uri/location.h13
-rw-r--r--library/cpp/uri/location_ut.cpp40
-rw-r--r--library/cpp/uri/other.cpp82
-rw-r--r--library/cpp/uri/other.h42
-rw-r--r--library/cpp/uri/parse.cpp207
-rw-r--r--library/cpp/uri/parse.h361
-rw-r--r--library/cpp/uri/parsefsm.rl6501
-rw-r--r--library/cpp/uri/qargs.cpp279
-rw-r--r--library/cpp/uri/qargs.h22
-rw-r--r--library/cpp/uri/uri-ru_ut.cpp163
-rw-r--r--library/cpp/uri/uri.cpp623
-rw-r--r--library/cpp/uri/uri.h626
-rw-r--r--library/cpp/uri/uri_ut.cpp1022
-rw-r--r--library/cpp/uri/uri_ut.h81
-rw-r--r--library/cpp/uri/ut/ya.make19
-rw-r--r--library/cpp/uri/ya.make32
26 files changed, 5890 insertions, 0 deletions
diff --git a/library/cpp/uri/assign.cpp b/library/cpp/uri/assign.cpp
new file mode 100644
index 00000000000..ea1955a9e92
--- /dev/null
+++ b/library/cpp/uri/assign.cpp
@@ -0,0 +1,426 @@
+#include "uri.h"
+#include "parse.h"
+
+#include <contrib/libs/libidn/idna.h>
+
+#include <library/cpp/charset/recyr.hh>
+#include <util/charset/wide.h>
+#include <util/memory/tempbuf.h>
+#include <util/string/cast.h>
+#include <util/system/maxlen.h>
+#include <util/system/yassert.h>
+#include <util/system/sys_alloc.h>
+
+namespace NUri {
+ // Converts a zero-terminated UCS-4 host name to its ASCII (IDNA) form.
+ // Returns an empty pointer when libidn reports anything but success.
+ TMallocPtr<char> TUri::IDNToAscii(const wchar32* idna) {
+     // XXX: punycode_encode is deliberately not called directly: it does
+     // neither proper stringprep nor splitting on dot-equivalent characters
+     static_assert(sizeof(*idna) == sizeof(ui32), "fixme");
+     char* ascii = nullptr;
+     const int rc = idna_to_ascii_4z(reinterpret_cast<const uint32_t*>(idna), &ascii, 0);
+     return IDNA_SUCCESS == rc ? ascii : nullptr;
+ }
+
+ // Recodes host from charset enc into a temporary zero-terminated wide
+ // string and hands it to the wchar32 overload.
+ TMallocPtr<char> TUri::IDNToAscii(const TStringBuf& host, ECharset enc) {
+     // room for one wide character per input byte plus the terminator
+     const size_t capacity = 1 + host.length();
+     TTempBuf storage(sizeof(wchar32) * capacity);
+     wchar32* const wide = reinterpret_cast<wchar32*>(storage.Data());
+
+     const size_t count = NDetail::NBaseOps::Recode(host, wide, enc).length();
+     wide[count] = 0;
+
+     return IDNToAscii(wide);
+ }
+
+ TStringBuf TUri::HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc) { // returns the ASCII form of host, or an empty buffer if it cannot be converted; buf owns the storage a non-empty result points into
+ TStringBuf outhost; // store the result here before returning it, to get RVO
+
+ size_t buflen = 0; // capacity of the allocation placed in buf below
+
+ if (hasExtended && !allowIDN)
+ return outhost; // definitely can't convert
+
+ // charset-recode: RFC 3986, 3.2.2, requires percent-encoded non-ASCII
+ // chars in reg-name to be UTF-8 so convert to UTF-8 prior to decoding
+ const bool recoding = CODES_UTF8 != enc && hasExtended;
+ if (recoding) {
+ size_t nrd, nwr; // out-params of Recode; nwr is used as the recoded length
+ buflen = host.length() * 4; // NOTE(review): presumably a worst-case UTF-8 expansion bound — confirm
+ buf.Reset(static_cast<char*>(y_allocate(buflen)));
+ if (RECODE_OK != Recode(enc, CODES_UTF8, host.data(), buf.Get(), host.length(), buflen, nrd, nwr))
+ return outhost;
+ host = TStringBuf(buf.Get(), nwr); // continue with the UTF-8 copy held in buf
+ }
+
+ // percent-decode
+ if (0 == buflen) {
+ buflen = host.length();
+ buf.Reset(static_cast<char*>(y_allocate(buflen)));
+ }
+ // decoding shortens so writing over host in buf is OK
+ TMemoryWriteBuffer out(buf.Get(), buflen);
+ TEncoder decoder(out, FeatureDecodeANY | FeatureToLower);
+ const long outFlags = decoder.ReEncode(host);
+ hasExtended = 0 != (outFlags & FeatureEncodeExtendedASCII); // re-evaluated from the flags the decoder reports
+
+ // check again
+ if (hasExtended && !allowIDN)
+ return outhost;
+
+ host = out.Str();
+
+ // convert to punycode if needed
+ if (!hasExtended) {
+ outhost = host;
+ return outhost;
+ }
+
+ TMallocPtr<char> puny;
+ try {
+ puny = IDNToAscii(host);
+ } catch (const yexception& /* exc */) { // swallowed on purpose: puny stays empty and the fallback below is tried
+ }
+
+ if (!puny) {
+ // XXX: try user charset unless UTF8 or converted to it
+ if (CODES_UTF8 == enc || recoding)
+ return outhost;
+ try {
+ puny = IDNToAscii(host, enc);
+ } catch (const yexception& /* exc */) {
+ return outhost;
+ }
+ if (!puny)
+ return outhost;
+ }
+
+ buf = puny; // buf now holds the punycode string instead of the decode buffer
+ outhost = buf.Get();
+
+ return outhost;
+ }
+
+ // Inspects the characters of host and decides whether any conversion is
+ // needed. Returns host itself when it is already plain, the converted
+ // form when conversion is possible, and an empty buffer otherwise.
+ TStringBuf TUri::HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc) {
+     // accumulate the feature flags of every character
+     long seen = 0;
+     for (const char ch : host)
+         seen |= TEncoder::GetFlags(ch).FeatFlags;
+
+     const bool extended = 0 != (seen & FeatureEncodeExtendedASCII);
+
+     // extended ASCII is only acceptable when IDN is allowed
+     if (extended && !allowIDN)
+         return TStringBuf();
+
+     // nothing extended and nothing to decode: the host is usable as-is
+     if (!extended && 0 == (seen & FeatureDecodeANY))
+         return host;
+
+     return HostToAscii(host, buf, extended, allowIDN, enc);
+ }
+
+ // Writes val to out, re-encoding it for field fld when flags request any
+ // encoder feature. Returns false (writing nothing) for an empty value.
+ static inline bool AppendField(TMemoryWriteBuffer& out, TField::EField fld, const TStringBuf& val, long flags) {
+     if (val.empty())
+         return false;
+
+     const bool needsEncoder = 0 != (flags & TFeature::FeaturesAllEncoder);
+     if (!needsEncoder) {
+         out << val;
+         return true;
+     }
+
+     TUri::ReEncodeField(out, val, fld, flags);
+     return true;
+ }
+
+ TState::EParsed TUri::AssignImpl(const TParser& parser, TScheme::EKind defscheme) { // resets this URI and fills it from the parser's sections; returns the resulting parse state
+ Clear();
+
+ TState::EParsed ret = parser.State;
+ if (ParsedBadFormat <= ret) // parser already reported an error state
+ return ret;
+
+ const TSection& scheme = parser.Get(FieldScheme);
+ const TSchemeInfo& schemeInfo = SetSchemeImpl(parser.Scheme);
+
+ // set the scheme always if available
+ if (schemeInfo.Str.empty() && scheme.IsSet())
+ FldSet(FieldScheme, scheme.Get());
+
+ if (ParsedOK != ret) // remaining non-error states (ParsedEmpty, ParsedRootless): only the scheme is kept
+ return ret;
+
+ size_t buflen = 0; // bytes to reserve in Buffer for the fields copied below
+
+ // special processing for fields
+
+ const bool convertIDN = parser.Flags & FeatureConvertHostIDN;
+ long flags = parser.Flags.Allow;
+ if (convertIDN)
+ flags |= FeatureAllowHostIDN | FeatureCheckHost;
+
+ // process non-ASCII host for punycode
+
+ TMallocPtr<char> hostptr;
+ TStringBuf hostascii; // empty: use host field; non-empty: ascii
+ bool hostConverted = false; // hostascii is empty or the original
+ const TSection& host = parser.Get(FieldHost);
+ if (host.IsSet() && !FldIsSet(FieldHost)) {
+ const bool allowIDN = (flags & FeatureAllowHostIDN);
+ const TStringBuf hostbuf = host.Get();
+
+ // if we know we have and allow extended-ASCII chars, no need to check further
+ if (allowIDN && (host.GetFlagsAllPlaintext() & FeatureEncodeExtendedASCII))
+ hostascii = HostToAscii(hostbuf, hostptr, true, true, parser.Enc);
+ else
+ hostascii = HostToAscii(hostbuf, hostptr, allowIDN, parser.Enc);
+
+ if (hostascii.empty())
+ ret = ParsedBadHost; // exists but cannot be converted
+ else if (hostbuf.data() != hostascii.data()) { // a different pointer means HostToAscii produced a converted copy
+ hostConverted = true;
+ buflen += 1 + hostascii.length();
+ if (convertIDN)
+ FldMarkSet(FieldHost); // so that we don't process host below
+ }
+ }
+
+ // add unprocessed fields
+
+ for (int idx = 0; idx < FieldUrlMAX; ++idx) {
+ const EField fld = EField(idx);
+ const TSection& section = parser.Get(fld);
+ if (section.IsSet() && !FldIsSet(fld))
+ buflen += 1 + section.EncodedLen(); // includes null
+ }
+ if (0 == buflen) // no more sections set?
+ return ret;
+
+ // process #! fragments
+ // https://developers.google.com/webmasters/ajax-crawling/docs/specification
+
+ static const TStringBuf escFragPrefix(TStringBuf("_escaped_fragment_="));
+
+ bool encHashBangFrag = false; // true: the "#!" fragment is moved into the query
+ TStringBuf qryBeforeEscapedFragment;
+ TStringBuf qryEscapedFragment; // inited: the _escaped_fragment_ query argument is moved into the fragment
+ do { // single-pass block so that break can skip the remaining checks
+ if (FldIsSet(FieldFrag) || FldIsSet(FieldQuery))
+ break;
+
+ const TSection& frag = parser.Get(FieldFrag);
+ if (frag.IsSet()) {
+ if (0 == (parser.Flags & FeatureHashBangToEscapedFragment))
+ break;
+ const TStringBuf fragbuf = frag.Get();
+ if (fragbuf.empty() || '!' != fragbuf[0])
+ break;
+ encHashBangFrag = true;
+ // '!' will make space for '&' or '\0' if needed
+ buflen += escFragPrefix.length();
+ buflen += 2 * fragbuf.length(); // we don't know how many will be encoded
+ } else {
+ const TSection& qry = parser.Get(FieldQuery);
+ if (!qry.IsSet())
+ break;
+ // FeatureHashBangToEscapedFragment has preference
+ if (FeatureEscapedToHashBangFragment != (parser.Flags & FeaturesEscapedFragment))
+ break;
+ qry.Get().RSplit('&', qryBeforeEscapedFragment, qryEscapedFragment);
+ if (!qryEscapedFragment.StartsWith(escFragPrefix)) {
+ qryEscapedFragment.Clear();
+ break;
+ }
+ qryEscapedFragment.Skip(escFragPrefix.length());
+ buflen += 2; // for '!' and '\0' in fragment
+ buflen -= escFragPrefix.length();
+ }
+ } while (false);
+
+ // now set all fields prior to validating
+
+ Alloc(buflen);
+
+ TMemoryWriteBuffer out(Buffer.data(), Buffer.size());
+ for (int idx = 0; idx < FieldUrlMAX; ++idx) {
+ const EField fld = EField(idx);
+
+ const TSection& section = parser.Get(fld);
+ if (!section.IsSet() || FldIsSet(fld))
+ continue;
+
+ if (FieldQuery == fld && encHashBangFrag) // the query is written together with the fragment below
+ continue;
+
+ if (FieldFrag == fld && qryEscapedFragment.IsInited()) // the fragment was already produced while handling the query
+ continue;
+
+ char* beg = out.Buf();
+ TStringBuf val = section.Get();
+ long careFlags = section.GetFlagsEncode();
+
+ switch (fld) {
+ default:
+ break;
+
+ case FieldQuery:
+ if (qryEscapedFragment.IsInited()) {
+ const EField dstfld = FieldFrag; // that's where we will store
+ out << '!';
+ if (!qryEscapedFragment.empty())
+ ReEncodeToField(out, qryEscapedFragment, fld, FeatureDecodeANY | careFlags, dstfld, FeatureDecodeANY | parser.GetFieldFlags(dstfld));
+ FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
+ if (qryBeforeEscapedFragment.empty())
+ continue;
+ out << '\0';
+ beg = out.Buf();
+ val = qryBeforeEscapedFragment; // the rest of the query is written by the common code below
+ }
+ break;
+
+ case FieldFrag:
+ if (encHashBangFrag) {
+ const EField dstfld = FieldQuery; // that's where we will store
+ const TSection& qry = parser.Get(dstfld);
+ if (qry.IsSet())
+ if (AppendField(out, dstfld, qry.Get(), qry.GetFlagsEncode()))
+ out << '&';
+ out << escFragPrefix;
+ val.Skip(1); // skip '!'
+ ReEncodeToField(out, val, fld, careFlags, dstfld, parser.GetFieldFlags(dstfld));
+ FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
+ continue;
+ }
+ break;
+ }
+
+ AppendField(out, fld, val, careFlags);
+ char* end = out.Buf();
+
+ if (careFlags & FeaturePathOperation) {
+ if (!PathOperation(beg, end, PathOperationFlag(parser.Flags)))
+ return ParsedBadPath;
+
+ Y_ASSERT(beg >= out.Beg());
+ out.SetPos(end); // PathOperation receives beg/end and the write position is moved to the resulting end
+ }
+
+ FldSetNoDirty(fld, TStringBuf(beg, end));
+
+ // special character case
+ const long checkChars = section.GetFlagsAllPlaintext() & FeaturesCheckSpecialChar;
+ if (0 != checkChars) { // has unencoded special chars: check permission
+ const long allowChars = parser.GetFieldFlags(fld) & checkChars;
+ if (checkChars != allowChars)
+ ret = ParsedBadFormat;
+ }
+
+ out << '\0';
+ }
+
+ if (hostConverted) { // append the converted host after the regular fields
+ char* beg = out.Buf();
+ out << hostascii;
+ char* end = out.Buf();
+ const EField fld = convertIDN ? FieldHost : FieldHostAscii;
+ FldSetNoDirty(fld, TStringBuf(beg, end));
+ out << '\0';
+ }
+
+ Buffer.Resize(out.Len());
+
+ if (GetScheme() == SchemeEmpty && SchemeEmpty != defscheme) {
+ if (SchemeUnknown == defscheme)
+ ret = ParsedBadScheme;
+ else
+ SetSchemeImpl(defscheme);
+ }
+
+ if (0 == (parser.Flags & FeatureAllowEmptyPath))
+ CheckMissingFields();
+
+ const TStringBuf& port = GetField(FieldPort);
+ if (!port.empty()) {
+ if (!TryFromString<ui16>(port, Port)) // the port text must parse as ui16
+ ret = ParsedBadPort;
+ }
+
+ if (ParsedOK != ret)
+ return ret;
+
+ // run validity checks now that all fields are set
+
+ // check the host for DNS compliance
+ do {
+ if (0 == (flags & FeatureCheckHost))
+ break;
+ if (hostascii.empty())
+ hostascii = GetField(FieldHost);
+ if (hostascii.empty())
+ break;
+ // IP literal
+ if ('[' == hostascii[0] && ']' == hostascii.back())
+ break;
+ ret = CheckHost(hostascii);
+ if (ParsedOK != ret)
+ return ret;
+ } while (false);
+
+ return ret;
+ }
+
+ // Resets this URI, rejects empty or over-long input, then parses url and
+ // assigns the result. A maxlen of 0 disables the length check.
+ TState::EParsed TUri::ParseImpl(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
+     Clear();
+
+     if (url.empty())
+         return ParsedEmpty;
+
+     const bool limited = maxlen > 0;
+     if (limited && url.length() > maxlen)
+         return ParsedTooLong;
+
+     const TParser parsed(flags, url, enc);
+     return AssignImpl(parsed, defscheme);
+ }
+
+ // Parses url and, when a non-empty url_base is given and the result is
+ // not a valid absolute URI, parses the base and merges it in.
+ TState::EParsed TUri::Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& url_base, ui32 maxlen, ECharset enc) {
+     // when a base is supplied, url itself is parsed without FeatureNoRelPath
+     const TParseFlags relaxed = flags.Exclude(FeatureNoRelPath);
+     const bool haveBase = !url_base.empty();
+
+     TState::EParsed status = ParseImpl(url, haveBase ? relaxed : flags, maxlen, SchemeEmpty, enc);
+     if (ParsedOK != status)
+         return status;
+
+     if (haveBase && !IsValidAbs()) {
+         TUri baseUri;
+         status = baseUri.ParseImpl(url_base, flags, maxlen, SchemeEmpty, enc);
+         if (ParsedOK != status)
+             return status;
+         Merge(baseUri, PathOperationFlag(flags));
+     }
+
+     Rewrite();
+     return status;
+ }
+
+ // Parses url and merges in the already parsed base when the result is
+ // not a valid absolute URI.
+ TState::EParsed TUri::Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags, ui32 maxlen, ECharset enc) {
+     const TState::EParsed status = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);
+
+     if (ParsedOK == status) {
+         if (!IsValidAbs())
+             Merge(base, PathOperationFlag(flags));
+         Rewrite();
+     }
+
+     return status;
+ }
+
+ // Parses url with FeatureNoRelPath forced on and additionally requires
+ // a host to be present; a missing host yields ParsedBadHost.
+ TState::EParsed TUri::ParseAbsUri(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
+     const TState::EParsed status = ParseImpl(url, flags | FeatureNoRelPath, maxlen, defscheme, enc);
+     if (ParsedOK != status)
+         return status;
+
+     const bool hostMissing = IsNull(FlagHost);
+     if (hostMissing)
+         return ParsedBadHost;
+
+     Rewrite();
+     return ParsedOK;
+ }
+
+}
diff --git a/library/cpp/uri/benchmark/main.cpp b/library/cpp/uri/benchmark/main.cpp
new file mode 100644
index 00000000000..d39704877e9
--- /dev/null
+++ b/library/cpp/uri/benchmark/main.cpp
@@ -0,0 +1,46 @@
+#include <library/cpp/uri/uri.h>
+
+#include <library/cpp/testing/benchmark/bench.h>
+
+#include <util/generic/vector.h>
+
+const TString URLS[] = {
+ "http://www.TEST.Ru:80/InDex.html",
+ "www.ya.ru/index.html",
+ "https://workplace.z.yandex-team.ru/di#vertical=drive&datePreset=week",
+ "https://warden.z.yandex-team.ru/components/web/report?filter_type=action_items&filter_status=total&filter_period=review",
+ "https://meduza.io/news/2021/05/01/italiya-vozobnovila-vydachu-turisticheskih-viz-v-moskve",
+ "https://gcc.gnu.org/projects/cxx-status.html#cxx20",
+ "https://github.com/llvm/llvm-project/commits/main/libcxx",
+ "https://photos.google.com/share/AF1QipNi8VN2pw2Ya_xCV8eFgzEZmiXDy1-GwhXbqFtvXoH3HypF10as9puV8FdoVZpOZA?key=WkZjQTIxQTM5a01oZkNUYTE2ZllKTVJKZk1CMTR3",
+ "https://mag.auto.ru/article/ladasolaris/?from=mag_web_block&utm_campaign=ladasolaris&utm_content=populyarnoe&utm_source=mag_web_block&utm_medium=cpc",
+ "https://yabs.yandex.ru/count/WZ4ejI_zODW1FH40j1nmE7kaiN1MJWK0s08GWY0nuM6EO000000useqKG0H846344d30nU21pYI00GcG0VpCzQaucBW1mCNWWOR1co7e0QG2y0A-meRx0v03yCE0RzqD-0Jozzu1Y0Nozzu1a0Nozzu1m0Nvks_D2-U0-KK5Iga7StZ0vgZ8Ao2m1u20c0ou1-u9q0SIW870W822W07u2DoYy82d4W10oQeB4EBQlCbpU0002zaBbLJ1w0lozzu1y0i6w0oNm808WWuaCaOrC30oGaKjHaCoHIqqDpWoBJX1HZajCaD3HJGtDKH3Gp7Dbvo7cB_HWagW3i24FO0Gm-3DmA8G0w7W4e606EaIQREKnW19jq8oc1C8g1E2WkxBsSo3dXRW4_BttW6W5FBttW6e5FBttW7G5EUCwadO583Nge06w1IC0j0LWDUgW0RO5S6AzkoZZxpyO_2G5i41e1RGWVs31iaMWHUe5md05xGIs1V0X3sm68AikOG6q1WG-1ZKfU3zXERBdee1W1cG6G6W6Qe3i1cu6T8P4dbXOdDVSsLoTcLoBt8rCZGjCkWPWC83y1c0mWE16l__eqZnoGHLc1hyy8W2i1hotyIEmftqxKJr6W40000R02tR-6NBeHRAQn6iQSWtbmq2DFDwGK8a9Muu2wvTOhNWE4vj3Fy-LbbUXS4Y0NLWZWE9MX1ZSNyOanc8rS-k0u1LdAMo0O-fHvO33T0LBwYZBzcF7h0qb3q0~1",
+ "https://yandex.ru/pogoda/nowcast/?utm_campaign=alert&utm_content=alert_separate&utm_medium=web&utm_source=home&utm_term=nowcast&morda_geolocation=1",
+ "https://an.yandex.ru/count/WluejI_zOE02fHS052S_YplH_yoFzGK0u0CGWY0nncwAO000000uYgLImfs4aOKAW06Suha2Y07-gFqAa07-vu2bpu20W0AO0VxdWALFe07Ug07Uk076nFll8S010jW1w9_VcG7W0Soxo1te0Ue40Q02vAG1y0Aasfk_0SOA-0IzX9W1Y0MzX9W1a0Nav9y1e0MCiIwe1ShJ9h05ojCck0NkpoZ9qoOhae0mkgnJ5wa7cB8OEOwtEX-m1u20a2Iu1u05ibB92YPyFmDpJFK_D7ddf9Yo002blSLG6C7e2xs4c07m2mc83BIDthu1gGpzGrRLHHZfF-WCbmBW3OA0mC60288E93OtGqOmC4L4BJ4sDaCjD4P5H2r1EJX5BKKuCZCvGZX4DqGtgwI2XAENwwcOvUBgu_6jdH_P3u0Gz8sO7OWGpz-nX0e1eU0HoV740-WHhS3enS2ScyBwO3_lGEaI3kz-5M87zWKoc1C4g1Eli-_mek7ml1RW4-xFA8WKqRAuZvJoWDi8e1JkpoYe5EJadudj_uC6w1I40iWLhPEErFa4q1MEz9w41jWLmOhsxAEFlFnZyA0Mq87zWmR95j0Mj8tUlW615vWNWfsO8wWN2RWN0S0NjHBO5y24FUWN0PaOe1WBi1ZIlwc41hWO0j0O4FWOeQI_pOMKhhOPW1c96MWs1G000000a1a1e1cg0x0Pk1d_0T8P4dbXOdDVSsLoTcLoBt8rCZCjCkWPWC83y1c0mWE16l__fs_4tYo3a1g0W06O6l70j06m6hkouUoAwFByhm6u6W7r6W4000226nmnDZ4vDZWrC3OnBZasE30pBZWrDJSmBZOuDJSt9I1HX8PYb34CCgLmxEvDy50h8PoKR4c2QKHEe8YaRoCEXF5m_E1rNC99Bpm6wIRbNGAU6GiQ3qguwKm8cq7la7pmUZxo079vS26KW_bTaCPY9QnjKrUHau50oq6yQCyn5TW0kBtjBWG2fE3FRk2DVwuirPJ_lvLb3GlkDKAujdLv~1?stat-id=1",
+ "https://rasp.yandex.ru/?utm_source=yamain&utm_medium=geoblock&utm_campaign=main",
+ "https://dsp-rambler.ru/click?url=hs31lTJpNQdz5IfhdiQZitCij144sWuuLcC3jopSDRPYYlPZNuD1x6OEJg74u0nbP0J0oXWXw7awYsCTgG7Cr*koGqxBBhOnzB5aZ6aLfCVVGlIuP3L194y025VLJAsK04sZwSvJbxEYfnbOWKPgQHOf7c8Gqkope95-kr--aTxpsg18jiwVdEPBENe3F2Iahm3yL*3vvzt-t7Vq6bANhL3DiJglfLw2WVd4tAhoAepFV*QybPJFoGHbdOWGMTyzpWMxWPLKb7MRFQWj1K*58qPyfNkfyQ72vvzjviirTM57xng8Cxbi08-HzM5x7imDOYIY8EvXOqfU3Q0KWsyO1RC1yHHVinVxthuIb16zLjg2aoYFpz-OLiTYhlmk1vyK9t9fLz8*Oiez9i*7TqIwcWZyX5gUFuOOJ4sTWbeQLe6IQEShvKIj7v9yBZRLkRmdHLfrREuTNpBMBtRG80MIxwXyt6SjjFOhSKtK-yDA19Wawzgw9fNOy9DW0TDAehQkPUTz*5-htmszyUqJWk5ovqoyHV3acnOI2-klqMCMfU9w3*GOYS0CuNTrggGCQH376EaeQthtwiUcabSarBEocGEsW7n27kIsrtYh2-SZwPvKC1Ek3dg35nuEO-MWNPMmqJvAhBGXHF*EQkcB3eLUEluJmwTxqPRv00M4PNgdYsYsgKYPU6MMJTxbH7fA*Q2vA7WErGONTSzhSOeNLPP4vR9WalRvzllDB3XH4bNx6Pleb3ZfWmoYTNN0Lux3-VxOSjIvDDGzMirfOVPKZB4qVQWsP4WHCrRgsijW43cKGhcQ0dPORYO5v0xhzMjoZ0qDEsaiXBkfRXccnQ3QGaXk2PC1vu*Zlj0qgJfO5i8e66z4HEiMWRF4JH8ZsbZqrUzKXX-WpPQuVA0MOslhzq8m3KS3UIEurWCn80utY4AWCuHzGooJbb*PcHhYMqIBbLcJW76RK9HQjI8Bxiu4C9wECXeWIqeVuHzbmb7PGizIDVwg-g8I-zrBgd8OH3kWtC09YwSB-F9wOHrrcBG*Sv6fdnwubW1ndv0V1jsok2DhMT9IFBOoa-brtWYIdttkRs2J4m*Ai5IgVOagS2wyboiHKptd7aQ7j1YnJENZbFfs2nSwvPTvvSA8w-vCJHo-xEW6tPWaAOrVVRZscjNb4HovTUKBhxrCm8cZYw0ZahlRWGDpB0QkiI-xSv7YdYzoAQMvEOF8h*MKTn1Had8cI2FJ3WtcaT3siShD*APePK6dwqGsNJRz3lfbeX*hykpwK8kTumfS6z51bv06bjahc*fo1vjNt8ivt2BJPWqnGkYH9-8r76iBia7d1zKKnzfqk-mu3m9eP0kiKkoqMKaugV2muZlt5h4ps29ikmTIc-8vYtwHOdLDay7PUhTC4tKepBVRZh6nXrIa5POmkVNV7hVfeoMXqlid2B-2LM3CWCo*W2r23aefVN8mi3t-dnWNUlVgurAc*674C76Py8Qdr0*EJqmYjhrQw6jbm*nB3O-kd1aYqWXWC3Msg1a5r9sRu1WVLKzmwwzjPX6b44R5ULVhu1OqH6*O7hFIbN584lUhWM6g1nWFqhwhN3**Bam802sRvZjguTILo*UAH7WW1DRRG5MsTs-ZP3tOFMkQKWuJ3LRLDXtnkyN25S0LYEmH8R0vTstUmFwafeJSmm90Iuseu9DKArqrb1Wn2cZv2zgAEYy66U7kkBTQSC76WlwBJTVpoK6MUohrEll23wivbnhNaGzaSe6kWRq1ItpFkqv9gXkCAAAAuty8CgAAAAA",
+ "https://news.rambler.ru/moscow_city/46342947-pogoda-v-moskve-sinoptiki-poobeschali-moskvicham-silnyy-liven-blizhayshey-nochyu/?utm_source=head&utm_campaign=self_promo&utm_medium=news&utm_content=news",
+};
+
+// Benchmark: parses every entry of URLS with FeaturesAll on each iteration.
+// Any result other than ParsedOK fails the Y_VERIFY check.
+Y_CPU_BENCHMARK(Parsing, iface) {
+    for (size_t i = 0; i < iface.Iterations(); ++i) {
+        for (auto&& url : URLS) {
+            NUri::TUri uri;
+            auto parseResult = uri.Parse(url, uri.FeaturesAll);
+            Y_DO_NOT_OPTIMIZE_AWAY(parseResult);
+            // "%d" consumes an int, so the state is passed as int (it used to be cast to ui32)
+            Y_VERIFY(parseResult == NUri::TState::ParsedOK, "cannot parse %s: %d", url.c_str(), static_cast<int>(parseResult));
+        }
+    }
+}
+
+// Benchmark: like Parsing, but additionally copies each successfully
+// parsed TUri so that the copy cost is included in the measurement.
+Y_CPU_BENCHMARK(ParsingAndCopying, iface) {
+    for (size_t i = 0; i < iface.Iterations(); ++i) {
+        for (auto&& url : URLS) {
+            NUri::TUri uri;
+            auto parseResult = uri.Parse(url, uri.FeaturesAll);
+            // "%d" consumes an int, so the state is passed as int (it used to be cast to ui32)
+            Y_VERIFY(parseResult == NUri::TState::ParsedOK, "cannot parse %s: %d", url.c_str(), static_cast<int>(parseResult));
+            auto copy = uri;
+            Y_DO_NOT_OPTIMIZE_AWAY(copy);
+        }
+    }
+}
diff --git a/library/cpp/uri/benchmark/ya.make b/library/cpp/uri/benchmark/ya.make
new file mode 100644
index 00000000000..77ea238de71
--- /dev/null
+++ b/library/cpp/uri/benchmark/ya.make
@@ -0,0 +1,17 @@
+Y_BENCHMARK()
+
+OWNER(
+ svshevtsov
+ g:base
+)
+
+PEERDIR(
+ library/cpp/testing/benchmark
+ library/cpp/uri
+)
+
+SRCS(
+ main.cpp
+)
+
+END()
diff --git a/library/cpp/uri/common.cpp b/library/cpp/uri/common.cpp
new file mode 100644
index 00000000000..05af1e57d18
--- /dev/null
+++ b/library/cpp/uri/common.cpp
@@ -0,0 +1,115 @@
+#include "common.h"
+
+#include <util/generic/map.h>
+#include <util/generic/singleton.h>
+
+namespace NUri {
+ static_assert(TFeature::FeatureMAX <= sizeof(unsigned long) * 8, "expect TFeature::FeatureMAX <= sizeof(unsigned long) * 8"); // every feature bit must fit into an unsigned long. NOTE(review): the flag enums in common.h (EPrivate, EPublic) use ui32 as underlying type, which may be narrower than unsigned long — confirm this is the intended bound
+
+ const TSchemeInfo TSchemeInfo::Registry[] = { // entries are listed in TScheme::EKind order; Get() below indexes this array by kind
+ TSchemeInfo(TScheme::SchemeEmpty, TStringBuf()), // scheme is empty and inited
+ TSchemeInfo(TScheme::SchemeHTTP, TStringBuf("http"), TField::FlagHost | TField::FlagPath, 80), // presumably: kind, name, field mask, default port — confirm against the TSchemeInfo constructor
+ TSchemeInfo(TScheme::SchemeHTTPS, TStringBuf("https"), TField::FlagHost | TField::FlagPath, 443),
+ TSchemeInfo(TScheme::SchemeFTP, TStringBuf("ftp"), TField::FlagHost | TField::FlagPath, 20), // NOTE(review): 21 is the usual FTP control port, 20 the data port — confirm 20 is intended
+ TSchemeInfo(TScheme::SchemeFILE, TStringBuf("file"), TField::FlagPath),
+ TSchemeInfo(TScheme::SchemeWS, TStringBuf("ws"), TField::FlagHost | TField::FlagPath, 80),
+ TSchemeInfo(TScheme::SchemeWSS, TStringBuf("wss"), TField::FlagHost | TField::FlagPath, 443),
+ // add above
+ TSchemeInfo(TScheme::SchemeUnknown, TStringBuf()) // scheme is empty and uninited
+ };
+
+ namespace {
+     // Orders string buffers without regard to letter case.
+     struct TLessNoCase {
+         bool operator()(const TStringBuf& lt, const TStringBuf& rt) const {
+             return CompareNoCase(lt, rt) < 0;
+         }
+     };
+
+     // Case-insensitive lookup table from scheme name to scheme kind,
+     // built once from the registry entries below SchemeUnknown.
+     class TSchemeInfoMap {
+         using TdMap = TMap<TStringBuf, TScheme::EKind, TLessNoCase>;
+         TdMap Map_;
+
+     public:
+         TSchemeInfoMap() {
+             int kind = TScheme::SchemeEmpty;
+             while (kind < TScheme::SchemeUnknown) {
+                 const TSchemeInfo& entry = TSchemeInfo::Get(TScheme::EKind(kind));
+                 Map_.insert(TdMap::value_type(entry.Str, entry.Kind));
+                 ++kind;
+             }
+         }
+
+         // Returns the kind registered under scheme, or SchemeUnknown.
+         TScheme::EKind Get(const TStringBuf& scheme) const {
+             const TdMap::const_iterator found = Map_.find(scheme);
+             if (found == Map_.end())
+                 return TScheme::SchemeUnknown;
+             return found->second;
+         }
+
+         static const TSchemeInfoMap& Instance() {
+             return *Singleton<TSchemeInfoMap>();
+         }
+     };
+
+ }
+
+ // Looks up the registry entry for a scheme name (case-insensitive);
+ // names that are not registered map to the SchemeUnknown entry.
+ const TSchemeInfo& TSchemeInfo::Get(const TStringBuf& scheme) {
+     const TScheme::EKind kind = TSchemeInfoMap::Instance().Get(scheme);
+     return Registry[kind];
+ }
+
+ // Returns the enumerator name of a parse state. ParsedOpaque has the
+ // same value as ParsedRootless and is reported under the latter name;
+ // any value without a case yields "Parsed[Unknown]".
+ const char* ParsedStateToString(const TState::EParsed& t) {
+     switch (t) {
+         case TState::ParsedOK: return "ParsedOK";
+         case TState::ParsedEmpty: return "ParsedEmpty";
+         case TState::ParsedRootless: return "ParsedRootless";
+         case TState::ParsedBadFormat: return "ParsedBadFormat";
+         case TState::ParsedBadPath: return "ParsedBadPath";
+         case TState::ParsedTooLong: return "ParsedTooLong";
+         case TState::ParsedBadPort: return "ParsedBadPort";
+         case TState::ParsedBadAuth: return "ParsedBadAuth";
+         case TState::ParsedBadScheme: return "ParsedBadScheme";
+         case TState::ParsedBadHost: return "ParsedBadHost";
+         default: break;
+     }
+     return "Parsed[Unknown]";
+ }
+
+ // Returns a lowercase human-readable name for a URI field; any value
+ // without a case yields "Field[Unknown]".
+ const char* FieldToString(const TField::EField& t) {
+     switch (t) {
+         case TField::FieldScheme: return "scheme";
+         case TField::FieldUser: return "username";
+         case TField::FieldPass: return "password";
+         case TField::FieldHost: return "host";
+         case TField::FieldHostAscii: return "hostascii";
+         case TField::FieldPort: return "port";
+         case TField::FieldPath: return "path";
+         case TField::FieldQuery: return "query";
+         case TField::FieldFrag: return "fragment";
+         default: break;
+     }
+     return "Field[Unknown]";
+ }
+
+ // Returns the registered name of a scheme kind; kinds whose registry
+ // name is empty are reported as "empty" (SchemeEmpty) or "unknown".
+ const char* SchemeKindToString(const TScheme::EKind& t) {
+     const TSchemeInfo& entry = TSchemeInfo::Get(t);
+     if (entry.Str.empty())
+         return TScheme::SchemeEmpty == t ? "empty" : "unknown";
+     return entry.Str.data();
+ }
+
+}
diff --git a/library/cpp/uri/common.h b/library/cpp/uri/common.h
new file mode 100644
index 00000000000..80253577635
--- /dev/null
+++ b/library/cpp/uri/common.h
@@ -0,0 +1,511 @@
+#pragma once
+
+#include <util/stream/output.h>
+#include <util/system/compat.h>
+#include <util/generic/strbuf.h>
+
+namespace NUri {
+ namespace NEncode {
+ class TEncoder;
+ class TEncodeMapperBase;
+ struct TCharFlags;
+ }
+
+ namespace NParse {
+ class TRange;
+ }
+
+ class TParser;
+
+ struct TField { // field identifiers (EField) and the matching bit masks (EFlags) for the parts of a URI
+#define FIELD_NAME(f) Field##f
+#define FIELD_FLAG(f) Flag##f = 1U << FIELD_NAME(f)
+
+ enum EField { // enumerators before FieldUrlMAX are the fields of the URL proper, numbered from 0
+ FIELD_NAME(Scheme),
+ FIELD_NAME(User),
+ FIELD_NAME(Pass),
+ FIELD_NAME(Host),
+ FIELD_NAME(Port),
+ FIELD_NAME(Path),
+ FIELD_NAME(Query),
+ FIELD_NAME(Frag),
+
+ // add fields above
+ FieldUrlMAX,
+ // reset count so actual field offsets are not interrupted
+ FieldUrlLast = FieldUrlMAX - 1,
+ // add extra fields below
+
+ FIELD_NAME(HostAscii), // follows FieldUrlLast, so its value equals FieldUrlMAX
+
+ // add extra fields above
+ FieldAllMAX,
+ // add aliases below
+
+ FieldUsername = FieldUser,
+ FieldPassword = FieldPass,
+ FieldFragment = FieldFrag,
+ };
+
+ enum EFlags { // Flag<X> is the single bit 1U << Field<X>
+ FIELD_FLAG(Scheme),
+ FIELD_FLAG(User),
+ FIELD_FLAG(Pass),
+ FIELD_FLAG(Host),
+ FIELD_FLAG(Port),
+ FIELD_FLAG(Path),
+ FIELD_FLAG(Query),
+ FIELD_FLAG(Frag),
+ FIELD_FLAG(UrlMAX),
+ FIELD_FLAG(HostAscii),
+ FIELD_FLAG(AllMAX),
+
+ FlagHostPort = FlagHost | FlagPort,
+ FlagAuth = FlagUser | FlagPass,
+ FlagFragment = FlagFrag,
+ FlagAction = FlagScheme | FlagHostPort | FlagPath,
+ FlagNoFrag = FlagAction | FlagQuery,
+ FlagUrlFields = FlagUrlMAX - 1, // all bits below FlagUrlMAX
+ FlagAll = FlagUrlFields, // obsolete, for backwards compatibility
+ FlagAllFields = FlagAllMAX - 1 // all bits below FlagAllMAX
+ };
+
+#undef FIELD_NAME
+#undef FIELD_FLAG
+ };
+
+ struct TState { // result codes of URI parsing
+ enum EParsed {
+ ParsedOK = 0,
+ ParsedEmpty = 1,
+ ParsedOpaque = 2,
+ ParsedRootless = ParsedOpaque, // alias: same value as ParsedOpaque
+ ParsedBadFormat, // must follow all non-error states immediately
+ ParsedBadPath,
+ ParsedTooLong,
+ ParsedBadPort,
+ ParsedBadAuth,
+ ParsedBadScheme,
+ ParsedBadHost,
+
+ // add before this line
+ ParsedMAX // number of distinct state values; not a state itself
+ };
+ };
+
+ struct TScheme { // enumeration of the URI schemes known to the library
+ // don't forget to define a SchemeRegistry entry
+ enum EKind {
+ SchemeEmpty // first enumerator, value 0
+ // add schemes below this line
+ ,
+ SchemeHTTP,
+ SchemeHTTPS,
+ SchemeFTP,
+ SchemeFILE,
+ SchemeWS,
+ SchemeWSS
+ // add schemes above this line
+ ,
+ SchemeUnknown // must stay last
+ };
+ };
+
+ class TFeature {
+ friend class NEncode::TEncoder;
+ friend class NEncode::TEncodeMapperBase;
+ friend struct NEncode::TCharFlags;
+ friend class TParser;
+ friend class NParse::TRange;
+
+#define FEATURE_NAME(f) _BitFeature##f
+#define FEATURE_FLAG_NAME(f) Feature##f
+#define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1UL << FEATURE_NAME(f)
+
+ protected:
+ enum EBit {
+ //==============================
+ // Cases interpreted as errors:
+ //==============================
+
+ // allows authorization user/password in URL
+ FEATURE_NAME(AuthSupported),
+
+ // allows all known schemes in URL
+ FEATURE_NAME(SchemeKnown),
+
+ // allows all schemes, not only known
+ FEATURE_NAME(SchemeFlexible),
+
+ // allow opaque (RFC 2396) or rootless (RFC 3986) urls
+ FEATURE_NAME(AllowRootless),
+
+ //==============================
+ // Cases interpreted for processing (if required):
+ // (effects on result of Parse method)
+ //==============================
+
+ // path needs normalization
+ // (simplification of directory tree: /../, /./, etc.
+ FEATURE_NAME(PathOperation),
+
+ // don't force empty path to "/"
+ FEATURE_NAME(AllowEmptyPath),
+
+ // in scheme and host segments:
+ // change upper case letters onto lower case ones
+ FEATURE_NAME(ToLower),
+
+ // decode unreserved symbols
+ FEATURE_NAME(DecodeUnreserved),
+
+ // legacy: decode standard symbols which may be safe for some fields
+ FEATURE_NAME(DecodeStandardExtra),
+
+ // decode symbols allowed (not necessarily safe to decode) only for a given field
+ // (do not use directly, instead use FeatureDecodeSafe mask below)
+ FEATURE_NAME(DecodeFieldAllowed),
+
+ // handling of spaces
+ FEATURE_NAME(EncodeSpace),
+
+ // in query segment: change escaped space to '+'
+ FEATURE_NAME(EncodeSpaceAsPlus),
+
+ // escape all string 'markup' symbols
+ FEATURE_NAME(EncodeForSQL),
+
+ // encoding of extended ascii symbols (8-bit)
+ FEATURE_NAME(EncodeExtendedASCII),
+
+ // decoding of extended ascii symbols (8-bit)
+ FEATURE_NAME(DecodeExtendedASCII),
+
+ // encoding of extended delimiter set
+ FEATURE_NAME(EncodeExtendedDelim),
+
+ // decoding of extended delimiter set
+ FEATURE_NAME(DecodeExtendedDelim),
+
+ // control characters [0x00 .. 0x20)
+ FEATURE_NAME(EncodeCntrl),
+
+ // raw percent character
+ FEATURE_NAME(EncodePercent),
+
+ // hash fragments
+ // https://developers.google.com/webmasters/ajax-crawling/docs/specification
+ // move and encode #! fragments to the query
+ FEATURE_NAME(HashBangToEscapedFragment),
+ // move and decode _escaped_fragment_ to the fragment
+ FEATURE_NAME(EscapedToHashBangFragment),
+
+ // reject absolute paths started by "/../"
+ FEATURE_NAME(PathDenyRootParent),
+
+ // paths started by "/../" - ignore head
+ FEATURE_NAME(PathStripRootParent),
+
+ // tries to fix errors (in particular, in fragment)
+ FEATURE_NAME(TryToFix),
+
+ // check host for DNS compliance
+ FEATURE_NAME(CheckHost),
+
+ // allow IDN hosts
+ // host is converted to punycode and stored in FieldHostAscii
+ // @note host contains characters in the charset of the document
+ // and percent-encoded characters in UTF-8 (RFC 3986, 3.2.2)
+ // @note if host contains no extended-ASCII characters and after
+ // percent-decoding cannot be converted from UTF-8 to UCS-4,
+ // try to recode from the document charset (if not UTF-8)
+ FEATURE_NAME(AllowHostIDN),
+
+ // forces AllowHostIDN, but host is replaced with punycode
+ // forces CheckHost since this replacement is irreversible
+ FEATURE_NAME(ConvertHostIDN),
+
+ // robot interpreted network paths as BadFormat urls
+ FEATURE_NAME(DenyNetworkPath),
+
+ // robot interprets URLs without a host as BadFormat
+ FEATURE_NAME(RemoteOnly),
+
+ /* non-RFC use case:
+ * 1. do not allow relative-path-only URIs when they can conflict with
+ * "host/path" (that is, only "./path" or "../path" are allowed);
+ * 2. if neither scheme nor userinfo are present but port is, it must
+ * be non-empty, to avoid conflict with "scheme:/...";
+ * 3. if AllowRootless is not specified, rootless (or opaque) URIs are
+ * not recognized;
+ * 4. if AllowRootless is specified, disallow userinfo, preferring
+ * "scheme:pa@th" over "user:pass@host", and even "host:port" when
+ * host contains only scheme-legal characters.
+ */
+ FEATURE_NAME(NoRelPath),
+
+ // standard prefers that all hex escapes were using uppercase A-F
+ FEATURE_NAME(UpperEncoded),
+
+ // internal usage: decode all encoded symbols
+ FEATURE_NAME(DecodeANY),
+
+ // add before this line
+ _FeatureMAX
+ };
+
+ protected:
+ enum EPrivate : ui32 {
+ FEATURE_FLAG(DecodeANY),
+ FEATURE_FLAG(DecodeFieldAllowed),
+ FEATURE_FLAG(DecodeStandardExtra),
+ };
+
+ public:
+ enum EPublic : ui32 {
+ FeatureMAX = _FeatureMAX,
+ FEATURE_FLAG(AuthSupported),
+ FEATURE_FLAG(SchemeKnown),
+ FEATURE_FLAG(SchemeFlexible),
+ FEATURE_FLAG(AllowRootless),
+ FEATURE_FLAG_NAME(AllowOpaque) = FEATURE_FLAG_NAME(AllowRootless),
+ FEATURE_FLAG(PathOperation),
+ FEATURE_FLAG(AllowEmptyPath),
+ FEATURE_FLAG(ToLower),
+ FEATURE_FLAG(DecodeUnreserved),
+ FEATURE_FLAG(EncodeSpace),
+ FEATURE_FLAG(EncodeSpaceAsPlus),
+ FEATURE_FLAG(EncodeForSQL),
+ FEATURE_FLAG(EncodeExtendedASCII),
+ FEATURE_FLAG(DecodeExtendedASCII),
+ FEATURE_FLAG(EncodeExtendedDelim),
+ FEATURE_FLAG(DecodeExtendedDelim),
+ FEATURE_FLAG(EncodeCntrl),
+ FEATURE_FLAG(EncodePercent),
+ FEATURE_FLAG(HashBangToEscapedFragment),
+ FEATURE_FLAG(EscapedToHashBangFragment),
+ FEATURE_FLAG(PathDenyRootParent),
+ FEATURE_FLAG(PathStripRootParent),
+ FEATURE_FLAG(TryToFix),
+ FEATURE_FLAG(CheckHost),
+ FEATURE_FLAG(AllowHostIDN),
+ FEATURE_FLAG(ConvertHostIDN),
+ FEATURE_FLAG(DenyNetworkPath),
+ FEATURE_FLAG(RemoteOnly),
+ FEATURE_FLAG(NoRelPath),
+ FEATURE_FLAG_NAME(HierURI) = FEATURE_FLAG_NAME(NoRelPath),
+ FEATURE_FLAG(UpperEncoded),
+ };
+
+#undef FEATURE_NAME
+#undef FEATURE_FLAG
+
+ public:
+ //==============================
+ enum ESets { // named unions of the individual feature flags
+ // these are guaranteed and will change buffer size
+
+ FeatureDecodeStandard = 0 | FeatureDecodeUnreserved | FeatureDecodeStandardExtra,
+
+ FeaturesDecodeExtended = 0 | FeatureDecodeExtendedASCII | FeatureDecodeExtendedDelim,
+
+ FeaturesDecode = 0 | FeatureDecodeUnreserved | FeatureDecodeStandard | FeaturesDecodeExtended, // FeatureDecodeUnreserved is also contained in FeatureDecodeStandard
+
+ FeaturesEncodeExtended = 0 | FeatureEncodeExtendedASCII | FeatureEncodeExtendedDelim,
+
+ FeaturesEncode = 0 | FeatureEncodeForSQL | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent | FeaturesEncodeExtended,
+
+ // these are not guaranteed to apply to a given field
+
+ FeatureDecodeAllowed = 0 | FeatureDecodeUnreserved | FeatureDecodeFieldAllowed,
+
+ FeaturesMaybeDecode = 0 | FeaturesDecode | FeatureDecodeAllowed,
+
+ FeaturesMaybeEncode = 0 | FeaturesEncode, // currently identical to FeaturesEncode
+
+ FeaturesEncodeDecode = 0 | FeaturesMaybeEncode | FeaturesMaybeDecode,
+
+ FeaturesAllEncoder = 0 | FeaturesEncodeDecode | FeatureDecodeANY | FeatureToLower | FeatureUpperEncoded | FeatureEncodeSpaceAsPlus,
+
+ //==============================
+ FeaturesNormalizeSet = 0 | FeaturePathOperation | FeatureToLower | FeatureDecodeAllowed | FeatureEncodeSpaceAsPlus | FeatureEncodeForSQL | FeaturePathStripRootParent | FeatureTryToFix | FeatureUpperEncoded,
+
+ FeaturesDefault = 0 // it reproduces old parsedURL
+ | FeaturePathOperation | FeaturePathDenyRootParent | FeatureCheckHost,
+
+ // essentially allows all valid RFC urls and keeps them as-is
+ FeaturesBare = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureAllowEmptyPath,
+
+ FeaturesAll = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureCheckHost | FeaturesNormalizeSet, // despite the name this is not every flag: only the listed ones plus FeaturesNormalizeSet
+
+ // Deprecated, use FeaturesRecommended
+ FeaturesRobotOld = 0
+ // http://tools.ietf.org/html/rfc3986#section-6.2.2
+ | FeatureToLower // 6.2.2.1
+ | FeatureUpperEncoded // 6.2.2.1
+ | FeatureDecodeUnreserved // 6.2.2.2
+ | FeaturePathOperation // 6.2.2.3
+ | FeaturePathDenyRootParent | FeatureSchemeKnown | FeatureConvertHostIDN | FeatureRemoteOnly | FeatureHashBangToEscapedFragment | FeatureCheckHost,
+
+ // these are mutually exclusive
+ FeaturesPath = 0 | FeaturePathDenyRootParent | FeaturePathStripRootParent,
+
+ FeaturesEscapedFragment = 0 | FeatureEscapedToHashBangFragment | FeatureHashBangToEscapedFragment,
+
+ FeaturesCheckSpecialChar = 0 | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent,
+
+ FeaturesEncodePChar = 0 | FeatureUpperEncoded | FeaturesEncodeDecode | FeaturesCheckSpecialChar,
+
+ // http://wiki.yandex-team.ru/robot/newDesign/dups/normolization
+ FeaturesRecommended = 0 | FeatureSchemeKnown | FeatureRemoteOnly | FeatureToLower | FeatureCheckHost | FeatureConvertHostIDN | FeatureHashBangToEscapedFragment | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodeExtendedASCII | FeatureUpperEncoded | FeatureDecodeUnreserved | FeaturePathOperation | FeaturePathStripRootParent,
+
+ FeaturesRobot = FeaturesRecommended // alias of FeaturesRecommended
+ };
+ };
+
+ // Compares at most len characters of lt and rt with the global ::strnicmp.
+ // Identical pointers are reported equal without touching the data.
+ static inline int strnicmp(const char* lt, const char* rt, size_t len) {
+     if (lt == rt) {
+         return 0;
+     }
+     return ::strnicmp(lt, rt, len);
+ }
+
+ // Case-insensitively checks whether lt starts with rt.
+ // Returns 0 when the first rt.length() characters of lt match rt, otherwise
+ // the non-zero strnicmp() result for the differing prefix. When lt is shorter
+ // than rt and matches it so far, lt cannot contain rt as a prefix and a
+ // negative value is returned.
+ static inline int CompareNoCasePrefix(const TStringBuf& lt, const TStringBuf& rt) {
+     const size_t ltLen = lt.length();
+     const size_t rtLen = rt.length();
+     // A TStringBuf is not guaranteed to be NUL-terminated, so the comparison
+     // must never look past the end of the shorter operand.
+     const int res = strnicmp(lt.data(), rt.data(), ltLen < rtLen ? ltLen : rtLen);
+     if (0 != res || ltLen >= rtLen)
+         return res;
+     // lt is a proper prefix of rt: it sorts before rt
+     return -1;
+ }
+
+ // True when both views have the same length and match ignoring case.
+ static inline bool EqualNoCase(const TStringBuf& lt, const TStringBuf& rt) {
+     if (lt.length() != rt.length()) {
+         return false;
+     }
+     return CompareNoCasePrefix(lt, rt) == 0;
+ }
+
+ // Three-way comparison that orders by length first: a shorter view always
+ // compares less than a longer one; equal-length views are compared
+ // case-insensitively.
+ static inline int CompareNoCase(const TStringBuf& lt, const TStringBuf& rt) {
+     const size_t ltLen = lt.length();
+     const size_t rtLen = rt.length();
+     if (ltLen < rtLen) {
+         return -1;
+     }
+     if (ltLen > rtLen) {
+         return 1;
+     }
+     return CompareNoCasePrefix(lt, rt);
+ }
+
+ class TSchemeInfo { // immutable record describing one URI scheme, plus static lookups into a registry of such records
+ public:
+ const TScheme::EKind Kind;
+ const ui16 Port; // returned by GetDefaultPort(); 0 unless given to the constructor
+ const TStringBuf Str; // scheme name; returned by GetCanon() and compared by Matches()
+ const ui32 FldReq; // NOTE(review): presumably a mask of fields required for the scheme - confirm against the Registry definition
+ TSchemeInfo(TScheme::EKind kind, TStringBuf str, ui32 fldReq = 0, ui16 port = 0)
+ : Kind(kind)
+ , Port(port)
+ , Str(str)
+ , FldReq(fldReq)
+ {
+ }
+ bool Matches(const TStringBuf& scheme) const { // true when scheme equals Str ignoring case
+ return EqualNoCase(scheme, Str);
+ }
+
+ public:
+ static const TSchemeInfo& Get(const TStringBuf& scheme); // lookup by name; defined out of line
+ static const TSchemeInfo& Get(TScheme::EKind scheme) { // lookup by kind
+ return Registry[scheme]; // the kind is used directly as the array index; no bounds check is done here
+ }
+ static TScheme::EKind GetKind(const TStringBuf& scheme) {
+ return Get(scheme).Kind;
+ }
+ static TStringBuf GetCanon(TScheme::EKind scheme) {
+ return Get(scheme).Str;
+ }
+ static ui16 GetDefaultPort(TScheme::EKind scheme) {
+ return Get(scheme).Port;
+ }
+
+ private:
+ static const TSchemeInfo Registry[]; // defined elsewhere; indexed by TScheme::EKind in Get()
+ };
+
+ // Immutable pair of feature masks. Both members are plain bit sets; the
+ // operators below combine them member-wise.
+ // NOTE(review): the exact meaning of "Allow" vs "Extra" is defined by the
+ // parser that consumes these masks - confirm there.
+ struct TParseFlags {
+     const ui64 Allow;
+     const ui64 Extra;
+     TParseFlags(ui64 allow = 0, ui64 extra = 0)
+         : Allow(allow)
+         , Extra(extra)
+     {
+     }
+     // union of the bits shared in Allow and the bits shared in Extra
+     ui64 operator&(const TParseFlags& flags) const {
+         const ui64 sharedAllow = Allow & flags.Allow;
+         const ui64 sharedExtra = Extra & flags.Extra;
+         return sharedAllow | sharedExtra;
+     }
+     // a raw mask is tested against Allow only
+     ui64 operator&(ui64 flags) const {
+         return flags & Allow;
+     }
+     // member-wise union
+     TParseFlags operator|(const TParseFlags& flags) const {
+         const ui64 allow = Allow | flags.Allow;
+         const ui64 extra = Extra | flags.Extra;
+         return TParseFlags(allow, extra);
+     }
+     // copy with the given bits cleared from both masks
+     TParseFlags Exclude(ui64 flags) const {
+         const ui64 keep = ~flags;
+         return TParseFlags(Allow & keep, Extra & keep);
+     }
+ };
+
+#define FEATURE_NAME(f) _BitFeature##f
+#define FEATURE_FLAG_NAME(f) Feature##f
+#define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1UL << FEATURE_NAME(f)
+
+ struct TQueryArg { // name/value pair of string views, plus the flag and result enums used when processing such pairs
+ TStringBuf Name;
+ TStringBuf Value;
+
+ private:
+ enum EBit { // bit positions; the public flag values below are 1 << position
+ FEATURE_NAME(Filter),
+ FEATURE_NAME(SortByName),
+ FEATURE_NAME(RemoveEmptyQuery),
+ FEATURE_NAME(RewriteDirty),
+ _FeatureMAX // number of bit positions defined above
+ };
+
+ public:
+ enum EPublic : ui32 { // flag values, one bit per EBit entry
+ FeatureMAX = _FeatureMAX,
+ FEATURE_FLAG(Filter),
+ FEATURE_FLAG(SortByName),
+ FEATURE_FLAG(RemoveEmptyQuery),
+ FEATURE_FLAG(RewriteDirty),
+ };
+
+ enum EProcessed { // result codes
+ // OK and clean.
+ ProcessedOK = 0,
+
+ // OK, but query stored in internal buffer and TUri::Rewrite() is required.
+ ProcessedDirty = 1,
+
+ ProcessedMalformed = 2,
+ ProcessedTooMany = 3,
+ };
+ };
+
+ typedef bool (*TQueryArgFilter)(const TQueryArg& arg, void* filterData);
+
+#undef FEATURE_NAME
+#undef FEATURE_FLAG_NAME
+#undef FEATURE_FLAG
+
+ const char* FieldToString(const TField::EField& t);
+ const char* ParsedStateToString(const TState::EParsed& t);
+ const char* SchemeKindToString(const TScheme::EKind& t);
+
+}
+
+ Y_DECLARE_OUT_SPEC(inline, NUri::TField::EField, out, t) { // stream output: writes the FieldToString() text
+ out << NUri::FieldToString(t);
+ }
+
+ Y_DECLARE_OUT_SPEC(inline, NUri::TScheme::EKind, out, t) { // stream output: writes the SchemeKindToString() text
+ out << NUri::SchemeKindToString(t);
+ }
+
+ Y_DECLARE_OUT_SPEC(inline, NUri::TState::EParsed, out, t) { // stream output: writes the ParsedStateToString() text
+ out << NUri::ParsedStateToString(t);
+ }
+
+ // Global shortcut: the Port recorded in the scheme registry for this kind.
+ static inline ui16 DefaultPort(NUri::TScheme::EKind scheme) {
+     const NUri::TSchemeInfo& info = NUri::TSchemeInfo::Get(scheme);
+     return info.Port;
+ }
+
+ // Global shortcut: the Kind of the registry entry looked up by scheme name.
+ static inline NUri::TScheme::EKind SchemeKind(const TStringBuf& scheme) {
+     const NUri::TSchemeInfo& info = NUri::TSchemeInfo::Get(scheme);
+     return info.Kind;
+ }
diff --git a/library/cpp/uri/encode.cpp b/library/cpp/uri/encode.cpp
new file mode 100644
index 00000000000..9eab1535bc6
--- /dev/null
+++ b/library/cpp/uri/encode.cpp
@@ -0,0 +1,221 @@
+#include "encode.h"
+
+#include <util/string/cast.h>
+#include <util/generic/singleton.h>
+
+namespace NUri {
+ namespace NEncode {
+// http://tools.ietf.org/html/rfc3986#section-2.2
+#define GENDELIMS0 ":/?#[]@"
+#define SUBDELIMS0 "!$&'()*+,;="
+// http://tools.ietf.org/html/rfc3986#section-2.3
+#define UNRESERVED "-._~"
+
+// now find subsets which can sometimes be decoded
+
+// remove '#' which can't ever be decoded
+// don't mark anything allowed for pass (pass is completely encoded)
+// safe in path, qry, frag
+#define GENDELIMS1 ":@"
+// allowed in qry, frag
+#define GENDELIMS2 "/?"
+
+// qry-unsafe chars
+#define SUBDELIMS1 "&+=;"
+// rest allowed in qry, frag
+#define SUBDELIMS2 "!$'()*,"
+
+ // Returns the shared character grammar obtained through Singleton<>.
+ const TEncoder::TGrammar& TEncoder::Grammar() {
+     const TEncoder::TGrammar* const grammar = Singleton<TEncoder::TGrammar>();
+     return *grammar;
+ }
+
+ // initialize the grammar map
+ TEncoder::TGrammar::TGrammar() { // flags are OR-ed into the per-character entries, so a character may be touched by several calls below
+ // first set up unreserved characters safe in any field
+ const ui64 featUnres = TFeature::FeatureDecodeUnreserved;
+ AddRng('0', '9', ECFDigit, featUnres);
+ AddRng('A', 'Z', ECFUpper, featUnres | TFeature::FeatureToLower); // only uppercase letters carry FeatureToLower
+ AddRng('a', 'z', ECFLower, featUnres);
+ Add(UNRESERVED, ECFUnres, featUnres);
+
+ // XXX: standard "safe" set used previously "-_.!~*();/:@$,", with comment:
+ // alnum + reserved + mark + ( '[', ']') - ('=' '+' '&' '\'' '"' '\\' '?')
+ Add("!*();/:@$,", ECFStdrd, TFeature::FeatureDecodeStandardExtra);
+
+ // now field-specific subsets of reserved characters (gen-delims + sub-delims)
+ const ui64 featSafe = TFeature::FeatureDecodeFieldAllowed;
+
+ Add(GENDELIMS1, 0, featSafe, TField::FlagPath | TField::FlagQuery | TField::FlagFrag); // ":@"
+ Add(GENDELIMS2, 0, featSafe, TField::FlagQuery | TField::FlagFrag); // "/?"
+
+ Add(SUBDELIMS1, 0, featSafe, TField::FlagUser); // "&+=;" - the query-unsafe ones
+ Add(SUBDELIMS2, 0, featSafe, TField::FlagUser | TField::FlagQuery | TField::FlagFrag); // "!$'()*,"
+
+ // control chars
+ AddRng(0x00, 0x20, TFeature::FeatureEncodeCntrl); // the range is inclusive, so it also covers the space character (0x20)
+ Add(0x7f, TFeature::FeatureEncodeCntrl);
+
+ // '%' starts a percent-encoded sequence
+ Add('%', TFeature::FeatureDecodeANY | TFeature::FeatureEncodePercent);
+
+ // extended ASCII
+ AddRng(128, 255, TFeature::FeatureEncodeExtendedASCII | TFeature::FeatureDecodeExtendedASCII);
+
+ // extended delims
+ Add("\"<>[\\]^`{|}", TFeature::FeatureEncodeExtendedDelim | TFeature::FeatureDecodeExtendedDelim);
+
+ // add characters with other features
+ Add(' ', TFeature::FeatureEncodeSpace | TFeature::FeatureEncodeSpaceAsPlus);
+ Add("'\"\\", TFeature::FeatureEncodeForSQL);
+
+ GetMutable(':').EncodeFld |= TField::FlagUser; // EncodeFld marks fields in which TEncodeToMapper::Encode() always encodes the character
+ GetMutable('?').EncodeFld |= TField::FlagPath;
+ GetMutable('#').EncodeFld |= TField::FlagPath | TField::FlagQuery;
+ GetMutable('&').EncodeFld |= TField::FlagQuery;
+ GetMutable('+').EncodeFld |= TField::FlagQuery;
+ }
+
+ // Decides whether an encoded occurrence of this character should be decoded.
+ // Only the requested features that this character carries are considered:
+ // any encode feature vetoes decoding, any decode feature allows it, and
+ // otherwise decoding needs both a matching field bit and
+ // FeatureDecodeFieldAllowed in the requested flags.
+ bool TCharFlags::IsDecode(ui32 fldmask, ui64 flags) const {
+     const ui64 applicable = FeatFlags & flags;
+     if (applicable & TFeature::FeaturesEncode) {
+         return false;
+     }
+     if (applicable & TFeature::FeaturesDecode) {
+         return true;
+     }
+     if (0 == (fldmask & DecodeFld)) {
+         return false;
+     }
+     return 0 != (flags & TFeature::FeatureDecodeFieldAllowed);
+ }
+
+ const int dD = 'a' - 'A';
+
+ // Maps a plain (unescaped) input character.
+ // May rewrite ch in place (lowercasing, ' ' -> '+').
+ // Returns: negative = emit as symbol, positive = emit as hex escape,
+ // zero = symbol unless the destination mapper decides otherwise.
+ int TEncodeMapper::EncodeSym(unsigned char& ch) const {
+     // features are looked up for the character as it came in
+     const ui64 active = TEncoder::GetFlags(ch).FeatFlags & Flags;
+
+     if (active & TFeature::FeatureToLower) {
+         ch += dD;
+     }
+
+     if (Q_DecodeAny) {
+         return -1;
+     }
+
+     if (active & TFeature::FeaturesEncode) {
+         return 1;
+     }
+
+     if (Q_EncodeSpcAsPlus && ' ' == ch) {
+         ch = '+';
+     }
+     return 0;
+ }
+
+ // Maps a character that arrived as a %XX escape.
+ // May rewrite ch in place (lowercasing, ' ' -> '+').
+ // Returns: negative = emit as symbol, zero = may be emitted as symbol,
+ // positive = keep it hex-escaped.
+ int TEncodeMapper::EncodeHex(unsigned char& ch) const {
+     // the flags entry is bound to the character as it came in
+     const TCharFlags& chflags = TEncoder::GetFlags(ch);
+
+     if (Flags & chflags.FeatFlags & TFeature::FeatureToLower) {
+         ch += dD;
+     }
+
+     if (Q_DecodeAny) {
+         return -1;
+     }
+
+     if (chflags.IsDecode(FldMask, Flags)) {
+         return 0;
+     }
+
+     // an escaped space may still be turned into '+'
+     if (' ' == ch && Q_EncodeSpcAsPlus) {
+         ch = '+';
+         return 0;
+     }
+
+     return 1;
+ }
+
+ // Tells whether ch has to be escaped when written to the destination field:
+ // never in decode-any mode, otherwise when the character is flagged for this
+ // field or carries one of the requested encode features.
+ bool TEncodeToMapper::Encode(unsigned char ch) const {
+     if (Q_DecodeAny) {
+         return false;
+     }
+
+     const TCharFlags& chflags = TEncoder::GetFlags(ch);
+     const bool flaggedForField = 0 != (FldMask & chflags.EncodeFld);
+     const bool encodeFeature = 0 != (Flags & chflags.FeatFlags & TFeature::FeaturesEncode);
+     return flaggedForField || encodeFeature;
+ }
+
+ TEncoder::TEncoder(IOutputStream& out, const TEncodeMapper& fldsrc, const TEncodeToMapper& flddst) // binds the output stream and copies both mappers
+ : Out(out)
+ , FldSrc(fldsrc)
+ , FldDst(flddst)
+ , OutFlags(0) // accumulated by Do(), reset again by ReEncode()
+ , HexValue(0)
+ {
+ }
+
+ // Writes val as exactly two uppercase hexadecimal digits.
+ IOutputStream& TEncoder::Hex(IOutputStream& out, unsigned char val) {
+     static const char sHexCodes[] = "0123456789ABCDEF";
+     const char high = sHexCodes[(val >> 4) & 0xF];
+     const char low = sHexCodes[val & 0xF];
+     out << high << low;
+     return out;
+ }
+
+ // Percent-encodes every character of val, without exceptions.
+ IOutputStream& TEncoder::EncodeAll(IOutputStream& out, const TStringBuf& val) {
+     for (const char ch : val) {
+         Encode(out, ch);
+     }
+     return out;
+ }
+
+ // Copies characters classified as alphanumeric by the grammar and
+ // percent-encodes everything else.
+ IOutputStream& TEncoder::EncodeNotAlnum(IOutputStream& out, const TStringBuf& val) {
+     for (const char c : val) {
+         if (!IsAlnum(c)) {
+             Encode(out, c);
+             continue;
+         }
+         out << c;
+     }
+     return out;
+ }
+
+ // Encodes a raw value for the given field: characters the grammar allows in
+ // that field are copied, all others are percent-encoded.
+ IOutputStream& TEncoder::EncodeField(
+     IOutputStream& out, const TStringBuf& val, TField::EField fld) {
+     const ui32 fldmask = ui32(1) << fld;
+     for (const char ch : val) {
+         if (!GetFlags(ch).IsAllowed(fldmask)) {
+             Encode(out, ch);
+             continue;
+         }
+         out << ch;
+     }
+     return out;
+ }
+
+ // Encodes a raw value for the given field under explicit feature flags:
+ // a character is copied only if IsDecode() would accept it for this field
+ // and these flags; otherwise it is percent-encoded.
+ IOutputStream& TEncoder::EncodeField(
+     IOutputStream& out, const TStringBuf& val, TField::EField fld, ui64 flags) {
+     const ui32 fldmask = ui32(1) << fld;
+     for (const char ch : val) {
+         if (!GetFlags(ch).IsDecode(fldmask, flags)) {
+             Encode(out, ch);
+             continue;
+         }
+         out << ch;
+     }
+     return out;
+ }
+
+ // Emits one character according to the source mapper verdict res
+ // (negative = symbol, positive = hex, zero = destination mapper decides)
+ // and records the character's features in OutFlags.
+ void TEncoder::Do(unsigned char ch, int res) {
+     OutFlags |= GetFlags(ch).FeatFlags;
+
+     const bool dstEnabled = FldDst.Enabled();
+     bool escapepct = false;
+     if (res > 0) {
+         // definitely encode; with an active destination field the '%' itself
+         // gets escaped as well
+         escapepct = dstEnabled;
+     } else if (res < 0 || !dstEnabled || !FldDst.Encode(ch)) {
+         Out << ch;
+         return;
+     }
+
+     Out << '%';
+     if (escapepct) {
+         Out.Write("25", 2); // '%'
+     }
+     Hex(Out, ch);
+ }
+ }
+}
diff --git a/library/cpp/uri/encode.h b/library/cpp/uri/encode.h
new file mode 100644
index 00000000000..a9ece154270
--- /dev/null
+++ b/library/cpp/uri/encode.h
@@ -0,0 +1,282 @@
+#pragma once
+
+#include "common.h"
+
+#include <util/stream/output.h>
+
+namespace NUri {
+ namespace NEncode {
+#define CHAR_TYPE_NAME(f) _ECT##f
+#define CHAR_TYPE_FLAG(f) ECF##f = 1u << CHAR_TYPE_NAME(f)
+
+ enum ECharType { // bit positions of the character classes
+ CHAR_TYPE_NAME(Digit),
+ CHAR_TYPE_NAME(Lower),
+ CHAR_TYPE_NAME(Upper),
+ CHAR_TYPE_NAME(Unres),
+ CHAR_TYPE_NAME(Stdrd),
+ };
+
+ enum ECharFlag { // bit masks (1u << position) of the character classes, stored in TCharFlags::TypeFlags
+ CHAR_TYPE_FLAG(Digit),
+ CHAR_TYPE_FLAG(Lower),
+ CHAR_TYPE_FLAG(Upper),
+ CHAR_TYPE_FLAG(Unres),
+ CHAR_TYPE_FLAG(Stdrd),
+ // compound group flags
+ ECGAlpha = ECFUpper | ECFLower,
+ ECGAlnum = ECGAlpha | ECFDigit,
+ ECGUnres = ECGAlnum | ECFUnres, // each group includes the previous one
+ ECGStdrd = ECGUnres | ECFStdrd,
+ };
+
+#undef CHAR_TYPE_NAME
+#undef CHAR_TYPE_FLAG
+
+ // Per-character entry of the encoder grammar: class bits, feature bits and
+ // two field masks.
+ struct TCharFlags {
+     ui32 TypeFlags;
+     ui64 FeatFlags;
+     ui32 DecodeFld; // decode if FeatureDecodeFieldAllowed
+     ui32 EncodeFld; // encode if shouldn't be treated as delimiter
+     // feature-only entry; intentionally implicit so a bare feature mask can
+     // be passed where a TCharFlags is expected
+     TCharFlags(ui64 feat = 0)
+         : TCharFlags(0, feat)
+     {
+     }
+     TCharFlags(ui32 type, ui64 feat, ui32 decmask = 0, ui32 encmask = 0)
+         : TypeFlags(type)
+         , FeatFlags(feat)
+         , DecodeFld(decmask)
+         , EncodeFld(encmask)
+     {
+     }
+     // merges val into this entry (bitwise OR of every member)
+     TCharFlags& Add(const TCharFlags& val) {
+         EncodeFld |= val.EncodeFld;
+         DecodeFld |= val.DecodeFld;
+         FeatFlags |= val.FeatFlags;
+         TypeFlags |= val.TypeFlags;
+         return *this;
+     }
+     // may the raw character appear unescaped in the given field(s)
+     bool IsAllowed(ui32 fldmask) const {
+         if (TypeFlags & ECGUnres) {
+             return true;
+         }
+         const ui32 decodableOnly = DecodeFld & ~EncodeFld;
+         return 0 != (decodableOnly & fldmask);
+     }
+     // should we decode an encoded character
+     bool IsDecode(ui32 fldmask, ui64 flags) const;
+ };
+
+ class TEncodeMapperBase { // shared state of the two mappers: feature flags, field bit mask and the decode-any switch
+ protected:
+ TEncodeMapperBase() // "disabled" mapper: no flags and an empty field mask
+ : Flags(0)
+ , FldMask(0)
+ , Q_DecodeAny(false)
+ {
+ }
+ TEncodeMapperBase(ui64 flags, TField::EField fld)
+ : Flags(flags)
+ , FldMask(1u << fld) // the field index becomes a single bit
+ , Q_DecodeAny(flags & TFeature::FeatureDecodeANY)
+ {
+ }
+
+ protected:
+ const ui64 Flags;
+ const ui32 FldMask;
+ const bool Q_DecodeAny; // this is a special option for username/password
+ };
+
+ // maps a sym or hex character and indicates whether it has to be encoded
+ class TEncodeMapper
+ : public TEncodeMapperBase {
+ public:
+ TEncodeMapper(ui64 flags, TField::EField fld = TField::FieldAllMAX) // implicit on purpose: TEncoder::Decode() passes a bare flag mask
+ : TEncodeMapperBase(flags, fld)
+ , Q_EncodeSpcAsPlus(flags & TFeature::FeatureEncodeSpaceAsPlus)
+ {
+ }
+ // negative=sym, positive=hex, zero=maybesym
+ int EncodeSym(unsigned char&) const; // for a character that was not escaped in the input; may modify it
+ int EncodeHex(unsigned char&) const; // for a character that came from a %XX escape; may modify it
+
+ protected:
+ const bool Q_EncodeSpcAsPlus; // when set, ' ' is rewritten to '+'
+ };
+
+ // indicates whether a character has to be encoded when copying to a field
+ class TEncodeToMapper
+ : public TEncodeMapperBase {
+ public:
+ TEncodeToMapper() // disabled mapper, see Enabled()
+ : TEncodeMapperBase()
+ {
+ }
+ TEncodeToMapper(ui64 flags, TField::EField fld = TField::FieldAllMAX)
+ : TEncodeMapperBase(flags, fld)
+ {
+ }
+ bool Enabled() const { // false only for a mapper with an empty field mask (the default-constructed one)
+ return 0 != FldMask;
+ }
+ bool Encode(unsigned char) const;
+ };
+
+ class TEncoder { // re-encodes percent-encoded text into an output stream, driven by a source mapper and an optional destination mapper
+ public:
+ TEncoder(IOutputStream& out, const TEncodeMapper& fldsrc, const TEncodeToMapper& flddst = TEncodeToMapper());
+
+ ui64 ReEncode(const TStringBuf& url); // processes url and returns the feature flags of all characters seen (OutFlags)
+ ui64 ReEncode(const char* str, size_t len) {
+ return ReEncode(TStringBuf(str, len));
+ }
+
+ protected:
+ static bool IsType(unsigned char c, ui64 flags) { // true if c has at least one of the given class bits
+ return GetFlags(c).TypeFlags & flags;
+ }
+
+ public:
+ static bool IsDigit(unsigned char c) {
+ return IsType(c, ECFDigit);
+ }
+ static bool IsUpper(unsigned char c) {
+ return IsType(c, ECFUpper);
+ }
+ static bool IsLower(unsigned char c) {
+ return IsType(c, ECFLower);
+ }
+ static bool IsAlpha(unsigned char c) {
+ return IsType(c, ECGAlpha);
+ }
+ static bool IsAlnum(unsigned char c) {
+ return IsType(c, ECGAlnum);
+ }
+ static bool IsUnres(unsigned char c) {
+ return IsType(c, ECGUnres);
+ }
+ static const TCharFlags& GetFlags(unsigned char c) { // table lookup in the shared grammar
+ return Grammar().Get(c);
+ }
+
+ public:
+ // process an encoded string, decoding safe chars and encoding unsafe
+ static IOutputStream& ReEncode(IOutputStream& out, const TStringBuf& val, const TEncodeMapper& srcfld) {
+ TEncoder(out, srcfld).ReEncode(val);
+ return out;
+ }
+ static IOutputStream& ReEncodeTo(IOutputStream& out, const TStringBuf& val, const TEncodeMapper& srcfld, const TEncodeToMapper& dstfld) {
+ TEncoder(out, srcfld, dstfld).ReEncode(val);
+ return out;
+ }
+
+ // see also UrlUnescape() from string/quote.h
+ static IOutputStream& Decode(
+ IOutputStream& out, const TStringBuf& val, ui64 flags) {
+ return ReEncode(out, val, flags | TFeature::FeatureDecodeANY); // the flag mask converts implicitly to a TEncodeMapper
+ }
+
+ public:
+ // process a raw string or char, encode as needed
+ static IOutputStream& Hex(IOutputStream& out, unsigned char val); // two uppercase hex digits, no '%'
+ static IOutputStream& Encode(IOutputStream& out, unsigned char val) { // '%' followed by the two hex digits
+ out << '%';
+ return Hex(out, val);
+ }
+ static IOutputStream& EncodeAll(IOutputStream& out, const TStringBuf& val);
+ static IOutputStream& EncodeNotAlnum(IOutputStream& out, const TStringBuf& val);
+
+ static IOutputStream& EncodeField(IOutputStream& out, const TStringBuf& val, TField::EField fld);
+ static IOutputStream& EncodeField(IOutputStream& out, const TStringBuf& val, TField::EField fld, ui64 flags);
+
+ static IOutputStream& Encode(IOutputStream& out, const TStringBuf& val) { // EncodeField() for TField::FieldAllMAX
+ return EncodeField(out, val, TField::FieldAllMAX);
+ }
+
+ static IOutputStream& Encode(IOutputStream& out, const TStringBuf& val, ui64 flags) {
+ return EncodeField(out, val, TField::FieldAllMAX, flags);
+ }
+
+ public:
+ class TGrammar { // table of per-character flags
+ TCharFlags Map_[256]; // one entry per possible unsigned char value
+
+ public:
+ TGrammar();
+ const TCharFlags& Get(unsigned char ch) const {
+ return Map_[ch];
+ }
+
+ TCharFlags& GetMutable(unsigned char ch) {
+ return Map_[ch];
+ }
+ TCharFlags& Add(unsigned char ch, const TCharFlags& val) { // OR val into the entry for ch
+ return GetMutable(ch).Add(val);
+ }
+
+ void AddRng(unsigned char lo, unsigned char hi, const TCharFlags& val) { // inclusive range [lo, hi]
+ for (unsigned i = lo; i <= hi; ++i) // the counter is wider than unsigned char, so hi == 255 still terminates
+ Add(i, val);
+ }
+ void AddRng(unsigned char lo, unsigned char hi, ui32 type, ui64 feat, ui32 decmask = 0, ui32 encmask = 0) {
+ AddRng(lo, hi, TCharFlags(type, feat, decmask, encmask));
+ }
+
+ void Add(const TStringBuf& set, const TCharFlags& val) { // OR val into the entry of every character in set
+ for (size_t i = 0; i != set.length(); ++i)
+ Add(set[i], val);
+ }
+ void Add(const TStringBuf& set, ui32 type, ui64 feat, ui32 decmask = 0, ui32 encmask = 0) {
+ Add(set, TCharFlags(type, feat, decmask, encmask));
+ }
+ };
+
+ static const TGrammar& Grammar();
+
+ protected:
+ IOutputStream& Out;
+ const TEncodeMapper FldSrc; // stored by value
+ const TEncodeToMapper FldDst; // stored by value
+ ui64 OutFlags; // union of FeatFlags of every character passed to Do()
+ int HexValue; // accumulator for the hex digits of the escape being parsed
+
+ protected:
+ void HexReset() {
+ HexValue = 0;
+ }
+
+ void HexDigit(char c) {
+ HexAdd(c - '0');
+ }
+ void HexUpper(char c) {
+ HexAdd(c - 'A' + 10);
+ }
+ void HexLower(char c) {
+ HexAdd(c - 'a' + 10);
+ }
+
+ void HexAdd(int val) { // shifts in one more hex digit (4 bits)
+ HexValue <<= 4;
+ HexValue += val;
+ }
+
+ protected:
+ void DoSym(unsigned char ch) { // character that appeared unescaped in the input
+ const int res = FldSrc.EncodeSym(ch);
+ Do(ch, res);
+ }
+ void DoHex(unsigned char ch) { // character that appeared as a %XX escape in the input
+ const int res = FldSrc.EncodeHex(ch);
+ Do(ch, res);
+ }
+ void DoHex() { // emits the accumulated escape value, then clears the accumulator
+ DoHex(HexValue);
+ HexValue = 0;
+ }
+ void Do(unsigned char, int);
+ };
+ }
+
+ using TEncoder = NEncode::TEncoder;
+
+}
diff --git a/library/cpp/uri/encodefsm.rl6 b/library/cpp/uri/encodefsm.rl6
new file mode 100644
index 00000000000..6a323aa85a3
--- /dev/null
+++ b/library/cpp/uri/encodefsm.rl6
@@ -0,0 +1,51 @@
+#include <library/cpp/uri/encode.h>
+
+#ifdef __clang__
+ #pragma clang diagnostic ignored "-Wunused-variable"
+#endif
+
+namespace NUri {
+namespace NEncode {
+
+%%{
+ machine TEncoder;
+
+ hex = (
+ digit >{ HexDigit(fc); } |
+ [A-F] >{ HexUpper(fc); } |
+ [a-f] >{ HexLower(fc); }
+ );
+
+ escaped = ( "%" hex hex )
+ > { HexReset(); }
+ % { DoHex(); };
+
+ bad_escaped = ( "%" hex )
+ % {
+ DoSym(*(fpc - 2));
+ DoSym(*(fpc - 1));
+ };
+
+ sym = (any - bad_escaped - escaped) %{ DoSym(*(fpc - 1)); };
+
+ main := ( escaped | bad_escaped | sym )**;
+
+ write data;
+}%%
+
+ui64 TEncoder::ReEncode(const TStringBuf &url) // host function of the Ragel machine defined above; returns the flags accumulated while emitting url
+{
+ const char *p = url.data(); // p, pe, eof and cs are the variable names the generated Ragel code works with
+ const char *pe = p + url.length();
+ const char *eof = pe; // the whole input is available at once, so end of buffer is end of input
+ int cs;
+ OutFlags = 0; // start a fresh accumulation for this call
+
+ %% write init;
+ %% write exec;
+
+ return OutFlags;
+}
+
+}
+}
diff --git a/library/cpp/uri/http_url.h b/library/cpp/uri/http_url.h
new file mode 100644
index 00000000000..7c8e8d844d1
--- /dev/null
+++ b/library/cpp/uri/http_url.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "uri.h"
+#include "other.h"
+
+// XXX: use NUri::TUri directly; this whole file is for backwards compatibility
+
+class THttpURL // backwards-compatibility wrapper around NUri::TUri (see the XXX note above)
+ : public NUri::TUri {
+public:
+ typedef TField::EFlags TFlags;
+ typedef TField::EField TField; // from here on TField names the field enum inside this class
+ typedef TScheme::EKind TSchemeKind;
+ typedef TState::EParsed TParsedState;
+
+public:
+ enum { // legacy names mapped onto the current TFeature flags
+ FeatureUnescapeStandard = TFeature::FeatureDecodeStandard,
+ FeatureEscSpace = TFeature::FeatureEncodeSpaceAsPlus,
+ FeatureEscapeUnescaped = TFeature::FeatureEncodeExtendedASCII,
+ FeatureNormalPath = TFeature::FeaturePathStripRootParent,
+ };
+
+public:
+ THttpURL(unsigned defaultPort = 80)
+ : TUri(defaultPort)
+ {
+ }
+
+ THttpURL(const TStringBuf& host, ui16 port, const TStringBuf& path, const TStringBuf& query = TStringBuf(), const TStringBuf& scheme = "http", unsigned defaultPort = 0)
+ : TUri(host, port, path, query, scheme, defaultPort)
+ {
+ }
+
+ THttpURL(const TUri& url) // implicit conversion from the base type
+ : TUri(url)
+ {
+ }
+
+public: // XXX: don't use any of these legacy methods below
+public: // use TUri::GetField() instead
+ /// will return null-terminated if fld is not dirty
+ const char* Get(EField fld) const {
+ return GetField(fld).data();
+ }
+
+public: // use TUriUpdate class so that Rewrite() is only called once
+ void Set(EField field, const TStringBuf& value) { // Rewrite() runs only when SetInMemory() returns true
+ if (SetInMemory(field, value))
+ Rewrite();
+ }
+
+ template <size_t size>
+ void Set(EField field, const char (&value)[size]) { // overload for character arrays such as string literals
+ if (SetInMemory(field, value))
+ Rewrite();
+ }
+
+public: // use TUri::FldXXX methods for better control
+ // Partial quick set of the field, can be called for
+ // multiple fields
+ bool SetInMemory(EField field, const TStringBuf& value) { // forwards to FldMemSet()
+ return FldMemSet(field, value);
+ }
+
+ // clears a field
+ void Reset(EField field) {
+ FldClr(field);
+ }
+};
+
+// Legacy alias for NUri::ParsedStateToString().
+static inline const char* HttpURLParsedStateToString(const NUri::TState::EParsed& t) {
+    const char* const text = NUri::ParsedStateToString(t);
+    return text;
+}
+// Legacy alias for NUri::SchemeKindToString().
+static inline const char* HttpUrlSchemeKindToString(const NUri::TScheme::EKind& t) {
+    const char* const text = NUri::SchemeKindToString(t);
+    return text;
+}
diff --git a/library/cpp/uri/location.cpp b/library/cpp/uri/location.cpp
new file mode 100644
index 00000000000..a6a4d11ffa6
--- /dev/null
+++ b/library/cpp/uri/location.cpp
@@ -0,0 +1,31 @@
+#include "location.h"
+#include "uri.h"
+
+namespace NUri {
+ // Feature mask used for both the base URL and the Location value:
+ // the recommended set plus extended-delimiter and percent encoding, with
+ // the hash-bang -> escaped-fragment rewrite switched off.
+ // Stored as ui64, the width used by the flag consumers (TParseFlags,
+ // TEncodeMapper), so the unsigned mask is never squeezed through a signed
+ // int and cannot be sign-extended when widened.
+ static const ui64 URI_PARSE_FLAGS =
+     (TFeature::FeaturesRecommended | TFeature::FeatureConvertHostIDN | TFeature::FeatureEncodeExtendedDelim | TFeature::FeatureEncodePercent) & ~TFeature::FeatureHashBangToEscapedFragment;
+
+ // Resolves a redirect target against the URL it was received for.
+ // Returns the printed absolute URL, or an empty string when either the
+ // base URL or the location fails to parse. A location without a fragment
+ // takes over the fragment of the base URL.
+ TString ResolveRedirectLocation(const TStringBuf& baseUrl,
+                                 const TStringBuf& location) {
+     TUri baseUri;
+     TUri locationUri;
+
+     // the base URL has to be valid on its own
+     const bool baseOk = NUri::TState::ParsedOK == baseUri.Parse(baseUrl, URI_PARSE_FLAGS);
+     if (!baseOk) {
+         return "";
+     }
+
+     // the location is parsed relative to the base URL
+     const bool locationOk = NUri::TState::ParsedOK == locationUri.Parse(location, baseUri, URI_PARSE_FLAGS);
+     if (!locationOk) {
+         return "";
+     }
+
+     // inherit the fragment; the update object is scoped so that it is
+     // destroyed before the URL is printed
+     if (!locationUri.GetField(NUri::TField::FieldFragment)) {
+         NUri::TUriUpdate update(locationUri);
+         update.Set(NUri::TField::FieldFragment, baseUri.GetField(NUri::TField::FieldFragment));
+     }
+
+     TString res;
+     locationUri.Print(res, NUri::TField::FlagAllFields);
+     return res;
+ }
+
+}
diff --git a/library/cpp/uri/location.h b/library/cpp/uri/location.h
new file mode 100644
index 00000000000..0f533fe0b5c
--- /dev/null
+++ b/library/cpp/uri/location.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <util/generic/string.h>
+
+namespace NUri {
+ /**
+ * Resolve Location header according to https://tools.ietf.org/html/rfc7231#section-7.1.2
+ *
+ * @return Resolved location's url or empty string in case of any error.
+ */
+ TString ResolveRedirectLocation(const TStringBuf& baseUrl, const TStringBuf& location);
+
+}
diff --git a/library/cpp/uri/location_ut.cpp b/library/cpp/uri/location_ut.cpp
new file mode 100644
index 00000000000..26a0f644711
--- /dev/null
+++ b/library/cpp/uri/location_ut.cpp
@@ -0,0 +1,40 @@
+#include "location.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+Y_UNIT_TEST_SUITE(TResolveRedirectTests) { // unit tests for NUri::ResolveRedirectLocation()
+ Y_UNIT_TEST(Absolute) { // an absolute location replaces the base entirely
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com", "http://redir-example.com/sub"), "http://redir-example.com/sub");
+ }
+ Y_UNIT_TEST(AbsWithFragment) { // own fragment is kept; a missing one is inherited from the base
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com", "http://redir-example.com/sub#Hello"), "http://redir-example.com/sub#Hello");
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com/#Hello", "http://redir-example.com/sub"), "http://redir-example.com/sub#Hello");
+ }
+ Y_UNIT_TEST(Rel) { // a path-only location is resolved against the base host
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com", "/sub"), "http://example.com/sub");
+ }
+ Y_UNIT_TEST(RelWithFragment) { // same fragment rules for relative locations
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com", "/sub#Hello"), "http://example.com/sub#Hello");
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com/#Hello", "/sub"), "http://example.com/sub#Hello");
+ }
+ Y_UNIT_TEST(WrongLocation) { // an empty location yields an empty result
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com", ""), "");
+ }
+ Y_UNIT_TEST(WrongBase) { // an empty base yields an empty result even for an absolute location
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("", "http://example.com"), "");
+ }
+ Y_UNIT_TEST(HashBangIsNothingSpecial) { // "#!" fragments are passed through unchanged
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com", "http://redir-example.com/sub#!Hello"), "http://redir-example.com/sub#!Hello");
+ UNIT_ASSERT_EQUAL(
+ NUri::ResolveRedirectLocation("http://example.com/#!Hello", "http://redir-example.com/sub"), "http://redir-example.com/sub#!Hello");
+ }
+}
diff --git a/library/cpp/uri/other.cpp b/library/cpp/uri/other.cpp
new file mode 100644
index 00000000000..b23a5b68a9c
--- /dev/null
+++ b/library/cpp/uri/other.cpp
@@ -0,0 +1,82 @@
+#include "other.h"
+
+#include <util/string/util.h>
+#include <util/system/yassert.h>
+
+/********************************************************/
+/********************************************************/
+
+static const Tr InvertTr(".:/?#", "\005\004\003\002\001"); // presumably a position-wise character map: '.'->\005, ':'->\004, '/'->\003, '?'->\002, '#'->\001 - confirm against Tr
+static const Tr RevertTr("\005\004\003\002\001", ".:/?#"); // same two strings swapped, i.e. the reverse translation
+
+void TrspChars(char* s) { // applies InvertTr to s in place; no length is passed
+ InvertTr.Do(s);
+}
+
+void UnTrspChars(char* s) { // applies RevertTr to s in place; no length is passed
+ RevertTr.Do(s);
+}
+
+void TrspChars(char* s, size_t l) { // in-place variant with an explicit length
+ InvertTr.Do(s, l);
+}
+
+void UnTrspChars(char* s, size_t l) { // in-place variant with an explicit length
+ RevertTr.Do(s, l);
+}
+
+void TrspChars(const char* s, char* d) { // translating variant: reads s, writes d
+ InvertTr.Do(s, d);
+}
+
+void UnTrspChars(const char* s, char* d) { // translating variant: reads s, writes d
+ RevertTr.Do(s, d);
+}
+
+void InvertDomain(char* begin, char* end) { // reverses, in place, the order of the dot-separated labels of the host part of [begin, end)
+ // skip schema if it is present
+ const auto dotPos = TStringBuf{begin, end}.find('.');
+ if (dotPos == TStringBuf::npos)
+ return; // no need to invert anything
+ const auto schemaendPos = TStringBuf{begin, end}.find("://", 3); // NOTE(review): the 3 looks like a start offset rather than the needle length, so a scheme shorter than 3 characters (e.g. "ws://") would not be skipped - confirm intent
+ if (schemaendPos < dotPos) // only a "://" located before the first dot counts as a scheme separator
+ begin += schemaendPos + 3;
+ char* sl = (char*)memchr(begin, '/', end - begin); // first '/' after the scheme, if any
+ char* cl = (char*)memchr(begin, ':', sl ? sl - begin : end - begin); // first ':' before that slash, if any
+ end = cl ? cl : (sl ? sl : end); // the host ends at the ':' if there is one, else at the '/', else at end
+
+ // invert string
+ for (size_t i = 0, n = end - begin; i < n / 2; ++i)
+ DoSwap(begin[i], begin[n - i - 1]);
+
+ // invert back each host name segment
+ char* b = begin;
+ while (true) {
+ char* e = (char*)memchr(b, '.', end - b);
+ if (!e)
+ e = end; // last segment runs to the end of the host
+ for (size_t i = 0, n = e - b; i < n / 2; ++i)
+ DoSwap(b[i], b[n - i - 1]);
+ if (e == end)
+ break;
+ b = e + 1; // continue after the dot
+ }
+}
+
+void InvertUrl(char* begin, char* end) { // lowercases the part before the first '/', inverts the domain and translates the separator characters, all in place
+ char* slash = strchr(begin, '/'); // NOTE(review): strchr, strlwr and TrspChars(begin) below ignore `end` and rely on the buffer being NUL-terminated - confirm all callers guarantee that
+ if (slash) {
+ *slash = 0; // temporarily cut the string so that strlwr stops at the first '/'
+ }
+ strlwr(begin); // NOTE(review): with a "scheme://" prefix the first '/' is the one right after ':', so only the scheme is lowercased - confirm intent
+ if (slash) {
+ *slash = '/'; // restore the slash
+ }
+ InvertDomain(begin, end);
+ TrspChars(begin);
+}
+
+void RevertUrl(char* begin, char* end) { // undoes the translation and domain inversion of InvertUrl(); the lowercasing is not reverted
+ UnTrspChars(begin); // no length is passed here, `end` is only used by InvertDomain()
+ InvertDomain(begin, end);
+}
diff --git a/library/cpp/uri/other.h b/library/cpp/uri/other.h
new file mode 100644
index 00000000000..7aec22e77b3
--- /dev/null
+++ b/library/cpp/uri/other.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <util/generic/string.h>
+
+// Some functions for inverted url representation
+// No scheme cut-off, no 80th port normalization
+
+void TrspChars(char* s);
+void UnTrspChars(char* s);
+void TrspChars(char* s, size_t l);
+void UnTrspChars(char* s, size_t l);
+void TrspChars(const char* s, char* d);
+void UnTrspChars(const char* s, char* d);
+
+void InvertDomain(char* begin, char* end);
+
+// TString convenience overload: inverts the domain in place and returns url.
+inline TString& InvertDomain(TString& url) {
+    char* const first = url.begin();
+    InvertDomain(first, first + url.size());
+    return url;
+}
+
+void InvertUrl(char* begin, char* end);
+
+// C-string convenience overload: the range end is found with strlen().
+inline void InvertUrl(char* url) {
+    const size_t len = strlen(url);
+    InvertUrl(url, url + len);
+}
+
+// TString convenience overload: inverts the URL in place and returns url.
+inline TString& InvertUrl(TString& url) {
+    char* const first = url.begin();
+    InvertUrl(first, first + url.size());
+    return url;
+}
+
+void RevertUrl(char* begin, char* end);
+
+// C-string convenience overload: the range end is found with strlen().
+inline void RevertUrl(char* url) {
+    const size_t len = strlen(url);
+    RevertUrl(url, url + len);
+}
+
+// TString convenience overload: reverts the URL in place and returns url.
+inline TString& RevertUrl(TString& url) {
+    char* const first = url.begin();
+    RevertUrl(first, first + url.size());
+    return url;
+}
diff --git a/library/cpp/uri/parse.cpp b/library/cpp/uri/parse.cpp
new file mode 100644
index 00000000000..1db4e008c49
--- /dev/null
+++ b/library/cpp/uri/parse.cpp
@@ -0,0 +1,207 @@
+#include "parse.h"
+#include "common.h"
+#include "encode.h"
+
+namespace NUri {
+ // Per-field feature flags, one entry per field in the order given by the trailing
+ // comments (scheme, username, password, host, port, path, query, fragment).
+ // GetFieldFlags() intersects the entry of a field with the parser flags.
+ // NOTE(review): the two TParseFlags constructor arguments are presumably the
+ // "allow" and "extra" flag sets -- confirm against common.h.
+ const TParseFlags TParser::FieldFlags[] =
+ {
+ TParseFlags(0 // FieldScheme
+ | TFeature::FeatureToLower,
+ 0)
+
+ ,
+ TParseFlags(0 // FieldUsername
+ | TFeature::FeatureDecodeANY | TFeature::FeaturesDecode | TFeature::FeatureEncodePercent,
+ 0 | TFeature::FeatureToLower)
+
+ ,
+ TParseFlags(0 // FieldPassword
+ | TFeature::FeatureDecodeANY | TFeature::FeaturesDecode | TFeature::FeatureEncodePercent,
+ 0 | TFeature::FeatureToLower)
+
+ ,
+ TParseFlags(0 // FieldHost
+ | TFeature::FeatureToLower | TFeature::FeatureUpperEncoded | (TFeature::FeaturesMaybeEncode & ~TFeature::FeatureEncodeExtendedDelim),
+ 0 | TFeature::FeaturesMaybeDecode)
+
+ ,
+ TParseFlags(0 // FieldPort
+ ,
+ 0)
+
+ ,
+ TParseFlags(0 // FieldPath
+ | TFeature::FeaturesEncodePChar | TFeature::FeaturePathOperation,
+ 0 | TFeature::FeatureToLower | TFeature::FeatureEncodeSpaceAsPlus)
+
+ ,
+ TParseFlags(0 // FieldQuery
+ | TFeature::FeaturesEncodePChar | TFeature::FeatureEncodeSpaceAsPlus,
+ 0 | TFeature::FeatureToLower)
+
+ ,
+ TParseFlags(0 // FieldFragment
+ | TFeature::FeaturesEncodePChar,
+ 0 | TFeature::FeatureToLower | TFeature::FeatureEncodeSpaceAsPlus)};
+
+ namespace NParse {
+ // Merges the requirements collected in range into this object.
+ // Plaintext flags are always merged; encode flags and the Encode/Decode
+ // counters are merged only for the bits of range.FlagsEncodeMasked selected by mask.
+ void TRange::AddRange(const TRange& range, ui64 mask) {
+     FlagsAllPlaintext |= range.FlagsAllPlaintext;
+
+     // only the flags which apply here are taken into account
+     const ui64 applicable = mask & range.FlagsEncodeMasked;
+     if (0 != applicable) {
+         FlagsEncodeMasked |= applicable;
+         if (0 != (applicable & TFeature::FeaturesMaybeEncode))
+             Encode += range.Encode;
+         if (0 != (applicable & TFeature::FeaturesDecode))
+             Decode += range.Decode;
+     }
+ }
+
+ }
+
+ // Flushes the requirements accumulated in CurRange into every section that has
+ // been started and is either still open or ends at/after the start of the range,
+ // then resets CurRange. ptr is used for debug output only (DO_PRN).
+ void TParser::copyRequirementsImpl(const char* ptr) {
+ Y_ASSERT(0 != CurRange.FlagsAllPlaintext);
+ Y_UNUSED(ptr);
+ #ifdef DO_PRN
+ PrintHead(ptr, __FUNCTION__)
+ << " all=[" << IntToString<16>(CurRange.FlagsAllPlaintext)
+ << "] enc=[" << IntToString<16>(CurRange.FlagsEncodeMasked)
+ << " & " << IntToString<16>(Flags.Allow | Flags.Extra) << "]";
+ PrintTail(CurRange.Beg, ptr);
+ #endif
+ for (int i = 0; i < TField::FieldUrlMAX; ++i) {
+ const TField::EField fld = TField::EField(i);
+ TSection& section = Sections[fld];
+ // update only sections in progress
+ if (nullptr == section.Beg)
+ continue;
+ // and overlapping with the range
+ if (nullptr != section.End && section.End < CurRange.Beg)
+ continue;
+ #ifdef DO_PRN
+ PrintHead(ptr, __FUNCTION__, fld)
+ << " all=[" << IntToString<16>(CurRange.FlagsAllPlaintext)
+ << "] enc=[" << IntToString<16>(CurRange.FlagsEncodeMasked)
+ << " & " << IntToString<16>(GetFieldFlags(fld)) << "]";
+ PrintTail(section.Beg, ptr);
+ #endif
+ // the mask restricts the merged encode flags to those enabled for this field
+ section.AddRange(CurRange, GetFieldFlags(fld));
+ }
+ CurRange.Reset();
+ }
+
+ // Closes a pending '%' (PctBegin is set): registers the requirements of a
+ // literal '%' character at PctBegin and clears PctBegin.
+ // ptr is used for debug output only (DO_PRN).
+ void TParser::PctEndImpl(const char* ptr) {
+ #ifdef DO_PRN
+ PrintHead(PctBegin, __FUNCTION__);
+ PrintTail(PctBegin, ptr);
+ #else
+ Y_UNUSED(ptr);
+ #endif
+ setRequirement(PctBegin, TEncoder::GetFlags('%').FeatFlags);
+ PctBegin = nullptr;
+ }
+
+ // Completes a percent-encoded sequence: clears PctBegin and registers at ptr the
+ // combined requirements of '%' and of the character whose code was accumulated
+ // in HexValue, with FeaturesMaybeEncode excluded from the plaintext flags.
+ void TParser::HexSet(const char* ptr) {
+ Y_ASSERT(nullptr != PctBegin);
+ #ifdef DO_PRN
+ PrintHead(ptr, __FUNCTION__);
+ PrintTail(PctBegin, ptr + 1);
+ #endif
+ PctBegin = nullptr;
+ // HexValue is narrowed to a single byte here
+ const unsigned char ch = HexValue;
+ ui64 flags = TEncoder::GetFlags('%').FeatFlags | TEncoder::GetFlags(ch).FeatFlags;
+
+ setRequirementExcept(ptr, flags, TFeature::FeaturesMaybeEncode);
+ }
+
+ // Runs the state machine over UriStr and validates the result against Flags.
+ // Fills Sections and Scheme; returns the resulting parse state
+ // (ParsedOK, ParsedRootless or one of the ParsedBad* codes).
+ TState::EParsed TParser::ParseImpl() {
+ #ifdef DO_PRN
+ PrintHead(UriStr.data(), "[Parsing]") << "URL";
+ PrintTail(UriStr);
+ #endif
+
+ const bool ok = doParse(UriStr.data(), UriStr.length());
+
+ #ifdef DO_PRN
+ Cdbg << (ok ? "[Parsed]" : "[Failed]");
+ for (int idx = 0; idx < TField::FieldUrlMAX; ++idx) {
+ const TSection& section = Sections[idx];
+ if (section.IsSet())
+ Cdbg << ' ' << TField::EField(idx) << "=[" << section.Get() << ']';
+ }
+ Cdbg << Endl;
+ #endif
+
+ if (!ok) {
+ // a failure is tolerated only with FeatureTryToFix and only if the fragment has been entered
+ if (!(Flags & TFeature::FeatureTryToFix) || !Sections[TField::FieldFrag].Beg)
+ return TState::ParsedBadFormat;
+ //Here: error was in fragment, just ignore it
+ ResetSection(TField::FieldFrag);
+ }
+
+ if ((Flags & TFeature::FeatureDenyNetworkPath) && IsNetPath())
+ return TState::ParsedBadFormat;
+
+ const TSection& scheme = Sections[TField::FieldScheme];
+ Scheme = scheme.IsSet() ? TSchemeInfo::GetKind(scheme.Get()) : TScheme::SchemeEmpty;
+ const TSchemeInfo& schemeInfo = TSchemeInfo::Get(Scheme);
+
+ if (IsRootless()) {
+ // opaque case happens
+ if (schemeInfo.FldReq & TField::FlagHost)
+ return TState::ParsedBadFormat;
+
+ if (TScheme::SchemeEmpty == Scheme)
+ return TState::ParsedBadScheme;
+
+ if (Flags & TFeature::FeatureAllowRootless)
+ return TState::ParsedOK;
+
+ if (!(Flags & TFeature::FeatureSchemeFlexible))
+ return TState::ParsedBadScheme;
+
+ return TState::ParsedRootless;
+ }
+
+ // if both sections of a pair are set and start at the same position, the first one is dropped
+ checkSectionCollision(TField::FieldUser, TField::FieldHost);
+ checkSectionCollision(TField::FieldPass, TField::FieldPort);
+
+ if (0 == (Flags & TFeature::FeatureAuthSupported))
+ if (Sections[TField::FieldUser].IsSet() || Sections[TField::FieldPass].IsSet())
+ return TState::ParsedBadAuth;
+
+ // strip trailing dots from the host
+ TSection& host = Sections[TField::FieldHost];
+ if (host.IsSet())
+ for (; host.End != host.Beg && '.' == host.End[-1];)
+ --host.End;
+
+ if (scheme.IsSet()) {
+ // flags of which at least one must be enabled for this kind of scheme to be accepted
+ ui64 wantCareFlags = 0;
+ switch (Scheme) {
+ case TScheme::SchemeHTTP:
+ break;
+ case TScheme::SchemeEmpty:
+ Scheme = TScheme::SchemeUnknown;
+ [[fallthrough]];
+ case TScheme::SchemeUnknown:
+ wantCareFlags =
+ TFeature::FeatureSchemeFlexible | TFeature::FeatureNoRelPath;
+ break;
+ default:
+ wantCareFlags =
+ TFeature::FeatureSchemeFlexible | TFeature::FeatureSchemeKnown;
+ break;
+ }
+
+ if (0 != wantCareFlags && 0 == (Flags & wantCareFlags))
+ return TState::ParsedBadScheme;
+ // a non-empty host is required by the scheme info or by FeatureRemoteOnly
+ if ((schemeInfo.FldReq & TField::FlagHost) || (Flags & TFeature::FeatureRemoteOnly))
+ if (!host.IsSet() || 0 == host.Len())
+ return TState::ParsedBadFormat;
+ }
+
+ return TState::ParsedOK;
+ }
+
+}
diff --git a/library/cpp/uri/parse.h b/library/cpp/uri/parse.h
new file mode 100644
index 00000000000..ca2358e5728
--- /dev/null
+++ b/library/cpp/uri/parse.h
@@ -0,0 +1,361 @@
+#pragma once
+
+// #define DO_PRN
+
+#include <cstddef>
+
+#include "common.h"
+
+#include <library/cpp/charset/doccodes.h>
+#include <util/generic/strbuf.h>
+#include <util/stream/output.h>
+#include <util/string/cast.h>
+#include <util/system/yassert.h>
+
+namespace NUri {
+ class TParser;
+
+ namespace NParse {
+ // Accumulates feature flags and encode/decode counters for a span of the
+ // parsed string which starts at Beg.
+ class TRange {
+ public:
+ const char* Beg; // start of the range, nullptr while nothing has been added
+ ui64 FlagsEncodeMasked; // flags added after masking
+ ui64 FlagsAllPlaintext; // flags added without masking
+ ui32 Encode; // number of added flags intersecting FeaturesMaybeEncode
+ ui32 Decode; // number of added flags intersecting FeaturesDecode (and not counted in Encode)
+
+ public:
+ TRange(const char* beg = nullptr)
+ : Beg(beg)
+ , FlagsEncodeMasked(0)
+ , FlagsAllPlaintext(0)
+ , Encode(0)
+ , Decode(0)
+ {
+ }
+
+ // Drops all accumulated data and restarts the range at beg.
+ void Reset(const char* beg = nullptr) {
+ *this = TRange(beg);
+ }
+
+ // Merges another range into this one; defined in parse.cpp.
+ void AddRange(const TRange& range, ui64 mask);
+
+ // Adds a non-zero flag both as plaintext and, masked by mask, as encode flag.
+ void AddFlag(const char* ptr, ui64 mask, ui64 flag) {
+ if (0 != flag)
+ AddFlagImpl(ptr, mask, flag, flag);
+ }
+
+ // Same as AddFlag, but the bits of exclflag are removed from the plaintext part.
+ void AddFlagExcept(const char* ptr, ui64 mask, ui64 flag, ui64 exclflag) {
+ if (0 != flag)
+ AddFlagImpl(ptr, mask, flag & ~exclflag, flag);
+ }
+
+ // Same as AddFlag, but the encode part is skipped if mask intersects exclmask.
+ void AddFlagUnless(const char* ptr, ui64 mask, ui64 flag, ui64 exclmask) {
+ if (0 != flag)
+ AddFlagImpl(ptr, mask, flag, flag, exclmask);
+ }
+
+ // Combination of AddFlagExcept and AddFlagUnless.
+ void AddFlag(const char* ptr, ui64 mask, ui64 flag, ui64 exclflag, ui64 exclmask) {
+ if (0 != flag)
+ AddFlagImpl(ptr, mask, flag & ~exclflag, flag, exclmask);
+ }
+
+ private:
+ void AddFlagImpl(const char* ptr, ui64 mask, ui64 plainflag, ui64 encflag) {
+ AddFlagAllPlaintextImpl(ptr, plainflag);
+ AddFlagEncodeMaskedImpl(encflag & mask);
+ }
+
+ void AddFlagImpl(const char* ptr, ui64 mask, ui64 plainflag, ui64 encflag, ui64 exclmask) {
+ AddFlagAllPlaintextImpl(ptr, plainflag);
+ if (0 == (mask & exclmask))
+ AddFlagEncodeMaskedImpl(encflag & mask);
+ }
+
+ // The first added flag also fixes the start of the range.
+ void AddFlagAllPlaintextImpl(const char* ptr, ui64 flag) {
+ if (nullptr == Beg)
+ Beg = ptr;
+ FlagsAllPlaintext |= flag;
+ }
+
+ // Each non-zero flag increments at most one counter: Encode takes precedence over Decode.
+ void AddFlagEncodeMaskedImpl(ui64 flag) {
+ if (0 == flag)
+ return;
+ FlagsEncodeMasked |= flag;
+ if (flag & TFeature::FeaturesMaybeEncode)
+ ++Encode;
+ else if (flag & TFeature::FeaturesDecode)
+ ++Decode;
+ }
+ };
+
+ }
+
+ // A parsed field of the uri: the [Beg, End) span of the source string together
+ // with the flags and counters inherited from TRange.
+ // Only TParser can create and modify sections; the public part is read-only.
+ class TSection
+ : protected NParse::TRange {
+ private:
+ friend class TParser;
+
+ private:
+ const char* End; // nullptr until the section has been left (see Leave)
+
+ TSection(const char* beg = nullptr)
+ : NParse::TRange(beg)
+ , End(nullptr)
+ {
+ }
+
+ // Returns the section to the initial (unset) state.
+ void Reset() {
+ Enter(nullptr);
+ }
+
+ // Same as Reset(); pc is only checked by the assertion.
+ void Reset(const char* pc) {
+ Y_ASSERT(!Beg || !pc || Beg < pc);
+ Reset();
+ }
+
+ // Starts the section at pc, dropping any previous state.
+ void Enter(const char* pc) {
+ *this = TSection(pc);
+ }
+
+ // Finishes the section at pc; always returns true.
+ bool Leave(const char* pc) {
+ Y_ASSERT(Beg);
+ End = pc;
+ return true;
+ }
+
+ // Makes the section cover exactly buf.
+ void Set(const TStringBuf& buf) {
+ Enter(buf.data());
+ Leave(buf.data() + buf.length());
+ }
+
+ public:
+ // True once the end of the section has been recorded.
+ bool IsSet() const {
+ return End;
+ }
+
+ TStringBuf Get() const {
+ return TStringBuf(Beg, End);
+ }
+
+ size_t Len() const {
+ return End - Beg;
+ }
+
+ // NOTE(review): the factor 2 presumably reflects that "%XX" takes three
+ // characters instead of one -- confirm with the encoder.
+ size_t DecodedLen() const {
+ return Len() - 2 * Decode;
+ }
+
+ size_t EncodedLen() const {
+ return 2 * Encode + DecodedLen();
+ }
+
+ ui32 GetEncode() const {
+ return Encode;
+ }
+
+ ui32 GetDecode() const {
+ return Decode;
+ }
+
+ ui64 GetFlagsEncode() const {
+ return FlagsEncodeMasked;
+ }
+
+ ui64 GetFlagsAllPlaintext() const {
+ return FlagsAllPlaintext;
+ }
+ };
+
+ // Splits a uri string into sections (one per field) and collects, for every
+ // section, the flags describing the processing its characters require.
+ // The whole parsing is done by the constructor; the result is available in
+ // State, Scheme and Sections. The sections point into the parsed string.
+ class TParser {
+ public:
+ TSection Sections[TField::FieldUrlMAX];
+ TScheme::EKind Scheme;
+ const TParseFlags Flags;
+ const TStringBuf UriStr;
+ TState::EParsed State;
+ ECharset Enc;
+
+ public:
+ // Parses uri immediately; FeatureDecodeANY is always added to flags.
+ TParser(const TParseFlags& flags, const TStringBuf& uri, ECharset enc = CODES_UTF8)
+ : Scheme(TScheme::SchemeEmpty)
+ , Flags(flags | TFeature::FeatureDecodeANY)
+ , UriStr(uri)
+ , State(TState::ParsedEmpty)
+ , Enc(enc)
+ , HexValue(0)
+ , PctBegin(nullptr)
+ {
+ Y_ASSERT(0 == (Flags & TFeature::FeaturePathOperation)
+ // can't define all of them
+ || TFeature::FeaturesPath != (Flags & TFeature::FeaturesPath));
+ State = ParseImpl();
+ }
+
+ public:
+ const TSection& Get(TField::EField fld) const {
+ return Sections[fld];
+ }
+ TSection& GetMutable(TField::EField fld) {
+ return Sections[fld];
+ }
+ bool Has(TField::EField fld) const {
+ return Get(fld).IsSet();
+ }
+ // The string has a host and starts with "//".
+ bool IsNetPath() const {
+ return Has(TField::FieldHost) && 2 < UriStr.length() && '/' == UriStr[0] && '/' == UriStr[1];
+ }
+ // There is a scheme, no host, and the path is absent or does not start with '/'.
+ bool IsRootless() const {
+ return Has(TField::FieldScheme) && !Has(TField::FieldHost) && (!Has(TField::FieldPath) || '/' != Get(TField::FieldPath).Get()[0]);
+ }
+ // for RFC 2396 compatibility
+ bool IsOpaque() const {
+ return IsRootless();
+ }
+ // Flags of the given field restricted by the supplied parse flags.
+ static ui64 GetFieldFlags(TField::EField fld, const TParseFlags& flags) {
+ return FieldFlags[fld] & flags;
+ }
+ ui64 GetFieldFlags(TField::EField fld) const {
+ return GetFieldFlags(fld, Flags);
+ }
+
+ protected:
+ static const TParseFlags FieldFlags[TField::FieldUrlMAX];
+ TSection::TRange CurRange; // requirements collected since the last copyRequirements
+ unsigned HexValue; // value of the hex digits seen after the last '%'
+ const char* PctBegin; // position of a '%' not yet completed by HexSet, or nullptr
+
+ #ifdef DO_PRN
+ IOutputStream& PrintAddr(const char* ptr) const {
+ return Cdbg << "[" << IntToString<16>(ui64(ptr)) << "] ";
+ }
+
+ IOutputStream& PrintHead(const char* ptr, const char* func) const {
+ return PrintAddr(ptr) << func << " ";
+ }
+
+ IOutputStream& PrintHead(const char* ptr, const char* func, const TField::EField& fld) const {
+ return PrintHead(ptr, func) << fld;
+ }
+
+ IOutputStream& PrintTail(const TStringBuf& val) const {
+ return Cdbg << " [" << val << "]" << Endl;
+ }
+ IOutputStream& PrintTail(const char* beg, const char* end) const {
+ return PrintTail(TStringBuf(beg, end));
+ }
+ #endif
+
+ // Returns the section of the field to the unset state.
+ void ResetSection(TField::EField fld, const char* pc = nullptr) {
+ #ifdef DO_PRN
+ PrintHead(pc, __FUNCTION__, fld);
+ PrintTail(pc);
+ #endif
+ Sections[fld].Reset(pc);
+ }
+
+ // Makes the section of the field cover exactly val.
+ void storeSection(const TStringBuf& val, TField::EField fld) {
+ #ifdef DO_PRN
+ PrintHead(val.data(), __FUNCTION__, fld);
+ PrintTail(val);
+ #endif
+ Sections[fld].Set(val);
+ }
+
+ // Flushes pending requirements into the sections in progress, then starts the section at pc.
+ void startSection(const char* pc, TField::EField fld) {
+ #ifdef DO_PRN
+ PrintHead(pc, __FUNCTION__, fld);
+ PrintTail(pc);
+ #endif
+ copyRequirements(pc);
+ Sections[fld].Enter(pc);
+ }
+
+ // Ends the section at pc, then flushes pending requirements (the ended section can still receive them).
+ void finishSection(const char* pc, TField::EField fld) {
+ #ifdef DO_PRN
+ PrintHead(pc, __FUNCTION__, fld);
+ PrintTail(pc);
+ #endif
+ if (Sections[fld].Leave(pc))
+ copyRequirements(pc);
+ }
+
+ // The setRequirement* helpers record flags for the character at ptr in CurRange,
+ // masked by Flags.Allow | Flags.Extra; see the corresponding TRange::AddFlag* methods.
+ void setRequirement(const char* ptr, ui64 flags) {
+ #ifdef DO_PRN
+ PrintHead(ptr, __FUNCTION__) << IntToString<16>(flags)
+ << " & mask=" << IntToString<16>(Flags.Allow | Flags.Extra);
+ PrintTail(ptr);
+ #endif
+ CurRange.AddFlag(ptr, Flags.Allow | Flags.Extra, flags);
+ }
+
+ void setRequirementExcept(const char* ptr, ui64 flags, ui64 exclflag) {
+ #ifdef DO_PRN
+ PrintHead(ptr, __FUNCTION__) << IntToString<16>(flags)
+ << " & exclflag=" << IntToString<16>(exclflag)
+ << " & mask=" << IntToString<16>(Flags.Allow | Flags.Extra);
+ PrintTail(ptr);
+ #endif
+ CurRange.AddFlagExcept(ptr, Flags.Allow | Flags.Extra, flags, exclflag);
+ }
+
+ void setRequirementUnless(const char* ptr, ui64 flags, ui64 exclmask) {
+ #ifdef DO_PRN
+ PrintHead(ptr, __FUNCTION__) << IntToString<16>(flags)
+ << " & exclmask=" << IntToString<16>(exclmask)
+ << " & mask=" << IntToString<16>(Flags.Allow | Flags.Extra);
+ PrintTail(ptr);
+ #endif
+ CurRange.AddFlagUnless(ptr, Flags.Allow | Flags.Extra, flags, exclmask);
+ }
+
+ void copyRequirementsImpl(const char* ptr);
+ // Closes a pending '%' and, if CurRange is not empty and does not start at ptr,
+ // distributes it among the sections.
+ void copyRequirements(const char* ptr) {
+ PctEnd(ptr);
+ if (nullptr != CurRange.Beg && CurRange.Beg != ptr)
+ copyRequirementsImpl(ptr);
+ }
+
+ // Hex digit handlers: each appends the value of the digit c to HexValue.
+ void HexDigit(const char* ptr, char c) {
+ Y_UNUSED(ptr);
+ HexAdd(c - '0');
+ }
+ // An upper-case digit additionally requires FeatureToLower unless FeatureUpperEncoded is in the mask.
+ void HexUpper(const char* ptr, char c) {
+ setRequirementUnless(ptr, TFeature::FeatureToLower, TFeature::FeatureUpperEncoded);
+ HexAdd(c - 'A' + 10);
+ }
+ // A lower-case digit additionally requires FeatureUpperEncoded.
+ void HexLower(const char* ptr, char c) {
+ setRequirement(ptr, TFeature::FeatureUpperEncoded);
+ HexAdd(c - 'a' + 10);
+ }
+ void HexAdd(unsigned val) {
+ HexValue <<= 4;
+ HexValue += val;
+ }
+ void HexReset() {
+ HexValue = 0;
+ }
+ void HexSet(const char* ptr);
+
+ void PctEndImpl(const char* ptr);
+ // Closes a pending '%' unless it is located exactly at ptr.
+ void PctEnd(const char* ptr) {
+ if (nullptr != PctBegin && ptr != PctBegin)
+ PctEndImpl(ptr);
+ }
+ // Registers a new '%' at ptr, closing the previous pending one first.
+ void PctBeg(const char* ptr) {
+ PctEnd(ptr);
+ HexReset();
+ PctBegin = ptr;
+ }
+
+ // If both sections are set and start at the same position, resets the first one.
+ void checkSectionCollision(TField::EField fld1, TField::EField fld2) {
+ if (Sections[fld1].IsSet() && Sections[fld2].IsSet() && Sections[fld1].Beg == Sections[fld2].Beg) {
+ Sections[fld1].Reset();
+ }
+ }
+
+ // Runs the generated state machine (parsefsm.rl6); true if a final state was reached.
+ bool doParse(const char* str_beg, size_t length);
+ TState::EParsed ParseImpl();
+ };
+
+}
diff --git a/library/cpp/uri/parsefsm.rl6 b/library/cpp/uri/parsefsm.rl6
new file mode 100644
index 00000000000..70977236503
--- /dev/null
+++ b/library/cpp/uri/parsefsm.rl6
@@ -0,0 +1,501 @@
+#include <library/cpp/uri/parse.h>
+
+#ifdef __clang__
+ #pragma clang diagnostic ignored "-Wunused-variable"
+#endif
+
+%%{
+ machine TParser;
+
+ #================================================
+ # RFC 3986 http://tools.ietf.org/html/rfc3986
+ # with some modifications
+ #================================================
+ # The RegEx
+ #
+ # http://www.ics.uci.edu/pub/ietf/uri/#Related
+ # ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ # 12 3 4 5 6 7 8 9
+ #results in the following subexpression matches:
+ # $1 = http:
+ # $2 = http
+ # $3 = //www.ics.uci.edu
+ # $4 = www.ics.uci.edu
+ # $5 = /pub/ietf/uri/
+ # $6 = <undefined>
+ # $7 = <undefined>
+ # $8 = #Related
+ # $9 = Related
+ #
+ # So $2:scheme $4:authority $5:path $7:query $9:fragment
+ #================================================
+
+
+ #================================================
+ # List of all ASCII characters and where they can be used
+ #================================================
+
+ # 0-31 x00-1F cntrl ext_cntrl
+ # 32 x20 space ext_space
+ # 33 x21 ! sub_delims
+ # 34 x22 " ext_delims
+ # 35 x23 # gen_delims / f=frag
+ # 36 x24 $ sub_delims
+ # 37 x25 % PCT
+ # 38 x26 & sub_delims
+ # 39 x27 ' sub_delims
+ # 40 x28 ( sub_delims
+ # 41 x29 ) sub_delims
+ # 42 x2A * sub_delims
+ # 43 x2B + sub_delims
+ # 44 x2C , sub_delims
+ # 45 x2D - unreserved
+ # 46 x2E . unreserved
+ # 47 x2F / gen_delims / f=path,qry,frag
+ # 48-57 x30-39 0-9 unreserved
+ # 58 x3A : gen_delims / f=pass,path,qry,frag
+ # 59 x3B ; sub_delims
+ # 60 x3C < ext_delims
+ # 61 x3D = sub_delims
+ # 62 x3E > ext_delims
+ # 63 x3F ? gen_delims / f=qry,frag
+ # 64 x40 @ gen_delims / f=path,qry,frag
+ # 65-90 x41-5A A-Z unreserved
+ # 91 x5B [ gen_delims / ext_delims
+ # 92 x5C \ ext_delims
+ # 93 x5D ] gen_delims / ext_delims
+ # 94 x5E ^ ext_delims
+ # 95 x5F _ unreserved
+ # 96 x60 ` ext_delims
+ # 97-122 x61-7A a-z unreserved
+ # 123 x7B { ext_delims
+ # 124 x7C | ext_delims
+ # 125 x7D } ext_delims
+ # 126 x7E ~ unreserved
+ # 127 x7F DEL ext_cntrl
+ # 128-255 x80-FF ext_ascii
+
+
+ #================================================
+ # Actions used in multiple definitions
+ #================================================
+
+ action act_req_enc_sql { REQ(fpc, FeatureEncodeForSQL) }
+
+ # REQ must apply to a char in range but not after the range has been reset
+ action act_req_pathop { REQ(fpc - 1, FeaturePathOperation) }
+
+ action act_clr_scheme { CLR(fpc, Scheme) }
+ action act_clr_user { CLR(fpc, User) }
+ action act_clr_host { CLR(fpc, Host) }
+ action act_beg_host { BEG(fpc, Host) }
+ action act_end_host { END(fpc, Host) }
+ action act_beg_path { BEG(fpc, Path) }
+ action act_end_path { END(fpc, Path) }
+
+
+ #================================================
+ # RFC 3986 ABNFs
+ #================================================
+
+ DIGIT = digit;
+
+ ALPHA = ( upper >{ REQ(fpc, FeatureToLower) } ) |
+ lower;
+
+ ALNUM = ALPHA | DIGIT;
+
+ PCT = "%" >{ PctBeg(fpc); } ;
+
+ HEXDIG = (
+ DIGIT >{ HexDigit(fpc, fc); }
+ | [A-F] >{ HexUpper(fpc, fc); }
+ | [a-f] >{ HexLower(fpc, fc); }
+ );
+
+ # HexSet sets REQ so must apply in range
+ HEXNUM = ( HEXDIG HEXDIG ) %{ HexSet(fpc - 1); };
+
+ pct_encoded = PCT HEXNUM;
+
+ unreserved = ALNUM | "-" | "." | "_" | "~";
+
+ gen_delims = ":" | "/" | "?" | "#" | "[" | "]" | "@";
+
+ sub_delims = "!" | "$" | "&" | "(" | ")"
+ | "*" | "+" | "," | ";" | "="
+ | ( ['] >act_req_enc_sql );
+
+
+ #================================================
+ # Local ABNFs
+ #================================================
+
+ VALID = ^(cntrl | space) | " ";
+
+ # safe character sequences
+ safe = unreserved | pct_encoded | sub_delims;
+
+ # MOD: Yandex extensions
+
+ ext_ascii = (VALID - ascii) >{ REQ(fpc, FeatureEncodeExtendedASCII) };
+ ext_delims = ( "[" | "]" | "|" | "{" | "}" | "`" | "^" | "<" | ">"
+ | ( ["\\] >act_req_enc_sql )
+ ) >{ REQ(fpc, FeatureEncodeExtendedDelim) }; # " fix hilite
+ ext_space = " " >{ REQ(fpc, FeatureEncodeSpace) };
+ ext_cntrl = cntrl >{ REQ(fpc, FeatureEncodeCntrl) };
+
+ pct_maybe_encoded = PCT (HEXDIG | HEXNUM)? ;
+ ext_safe = unreserved
+ | pct_maybe_encoded
+ | sub_delims
+ | ext_delims
+ | ext_space
+ | ext_cntrl
+ | ext_ascii;
+
+ # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
+ # uric (RFC 2396)
+ # MOD: extension to format, add extended delimiters and 8-bit ascii
+
+ pchar_nc = ext_safe | "@";
+ pchar = pchar_nc | ":";
+ path_sep = "/";
+ uric = pchar | path_sep | "?";
+
+
+ #================================================
+ # Fields
+ #================================================
+ # Single fields use fXXX as machine definitions
+
+
+ #================================================
+ # Scheme
+ # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+ #================================================
+
+ scheme = ( ALPHA ( ALPHA | DIGIT | "+" | "-" | "." )** );
+ fscheme = scheme >{ BEG(fpc, Scheme) } %{ END(fpc, Scheme) };
+
+
+ #================================================
+ # UserInfo
+ # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+ #================================================
+
+ # MOD: split into a pair of sections: username and password
+
+ fuser = ( ext_safe )** >{ BEG(fpc, User) } %{ END(fpc, User) };
+ fpass = ( ext_safe | ":" )** >{ BEG(fpc, Pass) } %{ END(fpc, Pass) };
+ userinfo = ( fuser ( ":" fpass )? ) ( "@" %act_clr_host @^act_clr_user );
+
+
+ #================================================
+ # Hostname
+ # host = IP-literal / IPv4address / reg-name
+ #================================================
+
+ # MOD: simplify IP-literal for now
+ IPv6address = (HEXDIG | ":" | ".")+;
+ IP_literal = "[" IPv6address "]";
+
+ # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+ # MOD: simplify dec-octet which originally matches only 0-255
+
+ dec_octet = DIGIT+;
+ IPv4address = dec_octet "." dec_octet "." dec_octet "." dec_octet;
+
+ # MOD: non-empty; will use host?
+ # reg-name = *( unreserved / pct-encoded / sub-delims )
+ ### todo: allow ':' (need to fix grammar to disambiguate port)
+ achar = any - (0x00 .. 0x20) - '/' - '#' - '?' - ':' - '%';
+ upperhalf = any - (0x00 .. 0x7F);
+ hostname = (((achar | pct_encoded)+) & (any* (alnum | upperhalf) any*));
+ reg_name = hostname - IPv4address - IP_literal;
+
+ # uses first-match-wins approach
+ host = IP_literal | IPv4address | (reg_name - IPv4address);
+ fhost = host? >act_beg_host %act_end_host;
+ fhost_nempty = host >act_beg_host %act_end_host;
+
+
+ #================================================
+ # Port
+ # port = *DIGIT
+ #================================================
+
+ # MOD: use fport? for empty
+ fport = DIGIT+ >{ BEG(fpc, Port) } %{ END(fpc, Port) };
+
+
+ #================================================
+ # Authority
+ # authority = [ userinfo "@" ] host [ ":" port ]
+ #================================================
+
+ authority = userinfo? fhost ( ":" fport? )? ;
+
+
+ #================================================
+ # Path
+ #================================================
+ # path = path-abempty ; begins with "/" or is empty
+ # / path-absolute ; begins with "/" but not "//"
+ # / path-noscheme ; begins with a non-colon segment
+ # / path-rootless ; begins with a segment
+ # / path-empty ; zero characters
+ #================================================
+
+ # checkPath rules
+
+ checkPathHead =
+ "." ( "."? path_sep VALID* )? %act_req_pathop ;
+
+ checkPathTail =
+ VALID*
+ ( path_sep "."{1,2} ) %act_req_pathop ;
+
+ checkPathMid = VALID*
+ ( path_sep "."{,2} path_sep ) %act_req_pathop
+ VALID*;
+
+ checkAbsPath = checkPathMid | checkPathTail | VALID*;
+ checkRelPath = checkPathHead | checkAbsPath;
+
+ # segment = *pchar
+ segment = pchar**;
+
+ # segment-nz = 1*pchar
+ segment_nz = pchar+;
+
+ # segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
+ segment_nz_nc = pchar_nc+;
+
+ sep_segment = path_sep segment;
+
+ # non-standard definitions
+
+ fpath_abnempty =
+ (
+ ( sep_segment+ )
+ & checkAbsPath
+ )
+ >act_beg_path %act_end_path
+ ;
+
+ fpath_relative =
+ (
+ "."
+ ( "."? sep_segment+ )?
+ )
+ >act_beg_path %act_req_pathop %act_end_path
+ ;
+
+ # standard definitions
+
+ # do not save empty paths, they behave differently in relative resolutions
+ fpath_empty = zlen;
+
+ fpath_abempty = fpath_abnempty?;
+
+ fpath_absolute =
+ (
+ ( path_sep ( segment_nz sep_segment* )? )
+ & checkAbsPath
+ )
+ >act_beg_path %act_end_path
+ ;
+
+ fpath_noscheme =
+ (
+ ( segment_nz_nc sep_segment* )
+ & checkRelPath
+ )
+ >act_beg_path %act_end_path
+ ;
+
+ fpath_rootless =
+ (
+ ( segment_nz sep_segment* )
+ )
+ >act_beg_path %act_end_path
+ ;
+
+ #================================================
+ # Query and fragment
+ # query = *( pchar / "/" / "?" )
+ # fragment = *( pchar / "/" / "?" )
+ #================================================
+
+ # MOD: fragment allows '#' characters
+
+ fquery = (uric )** >{ BEG(fpc, Query) } %{ END(fpc, Query) };
+ ffrag = (uric | "#")** >{ BEG(fpc, Frag) } %{ END(fpc, Frag) };
+ query_frag = ("?" fquery)? ("#" ffrag)? ;
+
+
+ #================================================
+ # final ABNFs
+ # URI-reference = URI / relative-ref
+ #================================================
+ # URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+ # hier-part = "//" authority path-abempty
+ # / path-absolute
+ # / path-rootless
+ # / path-empty
+ # relative-ref = relative-part [ "?" query ] [ "#" fragment ]
+ # relative-part = "//" authority path-abempty
+ # / path-absolute
+ # / path-noscheme
+ # / path-empty
+
+ net_path = "//" authority fpath_abempty;
+
+ URI =
+ fscheme ":"
+ (
+ net_path
+ | fpath_absolute
+ | fpath_rootless
+ | fpath_empty
+ )
+ $^act_clr_scheme
+ query_frag
+ ;
+
+ relative_ref =
+ (
+ net_path
+ | fpath_absolute
+ | fpath_noscheme
+ | fpath_empty
+ )
+ %act_clr_scheme
+ query_frag
+ ;
+
+ # non-standard definitions
+
+ URI_no_rootless =
+ fscheme ":"
+ (
+ net_path
+ | fpath_absolute
+ | fpath_empty
+ )
+ $^act_clr_scheme
+ query_frag
+ ;
+
+ host_path =
+ (
+ fhost_nempty fpath_abempty
+ | (fhost_nempty - scheme) ":" fport fpath_abempty
+ )
+ @^act_clr_host
+ ;
+
+ # no userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
+ relative_ref_host_pabem =
+ (
+ net_path
+ | host_path
+ | fpath_absolute
+ | fpath_relative
+ | fpath_empty
+ )
+ %act_clr_scheme
+ query_frag
+ ;
+
+ # port must be non-empty, to avoid clash with "scheme:/..."
+ auth_path =
+ (
+ fhost_nempty ( ":" fport )? fpath_abempty
+ | userinfo fhost ( ":" fport? )? fpath_abempty
+ )
+ @^act_clr_host
+ @^act_clr_user
+ ;
+
+ # userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
+ relative_ref_auth_pabem =
+ (
+ net_path
+ | auth_path
+ | fpath_absolute
+ | fpath_relative
+ | fpath_empty
+ )
+ %act_clr_scheme
+ query_frag
+ ;
+
+
+ # machine instantiations
+
+ URI_ref_no_rootless :=
+ (
+ URI_no_rootless
+ # scheme://user@host preferred over user://pass@host/path
+ | relative_ref_auth_pabem
+ )
+ ;
+
+ URI_ref_no_relpath :=
+ (
+ relative_ref_host_pabem
+ # host:port/path preferred over scheme:path/rootless
+ | (URI - relative_ref_host_pabem)
+ )
+ ;
+
+ URI_ref :=
+ (
+ relative_ref
+ | URI
+ )
+ ;
+
+ write data;
+
+}%%
+
+namespace NUri {
+
+ // Runs the Ragel-generated machine over [str_beg, str_beg + length).
+ // The entry point of the machine depends on FeatureNoRelPath and
+ // FeatureAllowRootless. Returns true if the machine stopped in a final state.
+ bool TParser::doParse(const char* str_beg, size_t length)
+ {
+ // p, pe, eof and cs are the variables the code generated by Ragel operates on
+ const char* p = str_beg;
+ const char* pe = str_beg + length;
+ const char* eof = pe;
+ int cs;
+
+ // shortcuts used by the actions of the grammar above
+ #define BEG(ptr, fld) startSection (ptr, TField::Field ## fld);
+ #define END(ptr, fld) finishSection(ptr, TField::Field ## fld);
+ #define SET(val, fld) storeSection(val, TField::Field ## fld);
+ #define CLR(ptr, fld) ResetSection (TField::Field ## fld, ptr);
+ #define REQ(ptr, req) setRequirement(ptr, TFeature :: req);
+
+ // "nocs": the initial state is not set by the generated code, it is chosen below
+ %% write init nocs;
+
+ if (0 == (Flags & TFeature::FeatureNoRelPath)) {
+ cs = TParser_en_URI_ref;
+ } else if (0 == (Flags & TFeature::FeatureAllowRootless)) {
+ cs = TParser_en_URI_ref_no_rootless;
+ } else {
+ cs = TParser_en_URI_ref_no_relpath;
+ }
+
+ %% write exec;
+
+ #undef BEG
+ #undef END
+ #undef SET
+ #undef CLR
+ #undef REQ
+
+ return cs >= TParser_first_final;
+ }
+
+}
diff --git a/library/cpp/uri/qargs.cpp b/library/cpp/uri/qargs.cpp
new file mode 100644
index 00000000000..23058f81029
--- /dev/null
+++ b/library/cpp/uri/qargs.cpp
@@ -0,0 +1,279 @@
+#include "qargs.h"
+#include <string>
+
+namespace NUri {
+ namespace NOnStackArgsList {
+ // Node of a doubly linked list of query arguments.
+ // The string fields are views, the node does not own the text.
+ struct TQArgNode {
+ TQArgNode* Prev;
+ TQArgNode* Next;
+
+ TStringBuf Name; // text before '='
+ TStringBuf Value; // text after '=', left default-constructed when there is no '='
+ TStringBuf All; // the whole argument as found in the query
+ };
+
+ // Creates an empty node which refers to prev as its predecessor;
+ // prev itself is not modified, the node has no successor yet.
+ TQArgNode MakeArg(TQArgNode* prev) {
+     TQArgNode node = {prev, nullptr, {}, {}, {}};
+     return node;
+ }
+
+ // Returns the first position in [str, end) which does not hold '&', or end.
+ const char* SkipDelimiter(const char* str, const char* end) {
+     while (str != end && '&' == *str)
+         ++str;
+     return str;
+ }
+
+ /// Extracts the query argument which starts at pos; nothing at or past end is read.
+ /// Fills arg->Name and arg->All; arg->Value is filled only if the argument contains '='.
+ /// Returns the position right after the argument: the nearest '&' or end.
+ /// The current implementation never returns 0, although callers check for it.
+ const char* ExtractArgData(const char* pos, const char* end, TQArgNode* arg) {
+     // The search must be bounded by end: the query is a slice of a larger buffer,
+     // so an unbounded strchr could pick up '&' or '=' located after the query.
+     const char* const nextArg = static_cast<const char*>(memchr(pos, '&', end - pos));
+     const char* const argEnd = nextArg ? nextArg : end;
+     // '=' is meaningful only if it belongs to this argument, i.e. precedes argEnd
+     const char* const valueStart = static_cast<const char*>(memchr(pos, '=', argEnd - pos));
+
+     if (valueStart) { // a=1 or a=
+         arg->Name = TStringBuf(pos, valueStart - pos);
+         arg->Value = TStringBuf(valueStart + 1, argEnd - valueStart - 1);
+         arg->All = TStringBuf(pos, argEnd - pos);
+     } else { // a
+         arg->Name = TStringBuf(pos, argEnd - pos);
+         arg->All = arg->Name;
+     }
+     return argEnd;
+ }
+
+ // Follows the Prev links and returns the first node of the list;
+ // returns null for a null arg.
+ TQArgNode* GetHead(TQArgNode* arg) {
+     if (arg) {
+         while (arg->Prev)
+             arg = arg->Prev;
+     }
+     return arg;
+ }
+
+ // Follows the Next links and returns the last node of the list;
+ // returns null for a null arg.
+ TQArgNode* GetLast(TQArgNode* arg) {
+     if (arg) {
+         while (arg->Next)
+             arg = arg->Next;
+     }
+     return arg;
+ }
+
+ // Three-way comparison of the argument names (result of TStringBuf::compare).
+ int CompareName(const TQArgNode* l, const TQArgNode* r) {
+ return l->Name.compare(r->Name);
+ }
+
+ // Relinks node so that it is placed immediately before the node `before`,
+ // and closes the gap at its old position. Returns node.
+ TQArgNode* Move(TQArgNode* before, TQArgNode* node) {
+ // remember the old neighbours, they are linked to each other at the end
+ TQArgNode* tn = node->Next;
+ TQArgNode* tp = node->Prev;
+
+ node->Prev = before->Prev;
+ if (node->Prev)
+ node->Prev->Next = node;
+
+ node->Next = before;
+ before->Prev = node;
+
+ if (tn)
+ tn->Prev = tp;
+ if (tp)
+ tp->Next = tn;
+
+ return node;
+ }
+
+ // Sorts the nodes from iter to last (both inclusive) by name, relinking them in place.
+ // The first node of the range is used as the pivot: nodes with smaller or equal
+ // names are moved in front of it, nodes with greater names stay behind it;
+ // then both parts are sorted recursively.
+ // return new head
+ TQArgNode* QSortByName(TQArgNode* iter, TQArgNode* last) {
+ if (iter == last)
+ return iter;
+ if (iter->Next == last) {
+ // two nodes: swap them if they are out of order
+ int c = CompareName(iter, last);
+ return c <= 0 ? iter : Move(iter, last);
+ } else {
+ TQArgNode* pivot = iter;
+ iter = iter->Next;
+ TQArgNode* head = 0; // first node of the "smaller" part, if any
+ TQArgNode* tail = 0; // last seen node of the "greater" part, if any
+ TQArgNode* tailPartitionStart = pivot; // the "greater" part starts right after the original pivot
+ while (true) {
+ // iter may be relinked below, so its successor is saved first
+ TQArgNode* next = iter->Next;
+ int c = CompareName(iter, pivot);
+ int sign = (0 < c) - (c < 0);
+ switch (sign) {
+ case -1:
+ head = head ? Move(head, iter) : Move(pivot, iter);
+ break;
+
+ case 0:
+ // an equal node is put in front of the pivot and becomes the new pivot
+ pivot = Move(pivot, iter);
+ break;
+
+ case 1:
+ tail = iter;
+ break;
+ }
+
+ if (iter == last)
+ break;
+ iter = next;
+ }
+
+ if (head)
+ head = QSortByName(head, pivot->Prev);
+ if (tail)
+ QSortByName(tailPartitionStart->Next, tail);
+ return head ? head : pivot;
+ }
+ }
+ }
+
+ using namespace NOnStackArgsList;
+
+ // One run of query argument processing over a single uri: splits the query
+ // into arguments, optionally filters and sorts them and, if anything has
+ // changed, writes the new query back to the uri.
+ class TQueryArgProcessing::Pipeline {
+ public:
+ Pipeline(TQueryArgProcessing& parent, TUri& subject)
+ : Parent(parent)
+ , Subject(subject)
+ , ArgsCount(0)
+ , IsDirty(false)
+ {
+ }
+
+ // Entry point: processes the query field of the subject uri.
+ TQueryArg::EProcessed Process() {
+ const TStringBuf& query = Subject.GetField(NUri::TField::FieldQuery);
+ if (query.empty())
+ return ProcessEmpty();
+
+ const char* start = query.data();
+ return Parse(start, start + query.length(), 0);
+ }
+
+ // Empty query: the field is cleared if FeatureRemoveEmptyQuery is set.
+ TQueryArg::EProcessed ProcessEmpty() {
+ if (Parent.Flags & TQueryArg::FeatureRemoveEmptyQuery)
+ Subject.FldClr(NUri::TField::FieldQuery);
+
+ return TQueryArg::ProcessedOK;
+ }
+
+ // Parses one argument per call and recurses for the rest of the query.
+ // The list nodes are local variables of the recursive calls, so the list is
+ // valid only until the recursion unwinds; that is why FinalizeParsing is
+ // invoked from the innermost call. prev is the last node kept so far (may be null).
+ TQueryArg::EProcessed Parse(const char* str, const char* end, TQArgNode* prev) {
+ str = SkipDelimiter(str, end);
+
+ if (str == end) {
+ TQArgNode* head = GetHead(prev);
+ TQArgNode* last = GetLast(prev);
+ return FinalizeParsing(head, last);
+ } else {
+ TQArgNode current = MakeArg(prev);
+ const char* next = ExtractArgData(str, end, &current);
+ if (!next)
+ return TQueryArg::ProcessedMalformed;
+
+ TQArgNode* tail = ApplyFilter(prev, &current);
+
+ // every parsed argument is counted, including the ones rejected by the filter
+ if (++ArgsCount > MaxCount)
+ return TQueryArg::ProcessedTooMany;
+
+ return Parse(next, end, tail);
+ }
+ }
+
+ // Appends current to the list unless the filter rejects it.
+ // Returns the new last node of the list.
+ TQArgNode* ApplyFilter(TQArgNode* prev, TQArgNode* current) {
+ if (Parent.Flags & TQueryArg::FeatureFilter) {
+ TQueryArg arg = {current->Name, current->Value};
+ if (!Parent.Filter(arg, Parent.FilterData)) {
+ IsDirty = true;
+ return prev;
+ }
+ }
+
+ if (prev)
+ prev->Next = current;
+ return current;
+ }
+
+ // Sorts the list if requested and renders it back if it has been changed.
+ TQueryArg::EProcessed FinalizeParsing(TQArgNode* head, TQArgNode* last) {
+ if (Parent.Flags & TQueryArg::FeatureSortByName) {
+ head = QSortByName(head, last);
+ // the query is re-rendered after sorting regardless of whether the order has changed
+ IsDirty = true;
+ }
+
+ if (!IsDirty)
+ return TQueryArg::ProcessedOK;
+
+ bool dirty = Render(head);
+
+ bool rewrite = Parent.Flags & TQueryArg::FeatureRewriteDirty;
+ if (dirty && rewrite)
+ Subject.Rewrite();
+ return (!dirty || rewrite) ? TQueryArg::ProcessedOK : TQueryArg::ProcessedDirty;
+ }
+
+ // Joins the arguments of the list with '&' in Parent.Buffer and stores the
+ // result as the query of the subject. Returns the result of FldMemSet, or
+ // false if the rendered query is empty.
+ bool Render(TQArgNode* head) {
+ std::string& result = Parent.Buffer;
+ result.clear();
+ result.reserve(Subject.GetField(NUri::TField::FieldQuery).length());
+ bool first = true;
+ while (head) {
+ if (first)
+ first = false;
+ else
+ result.append("&");
+
+ result.append(head->All);
+ head = head->Next;
+ }
+
+ if (result.empty())
+ return RenderEmpty();
+ else
+ return Subject.FldMemSet(NUri::TField::FieldQuery, result);
+ }
+
+ // No arguments left: the query field is cleared if FeatureRemoveEmptyQuery is set.
+ bool RenderEmpty() {
+ if (Parent.Flags & TQueryArg::FeatureRemoveEmptyQuery)
+ Subject.FldClr(NUri::TField::FieldQuery);
+ return false;
+ }
+
+ private:
+ TQueryArgProcessing& Parent;
+ TUri& Subject;
+
+ unsigned ArgsCount; // number of arguments parsed so far
+ bool IsDirty; // set when the query has to be re-rendered
+
+ // upper bound for the number of arguments, which is also the recursion depth of Parse
+ static const unsigned MaxCount = 100;
+ };
+
+ // Stores the processing flags, the filter callback and the opaque pointer
+ // which is passed to the callback.
+ TQueryArgProcessing::TQueryArgProcessing(ui32 flags, TQueryArgFilter filter, void* filterData)
+ : Flags(flags)
+ , Filter(filter)
+ , FilterData(filterData)
+ {
+ }
+
+ // Processes the query of uri with a fresh Pipeline and returns its result.
+ TQueryArg::EProcessed TQueryArgProcessing::Process(TUri& uri) {
+ Pipeline pipeline(*this, uri);
+ return pipeline.Process();
+ }
+}
diff --git a/library/cpp/uri/qargs.h b/library/cpp/uri/qargs.h
new file mode 100644
index 00000000000..fcba7cbd0cc
--- /dev/null
+++ b/library/cpp/uri/qargs.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "common.h"
+#include "uri.h"
+#include <string>
+
+namespace NUri {
+    /// Processes the query arguments of a TUri as selected by TQueryArg
+    /// feature flags, optionally consulting a user-supplied filter callback.
+    class TQueryArgProcessing {
+    public:
+        /// @param flags      combination of TQueryArg feature flags
+        /// @param filter     optional callback consulted for each argument
+        /// @param filterData opaque pointer handed back to @p filter
+        TQueryArgProcessing(ui32 flags, TQueryArgFilter filter = nullptr, void* filterData = nullptr);
+
+        /// Processes the query of @p uri and reports the outcome.
+        TQueryArg::EProcessed Process(TUri& uri);
+
+    private:
+        class Pipeline; // implementation helper
+
+        ui32 Flags;
+        TQueryArgFilter Filter;
+        void* FilterData;
+
+        /// Storage for a re-rendered query string.
+        std::string Buffer;
+    };
+}
diff --git a/library/cpp/uri/uri-ru_ut.cpp b/library/cpp/uri/uri-ru_ut.cpp
new file mode 100644
index 00000000000..ec35a164d29
--- /dev/null
+++ b/library/cpp/uri/uri-ru_ut.cpp
@@ -0,0 +1,163 @@
+#include "uri_ut.h"
+#include <library/cpp/charset/recyr.hh>
+#include <library/cpp/html/entity/htmlentity.h>
+#include <util/system/maxlen.h>
+
+namespace NUri {
+ namespace {
+        // Recodes s from CODES_UTF8 to CODES_WIN.
+        TString AsWin1251(const TString& s) {
+            TString recoded = Recode(CODES_UTF8, CODES_WIN, s);
+            return recoded;
+        }
+        // Recodes s from CODES_UTF8 to CODES_KOI8.
+        TString AsKoi8(const TString& s) {
+            TString recoded = Recode(CODES_UTF8, CODES_KOI8, s);
+            return recoded;
+        }
+ }
+
+ Y_UNIT_TEST_SUITE(URLTestRU) {
+        Y_UNIT_TEST(test_httpURL2) { // Parse() status codes, PrintS() field selection and the encode/decode feature flags
+            TUri url;
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("g:h"), TState::ParsedBadScheme);
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("http:g"), TState::ParsedBadFormat);
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("/../g"), TState::ParsedBadPath);
+            const char* const UpCaseUrl = "http://www.TEST.Ru:80/InDex.html";
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse(UpCaseUrl), TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://www.TEST.Ru/InDex.html"); // port 80 is not printed for http
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse(UpCaseUrl, TFeature::FeaturesDefault | TFeature::FeatureToLower), TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://www.test.ru/InDex.html"); // host is lower-cased, path is not
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagScheme), "http:");
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagScheme | TField::FlagHost), "http://www.test.ru");
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHost), "www.test.ru");
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHost | TField::FlagPath), "www.test.ru/InDex.html");
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagQuery), "");
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("http://www.TEST.Ru:90/InDex.html"), TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHostPort | TField::FlagPath), "www.TEST.Ru:90/InDex.html");
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("www.ya.ru/index.html"), TState::ParsedOK); // no scheme: the whole string is a relative path
+            UNIT_ASSERT(!url.IsValidAbs());
+            UNIT_ASSERT(url.IsNull(TField::FlagHost));
+            UNIT_ASSERT(!url.IsNull(TField::FlagPath));
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagPath), "www.ya.ru/index.html");
+            // Default features: Windows-1251 input is printed back unchanged.
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse(AsWin1251("www.TEST.Ru/ФЕУФ\\'\".html?ФЕУФ\\'\"=ФЕУФ+\\'\"%10")), TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), AsWin1251("www.TEST.Ru/ФЕУФ\\'\".html?ФЕУФ\\'\"=ФЕУФ+\\'\"%10"));
+            // FeatureEncodeExtendedASCII: the non-ASCII bytes come back percent-encoded.
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse(AsWin1251("www.TEST.Ru/ФЕУФ\\'\".html?ФЕУФ\\'\"=ФЕУФ+\\'\"%10"),
+                                               TFeature::FeaturesDefault | TFeature::FeatureEncodeExtendedASCII),
+                                     TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(),
+                                     AsWin1251("www.TEST.Ru/%D4%C5%D3%D4\\'\".html?%D4%C5%D3%D4\\'\"=%D4%C5%D3%D4+\\'\"%10"));
+            // FeatureEncodeForSQL: backslash and both quote characters come back percent-encoded.
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse(AsWin1251("www.TEST.Ru/ФЕУФ\\'\".html?ФЕУФ\\'\"=ФЕУФ+\\'\"%10"),
+                                               TFeature::FeaturesDefault | TFeature::FeatureEncodeForSQL),
+                                     TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), AsWin1251("www.TEST.Ru/ФЕУФ%5C%27%22.html?ФЕУФ%5C%27%22=ФЕУФ+%5C%27%22%10"));
+            // FeatureDecodeStandard: %33, %2f and %30 are decoded, %2b is upper-cased, the rest is kept.
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("q/%33%26%13%2f%2b%30%20",
+                                               TFeature::FeaturesDefault | TFeature::FeatureDecodeStandard),
+                                     TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "q/3%26%13/%2B0%20");
+            // Braces in the path and scheme-relative URLs are accepted.
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("http://www.prime-tass.ru/news/0/{656F5BAE-6677-4762-9BED-9E3B77E72055}.uif"),
+                                     TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("//server/path"), TState::ParsedOK);
+            UNIT_ASSERT_VALUES_EQUAL(url.Parse("//server/path", TFeature::FeaturesRobot), TState::ParsedOK);
+        }
+
+        const TString links[] = { // flat list of (input, expected decoded link) pairs, closed by an empty sentinel
+            "viewforum.php?f=1&amp;sid=b4568481b67b1d7683bea78634b2e240", "viewforum.php?f=1&sid=b4568481b67b1d7683bea78634b2e240",
+            "./viewtopic.php?p=74&amp;sid=6#p74", "./viewtopic.php?p=74&sid=6#p74",
+            "viewtopic.php?p=9313&amp;sid=8#9313", "viewtopic.php?p=9313&sid=8#9313",
+            "profile.php?mode=viewprofile&u=-1#drafts&amp;sid=a6e5989cee27adb5996bfff044af04ca", "profile.php?mode=viewprofile&u=-1#drafts&sid=a6e5989cee27adb5996bfff044af04ca",
+            // Line breaks inside a link are expected back as %0A / %0D%0A.
+            "images\nil.jpg", "images%0Ail.jpg",
+            "http://caedebaturque.termez.su\r\n/?article=218", "http://caedebaturque.termez.su%0D%0A/?article=218",
+            // Non-ASCII bytes and spaces are expected back percent-encoded.
+            AsKoi8("javascript:window.external.AddFavorite(\'http://www.humor.look.ru/\',\'Злобные Деды Морозы!!!\')"), "javascript:window.external.AddFavorite(\'http://www.humor.look.ru/\',\'%FA%CC%CF%C2%CE%D9%C5%20%E4%C5%C4%D9%20%ED%CF%D2%CF%DA%D9!!!\')",
+            "search.php?search_author=%CB%FE%E4%EC%E8%EB%E0+%C3%F3%F1%E5%E2%E0&amp;showresults=posts&amp;sid=8", "search.php?search_author=%CB%FE%E4%EC%E8%EB%E0+%C3%F3%F1%E5%E2%E0&showresults=posts&sid=8",
+            AsWin1251("/Search/author/?q=Штрибель Х.В."), "/Search/author/?q=%D8%F2%F0%E8%E1%E5%EB%FC%20%D5.%C2.",
+            AsWin1251("javascript:ins(\'ГОРШОК\')"), "javascript:ins(\'%C3%CE%D0%D8%CE%CA\')",
+            AsWin1251("?l=я"), "?l=%FF",
+            AsWin1251("content.php?id=3392&theme=Цена"), "content.php?id=3392&theme=%D6%E5%ED%E0",
+            "/a-mp3/stype-1/?search=А", "/a-mp3/stype-1/?search=%D0%90",
+            "/a-mp3/stype-1/?search=Б", "/a-mp3/stype-1/?search=%D0%91",
+            "/a-mp3/stype-1/?search=В", "/a-mp3/stype-1/?search=%D0%92",
+            "/a-mp3/stype-1/?search=Г", "/a-mp3/stype-1/?search=%D0%93",
+            "/a-mp3/stype-1/?search=Д", "/a-mp3/stype-1/?search=%D0%94",
+            "/a-mp3/stype-1/?search=Е", "/a-mp3/stype-1/?search=%D0%95",
+            "/a-mp3/stype-1/?search=Ж", "/a-mp3/stype-1/?search=%D0%96",
+            "/a-mp3/stype-1/?search=З", "/a-mp3/stype-1/?search=%D0%97",
+            // %98 is not defined in CP1251, so this entry is spelled with byte escapes instead of a literal character
+            "/a-mp3/stype-1/?search=\xD0\x98", "/a-mp3/stype-1/?search=%D0%98",
+            "/a-mp3/stype-1/?search=Й", "/a-mp3/stype-1/?search=%D0%99",
+            "/a-mp3/stype-1/?search=К", "/a-mp3/stype-1/?search=%D0%9A",
+            "/a-mp3/stype-1/?search=Л", "/a-mp3/stype-1/?search=%D0%9B",
+            "/a-mp3/stype-1/?search=М", "/a-mp3/stype-1/?search=%D0%9C",
+            "/a-mp3/stype-1/?search=Н", "/a-mp3/stype-1/?search=%D0%9D",
+            "/a-mp3/stype-1/?search=О", "/a-mp3/stype-1/?search=%D0%9E",
+            "/a-mp3/stype-1/?search=П", "/a-mp3/stype-1/?search=%D0%9F",
+            "/a-mp3/stype-1/?search=\xD0", "/a-mp3/stype-1/?search=%D0",
+            "/a-mp3/stype-1/?search=С", "/a-mp3/stype-1/?search=%D0%A1",
+            "/a-mp3/stype-1/?search=Т", "/a-mp3/stype-1/?search=%D0%A2",
+            "/a-mp3/stype-1/?search=У", "/a-mp3/stype-1/?search=%D0%A3",
+            "/a-mp3/stype-1/?search=Ф", "/a-mp3/stype-1/?search=%D0%A4",
+            "/a-mp3/stype-1/?search=Х", "/a-mp3/stype-1/?search=%D0%A5",
+            "/a-mp3/stype-1/?search=Ц", "/a-mp3/stype-1/?search=%D0%A6",
+            "/a-mp3/stype-1/?search=Ч", "/a-mp3/stype-1/?search=%D0%A7",
+            "/a-mp3/stype-1/?search=Ш", "/a-mp3/stype-1/?search=%D0%A8",
+            "/a-mp3/stype-1/?search=Щ", "/a-mp3/stype-1/?search=%D0%A9",
+            "/a-mp3/stype-1/?search=Ы", "/a-mp3/stype-1/?search=%D0%AB",
+            "/a-mp3/stype-1/?search=Э", "/a-mp3/stype-1/?search=%D0%AD",
+            "/a-mp3/stype-1/?search=Ю", "/a-mp3/stype-1/?search=%D0%AE",
+            "/a-mp3/stype-1/?search=Я", "/a-mp3/stype-1/?search=%D0%AF",
+            // Numeric and named HTML entities are expected back resolved.
+            "javascript:emoticon(\":&#39;(\")", "javascript:emoticon(\":\'(\")",
+            "javascript:emoticon(\'&gt;:o\')", "javascript:emoticon(\'>:o\')",
+            "javascript:emoticon(\']:-&gt;\')", "javascript:emoticon(\']:->\')",
+            "javascript:emoticon(\':-&#33;\')", "javascript:emoticon(\':-!\')",
+            "javascript:emoticon(\'@}-&gt;--\')", "javascript:emoticon(\'@}->--\')",
+            "http&#58;//www.is-ufa.ru/price2/price_IS.rar", "http://www.is-ufa.ru/price2/price_IS.rar",
+            "&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#105;&#110;&#102;&#111;&#64;&#101;&#116;&#101;&#109;&#46;&#100;&#101;", "mailto:info@etem.de",
+            "&quot;http://www.fubix.ru&quot;", "\"http://www.fubix.ru\"",
+            AsWin1251("mailto:&#107;&#97;&#109;&#112;&#97;&#64;&#117;&#107;&#114;&#46;&#110;&#101;&#116;?subject=Арабский язык"), "mailto:kampa@ukr.net?subject=%C0%F0%E0%E1%F1%EA%E8%E9%20%FF%E7%FB%EA",
+            {}}; // empty sentinel string
+
+        Y_UNIT_TEST(testHtLinkDecode) {
+            // links holds (input, expected) pairs; the walk stops at the
+            // first entry that tests false.
+            char decoded[URL_MAXLEN + 10];
+            int idx = 0;
+            while (links[idx]) {
+                const TString& input = links[idx];
+                const TString& expected = links[idx + 1];
+                UNIT_ASSERT(HtLinkDecode(input.c_str(), decoded, sizeof(decoded)));
+                UNIT_ASSERT_VALUES_EQUAL(decoded, expected);
+                idx += 2;
+            }
+        }
+
+        Y_UNIT_TEST(testRuIDNA) { // percent-encoded Windows-1251 IDN host: decoded host, punycode host and both printed forms
+            {
+#define DEC "\xD7\xE5\xF0\xE5\xEf\xEE\xE2\xE5\xF6.\xF0\xF4" /* "Череповец.рф" in Windows-1251 */
+#define ENC "%D7%E5%F0%E5%EF%EE%E2%E5%F6.%F0%F4"
+// punycode corresponds to the lowercase form of the host
+#define PNC "xn--b1afab7bff7cb.xn--p1ai"
+                TTest test = {
+                    "http://" ENC "/" ENC "?" ENC "#" ENC, TParseFlags(TFeature::FeaturesAll | TFeature::FeatureAllowHostIDN, TFeature::FeatureDecodeExtendedASCII), TState::ParsedOK, "http", "", "", DEC, 80, "/" ENC, ENC, ENC};
+                TUri url;
+                URL_TEST_ENC(url, test, CODES_WIN);
+                UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::FieldHostAscii), PNC);
+                UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" DEC "/" ENC "?" ENC "#" ENC); // only the host is printed decoded
+                UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHostAscii), "http://" PNC "/" ENC "?" ENC "#" ENC);
+#undef PNC
+#undef DEC
+#undef ENC
+            }
+        }
+
+ // Regression test for SEARCH-11283
+        Y_UNIT_TEST(RegressionTest11283) {
+            const TStringBuf url = "http://xn--n1aaa.пидорасы.com/";
+            const NUri::TParseFlags flags(NUri::TFeature::FeaturesRobot | NUri::TFeature::FeatureNoRelPath);
+
+            TUri uri;
+            const TState::EParsed er = uri.Parse(url, flags);
+            UNIT_ASSERT_VALUES_EQUAL(er, TState::ParsedOK);
+
+            // The host view must be followed by a NUL right after its last byte.
+            const TStringBuf host = uri.GetHost();
+            UNIT_ASSERT_VALUES_EQUAL(host.size(), strlen(host.data()));
+        }
+ }
+
+}
diff --git a/library/cpp/uri/uri.cpp b/library/cpp/uri/uri.cpp
new file mode 100644
index 00000000000..1664e8c8ddf
--- /dev/null
+++ b/library/cpp/uri/uri.cpp
@@ -0,0 +1,623 @@
+#include "uri.h"
+#include "parse.h"
+
+#include <util/string/cast.h>
+#include <util/string/util.h>
+#include <util/system/maxlen.h>
+#include <util/system/yassert.h>
+#include <util/generic/map.h>
+
+namespace NUri {
+    /// Validates the label structure of @p host.
+    ///
+    /// Rules enforced by the loop below:
+    ///  - a label starts with an alphanumeric character or '_';
+    ///  - inside a label only alphanumerics, '-' and '_' are accepted;
+    ///  - the character right before a '.' must be alphanumeric (the final
+    ///    label is not followed by a '.', so this rule is not applied to it);
+    ///  - a label containing '_' must have at least two more labels after it.
+    ///
+    /// @return ParsedOK for an empty or acceptable host, ParsedBadHost otherwise
+    TState::EParsed TUri::CheckHost(const TStringBuf& host) {
+        if (host.empty())
+            return ParsedOK;
+
+        unsigned labelCount = 0;      // labels started so far
+        unsigned underscoreLabel = 0; // 1-based index of the last label holding '_'
+        bool lastWasAlnum = false;
+        bool atLabelStart = true;
+
+        for (const char ch : host) {
+            if (ch == '.') {
+                // empty label, or the previous label does not end in alnum
+                if (atLabelStart || !lastWasAlnum)
+                    return ParsedBadHost;
+                atLabelStart = true;
+                continue;
+            }
+
+            lastWasAlnum = isalnum(static_cast<unsigned char>(ch));
+            const bool isUnderscore = ('_' == ch);
+
+            if (atLabelStart) {
+                if (!lastWasAlnum && !isUnderscore)
+                    return ParsedBadHost;
+                atLabelStart = false;
+                ++labelCount;
+            } else if (!lastWasAlnum && '-' != ch && !isUnderscore) {
+                return ParsedBadHost;
+            }
+
+            // non-standard case we allow for certain hosts
+            if (isUnderscore)
+                underscoreLabel = labelCount;
+        }
+
+        if (underscoreLabel > 0 && labelCount < underscoreLabel + 2)
+            return ParsedBadHost;
+
+        return ParsedOK;
+    }
+
+ /********************************************************/
+    /// Builds a URI from already separated components and rewrites it into
+    /// the object's own buffer.
+    /// @param port        0 means "no port field"
+    /// @param defaultPort when positive, replaces the default port left by
+    ///                    the scheme setup
+    TUri::TUri(const TStringBuf& host, ui16 port, const TStringBuf& path, const TStringBuf& query, const TStringBuf& scheme, unsigned defaultPort)
+        : FieldsSet(0)
+        , Port(port)
+        , DefaultPort(0)
+        , Scheme(SchemeEmpty)
+        , FieldsDirty(0)
+    {
+        // Keep the caller's spelling when the scheme lookup has no string of its own.
+        if (!scheme.empty() && SetSchemeImpl(TSchemeInfo::Get(scheme)).Str.empty())
+            FldSet(FieldScheme, scheme);
+
+        if (defaultPort > 0) // override the scheme's default port
+            DefaultPort = static_cast<ui16>(defaultPort);
+
+        // The textual port lives on the stack; the Rewrite() at the end
+        // copies every dirty field into Buffer before this goes out of scope.
+        char portText[6]; // enough for ui16
+        if (port != 0) {
+            const size_t portLen = ToString(port, portText, sizeof(portText));
+            FldSet(FieldPort, TStringBuf(portText, portLen));
+        }
+
+        FldTrySet(FieldHost, host);
+        FldTrySet(FieldPath, path);
+        FldTrySet(FieldQuery, query);
+
+        Rewrite();
+    }
+
+ /********************************************************/
+    /// Common implementation behind the FldMem* setters.
+    /// @param strconst @p value outlives this object: it is referenced
+    ///                 directly and the field is marked clean
+    /// @param nocopy   reference @p value directly and mark the field dirty
+    /// @return true only when the field was left dirty, i.e. referring to
+    ///         @p value until the next rewrite
+    bool TUri::FldSetImpl(
+        EField field, TStringBuf value, bool strconst, bool nocopy) {
+        if (!FldIsValid(field))
+            return false;
+
+        // Two fields have derived state that must follow the text.
+        if (FieldScheme == field) {
+            if (!SetScheme(TSchemeInfo::Get(value)).Str.empty())
+                return false;
+        } else if (FieldPort == field) {
+            Port = value.empty() ? 0 : FromString<ui16>(value);
+        }
+
+        // An uninitialized value means "remove the field".
+        if (!value.IsInited()) {
+            FldClr(field);
+            return false;
+        }
+
+        // string constants don't need to be saved in the buffer
+        if (strconst) {
+            FldMarkClean(field);
+            FldSetNoDirty(field, value);
+            return false;
+        }
+
+        if (!nocopy)
+            return FldTryCpy(field, value);
+
+        FldSet(field, value);
+        return true;
+    }
+
+ /********************************************************/
+    /// Stores @p value in @p field, copying it over the old value in place
+    /// when possible.
+    ///
+    /// The in-place copy happens only when the field is clean, already set,
+    /// at least as long as @p value, and stored inside Buffer. In that case
+    /// the terminating NUL is rewritten and the field is shortened.
+    /// Otherwise the field refers to @p value directly and is marked dirty.
+    ///
+    /// @return false when the value was copied in place, true when the field
+    ///         now references @p value
+    bool TUri::FldTryCpy(EField field, const TStringBuf& value) {
+        if (!FldIsDirty(field)) {
+            if (FldIsSet(field)) {
+                TStringBuf& fld = Fields[field];
+                // Written through only after IsInBuffer() confirms the
+                // storage is this object's own buffer.
+                char* const dst = const_cast<char*>(fld.data());
+                if (value.length() <= fld.length() && IsInBuffer(dst)) {
+                    // value may be a view into the field being replaced
+                    // (e.g. a suffix of it), so the ranges can overlap.
+                    memmove(dst, value.data(), value.length());
+                    // RewriteImpl() reserves one byte after every field, so
+                    // this stays inside the slot of the old value.
+                    dst[value.length()] = 0;
+                    fld.Trunc(value.length());
+                    return false;
+                }
+            }
+
+            FldMarkDirty(field);
+        }
+
+        FldSetNoDirty(field, value);
+        return true;
+    }
+
+ /********************************************************/
+    /// Copies every set field into a freshly sized buffer, each followed by
+    /// a NUL, repoints the fields at the copies and clears all dirty marks.
+    void TUri::RewriteImpl() {
+        // Each set field needs its characters plus the terminator.
+        size_t total = 0;
+        for (int idx = 0; idx < FieldAllMAX; ++idx) {
+            const EField fld = EField(idx);
+            if (FldIsSet(fld))
+                total += Fields[fld].length() + 1;
+        }
+
+        if (0 == total) {
+            Buffer.Clear();
+        } else {
+            // Fields may still point into the old Buffer, so the copy goes
+            // to separate storage that replaces Buffer only at the end.
+            TBuffer fresh;
+            fresh.Resize(total);
+            TMemoryWriteBuffer writer(fresh.data(), fresh.size());
+            for (int idx = 0; idx < FieldAllMAX; ++idx) {
+                const EField fld = EField(idx);
+                if (FldIsSet(fld)) {
+                    const char* const start = writer.Buf();
+                    const size_t fieldLen = Fields[fld].length();
+                    writer << Fields[fld];
+                    FldSetNoDirty(fld, TStringBuf(start, fieldLen));
+                    writer << '\0';
+                }
+            }
+            Buffer = std::move(fresh);
+        }
+
+        CheckMissingFields();
+
+        FieldsDirty = 0;
+    }
+
+    /// Supplies the path "/" when a host is set, no path is set and the
+    /// scheme lists the path among its required fields.
+    void TUri::CheckMissingFields() {
+        const bool hostWithoutPath = FldSetCmp(FlagPath | FlagHost, FlagHost);
+        if (!hostWithoutPath)
+            return;
+
+        if (GetSchemeInfo().FldReq & FlagPath)
+            FldSetNoDirty(FieldPath, TStringBuf("/"));
+    }
+
+ /********************************************************/
+    void TUri::Merge(const TUri& base, int correctAbs) { // fills in the parts this URI lacks from base; correctAbs is forwarded to PathOperation
+        if (base.Scheme == SchemeUnknown) // nothing is taken from a base with an unknown scheme
+            return;
+
+        if (!base.IsValidGlobal())
+            return;
+
+        const TStringBuf& selfscheme = GetField(FieldScheme);
+        // basescheme is present since IsValidGlobal() succeeded
+        const TStringBuf& basescheme = base.GetField(FieldScheme);
+        const bool noscheme = !selfscheme.IsInited();
+        if (!noscheme && !EqualNoCase(selfscheme, basescheme)) // an explicit scheme must match the base's, ignoring case
+            return;
+
+        const ui32 cleanFields = ~FieldsDirty; // fields that were clean on entry
+        do {
+            static constexpr TStringBuf rootPath = "/";
+
+            if (noscheme) {
+                if (!basescheme.empty()) {
+                    FldSetNoDirty(FieldScheme, basescheme);
+                    // check if it is canonical
+                    if (basescheme.data() != base.GetSchemeInfo().Str.data())
+                        FldMarkDirty(FieldScheme);
+                }
+                Scheme = base.Scheme;
+                DefaultPort = base.DefaultPort;
+            }
+
+            if (!IsNull(FlagHost))
+                break; // no merge
+
+            FldTrySet(FieldHost, base);
+            FldChkSet(FieldPort, base);
+            Port = base.Port;
+
+            if (noscheme && IsNull(FlagQuery) && IsNull(FlagPath)) // nothing of our own: take the base's query
+                FldTrySet(FieldQuery, base);
+
+            if (IsNull(FlagAuth) && !base.IsNull(FlagAuth)) {
+                FldChkSet(FieldUser, base);
+                FldChkSet(FieldPass, base);
+            }
+
+            if (IsValidAbs())
+                break;
+
+            TStringBuf p0 = base.GetField(FieldPath);
+            if (!p0.IsInited())
+                p0 = rootPath;
+
+            TStringBuf p1 = GetField(FieldPath);
+            if (!p1.IsInited()) { // no path of our own: reuse the base's
+                if (p0.data() != rootPath.data())
+                    FldSet(FieldPath, p0);
+                else
+                    FldSetNoDirty(FieldPath, rootPath);
+                break;
+            }
+            if (p1 && '/' == p1[0])
+                p1.Skip(1); // p0 will have one
+
+            bool pathop = true;
+
+            TTempBufOutput out(p0.length() + p1.length() + 4); // 4 extra bytes for the "/../" that may be inserted
+            out << p0;
+            if ('/' != p0.back()) // NOTE(review): assumes p0 is non-empty -- confirm an inited-but-empty base path cannot reach here
+                out << "/../";
+            else if (p1.empty() || '.' != p1[0]) // base ends in '/' and p1 has no leading '.': plain concatenation
+                pathop = false;
+            out << p1;
+
+            char* beg = out.Data();
+            char* end = beg + out.Filled();
+            if (pathop && !PathOperation(beg, end, correctAbs)) {
+                Clear();
+                break;
+            }
+
+            // Needs immediate forced rewrite because of TTempBuf
+            FldSetNoDirty(FieldPath, TStringBuf(beg, end));
+            RewriteImpl();
+        } while (false);
+
+        CheckMissingFields();
+
+        // rewrite only if borrowed fields from base
+        if (cleanFields & FieldsDirty)
+            RewriteImpl();
+    }
+
+ /********************************************************/
+    /// Parses @p link into this object, merges it with a base URL and
+    /// classifies the result relative to @p base.
+    /// @param codebase when non-empty, it is parsed and used as the merge
+    ///                 base instead of @p base
+    /// @return LinkIsBad when the link or the codebase cannot be used,
+    ///         LinkBadAbs when the merged result is not a valid absolute URL,
+    ///         LinkIsGlobal when host or port differ from @p base,
+    ///         LinkIsFragment when path and query equal those of @p base,
+    ///         LinkIsLocal otherwise
+    TUri::TLinkType TUri::Normalize(const TUri& base,
+                                    const TStringBuf& link, const TStringBuf& codebase, long careFlags, ECharset enc) {
+        if (ParseImpl(link, careFlags, 0, SchemeEmpty, enc) != ParsedOK)
+            return LinkIsBad;
+
+        const TStringBuf& host = GetHost();
+
+        if (codebase.empty()) {
+            // Base is already in this variable
+            // see SetProperty() for details
+            Merge(base);
+        } else {
+            // an explicit code base must itself be a valid absolute URL
+            TUri codebaseUrl;
+            const TState::EParsed parsed = codebaseUrl.ParseImpl(codebase, careFlags, 0, SchemeEmpty, enc);
+            if (ParsedOK != parsed || !codebaseUrl.IsValidAbs())
+                return LinkIsBad;
+            Merge(codebaseUrl);
+        }
+
+        // check result: must be correct absolute URL
+        if (!IsValidAbs())
+            return LinkBadAbs;
+
+        if (!host.empty()) {
+            // A different port, or a host that differs ignoring case, makes
+            // the link global. Host prefixes (win|www|koi|etc.) are not
+            // stripped before the comparison.
+            const bool sameServer = GetPort() == base.GetPort() && EqualNoCase(host, base.GetHost());
+            if (!sameServer)
+                return LinkIsGlobal;
+        }
+
+        // a link to the base document itself
+        if (0 == Compare(base, FlagPath | FlagQuery))
+            return LinkIsFragment;
+
+        return LinkIsLocal;
+    }
+
+ /********************************************************/
+
+    /// Estimates the buffer size needed to print the fields selected by
+    /// @p flags: 10 bytes of slack plus length + 1 per printed field.
+    /// User and password are counted at three bytes per character
+    /// (presumably the worst case of percent-encoding -- confirm).
+    size_t TUri::PrintSize(ui32 flags) const {
+        flags &= FieldsSet; // can't output what we don't have
+        if (flags & FlagHostAscii)
+            flags &= ~FlagHost; // don't want to print both of them
+
+        size_t total = 10;
+        ui32 bit = 1;
+        for (int idx = 0; bit <= flags && idx < FieldAllMAX; ++idx, bit <<= 1) {
+            if (0 == (bit & flags))
+                continue;
+
+            const TStringBuf& value = Fields[idx];
+            if (!value.IsInited())
+                continue;
+
+            const size_t perChar = (bit & FlagAuth) ? 3 : 1;
+            total += perChar * value.length() + 1;
+        }
+
+        return total;
+    }
+
+    /// Writes the fields selected by @p flags to @p out in URL order:
+    /// scheme, "//", user[:pass]@, host, :port, path, ?query, #fragment.
+    /// Fields that are not set are skipped.
+    /// @return @p out
+    IOutputStream& TUri::PrintImpl(IOutputStream& out, int flags) const {
+        const int requested = flags; // what the caller asked for
+        flags &= FieldsSet;          // can't print what we don't have
+        const bool asciiHost = flags & FlagHostAscii;
+        if (asciiHost)
+            flags |= FlagHost; // to make host checks simpler below
+
+        if (flags & FlagScheme) {
+            const TStringBuf& scheme = Fields[FieldScheme];
+            if (!scheme.empty())
+                out << scheme << ':';
+        }
+
+        TStringBuf host;
+        if (flags & FlagHost)
+            host = Fields[asciiHost ? FieldHostAscii : FieldHost];
+
+        // The port is printed only when it is non-zero and not the default.
+        TStringBuf port;
+        if ((flags & FlagPort) && 0 != Port && Port != DefaultPort)
+            port = Fields[FieldPort];
+
+        if (host) {
+            // "//" depends on the scheme having been requested, not on it
+            // being present.
+            if (requested & FlagScheme)
+                out << "//";
+
+            if (flags & FlagAuth) {
+                if (flags & FlagUser) {
+                    const TStringBuf& user = Fields[FieldUser];
+                    if (!user.empty())
+                        TEncoder::EncodeNotAlnum(out, user);
+                }
+
+                if (flags & FlagPass) {
+                    const TStringBuf& pass = Fields[FieldPass];
+                    if (pass.IsInited()) {
+                        out << ':';
+                        TEncoder::EncodeAll(out, pass);
+                    }
+                }
+
+                out << '@';
+            }
+
+            out << host;
+
+            if (port)
+                out << ':';
+        }
+        if (port)
+            out << port;
+
+        if (flags & FlagPath) {
+            TStringBuf path = Fields[FieldPath];
+            // for relative, empty path is not the same as missing
+            if (path.empty() && 0 == (flags & FlagHost))
+                path = TStringBuf(".");
+            out << path;
+        }
+
+        if (flags & FlagQuery) {
+            const TStringBuf& query = Fields[FieldQuery];
+            if (query.IsInited())
+                out << '?' << query;
+        }
+
+        if (flags & FlagFrag) {
+            const TStringBuf& frag = Fields[FieldFrag];
+            if (frag.IsInited())
+                out << '#' << frag;
+        }
+
+        return out;
+    }
+
+ /********************************************************/
+    /// Three-way comparison of one field of this URI with the same field of
+    /// @p url. Scheme and host are compared ignoring case, all other fields
+    /// exactly.
+    int TUri::CompareField(EField fld, const TUri& url) const {
+        const TStringBuf& mine = GetField(fld);
+        const TStringBuf& theirs = url.GetField(fld);
+
+        const bool ignoreCase = (FieldScheme == fld || FieldHost == fld);
+        if (ignoreCase)
+            return CompareNoCase(mine, theirs);
+        return mine.compare(theirs);
+    }
+
+ /********************************************************/
+    /// Three-way comparison with @p url restricted to the fields in @p flags.
+    /// Order of checks: numeric port (if requested), then the sets of
+    /// requested fields each side has, then the fields one by one.
+    /// @return 0 when equal, a non-zero value otherwise
+    int TUri::Compare(const TUri& url, int flags) const {
+        // the port is compared by value, not by its text
+        if (flags & FlagPort) {
+            const int portDiff = GetPort() - url.GetPort();
+            if (portDiff)
+                return portDiff;
+            flags &= ~FlagPort;
+        }
+
+        // both sides must have the same subset of the requested fields
+        const int theirs = flags & url.FieldsSet;
+        flags &= FieldsSet;
+        if (flags != theirs)
+            return flags - theirs;
+
+        for (int idx = 0; idx < FieldAllMAX; ++idx) {
+            const EField fld = EField(idx);
+            if (0 == (flags & FldFlag(fld)))
+                continue;
+
+            const int diff = CompareField(fld, url);
+            if (0 != diff)
+                return diff;
+        }
+
+        return 0;
+    }
+
+ /********************************************************/
+    bool TUri::PathOperation(char*& pathPtr, char*& pathEnd, int correctAbs) { // resolves "." and ".." segments of [pathPtr, pathEnd) in place, scanning from the end; false on failure
+        if (!pathPtr)
+            return false;
+        if (pathPtr == pathEnd) // empty path: nothing to do
+            return true;
+
+        if ((pathEnd - pathPtr) >= 2 && *(pathEnd - 2) == '/' && *(pathEnd - 1) == '.') { // a trailing "/." loses its '.'
+            --pathEnd;
+        }
+
+        char* p_wr = pathEnd; // write cursor, moves towards pathPtr
+        int upCount = 0; // ".." segments seen that have not yet cancelled a segment
+
+        char* p_prev = pathEnd;
+        Y_ASSERT(p_prev > pathPtr);
+        while (p_prev > pathPtr && *(p_prev - 1) == '/') // step back over the trailing run of '/'
+            p_prev--;
+
+        for (char* p_rd = p_prev; p_rd; p_rd = p_prev) { // p_rd: end of the segment under inspection
+            Y_ASSERT(p_rd == pathEnd || p_rd[0] == '/');
+            p_prev = nullptr;
+
+            char* p = p_rd;
+
+            if (p > pathPtr) {
+                for (p--; *p != '/'; p--) { // search backwards for the '/' that opens this segment
+                    if (p == pathPtr)
+                        break;
+                }
+                if (*p == '/') {
+                    p_prev = p++;
+                    if ((p_prev - pathPtr >= 6 && !strnicmp(p_prev - 6, "http://", 7)) ||
+                        (p_prev - pathPtr >= 7 && !strnicmp(p_prev - 7, "https://", 8))) { // an embedded "http://" / "https://" is not split at its double slash
+                        --p_prev;
+                        --p;
+                    } else {
+                        // skip a run of multiple '/' in front of the segment
+                        while (p_prev > pathPtr && *(p_prev - 1) == '/')
+                            p_prev--;
+                    }
+                }
+            }
+
+            Y_ASSERT(p_prev == nullptr || p_prev[0] == '/');
+            // p is the first character of the segment, i.e. the first non-'/' after p_prev
+
+            if (p == p_rd) {
+                //empty block:
+                if (p_prev) { //either tail:
+                    Y_ASSERT(p_rd == p_wr && *(p - 1) == '/');
+                    --p_wr;
+                    continue;
+                } else { //or head of abs path
+                    *(--p_wr) = '/';
+                    break;
+                }
+            }
+
+            if (p[0] == '.') {
+                if (p + 1 == p_rd) { // the segment is exactly "."
+                    if (correctAbs || p_prev > pathPtr || pathPtr[0] != '/')
+                        // ignore "./"
+                        continue;
+                } else {
+                    if ((p[1] == '.') && (p + 2 == p_rd)) { // the segment is exactly ".."
+                        // register "../" but not print
+                        upCount++;
+                        continue;
+                    }
+                }
+            }
+
+            if (upCount) {
+                //unregister "../" and not print
+                upCount--;
+                continue;
+            }
+
+            // print
+            Y_ASSERT(p < p_rd);
+            Y_ASSERT(!p_prev || *(p - 1) == '/');
+            if (p_wr == p_rd) { //just skip
+                p_wr = p;
+            } else { //copy
+                int l = p_rd - p + 1; // segment plus the '/' that follows it
+                p_wr -= l;
+                memmove(p_wr, p, l);
+            }
+        }
+
+        if (upCount) { // more ".." than segments to cancel
+            if (*pathPtr != '/') { // relative path: the surplus ".." are written back in front
+                if (pathEnd == p_wr && *(p_wr - 1) == '.') {
+                    Y_ASSERT(*(p_wr - 2) == '.');
+                    p_wr -= 2;
+                    upCount--;
+                }
+                for (; upCount > 0; upCount--) {
+                    *(--p_wr) = '/';
+                    *(--p_wr) = '.';
+                    *(--p_wr) = '.';
+                }
+            } else {
+                if (correctAbs > 0) // absolute path, caller asked to reject
+                    return false;
+                if (correctAbs == 0) {
+                    //Bad path but present in RFC:
+                    // "Similarly, parsers must avoid treating "." and ".."
+                    // as special when they are not complete components of
+                    // a relative path. "
+                    for (; upCount > 0; upCount--) {
+                        *(--p_wr) = '.';
+                        *(--p_wr) = '.';
+                        *(--p_wr) = '/';
+                    }
+                } else {
+                    upCount = false; // i.e. 0: with a negative correctAbs the surplus ".." are dropped
+                }
+            }
+        }
+
+        Y_ASSERT(p_wr >= pathPtr);
+
+        if (upCount)
+            return false;
+        pathPtr = p_wr; // the result occupies [p_wr, pathEnd)
+        return true;
+    }
+
+ /********************************************************/
+    /// Returns the enumerator name of @p t as a string literal, or "" (after
+    /// a debug assertion) for a value outside the enumeration.
+    const char* LinkTypeToString(const TUri::TLinkType& t) {
+        switch (t) {
+            case TUri::LinkIsBad: return "LinkIsBad";
+            case TUri::LinkBadAbs: return "LinkBadAbs";
+            case TUri::LinkIsFragment: return "LinkIsFragment";
+            case TUri::LinkIsLocal: return "LinkIsLocal";
+            case TUri::LinkIsGlobal: return "LinkIsGlobal";
+        }
+
+        // only reachable when t matches none of the cases above
+        Y_ASSERT(0);
+        return "";
+    }
+
+}
diff --git a/library/cpp/uri/uri.h b/library/cpp/uri/uri.h
new file mode 100644
index 00000000000..3b6c19fe4a8
--- /dev/null
+++ b/library/cpp/uri/uri.h
@@ -0,0 +1,626 @@
+#pragma once
+
+#include "common.h"
+#include "encode.h"
+
+#include <library/cpp/charset/doccodes.h>
+#include <util/generic/buffer.h>
+#include <util/generic/ptr.h>
+#include <util/generic/singleton.h>
+#include <util/generic/string.h>
+#include <util/memory/alloc.h>
+#include <util/stream/mem.h>
+#include <util/stream/output.h>
+#include <util/stream/str.h>
+#include <util/system/yassert.h>
+
+#include <cstdlib>
+
+namespace NUri {
+ /********************************************************/
+ class TUri
+ : public TFeature,
+ public TField,
+ public TScheme,
+ public TState {
+ public:
+        enum TLinkType { // classification produced by Normalize()
+            LinkIsBad, // the link or the supplied codebase could not be used
+            LinkBadAbs, // the merged result is not a valid absolute URL
+            LinkIsFragment, // path and query equal those of the base
+            LinkIsLocal, // neither global nor a link to the base itself
+            LinkIsGlobal // host or port differ from the base
+        };
+
+ private:
+ TBuffer Buffer;
+ TStringBuf Fields[FieldAllMAX];
+ ui32 FieldsSet;
+ ui16 Port;
+ ui16 DefaultPort;
+ TScheme::EKind Scheme;
+ /// contains fields out of buffer (and possibly not null-terminated)
+ ui32 FieldsDirty;
+
+ private:
+        /// Resizes Buffer to @p len bytes, discarding its old content first.
+        void Alloc(size_t len) {
+            Buffer.Clear(); // to prevent copy below
+            Buffer.Resize(len);
+        }
+        void Dealloc() { // discards the content of Buffer
+            Buffer.Clear();
+        }
+
+        /// Resets the field bookkeeping, port and scheme kind.
+        /// DefaultPort and Buffer are left as they are.
+        void ClearImpl() {
+            FieldsSet = 0;
+            FieldsDirty = 0;
+            Port = 0;
+            Scheme = SchemeEmpty;
+        }
+
+        /// Copies the scalar state of @p url; the field views themselves are
+        /// not touched.
+        void CopyData(const TUri& url) {
+            Scheme = url.Scheme;
+            Port = url.Port;
+            DefaultPort = url.DefaultPort;
+            FieldsSet = url.FieldsSet;
+            FieldsDirty = url.FieldsDirty;
+        }
+
+        /// Takes over the field views of @p url and immediately rewrites
+        /// them into this object's own Buffer.
+        void CopyImpl(const TUri& url) {
+            int idx = 0;
+            while (idx < FieldAllMAX) {
+                Fields[idx] = url.Fields[idx];
+                ++idx;
+            }
+
+            RewriteImpl();
+        }
+
+ private:
+        /// Bit mask with only the bit of @p fld set.
+        static ui32 FldFlag(EField fld) {
+            const ui32 bit = 1 << fld;
+            return bit;
+        }
+
+ public:
+        /// True when @p fld lies in [0, FieldAllMAX).
+        static bool FldIsValid(EField fld) {
+            return fld >= 0 && fld < FieldAllMAX;
+        }
+
+        /// True when, among the fields selected by @p chk, exactly those in
+        /// @p exp are set.
+        bool FldSetCmp(ui32 chk, ui32 exp) const {
+            const ui32 selected = FieldsSet & chk;
+            return selected == exp;
+        }
+
+        /// True when every field in @p chk is set.
+        bool FldSetCmp(ui32 chk) const {
+            return (FieldsSet & chk) == chk;
+        }
+
+        /// True when @p fld is marked as set.
+        bool FldIsSet(EField fld) const {
+            return 0 != (FieldsSet & FldFlag(fld));
+        }
+
+ private:
+        /// Marks @p fld as set.
+        void FldMarkSet(EField fld) {
+            FieldsSet = FieldsSet | FldFlag(fld);
+        }
+
+        /// Marks @p fld as not set.
+        void FldMarkUnset(EField fld) {
+            const ui32 keep = ~FldFlag(fld);
+            FieldsSet &= keep;
+        }
+
+        /// Points @p fld at @p value and marks it set without touching its
+        /// dirty bit. Use when the field is already known to be dirty or
+        /// RewriteImpl() is about to run.
+        void FldSetNoDirty(EField fld, const TStringBuf& value) {
+            FldMarkSet(fld);
+            Fields[fld] = value;
+        }
+
+        /// Points @p fld at @p value, marks it set and marks it dirty.
+        void FldSet(EField fld, const TStringBuf& value) {
+            Fields[fld] = value;
+            FldMarkSet(fld);
+            FldMarkDirty(fld);
+        }
+
+        const TStringBuf& FldGet(EField fld) const { // raw view of the field; does not consult FieldsSet
+            return Fields[fld];
+        }
+
+ private:
+        /// Sets @p fld to @p value when the value is initialized, clears the
+        /// field otherwise.
+        void FldChkSet(EField fld, const TStringBuf& value) {
+            if (!value.IsInited()) {
+                FldClr(fld);
+                return;
+            }
+            FldSet(fld, value);
+        }
+        /// Same, taking the value from the same field of @p other.
+        void FldChkSet(EField fld, const TUri& other) {
+            const TStringBuf& theirs = other.GetField(fld);
+            FldChkSet(fld, theirs);
+        }
+
+        /// Sets @p fld to @p value only when the value is initialized.
+        /// @return whether the field was set
+        bool FldTrySet(EField fld, const TStringBuf& value) {
+            if (!value.IsInited())
+                return false;
+            FldSet(fld, value);
+            return true;
+        }
+        /// Same, taking the value from the same field of @p other.
+        bool FldTrySet(EField fld, const TUri& other) {
+            const TStringBuf& theirs = other.GetField(fld);
+            return FldTrySet(fld, theirs);
+        }
+
+ private:
+ /// copies the value if it fits
+ bool FldTryCpy(EField fld, const TStringBuf& value);
+
+ // main method: sets the field value, possibly copies, etc.
+ bool FldSetImpl(EField fld, TStringBuf value, bool strconst = false, bool nocopy = false);
+
+ public: // clear a field
+        /// Empties @p fld and drops both its set and its dirty mark.
+        void FldClr(EField fld) {
+            FldMarkClean(fld);
+            FldMarkUnset(fld);
+            Fields[fld].Clear();
+        }
+
+        /// Clears @p field when it is set.
+        /// @return whether the field was set before the call
+        bool FldTryClr(EField field) {
+            if (!FldIsSet(field))
+                return false;
+            FldClr(field);
+            return true;
+        }
+
+ public: // set a field value: might leave state dirty and require a Rewrite()
+        /// Copies @p value over the old value when it fits and the field is
+        /// clean; otherwise references @p value and marks the field dirty.
+        bool FldMemCpy(EField field, const TStringBuf& value) {
+            const bool strconst = false;
+            const bool nocopy = false;
+            return FldSetImpl(field, value, strconst, nocopy);
+        }
+
+        /// References @p value directly and marks the field dirty.
+        /// @note client MUST guarantee value will be alive until Rewrite is called
+        bool FldMemSet(EField field, const TStringBuf& value) {
+            const bool strconst = false;
+            const bool nocopy = true;
+            return FldSetImpl(field, value, strconst, nocopy);
+        }
+
+        /// References @p value directly without marking the field dirty;
+        /// for values whose lifetime exceeds that of this object.
+        bool FldMemUse(EField field, const TStringBuf& value) {
+            const bool strconst = true;
+            const bool nocopy = false;
+            return FldSetImpl(field, value, strconst, nocopy);
+        }
+
+        /// Overload for character arrays such as string literals: the array
+        /// is referenced directly, without its last element (the terminator
+        /// of a literal), and the field is not marked dirty.
+        template <size_t size>
+        bool FldMemSet(EField field, const char (&value)[size]) {
+            static_assert(size > 0);
+            const TStringBuf text(value, size - 1);
+            return FldSetImpl(field, text, true);
+        }
+
+        /// Makes @p dst refer to the same value as @p src and gives it the
+        /// same dirty state.
+        /// @return false when @p src is not set or @p dst is not a valid field
+        bool FldDup(EField src, EField dst) {
+            if (!(FldIsSet(src) && FldIsValid(dst)))
+                return false;
+
+            const bool srcDirty = FldIsDirty(src);
+            FldSetNoDirty(dst, FldGet(src));
+            if (srcDirty)
+                FldMarkDirty(dst);
+            else
+                FldMarkClean(dst);
+            return true;
+        }
+
+        /// Duplicates @p src into @p dst and then clears @p src.
+        /// @return false (changing nothing) when the duplication is refused
+        bool FldMov(EField src, EField dst) {
+            if (FldDup(src, dst)) {
+                FldClr(src);
+                return true;
+            }
+            return false;
+        }
+
+ private:
+        /// True when @p buf points at a byte inside Buffer.
+        bool IsInBuffer(const char* buf) const {
+            const char* const first = Buffer.data();
+            const char* const last = first + Buffer.size();
+            return first <= buf && buf < last;
+        }
+
+ public:
+        /// True when at least one field is marked dirty.
+        bool FldIsDirty() const {
+            return FieldsDirty != 0;
+        }
+
+        /// True when @p fld is marked dirty.
+        bool FldIsDirty(EField fld) const {
+            const ui32 bit = FieldsDirty & FldFlag(fld);
+            return bit != 0;
+        }
+
+ private:
+        /// Marks @p fld as dirty.
+        void FldMarkDirty(EField fld) {
+            FieldsDirty = FieldsDirty | FldFlag(fld);
+        }
+
+        /// Removes the dirty mark of @p fld.
+        void FldMarkClean(EField fld) {
+            const ui32 keep = ~FldFlag(fld);
+            FieldsDirty &= keep;
+        }
+
+ void RewriteImpl();
+
+ public:
+ static TState::EParsed CheckHost(const TStringBuf& host);
+
+ // convert a [potential] IDN to ascii
+ static TMallocPtr<char> IDNToAscii(const wchar32* idna);
+ static TMallocPtr<char> IDNToAscii(const TStringBuf& host, ECharset enc = CODES_UTF8);
+
+ // convert hosts with percent-encoded or extended chars
+
+ // returns non-empty string if host can be converted to ASCII with given parameters
+ static TStringBuf HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc = CODES_UTF8);
+
+ // returns host if already ascii, or non-empty if it can be converted
+ static TStringBuf HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc = CODES_UTF8);
+
+ public:
+        explicit TUri(unsigned defaultPort = 0) // empty URI: no fields set, nothing dirty
+            : FieldsSet(0)
+            , Port(0)
+            , DefaultPort(static_cast<ui16>(defaultPort)) // narrowed to ui16
+            , Scheme(SchemeEmpty)
+            , FieldsDirty(0)
+        {
+        }
+
+ TUri(const TStringBuf& host, ui16 port, const TStringBuf& path, const TStringBuf& query = TStringBuf(), const TStringBuf& scheme = "http", unsigned defaultPort = 0);
+
+        TUri(const TUri& url) // copies the scalar state, then re-homes the field data
+            : FieldsSet(url.FieldsSet)
+            , Port(url.Port)
+            , DefaultPort(url.DefaultPort)
+            , Scheme(url.Scheme)
+            , FieldsDirty(url.FieldsDirty)
+        {
+            CopyImpl(url); // copies the views and rewrites them into this object's Buffer
+        }
+
+        ~TUri() { // resets the object before its members are destroyed
+            Clear();
+        }
+
+        /// Makes this object a copy of @p url, with the field data stored in
+        /// this object's own Buffer. Copying an object onto itself is a no-op.
+        void Copy(const TUri& url) {
+            if (this == &url)
+                return;
+
+            CopyData(url);
+            CopyImpl(url);
+        }
+
+        /// Empties the URI: resets the field bookkeeping and discards the
+        /// content of Buffer.
+        void Clear() {
+            ClearImpl();
+            Dealloc();
+        }
+
+        ui32 GetFieldMask() const { // bit mask of the fields currently set
+            return FieldsSet;
+        }
+
+        ui32 GetUrlFieldMask() const { // set fields restricted to FlagUrlFields
+            return GetFieldMask() & FlagUrlFields;
+        }
+
+        ui32 GetDirtyMask() const { // bit mask of the fields currently marked dirty
+            return FieldsDirty;
+        }
+
+ void CheckMissingFields();
+
+ // Process methods
+
+        /// Runs RewriteImpl() when at least one field is dirty; does nothing
+        /// otherwise.
+        void Rewrite() {
+            if (!FldIsDirty())
+                return;
+            RewriteImpl();
+        }
+
+ private:
+ TState::EParsed AssignImpl(const TParser& parser, TScheme::EKind defscheme = SchemeEmpty);
+
+ TState::EParsed ParseImpl(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, TScheme::EKind defscheme = SchemeEmpty, ECharset enc = CODES_UTF8);
+
+ public:
+        /// Assigns the result of @p parser to this URI and, on success,
+        /// rewrites dirty fields into the object's own buffer.
+        /// @return the status reported by AssignImpl()
+        TState::EParsed Assign(const TParser& parser, TScheme::EKind defscheme = SchemeEmpty) {
+            const TState::EParsed status = AssignImpl(parser, defscheme);
+            if (status == ParsedOK)
+                Rewrite();
+            return status;
+        }
+
+        /// Parses @p url with no default scheme and, on success, rewrites
+        /// dirty fields into the object's own buffer.
+        /// @return the status reported by ParseImpl()
+        TState::EParsed ParseUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8) {
+            const TState::EParsed status = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);
+            if (status == ParsedOK)
+                Rewrite();
+            return status;
+        }
+
+ // parses absolute URIs
+ // prepends default scheme (unless unknown) if URI has none
+ TState::EParsed ParseAbsUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, TScheme::EKind defscheme = SchemeUnknown, ECharset enc = CODES_UTF8);
+
+ // Shorthand for ParseAbsUri() with SchemeHTTP as the default scheme.
+ TState::EParsed ParseAbsOrHttpUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8) {
+     const TState::EParsed status = ParseAbsUri(url, flags, maxlen, SchemeHTTP, enc);
+     return status;
+ }
+
+ TState::EParsed Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8);
+
+ // Two-argument convenience overload: forwards to ParseUri(url, flags),
+ // leaving maxlen and enc at ParseUri's defaults.
+ TState::EParsed Parse(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault) {
+     const TState::EParsed status = ParseUri(url, flags);
+     return status;
+ }
+
+ TState::EParsed Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& base_url, ui32 maxlen = 0, ECharset enc = CODES_UTF8);
+
+ // Parses url (optionally against base_url) and additionally requires the
+ // result to satisfy IsValidGlobal(). A parse failure is returned as is;
+ // a successful parse that is not valid-global yields ParsedBadFormat.
+ TState::EParsed ParseAbs(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, const TStringBuf& base_url = TStringBuf(), ui32 maxlen = 0, ECharset enc = CODES_UTF8) {
+     const TState::EParsed status = Parse(url, flags, base_url, maxlen, enc);
+     if (status != ParsedOK)
+         return status;
+     return IsValidGlobal() ? status : ParsedBadFormat;
+ }
+
+ // correctAbs works with head "/.." portions:
+ // 1 - reject URL
+ // 0 - keep portions
+ // -1 - ignore portions
+
+ void Merge(const TUri& base, int correctAbs = -1);
+
+ TLinkType Normalize(const TUri& base, const TStringBuf& link, const TStringBuf& codebase = TStringBuf(), long careFlags = FeaturesDefault, ECharset enc = CODES_UTF8);
+
+ private:
+ // Normalizes print flags: if none of the FlagUrlFields bits is present,
+ // all of them are added; otherwise flags is returned untouched.
+ int PrintFlags(int flags) const {
+     if (0 != (FlagUrlFields & flags))
+         return flags;
+     flags |= FlagUrlFields;
+     return flags;
+ }
+
+ protected:
+ size_t PrintSize(ui32 flags) const;
+
+ // Output method, prints to stream
+ IOutputStream& PrintImpl(IOutputStream& out, int flags) const;
+
+ // Prints into the caller-supplied buffer [str, str + size) through a
+ // TMemoryOutput and appends a terminating '\0'. Returns str.
+ // The buffer must be large enough for the output plus the terminator;
+ // the callers in this class check that with PrintSize() beforehand.
+ char* PrintImpl(char* str, size_t size, int flags) const {
+     TMemoryOutput sink(str, size);
+     IOutputStream& printed = PrintImpl(sink, flags);
+     printed << '\0';
+     return str;
+ }
+
+ // True when path is non-empty and begins with '/'.
+ static bool IsAbsPath(const TStringBuf& path) {
+     if (path.length() < 1)
+         return false;
+     return '/' == path[0];
+ }
+
+ // Applies IsAbsPath() to this URI's path field.
+ bool IsAbsPathImpl() const {
+     const TStringBuf& path = GetField(FieldPath);
+     return IsAbsPath(path);
+ }
+
+ public:
+ // Prints the URI to a stream. flags is first normalized by PrintFlags()
+ // and then handed to PrintImpl(); the stream is returned for chaining.
+ IOutputStream& Print(IOutputStream& out, int flags = FlagUrlFields) const {
+     const int effective = PrintFlags(flags);
+     return PrintImpl(out, effective);
+ }
+
+ // Prints into a C buffer. With a null str this forwards to the
+ // allocating Serialize(flags); otherwise to Serialize(str, size, flags).
+ // Should be deprecated.
+ char* Print(char* str, size_t size, int flags = FlagUrlFields) const {
+     if (nullptr == str)
+         return Serialize(flags);
+     return Serialize(str, size, flags);
+ }
+
+ // Prints into the caller's buffer of the given size.
+ // Returns str on success, or nullptr (without writing anything) when
+ // PrintSize(flags) + 1 bytes do not fit into size.
+ char* Serialize(char* str, size_t size, int flags = FlagUrlFields) const {
+     Y_ASSERT(str);
+     flags = PrintFlags(flags);
+     const size_t needed = PrintSize(flags) + 1; // +1 for the trailing '\0'
+     if (needed > size)
+         return nullptr;
+     return PrintImpl(str, size, flags);
+ }
+
+ // Prints into a freshly malloc()-ed, NUL-terminated buffer of
+ // PrintSize(flags) + 1 bytes and returns it. The buffer comes from
+ // malloc(), so the caller releases it with free().
+ // Returns nullptr when the allocation fails, mirroring the nullptr result
+ // of the buffer-taking overload (previously a failed malloc() was passed
+ // straight to PrintImpl(), which would then write through a null pointer).
+ char* Serialize(int flags = FlagUrlFields) const {
+     flags = PrintFlags(flags);
+     const size_t size = PrintSize(flags) + 1; // +1 for the trailing '\0'
+     char* const buf = static_cast<char*>(malloc(size));
+     if (nullptr == buf)
+         return nullptr;
+     return PrintImpl(buf, size, flags);
+ }
+
+ // Appends the printed URI to str (writing goes through a TStringOutput
+ // on str). Capacity for the current length plus PrintSize() is reserved
+ // up front.
+ void Print(TString& str, int flags = FlagUrlFields) const {
+     flags = PrintFlags(flags);
+     const size_t wanted = str.length() + PrintSize(flags);
+     str.reserve(wanted);
+     TStringOutput sink(str);
+     PrintImpl(sink, flags);
+ }
+
+ // Returns the printed URI as a new TString.
+ TString PrintS(int flags = FlagUrlFields) const {
+     TString printed;
+     Print(printed, flags);
+     return printed;
+ }
+
+ // Prints host and port into a C buffer; the scheme is included only when
+ // it is not SchemeHTTP. Buffer semantics are those of Print(char*, ...).
+ char* PrintHost(char* str, size_t size) const {
+     const auto schemePart = (Scheme != SchemeHTTP ? FlagScheme : 0);
+     return Print(str, size, schemePart | FlagHostPort);
+ }
+ // TString flavour of PrintHost(): host and port, plus the scheme when it
+ // is not SchemeHTTP.
+ TString PrintHostS() const {
+     const auto schemePart = (Scheme != SchemeHTTP ? FlagScheme : 0);
+     return PrintS(schemePart | FlagHostPort);
+ }
+
+ // Info methods
+ int Compare(const TUri& A, int flags = FlagUrlFields) const;
+
+ int CompareField(EField fld, const TUri& url) const;
+
+ // Returns the stored value of fld when fld passes FldIsValid() and
+ // FldIsSet(); otherwise returns the shared default (empty) TStringBuf.
+ const TStringBuf& GetField(EField fld) const {
+     if (FldIsValid(fld) && FldIsSet(fld))
+         return FldGet(fld);
+     return Default<TStringBuf>();
+ }
+
+ // Returns Port, or DefaultPort when Port is 0.
+ ui16 GetPort() const {
+     return Port != 0 ? Port : DefaultPort;
+ }
+
+ // Returns the host, preferring FieldHostAscii over FieldHost when both
+ // bits are present in the field mask; returns the shared default (empty)
+ // TStringBuf when neither is.
+ const TStringBuf& GetHost() const {
+     const ui32 mask = GetFieldMask();
+     if (mask & FlagHostAscii)
+         return FldGet(FieldHostAscii);
+     if (mask & FlagHost)
+         return FldGet(FieldHost);
+     return Default<TStringBuf>();
+ }
+
+ // Forwards to FldMov(FieldHostAscii, FieldHost) and returns its result.
+ // NOTE(review): presumably this replaces the host with its ASCII form --
+ // confirm against FldMov.
+ bool UseHostAscii() {
+     const bool moved = FldMov(FieldHostAscii, FieldHost);
+     return moved;
+ }
+
+ // Returns the stored scheme kind.
+ TScheme::EKind GetScheme() const {
+     return this->Scheme;
+ }
+ // Returns the TSchemeInfo that TSchemeInfo::Get() yields for the stored
+ // scheme kind.
+ const TSchemeInfo& GetSchemeInfo() const {
+     return TSchemeInfo::Get(this->Scheme);
+ }
+
+ // Negation of FldSetCmp(flags); by default checks scheme, host and path.
+ // NOTE(review): presumably true unless every requested field is set --
+ // confirm against FldSetCmp.
+ bool IsNull(ui32 flags = FlagScheme | FlagHost | FlagPath) const {
+     const bool matched = FldSetCmp(flags);
+     return !matched;
+ }
+
+ // True when the single field fld is not set.
+ bool IsNull(EField fld) const {
+     const bool present = FldIsSet(fld);
+     return !present;
+ }
+
+ // True when IsNull(FlagScheme | FlagHost | FlagPath) is false and the
+ // path is absolute (starts with '/').
+ bool IsValidAbs() const {
+     return !IsNull(FlagScheme | FlagHost | FlagPath) && IsAbsPathImpl();
+ }
+
+ // Like IsValidAbs() but with an optional path: requires
+ // IsNull(FlagScheme | FlagHost) to be false, and accepts either a
+ // missing path or an absolute one.
+ bool IsValidGlobal() const {
+     if (IsNull(FlagScheme | FlagHost))
+         return false;
+     return IsNull(FlagPath) || IsAbsPathImpl();
+ }
+
+ // True when FldSetCmp() over {scheme, host, path} matches exactly
+ // {scheme, path} and the path is not absolute.
+ // NOTE(review): i.e. presumably "scheme and path set, host unset" --
+ // confirm against the two-argument FldSetCmp.
+ bool IsRootless() const {
+     const bool schemeAndPathOnly = FldSetCmp(FlagScheme | FlagHost | FlagPath, FlagScheme | FlagPath);
+     return schemeAndPathOnly && !IsAbsPathImpl();
+ }
+
+ // RFC 2396 compatibility name; identical to IsRootless().
+ bool IsOpaque() const {
+     const bool rootless = IsRootless();
+     return rootless;
+ }
+
+ // Inline helpers
+ // Copy assignment; delegates to Copy(), which ignores self-assignment.
+ TUri& operator=(const TUri& u) {
+     this->Copy(u);
+     return *this;
+ }
+
+ // True when IsNull() holds with its default field set
+ // (scheme, host, path).
+ bool operator!() const {
+     const bool null = IsNull();
+     return null;
+ }
+
+ // True when Compare(A, flags) reports equality (returns 0).
+ bool Equal(const TUri& A, int flags = FlagUrlFields) const {
+     return 0 == Compare(A, flags);
+ }
+
+ // True when Compare(A, flags) returns a negative value.
+ bool Less(const TUri& A, int flags = FlagUrlFields) const {
+     return 0 > Compare(A, flags);
+ }
+
+ // Equality under FlagNoFrag.
+ bool operator==(const TUri& A) const {
+     const bool same = Equal(A, FlagNoFrag);
+     return same;
+ }
+
+ // Inequality under FlagNoFrag.
+ bool operator!=(const TUri& A) const {
+     const bool same = Equal(A, FlagNoFrag);
+     return !same;
+ }
+
+ // Ordering under FlagNoFrag, via Less().
+ bool operator<(const TUri& A) const {
+     const bool before = Less(A, FlagNoFrag);
+     return before;
+ }
+
+ // True when other equals *this under FlagNoFrag.
+ // pre: both *this and 'other' should be normalized to valid abs
+ // NOTE(review): only *this is asserted here, whereas IsLocal() asserts
+ // both operands -- confirm whether that difference is intended.
+ bool IsSameDocument(const TUri& other) const {
+     Y_ASSERT(IsValidAbs());
+     const bool same = Equal(other, FlagNoFrag);
+     return same;
+ }
+
+ // True when other has the same scheme, host and port as *this.
+ // pre: both *this and 'other' should be normalized to valid abs
+ bool IsLocal(const TUri& other) const {
+     Y_ASSERT(IsValidAbs() && other.IsValidAbs());
+     const bool sameOrigin = Equal(other, FlagScheme | FlagHostPort);
+     return sameOrigin;
+ }
+
+ // Classifies other relative to *this:
+ //   LinkIsFragment - same document (IsSameDocument),
+ //   LinkIsLocal    - same scheme/host/port (IsLocal),
+ //   LinkIsGlobal   - anything else.
+ TLinkType Locality(const TUri& other) const {
+     if (IsSameDocument(other))
+         return LinkIsFragment;
+     return IsLocal(other) ? LinkIsLocal : LinkIsGlobal;
+ }
+
+ // Writes val to out through NEncode::TEncoder::ReEncode, using an encode
+ // mapper built from flags and the field kind fld. Returns out's stream
+ // as handed back by the encoder.
+ static IOutputStream& ReEncodeField(IOutputStream& out, const TStringBuf& val, EField fld, long flags = FeaturesEncodeDecode) {
+     return NEncode::TEncoder::ReEncode(
+         out,
+         val,
+         NEncode::TEncodeMapper(flags, fld));
+ }
+
+ // Writes val to out through NEncode::TEncoder::ReEncodeTo, with a source
+ // mapper built from (srcflags, srcfld) and a destination mapper built
+ // from (dstflags, dstfld).
+ static IOutputStream& ReEncodeToField(IOutputStream& out, const TStringBuf& val, EField srcfld, long srcflags, EField dstfld, long dstflags) {
+     return NEncode::TEncoder::ReEncodeTo(
+         out,
+         val,
+         NEncode::TEncodeMapper(srcflags, srcfld),
+         NEncode::TEncodeToMapper(dstflags, dstfld));
+ }
+
+ // ReEncodeField() with FieldAllMAX as the field kind.
+ static IOutputStream& ReEncode(IOutputStream& out, const TStringBuf& val, long flags = FeaturesEncodeDecode) {
+     IOutputStream& result = ReEncodeField(out, val, FieldAllMAX, flags);
+     return result;
+ }
+
+ // Maps parse flags to a tri-state int:
+ //    1 when FeaturePathDenyRootParent is set (it takes precedence),
+ //   -1 when only FeaturePathStripRootParent is set,
+ //    0 when neither is set.
+ static int PathOperationFlag(const TParseFlags& flags) {
+     if (flags & FeaturePathDenyRootParent)
+         return 1;
+     if (flags & FeaturePathStripRootParent)
+         return -1;
+     return 0;
+ }
+
+ static bool PathOperation(char*& pathBeg, char*& pathEnd, int correctAbs);
+
+ private:
+ // Adopts the scheme kind and default port from info. When info carries a
+ // non-empty scheme string it is also stored in FieldScheme via
+ // FldSetNoDirty(). Returns info.
+ const TSchemeInfo& SetSchemeImpl(const TSchemeInfo& info) {
+     Scheme = info.Kind;
+     DefaultPort = info.Port;
+     if (info.Str.empty())
+         return info;
+     FldSetNoDirty(FieldScheme, info.Str);
+     return info;
+ }
+ // Looks the scheme kind up with TSchemeInfo::Get() and forwards to the
+ // TSchemeInfo overload.
+ const TSchemeInfo& SetSchemeImpl(TScheme::EKind scheme) {
+     const TSchemeInfo& info = TSchemeInfo::Get(scheme);
+     return SetSchemeImpl(info);
+ }
+
+ public:
+ // Public scheme setter: applies info via SetSchemeImpl() and, when info
+ // has a non-empty scheme string, calls FldMarkClean(FieldScheme).
+ // Returns info.
+ const TSchemeInfo& SetScheme(const TSchemeInfo& info) {
+     SetSchemeImpl(info);
+     if (info.Str.empty())
+         return info;
+     FldMarkClean(FieldScheme);
+     return info;
+ }
+ // Looks the scheme kind up with TSchemeInfo::Get() and forwards to the
+ // TSchemeInfo overload.
+ const TSchemeInfo& SetScheme(TScheme::EKind scheme) {
+     const TSchemeInfo& info = TSchemeInfo::Get(scheme);
+     return SetScheme(info);
+ }
+ };
+
+ // Scoped editing helper for a TUri: forwards field updates to the wrapped
+ // URI and calls Rewrite() on it when the helper is destroyed.
+ // The helper only stores a reference, so the TUri must outlive it.
+ class TUriUpdate {
+     TUri& Target_;
+
+ public:
+     TUriUpdate(TUri& uri)
+         : Target_(uri)
+     {
+     }
+
+     ~TUriUpdate() {
+         Target_.Rewrite();
+     }
+
+     // Forwards to TUri::FldMemSet and returns its result.
+     bool Set(TField::EField field, const TStringBuf& value) {
+         return Target_.FldMemSet(field, value);
+     }
+
+     // Same, for character arrays such as string literals.
+     template <size_t size>
+     bool Set(TField::EField field, const char (&value)[size]) {
+         return Target_.FldMemSet(field, value);
+     }
+
+     // Forwards to TUri::FldClr.
+     void Clr(TField::EField field) {
+         Target_.FldClr(field);
+     }
+ };
+
+ const char* LinkTypeToString(const TUri::TLinkType& t);
+
+}
+
+// Stream-output specialization for NUri::TUri: prints the URI with
+// TUri::Print() and its default flags.
+Y_DECLARE_OUT_SPEC(inline, NUri::TUri, out, url) {
+ url.Print(out);
+}
+
+// Stream-output specialization for TUri::TLinkType: writes the text
+// returned by NUri::LinkTypeToString().
+Y_DECLARE_OUT_SPEC(inline, NUri::TUri::TLinkType, out, t) {
+ out << NUri::LinkTypeToString(t);
+}
diff --git a/library/cpp/uri/uri_ut.cpp b/library/cpp/uri/uri_ut.cpp
new file mode 100644
index 00000000000..2ebd83fc93a
--- /dev/null
+++ b/library/cpp/uri/uri_ut.cpp
@@ -0,0 +1,1022 @@
+#include "uri_ut.h"
+#include "other.h"
+#include "qargs.h"
+#include <library/cpp/html/entity/htmlentity.h>
+
+#include <util/system/maxlen.h>
+
+namespace NUri {
+ Y_UNIT_TEST_SUITE(URLTest) {
+ // Fixtures for test_httpURL. urls[0] is the base URL; it is followed by
+ // (relative reference, expected resolved URL) pairs and a terminating
+ // nullptr.
+ // NOTE(review): the pairs look like the reference-resolution examples of
+ // RFC 3986 section 5.4 -- confirm before relying on that.
+ static const char* urls[] = {
+ "http://a/b/c/d;p?q#r",
+ "g", "http://a/b/c/g",
+ "./g", "http://a/b/c/g",
+ "g/", "http://a/b/c/g/",
+ "/g", "http://a/g",
+ "//g", "http://g/",
+ "?y", "http://a/b/c/d;p?y",
+ "g?y", "http://a/b/c/g?y",
+ "#s", "http://a/b/c/d;p?q#s",
+ "g#s", "http://a/b/c/g#s",
+ "g?y#s", "http://a/b/c/g?y#s",
+ ";x", "http://a/b/c/;x",
+ "g;x", "http://a/b/c/g;x",
+ "g;x?y#s", "http://a/b/c/g;x?y#s",
+ ".", "http://a/b/c/",
+ "./", "http://a/b/c/",
+ "./.", "http://a/b/c/",
+ "././", "http://a/b/c/",
+ "././.", "http://a/b/c/",
+ "..", "http://a/b/",
+ "../", "http://a/b/",
+ "../.", "http://a/b/",
+ "../g", "http://a/b/g",
+ "../..", "http://a/",
+ "../../", "http://a/",
+ "../../.", "http://a/",
+ "../../g", "http://a/g",
+ "../../../g", "http://a/g",
+ "../../../../g", "http://a/g",
+ "/./g", "http://a/g",
+ "g.", "http://a/b/c/g.",
+ ".g", "http://a/b/c/.g",
+ "g..", "http://a/b/c/g..",
+ "..g", "http://a/b/c/..g",
+ "./../g", "http://a/b/g",
+ "./g/.", "http://a/b/c/g/",
+ "g/./h", "http://a/b/c/g/h",
+ "g/../h", "http://a/b/c/h",
+ "g;x=1/./y", "http://a/b/c/g;x=1/y",
+ "g;x=1/../y", "http://a/b/c/y",
+ "g?y/./x", "http://a/b/c/g?y/./x",
+ "g?y/../x", "http://a/b/c/g?y/../x",
+ "g#s/./x", "http://a/b/c/g#s/./x",
+ "g#s/../x", "http://a/b/c/g#s/../x",
+ "?", "http://a/b/c/d;p?",
+ "/?", "http://a/?",
+ "x?", "http://a/b/c/x?",
+ "x%20y", "http://a/b/c/x%20y",
+ "%20y", "http://a/b/c/%20y",
+ // "%2zy", "http://a/b/c/%2zy",
+ nullptr};
+
+ // Resolves every relative reference in urls[] against urls[0] in two
+ // ways (Parse + Merge, and Parse with a base URL string) and checks both
+ // against the expected absolute URL, textually and via TUri equality.
+ Y_UNIT_TEST(test_httpURL) {
+     TUri relative, baseUri, expectedUri;
+     TState::EParsed status = baseUri.Parse(urls[0]);
+     UNIT_ASSERT_VALUES_EQUAL(status, TState::ParsedOK);
+     UNIT_ASSERT(baseUri.IsValidAbs());
+     UNIT_ASSERT_VALUES_EQUAL(baseUri.PrintS(), urls[0]);
+
+     TString message;
+     TStringOutput messageOut(message);
+     const long allFeatures = TFeature::FeaturesAll;
+     for (const char* const* pair = urls + 1; *pair; pair += 2) {
+         const char* const link = pair[0];
+         const char* const resolved = pair[1];
+
+         // variant 1: parse the reference, then merge it with the base
+         status = relative.Parse(link);
+         UNIT_ASSERT_VALUES_EQUAL_C(status, TState::ParsedOK, link);
+         relative.Merge(baseUri);
+         UNIT_ASSERT_VALUES_EQUAL_C(relative.PrintS(), resolved, link);
+
+         // variant 2: let Parse() resolve against the base URL string
+         status = relative.Parse(link, allFeatures, urls[0]);
+         UNIT_ASSERT_VALUES_EQUAL_C(status, TState::ParsedOK, link);
+         UNIT_ASSERT_VALUES_EQUAL_C(relative.PrintS(), resolved, link);
+
+         // finally compare as TUri objects against the parsed expectation
+         status = expectedUri.Parse(resolved, allFeatures);
+         UNIT_ASSERT_VALUES_EQUAL(status, TState::ParsedOK);
+         message.clear();
+         messageOut << '[' << relative.PrintS()
+                    << "] != [" << expectedUri.PrintS() << ']';
+         UNIT_ASSERT_EQUAL_C(relative, expectedUri, message);
+     }
+ }
+
+ // Scheme recognition under different feature flags. The same TUri is
+ // re-parsed throughout, so each GetScheme() check refers to the Parse()
+ // call directly before it.
+ Y_UNIT_TEST(test_Schemes) {
+     TUri uri;
+
+     // no scheme, and plain http, with default features
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("www.ya.ru/index.html"), TState::ParsedOK);
+     UNIT_ASSERT_EQUAL(uri.GetScheme(), TScheme::SchemeEmpty);
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("http://www.ya.ru"), TState::ParsedOK);
+     UNIT_ASSERT_EQUAL(uri.GetScheme(), TScheme::SchemeHTTP);
+
+     // https is rejected by default and accepted with FeatureSchemeKnown
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("https://www.ya.ru"), TState::ParsedBadScheme);
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("https://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeKnown), TState::ParsedOK);
+     UNIT_ASSERT_EQUAL(uri.GetScheme(), TScheme::SchemeHTTPS);
+
+     // made-up schemes are accepted only with FeatureSchemeFlexible
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("httpwhatever://www.ya.ru"), TState::ParsedBadScheme);
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("httpwhatever://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeFlexible), TState::ParsedOK);
+     UNIT_ASSERT_EQUAL(uri.GetScheme(), TScheme::SchemeUnknown);
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("httpswhatever://www.ya.ru"), TState::ParsedBadScheme);
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("httpswhatever://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeFlexible), TState::ParsedOK);
+     UNIT_ASSERT_EQUAL(uri.GetScheme(), TScheme::SchemeUnknown);
+
+     // ftp: rejected by default, recognised as SchemeFTP when flexible
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("ftp://www.ya.ru"), TState::ParsedBadScheme);
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("ftp://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeFlexible), TState::ParsedOK);
+     UNIT_ASSERT_EQUAL(uri.GetScheme(), TScheme::SchemeFTP);
+
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("httpsssss://www.ya.ru"), TState::ParsedBadScheme);
+     UNIT_ASSERT_VALUES_EQUAL(uri.Parse("httpsssss://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeFlexible), TState::ParsedOK);
+     UNIT_ASSERT_EQUAL(uri.GetScheme(), TScheme::SchemeUnknown);
+ }
+
+ // One test_httpURLNormalize case.
+ struct Link4Norm {
+ // URL parsed into the base TUri
+ const char* const base;
+ // link passed to TUri::Normalize(); nullptr terminates the table
+ const char* const link;
+ // expected PrintS() of the normalized link ("" when the link is bad)
+ const char* const result;
+ // expected return value of TUri::Normalize()
+ TUri::TLinkType ltype;
+ };
+
+ // Cases for test_httpURLNormalize; the all-nullptr entry ends the table.
+ static const Link4Norm link4Norm[] = {
+ {"http://www.alltest.ru/all.php?a=aberporth", "http://www.alltest.ru/all.php?a=domestic jobs", "", TUri::LinkIsBad},
+ {"http://www.alltest.ru/all.php?a=aberporth", "http://www.alltest.ru/all.php?a=domestic%20jobs", "http://www.alltest.ru/all.php?a=domestic%20jobs", TUri::LinkIsLocal},
+ {"http://president.rf/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8", "http://president.rf/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/1024", "http://president.rf/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/1024", TUri::LinkIsLocal},
+ {nullptr, nullptr, nullptr, TUri::LinkIsBad},
+ };
+
+ // For every link4Norm entry: parse the base, Normalize() the link against
+ // it, and compare both the link type and the printed result ("" for a
+ // bad link) with the expectation.
+ Y_UNIT_TEST(test_httpURLNormalize) {
+     TUri normalized;
+
+     for (const Link4Norm* tc = link4Norm; tc->link; ++tc) {
+         TUri baseUri;
+         TState::EParsed status = baseUri.Parse(tc->base);
+         UNIT_ASSERT_VALUES_EQUAL_C(status, TState::ParsedOK, tc->base);
+         TUri::TLinkType ltype = normalized.Normalize(baseUri, tc->link);
+         UNIT_ASSERT_VALUES_EQUAL_C(ltype, tc->ltype, tc->link);
+         TString printed;
+         if (TUri::LinkIsBad != ltype)
+             printed = normalized.PrintS();
+         UNIT_ASSERT_VALUES_EQUAL_C(printed, tc->result, tc->link);
+     }
+ }
+
+ // (input URL, expected normalized URL) pairs for
+ // test_httpURLPathOperation, terminated by nullptr.
+ static const char* urlsWithMultipleSlash[] = {
+ "http://a/http://b", "http://a/http://b",
+ "http://a/https://b", "http://a/https://b",
+ "http://a/b://c", "http://a/b:/c",
+ "http://a/b//c", "http://a/b/c",
+ nullptr, nullptr};
+
+ // Runs TUri::PathOperation() in place on a copy of each input URL and
+ // checks the resulting range, then checks that a full Parse() + PrintS()
+ // gives the same normalized text.
+ Y_UNIT_TEST(test_httpURLPathOperation) {
+     char scratch[URL_MAXLEN];
+     for (const char* const* pair = urlsWithMultipleSlash; *pair; pair += 2) {
+         const TStringBuf url(pair[0]);
+         const TStringBuf normurl(pair[1]);
+         // PathOperation() edits the buffer, so work on a private copy
+         memcpy(scratch, url.data(), url.length());
+         char* pathBeg = scratch;
+         char* pathEnd = scratch + url.length();
+         TUri::PathOperation(pathBeg, pathEnd, 1);
+         UNIT_ASSERT_VALUES_EQUAL(TStringBuf(pathBeg, pathEnd), normurl);
+         TUri uri;
+         UNIT_ASSERT_VALUES_EQUAL(TState::ParsedOK, uri.Parse(url));
+         UNIT_ASSERT_VALUES_EQUAL_C(uri.PrintS(), normurl, url);
+     }
+ }
+
+ // Host names fed to TUri::CheckHost() by test_httpURLCheckHost,
+ // terminated by nullptr. Entry i pairs with answersForCheckHost[i].
+ static const char* hostsForCheckHost[] = {
+ "simplehost.ru",
+ "third_level.host.ru",
+ "_ok.somewhere.ru",
+ "a.b",
+ "second_level.ru",
+ "_bad.ru",
+ "_",
+ "yandex.ru:443",
+ nullptr};
+
+ // Expected TUri::CheckHost() result for each hostsForCheckHost entry,
+ // in the same order (there is no entry for the nullptr terminator).
+ static TState::EParsed answersForCheckHost[] = {
+ TState::ParsedOK,
+ TState::ParsedOK,
+ TState::ParsedOK,
+ TState::ParsedOK,
+ TState::ParsedBadHost,
+ TState::ParsedBadHost,
+ TState::ParsedBadHost,
+ TState::ParsedBadHost,
+ };
+
+ // Checks TUri::CheckHost() for every host in hostsForCheckHost against
+ // the parallel answersForCheckHost table.
+ Y_UNIT_TEST(test_httpURLCheckHost) {
+     size_t index = 0;
+     while (hostsForCheckHost[index]) {
+         const char* const host = hostsForCheckHost[index];
+         TState::EParsed state = TUri::CheckHost(host);
+         UNIT_ASSERT_VALUES_EQUAL(state, answersForCheckHost[index]);
+         ++index;
+     }
+ }
+
+ // Field updates through FldMemSet(): setting and clearing the port, and
+ // switching the scheme with and without an explicit port.
+ Y_UNIT_TEST(test_httpURLSet) {
+     // set port
+     {
+         TUri uri;
+         uri.Parse("http://www.host.com/script.cgi?param1=value1&param2=value2");
+         uri.FldMemSet(TField::FieldPort, "8080");
+         UNIT_ASSERT_VALUES_EQUAL(uri.GetPort(), 8080);
+         UNIT_ASSERT_VALUES_EQUAL(uri.PrintS(), "http://www.host.com:8080/script.cgi?param1=value1&param2=value2");
+     }
+
+     // clear port: GetPort() is expected to fall back to 80
+     {
+         TUri uri;
+         uri.Parse("http://www.host.com:8080/script.cgi?param1=value1&param2=value2");
+         uri.FldMemSet(TField::FieldPort, nullptr);
+         UNIT_ASSERT_VALUES_EQUAL(uri.GetPort(), 80);
+         UNIT_ASSERT_VALUES_EQUAL(uri.PrintS(), "http://www.host.com/script.cgi?param1=value1&param2=value2");
+     }
+
+     // change scheme with default port: the port is expected to become 443
+     {
+         TUri uri;
+         uri.Parse("http://www.host.com/script.cgi?param1=value1&param2=value2");
+         uri.FldMemSet(TField::FieldScheme, "https");
+         UNIT_ASSERT_VALUES_EQUAL(uri.GetPort(), 443);
+         UNIT_ASSERT_VALUES_EQUAL(uri.PrintS(), "https://www.host.com/script.cgi?param1=value1&param2=value2");
+     }
+
+     // change scheme with non-default port: the explicit port is kept
+     {
+         TUri uri;
+         uri.Parse("http://www.host.com:8080/script.cgi?param1=value1&param2=value2");
+         uri.FldMemSet(TField::FieldScheme, "https");
+         UNIT_ASSERT_VALUES_EQUAL(uri.GetPort(), 8080);
+         UNIT_ASSERT_VALUES_EQUAL(uri.PrintS(), "https://www.host.com:8080/script.cgi?param1=value1&param2=value2");
+     }
+ }
+
+ // Userinfo handling: an empty userinfo is expected to be rejected under
+ // FeaturesRobot; login-only and login:password forms are expected to be
+ // split into user and password fields under FeatureAuthSupported.
+ Y_UNIT_TEST(test_httpURLAuth) {
+     // empty userinfo before '@'
+     {
+         TUri uri;
+         TState::EParsed status = uri.Parse("http://@www.host.com/path", TFeature::FeaturesRobot);
+         UNIT_ASSERT_VALUES_EQUAL(status, TState::ParsedBadAuth);
+     }
+
+     // login without a password
+     {
+         TUri uri;
+         TState::EParsed status = uri.Parse("http://loginwithnopass@www.host.com/path", TFeature::FeatureAuthSupported);
+         UNIT_ASSERT_VALUES_EQUAL(status, TState::ParsedOK);
+         UNIT_ASSERT_EQUAL(uri.GetField(TField::FieldHost), "www.host.com");
+         UNIT_ASSERT_EQUAL(uri.GetField(TField::FieldUser), "loginwithnopass");
+         UNIT_ASSERT_EQUAL(uri.GetField(TField::FieldPass), "");
+     }
+
+     // login and password
+     {
+         TUri uri;
+         TState::EParsed status = uri.Parse("http://login:pass@www.host.com/path", TFeature::FeatureAuthSupported);
+         UNIT_ASSERT_VALUES_EQUAL(status, TState::ParsedOK);
+         UNIT_ASSERT_EQUAL(uri.GetField(TField::FieldHost), "www.host.com");
+         UNIT_ASSERT_EQUAL(uri.GetField(TField::FieldUser), "login");
+         UNIT_ASSERT_EQUAL(uri.GetField(TField::FieldPass), "pass");
+     }
+ }
+
+ // "user:pass@host:8080" without a scheme separator, under FeaturesAll:
+ // the expected state is ParsedRootless.
+ // NOTE(review): TTest and URL_TEST are defined outside this chunk; the
+ // initializer presumably lists url, flags, state, scheme, user, pass,
+ // host, port, path, query, fragment -- confirm in uri_ut.h.
+ Y_UNIT_TEST(test01) {
+ TTest test = {
+ "user:pass@host:8080", TFeature::FeaturesAll, TState::ParsedRootless, "user", "", "", "", 0, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ // Plain "http://host" under FeaturesAll: expects ParsedOK with port 80
+ // and path "/" in the expectation record.
+ Y_UNIT_TEST(test02) {
+ TTest test = {
+ "http://host", TFeature::FeaturesAll, TState::ParsedOK, "http", "", "", "host", 80, "/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ // "https://host" with FeatureSchemeFlexible | FeatureAllowHostIDN:
+ // expects ParsedOK with port 443 and path "/" in the expectation record.
+ Y_UNIT_TEST(test03) {
+ TTest test = {
+ "https://host", TFeature::FeatureSchemeFlexible | TFeature::FeatureAllowHostIDN, TState::ParsedOK, "https", "", "", "host", 443, "/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ // Same input as test01 but with FeatureNoRelPath | FeatureAllowRootless:
+ // expects ParsedOK, with "user" and "pass@host:8080" in the expectation
+ // record. Also checks that a copy-constructed TUri matches the same
+ // expectation and compares equal to the original.
+ Y_UNIT_TEST(test04) {
+ TTest test = {
+ "user:pass@host:8080", TFeature::FeaturesAll | TFeature::FeatureNoRelPath | TFeature::FeatureAllowRootless, TState::ParsedOK, "user", "", "", "", 0, "pass@host:8080", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ TUri url2(url);
+ CMP_URL(url2, test);
+ URL_EQ(url, url2);
+ }
+
+ // "host:8080" with rootless URIs allowed: the expectation record holds
+ // "host" and "8080" (presumably scheme and path), and the URI must print
+ // back unchanged.
+ Y_UNIT_TEST(test05) {
+ TTest test = {
+ "host:8080", TFeature::FeaturesAll | TFeature::FeatureNoRelPath | TFeature::FeatureAllowRootless, TState::ParsedOK, "host", "", "", "", 0, "8080", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "host:8080");
+ }
+
+ // Exercises field mutation (FldMemSet / FldClr), the dirty flag, and
+ // copy construction / assignment of TUri, checking equality after each
+ // step.
+ Y_UNIT_TEST(test06) {
+ TTest test = {
+ "http://user:pass@host?q", TFeature::FeaturesAll, TState::ParsedOK, "http", "user", "pass", "host", 80, "/", "q", ""};
+ TUri url;
+ URL_TEST(url, test);
+ // switching the scheme is expected to leave no dirty fields and to
+ // change the effective port to 443
+ url.FldMemSet(TField::FieldScheme, "https");
+ UNIT_ASSERT(!url.FldIsDirty());
+ UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::FieldScheme), "https");
+ UNIT_ASSERT_VALUES_EQUAL(url.GetPort(), 443);
+
+ // test copying
+ TUri url2(url);
+ // make sure strings are equal...
+ UNIT_ASSERT_VALUES_EQUAL(
+ url.GetField(TField::FieldUser),
+ url2.GetField(TField::FieldUser));
+ // ... and memory locations are the same
+ // NOTE(review): this compares the TStringBuf values, not .data() as the
+ // later UNIT_ASSERT_UNEQUAL does -- confirm whether a pointer comparison
+ // was intended here.
+ UNIT_ASSERT_EQUAL(
+ url.GetField(TField::FieldUser),
+ url2.GetField(TField::FieldUser));
+ // and urls compare the same
+ URL_EQ(url, url2);
+
+ // cause a dirty field
+ url.FldMemSet(TField::FieldUser, "use"); // it is now shorter
+ UNIT_ASSERT(!url.FldIsDirty());
+ url.FldMemSet(TField::FieldUser, TStringBuf("user"));
+ UNIT_ASSERT(url.FldIsDirty());
+
+ // copy again
+ url2 = url;
+ UNIT_ASSERT(url.FldIsDirty());
+ UNIT_ASSERT(!url2.FldIsDirty());
+ URL_EQ(url, url2);
+ // make sure strings are equal...
+ UNIT_ASSERT_VALUES_EQUAL(
+ url.GetField(TField::FieldUser),
+ url2.GetField(TField::FieldUser));
+ // ... but memory locations are different
+ UNIT_ASSERT_UNEQUAL(
+ url.GetField(TField::FieldUser).data(),
+ url2.GetField(TField::FieldUser).data());
+ URL_EQ(url, url2);
+
+ // make query empty
+ url.FldMemSet(TField::FieldQuery, "");
+ url2 = url;
+ URL_EQ(url, url2);
+ // set query to null value (should clear it)
+ url2.FldMemSet(TField::FieldQuery, TStringBuf());
+ // make sure they are no longer equal
+ URL_NEQ(url, url2);
+ // reset query
+ url.FldClr(TField::FieldQuery);
+ // equal again
+ URL_EQ(url, url2);
+ // reset port and set the other to default
+ url.FldClr(TField::FieldPort);
+ url2.FldMemSet(TField::FieldPort, "443");
+ URL_EQ(url, url2);
+ }
+
+ // Two scopes: (1) a trailing "//" in the path is expected to collapse to
+ // "/", and scheme names set in mixed case are expected to be recognised
+ // and stored in lower case; (2) a URL without a path is expected to
+ // print with a trailing "/".
+ Y_UNIT_TEST(test07) {
+ {
+ TTest test = {
+ "http://host/path//", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "http", "", "", "host", 80, "/path/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ url.FldMemSet(TField::FieldScheme, "HTTPs");
+ UNIT_ASSERT_EQUAL(TScheme::SchemeHTTPS, url.GetScheme());
+ UNIT_ASSERT_EQUAL("https", url.GetField(TField::FieldScheme));
+ url.FldMemSet(TField::FieldScheme, "HtTP");
+ UNIT_ASSERT_EQUAL(TScheme::SchemeHTTP, url.GetScheme());
+ UNIT_ASSERT_EQUAL("http", url.GetField(TField::FieldScheme));
+ }
+
+ {
+ const TString scheme = "http";
+ const TString host = "host.com";
+ const TString urlstr = scheme + "://" + host;
+ TTest test = {
+ urlstr, TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, scheme, "", "", host, 80, "/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), urlstr + "/");
+ }
+ }
+
+ // Path-handling cases: "scheme:" versus "host:port" disambiguation and
+ // the treatment of "." and ".." segments, with and without
+ // FeatureNoRelPath / FeatureAllowRootless. Each scope checks one input
+ // against its TTest expectation through URL_TEST (defined outside this
+ // chunk).
+ Y_UNIT_TEST(test08) {
+ {
+ TTest test = {
+ "mailto://user@host.com", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "mailto", "user", "", "host.com", 0, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "host:/path/.path/.", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "host", "", "", "", 0, "/path/.path/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ // same text after the colon, but starting with a digit: with
+ // FeatureNoRelPath "host" / 1 are expected where host and port go ...
+ {
+ TTest test = {
+ "host:1/path/.path/.", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "host", 1, "/path/.path/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ // ... while with FeatureAllowRootless "host" is expected in the first
+ // slot and the remainder is kept verbatim
+ {
+ TTest test = {
+ "host:1/path/.path/.", TFeature::FeaturesAll | TFeature::FeatureAllowRootless, TState::ParsedOK, "host", "", "", "", 0, "1/path/.path/.", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "/[foo]:bar", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "/[foo]:bar", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ ".", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ ".", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "././.", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "././.", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "./path", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "path", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "./path", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "path", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "../path", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "../path", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "../path", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "../path", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "/../path", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "/path", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ }
+
+ // Rootless ("scheme:rest") inputs with FeatureAllowRootless: a mailto
+ // address and a bare "scheme:" with nothing after the colon.
+ Y_UNIT_TEST(test09) {
+ {
+ TTest test = {
+ "mailto:user@host.com", TFeature::FeaturesAll | TFeature::FeatureAllowRootless, TState::ParsedOK, "mailto", "", "", "", 0, "user@host.com", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "scheme:", TFeature::FeaturesAll | TFeature::FeatureNoRelPath | TFeature::FeatureAllowRootless, TState::ParsedOK, "scheme", "", "", "", 0, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "scheme:", TFeature::FeaturesAll | TFeature::FeatureAllowRootless, TState::ParsedOK, "scheme", "", "", "", 0, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ }
+
+ // Hosts containing non-ASCII or percent-encoded bytes. Every case here
+ // expects ParsedBadHost; the cases come in pairs -- the raw spelling and
+ // its already percent-encoded spelling -- and both members of a pair
+ // carry the same encoded host in the expectation record.
+ Y_UNIT_TEST(test10) {
+ // test some escaping madness, note the ehost vs host
+ {
+ TString host = "президент.рф";
+ TString ehost = "%D0%BF%D1%80%D0%B5%D0%B7%D0%B8%D0%B4%D0%B5%D0%BD%D1%82.%D1%80%D1%84";
+ const TString urlstr = TString::Join("http://", host, "/");
+ TTest test = {
+ urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeaturesDefault | TFeature::FeatureCheckHost, TState::ParsedBadHost, "http", "", "", ehost, 80, "/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ {
+ TString host = "%D0%BF%D1%80%D0%B5%D0%B7%D0%B8%D0%B4%D0%B5%D0%BD%D1%82.%D1%80%D1%84";
+ const TString urlstr = TString::Join("http://", host, "/");
+ TTest test = {
+ urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeaturesDefault | TFeature::FeatureCheckHost, TState::ParsedBadHost, "http", "", "", host, 80, "/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ {
+ TString host = "Фilip.ru";
+ TString ehost = "%D0%A4ilip.ru";
+ const TString urlstr = TString::Join("http://", host);
+ TTest test = {
+ urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeaturesDefault, TState::ParsedBadHost, "http", "", "", ehost, 80, "/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ {
+ TString host = "%D0%A4ilip.ru";
+ const TString urlstr = TString::Join("http://", host);
+ TTest test = {
+ urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeaturesDefault, TState::ParsedBadHost, "http", "", "", host, 80, "/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ {
+ TString host = "Filip%90.rЯ";
+ TString ehost = "Filip%90.r%D0%AF";
+ const TString urlstr = TString::Join(host, ":8080");
+ TTest test = {
+ urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeatureDecodeAllowed | TFeature::FeaturesDefault | TFeature::FeatureNoRelPath, TState::ParsedBadHost, "", "", "", ehost, 8080, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ {
+ TString host = "Filip%90.r%D0%AF";
+ const TString urlstr = TString::Join(host, ":8080");
+ TTest test = {
+ urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeatureDecodeAllowed | TFeature::FeaturesDefault | TFeature::FeatureNoRelPath, TState::ParsedBadHost, "", "", "", host, 8080, "", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ }
+
+ // Case handling: scheme and host are expected in lower case in both
+ // scopes; path, query and fragment keep their case in the first scope
+ // and are expected in lower case in the second, where FeatureToLower is
+ // passed as the second TParseFlags argument. Percent-encoded letters
+ // (%50, %54) are expected to be decoded.
+ Y_UNIT_TEST(test11) {
+ {
+ TTest test = {
+ "HtTp://HoSt/%50aTh/?Query#Frag", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "http", "", "", "host", 80, "/PaTh/", "Query", "Frag"};
+ TUri url;
+ URL_TEST(url, test);
+ }
+
+ {
+ TTest test = {
+ "HtTp://HoSt/%50a%54h/?Query#Frag", TParseFlags(TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TFeature::FeatureToLower), TState::ParsedOK, "http", "", "", "host", 80, "/path/", "query", "frag"};
+ TUri url;
+ URL_TEST(url, test);
+ }
+ }
+
+ // Percent-encoding of characters whose safety depends on the URL
+ // component, plus IDN hosts. In each scope RAW is the unescaped text,
+ // ENC its fully percent-encoded form and DEC a partially decoded form;
+ // the macros are local to the scope and #undef-ed at its end. The last
+ // three scopes use PNC for the expected punycode host.
+ Y_UNIT_TEST(test12) {
+ // test characters which are not always safe
+ {
+#define RAW "/:"
+#define DEC "%2F:"
+#define ENC "%2F%3A"
+ TTest test = {
+ "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC, TFeature::FeaturesAll, TState::ParsedOK, "http", RAW, RAW, "host", 80, "/" DEC, RAW, RAW};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" ENC ":" ENC "@host/" DEC "?" RAW "#" RAW);
+#undef RAW
+#undef DEC
+#undef ENC
+ }
+ {
+#define RAW "?@"
+#define DEC "%3F@"
+#define ENC "%3F%40"
+ TTest test = {
+ "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC, TFeature::FeaturesAll, TState::ParsedOK, "http", RAW, RAW, "host", 80, "/" DEC, RAW, RAW};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" ENC ":" ENC "@host/" DEC "?" RAW "#" RAW);
+#undef RAW
+#undef DEC
+#undef ENC
+ }
+ {
+#define RAW "%&;="
+#define DEC "%25&;="
+#define ENC "%25%26%3B%3D"
+ TTest test = {
+ "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC, TFeature::FeaturesAll, TState::ParsedOK, "http", RAW, RAW, "host", 80, "/" ENC, ENC, ENC};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC);
+#undef RAW
+#undef DEC
+#undef ENC
+ }
+ {
+#define RAW "!$'()*,"
+#define DEC "!$%27()*,"
+#define ENC "%21%24%27%28%29%2A%2C"
+ TTest test = {
+ "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC, TFeature::FeaturesAll, TState::ParsedOK, "http", RAW, RAW, "host", 80, "/" ENC, DEC, DEC};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" ENC ":" ENC "@host/" ENC "?" DEC "#" DEC);
+#undef RAW
+#undef DEC
+#undef ENC
+ }
+ // IDN host given percent-encoded: with FeatureAllowHostIDN the decoded
+ // host is expected in the host field and the punycode form in
+ // FieldHostAscii
+ {
+#define DEC "Череповец。рф"
+#define ENC "%D0%A7%D0%B5%D1%80%D0%B5%D0%BF%D0%BE%D0%B2%D0%B5%D1%86%E3%80%82%D1%80%D1%84"
+// punycode corresponds to lowercase
+#define PNC "xn--b1afab7bff7cb.xn--p1ai"
+ TTest test = {
+ "http://" ENC "/" ENC "?" ENC "#" ENC, TParseFlags(TFeature::FeaturesAll | TFeature::FeatureAllowHostIDN, TFeature::FeatureDecodeExtendedASCII), TState::ParsedOK, "http", "", "", DEC, 80, "/" ENC, ENC, ENC};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::FieldHostAscii), PNC);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" DEC "/" ENC "?" ENC "#" ENC);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHostAscii), "http://" PNC "/" ENC "?" ENC "#" ENC);
+#undef PNC
+#undef DEC
+#undef ENC
+ }
+ // same host given raw, under FeaturesRobot: the punycode form is
+ // expected directly in the host field
+ {
+#define DEC "Череповец。рф"
+#define ENC "%D0%A7%D0%B5%D1%80%D0%B5%D0%BF%D0%BE%D0%B2%D0%B5%D1%86%E3%80%82%D1%80%D1%84"
+// punycode corresponds to lowercase
+#define PNC "xn--b1afab7bff7cb.xn--p1ai"
+ TTest test = {
+ "http://" DEC "/" DEC "?" DEC "#" DEC, TParseFlags(TFeature::FeaturesRobot | TFeature::FeatureEncodeExtendedASCII), TState::ParsedOK, "http", "", "", PNC, 80, "/" ENC, ENC, ENC};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" PNC "/" ENC "?" ENC "#" ENC);
+#undef PNC
+#undef DEC
+#undef ENC
+ }
+ // long hyphenated IDN label
+ {
+#define DEC "независимая-экспертиза-оценка-ущерба-авто-дтп.рф"
+#define PNC "xn--------3veabbbbjgk5abecc3afsad2cg8bvq2alouolqf5brd3a4jzftgqd.xn--p1ai"
+ TTest test = {
+ "http://" DEC "/", TParseFlags(TFeature::FeaturesRobot), TState::ParsedOK, "http", "", "", PNC, 80, "/", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" PNC "/");
+#undef PNC
+#undef DEC
+ }
+ }
+
+ // Authority parsing with FeatureSchemeFlexible: hosts containing
+ // characters that FeatureCheckHost rejects are expected to be accepted,
+ // and the host is expected to end at the first '?' or '/'.
+ Y_UNIT_TEST(testFlexibleAuthority) {
+     TUri parsed;
+
+     // underscore in the host: bad with host checking, fine when flexible
+     UNIT_ASSERT_EQUAL(parsed.Parse("http://hello_world", TFeature::FeatureCheckHost), TState::ParsedBadHost);
+     UNIT_ASSERT_EQUAL(parsed.Parse("http://hello_world", TFeature::FeatureSchemeFlexible), TState::ParsedOK);
+     UNIT_ASSERT_VALUES_EQUAL(parsed.GetHost(), "hello_world");
+
+     // '?' ends the host; the rest is the query and the path stays empty
+     UNIT_ASSERT_EQUAL(parsed.Parse("httpzzzzz://)(*&^$!\\][';<>`~,q?./index.html", TFeature::FeatureSchemeFlexible), TState::ParsedOK);
+     UNIT_ASSERT_VALUES_EQUAL(parsed.GetHost(), ")(*&^$!\\][';<>`~,q");
+     UNIT_ASSERT_VALUES_EQUAL(parsed.GetField(TField::FieldPath), "");
+     UNIT_ASSERT_VALUES_EQUAL(parsed.GetField(TField::FieldQuery), "./index.html");
+
+     // "%45" in the host is expected to come back as 'e'; '/' starts the path
+     UNIT_ASSERT_EQUAL(parsed.Parse("htttttttp://)(*&^%45$!\\][';<>`~,.q/index.html", TFeature::FeatureSchemeFlexible), TState::ParsedOK);
+     UNIT_ASSERT_VALUES_EQUAL(parsed.GetHost(), ")(*&^e$!\\][';<>`~,.q");
+     UNIT_ASSERT_VALUES_EQUAL(parsed.GetField(TField::FieldPath), "/index.html");
+     UNIT_ASSERT_VALUES_EQUAL(parsed.GetField(TField::FieldQuery), "");
+ }
+
+ Y_UNIT_TEST(testSpecialChar) {
+ // test characters which are not always allowed
+ {
+ // With FeatureEncodeSpace a raw space in the path is expected as "%20".
+ TTest test = {
+ "http://host/pa th", TFeature::FeaturesAll | TFeature::FeatureEncodeSpace, TState::ParsedOK, "http", "", "", "host", 80, "/pa%20th", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/pa%20th");
+ }
+ {
+ // Without FeatureEncodeSpace the same input is expected to report
+ // ParsedBadFormat while the path keeps the raw space.
+ TTest test = {
+ "http://host/pa th", TFeature::FeaturesAll, TState::ParsedBadFormat, "http", "", "", "host", 80, "/pa th", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/pa th");
+ }
+ {
+ // With FeatureEncodePercent a '%' that does not start a valid escape is
+ // expected as "%25"; the valid "%41" is expected decoded to 'A'.
+ TTest test = {
+ "http://host/pa%th%41", TFeature::FeaturesAll | TFeature::FeatureEncodePercent, TState::ParsedOK, "http", "", "", "host", 80, "/pa%25thA", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/pa%25thA");
+ }
+ {
+ // Escapes with one non-hex digit ("%az", "%1G") get the same treatment.
+ TTest test = {
+ "http://host/invalid_second_char%az%1G", TFeature::FeaturesAll | TFeature::FeatureEncodePercent, TState::ParsedOK, "http", "", "", "host", 80, "/invalid_second_char%25az%251G", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/invalid_second_char%25az%251G");
+ }
+ {
+ // Escape truncated by the end of the input.
+ TTest test = {
+ "http://host/border%2", TFeature::FeaturesAll | TFeature::FeatureEncodePercent, TState::ParsedOK, "http", "", "", "host", 80, "/border%252", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/border%252");
+ }
+ {
+ // Without FeatureEncodePercent the bad escape is expected to yield
+ // ParsedBadFormat, with the '%' left as is.
+ TTest test = {
+ "http://host/pa%th%41", TFeature::FeaturesAll, TState::ParsedBadFormat, "http", "", "", "host", 80, "/pa%thA", "", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/pa%thA");
+ }
+ }
+
+ Y_UNIT_TEST(testIPv6) {
+ {
+// RAW is the bracketed IPv6 literal as written in the input; DEC is the form
+// expected in the host field (hex digits lowercased). Path, query and fragment
+// are expected to keep RAW unchanged.
+#define RAW "[1080:0:0:0:8:800:200C:417A]"
+#define DEC "[1080:0:0:0:8:800:200c:417a]"
+ TTest test = {
+ "http://" RAW "/" RAW "?" RAW "#" RAW, TParseFlags(TFeature::FeaturesAll), TState::ParsedOK, "http", "", "", DEC, 80, "/" RAW, RAW, RAW};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" DEC "/" RAW "?" RAW "#" RAW);
+#undef DEC
+#undef RAW
+ }
+ }
+
+ Y_UNIT_TEST(testEscapedFragment) {
+ {
+ // FeatureHashBangToEscapedFragment: a "#!..." fragment is expected to be
+ // moved into the query as "_escaped_fragment_=..." with '&', '#', '+'
+ // and '%' percent-encoded, leaving the fragment empty.
+ TTest test = {
+ "http://host.com#!a=b&c=d#e+g%41%25", TParseFlags(TFeature::FeaturesAll | TFeature::FeatureHashBangToEscapedFragment), TState::ParsedOK, "http", "", "", "host.com", 80, "/", "_escaped_fragment_=a=b%26c=d%23e%2BgA%2525", ""};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host.com/?_escaped_fragment_=a=b%26c=d%23e%2BgA%2525");
+ }
+ {
+ // FeatureEscapedToHashBangFragment: the reverse direction; the query is
+ // expected to end up empty and the fragment to start with '!'.
+ TTest test = {
+ "http://host.com?_escaped_fragment_=a=b%26c=d%23e%2bg%2525", TParseFlags(TFeature::FeaturesAll | TFeature::FeatureEscapedToHashBangFragment), TState::ParsedOK, "http", "", "", "host.com", 80, "/", "", "!a=b&c=d#e+g%25"};
+ TUri url;
+ URL_TEST(url, test);
+ UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host.com/#!a=b&c=d#e+g%25");
+ }
+ }
+
+ Y_UNIT_TEST(testReEncode) {
+ // TUri::ReEncode writes to a stream; a space is expected to come out as "%20".
+ TStringStream encoded;
+ TUri::ReEncode(encoded, "foo bar");
+ UNIT_ASSERT_VALUES_EQUAL(encoded.Str(), "foo%20bar");
+ }
+
+ // URLs containing characters such as '|', '[', ']', '{', '}', '^', '$' and '>'.
+ // test_NonRfcUrls expects each of them to parse with TState::ParsedOK under
+ // TFeature::FeaturesRobot. The trailing nullptr entry is the end-of-table
+ // sentinel (detected with TStringBuf::IsInited()).
+ static const TStringBuf NonRfcUrls[] = {
+ "http://deshevle.ru/price/price=&SrchTp=1&clID=24&BL=SrchTp=0|clID=24&frmID=75&SortBy=P&PreSort=&NmDir=0&VndDir=0&PrDir=0&SPP=44",
+ "http://secure.rollerwarehouse.com/skates/aggressive/skates/c/11[03]/tx/$$$+11[03][a-z]",
+ "http://secure.rollerwarehouse.com/skates/aggressive/skates/tx/$$$+110[a-z]",
+ "http://translate.google.com/translate_t?langpair=en|ru",
+ "http://www.garnier.com.ru/_ru/_ru/our_products/products_trade.aspx?tpcode=OUR_PRODUCTS^PRD_BODYCARE^EXTRA_SKIN^EXTRA_SKIN_BENEFITS",
+ "http://www.km.ru/magazin/view_print.asp?id={1846295A-223B-41DC-9F51-90D5D6236C49}",
+ "http://www.manutd.com/default.sps?pagegid={78F24B85-702C-4DC8-A5D4-2F67252C28AA}&itype=12977&pagebuildpageid=2716&bg=1",
+ "http://www.pokupay.ru/price/price=&SrchTp=1&clID=24&BL=SrchTp=0|clID=24&frmID=75&SPP=35&SortBy=N&PreSort=V&NmDir=0&VndDir=1&PrDir=0",
+ "http://www.rodnoyspb.ru/rest/plager/page[0].html",
+ "http://www.trinity.by/?section_id=46,47,48&cat=1&filters[]=2^_^Sony",
+ "http://translate.yandex.net/api/v1/tr.json/translate?lang=en-ru&text=>",
+ nullptr};
+
+ Y_UNIT_TEST(test_NonRfcUrls) {
+ // Every entry of NonRfcUrls must parse successfully with the robot feature set.
+ const long robotFlags = TFeature::FeaturesRobot;
+ TUri parsed;
+ // Walk the table up to its nullptr-constructed sentinel entry.
+ for (const TStringBuf* entry = NonRfcUrls; entry->IsInited(); ++entry) {
+ UNIT_ASSERT_VALUES_EQUAL(TState::ParsedOK, parsed.Parse(*entry, robotFlags));
+ }
+ }
+
+ // Inputs for test_CheckParseException, which only requires that parsing them
+ // does not throw. The trailing nullptr entry is the end-of-table sentinel.
+ static const TStringBuf CheckParseException[] = {
+ "http://www.'>'.com/?.net/",
+ nullptr};
+
+ Y_UNIT_TEST(test_CheckParseException) {
+ // Parsing the listed inputs may fail, but it must never throw.
+ const long parseFlags = TFeature::FeaturesRobot | TFeature::FeaturesEncode;
+ TUri parsed;
+ for (const TStringBuf* entry = CheckParseException; entry->IsInited(); ++entry) {
+ TString reason;
+ bool threw = true;
+ try {
+ // the returned parse state is deliberately ignored
+ parsed.Parse(*entry, parseFlags);
+ threw = false;
+ } catch (const std::exception& err) {
+ reason = err.what();
+ } catch (...) {
+ reason = "exception thrown";
+ }
+ // Any exception is converted into a test failure naming the offending URL.
+ if (threw)
+ ythrow yexception() << "failed to parse URL [" << *entry << "]: " << reason;
+ }
+ }
+
+ Y_UNIT_TEST(test_PrintPort) {
+ TUri parsed;
+ {
+ // Explicit non-default port: printing only the port field is expected
+ // to give a string that converts to 9100.
+ parsed.Parse("http://srv.net:9100/print", TFeature::FeaturesRecommended);
+ TString portText = parsed.PrintS(TUri::FlagPort);
+ Cdbg << parsed.PrintS() << ',' << parsed.PrintS(TUri::FlagPort) << Endl;
+ UNIT_ASSERT_VALUES_EQUAL(9100, FromString<ui32>(portText));
+ }
+ {
+ // Port 80 on an http URL: the printed port is expected to be empty.
+ parsed.Parse("http://srv.net:80/print", TFeature::FeaturesRecommended);
+ TString portText = parsed.PrintS(TUri::FlagPort);
+ Cdbg << parsed.PrintS() << ',' << parsed.PrintS(TUri::FlagPort) << Endl;
+ UNIT_ASSERT(portText.Empty());
+ }
+ }
+
+ // Inputs that are expected NOT to parse as ParsedOK. Each case lists the
+ // expected state and the field values expected after the failed parse.
+ // NOTE(review): every TUri here is constructed with -1; the meaning of that
+ // constructor argument is not visible in this file - confirm against uri.h.
+ Y_UNIT_TEST(test_ParseFailures) {
+ {
+ // non-numeric port
+ TTest test = {
+ "http://host:port", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""};
+ TUri url(-1);
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "http://javascript:alert(hi)", TFeature::FeaturesRobot, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""};
+ TUri url(-1);
+ URL_TEST(url, test);
+ }
+ {
+ // doubled ':' before the port
+ TTest test = {
+ "http://host::0", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""};
+ TUri url(-1);
+ URL_TEST(url, test);
+ }
+ {
+ // trailing space after the host
+ TTest test = {
+ "http://host ", TFeature::FeaturesAll, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""};
+ TUri url(-1);
+ URL_TEST(url, test);
+ }
+ {
+ TTest test = {
+ "http:00..03", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""};
+ TUri url(-1);
+ URL_TEST(url, test);
+ }
+ {
+ // expected ParsedRootless, with "host" taken as the scheme and port 0
+ TTest test = {
+ "host:00..03", TFeature::FeaturesAll, TState::ParsedRootless, "host", "", "", "", 0, "", "", ""};
+ TUri url(-1);
+ URL_TEST(url, test);
+ }
+ {
+ // expected ParsedBadHost, yet scheme, host, port and path are still filled in
+ TTest test = {
+ "http://roduct;isbn,0307371549;at,aid4c00179ab018www.mcnamarasband.wordpress.com/", TFeature::FeaturesAll, TState::ParsedBadHost, "http", "", "", "roduct;isbn,0307371549;at,aid4c00179ab018www.mcnamarasband.wordpress.com", 80, "/", "", ""};
+ TUri url(-1);
+ URL_TEST(url, test);
+ }
+ {
+ // no scheme, contains a space: the whole input is expected in the path
+ TTest test = {
+ "invalid url", TFeature::FeaturesDefault, TState::ParsedBadFormat, "", "", "", "", 0, "invalid url", "", ""};
+ TUri url(-1);
+ URL_TEST(url, test);
+ }
+ }
+ Y_UNIT_TEST(test_scheme_related_url) {
+ // A "//host/path" reference is expected to parse with an empty scheme.
+ TUri relative;
+ UNIT_ASSERT_VALUES_EQUAL(relative.Parse("//www.hostname.ru/path", TFeature::FeaturesRobot), TState::ParsedOK);
+ UNIT_ASSERT_EQUAL(relative.GetScheme(), TScheme::SchemeEmpty);
+ UNIT_ASSERT_VALUES_EQUAL(relative.GetHost(), "www.hostname.ru");
+ UNIT_ASSERT_VALUES_EQUAL(relative.GetField(TField::FieldPath), "/path");
+
+ // After merging with an https base, only the scheme is expected to be
+ // taken from the base; host and path stay those of the reference.
+ TUri base;
+ UNIT_ASSERT_VALUES_EQUAL(base.Parse("https://trololo.com", TFeature::FeaturesRobot), TState::ParsedOK);
+ UNIT_ASSERT_EQUAL(base.GetScheme(), TScheme::SchemeHTTPS);
+ relative.Merge(base);
+ UNIT_ASSERT_EQUAL(relative.GetScheme(), TScheme::SchemeHTTPS);
+ UNIT_ASSERT_VALUES_EQUAL(relative.GetHost(), "www.hostname.ru");
+ UNIT_ASSERT_VALUES_EQUAL(relative.GetField(TField::FieldPath), "/path");
+ }
+ }
+
+ // Checks InvertDomain: the dot-separated labels of the host part are expected
+ // in reverse order, with scheme, port, path and query left untouched.
+ Y_UNIT_TEST_SUITE(TInvertDomainTest) {
+ Y_UNIT_TEST(TestInvert) {
+ // inputs expected to come back unchanged
+ TString blank;
+ UNIT_ASSERT_EQUAL(InvertDomain(blank), "");
+ TString punctFirst(".:/foo");
+ UNIT_ASSERT_EQUAL(InvertDomain(punctFirst), ".:/foo");
+ TString pathFirst("/foo.bar:");
+ UNIT_ASSERT_EQUAL(InvertDomain(pathFirst), "/foo.bar:");
+ TString singleLabel("ru");
+ UNIT_ASSERT_EQUAL(InvertDomain(singleLabel), "ru");
+ // a leading or trailing dot is expected to move to the other end
+ TString leadingDot(".ru");
+ UNIT_ASSERT_EQUAL(InvertDomain(leadingDot), "ru.");
+ TString trailingDot("ru.");
+ UNIT_ASSERT_EQUAL(InvertDomain(trailingDot), ".ru");
+ TString withPort("www.yandex.ru:80/yandsearch?text=foo");
+ UNIT_ASSERT_EQUAL(InvertDomain(withPort), "ru.yandex.www:80/yandsearch?text=foo");
+ // iterator-range overload: only the first 10 characters are inverted, in place
+ TString partial("www.yandex.ru:80/yandsearch?text=foo");
+ InvertDomain(partial.begin(), partial.begin() + 10);
+ UNIT_ASSERT_EQUAL(partial, "yandex.www.ru:80/yandsearch?text=foo");
+ TString withScheme("https://www.yandex.ru:80//");
+ UNIT_ASSERT_EQUAL(InvertDomain(withScheme), "https://ru.yandex.www:80//");
+ // a URL embedded in the query must not be touched
+ TString nestedUrl("www.yandex.ru:8080/redir.pl?url=https://google.com/");
+ UNIT_ASSERT_EQUAL(InvertDomain(nestedUrl), "ru.yandex.www:8080/redir.pl?url=https://google.com/");
+ }
+ }
+
+ // Test helper: parses `url` with FeaturesRecommended, runs TQueryArgProcessing
+ // on it and stores the re-printed URL in `processed`.
+ // The processing flags are FeatureSortByName | FeatureRewriteDirty, plus
+ // FeatureFilter when a non-null `filter` is supplied; `filterData` is handed
+ // to TQueryArgProcessing together with the filter.
+ // Returns the status reported by TQueryArgProcessing::Process. The parse
+ // state itself is not checked here.
+ TQueryArg::EProcessed ProcessQargs(TString url, TString& processed, TQueryArgFilter filter = nullptr, void* filterData = nullptr) {
+ TUri uri;
+ uri.Parse(url, NUri::TFeature::FeaturesRecommended);
+
+ TQueryArgProcessing processing(TQueryArg::FeatureSortByName | (filter ? TQueryArg::FeatureFilter : 0) | TQueryArg::FeatureRewriteDirty, filter, filterData);
+ auto result = processing.Process(uri);
+ processed = uri.PrintS();
+ return result;
+ }
+
+ // Test helper: returns `url` as printed after ProcessQargs with no filter.
+ TString SortQargs(TString url) {
+ TString sorted;
+ ProcessQargs(url, sorted);
+ return sorted;
+ }
+
+ // Filter callback: `filterData` carries a C string with the argument name to
+ // skip. Returns false for an argument with exactly that name, true otherwise.
+ bool QueryArgsFilter(const TQueryArg& arg, void* filterData) {
+ return arg.Name != static_cast<const char*>(filterData);
+ }
+
+ // Test helper: returns `url` as printed after ProcessQargs with
+ // QueryArgsFilter installed and `name` passed as its filter data.
+ TString FilterQargs(TString url, const char* name) {
+ TString filtered;
+ // const_cast only adapts to the void* filter-data slot; the callback reads it as const char*.
+ ProcessQargs(url, filtered, &QueryArgsFilter, const_cast<char*>(name));
+ return filtered;
+ }
+
+ // Tests for query-argument processing (sorting, filtering, empty-query removal).
+ Y_UNIT_TEST_SUITE(QargsTest) {
+ Y_UNIT_TEST(TestSorting) {
+ // no query, empty query, single argument: expected unchanged
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/"), "http://ya.ru/");
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?"), "http://ya.ru/?");
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?some=value"), "http://ya.ru/?some=value");
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?b=1&a=2"), "http://ya.ru/?a=2&b=1");
+ // note the expected order of the two "a" arguments
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?b=1&a=2&a=3"), "http://ya.ru/?a=3&a=2&b=1");
+
+ // names that are prefixes of each other
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?aaa=3&b=b&a=1&aa=2"), "http://ya.ru/?a=1&aa=2&aaa=3&b=b");
+
+ // three distinct names in different input orders
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?a=1&b=1&c=1"), "http://ya.ru/?a=1&b=1&c=1");
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?b=1&a=1&c=1"), "http://ya.ru/?a=1&b=1&c=1");
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?c=1&a=1&b=1"), "http://ya.ru/?a=1&b=1&c=1");
+
+ // duplicated name/value pairs
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?c=1&a=1&a=1&b=1&c=1&b=1"), "http://ya.ru/?a=1&a=1&b=1&b=1&c=1&c=1");
+
+ // empty values, '=' inside a value, doubled '&'
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?b==&a=&&c="), "http://ya.ru/?a=&b==&c=");
+ }
+
+ Y_UNIT_TEST(TestParsingCorners) {
+ // Only the returned status matters in the first group; the printed URL is discarded.
+ TString printed;
+
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?=", printed), TQueryArg::ProcessedOK);
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some", printed), TQueryArg::ProcessedOK);
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some=", printed), TQueryArg::ProcessedOK);
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/", printed), TQueryArg::ProcessedOK);
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?&", printed), TQueryArg::ProcessedOK);
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?&&", printed), TQueryArg::ProcessedOK);
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some=", printed), TQueryArg::ProcessedOK);
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some==", printed), TQueryArg::ProcessedOK);
+ UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some=&&", printed), TQueryArg::ProcessedOK);
+
+ // degenerate queries are expected to be printed back unchanged
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?="), "http://ya.ru/?=");
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?some=="), "http://ya.ru/?some==");
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?&&"), "http://ya.ru/?&&");
+ UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?a"), "http://ya.ru/?a");
+ }
+
+ Y_UNIT_TEST(TestFiltering) {
+ // the second argument of FilterQargs names the query argument to drop
+ UNIT_ASSERT_STRINGS_EQUAL(FilterQargs("http://ya.ru/?some=value", "missing"), "http://ya.ru/?some=value");
+ UNIT_ASSERT_STRINGS_EQUAL(FilterQargs("http://ya.ru/?b=1&a=2", "b"), "http://ya.ru/?a=2");
+ UNIT_ASSERT_STRINGS_EQUAL(FilterQargs("http://ya.ru/?b=1&a=2&a=3", "a"), "http://ya.ru/?b=1");
+ UNIT_ASSERT_STRINGS_EQUAL(FilterQargs("http://ya.ru/?some=&another=", "another"), "http://ya.ru/?some=");
+ }
+
+ Y_UNIT_TEST(TestRemoveEmptyFeature) {
+ // With FeatureRemoveEmptyQuery the bare '?' is expected to disappear.
+ TUri parsed;
+ parsed.Parse("http://ya.ru/?", NUri::TFeature::FeaturesRecommended);
+
+ TQueryArgProcessing processor(TQueryArg::FeatureRemoveEmptyQuery | TQueryArg::FeatureRewriteDirty);
+ auto status = processor.Process(parsed);
+ UNIT_ASSERT_EQUAL(status, TQueryArg::ProcessedOK);
+ UNIT_ASSERT_STRINGS_EQUAL(parsed.PrintS(), "http://ya.ru/");
+ }
+
+ Y_UNIT_TEST(TestNoRemoveEmptyFeature) {
+ // With no features at all the bare '?' is expected to be kept.
+ TUri parsed;
+ parsed.Parse("http://ya.ru/?", NUri::TFeature::FeaturesRecommended);
+
+ TQueryArgProcessing processor(0);
+ auto status = processor.Process(parsed);
+ UNIT_ASSERT_EQUAL(status, TQueryArg::ProcessedOK);
+ UNIT_ASSERT_STRINGS_EQUAL(parsed.PrintS(), "http://ya.ru/?");
+ }
+ }
+}
diff --git a/library/cpp/uri/uri_ut.h b/library/cpp/uri/uri_ut.h
new file mode 100644
index 00000000000..f8ac6e40927
--- /dev/null
+++ b/library/cpp/uri/uri_ut.h
@@ -0,0 +1,81 @@
+#pragma once
+
+#include "uri.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+namespace NUri {
+ // One table-driven parser test case: the input, the parse flags, and the
+ // expected outcome. Test code fills it with positional aggregate
+ // initialisation, so the member order below must not change.
+ struct TTest {
+ TStringBuf Val; // URL text handed to the parser
+ TParseFlags Flags; // feature flags used for parsing
+ TState::EParsed State; // expected parse state
+ TStringBuf Scheme; // expected value of each field after parsing:
+ TStringBuf User;
+ TStringBuf Pass;
+ TStringBuf Host;
+ ui16 Port; // compared against TUri::GetPort()
+ TStringBuf Path;
+ TStringBuf Query;
+ TStringBuf Frag;
+ };
+
+}
+
+// Failure-message helper: renders both URLs as "[<url1>] <cmp> [<url2>]".
+// Both arguments must provide PrintS(); `cmp` must be a string literal.
+#define URL_MSG(url1, url2, cmp) \
+ (TString("[") + (url1).PrintS() + ("] " cmp " [") + (url2).PrintS() + "]")
+// Assert that url1 and url2 compare equal; both are printed on failure.
+// The comparison uses the macro argument url1 itself, so the caller's
+// variable does not have to be named `url`.
+#define URL_EQ(url1, url2) \
+ UNIT_ASSERT_EQUAL_C(url1, url2, URL_MSG(url1, url2, "!="))
+// Assert that url1 and url2 compare unequal; both are printed on failure.
+#define URL_NEQ(url1, url2) \
+ UNIT_ASSERT_UNEQUAL_C(url1, url2, URL_MSG(url1, url2, "=="))
+
+// Assert that field `fld` of `url` (read with GetField(TField::Field<fld>))
+// equals the member of the same name in `test`.
+#define CMP_FLD(url, test, fld) \
+ UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::Field##fld), test.fld)
+
+// Compare every component of `url` with the expectations stored in `test`:
+// Scheme, User, Pass, Host, Path, Query and Frag through CMP_FLD, and the port
+// through GetPort().
+#define CMP_URL(url, test) \
+ do { \
+ CMP_FLD(url, test, Scheme); \
+ CMP_FLD(url, test, User); \
+ CMP_FLD(url, test, Pass); \
+ CMP_FLD(url, test, Host); \
+ UNIT_ASSERT_VALUES_EQUAL(url.GetPort(), test.Port); \
+ CMP_FLD(url, test, Path); \
+ CMP_FLD(url, test, Query); \
+ CMP_FLD(url, test, Frag); \
+ } while (false)
+
+// Runs one TTest case against `url` using encoding `enc`:
+// 1. Parse test.Val with test.Flags; assert the state equals test.State and
+// that all fields match (CMP_URL). Stop here unless the state is ParsedOK.
+// 2. Round trip: print `url`, re-parse the text into a fresh TUri (with
+// FeatureNoRelPath cleared and FeatureAllowRootless set), then assert the
+// state (skipped when it is ParsedEmpty), the printed form, the fields,
+// the field mask and URL_EQ.
+// 3. If FieldHostAscii is non-empty, repeat the round trip on the text
+// printed with FlagHostAscii, expecting Host to equal that ASCII host.
+// The `break` statements leave the enclosing do { } while (false).
+// The macro declares the locals st, st2, _url, urlstr, urlstr2, hostascii and
+// test2, so arguments must not use those names.
+#define URL_TEST_ENC(url, test, enc) \
+ do { \
+ TState::EParsed st = url.ParseUri(test.Val, test.Flags, 0, enc); \
+ UNIT_ASSERT_VALUES_EQUAL(st, test.State); \
+ CMP_URL(url, test); \
+ if (TState::ParsedOK != st) \
+ break; \
+ TUri _url; \
+ TString urlstr, urlstr2; \
+ urlstr = url.PrintS(); \
+ TState::EParsed st2 = _url.ParseUri(urlstr, \
+ (test.Flags & ~TFeature::FeatureNoRelPath) | TFeature::FeatureAllowRootless, 0, enc); \
+ if (TState::ParsedEmpty != st2) \
+ UNIT_ASSERT_VALUES_EQUAL(st2, test.State); \
+ urlstr2 = _url.PrintS(); \
+ UNIT_ASSERT_VALUES_EQUAL(urlstr, urlstr2); \
+ CMP_URL(_url, test); \
+ UNIT_ASSERT_VALUES_EQUAL(url.GetUrlFieldMask(), _url.GetUrlFieldMask()); \
+ URL_EQ(url, _url); \
+ const TStringBuf hostascii = url.GetField(TField::FieldHostAscii); \
+ if (hostascii.Empty()) \
+ break; \
+ urlstr = url.PrintS(TField::FlagHostAscii); \
+ st2 = _url.ParseUri(urlstr, \
+ (test.Flags & ~TFeature::FeatureNoRelPath) | TFeature::FeatureAllowRootless, 0, enc); \
+ UNIT_ASSERT_VALUES_EQUAL(st2, test.State); \
+ urlstr2 = _url.PrintS(); \
+ UNIT_ASSERT_VALUES_EQUAL(urlstr, urlstr2); \
+ TTest test2 = test; \
+ test2.Host = hostascii; \
+ CMP_URL(_url, test2); \
+ UNIT_ASSERT_VALUES_EQUAL(url.GetUrlFieldMask(), _url.GetUrlFieldMask()); \
+ } while (false)
+
+// URL_TEST_ENC with the encoding fixed to CODES_UTF8.
+#define URL_TEST(url, test) \
+ URL_TEST_ENC(url, test, CODES_UTF8)
diff --git a/library/cpp/uri/ut/ya.make b/library/cpp/uri/ut/ya.make
new file mode 100644
index 00000000000..b2b2c1291a9
--- /dev/null
+++ b/library/cpp/uri/ut/ya.make
@@ -0,0 +1,19 @@
+UNITTEST_FOR(library/cpp/uri)
+
+OWNER(leo)
+
+NO_OPTIMIZE()
+
+NO_WSHADOW()
+
+PEERDIR(
+ library/cpp/html/entity
+)
+
+SRCS(
+ location_ut.cpp
+ uri-ru_ut.cpp
+ uri_ut.cpp
+)
+
+END()
diff --git a/library/cpp/uri/ya.make b/library/cpp/uri/ya.make
new file mode 100644
index 00000000000..8fc808a6af7
--- /dev/null
+++ b/library/cpp/uri/ya.make
@@ -0,0 +1,32 @@
+LIBRARY()
+
+OWNER(
+ mvel
+ g:base
+)
+
+SRCS(
+ assign.cpp
+ common.cpp
+ encode.cpp
+ http_url.h
+ location.cpp
+ other.cpp
+ parse.cpp
+ qargs.cpp
+ uri.cpp
+ encodefsm.rl6
+ parsefsm.rl6
+)
+
+PEERDIR(
+ contrib/libs/libidn
+ library/cpp/charset
+)
+
+END()
+
+RECURSE(
+ benchmark
+ ut
+)