diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/uri | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/uri')
26 files changed, 5890 insertions, 0 deletions
diff --git a/library/cpp/uri/assign.cpp b/library/cpp/uri/assign.cpp new file mode 100644 index 00000000000..ea1955a9e92 --- /dev/null +++ b/library/cpp/uri/assign.cpp @@ -0,0 +1,426 @@ +#include "uri.h" +#include "parse.h" + +#include <contrib/libs/libidn/idna.h> + +#include <library/cpp/charset/recyr.hh> +#include <util/charset/wide.h> +#include <util/memory/tempbuf.h> +#include <util/string/cast.h> +#include <util/system/maxlen.h> +#include <util/system/yassert.h> +#include <util/system/sys_alloc.h> + +namespace NUri { + TMallocPtr<char> TUri::IDNToAscii(const wchar32* idna) { + // XXX: don't use punycode_encode directly as it doesn't include + // proper stringprep and splitting on dot-equivalent characters + char* buf; + static_assert(sizeof(*idna) == sizeof(ui32), "fixme"); + if (IDNA_SUCCESS != idna_to_ascii_4z((const uint32_t*)idna, &buf, 0)) + buf = nullptr; + return buf; + } + + TMallocPtr<char> TUri::IDNToAscii(const TStringBuf& host, ECharset enc) { + TTempBuf buf(sizeof(wchar32) * (1 + host.length())); + wchar32* wbuf = reinterpret_cast<wchar32*>(buf.Data()); + + const size_t written = NDetail::NBaseOps::Recode(host, wbuf, enc).length(); + wbuf[written] = 0; + + return IDNToAscii(wbuf); + } + + TStringBuf TUri::HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc) { + TStringBuf outhost; // store the result here before returning it, to get RVO + + size_t buflen = 0; + + if (hasExtended && !allowIDN) + return outhost; // definitely can't convert + + // charset-recode: RFC 3986, 3.2.2, requires percent-encoded non-ASCII + // chars in reg-name to be UTF-8 so convert to UTF-8 prior to decoding + const bool recoding = CODES_UTF8 != enc && hasExtended; + if (recoding) { + size_t nrd, nwr; + buflen = host.length() * 4; + buf.Reset(static_cast<char*>(y_allocate(buflen))); + if (RECODE_OK != Recode(enc, CODES_UTF8, host.data(), buf.Get(), host.length(), buflen, nrd, nwr)) + return outhost; + host = 
TStringBuf(buf.Get(), nwr); + } + + // percent-decode + if (0 == buflen) { + buflen = host.length(); + buf.Reset(static_cast<char*>(y_allocate(buflen))); + } + // decoding shortens so writing over host in buf is OK + TMemoryWriteBuffer out(buf.Get(), buflen); + TEncoder decoder(out, FeatureDecodeANY | FeatureToLower); + const long outFlags = decoder.ReEncode(host); + hasExtended = 0 != (outFlags & FeatureEncodeExtendedASCII); + + // check again + if (hasExtended && !allowIDN) + return outhost; + + host = out.Str(); + + // convert to punycode if needed + if (!hasExtended) { + outhost = host; + return outhost; + } + + TMallocPtr<char> puny; + try { + puny = IDNToAscii(host); + } catch (const yexception& /* exc */) { + } + + if (!puny) { + // XXX: try user charset unless UTF8 or converted to it + if (CODES_UTF8 == enc || recoding) + return outhost; + try { + puny = IDNToAscii(host, enc); + } catch (const yexception& /* exc */) { + return outhost; + } + if (!puny) + return outhost; + } + + buf = puny; + outhost = buf.Get(); + + return outhost; + } + + TStringBuf TUri::HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc) { + // find what we have + long haveFlags = 0; + for (size_t i = 0; i != host.length(); ++i) + haveFlags |= TEncoder::GetFlags(host[i]).FeatFlags; + + // interested in encoded characters or (if IDN is allowed) extended ascii + TStringBuf outhost; + const bool haveExtended = haveFlags & FeatureEncodeExtendedASCII; + + if (!haveExtended || allowIDN) { + if (!haveExtended && 0 == (haveFlags & FeatureDecodeANY)) + outhost = host; + else + outhost = HostToAscii(host, buf, haveExtended, allowIDN, enc); + } + + return outhost; + } + + static inline bool AppendField(TMemoryWriteBuffer& out, TField::EField fld, const TStringBuf& val, long flags) { + if (val.empty()) + return false; + if (flags & TFeature::FeaturesAllEncoder) + TUri::ReEncodeField(out, val, fld, flags); + else + out << val; + return true; + } + + TState::EParsed 
TUri::AssignImpl(const TParser& parser, TScheme::EKind defscheme) { + Clear(); + + TState::EParsed ret = parser.State; + if (ParsedBadFormat <= ret) + return ret; + + const TSection& scheme = parser.Get(FieldScheme); + const TSchemeInfo& schemeInfo = SetSchemeImpl(parser.Scheme); + + // set the scheme always if available + if (schemeInfo.Str.empty() && scheme.IsSet()) + FldSet(FieldScheme, scheme.Get()); + + if (ParsedOK != ret) + return ret; + + size_t buflen = 0; + + // special processing for fields + + const bool convertIDN = parser.Flags & FeatureConvertHostIDN; + long flags = parser.Flags.Allow; + if (convertIDN) + flags |= FeatureAllowHostIDN | FeatureCheckHost; + + // process non-ASCII host for punycode + + TMallocPtr<char> hostptr; + TStringBuf hostascii; // empty: use host field; non-empty: ascii + bool hostConverted = false; // hostascii is empty or the original + const TSection& host = parser.Get(FieldHost); + if (host.IsSet() && !FldIsSet(FieldHost)) { + const bool allowIDN = (flags & FeatureAllowHostIDN); + const TStringBuf hostbuf = host.Get(); + + // if we know we have and allow extended-ASCII chars, no need to check further + if (allowIDN && (host.GetFlagsAllPlaintext() & FeatureEncodeExtendedASCII)) + hostascii = HostToAscii(hostbuf, hostptr, true, true, parser.Enc); + else + hostascii = HostToAscii(hostbuf, hostptr, allowIDN, parser.Enc); + + if (hostascii.empty()) + ret = ParsedBadHost; // exists but cannot be converted + else if (hostbuf.data() != hostascii.data()) { + hostConverted = true; + buflen += 1 + hostascii.length(); + if (convertIDN) + FldMarkSet(FieldHost); // so that we don't process host below + } + } + + // add unprocessed fields + + for (int idx = 0; idx < FieldUrlMAX; ++idx) { + const EField fld = EField(idx); + const TSection& section = parser.Get(fld); + if (section.IsSet() && !FldIsSet(fld)) + buflen += 1 + section.EncodedLen(); // includes null + } + if (0 == buflen) // no more sections set? + return ret; + + // process #! 
fragments + // https://developers.google.com/webmasters/ajax-crawling/docs/specification + + static const TStringBuf escFragPrefix(TStringBuf("_escaped_fragment_=")); + + bool encHashBangFrag = false; + TStringBuf qryBeforeEscapedFragment; + TStringBuf qryEscapedFragment; + do { + if (FldIsSet(FieldFrag) || FldIsSet(FieldQuery)) + break; + + const TSection& frag = parser.Get(FieldFrag); + if (frag.IsSet()) { + if (0 == (parser.Flags & FeatureHashBangToEscapedFragment)) + break; + const TStringBuf fragbuf = frag.Get(); + if (fragbuf.empty() || '!' != fragbuf[0]) + break; + encHashBangFrag = true; + // '!' will make space for '&' or '\0' if needed + buflen += escFragPrefix.length(); + buflen += 2 * fragbuf.length(); // we don't know how many will be encoded + } else { + const TSection& qry = parser.Get(FieldQuery); + if (!qry.IsSet()) + break; + // FeatureHashBangToEscapedFragment has preference + if (FeatureEscapedToHashBangFragment != (parser.Flags & FeaturesEscapedFragment)) + break; + qry.Get().RSplit('&', qryBeforeEscapedFragment, qryEscapedFragment); + if (!qryEscapedFragment.StartsWith(escFragPrefix)) { + qryEscapedFragment.Clear(); + break; + } + qryEscapedFragment.Skip(escFragPrefix.length()); + buflen += 2; // for '!' 
and '\0' in fragment + buflen -= escFragPrefix.length(); + } + } while (false); + + // now set all fields prior to validating + + Alloc(buflen); + + TMemoryWriteBuffer out(Buffer.data(), Buffer.size()); + for (int idx = 0; idx < FieldUrlMAX; ++idx) { + const EField fld = EField(idx); + + const TSection& section = parser.Get(fld); + if (!section.IsSet() || FldIsSet(fld)) + continue; + + if (FieldQuery == fld && encHashBangFrag) + continue; + + if (FieldFrag == fld && qryEscapedFragment.IsInited()) + continue; + + char* beg = out.Buf(); + TStringBuf val = section.Get(); + long careFlags = section.GetFlagsEncode(); + + switch (fld) { + default: + break; + + case FieldQuery: + if (qryEscapedFragment.IsInited()) { + const EField dstfld = FieldFrag; // that's where we will store + out << '!'; + if (!qryEscapedFragment.empty()) + ReEncodeToField(out, qryEscapedFragment, fld, FeatureDecodeANY | careFlags, dstfld, FeatureDecodeANY | parser.GetFieldFlags(dstfld)); + FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf())); + if (qryBeforeEscapedFragment.empty()) + continue; + out << '\0'; + beg = out.Buf(); + val = qryBeforeEscapedFragment; + } + break; + + case FieldFrag: + if (encHashBangFrag) { + const EField dstfld = FieldQuery; // that's where we will store + const TSection& qry = parser.Get(dstfld); + if (qry.IsSet()) + if (AppendField(out, dstfld, qry.Get(), qry.GetFlagsEncode())) + out << '&'; + out << escFragPrefix; + val.Skip(1); // skip '!' 
+ ReEncodeToField(out, val, fld, careFlags, dstfld, parser.GetFieldFlags(dstfld)); + FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf())); + continue; + } + break; + } + + AppendField(out, fld, val, careFlags); + char* end = out.Buf(); + + if (careFlags & FeaturePathOperation) { + if (!PathOperation(beg, end, PathOperationFlag(parser.Flags))) + return ParsedBadPath; + + Y_ASSERT(beg >= out.Beg()); + out.SetPos(end); + } + + FldSetNoDirty(fld, TStringBuf(beg, end)); + + // special character case + const long checkChars = section.GetFlagsAllPlaintext() & FeaturesCheckSpecialChar; + if (0 != checkChars) { // has unencoded special chars: check permission + const long allowChars = parser.GetFieldFlags(fld) & checkChars; + if (checkChars != allowChars) + ret = ParsedBadFormat; + } + + out << '\0'; + } + + if (hostConverted) { + char* beg = out.Buf(); + out << hostascii; + char* end = out.Buf(); + const EField fld = convertIDN ? FieldHost : FieldHostAscii; + FldSetNoDirty(fld, TStringBuf(beg, end)); + out << '\0'; + } + + Buffer.Resize(out.Len()); + + if (GetScheme() == SchemeEmpty && SchemeEmpty != defscheme) { + if (SchemeUnknown == defscheme) + ret = ParsedBadScheme; + else + SetSchemeImpl(defscheme); + } + + if (0 == (parser.Flags & FeatureAllowEmptyPath)) + CheckMissingFields(); + + const TStringBuf& port = GetField(FieldPort); + if (!port.empty()) { + if (!TryFromString<ui16>(port, Port)) + ret = ParsedBadPort; + } + + if (ParsedOK != ret) + return ret; + + // run validity checks now that all fields are set + + // check the host for DNS compliance + do { + if (0 == (flags & FeatureCheckHost)) + break; + if (hostascii.empty()) + hostascii = GetField(FieldHost); + if (hostascii.empty()) + break; + // IP literal + if ('[' == hostascii[0] && ']' == hostascii.back()) + break; + ret = CheckHost(hostascii); + if (ParsedOK != ret) + return ret; + } while (false); + + return ret; + } + + TState::EParsed TUri::ParseImpl(const TStringBuf& url, const TParseFlags& flags, ui32 
maxlen, TScheme::EKind defscheme, ECharset enc) { + Clear(); + + if (url.empty()) + return ParsedEmpty; + + if (maxlen > 0 && url.length() > maxlen) + return ParsedTooLong; + + const TParser parser(flags, url, enc); + + return AssignImpl(parser, defscheme); + } + + TState::EParsed TUri::Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& url_base, ui32 maxlen, ECharset enc) { + const TParseFlags flags1 = flags.Exclude(FeatureNoRelPath); + TState::EParsed ret = ParseImpl(url, url_base.empty() ? flags : flags1, maxlen, SchemeEmpty, enc); + if (ParsedOK != ret) + return ret; + + if (!url_base.empty() && !IsValidAbs()) { + TUri base; + ret = base.ParseImpl(url_base, flags, maxlen, SchemeEmpty, enc); + if (ParsedOK != ret) + return ret; + Merge(base, PathOperationFlag(flags)); + } + + Rewrite(); + return ret; + } + + TState::EParsed TUri::Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags, ui32 maxlen, ECharset enc) { + const TState::EParsed ret = ParseImpl(url, flags, maxlen, SchemeEmpty, enc); + if (ParsedOK != ret) + return ret; + + if (!IsValidAbs()) + Merge(base, PathOperationFlag(flags)); + + Rewrite(); + return ret; + } + + TState::EParsed TUri::ParseAbsUri(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) { + const TState::EParsed ret = ParseImpl( + url, flags | FeatureNoRelPath, maxlen, defscheme, enc); + if (ParsedOK != ret) + return ret; + + if (IsNull(FlagHost)) + return ParsedBadHost; + + Rewrite(); + return ParsedOK; + } + +} diff --git a/library/cpp/uri/benchmark/main.cpp b/library/cpp/uri/benchmark/main.cpp new file mode 100644 index 00000000000..d39704877e9 --- /dev/null +++ b/library/cpp/uri/benchmark/main.cpp @@ -0,0 +1,46 @@ +#include <library/cpp/uri/uri.h> + +#include <library/cpp/testing/benchmark/bench.h> + +#include <util/generic/vector.h> + +const TString URLS[] = { + "http://www.TEST.Ru:80/InDex.html", + "www.ya.ru/index.html", + 
"https://workplace.z.yandex-team.ru/di#vertical=drive&datePreset=week", + "https://warden.z.yandex-team.ru/components/web/report?filter_type=action_items&filter_status=total&filter_period=review", + "https://meduza.io/news/2021/05/01/italiya-vozobnovila-vydachu-turisticheskih-viz-v-moskve", + "https://gcc.gnu.org/projects/cxx-status.html#cxx20", + "https://github.com/llvm/llvm-project/commits/main/libcxx", + "https://photos.google.com/share/AF1QipNi8VN2pw2Ya_xCV8eFgzEZmiXDy1-GwhXbqFtvXoH3HypF10as9puV8FdoVZpOZA?key=WkZjQTIxQTM5a01oZkNUYTE2ZllKTVJKZk1CMTR3", + "https://mag.auto.ru/article/ladasolaris/?from=mag_web_block&utm_campaign=ladasolaris&utm_content=populyarnoe&utm_source=mag_web_block&utm_medium=cpc", + "https://yabs.yandex.ru/count/WZ4ejI_zODW1FH40j1nmE7kaiN1MJWK0s08GWY0nuM6EO000000useqKG0H846344d30nU21pYI00GcG0VpCzQaucBW1mCNWWOR1co7e0QG2y0A-meRx0v03yCE0RzqD-0Jozzu1Y0Nozzu1a0Nozzu1m0Nvks_D2-U0-KK5Iga7StZ0vgZ8Ao2m1u20c0ou1-u9q0SIW870W822W07u2DoYy82d4W10oQeB4EBQlCbpU0002zaBbLJ1w0lozzu1y0i6w0oNm808WWuaCaOrC30oGaKjHaCoHIqqDpWoBJX1HZajCaD3HJGtDKH3Gp7Dbvo7cB_HWagW3i24FO0Gm-3DmA8G0w7W4e606EaIQREKnW19jq8oc1C8g1E2WkxBsSo3dXRW4_BttW6W5FBttW6e5FBttW7G5EUCwadO583Nge06w1IC0j0LWDUgW0RO5S6AzkoZZxpyO_2G5i41e1RGWVs31iaMWHUe5md05xGIs1V0X3sm68AikOG6q1WG-1ZKfU3zXERBdee1W1cG6G6W6Qe3i1cu6T8P4dbXOdDVSsLoTcLoBt8rCZGjCkWPWC83y1c0mWE16l__eqZnoGHLc1hyy8W2i1hotyIEmftqxKJr6W40000R02tR-6NBeHRAQn6iQSWtbmq2DFDwGK8a9Muu2wvTOhNWE4vj3Fy-LbbUXS4Y0NLWZWE9MX1ZSNyOanc8rS-k0u1LdAMo0O-fHvO33T0LBwYZBzcF7h0qb3q0~1", + "https://yandex.ru/pogoda/nowcast/?utm_campaign=alert&utm_content=alert_separate&utm_medium=web&utm_source=home&utm_term=nowcast&morda_geolocation=1", + 
"https://an.yandex.ru/count/WluejI_zOE02fHS052S_YplH_yoFzGK0u0CGWY0nncwAO000000uYgLImfs4aOKAW06Suha2Y07-gFqAa07-vu2bpu20W0AO0VxdWALFe07Ug07Uk076nFll8S010jW1w9_VcG7W0Soxo1te0Ue40Q02vAG1y0Aasfk_0SOA-0IzX9W1Y0MzX9W1a0Nav9y1e0MCiIwe1ShJ9h05ojCck0NkpoZ9qoOhae0mkgnJ5wa7cB8OEOwtEX-m1u20a2Iu1u05ibB92YPyFmDpJFK_D7ddf9Yo002blSLG6C7e2xs4c07m2mc83BIDthu1gGpzGrRLHHZfF-WCbmBW3OA0mC60288E93OtGqOmC4L4BJ4sDaCjD4P5H2r1EJX5BKKuCZCvGZX4DqGtgwI2XAENwwcOvUBgu_6jdH_P3u0Gz8sO7OWGpz-nX0e1eU0HoV740-WHhS3enS2ScyBwO3_lGEaI3kz-5M87zWKoc1C4g1Eli-_mek7ml1RW4-xFA8WKqRAuZvJoWDi8e1JkpoYe5EJadudj_uC6w1I40iWLhPEErFa4q1MEz9w41jWLmOhsxAEFlFnZyA0Mq87zWmR95j0Mj8tUlW615vWNWfsO8wWN2RWN0S0NjHBO5y24FUWN0PaOe1WBi1ZIlwc41hWO0j0O4FWOeQI_pOMKhhOPW1c96MWs1G000000a1a1e1cg0x0Pk1d_0T8P4dbXOdDVSsLoTcLoBt8rCZCjCkWPWC83y1c0mWE16l__fs_4tYo3a1g0W06O6l70j06m6hkouUoAwFByhm6u6W7r6W4000226nmnDZ4vDZWrC3OnBZasE30pBZWrDJSmBZOuDJSt9I1HX8PYb34CCgLmxEvDy50h8PoKR4c2QKHEe8YaRoCEXF5m_E1rNC99Bpm6wIRbNGAU6GiQ3qguwKm8cq7la7pmUZxo079vS26KW_bTaCPY9QnjKrUHau50oq6yQCyn5TW0kBtjBWG2fE3FRk2DVwuirPJ_lvLb3GlkDKAujdLv~1?stat-id=1", + "https://rasp.yandex.ru/?utm_source=yamain&utm_medium=geoblock&utm_campaign=main", + 
"https://dsp-rambler.ru/click?url=hs31lTJpNQdz5IfhdiQZitCij144sWuuLcC3jopSDRPYYlPZNuD1x6OEJg74u0nbP0J0oXWXw7awYsCTgG7Cr*koGqxBBhOnzB5aZ6aLfCVVGlIuP3L194y025VLJAsK04sZwSvJbxEYfnbOWKPgQHOf7c8Gqkope95-kr--aTxpsg18jiwVdEPBENe3F2Iahm3yL*3vvzt-t7Vq6bANhL3DiJglfLw2WVd4tAhoAepFV*QybPJFoGHbdOWGMTyzpWMxWPLKb7MRFQWj1K*58qPyfNkfyQ72vvzjviirTM57xng8Cxbi08-HzM5x7imDOYIY8EvXOqfU3Q0KWsyO1RC1yHHVinVxthuIb16zLjg2aoYFpz-OLiTYhlmk1vyK9t9fLz8*Oiez9i*7TqIwcWZyX5gUFuOOJ4sTWbeQLe6IQEShvKIj7v9yBZRLkRmdHLfrREuTNpBMBtRG80MIxwXyt6SjjFOhSKtK-yDA19Wawzgw9fNOy9DW0TDAehQkPUTz*5-htmszyUqJWk5ovqoyHV3acnOI2-klqMCMfU9w3*GOYS0CuNTrggGCQH376EaeQthtwiUcabSarBEocGEsW7n27kIsrtYh2-SZwPvKC1Ek3dg35nuEO-MWNPMmqJvAhBGXHF*EQkcB3eLUEluJmwTxqPRv00M4PNgdYsYsgKYPU6MMJTxbH7fA*Q2vA7WErGONTSzhSOeNLPP4vR9WalRvzllDB3XH4bNx6Pleb3ZfWmoYTNN0Lux3-VxOSjIvDDGzMirfOVPKZB4qVQWsP4WHCrRgsijW43cKGhcQ0dPORYO5v0xhzMjoZ0qDEsaiXBkfRXccnQ3QGaXk2PC1vu*Zlj0qgJfO5i8e66z4HEiMWRF4JH8ZsbZqrUzKXX-WpPQuVA0MOslhzq8m3KS3UIEurWCn80utY4AWCuHzGooJbb*PcHhYMqIBbLcJW76RK9HQjI8Bxiu4C9wECXeWIqeVuHzbmb7PGizIDVwg-g8I-zrBgd8OH3kWtC09YwSB-F9wOHrrcBG*Sv6fdnwubW1ndv0V1jsok2DhMT9IFBOoa-brtWYIdttkRs2J4m*Ai5IgVOagS2wyboiHKptd7aQ7j1YnJENZbFfs2nSwvPTvvSA8w-vCJHo-xEW6tPWaAOrVVRZscjNb4HovTUKBhxrCm8cZYw0ZahlRWGDpB0QkiI-xSv7YdYzoAQMvEOF8h*MKTn1Had8cI2FJ3WtcaT3siShD*APePK6dwqGsNJRz3lfbeX*hykpwK8kTumfS6z51bv06bjahc*fo1vjNt8ivt2BJPWqnGkYH9-8r76iBia7d1zKKnzfqk-mu3m9eP0kiKkoqMKaugV2muZlt5h4ps29ikmTIc-8vYtwHOdLDay7PUhTC4tKepBVRZh6nXrIa5POmkVNV7hVfeoMXqlid2B-2LM3CWCo*W2r23aefVN8mi3t-dnWNUlVgurAc*674C76Py8Qdr0*EJqmYjhrQw6jbm*nB3O-kd1aYqWXWC3Msg1a5r9sRu1WVLKzmwwzjPX6b44R5ULVhu1OqH6*O7hFIbN584lUhWM6g1nWFqhwhN3**Bam802sRvZjguTILo*UAH7WW1DRRG5MsTs-ZP3tOFMkQKWuJ3LRLDXtnkyN25S0LYEmH8R0vTstUmFwafeJSmm90Iuseu9DKArqrb1Wn2cZv2zgAEYy66U7kkBTQSC76WlwBJTVpoK6MUohrEll23wivbnhNaGzaSe6kWRq1ItpFkqv9gXkCAAAAuty8CgAAAAA", + 
"https://news.rambler.ru/moscow_city/46342947-pogoda-v-moskve-sinoptiki-poobeschali-moskvicham-silnyy-liven-blizhayshey-nochyu/?utm_source=head&utm_campaign=self_promo&utm_medium=news&utm_content=news", +}; + +Y_CPU_BENCHMARK(Parsing, iface) { + for (size_t i = 0; i < iface.Iterations(); ++i) { + for (auto&& url : URLS) { + NUri::TUri uri; + auto parseResult = uri.Parse(url, uri.FeaturesAll); + Y_DO_NOT_OPTIMIZE_AWAY(parseResult); + Y_VERIFY(parseResult == NUri::TState::ParsedOK, "cannot parse %s: %d", url.c_str(), static_cast<ui32>(parseResult)); + } + } +} + +Y_CPU_BENCHMARK(ParsingAndCopying, iface) { + for (size_t i = 0; i < iface.Iterations(); ++i) { + for (auto&& url : URLS) { + NUri::TUri uri; + auto parseResult = uri.Parse(url, uri.FeaturesAll); + Y_VERIFY(parseResult == NUri::TState::ParsedOK, "cannot parse %s: %d", url.c_str(), static_cast<ui32>(parseResult)); + auto copy = uri; + Y_DO_NOT_OPTIMIZE_AWAY(copy); + } + } +} diff --git a/library/cpp/uri/benchmark/ya.make b/library/cpp/uri/benchmark/ya.make new file mode 100644 index 00000000000..77ea238de71 --- /dev/null +++ b/library/cpp/uri/benchmark/ya.make @@ -0,0 +1,17 @@ +Y_BENCHMARK() + +OWNER( + svshevtsov + g:base +) + +PEERDIR( + library/cpp/testing/benchmark + library/cpp/uri +) + +SRCS( + main.cpp +) + +END() diff --git a/library/cpp/uri/common.cpp b/library/cpp/uri/common.cpp new file mode 100644 index 00000000000..05af1e57d18 --- /dev/null +++ b/library/cpp/uri/common.cpp @@ -0,0 +1,115 @@ +#include "common.h" + +#include <util/generic/map.h> +#include <util/generic/singleton.h> + +namespace NUri { + static_assert(TFeature::FeatureMAX <= sizeof(unsigned long) * 8, "expect TFeature::FeatureMAX <= sizeof(unsigned long) * 8"); + + const TSchemeInfo TSchemeInfo::Registry[] = { + TSchemeInfo(TScheme::SchemeEmpty, TStringBuf()), // scheme is empty and inited + TSchemeInfo(TScheme::SchemeHTTP, TStringBuf("http"), TField::FlagHost | TField::FlagPath, 80), + TSchemeInfo(TScheme::SchemeHTTPS, 
TStringBuf("https"), TField::FlagHost | TField::FlagPath, 443), + /* RFC 1738 gives ftp the default port 21 (control connection); 20 is only the active-mode data port */ + TSchemeInfo(TScheme::SchemeFTP, TStringBuf("ftp"), TField::FlagHost | TField::FlagPath, 21), + TSchemeInfo(TScheme::SchemeFILE, TStringBuf("file"), TField::FlagPath), + TSchemeInfo(TScheme::SchemeWS, TStringBuf("ws"), TField::FlagHost | TField::FlagPath, 80), + TSchemeInfo(TScheme::SchemeWSS, TStringBuf("wss"), TField::FlagHost | TField::FlagPath, 443), + // add above + TSchemeInfo(TScheme::SchemeUnknown, TStringBuf()) // scheme is empty and uninited + }; + + namespace { + /* ordering for the scheme map, built on CompareNoCase */ + struct TLessNoCase { + bool operator()(const TStringBuf& lt, const TStringBuf& rt) const { + return 0 > CompareNoCase(lt, rt); + } + }; + + /* maps a scheme name to its kind; filled from every Registry entry that precedes SchemeUnknown */ + class TSchemeInfoMap { + typedef TMap<TStringBuf, TScheme::EKind, TLessNoCase> TdMap; + TdMap Map_; + + public: + TSchemeInfoMap() { + for (int i = TScheme::SchemeEmpty; i < TScheme::SchemeUnknown; ++i) { + const TSchemeInfo& info = TSchemeInfo::Get(TScheme::EKind(i)); + Map_.insert(std::make_pair(info.Str, info.Kind)); + } + } + + /* names that are not in the map resolve to SchemeUnknown */ + TScheme::EKind Get(const TStringBuf& scheme) const { + const TdMap::const_iterator it = Map_.find(scheme); + return Map_.end() == it ? 
TScheme::SchemeUnknown : it->second; + } + + static const TSchemeInfoMap& Instance() { + return *Singleton<TSchemeInfoMap>(); + } + }; + + } + + /* Looks the scheme name up in TSchemeInfoMap and returns the Registry entry for the resulting kind; names that are not found give the SchemeUnknown entry. */ const TSchemeInfo& TSchemeInfo::Get(const TStringBuf& scheme) { + return Registry[TSchemeInfoMap::Instance().Get(scheme)]; + } + + /* Returns the enumerator name of a parse state as a string literal. ParsedOpaque has the same value as ParsedRootless and is therefore reported as "ParsedRootless"; values without a case give "Parsed[Unknown]". */ const char* ParsedStateToString(const TState::EParsed& t) { + switch (t) { + case TState::ParsedOK: + return "ParsedOK"; + case TState::ParsedEmpty: + return "ParsedEmpty"; + case TState::ParsedRootless: + return "ParsedRootless"; + case TState::ParsedBadFormat: + return "ParsedBadFormat"; + case TState::ParsedBadPath: + return "ParsedBadPath"; + case TState::ParsedTooLong: + return "ParsedTooLong"; + case TState::ParsedBadPort: + return "ParsedBadPort"; + case TState::ParsedBadAuth: + return "ParsedBadAuth"; + case TState::ParsedBadScheme: + return "ParsedBadScheme"; + case TState::ParsedBadHost: + return "ParsedBadHost"; + default: + return "Parsed[Unknown]"; + } + } + + /* Returns a lowercase display name for a URI field as a string literal; any value without a case gives "Field[Unknown]". */ const char* FieldToString(const TField::EField& t) { + switch (t) { + case TField::FieldScheme: + return "scheme"; + case TField::FieldUser: + return "username"; + case TField::FieldPass: + return "password"; + case TField::FieldHost: + return "host"; + case TField::FieldHostAscii: + return "hostascii"; + case TField::FieldPort: + return "port"; + case TField::FieldPath: + return "path"; + case TField::FieldQuery: + return "query"; + case TField::FieldFrag: + return "fragment"; + default: + return "Field[Unknown]"; + } + } + + /* Returns the registered scheme string for a kind; kinds whose registered string is empty are reported as "empty" (SchemeEmpty) or "unknown" (anything else). */ const char* SchemeKindToString(const TScheme::EKind& t) { + const TSchemeInfo& info = TSchemeInfo::Get(t); + if (!info.Str.empty()) + return info.Str.data(); + return TScheme::SchemeEmpty == t ? 
"empty" : "unknown"; + } + +} diff --git a/library/cpp/uri/common.h b/library/cpp/uri/common.h new file mode 100644 index 00000000000..80253577635 --- /dev/null +++ b/library/cpp/uri/common.h @@ -0,0 +1,511 @@ +#pragma once + +#include <util/stream/output.h> +#include <util/system/compat.h> +#include <util/generic/strbuf.h> + +namespace NUri { + namespace NEncode { + class TEncoder; + class TEncodeMapperBase; + struct TCharFlags; + } + + namespace NParse { + class TRange; + } + + class TParser; + + struct TField { +#define FIELD_NAME(f) Field##f +#define FIELD_FLAG(f) Flag##f = 1U << FIELD_NAME(f) + + enum EField { + FIELD_NAME(Scheme), + FIELD_NAME(User), + FIELD_NAME(Pass), + FIELD_NAME(Host), + FIELD_NAME(Port), + FIELD_NAME(Path), + FIELD_NAME(Query), + FIELD_NAME(Frag), + + // add fields above + FieldUrlMAX, + // reset count so actual field offsets are not interrupted + FieldUrlLast = FieldUrlMAX - 1, + // add extra fields below + + FIELD_NAME(HostAscii), + + // add extra fields above + FieldAllMAX, + // add aliases below + + FieldUsername = FieldUser, + FieldPassword = FieldPass, + FieldFragment = FieldFrag, + }; + + enum EFlags { + FIELD_FLAG(Scheme), + FIELD_FLAG(User), + FIELD_FLAG(Pass), + FIELD_FLAG(Host), + FIELD_FLAG(Port), + FIELD_FLAG(Path), + FIELD_FLAG(Query), + FIELD_FLAG(Frag), + FIELD_FLAG(UrlMAX), + FIELD_FLAG(HostAscii), + FIELD_FLAG(AllMAX), + + FlagHostPort = FlagHost | FlagPort, + FlagAuth = FlagUser | FlagPass, + FlagFragment = FlagFrag, + FlagAction = FlagScheme | FlagHostPort | FlagPath, + FlagNoFrag = FlagAction | FlagQuery, + FlagUrlFields = FlagUrlMAX - 1, + FlagAll = FlagUrlFields, // obsolete, for backwards compatibility + FlagAllFields = FlagAllMAX - 1 + }; + +#undef FIELD_NAME +#undef FIELD_FLAG + }; + + struct TState { + enum EParsed { + ParsedOK = 0, + ParsedEmpty = 1, + ParsedOpaque = 2, + ParsedRootless = ParsedOpaque, + ParsedBadFormat, // must follow all non-error states immediately + ParsedBadPath, + ParsedTooLong, + 
ParsedBadPort, + ParsedBadAuth, + ParsedBadScheme, + ParsedBadHost, + + // add before this line + ParsedMAX + }; + }; + + struct TScheme { + // don't forget to add a matching TSchemeInfo::Registry entry (in the same order) + enum EKind { + SchemeEmpty + // add schemes below this line + , + SchemeHTTP, + SchemeHTTPS, + SchemeFTP, + SchemeFILE, + SchemeWS, + SchemeWSS + // add schemes above this line + , + SchemeUnknown + }; + }; + + class TFeature { + friend class NEncode::TEncoder; + friend class NEncode::TEncodeMapperBase; + friend struct NEncode::TCharFlags; + friend class TParser; + friend class NParse::TRange; + +#define FEATURE_NAME(f) _BitFeature##f +#define FEATURE_FLAG_NAME(f) Feature##f +#define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1UL << FEATURE_NAME(f) + + protected: + enum EBit { + //============================== + // Cases interpreted as errors: + //============================== + + // allows authorization user/password in URL + FEATURE_NAME(AuthSupported), + + // allows all known schemes in URL + FEATURE_NAME(SchemeKnown), + + // allows all schemes, not only known + FEATURE_NAME(SchemeFlexible), + + // allow opaque (RFC 2396) or rootless (RFC 3986) urls + FEATURE_NAME(AllowRootless), + + //============================== + // Cases interpreted for processing (if required): + // (these affect the result of the Parse method) + //============================== + + // path needs normalization + // (simplification of directory tree: /../, /./, etc.) 
+ FEATURE_NAME(PathOperation), + + // don't force empty path to "/" + FEATURE_NAME(AllowEmptyPath), + + // in scheme and host segments: + // change upper case letters to lower case + FEATURE_NAME(ToLower), + + // decode unreserved symbols + FEATURE_NAME(DecodeUnreserved), + + // legacy: decode standard symbols which may be safe for some fields + FEATURE_NAME(DecodeStandardExtra), + + // decode symbols allowed (not necessarily safe to decode) only for a given field + // (do not use directly; presumably the FeatureDecodeAllowed mask below is meant, as no FeatureDecodeSafe mask is defined) + FEATURE_NAME(DecodeFieldAllowed), + + // handling of spaces + FEATURE_NAME(EncodeSpace), + + // in query segment: change escaped space to '+' + FEATURE_NAME(EncodeSpaceAsPlus), + + // escape all string 'markup' symbols + FEATURE_NAME(EncodeForSQL), + + // encoding of extended ascii symbols (8-bit) + FEATURE_NAME(EncodeExtendedASCII), + + // decoding of extended ascii symbols (8-bit) + FEATURE_NAME(DecodeExtendedASCII), + + // encoding of extended delimiter set + FEATURE_NAME(EncodeExtendedDelim), + + // decoding of extended delimiter set + FEATURE_NAME(DecodeExtendedDelim), + + // control characters [0x00 .. 0x20) + FEATURE_NAME(EncodeCntrl), + + // raw percent character + FEATURE_NAME(EncodePercent), + + // hash fragments + // https://developers.google.com/webmasters/ajax-crawling/docs/specification + // move and encode #! 
fragments to the query + FEATURE_NAME(HashBangToEscapedFragment), + // move and decode _escaped_fragment_ to the fragment + FEATURE_NAME(EscapedToHashBangFragment), + + // reject absolute paths started by "/../" + FEATURE_NAME(PathDenyRootParent), + + // paths started by "/../" - ignore head + FEATURE_NAME(PathStripRootParent), + + // tries to fix errors (in particular, in fragment) + FEATURE_NAME(TryToFix), + + // check host for DNS compliance + FEATURE_NAME(CheckHost), + + // allow IDN hosts + // host is converted to punycode and stored in FieldHostAscii + // @note host contains characters in the charset of the document + // and percent-encoded characters in UTF-8 (RFC 3986, 3.2.2) + // @note if host contains no extended-ASCII characters and after + // percent-decoding cannot be converted from UTF-8 to UCS-4, + // try to recode from the document charset (if not UTF-8) + FEATURE_NAME(AllowHostIDN), + + // forces AllowHostIDN, but host is replaced with punycode + // forces CheckHost since this replacement is irreversible + FEATURE_NAME(ConvertHostIDN), + + // robot interpreted network paths as BadFormat urls + FEATURE_NAME(DenyNetworkPath), + + // robot interprets URLs without a host as BadFormat + FEATURE_NAME(RemoteOnly), + + /* non-RFC use case: + * 1. do not allow relative-path-only URIs when they can conflict with + * "host/path" (that is, only "./path" or "../path" are allowed); + * 2. if neither scheme nor userinfo are present but port is, it must + * be non-empty, to avoid conflict with "scheme:/..."; + * 3. if AllowRootless is not specified, rootless (or opaque) URIs are + * not recognized; + * 4. if AllowRootless is specified, disallow userinfo, preferring + * "scheme:pa@th" over "user:pass@host", and even "host:port" when + * host contains only scheme-legal characters. 
+ */ + FEATURE_NAME(NoRelPath), + + // standard prefers that all hex escapes were using uppercase A-F + FEATURE_NAME(UpperEncoded), + + // internal usage: decode all encoded symbols + FEATURE_NAME(DecodeANY), + + // add before this line + _FeatureMAX + }; + + protected: + enum EPrivate : ui32 { + FEATURE_FLAG(DecodeANY), + FEATURE_FLAG(DecodeFieldAllowed), + FEATURE_FLAG(DecodeStandardExtra), + }; + + public: + enum EPublic : ui32 { + FeatureMAX = _FeatureMAX, + FEATURE_FLAG(AuthSupported), + FEATURE_FLAG(SchemeKnown), + FEATURE_FLAG(SchemeFlexible), + FEATURE_FLAG(AllowRootless), + FEATURE_FLAG_NAME(AllowOpaque) = FEATURE_FLAG_NAME(AllowRootless), + FEATURE_FLAG(PathOperation), + FEATURE_FLAG(AllowEmptyPath), + FEATURE_FLAG(ToLower), + FEATURE_FLAG(DecodeUnreserved), + FEATURE_FLAG(EncodeSpace), + FEATURE_FLAG(EncodeSpaceAsPlus), + FEATURE_FLAG(EncodeForSQL), + FEATURE_FLAG(EncodeExtendedASCII), + FEATURE_FLAG(DecodeExtendedASCII), + FEATURE_FLAG(EncodeExtendedDelim), + FEATURE_FLAG(DecodeExtendedDelim), + FEATURE_FLAG(EncodeCntrl), + FEATURE_FLAG(EncodePercent), + FEATURE_FLAG(HashBangToEscapedFragment), + FEATURE_FLAG(EscapedToHashBangFragment), + FEATURE_FLAG(PathDenyRootParent), + FEATURE_FLAG(PathStripRootParent), + FEATURE_FLAG(TryToFix), + FEATURE_FLAG(CheckHost), + FEATURE_FLAG(AllowHostIDN), + FEATURE_FLAG(ConvertHostIDN), + FEATURE_FLAG(DenyNetworkPath), + FEATURE_FLAG(RemoteOnly), + FEATURE_FLAG(NoRelPath), + FEATURE_FLAG_NAME(HierURI) = FEATURE_FLAG_NAME(NoRelPath), + FEATURE_FLAG(UpperEncoded), + }; + +#undef FEATURE_NAME +#undef FEATURE_FLAG + + public: + //============================== + enum ESets { + // these are guaranteed and will change buffer size + + FeatureDecodeStandard = 0 | FeatureDecodeUnreserved | FeatureDecodeStandardExtra, + + FeaturesDecodeExtended = 0 | FeatureDecodeExtendedASCII | FeatureDecodeExtendedDelim, + + FeaturesDecode = 0 | FeatureDecodeUnreserved | FeatureDecodeStandard | FeaturesDecodeExtended, + + 
FeaturesEncodeExtended = 0 | FeatureEncodeExtendedASCII | FeatureEncodeExtendedDelim, + + FeaturesEncode = 0 | FeatureEncodeForSQL | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent | FeaturesEncodeExtended, + + // these are not guaranteed to apply to a given field + + FeatureDecodeAllowed = 0 | FeatureDecodeUnreserved | FeatureDecodeFieldAllowed, + + FeaturesMaybeDecode = 0 | FeaturesDecode | FeatureDecodeAllowed, + + FeaturesMaybeEncode = 0 | FeaturesEncode, + + FeaturesEncodeDecode = 0 | FeaturesMaybeEncode | FeaturesMaybeDecode, + + FeaturesAllEncoder = 0 | FeaturesEncodeDecode | FeatureDecodeANY | FeatureToLower | FeatureUpperEncoded | FeatureEncodeSpaceAsPlus, + + //============================== + FeaturesNormalizeSet = 0 | FeaturePathOperation | FeatureToLower | FeatureDecodeAllowed | FeatureEncodeSpaceAsPlus | FeatureEncodeForSQL | FeaturePathStripRootParent | FeatureTryToFix | FeatureUpperEncoded, + + FeaturesDefault = 0 // it reproduces old parsedURL + | FeaturePathOperation | FeaturePathDenyRootParent | FeatureCheckHost, + + // essentially allows all valid RFC urls and keeps them as-is + FeaturesBare = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureAllowEmptyPath, + + FeaturesAll = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureCheckHost | FeaturesNormalizeSet, + + // Deprecated, use FeaturesRecommended + FeaturesRobotOld = 0 + // http://tools.ietf.org/html/rfc3986#section-6.2.2 + | FeatureToLower // 6.2.2.1 + | FeatureUpperEncoded // 6.2.2.1 + | FeatureDecodeUnreserved // 6.2.2.2 + | FeaturePathOperation // 6.2.2.3 + | FeaturePathDenyRootParent | FeatureSchemeKnown | FeatureConvertHostIDN | FeatureRemoteOnly | FeatureHashBangToEscapedFragment | FeatureCheckHost, + + // these are mutually exclusive + FeaturesPath = 0 | FeaturePathDenyRootParent | FeaturePathStripRootParent, + + FeaturesEscapedFragment = 0 | FeatureEscapedToHashBangFragment | FeatureHashBangToEscapedFragment, + + FeaturesCheckSpecialChar = 0 | 
FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent, + + FeaturesEncodePChar = 0 | FeatureUpperEncoded | FeaturesEncodeDecode | FeaturesCheckSpecialChar, + + // http://wiki.yandex-team.ru/robot/newDesign/dups/normolization + FeaturesRecommended = 0 | FeatureSchemeKnown | FeatureRemoteOnly | FeatureToLower | FeatureCheckHost | FeatureConvertHostIDN | FeatureHashBangToEscapedFragment | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodeExtendedASCII | FeatureUpperEncoded | FeatureDecodeUnreserved | FeaturePathOperation | FeaturePathStripRootParent, + + FeaturesRobot = FeaturesRecommended + }; + }; + + static inline int strnicmp(const char* lt, const char* rt, size_t len) { + return lt == rt ? 0 : ::strnicmp(lt, rt, len); + } + + static inline int CompareNoCasePrefix(const TStringBuf& lt, const TStringBuf& rt) { + return strnicmp(lt.data(), rt.data(), rt.length()); + } + + static inline bool EqualNoCase(const TStringBuf& lt, const TStringBuf& rt) { + return lt.length() == rt.length() && 0 == CompareNoCasePrefix(lt, rt); + } + + static inline int CompareNoCase(const TStringBuf& lt, const TStringBuf& rt) { + if (lt.length() == rt.length()) + return CompareNoCasePrefix(lt, rt); + return lt.length() < rt.length() ? 
-1 : 1; + } + + class TSchemeInfo { + public: + const TScheme::EKind Kind; + const ui16 Port; + const TStringBuf Str; + const ui32 FldReq; + TSchemeInfo(TScheme::EKind kind, TStringBuf str, ui32 fldReq = 0, ui16 port = 0) + : Kind(kind) + , Port(port) + , Str(str) + , FldReq(fldReq) + { + } + bool Matches(const TStringBuf& scheme) const { + return EqualNoCase(scheme, Str); + } + + public: + static const TSchemeInfo& Get(const TStringBuf& scheme); + static const TSchemeInfo& Get(TScheme::EKind scheme) { + return Registry[scheme]; + } + static TScheme::EKind GetKind(const TStringBuf& scheme) { + return Get(scheme).Kind; + } + static TStringBuf GetCanon(TScheme::EKind scheme) { + return Get(scheme).Str; + } + static ui16 GetDefaultPort(TScheme::EKind scheme) { + return Get(scheme).Port; + } + + private: + static const TSchemeInfo Registry[]; + }; + + struct TParseFlags { + const ui64 Allow; + const ui64 Extra; + TParseFlags(ui64 allow = 0, ui64 extra = 0) + : Allow(allow) + , Extra(extra) + { + } + ui64 operator&(const TParseFlags& flags) const { + return (Allow & flags.Allow) | (Extra & flags.Extra); + } + ui64 operator&(ui64 flags) const { + return (Allow & flags); + } + TParseFlags operator|(const TParseFlags& flags) const { + return TParseFlags(Allow | flags.Allow, Extra | flags.Extra); + } + TParseFlags Exclude(ui64 flags) const { + return TParseFlags(Allow & ~flags, Extra & ~flags); + } + }; + +#define FEATURE_NAME(f) _BitFeature##f +#define FEATURE_FLAG_NAME(f) Feature##f +#define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1UL << FEATURE_NAME(f) + + struct TQueryArg { + TStringBuf Name; + TStringBuf Value; + + private: + enum EBit { + FEATURE_NAME(Filter), + FEATURE_NAME(SortByName), + FEATURE_NAME(RemoveEmptyQuery), + FEATURE_NAME(RewriteDirty), + _FeatureMAX + }; + + public: + enum EPublic : ui32 { + FeatureMAX = _FeatureMAX, + FEATURE_FLAG(Filter), + FEATURE_FLAG(SortByName), + FEATURE_FLAG(RemoveEmptyQuery), + FEATURE_FLAG(RewriteDirty), + }; + + enum 
EProcessed { + // OK and clean. + ProcessedOK = 0, + + // OK, but query stored in internal buffer and TUri::Rewrite() is required. + ProcessedDirty = 1, + + ProcessedMalformed = 2, + ProcessedTooMany = 3, + }; + }; + + typedef bool (*TQueryArgFilter)(const TQueryArg& arg, void* filterData); + +#undef FEATURE_NAME +#undef FEATURE_FLAG_NAME +#undef FEATURE_FLAG + + const char* FieldToString(const TField::EField& t); + const char* ParsedStateToString(const TState::EParsed& t); + const char* SchemeKindToString(const TScheme::EKind& t); + +} + +Y_DECLARE_OUT_SPEC(inline, NUri::TField::EField, out, t) { + out << NUri::FieldToString(t); +} + +Y_DECLARE_OUT_SPEC(inline, NUri::TScheme::EKind, out, t) { + out << NUri::SchemeKindToString(t); +} + +Y_DECLARE_OUT_SPEC(inline, NUri::TState::EParsed, out, t) { + out << NUri::ParsedStateToString(t); +} + +static inline ui16 DefaultPort(NUri::TScheme::EKind scheme) { + return NUri::TSchemeInfo::GetDefaultPort(scheme); +} + +static inline NUri::TScheme::EKind SchemeKind(const TStringBuf& scheme) { + return NUri::TSchemeInfo::GetKind(scheme); +} diff --git a/library/cpp/uri/encode.cpp b/library/cpp/uri/encode.cpp new file mode 100644 index 00000000000..9eab1535bc6 --- /dev/null +++ b/library/cpp/uri/encode.cpp @@ -0,0 +1,221 @@ +#include "encode.h" + +#include <util/string/cast.h> +#include <util/generic/singleton.h> + +namespace NUri { + namespace NEncode { +// http://tools.ietf.org/html/rfc3986#section-2.2 +#define GENDELIMS0 ":/?#[]@" +#define SUBDELIMS0 "!$&'()*+,;=" +// http://tools.ietf.org/html/rfc3986#section-2.3 +#define UNRESERVED "-._~" + +// now find subsets which can sometimes be decoded + +// remove '#' which can't ever be decoded +// don't mark anything allowed for pass (pass is completely encoded) +// safe in path, qry, frag +#define GENDELIMS1 ":@" +// allowed in qry, frag +#define GENDELIMS2 "/?" 
+ +// qry-unsafe chars +#define SUBDELIMS1 "&+=;" +// rest allowed in qry, frag +#define SUBDELIMS2 "!$'()*," + + const TEncoder::TGrammar& TEncoder::Grammar() { + return *Singleton<TEncoder::TGrammar>(); + } + + // initialize the grammar map + TEncoder::TGrammar::TGrammar() { + // first set up unreserved characters safe in any field + const ui64 featUnres = TFeature::FeatureDecodeUnreserved; + AddRng('0', '9', ECFDigit, featUnres); + AddRng('A', 'Z', ECFUpper, featUnres | TFeature::FeatureToLower); + AddRng('a', 'z', ECFLower, featUnres); + Add(UNRESERVED, ECFUnres, featUnres); + + // XXX: standard "safe" set used previously "-_.!~*();/:@$,", with comment: + // alnum + reserved + mark + ( '[', ']') - ('=' '+' '&' '\'' '"' '\\' '?') + Add("!*();/:@$,", ECFStdrd, TFeature::FeatureDecodeStandardExtra); + + // now field-specific subsets of reserved characters (gen-delims + sub-delims) + const ui64 featSafe = TFeature::FeatureDecodeFieldAllowed; + + Add(GENDELIMS1, 0, featSafe, TField::FlagPath | TField::FlagQuery | TField::FlagFrag); + Add(GENDELIMS2, 0, featSafe, TField::FlagQuery | TField::FlagFrag); + + Add(SUBDELIMS1, 0, featSafe, TField::FlagUser); + Add(SUBDELIMS2, 0, featSafe, TField::FlagUser | TField::FlagQuery | TField::FlagFrag); + + // control chars + AddRng(0x00, 0x20, TFeature::FeatureEncodeCntrl); + Add(0x7f, TFeature::FeatureEncodeCntrl); + + // '%' starts a percent-encoded sequence + Add('%', TFeature::FeatureDecodeANY | TFeature::FeatureEncodePercent); + + // extended ASCII + AddRng(128, 255, TFeature::FeatureEncodeExtendedASCII | TFeature::FeatureDecodeExtendedASCII); + + // extended delims + Add("\"<>[\\]^`{|}", TFeature::FeatureEncodeExtendedDelim | TFeature::FeatureDecodeExtendedDelim); + + // add characters with other features + Add(' ', TFeature::FeatureEncodeSpace | TFeature::FeatureEncodeSpaceAsPlus); + Add("'\"\\", TFeature::FeatureEncodeForSQL); + + GetMutable(':').EncodeFld |= TField::FlagUser; + GetMutable('?').EncodeFld |= 
TField::FlagPath; + GetMutable('#').EncodeFld |= TField::FlagPath | TField::FlagQuery; + GetMutable('&').EncodeFld |= TField::FlagQuery; + GetMutable('+').EncodeFld |= TField::FlagQuery; + } + + // should we decode an encoded character + bool TCharFlags::IsDecode(ui32 fldmask, ui64 flags) const { + const ui64 myflags = flags & FeatFlags; + if (myflags & TFeature::FeaturesEncode) + return false; + if (myflags & TFeature::FeaturesDecode) + return true; + return (fldmask & DecodeFld) && (flags & TFeature::FeatureDecodeFieldAllowed); + } + + const int dD = 'a' - 'A'; + + int TEncodeMapper::EncodeSym(unsigned char& ch) const { + const TCharFlags& chflags = TEncoder::GetFlags(ch); + const ui64 flags = Flags & chflags.FeatFlags; + + if (flags & TFeature::FeatureToLower) + ch += dD; + + if (Q_DecodeAny) + return -1; + + if (flags & TFeature::FeaturesEncode) + return 1; + + if (' ' == ch) { + if (Q_EncodeSpcAsPlus) + ch = '+'; + return 0; + } + + return 0; + } + + int TEncodeMapper::EncodeHex(unsigned char& ch) const { + const TCharFlags& chflags = TEncoder::GetFlags(ch); + const ui64 flags = Flags & chflags.FeatFlags; + + if (flags & TFeature::FeatureToLower) + ch += dD; + + if (Q_DecodeAny) + return -1; + + if (chflags.IsDecode(FldMask, Flags)) + return 0; + + if (' ' == ch) { + if (!Q_EncodeSpcAsPlus) + return 1; + ch = '+'; + return 0; + } + + return 1; + } + + bool TEncodeToMapper::Encode(unsigned char ch) const { + if (Q_DecodeAny) + return false; + + const TCharFlags& chflags = TEncoder::GetFlags(ch); + if (FldMask & chflags.EncodeFld) + return true; + + const ui64 flags = Flags & chflags.FeatFlags; + return (flags & TFeature::FeaturesEncode); + } + + TEncoder::TEncoder(IOutputStream& out, const TEncodeMapper& fldsrc, const TEncodeToMapper& flddst) + : Out(out) + , FldSrc(fldsrc) + , FldDst(flddst) + , OutFlags(0) + , HexValue(0) + { + } + + IOutputStream& TEncoder::Hex(IOutputStream& out, unsigned char val) { + static const char sHexCodes[] = "0123456789ABCDEF"; + 
return out << sHexCodes[(val >> 4) & 0xF] << sHexCodes[val & 0xF]; + } + + IOutputStream& TEncoder::EncodeAll(IOutputStream& out, const TStringBuf& val) { + for (size_t i = 0; i != val.length(); ++i) + Encode(out, val[i]); + return out; + } + + IOutputStream& TEncoder::EncodeNotAlnum(IOutputStream& out, const TStringBuf& val) { + for (size_t i = 0; i != val.length(); ++i) { + const char c = val[i]; + if (IsAlnum(c)) + out << c; + else + Encode(out, c); + } + return out; + } + + IOutputStream& TEncoder::EncodeField( + IOutputStream& out, const TStringBuf& val, TField::EField fld) { + const ui32 fldmask = ui32(1) << fld; + for (size_t i = 0; i != val.length(); ++i) { + const char ch = val[i]; + if (GetFlags(ch).IsAllowed(fldmask)) + out << ch; + else + Encode(out, ch); + } + return out; + } + + IOutputStream& TEncoder::EncodeField( + IOutputStream& out, const TStringBuf& val, TField::EField fld, ui64 flags) { + const ui32 fldmask = ui32(1) << fld; + for (size_t i = 0; i != val.length(); ++i) { + const char ch = val[i]; + if (GetFlags(ch).IsDecode(fldmask, flags)) + out << ch; + else + Encode(out, ch); + } + return out; + } + + void TEncoder::Do(unsigned char ch, int res) { + OutFlags |= GetFlags(ch).FeatFlags; + + bool escapepct = false; + if (0 < res) // definitely encode + escapepct = FldDst.Enabled(); + else if (0 != res || !FldDst.Enabled() || !FldDst.Encode(ch)) { + Out << ch; + return; + } + + Out << '%'; + if (escapepct) + Out.Write("25", 2); // '%' + Hex(Out, ch); + } + } +} diff --git a/library/cpp/uri/encode.h b/library/cpp/uri/encode.h new file mode 100644 index 00000000000..a9ece154270 --- /dev/null +++ b/library/cpp/uri/encode.h @@ -0,0 +1,282 @@ +#pragma once + +#include "common.h" + +#include <util/stream/output.h> + +namespace NUri { + namespace NEncode { +#define CHAR_TYPE_NAME(f) _ECT##f +#define CHAR_TYPE_FLAG(f) ECF##f = 1u << CHAR_TYPE_NAME(f) + + enum ECharType { + CHAR_TYPE_NAME(Digit), + CHAR_TYPE_NAME(Lower), + CHAR_TYPE_NAME(Upper), + 
CHAR_TYPE_NAME(Unres), + CHAR_TYPE_NAME(Stdrd), + }; + + enum ECharFlag { + CHAR_TYPE_FLAG(Digit), + CHAR_TYPE_FLAG(Lower), + CHAR_TYPE_FLAG(Upper), + CHAR_TYPE_FLAG(Unres), + CHAR_TYPE_FLAG(Stdrd), + // compound group flags + ECGAlpha = ECFUpper | ECFLower, + ECGAlnum = ECGAlpha | ECFDigit, + ECGUnres = ECGAlnum | ECFUnres, + ECGStdrd = ECGUnres | ECFStdrd, + }; + +#undef CHAR_TYPE_NAME +#undef CHAR_TYPE_FLAG + + struct TCharFlags { + ui32 TypeFlags; + ui64 FeatFlags; + ui32 DecodeFld; // decode if FeatureDecodeFieldAllowed + ui32 EncodeFld; // encode if shouldn't be treated as delimiter + TCharFlags(ui64 feat = 0) + : TypeFlags(0) + , FeatFlags(feat) + , DecodeFld(0) + , EncodeFld(0) + { + } + TCharFlags(ui32 type, ui64 feat, ui32 decmask = 0, ui32 encmask = 0) + : TypeFlags(type) + , FeatFlags(feat) + , DecodeFld(decmask) + , EncodeFld(encmask) + { + } + TCharFlags& Add(const TCharFlags& val) { + TypeFlags |= val.TypeFlags; + FeatFlags |= val.FeatFlags; + DecodeFld |= val.DecodeFld; + EncodeFld |= val.EncodeFld; + return *this; + } + bool IsAllowed(ui32 fldmask) const { + return (TypeFlags & ECGUnres) || (DecodeFld & ~EncodeFld & fldmask); + } + // should we decode an encoded character + bool IsDecode(ui32 fldmask, ui64 flags) const; + }; + + class TEncodeMapperBase { + protected: + TEncodeMapperBase() + : Flags(0) + , FldMask(0) + , Q_DecodeAny(false) + { + } + TEncodeMapperBase(ui64 flags, TField::EField fld) + : Flags(flags) + , FldMask(1u << fld) + , Q_DecodeAny(flags & TFeature::FeatureDecodeANY) + { + } + + protected: + const ui64 Flags; + const ui32 FldMask; + const bool Q_DecodeAny; // this is a special option for username/password + }; + + // maps a sym or hex character and indicates whether it has to be encoded + class TEncodeMapper + : public TEncodeMapperBase { + public: + TEncodeMapper(ui64 flags, TField::EField fld = TField::FieldAllMAX) + : TEncodeMapperBase(flags, fld) + , Q_EncodeSpcAsPlus(flags & TFeature::FeatureEncodeSpaceAsPlus) + { + } + // 
negative=sym, positive=hex, zero=maybesym + int EncodeSym(unsigned char&) const; + int EncodeHex(unsigned char&) const; + + protected: + const bool Q_EncodeSpcAsPlus; + }; + + // indicates whether a character has to be encoded when copying to a field + class TEncodeToMapper + : public TEncodeMapperBase { + public: + TEncodeToMapper() + : TEncodeMapperBase() + { + } + TEncodeToMapper(ui64 flags, TField::EField fld = TField::FieldAllMAX) + : TEncodeMapperBase(flags, fld) + { + } + bool Enabled() const { + return 0 != FldMask; + } + bool Encode(unsigned char) const; + }; + + class TEncoder { + public: + TEncoder(IOutputStream& out, const TEncodeMapper& fldsrc, const TEncodeToMapper& flddst = TEncodeToMapper()); + + ui64 ReEncode(const TStringBuf& url); + ui64 ReEncode(const char* str, size_t len) { + return ReEncode(TStringBuf(str, len)); + } + + protected: + static bool IsType(unsigned char c, ui64 flags) { + return GetFlags(c).TypeFlags & flags; + } + + public: + static bool IsDigit(unsigned char c) { + return IsType(c, ECFDigit); + } + static bool IsUpper(unsigned char c) { + return IsType(c, ECFUpper); + } + static bool IsLower(unsigned char c) { + return IsType(c, ECFLower); + } + static bool IsAlpha(unsigned char c) { + return IsType(c, ECGAlpha); + } + static bool IsAlnum(unsigned char c) { + return IsType(c, ECGAlnum); + } + static bool IsUnres(unsigned char c) { + return IsType(c, ECGUnres); + } + static const TCharFlags& GetFlags(unsigned char c) { + return Grammar().Get(c); + } + + public: + // process an encoded string, decoding safe chars and encoding unsafe + static IOutputStream& ReEncode(IOutputStream& out, const TStringBuf& val, const TEncodeMapper& srcfld) { + TEncoder(out, srcfld).ReEncode(val); + return out; + } + static IOutputStream& ReEncodeTo(IOutputStream& out, const TStringBuf& val, const TEncodeMapper& srcfld, const TEncodeToMapper& dstfld) { + TEncoder(out, srcfld, dstfld).ReEncode(val); + return out; + } + + // see also UrlUnescape() from 
string/quote.h + static IOutputStream& Decode( + IOutputStream& out, const TStringBuf& val, ui64 flags) { + return ReEncode(out, val, flags | TFeature::FeatureDecodeANY); + } + + public: + // process a raw string or char, encode as needed + static IOutputStream& Hex(IOutputStream& out, unsigned char val); + static IOutputStream& Encode(IOutputStream& out, unsigned char val) { + out << '%'; + return Hex(out, val); + } + static IOutputStream& EncodeAll(IOutputStream& out, const TStringBuf& val); + static IOutputStream& EncodeNotAlnum(IOutputStream& out, const TStringBuf& val); + + static IOutputStream& EncodeField(IOutputStream& out, const TStringBuf& val, TField::EField fld); + static IOutputStream& EncodeField(IOutputStream& out, const TStringBuf& val, TField::EField fld, ui64 flags); + + static IOutputStream& Encode(IOutputStream& out, const TStringBuf& val) { + return EncodeField(out, val, TField::FieldAllMAX); + } + + static IOutputStream& Encode(IOutputStream& out, const TStringBuf& val, ui64 flags) { + return EncodeField(out, val, TField::FieldAllMAX, flags); + } + + public: + class TGrammar { + TCharFlags Map_[256]; + + public: + TGrammar(); + const TCharFlags& Get(unsigned char ch) const { + return Map_[ch]; + } + + TCharFlags& GetMutable(unsigned char ch) { + return Map_[ch]; + } + TCharFlags& Add(unsigned char ch, const TCharFlags& val) { + return GetMutable(ch).Add(val); + } + + void AddRng(unsigned char lo, unsigned char hi, const TCharFlags& val) { + for (unsigned i = lo; i <= hi; ++i) + Add(i, val); + } + void AddRng(unsigned char lo, unsigned char hi, ui32 type, ui64 feat, ui32 decmask = 0, ui32 encmask = 0) { + AddRng(lo, hi, TCharFlags(type, feat, decmask, encmask)); + } + + void Add(const TStringBuf& set, const TCharFlags& val) { + for (size_t i = 0; i != set.length(); ++i) + Add(set[i], val); + } + void Add(const TStringBuf& set, ui32 type, ui64 feat, ui32 decmask = 0, ui32 encmask = 0) { + Add(set, TCharFlags(type, feat, decmask, encmask)); + } + 
}; + + static const TGrammar& Grammar(); + + protected: + IOutputStream& Out; + const TEncodeMapper FldSrc; + const TEncodeToMapper FldDst; + ui64 OutFlags; + int HexValue; + + protected: + void HexReset() { + HexValue = 0; + } + + void HexDigit(char c) { + HexAdd(c - '0'); + } + void HexUpper(char c) { + HexAdd(c - 'A' + 10); + } + void HexLower(char c) { + HexAdd(c - 'a' + 10); + } + + void HexAdd(int val) { + HexValue <<= 4; + HexValue += val; + } + + protected: + void DoSym(unsigned char ch) { + const int res = FldSrc.EncodeSym(ch); + Do(ch, res); + } + void DoHex(unsigned char ch) { + const int res = FldSrc.EncodeHex(ch); + Do(ch, res); + } + void DoHex() { + DoHex(HexValue); + HexValue = 0; + } + void Do(unsigned char, int); + }; + } + + using TEncoder = NEncode::TEncoder; + +} diff --git a/library/cpp/uri/encodefsm.rl6 b/library/cpp/uri/encodefsm.rl6 new file mode 100644 index 00000000000..6a323aa85a3 --- /dev/null +++ b/library/cpp/uri/encodefsm.rl6 @@ -0,0 +1,51 @@ +#include <library/cpp/uri/encode.h> + +#ifdef __clang__ + #pragma clang diagnostic ignored "-Wunused-variable" +#endif + +namespace NUri { +namespace NEncode { + +%%{ + machine TEncoder; + + hex = ( + digit >{ HexDigit(fc); } | + [A-F] >{ HexUpper(fc); } | + [a-f] >{ HexLower(fc); } + ); + + escaped = ( "%" hex hex ) + > { HexReset(); } + % { DoHex(); }; + + bad_escaped = ( "%" hex ) + % { + DoSym(*(fpc - 2)); + DoSym(*(fpc - 1)); + }; + + sym = (any - bad_escaped - escaped) %{ DoSym(*(fpc - 1)); }; + + main := ( escaped | bad_escaped | sym )**; + + write data; +}%% + +ui64 TEncoder::ReEncode(const TStringBuf &url) +{ + const char *p = url.data(); + const char *pe = p + url.length(); + const char *eof = pe; + int cs; + OutFlags = 0; + + %% write init; + %% write exec; + + return OutFlags; +} + +} +} diff --git a/library/cpp/uri/http_url.h b/library/cpp/uri/http_url.h new file mode 100644 index 00000000000..7c8e8d844d1 --- /dev/null +++ b/library/cpp/uri/http_url.h @@ -0,0 +1,77 @@ +#pragma once 
+ +#include "uri.h" +#include "other.h" + +// XXX: use NUri::TUri directly; this whole file is for backwards compatibility + +class THttpURL + : public NUri::TUri { +public: + typedef TField::EFlags TFlags; + typedef TField::EField TField; + typedef TScheme::EKind TSchemeKind; + typedef TState::EParsed TParsedState; + +public: + enum { + FeatureUnescapeStandard = TFeature::FeatureDecodeStandard, + FeatureEscSpace = TFeature::FeatureEncodeSpaceAsPlus, + FeatureEscapeUnescaped = TFeature::FeatureEncodeExtendedASCII, + FeatureNormalPath = TFeature::FeaturePathStripRootParent, + }; + +public: + THttpURL(unsigned defaultPort = 80) + : TUri(defaultPort) + { + } + + THttpURL(const TStringBuf& host, ui16 port, const TStringBuf& path, const TStringBuf& query = TStringBuf(), const TStringBuf& scheme = "http", unsigned defaultPort = 0) + : TUri(host, port, path, query, scheme, defaultPort) + { + } + + THttpURL(const TUri& url) + : TUri(url) + { + } + +public: // XXX: don't use any of these legacy methods below +public: // use TUri::GetField() instead + /// will return null-terminated if fld is not dirty + const char* Get(EField fld) const { + return GetField(fld).data(); + } + +public: // use TUriUpdate class so that Rewrite() is only called once + void Set(EField field, const TStringBuf& value) { + if (SetInMemory(field, value)) + Rewrite(); + } + + template <size_t size> + void Set(EField field, const char (&value)[size]) { + if (SetInMemory(field, value)) + Rewrite(); + } + +public: // use TUri::FldXXX methods for better control + // Partial quick set of the field, can be called for + // multiple fields + bool SetInMemory(EField field, const TStringBuf& value) { + return FldMemSet(field, value); + } + + // clears a field + void Reset(EField field) { + FldClr(field); + } +}; + +static inline const char* HttpURLParsedStateToString(const NUri::TState::EParsed& t) { + return NUri::ParsedStateToString(t); +} +static inline const char* HttpUrlSchemeKindToString(const 
NUri::TScheme::EKind& t) { + return NUri::SchemeKindToString(t); +} diff --git a/library/cpp/uri/location.cpp b/library/cpp/uri/location.cpp new file mode 100644 index 00000000000..a6a4d11ffa6 --- /dev/null +++ b/library/cpp/uri/location.cpp @@ -0,0 +1,31 @@ +#include "location.h" +#include "uri.h" + +namespace NUri { + static const int URI_PARSE_FLAGS = + (TFeature::FeaturesRecommended | TFeature::FeatureConvertHostIDN | TFeature::FeatureEncodeExtendedDelim | TFeature::FeatureEncodePercent) & ~TFeature::FeatureHashBangToEscapedFragment; + + TString ResolveRedirectLocation(const TStringBuf& baseUrl, + const TStringBuf& location) { + TUri baseUri; + TUri locationUri; + + // Parse base URL. + if (baseUri.Parse(baseUrl, URI_PARSE_FLAGS) != NUri::TState::ParsedOK) { + return ""; + } + // Parse location with respect to the base URL. + if (locationUri.Parse(location, baseUri, URI_PARSE_FLAGS) != NUri::TState::ParsedOK) { + return ""; + } + // Inherit fragment. + if (!locationUri.GetField(NUri::TField::FieldFragment)) { + NUri::TUriUpdate update(locationUri); + update.Set(NUri::TField::FieldFragment, baseUri.GetField(NUri::TField::FieldFragment)); + } + TString res; + locationUri.Print(res, NUri::TField::FlagAllFields); + return res; + } + +} diff --git a/library/cpp/uri/location.h b/library/cpp/uri/location.h new file mode 100644 index 00000000000..0f533fe0b5c --- /dev/null +++ b/library/cpp/uri/location.h @@ -0,0 +1,13 @@ +#pragma once + +#include <util/generic/string.h> + +namespace NUri { + /** + * Resolve Location header according to https://tools.ietf.org/html/rfc7231#section-7.1.2 + * + * @return Resolved location's url or empty string in case of any error. 
+ */ + TString ResolveRedirectLocation(const TStringBuf& baseUrl, const TStringBuf& location); + +} diff --git a/library/cpp/uri/location_ut.cpp b/library/cpp/uri/location_ut.cpp new file mode 100644 index 00000000000..26a0f644711 --- /dev/null +++ b/library/cpp/uri/location_ut.cpp @@ -0,0 +1,40 @@ +#include "location.h" + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(TResolveRedirectTests) { + Y_UNIT_TEST(Absolute) { + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com", "http://redir-example.com/sub"), "http://redir-example.com/sub"); + } + Y_UNIT_TEST(AbsWithFragment) { + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com", "http://redir-example.com/sub#Hello"), "http://redir-example.com/sub#Hello"); + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com/#Hello", "http://redir-example.com/sub"), "http://redir-example.com/sub#Hello"); + } + Y_UNIT_TEST(Rel) { + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com", "/sub"), "http://example.com/sub"); + } + Y_UNIT_TEST(RelWithFragment) { + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com", "/sub#Hello"), "http://example.com/sub#Hello"); + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com/#Hello", "/sub"), "http://example.com/sub#Hello"); + } + Y_UNIT_TEST(WrongLocation) { + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com", ""), ""); + } + Y_UNIT_TEST(WrongBase) { + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("", "http://example.com"), ""); + } + Y_UNIT_TEST(HashBangIsNothingSpecial) { + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com", "http://redir-example.com/sub#!Hello"), "http://redir-example.com/sub#!Hello"); + UNIT_ASSERT_EQUAL( + NUri::ResolveRedirectLocation("http://example.com/#!Hello", "http://redir-example.com/sub"), "http://redir-example.com/sub#!Hello"); + } +} diff --git a/library/cpp/uri/other.cpp 
b/library/cpp/uri/other.cpp new file mode 100644 index 00000000000..b23a5b68a9c --- /dev/null +++ b/library/cpp/uri/other.cpp @@ -0,0 +1,82 @@ +#include "other.h" + +#include <util/string/util.h> +#include <util/system/yassert.h> + +/********************************************************/ +/********************************************************/ + +static const Tr InvertTr(".:/?#", "\005\004\003\002\001"); +static const Tr RevertTr("\005\004\003\002\001", ".:/?#"); + +void TrspChars(char* s) { + InvertTr.Do(s); +} + +void UnTrspChars(char* s) { + RevertTr.Do(s); +} + +void TrspChars(char* s, size_t l) { + InvertTr.Do(s, l); +} + +void UnTrspChars(char* s, size_t l) { + RevertTr.Do(s, l); +} + +void TrspChars(const char* s, char* d) { + InvertTr.Do(s, d); +} + +void UnTrspChars(const char* s, char* d) { + RevertTr.Do(s, d); +} + +void InvertDomain(char* begin, char* end) { + // skip schema if it is present + const auto dotPos = TStringBuf{begin, end}.find('.'); + if (dotPos == TStringBuf::npos) + return; // no need to invert anything + const auto schemaendPos = TStringBuf{begin, end}.find("://", 3); + if (schemaendPos < dotPos) + begin += schemaendPos + 3; + char* sl = (char*)memchr(begin, '/', end - begin); + char* cl = (char*)memchr(begin, ':', sl ? sl - begin : end - begin); + end = cl ? cl : (sl ? 
sl : end); + + // invert string + for (size_t i = 0, n = end - begin; i < n / 2; ++i) + DoSwap(begin[i], begin[n - i - 1]); + + // invert back each host name segment + char* b = begin; + while (true) { + char* e = (char*)memchr(b, '.', end - b); + if (!e) + e = end; + for (size_t i = 0, n = e - b; i < n / 2; ++i) + DoSwap(b[i], b[n - i - 1]); + if (e == end) + break; + b = e + 1; + } +} + +void InvertUrl(char* begin, char* end) { + char* slash = strchr(begin, '/'); + if (slash) { + *slash = 0; + } + strlwr(begin); + if (slash) { + *slash = '/'; + } + InvertDomain(begin, end); + TrspChars(begin); +} + +void RevertUrl(char* begin, char* end) { + UnTrspChars(begin); + InvertDomain(begin, end); +} diff --git a/library/cpp/uri/other.h b/library/cpp/uri/other.h new file mode 100644 index 00000000000..7aec22e77b3 --- /dev/null +++ b/library/cpp/uri/other.h @@ -0,0 +1,42 @@ +#pragma once + +#include <util/generic/string.h> + +// Some functions for inverted url representation +// No scheme cut-off, no 80th port normalization + +void TrspChars(char* s); +void UnTrspChars(char* s); +void TrspChars(char* s, size_t l); +void UnTrspChars(char* s, size_t l); +void TrspChars(const char* s, char* d); +void UnTrspChars(const char* s, char* d); + +void InvertDomain(char* begin, char* end); + +inline TString& InvertDomain(TString& url) { + InvertDomain(url.begin(), url.begin() + url.size()); + return url; +} + +void InvertUrl(char* begin, char* end); + +inline void InvertUrl(char* url) { + InvertUrl(url, url + strlen(url)); +} + +inline TString& InvertUrl(TString& url) { + InvertUrl(url.begin(), url.begin() + url.size()); + return url; +} + +void RevertUrl(char* begin, char* end); + +inline void RevertUrl(char* url) { + RevertUrl(url, url + strlen(url)); +} + +inline TString& RevertUrl(TString& url) { + RevertUrl(url.begin(), url.begin() + url.size()); + return url; +} diff --git a/library/cpp/uri/parse.cpp b/library/cpp/uri/parse.cpp new file mode 100644 index 
00000000000..1db4e008c49 --- /dev/null +++ b/library/cpp/uri/parse.cpp @@ -0,0 +1,207 @@ +#include "parse.h" +#include "common.h" +#include "encode.h" + +namespace NUri { + const TParseFlags TParser::FieldFlags[] = + { + TParseFlags(0 // FieldScheme + | TFeature::FeatureToLower, + 0) + + , + TParseFlags(0 // FieldUsername + | TFeature::FeatureDecodeANY | TFeature::FeaturesDecode | TFeature::FeatureEncodePercent, + 0 | TFeature::FeatureToLower) + + , + TParseFlags(0 // FieldPassword + | TFeature::FeatureDecodeANY | TFeature::FeaturesDecode | TFeature::FeatureEncodePercent, + 0 | TFeature::FeatureToLower) + + , + TParseFlags(0 // FieldHost + | TFeature::FeatureToLower | TFeature::FeatureUpperEncoded | (TFeature::FeaturesMaybeEncode & ~TFeature::FeatureEncodeExtendedDelim), + 0 | TFeature::FeaturesMaybeDecode) + + , + TParseFlags(0 // FieldPort + , + 0) + + , + TParseFlags(0 // FieldPath + | TFeature::FeaturesEncodePChar | TFeature::FeaturePathOperation, + 0 | TFeature::FeatureToLower | TFeature::FeatureEncodeSpaceAsPlus) + + , + TParseFlags(0 // FieldQuery + | TFeature::FeaturesEncodePChar | TFeature::FeatureEncodeSpaceAsPlus, + 0 | TFeature::FeatureToLower) + + , + TParseFlags(0 // FieldFragment + | TFeature::FeaturesEncodePChar, + 0 | TFeature::FeatureToLower | TFeature::FeatureEncodeSpaceAsPlus)}; + + namespace NParse { + void TRange::AddRange(const TRange& range, ui64 mask) { + FlagsAllPlaintext |= range.FlagsAllPlaintext; + // update only if flags apply here + mask &= range.FlagsEncodeMasked; + if (0 == mask) + return; + FlagsEncodeMasked |= mask; + if (mask & TFeature::FeaturesMaybeEncode) + Encode += range.Encode; + if (mask & TFeature::FeaturesDecode) + Decode += range.Decode; + } + + } + + void TParser::copyRequirementsImpl(const char* ptr) { + Y_ASSERT(0 != CurRange.FlagsAllPlaintext); + Y_UNUSED(ptr); +#ifdef DO_PRN + PrintHead(ptr, __FUNCTION__) + << " all=[" << IntToString<16>(CurRange.FlagsAllPlaintext) + << "] enc=[" << 
IntToString<16>(CurRange.FlagsEncodeMasked) + << " & " << IntToString<16>(Flags.Allow | Flags.Extra) << "]"; + PrintTail(CurRange.Beg, ptr); +#endif + for (int i = 0; i < TField::FieldUrlMAX; ++i) { + const TField::EField fld = TField::EField(i); + TSection& section = Sections[fld]; + // update only sections in progress + if (nullptr == section.Beg) + continue; + // and overlapping with the range + if (nullptr != section.End && section.End < CurRange.Beg) + continue; +#ifdef DO_PRN + PrintHead(ptr, __FUNCTION__, fld) + << " all=[" << IntToString<16>(CurRange.FlagsAllPlaintext) + << "] enc=[" << IntToString<16>(CurRange.FlagsEncodeMasked) + << " & " << IntToString<16>(GetFieldFlags(fld)) << "]"; + PrintTail(section.Beg, ptr); +#endif + section.AddRange(CurRange, GetFieldFlags(fld)); + } + CurRange.Reset(); + } + + void TParser::PctEndImpl(const char* ptr) { +#ifdef DO_PRN + PrintHead(PctBegin, __FUNCTION__); + PrintTail(PctBegin, ptr); +#else + Y_UNUSED(ptr); +#endif + setRequirement(PctBegin, TEncoder::GetFlags('%').FeatFlags); + PctBegin = nullptr; + } + + void TParser::HexSet(const char* ptr) { + Y_ASSERT(nullptr != PctBegin); +#ifdef DO_PRN + PrintHead(ptr, __FUNCTION__); + PrintTail(PctBegin, ptr + 1); +#endif + PctBegin = nullptr; + const unsigned char ch = HexValue; + ui64 flags = TEncoder::GetFlags('%').FeatFlags | TEncoder::GetFlags(ch).FeatFlags; + + setRequirementExcept(ptr, flags, TFeature::FeaturesMaybeEncode); + } + + TState::EParsed TParser::ParseImpl() { +#ifdef DO_PRN + PrintHead(UriStr.data(), "[Parsing]") << "URL"; + PrintTail(UriStr); +#endif + + const bool ok = doParse(UriStr.data(), UriStr.length()); + +#ifdef DO_PRN + Cdbg << (ok ? 
"[Parsed]" : "[Failed]"); + for (int idx = 0; idx < TField::FieldUrlMAX; ++idx) { + const TSection& section = Sections[idx]; + if (section.IsSet()) + Cdbg << ' ' << TField::EField(idx) << "=[" << section.Get() << ']'; + } + Cdbg << Endl; +#endif + + if (!ok) { + if (!(Flags & TFeature::FeatureTryToFix) || !Sections[TField::FieldFrag].Beg) + return TState::ParsedBadFormat; + //Here: error was in fragment, just ignore it + ResetSection(TField::FieldFrag); + } + + if ((Flags & TFeature::FeatureDenyNetworkPath) && IsNetPath()) + return TState::ParsedBadFormat; + + const TSection& scheme = Sections[TField::FieldScheme]; + Scheme = scheme.IsSet() ? TSchemeInfo::GetKind(scheme.Get()) : TScheme::SchemeEmpty; + const TSchemeInfo& schemeInfo = TSchemeInfo::Get(Scheme); + + if (IsRootless()) { + // opaque case happens + if (schemeInfo.FldReq & TField::FlagHost) + return TState::ParsedBadFormat; + + if (TScheme::SchemeEmpty == Scheme) + return TState::ParsedBadScheme; + + if (Flags & TFeature::FeatureAllowRootless) + return TState::ParsedOK; + + if (!(Flags & TFeature::FeatureSchemeFlexible)) + return TState::ParsedBadScheme; + + return TState::ParsedRootless; + } + + checkSectionCollision(TField::FieldUser, TField::FieldHost); + checkSectionCollision(TField::FieldPass, TField::FieldPort); + + if (0 == (Flags & TFeature::FeatureAuthSupported)) + if (Sections[TField::FieldUser].IsSet() || Sections[TField::FieldPass].IsSet()) + return TState::ParsedBadAuth; + + TSection& host = Sections[TField::FieldHost]; + if (host.IsSet()) + for (; host.End != host.Beg && '.' 
== host.End[-1];) + --host.End; + + if (scheme.IsSet()) { + ui64 wantCareFlags = 0; + switch (Scheme) { + case TScheme::SchemeHTTP: + break; + case TScheme::SchemeEmpty: + Scheme = TScheme::SchemeUnknown; + [[fallthrough]]; + case TScheme::SchemeUnknown: + wantCareFlags = + TFeature::FeatureSchemeFlexible | TFeature::FeatureNoRelPath; + break; + default: + wantCareFlags = + TFeature::FeatureSchemeFlexible | TFeature::FeatureSchemeKnown; + break; + } + + if (0 != wantCareFlags && 0 == (Flags & wantCareFlags)) + return TState::ParsedBadScheme; + if ((schemeInfo.FldReq & TField::FlagHost) || (Flags & TFeature::FeatureRemoteOnly)) + if (!host.IsSet() || 0 == host.Len()) + return TState::ParsedBadFormat; + } + + return TState::ParsedOK; + } + +} diff --git a/library/cpp/uri/parse.h b/library/cpp/uri/parse.h new file mode 100644 index 00000000000..ca2358e5728 --- /dev/null +++ b/library/cpp/uri/parse.h @@ -0,0 +1,361 @@ +#pragma once + +// #define DO_PRN + +#include <cstddef> + +#include "common.h" + +#include <library/cpp/charset/doccodes.h> +#include <util/generic/strbuf.h> +#include <util/stream/output.h> +#include <util/string/cast.h> +#include <util/system/yassert.h> + +namespace NUri { + class TParser; + + namespace NParse { + class TRange { + public: + const char* Beg; + ui64 FlagsEncodeMasked; + ui64 FlagsAllPlaintext; + ui32 Encode; + ui32 Decode; + + public: + TRange(const char* beg = nullptr) + : Beg(beg) + , FlagsEncodeMasked(0) + , FlagsAllPlaintext(0) + , Encode(0) + , Decode(0) + { + } + + void Reset(const char* beg = nullptr) { + *this = TRange(beg); + } + + void AddRange(const TRange& range, ui64 mask); + + void AddFlag(const char* ptr, ui64 mask, ui64 flag) { + if (0 != flag) + AddFlagImpl(ptr, mask, flag, flag); + } + + void AddFlagExcept(const char* ptr, ui64 mask, ui64 flag, ui64 exclflag) { + if (0 != flag) + AddFlagImpl(ptr, mask, flag & ~exclflag, flag); + } + + void AddFlagUnless(const char* ptr, ui64 mask, ui64 flag, ui64 exclmask) { + if (0 != 
flag) + AddFlagImpl(ptr, mask, flag, flag, exclmask); + } + + void AddFlag(const char* ptr, ui64 mask, ui64 flag, ui64 exclflag, ui64 exclmask) { + if (0 != flag) + AddFlagImpl(ptr, mask, flag & ~exclflag, flag, exclmask); + } + + private: + void AddFlagImpl(const char* ptr, ui64 mask, ui64 plainflag, ui64 encflag) { + AddFlagAllPlaintextImpl(ptr, plainflag); + AddFlagEncodeMaskedImpl(encflag & mask); + } + + void AddFlagImpl(const char* ptr, ui64 mask, ui64 plainflag, ui64 encflag, ui64 exclmask) { + AddFlagAllPlaintextImpl(ptr, plainflag); + if (0 == (mask & exclmask)) + AddFlagEncodeMaskedImpl(encflag & mask); + } + + void AddFlagAllPlaintextImpl(const char* ptr, ui64 flag) { + if (nullptr == Beg) + Beg = ptr; + FlagsAllPlaintext |= flag; + } + + void AddFlagEncodeMaskedImpl(ui64 flag) { + if (0 == flag) + return; + FlagsEncodeMasked |= flag; + if (flag & TFeature::FeaturesMaybeEncode) + ++Encode; + else if (flag & TFeature::FeaturesDecode) + ++Decode; + } + }; + + } + + class TSection + : protected NParse::TRange { + private: + friend class TParser; + + private: + const char* End; + + TSection(const char* beg = nullptr) + : NParse::TRange(beg) + , End(nullptr) + { + } + + void Reset() { + Enter(nullptr); + } + + void Reset(const char* pc) { + Y_ASSERT(!Beg || !pc || Beg < pc); + Reset(); + } + + void Enter(const char* pc) { + *this = TSection(pc); + } + + bool Leave(const char* pc) { + Y_ASSERT(Beg); + End = pc; + return true; + } + + void Set(const TStringBuf& buf) { + Enter(buf.data()); + Leave(buf.data() + buf.length()); + } + + public: + bool IsSet() const { + return End; + } + + TStringBuf Get() const { + return TStringBuf(Beg, End); + } + + size_t Len() const { + return End - Beg; + } + + size_t DecodedLen() const { + return Len() - 2 * Decode; + } + + size_t EncodedLen() const { + return 2 * Encode + DecodedLen(); + } + + ui32 GetEncode() const { + return Encode; + } + + ui32 GetDecode() const { + return Decode; + } + + ui64 GetFlagsEncode() const { + 
return FlagsEncodeMasked; + } + + ui64 GetFlagsAllPlaintext() const { + return FlagsAllPlaintext; + } + }; + + class TParser { + public: + TSection Sections[TField::FieldUrlMAX]; + TScheme::EKind Scheme; + const TParseFlags Flags; + const TStringBuf UriStr; + TState::EParsed State; + ECharset Enc; + + public: + TParser(const TParseFlags& flags, const TStringBuf& uri, ECharset enc = CODES_UTF8) + : Scheme(TScheme::SchemeEmpty) + , Flags(flags | TFeature::FeatureDecodeANY) + , UriStr(uri) + , State(TState::ParsedEmpty) + , Enc(enc) + , HexValue(0) + , PctBegin(nullptr) + { + Y_ASSERT(0 == (Flags & TFeature::FeaturePathOperation) + // can't define all of them + || TFeature::FeaturesPath != (Flags & TFeature::FeaturesPath)); + State = ParseImpl(); + } + + public: + const TSection& Get(TField::EField fld) const { + return Sections[fld]; + } + TSection& GetMutable(TField::EField fld) { + return Sections[fld]; + } + bool Has(TField::EField fld) const { + return Get(fld).IsSet(); + } + bool IsNetPath() const { + return Has(TField::FieldHost) && 2 < UriStr.length() && '/' == UriStr[0] && '/' == UriStr[1]; + } + bool IsRootless() const { + return Has(TField::FieldScheme) && !Has(TField::FieldHost) && (!Has(TField::FieldPath) || '/' != Get(TField::FieldPath).Get()[0]); + } + // for RFC 2396 compatibility + bool IsOpaque() const { + return IsRootless(); + } + static ui64 GetFieldFlags(TField::EField fld, const TParseFlags& flags) { + return FieldFlags[fld] & flags; + } + ui64 GetFieldFlags(TField::EField fld) const { + return GetFieldFlags(fld, Flags); + } + + protected: + static const TParseFlags FieldFlags[TField::FieldUrlMAX]; + TSection::TRange CurRange; + unsigned HexValue; + const char* PctBegin; + +#ifdef DO_PRN + IOutputStream& PrintAddr(const char* ptr) const { + return Cdbg << "[" << IntToString<16>(ui64(ptr)) << "] "; + } + + IOutputStream& PrintHead(const char* ptr, const char* func) const { + return PrintAddr(ptr) << func << " "; + } + + IOutputStream& 
PrintHead(const char* ptr, const char* func, const TField::EField& fld) const { + return PrintHead(ptr, func) << fld; + } + + IOutputStream& PrintTail(const TStringBuf& val) const { + return Cdbg << " [" << val << "]" << Endl; + } + IOutputStream& PrintTail(const char* beg, const char* end) const { + return PrintTail(TStringBuf(beg, end)); + } +#endif + + void ResetSection(TField::EField fld, const char* pc = nullptr) { +#ifdef DO_PRN + PrintHead(pc, __FUNCTION__, fld); + PrintTail(pc); +#endif + Sections[fld].Reset(pc); + } + + void storeSection(const TStringBuf& val, TField::EField fld) { +#ifdef DO_PRN + PrintHead(val.data(), __FUNCTION__, fld); + PrintTail(val); +#endif + Sections[fld].Set(val); + } + + void startSection(const char* pc, TField::EField fld) { +#ifdef DO_PRN + PrintHead(pc, __FUNCTION__, fld); + PrintTail(pc); +#endif + copyRequirements(pc); + Sections[fld].Enter(pc); + } + + void finishSection(const char* pc, TField::EField fld) { +#ifdef DO_PRN + PrintHead(pc, __FUNCTION__, fld); + PrintTail(pc); +#endif + if (Sections[fld].Leave(pc)) + copyRequirements(pc); + } + + void setRequirement(const char* ptr, ui64 flags) { +#ifdef DO_PRN + PrintHead(ptr, __FUNCTION__) << IntToString<16>(flags) + << " & mask=" << IntToString<16>(Flags.Allow | Flags.Extra); + PrintTail(ptr); +#endif + CurRange.AddFlag(ptr, Flags.Allow | Flags.Extra, flags); + } + + void setRequirementExcept(const char* ptr, ui64 flags, ui64 exclflag) { +#ifdef DO_PRN + PrintHead(ptr, __FUNCTION__) << IntToString<16>(flags) + << " & exclflag=" << IntToString<16>(exclflag) + << " & mask=" << IntToString<16>(Flags.Allow | Flags.Extra); + PrintTail(ptr); +#endif + CurRange.AddFlagExcept(ptr, Flags.Allow | Flags.Extra, flags, exclflag); + } + + void setRequirementUnless(const char* ptr, ui64 flags, ui64 exclmask) { +#ifdef DO_PRN + PrintHead(ptr, __FUNCTION__) << IntToString<16>(flags) + << " & exclmask=" << IntToString<16>(exclmask) + << " & mask=" << IntToString<16>(Flags.Allow | 
Flags.Extra); + PrintTail(ptr); +#endif + CurRange.AddFlagUnless(ptr, Flags.Allow | Flags.Extra, flags, exclmask); + } + + void copyRequirementsImpl(const char* ptr); + void copyRequirements(const char* ptr) { + PctEnd(ptr); + if (nullptr != CurRange.Beg && CurRange.Beg != ptr) + copyRequirementsImpl(ptr); + } + + void HexDigit(const char* ptr, char c) { + Y_UNUSED(ptr); + HexAdd(c - '0'); + } + void HexUpper(const char* ptr, char c) { + setRequirementUnless(ptr, TFeature::FeatureToLower, TFeature::FeatureUpperEncoded); + HexAdd(c - 'A' + 10); + } + void HexLower(const char* ptr, char c) { + setRequirement(ptr, TFeature::FeatureUpperEncoded); + HexAdd(c - 'a' + 10); + } + void HexAdd(unsigned val) { + HexValue <<= 4; + HexValue += val; + } + void HexReset() { + HexValue = 0; + } + void HexSet(const char* ptr); + + void PctEndImpl(const char* ptr); + void PctEnd(const char* ptr) { + if (nullptr != PctBegin && ptr != PctBegin) + PctEndImpl(ptr); + } + void PctBeg(const char* ptr) { + PctEnd(ptr); + HexReset(); + PctBegin = ptr; + } + + void checkSectionCollision(TField::EField fld1, TField::EField fld2) { + if (Sections[fld1].IsSet() && Sections[fld2].IsSet() && Sections[fld1].Beg == Sections[fld2].Beg) { + Sections[fld1].Reset(); + } + } + + bool doParse(const char* str_beg, size_t length); + TState::EParsed ParseImpl(); + }; + +} diff --git a/library/cpp/uri/parsefsm.rl6 b/library/cpp/uri/parsefsm.rl6 new file mode 100644 index 00000000000..70977236503 --- /dev/null +++ b/library/cpp/uri/parsefsm.rl6 @@ -0,0 +1,501 @@ +#include <library/cpp/uri/parse.h> + +#ifdef __clang__ + #pragma clang diagnostic ignored "-Wunused-variable" +#endif + +%%{ + machine TParser; + + #================================================ + # RFC 3986 http://tools.ietf.org/html/rfc3986 + # with some modifications + #================================================ + # The RegEx + # + # http://www.ics.uci.edu/pub/ietf/uri/#Related + # ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 
// library/cpp/uri/parsefsm.rl6
#include <library/cpp/uri/parse.h>

#ifdef __clang__
    #pragma clang diagnostic ignored "-Wunused-variable"
#endif

%%{
    machine TParser;

    #================================================
    # Grammar of RFC 3986 (http://tools.ietf.org/html/rfc3986)
    # with local modifications, each marked "MOD" below.
    #================================================
    # Reference regular expression
    # (http://www.ics.uci.edu/pub/ietf/uri/#Related):
    #
    #   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
    #    12            3  4          5       6  7        8 9
    #
    # Applied to that very URL the groups are
    #   $1 = http:              $2 = http
    #   $3 = //www.ics.uci.edu  $4 = www.ics.uci.edu
    #   $5 = /pub/ietf/uri/
    #   $6 = <undefined>        $7 = <undefined>
    #   $8 = #Related           $9 = Related
    #
    # i.e. $2:scheme $4:authority $5:path $7:query $9:fragment
    #================================================


    #================================================
    # ASCII characters by class (f= lists the fields where a
    # general delimiter may appear literally)
    #================================================
    #
    # x00-1F, x7F (DEL)     ext_cntrl
    # x20 (space)           ext_space
    # x80-FF                ext_ascii
    # %                     PCT
    # A-Z a-z 0-9 - . _ ~   unreserved
    # ! $ & ' ( ) * + , ; = sub_delims
    # " < > \ ^ ` { | }     ext_delims
    # [ ]                   gen_delims / ext_delims
    # #                     gen_delims / f=frag
    # /                     gen_delims / f=path,qry,frag
    # :                     gen_delims / f=pass,path,qry,frag
    # ?                     gen_delims / f=qry,frag
    # @                     gen_delims / f=path,qry,frag


    #================================================
    # Shared actions
    #================================================

    action act_req_enc_sql { REQ(fpc, FeatureEncodeForSQL) }

    # The requirement has to land on a character inside the range and
    # before the range is reset, hence fpc - 1
    action act_req_pathop { REQ(fpc - 1, FeaturePathOperation) }

    action act_clr_scheme { CLR(fpc, Scheme) }
    action act_clr_user   { CLR(fpc, User) }
    action act_clr_host   { CLR(fpc, Host) }
    action act_beg_host   { BEG(fpc, Host) }
    action act_end_host   { END(fpc, Host) }
    action act_beg_path   { BEG(fpc, Path) }
    action act_end_path   { END(fpc, Path) }


    #================================================
    # RFC 3986 ABNF building blocks
    #================================================

    DIGIT = digit;

    ALPHA = ( upper >{ REQ(fpc, FeatureToLower) } ) |
            lower;

    ALNUM = ALPHA | DIGIT;

    PCT = "%" >{ PctBeg(fpc); } ;

    HEXDIG = (
        DIGIT >{ HexDigit(fpc, fc); }
      | [A-F] >{ HexUpper(fpc, fc); }
      | [a-f] >{ HexLower(fpc, fc); }
    );

    # HexSet records a requirement, so it must point inside the range
    HEXNUM = ( HEXDIG HEXDIG ) %{ HexSet(fpc - 1); };

    pct_encoded = PCT HEXNUM;

    unreserved = ALNUM | "-" | "." | "_" | "~";

    gen_delims = ":" | "/" | "?" | "#" | "[" | "]" | "@";

    sub_delims = "!" | "$" | "&" | "(" | ")"
               | "*" | "+" | "," | ";" | "="
               | ( ['] >act_req_enc_sql );


    #================================================
    # Local definitions
    #================================================

    VALID = ^(cntrl | space) | " ";

    # characters that are always safe
    safe = unreserved | pct_encoded | sub_delims;

    # MOD: Yandex extensions

    ext_ascii  = (VALID - ascii) >{ REQ(fpc, FeatureEncodeExtendedASCII) };
    ext_delims = ( "[" | "]" | "|" | "{" | "}" | "`" | "^" | "<" | ">"
                 | ( ["\\] >act_req_enc_sql )
                 ) >{ REQ(fpc, FeatureEncodeExtendedDelim) }; # " keeps highlighters happy
    ext_space  = " " >{ REQ(fpc, FeatureEncodeSpace) };
    ext_cntrl  = cntrl >{ REQ(fpc, FeatureEncodeCntrl) };

    # a '%' followed by zero, one or two hex digits
    pct_maybe_encoded = PCT (HEXDIG | HEXNUM)? ;
    ext_safe = unreserved
             | pct_maybe_encoded
             | sub_delims
             | ext_delims
             | ext_space
             | ext_cntrl
             | ext_ascii;

    # RFC: pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
    #      uric comes from RFC 2396
    # MOD: extended delimiters and 8-bit characters are accepted too

    pchar_nc = ext_safe | "@";
    pchar    = pchar_nc | ":";
    path_sep = "/";
    uric     = pchar | path_sep | "?";


    #================================================
    # Fields
    #================================================
    # Machines named fXXX capture a single field


    #================================================
    # Scheme
    # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    #================================================

    scheme = ( ALPHA ( ALPHA | DIGIT | "+" | "-" | "." )** );
    fscheme = scheme >{ BEG(fpc, Scheme) } %{ END(fpc, Scheme) };


    #================================================
    # UserInfo
    # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
    #================================================

    # MOD: captured as two sections, user name and password

    fuser = ( ext_safe )** >{ BEG(fpc, User) } %{ END(fpc, User) };
    fpass = ( ext_safe | ":" )** >{ BEG(fpc, Pass) } %{ END(fpc, Pass) };
    userinfo = ( fuser ( ":" fpass )? ) ( "@" %act_clr_host @^act_clr_user );


    #================================================
    # Hostname
    # host = IP-literal / IPv4address / reg-name
    #================================================

    # MOD: IP-literal is simplified for now
    IPv6address = (HEXDIG | ":" | ".")+;
    IP_literal = "[" IPv6address "]";

    # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
    # MOD: dec-octet is any digit run; the RFC restricts it to 0-255

    dec_octet = DIGIT+;
    IPv4address = dec_octet "." dec_octet "." dec_octet "." dec_octet;

    # MOD: non-empty here; use host? where emptiness is allowed
    # reg-name = *( unreserved / pct-encoded / sub-delims )
    ### todo: allow ':' (the grammar must first disambiguate the port)
    achar = any - (0x00 .. 0x20) - '/' - '#' - '?' - ':' - '%';
    upperhalf = any - (0x00 .. 0x7F);
    hostname = (((achar | pct_encoded)+) & (any* (alnum | upperhalf) any*));
    reg_name = hostname - IPv4address - IP_literal;

    # first match wins
    host = IP_literal | IPv4address | (reg_name - IPv4address);
    fhost = host? >act_beg_host %act_end_host;
    fhost_nempty = host >act_beg_host %act_end_host;


    #================================================
    # Port
    # port = *DIGIT
    #================================================

    # MOD: non-empty; use fport? where emptiness is allowed
    fport = DIGIT+ >{ BEG(fpc, Port) } %{ END(fpc, Port) };


    #================================================
    # Authority
    # authority = [ userinfo "@" ] host [ ":" port ]
    #================================================

    authority = userinfo? fhost ( ":" fport? )? ;


    #================================================
    # Path
    #================================================
    # path = path-abempty    ; begins with "/" or is empty
    #      / path-absolute   ; begins with "/" but not "//"
    #      / path-noscheme   ; begins with a non-colon segment
    #      / path-rootless   ; begins with a segment
    #      / path-empty      ; zero characters
    #================================================

    # detectors for "." / ".." segments and empty segments

    checkPathHead =
        "." ( "."? path_sep VALID* )? %act_req_pathop ;

    checkPathTail =
        VALID*
        ( path_sep "."{1,2} ) %act_req_pathop ;

    checkPathMid = VALID*
        ( path_sep "."{,2} path_sep ) %act_req_pathop
        VALID*;

    checkAbsPath = checkPathMid | checkPathTail | VALID*;
    checkRelPath = checkPathHead | checkAbsPath;

    # segment = *pchar
    segment = pchar**;

    # segment-nz = 1*pchar
    segment_nz = pchar+;

    # segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
    segment_nz_nc = pchar_nc+;

    sep_segment = path_sep segment;

    # non-standard path forms

    fpath_abnempty =
        (
            ( sep_segment+ )
            & checkAbsPath
        )
        >act_beg_path %act_end_path
        ;

    fpath_relative =
        (
            "."
            ( "."? sep_segment+ )?
        )
        >act_beg_path %act_req_pathop %act_end_path
        ;

    # standard path forms

    # an empty path is not stored: it behaves differently in relative resolution
    fpath_empty = zlen;

    fpath_abempty = fpath_abnempty?;

    fpath_absolute =
        (
            ( path_sep ( segment_nz sep_segment* )? )
            & checkAbsPath
        )
        >act_beg_path %act_end_path
        ;

    fpath_noscheme =
        (
            ( segment_nz_nc sep_segment* )
            & checkRelPath
        )
        >act_beg_path %act_end_path
        ;

    fpath_rootless =
        (
            ( segment_nz sep_segment* )
        )
        >act_beg_path %act_end_path
        ;

    #================================================
    # Query and fragment
    # query    = *( pchar / "/" / "?" )
    # fragment = *( pchar / "/" / "?" )
    #================================================

    # MOD: '#' is accepted inside the fragment

    fquery = (uric )** >{ BEG(fpc, Query) } %{ END(fpc, Query) };
    ffrag = (uric | "#")** >{ BEG(fpc, Frag) } %{ END(fpc, Frag) };
    query_frag = ("?" fquery)? ("#" ffrag)? ;


    #================================================
    # Top-level rules
    # URI-reference = URI / relative-ref
    #================================================
    # URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
    # hier-part     = "//" authority path-abempty
    #               / path-absolute
    #               / path-rootless
    #               / path-empty
    # relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
    # relative-part = "//" authority path-abempty
    #               / path-absolute
    #               / path-noscheme
    #               / path-empty

    net_path = "//" authority fpath_abempty;

    URI =
        fscheme ":"
        (
            net_path
          | fpath_absolute
          | fpath_rootless
          | fpath_empty
        )
        $^act_clr_scheme
        query_frag
        ;

    relative_ref =
        (
            net_path
          | fpath_absolute
          | fpath_noscheme
          | fpath_empty
        )
        %act_clr_scheme
        query_frag
        ;

    # non-standard top-level forms

    URI_no_rootless =
        fscheme ":"
        (
            net_path
          | fpath_absolute
          | fpath_empty
        )
        $^act_clr_scheme
        query_frag
        ;

    host_path =
        (
            fhost_nempty fpath_abempty
          | (fhost_nempty - scheme) ":" fport fpath_abempty
        )
        @^act_clr_host
        ;

    # no userinfo; path is absolute, empty, or clearly relative ("./", "../")
    relative_ref_host_pabem =
        (
            net_path
          | host_path
          | fpath_absolute
          | fpath_relative
          | fpath_empty
        )
        %act_clr_scheme
        query_frag
        ;

    # in the first alternative the port must be non-empty so that it
    # cannot be confused with "scheme:/..."
    auth_path =
        (
            fhost_nempty ( ":" fport )? fpath_abempty
          | userinfo fhost ( ":" fport? )? fpath_abempty
        )
        @^act_clr_host
        @^act_clr_user
        ;

    # with userinfo; path is absolute, empty, or clearly relative ("./", "../")
    relative_ref_auth_pabem =
        (
            net_path
          | auth_path
          | fpath_absolute
          | fpath_relative
          | fpath_empty
        )
        %act_clr_scheme
        query_frag
        ;


    # entry points

    URI_ref_no_rootless :=
        (
            URI_no_rootless
            # scheme://user@host wins over user://pass@host/path
          | relative_ref_auth_pabem
        )
        ;

    URI_ref_no_relpath :=
        (
            relative_ref_host_pabem
            # host:port/path wins over scheme:path/rootless
          | (URI - relative_ref_host_pabem)
        )
        ;

    URI_ref :=
        (
            relative_ref
          | URI
        )
        ;

    write data;

}%%

namespace NUri {

// Runs the generated state machine over [str_beg, str_beg + length).
// The entry point depends on FeatureNoRelPath / FeatureAllowRootless.
// Returns true when the machine stops in a final state.
bool TParser::doParse(const char* str_beg, size_t length)
{
    // p, pe, eof and cs are the names the generated code refers to
    const char* p = str_beg;
    const char* pe = p + length;
    const char* eof = pe;
    int cs;

    // shorthands used by the actions embedded in the grammar above
#define BEG(ptr, fld) startSection (ptr, TField::Field ## fld);
#define END(ptr, fld) finishSection(ptr, TField::Field ## fld);
#define SET(val, fld) storeSection(val, TField::Field ## fld);
#define CLR(ptr, fld) ResetSection (TField::Field ## fld, ptr);
#define REQ(ptr, req) setRequirement(ptr, TFeature :: req);

    %% write init nocs;

    const bool relPathAllowed = 0 == (Flags & TFeature::FeatureNoRelPath);
    const bool rootlessDenied = 0 == (Flags & TFeature::FeatureAllowRootless);
    if (relPathAllowed)
        cs = TParser_en_URI_ref;
    else if (rootlessDenied)
        cs = TParser_en_URI_ref_no_rootless;
    else
        cs = TParser_en_URI_ref_no_relpath;

    %% write exec;

#undef BEG
#undef END
#undef SET
#undef CLR
#undef REQ

    return cs >= TParser_first_final;
}

}
MakeArg(TQArgNode* prev) { + return {prev, 0, {}, {}, {}}; + } + + const char* SkipDelimiter(const char* str, const char* end) { + while (str != end) + if (*str == '&') + ++str; + else + break; + return str; + } + + /// return next pos or 0 if error + const char* ExtractArgData(const char* pos, const char* end, TQArgNode* arg) { + const char* nameStart = pos; + const char* nextArg = strchr(pos, '&'); + const char* valueStart = strchr(pos, '='); + if (valueStart && nextArg && valueStart < nextArg) // a=1& or a=& + { + arg->Name = TStringBuf(nameStart, valueStart - nameStart); + arg->Value = TStringBuf(valueStart + 1, nextArg - valueStart - 1); + arg->All = TStringBuf(nameStart, nextArg - nameStart); + return nextArg; + } else if (valueStart && nextArg && valueStart > nextArg) // a&b=2 + { + arg->Name = TStringBuf(nameStart, nextArg - nameStart); + arg->All = arg->Name; + return nextArg; + } else if (valueStart && !nextArg) // a=1 or a= + { + arg->Name = TStringBuf(nameStart, valueStart - nameStart); + arg->Value = TStringBuf(valueStart + 1, end - valueStart - 1); + arg->All = TStringBuf(nameStart, end - nameStart); + return end; + } else if (!valueStart && nextArg) // a&b + { + arg->Name = TStringBuf(nameStart, nextArg - nameStart); + arg->All = arg->Name; + return nextArg; + } else { // a + arg->Name = TStringBuf(nameStart, end - nameStart); + arg->All = arg->Name; + return end; + } + } + + // arg can be null + TQArgNode* GetHead(TQArgNode* arg) { + TQArgNode* prev = arg; + while (prev) { + arg = prev; + prev = prev->Prev; + } + return arg; + } + + // arg can be null + TQArgNode* GetLast(TQArgNode* arg) { + TQArgNode* next = arg; + while (next) { + arg = next; + next = arg->Next; + } + return arg; + } + + int CompareName(const TQArgNode* l, const TQArgNode* r) { + return l->Name.compare(r->Name); + } + + TQArgNode* Move(TQArgNode* before, TQArgNode* node) { + TQArgNode* tn = node->Next; + TQArgNode* tp = node->Prev; + + node->Prev = before->Prev; + if (node->Prev) 
+ node->Prev->Next = node; + + node->Next = before; + before->Prev = node; + + if (tn) + tn->Prev = tp; + if (tp) + tp->Next = tn; + + return node; + } + + // return new head + TQArgNode* QSortByName(TQArgNode* iter, TQArgNode* last) { + if (iter == last) + return iter; + if (iter->Next == last) { + int c = CompareName(iter, last); + return c <= 0 ? iter : Move(iter, last); + } else { + TQArgNode* pivot = iter; + iter = iter->Next; + TQArgNode* head = 0; + TQArgNode* tail = 0; + TQArgNode* tailPartitionStart = pivot; + while (true) { + TQArgNode* next = iter->Next; + int c = CompareName(iter, pivot); + int sign = (0 < c) - (c < 0); + switch (sign) { + case -1: + head = head ? Move(head, iter) : Move(pivot, iter); + break; + + case 0: + pivot = Move(pivot, iter); + break; + + case 1: + tail = iter; + break; + } + + if (iter == last) + break; + iter = next; + } + + if (head) + head = QSortByName(head, pivot->Prev); + if (tail) + QSortByName(tailPartitionStart->Next, tail); + return head ? 
head : pivot; + } + } + } + + using namespace NOnStackArgsList; + + class TQueryArgProcessing::Pipeline { + public: + Pipeline(TQueryArgProcessing& parent, TUri& subject) + : Parent(parent) + , Subject(subject) + , ArgsCount(0) + , IsDirty(false) + { + } + + TQueryArg::EProcessed Process() { + const TStringBuf& query = Subject.GetField(NUri::TField::FieldQuery); + if (query.empty()) + return ProcessEmpty(); + + const char* start = query.data(); + return Parse(start, start + query.length(), 0); + } + + TQueryArg::EProcessed ProcessEmpty() { + if (Parent.Flags & TQueryArg::FeatureRemoveEmptyQuery) + Subject.FldClr(NUri::TField::FieldQuery); + + return TQueryArg::ProcessedOK; + } + + TQueryArg::EProcessed Parse(const char* str, const char* end, TQArgNode* prev) { + str = SkipDelimiter(str, end); + + if (str == end) { + TQArgNode* head = GetHead(prev); + TQArgNode* last = GetLast(prev); + return FinalizeParsing(head, last); + } else { + TQArgNode current = MakeArg(prev); + const char* next = ExtractArgData(str, end, ¤t); + if (!next) + return TQueryArg::ProcessedMalformed; + + TQArgNode* tail = ApplyFilter(prev, ¤t); + + if (++ArgsCount > MaxCount) + return TQueryArg::ProcessedTooMany; + + return Parse(next, end, tail); + } + } + + TQArgNode* ApplyFilter(TQArgNode* prev, TQArgNode* current) { + if (Parent.Flags & TQueryArg::FeatureFilter) { + TQueryArg arg = {current->Name, current->Value}; + if (!Parent.Filter(arg, Parent.FilterData)) { + IsDirty = true; + return prev; + } + } + + if (prev) + prev->Next = current; + return current; + } + + TQueryArg::EProcessed FinalizeParsing(TQArgNode* head, TQArgNode* last) { + if (Parent.Flags & TQueryArg::FeatureSortByName) { + head = QSortByName(head, last); + IsDirty = true; + } + + if (!IsDirty) + return TQueryArg::ProcessedOK; + + bool dirty = Render(head); + + bool rewrite = Parent.Flags & TQueryArg::FeatureRewriteDirty; + if (dirty && rewrite) + Subject.Rewrite(); + return (!dirty || rewrite) ? 
TQueryArg::ProcessedOK : TQueryArg::ProcessedDirty; + } + + bool Render(TQArgNode* head) { + std::string& result = Parent.Buffer; + result.clear(); + result.reserve(Subject.GetField(NUri::TField::FieldQuery).length()); + bool first = true; + while (head) { + if (first) + first = false; + else + result.append("&"); + + result.append(head->All); + head = head->Next; + } + + if (result.empty()) + return RenderEmpty(); + else + return Subject.FldMemSet(NUri::TField::FieldQuery, result); + } + + bool RenderEmpty() { + if (Parent.Flags & TQueryArg::FeatureRemoveEmptyQuery) + Subject.FldClr(NUri::TField::FieldQuery); + return false; + } + + private: + TQueryArgProcessing& Parent; + TUri& Subject; + + unsigned ArgsCount; + bool IsDirty; + + static const unsigned MaxCount = 100; + }; + + TQueryArgProcessing::TQueryArgProcessing(ui32 flags, TQueryArgFilter filter, void* filterData) + : Flags(flags) + , Filter(filter) + , FilterData(filterData) + { + } + + TQueryArg::EProcessed TQueryArgProcessing::Process(TUri& uri) { + Pipeline pipeline(*this, uri); + return pipeline.Process(); + } +} diff --git a/library/cpp/uri/qargs.h b/library/cpp/uri/qargs.h new file mode 100644 index 00000000000..fcba7cbd0cc --- /dev/null +++ b/library/cpp/uri/qargs.h @@ -0,0 +1,22 @@ +#pragma once + +#include "common.h" +#include "uri.h" +#include <string> + +namespace NUri { + class TQueryArgProcessing { + public: + TQueryArgProcessing(ui32 flags, TQueryArgFilter filter = 0, void* filterData = 0); + + TQueryArg::EProcessed Process(TUri& uri); + + private: + ui32 Flags; + TQueryArgFilter Filter; + void* FilterData; + + class Pipeline; + std::string Buffer; + }; +} diff --git a/library/cpp/uri/uri-ru_ut.cpp b/library/cpp/uri/uri-ru_ut.cpp new file mode 100644 index 00000000000..ec35a164d29 --- /dev/null +++ b/library/cpp/uri/uri-ru_ut.cpp @@ -0,0 +1,163 @@ +#include "uri_ut.h" +#include <library/cpp/charset/recyr.hh> +#include <library/cpp/html/entity/htmlentity.h> +#include <util/system/maxlen.h> + 
+namespace NUri { + namespace { + TString AsWin1251(const TString& s) { + return Recode(CODES_UTF8, CODES_WIN, s); + } + TString AsKoi8(const TString& s) { + return Recode(CODES_UTF8, CODES_KOI8, s); + } + } + + Y_UNIT_TEST_SUITE(URLTestRU) { + Y_UNIT_TEST(test_httpURL2) { + TUri url; + UNIT_ASSERT_VALUES_EQUAL(url.Parse("g:h"), TState::ParsedBadScheme); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("http:g"), TState::ParsedBadFormat); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("/../g"), TState::ParsedBadPath); + const char* const UpCaseUrl = "http://www.TEST.Ru:80/InDex.html"; + UNIT_ASSERT_VALUES_EQUAL(url.Parse(UpCaseUrl), TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://www.TEST.Ru/InDex.html"); + UNIT_ASSERT_VALUES_EQUAL(url.Parse(UpCaseUrl, TFeature::FeaturesDefault | TFeature::FeatureToLower), TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://www.test.ru/InDex.html"); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagScheme), "http:"); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagScheme | TField::FlagHost), "http://www.test.ru"); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHost), "www.test.ru"); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHost | TField::FlagPath), "www.test.ru/InDex.html"); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagQuery), ""); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("http://www.TEST.Ru:90/InDex.html"), TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHostPort | TField::FlagPath), "www.TEST.Ru:90/InDex.html"); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("www.ya.ru/index.html"), TState::ParsedOK); + UNIT_ASSERT(!url.IsValidAbs()); + UNIT_ASSERT(url.IsNull(TField::FlagHost)); + UNIT_ASSERT(!url.IsNull(TField::FlagPath)); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagPath), "www.ya.ru/index.html"); + + UNIT_ASSERT_VALUES_EQUAL(url.Parse(AsWin1251("www.TEST.Ru/ФЕУФ\\'\".html?ФЕУФ\\'\"=ФЕУФ+\\'\"%10")), TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), 
AsWin1251("www.TEST.Ru/ФЕУФ\\'\".html?ФЕУФ\\'\"=ФЕУФ+\\'\"%10")); + + UNIT_ASSERT_VALUES_EQUAL(url.Parse(AsWin1251("www.TEST.Ru/ФЕУФ\\'\".html?ФЕУФ\\'\"=ФЕУФ+\\'\"%10"), + TFeature::FeaturesDefault | TFeature::FeatureEncodeExtendedASCII), + TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), + AsWin1251("www.TEST.Ru/%D4%C5%D3%D4\\'\".html?%D4%C5%D3%D4\\'\"=%D4%C5%D3%D4+\\'\"%10")); + + UNIT_ASSERT_VALUES_EQUAL(url.Parse(AsWin1251("www.TEST.Ru/ФЕУФ\\'\".html?ФЕУФ\\'\"=ФЕУФ+\\'\"%10"), + TFeature::FeaturesDefault | TFeature::FeatureEncodeForSQL), + TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), AsWin1251("www.TEST.Ru/ФЕУФ%5C%27%22.html?ФЕУФ%5C%27%22=ФЕУФ+%5C%27%22%10")); + + UNIT_ASSERT_VALUES_EQUAL(url.Parse("q/%33%26%13%2f%2b%30%20", + TFeature::FeaturesDefault | TFeature::FeatureDecodeStandard), + TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "q/3%26%13/%2B0%20"); + + UNIT_ASSERT_VALUES_EQUAL(url.Parse("http://www.prime-tass.ru/news/0/{656F5BAE-6677-4762-9BED-9E3B77E72055}.uif"), + TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("//server/path"), TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("//server/path", TFeature::FeaturesRobot), TState::ParsedOK); + } + + const TString links[] = { + "viewforum.php?f=1&sid=b4568481b67b1d7683bea78634b2e240", "viewforum.php?f=1&sid=b4568481b67b1d7683bea78634b2e240", + "./viewtopic.php?p=74&sid=6#p74", "./viewtopic.php?p=74&sid=6#p74", + "viewtopic.php?p=9313&sid=8#9313", "viewtopic.php?p=9313&sid=8#9313", + "profile.php?mode=viewprofile&u=-1#drafts&sid=a6e5989cee27adb5996bfff044af04ca", "profile.php?mode=viewprofile&u=-1#drafts&sid=a6e5989cee27adb5996bfff044af04ca", + + "images\nil.jpg", "images%0Ail.jpg", + "http://caedebaturque.termez.su\r\n/?article=218", "http://caedebaturque.termez.su%0D%0A/?article=218", + + AsKoi8("javascript:window.external.AddFavorite(\'http://www.humor.look.ru/\',\'Злобные Деды Морозы!!!\')"), 
"javascript:window.external.AddFavorite(\'http://www.humor.look.ru/\',\'%FA%CC%CF%C2%CE%D9%C5%20%E4%C5%C4%D9%20%ED%CF%D2%CF%DA%D9!!!\')", + "search.php?search_author=%CB%FE%E4%EC%E8%EB%E0+%C3%F3%F1%E5%E2%E0&showresults=posts&sid=8", "search.php?search_author=%CB%FE%E4%EC%E8%EB%E0+%C3%F3%F1%E5%E2%E0&showresults=posts&sid=8", + AsWin1251("/Search/author/?q=Штрибель Х.В."), "/Search/author/?q=%D8%F2%F0%E8%E1%E5%EB%FC%20%D5.%C2.", + AsWin1251("javascript:ins(\'ГОРШОК\')"), "javascript:ins(\'%C3%CE%D0%D8%CE%CA\')", + AsWin1251("?l=я"), "?l=%FF", + AsWin1251("content.php?id=3392&theme=Цена"), "content.php?id=3392&theme=%D6%E5%ED%E0", + "/a-mp3/stype-1/?search=А", "/a-mp3/stype-1/?search=%D0%90", + "/a-mp3/stype-1/?search=Б", "/a-mp3/stype-1/?search=%D0%91", + "/a-mp3/stype-1/?search=В", "/a-mp3/stype-1/?search=%D0%92", + "/a-mp3/stype-1/?search=Г", "/a-mp3/stype-1/?search=%D0%93", + "/a-mp3/stype-1/?search=Д", "/a-mp3/stype-1/?search=%D0%94", + "/a-mp3/stype-1/?search=Е", "/a-mp3/stype-1/?search=%D0%95", + "/a-mp3/stype-1/?search=Ж", "/a-mp3/stype-1/?search=%D0%96", + "/a-mp3/stype-1/?search=З", "/a-mp3/stype-1/?search=%D0%97", + // %98 is not defined in CP1251 so don't put it here explicitly + "/a-mp3/stype-1/?search=\xD0\x98", "/a-mp3/stype-1/?search=%D0%98", + "/a-mp3/stype-1/?search=Й", "/a-mp3/stype-1/?search=%D0%99", + "/a-mp3/stype-1/?search=К", "/a-mp3/stype-1/?search=%D0%9A", + "/a-mp3/stype-1/?search=Л", "/a-mp3/stype-1/?search=%D0%9B", + "/a-mp3/stype-1/?search=М", "/a-mp3/stype-1/?search=%D0%9C", + "/a-mp3/stype-1/?search=Н", "/a-mp3/stype-1/?search=%D0%9D", + "/a-mp3/stype-1/?search=О", "/a-mp3/stype-1/?search=%D0%9E", + "/a-mp3/stype-1/?search=П", "/a-mp3/stype-1/?search=%D0%9F", + "/a-mp3/stype-1/?search=\xD0", "/a-mp3/stype-1/?search=%D0", + "/a-mp3/stype-1/?search=С", "/a-mp3/stype-1/?search=%D0%A1", + "/a-mp3/stype-1/?search=Т", "/a-mp3/stype-1/?search=%D0%A2", + "/a-mp3/stype-1/?search=У", "/a-mp3/stype-1/?search=%D0%A3", + "/a-mp3/stype-1/?search=Ф", 
"/a-mp3/stype-1/?search=%D0%A4", + "/a-mp3/stype-1/?search=Х", "/a-mp3/stype-1/?search=%D0%A5", + "/a-mp3/stype-1/?search=Ц", "/a-mp3/stype-1/?search=%D0%A6", + "/a-mp3/stype-1/?search=Ч", "/a-mp3/stype-1/?search=%D0%A7", + "/a-mp3/stype-1/?search=Ш", "/a-mp3/stype-1/?search=%D0%A8", + "/a-mp3/stype-1/?search=Щ", "/a-mp3/stype-1/?search=%D0%A9", + "/a-mp3/stype-1/?search=Ы", "/a-mp3/stype-1/?search=%D0%AB", + "/a-mp3/stype-1/?search=Э", "/a-mp3/stype-1/?search=%D0%AD", + "/a-mp3/stype-1/?search=Ю", "/a-mp3/stype-1/?search=%D0%AE", + "/a-mp3/stype-1/?search=Я", "/a-mp3/stype-1/?search=%D0%AF", + + "javascript:emoticon(\":'(\")", "javascript:emoticon(\":\'(\")", + "javascript:emoticon(\'>:o\')", "javascript:emoticon(\'>:o\')", + "javascript:emoticon(\']:->\')", "javascript:emoticon(\']:->\')", + "javascript:emoticon(\':-!\')", "javascript:emoticon(\':-!\')", + "javascript:emoticon(\'@}->--\')", "javascript:emoticon(\'@}->--\')", + "http://www.is-ufa.ru/price2/price_IS.rar", "http://www.is-ufa.ru/price2/price_IS.rar", + "mailto:info@etem.de", "mailto:info@etem.de", + ""http://www.fubix.ru"", "\"http://www.fubix.ru\"", + AsWin1251("mailto:kampa@ukr.net?subject=Арабский язык"), "mailto:kampa@ukr.net?subject=%C0%F0%E0%E1%F1%EA%E8%E9%20%FF%E7%FB%EA", + {}}; + + Y_UNIT_TEST(testHtLinkDecode) { + char decodedlink[URL_MAXLEN + 10]; + for (int i = 0; links[i]; i += 2) { + UNIT_ASSERT(HtLinkDecode(links[i].c_str(), decodedlink, sizeof(decodedlink))); + UNIT_ASSERT_VALUES_EQUAL(decodedlink, links[i + 1]); + } + } + + Y_UNIT_TEST(testRuIDNA) { + { +#define DEC "\xD7\xE5\xF0\xE5\xEf\xEE\xE2\xE5\xF6.\xF0\xF4" /* "Череповец.рф" in Windows-1251 */ +#define ENC "%D7%E5%F0%E5%EF%EE%E2%E5%F6.%F0%F4" +// punycode corresponds to lowercase +#define PNC "xn--b1afab7bff7cb.xn--p1ai" + TTest test = { + "http://" ENC "/" ENC "?" 
ENC "#" ENC, TParseFlags(TFeature::FeaturesAll | TFeature::FeatureAllowHostIDN, TFeature::FeatureDecodeExtendedASCII), TState::ParsedOK, "http", "", "", DEC, 80, "/" ENC, ENC, ENC}; + TUri url; + URL_TEST_ENC(url, test, CODES_WIN); + UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::FieldHostAscii), PNC); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" DEC "/" ENC "?" ENC "#" ENC); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHostAscii), "http://" PNC "/" ENC "?" ENC "#" ENC); +#undef PNC +#undef DEC +#undef ENC + } + } + + // Regression test for SEARCH-11283 + Y_UNIT_TEST(RegressionTest11283) { + TStringBuf url = "http://xn--n1aaa.пидорасы.com/"; + + TUri uri; + TState::EParsed er = uri.Parse(url, NUri::TParseFlags(NUri::TFeature::FeaturesRobot | NUri::TFeature::FeatureNoRelPath)); + UNIT_ASSERT_VALUES_EQUAL(er, TState::ParsedOK); + TStringBuf host = uri.GetHost(); + // Should be properly null-terminated + UNIT_ASSERT_VALUES_EQUAL(host.size(), strlen(host.data())); + } + } + +} diff --git a/library/cpp/uri/uri.cpp b/library/cpp/uri/uri.cpp new file mode 100644 index 00000000000..1664e8c8ddf --- /dev/null +++ b/library/cpp/uri/uri.cpp @@ -0,0 +1,623 @@ +#include "uri.h" +#include "parse.h" + +#include <util/string/cast.h> +#include <util/string/util.h> +#include <util/system/maxlen.h> +#include <util/system/yassert.h> +#include <util/generic/map.h> + +namespace NUri { + TState::EParsed TUri::CheckHost(const TStringBuf& host) { + if (host.empty()) + return ParsedOK; + + unsigned domainLevel = 0; + unsigned domainLevelOfUnderscore = 0; + + bool isAlnum = false; + bool startLabel = true; + for (size_t i = 0; i != host.length(); ++i) { + const char ch = host[i]; + + if ('.' 
== ch) { // label separator + if (!isAlnum || startLabel) // previous label must end in alnum + return ParsedBadHost; + startLabel = true; + continue; + } + + isAlnum = isalnum((const unsigned char)ch); + + if (startLabel) { // label is starting + if (!isAlnum && '_' != ch) // new label must start with alnum or '_' + return ParsedBadHost; + startLabel = false; + ++domainLevel; + if (ch == '_') + domainLevelOfUnderscore = domainLevel; + continue; + } + + if (isAlnum || '-' == ch) + continue; + + if (ch == '_') { // non-standard case we allow for certain hosts + domainLevelOfUnderscore = domainLevel; + continue; + } + + return ParsedBadHost; + } + + if (0 < domainLevelOfUnderscore && domainLevel < 2 + domainLevelOfUnderscore) + return ParsedBadHost; + + return ParsedOK; + } + + /********************************************************/ + TUri::TUri(const TStringBuf& host, ui16 port, const TStringBuf& path, const TStringBuf& query, const TStringBuf& scheme, unsigned defaultPort) + : FieldsSet(0) + , Port(port) + , DefaultPort(0) + , Scheme(SchemeEmpty) + , FieldsDirty(0) + { + if (!scheme.empty()) { + if (SetSchemeImpl(TSchemeInfo::Get(scheme)).Str.empty()) + FldSet(FieldScheme, scheme); + } + + if (0 < defaultPort) // override the scheme's default port + DefaultPort = static_cast<ui16>(defaultPort); + + char sport[6]; // enough for ui16 + if (0 != port) { + const size_t len = ToString(port, sport, sizeof(sport)); + FldSet(FieldPort, TStringBuf(sport, len)); + } + + FldTrySet(FieldHost, host); + FldTrySet(FieldPath, path); + FldTrySet(FieldQuery, query); + + Rewrite(); + } + + /********************************************************/ + bool TUri::FldSetImpl( + EField field, TStringBuf value, bool strconst, bool nocopy) { + if (!FldIsValid(field)) + return false; + + switch (field) { + case FieldScheme: + if (!SetScheme(TSchemeInfo::Get(value)).Str.empty()) + return false; + break; + + case FieldPort: + Port = value.empty() ? 
0 : FromString<ui16>(value); + break; + + default: + break; + } + + if (!value.IsInited()) { + FldClr(field); + return false; + } + + if (strconst) { // string constants don't need to be saved in the buffer + FldMarkClean(field); + FldSetNoDirty(field, value); + return false; + } + + if (nocopy) { + FldSet(field, value); + return true; + } + + return FldTryCpy(field, value); + } + + /********************************************************/ + bool TUri::FldTryCpy(EField field, const TStringBuf& value) { + if (!FldIsDirty(field)) { + do { + if (!FldIsSet(field)) + break; + + TStringBuf& fld = Fields[field]; + if (fld.length() < value.length()) + break; + + char* oldV = (char*)fld.data(); + if (!IsInBuffer(oldV)) + break; + + memcpy(oldV, value.data(), value.length()); + oldV[value.length()] = 0; + fld.Trunc(value.length()); + return false; + } while (false); + + FldMarkDirty(field); + } + + FldSetNoDirty(field, value); + return true; + } + + /********************************************************/ + void TUri::RewriteImpl() { + size_t len = 0; + for (int i = 0; i < FieldAllMAX; ++i) { + const EField fld = EField(i); + if (FldIsSet(fld)) + len += 1 + Fields[fld].length(); + } + + if (!len) + Buffer.Clear(); + else { + TBuffer newbuf; + newbuf.Resize(len); + TMemoryWriteBuffer out(newbuf.data(), newbuf.size()); + for (int i = 0; i < FieldAllMAX; ++i) { + const EField fld = EField(i); + if (!FldIsSet(fld)) + continue; + + const char* beg = out.Buf(); + const TStringBuf& val = Fields[fld]; + out << val; + FldSetNoDirty(fld, TStringBuf(beg, val.length())); + out << '\0'; + } + Buffer = std::move(newbuf); + } + + CheckMissingFields(); + + FieldsDirty = 0; + } + + void TUri::CheckMissingFields() { + // if host is set but path is not... + if (FldSetCmp(FlagPath | FlagHost, FlagHost)) + // ... and the scheme requires a path... + if (GetSchemeInfo().FldReq & FlagPath) + // ... 
set path + FldSetNoDirty(FieldPath, TStringBuf("/")); + } + + /********************************************************/ + void TUri::Merge(const TUri& base, int correctAbs) { + if (base.Scheme == SchemeUnknown) + return; + + if (!base.IsValidGlobal()) + return; + + const TStringBuf& selfscheme = GetField(FieldScheme); + // basescheme is present since IsValidGlobal() succeeded + const TStringBuf& basescheme = base.GetField(FieldScheme); + const bool noscheme = !selfscheme.IsInited(); + if (!noscheme && !EqualNoCase(selfscheme, basescheme)) + return; + + const ui32 cleanFields = ~FieldsDirty; + do { + static constexpr TStringBuf rootPath = "/"; + + if (noscheme) { + if (!basescheme.empty()) { + FldSetNoDirty(FieldScheme, basescheme); + // check if it is canonical + if (basescheme.data() != base.GetSchemeInfo().Str.data()) + FldMarkDirty(FieldScheme); + } + Scheme = base.Scheme; + DefaultPort = base.DefaultPort; + } + + if (!IsNull(FlagHost)) + break; // no merge + + FldTrySet(FieldHost, base); + FldChkSet(FieldPort, base); + Port = base.Port; + + if (noscheme && IsNull(FlagQuery) && IsNull(FlagPath)) + FldTrySet(FieldQuery, base); + + if (IsNull(FlagAuth) && !base.IsNull(FlagAuth)) { + FldChkSet(FieldUser, base); + FldChkSet(FieldPass, base); + } + + if (IsValidAbs()) + break; + + TStringBuf p0 = base.GetField(FieldPath); + if (!p0.IsInited()) + p0 = rootPath; + + TStringBuf p1 = GetField(FieldPath); + if (!p1.IsInited()) { + if (p0.data() != rootPath.data()) + FldSet(FieldPath, p0); + else + FldSetNoDirty(FieldPath, rootPath); + break; + } + if (p1 && '/' == p1[0]) + p1.Skip(1); // p0 will have one + + bool pathop = true; + + TTempBufOutput out(p0.length() + p1.length() + 4); + out << p0; + if ('/' != p0.back()) + out << "/../"; + else if (p1.empty() || '.' 
!= p1[0]) + pathop = false; + out << p1; + + char* beg = out.Data(); + char* end = beg + out.Filled(); + if (pathop && !PathOperation(beg, end, correctAbs)) { + Clear(); + break; + } + + // Needs immediate forced rewrite because of TTempBuf + FldSetNoDirty(FieldPath, TStringBuf(beg, end)); + RewriteImpl(); + } while (false); + + CheckMissingFields(); + + // rewrite only if borrowed fields from base + if (cleanFields & FieldsDirty) + RewriteImpl(); + } + + /********************************************************/ + TUri::TLinkType TUri::Normalize(const TUri& base, + const TStringBuf& link, const TStringBuf& codebase, long careFlags, ECharset enc) { + // parse URL + if (ParsedOK != ParseImpl(link, careFlags, 0, SchemeEmpty, enc)) + return LinkIsBad; + + const TStringBuf& host = GetHost(); + + // merge with base URL + // taken either from _BASE_ property or from optional argument + if (!codebase.empty()) { + // if optional code base given -- parse it + TUri codebaseUrl; + if (codebaseUrl.ParseImpl(codebase, careFlags, 0, SchemeEmpty, enc) != ParsedOK || !codebaseUrl.IsValidAbs()) + return LinkIsBad; + Merge(codebaseUrl); + } else { + // Base is already in this variable + // see SetProperty() for details + Merge(base); + } + + // check result: must be correct absolute URL + if (!IsValidAbs()) + return LinkBadAbs; + + if (!host.empty()) { + // - we don't care about different ports for the same server + // - we don't care about win|www|koi|etc. 
preffixes for the same server + if (GetPort() != base.GetPort() || !EqualNoCase(host, base.GetHost())) + return LinkIsGlobal; + } + + // find out if it is link to itself then ignore it + if (!Compare(base, FlagPath | FlagQuery)) + return LinkIsFragment; + + return LinkIsLocal; + } + + /********************************************************/ + + size_t TUri::PrintSize(ui32 flags) const { + size_t len = 10; + flags &= FieldsSet; // can't output what we don't have + if (flags & FlagHostAscii) + flags &= ~FlagHost; // don't want to print both of them + ui32 opt = 1; + for (int fld = 0; opt <= flags && fld < FieldAllMAX; ++fld, opt <<= 1) { + if (opt & flags) { + const TStringBuf& v = Fields[fld]; + if (v.IsInited()) { + if (opt & FlagAuth) + len += 3 * v.length() + 1; + else + len += v.length() + 1; + } + } + } + + return len; + } + + IOutputStream& TUri::PrintImpl(IOutputStream& out, int flags) const { + TStringBuf v; + + const int wantFlags = flags; // save the original + flags &= FieldsSet; // can't print what we don't have + if (flags & FlagHostAscii) + flags |= FlagHost; // to make host checks simpler below + + if (flags & FlagScheme) { + v = Fields[FieldScheme]; + if (!v.empty()) + out << v << ':'; + } + + TStringBuf host; + if (flags & FlagHost) { + const EField fldhost = + flags & FlagHostAscii ? 
FieldHostAscii : FieldHost; + host = Fields[fldhost]; + } + + TStringBuf port; + if ((flags & FlagPort) && 0 != Port && Port != DefaultPort) + port = Fields[FieldPort]; + + if (host) { + if (wantFlags & FlagScheme) + out << "//"; + + if (flags & FlagAuth) { + if (flags & FlagUser) { + v = Fields[FieldUser]; + if (!v.empty()) + TEncoder::EncodeNotAlnum(out, v); + } + + if (flags & FlagPass) { + v = Fields[FieldPass]; + if (v.IsInited()) { + out << ':'; + TEncoder::EncodeAll(out, v); + } + } + + out << '@'; + } + + out << host; + + if (port) + out << ':'; + } + if (port) + out << port; + + if (flags & FlagPath) { + v = Fields[FieldPath]; + // for relative, empty path is not the same as missing + if (v.empty() && 0 == (flags & FlagHost)) + v = TStringBuf("."); + out << v; + } + + if (flags & FlagQuery) { + v = Fields[FieldQuery]; + if (v.IsInited()) + out << '?' << v; + } + + if (flags & FlagFrag) { + v = Fields[FieldFrag]; + if (v.IsInited()) + out << '#' << v; + } + + return out; + } + + /********************************************************/ + int TUri::CompareField(EField fld, const TUri& url) const { + const TStringBuf& v0 = GetField(fld); + const TStringBuf& v1 = url.GetField(fld); + switch (fld) { + case FieldScheme: + case FieldHost: + return CompareNoCase(v0, v1); + default: + return v0.compare(v1); + } + } + + /********************************************************/ + int TUri::Compare(const TUri& url, int flags) const { + // first compare fields with default values + if (flags & FlagPort) { + const int ret = GetPort() - url.GetPort(); + if (ret) + return ret; + flags &= ~FlagPort; + } + + // compare remaining sets of available fields + const int rtflags = flags & url.FieldsSet; + flags &= FieldsSet; + const int fldcmp = flags - rtflags; + if (fldcmp) + return fldcmp; + + // field sets are the same, compare the fields themselves + for (int i = 0; i < FieldAllMAX; ++i) { + const EField fld = EField(i); + if (flags & FldFlag(fld)) { + const int ret = 
CompareField(fld, url); + if (ret) + return ret; + } + } + + return 0; + } + + /********************************************************/ + bool TUri::PathOperation(char*& pathPtr, char*& pathEnd, int correctAbs) { + if (!pathPtr) + return false; + if (pathPtr == pathEnd) + return true; + + if ((pathEnd - pathPtr) >= 2 && *(pathEnd - 2) == '/' && *(pathEnd - 1) == '.') { + --pathEnd; + } + + char* p_wr = pathEnd; + int upCount = 0; + + char* p_prev = pathEnd; + Y_ASSERT(p_prev > pathPtr); + while (p_prev > pathPtr && *(p_prev - 1) == '/') + p_prev--; + + for (char* p_rd = p_prev; p_rd; p_rd = p_prev) { + Y_ASSERT(p_rd == pathEnd || p_rd[0] == '/'); + p_prev = nullptr; + + char* p = p_rd; + + if (p > pathPtr) { + for (p--; *p != '/'; p--) { + if (p == pathPtr) + break; + } + if (*p == '/') { + p_prev = p++; + if ((p_prev - pathPtr >= 6 && !strnicmp(p_prev - 6, "http://", 7)) || + (p_prev - pathPtr >= 7 && !strnicmp(p_prev - 7, "https://", 8))) { + --p_prev; + --p; + } else { + //skip multiple from head '/' + while (p_prev > pathPtr && *(p_prev - 1) == '/') + p_prev--; + } + } + } + + Y_ASSERT(p_prev == nullptr || p_prev[0] == '/'); + //and the first symbol !='/' after p_prev is p + + if (p == p_rd) { + //empty block: + if (p_prev) { //either tail: + Y_ASSERT(p_rd == p_wr && *(p - 1) == '/'); + --p_wr; + continue; + } else { //or head of abs path + *(--p_wr) = '/'; + break; + } + } + + if (p[0] == '.') { + if (p + 1 == p_rd) { + if (correctAbs || p_prev > pathPtr || pathPtr[0] != '/') + // ignore "./" + continue; + } else { + if ((p[1] == '.') && (p + 2 == p_rd)) { + // register "../" but not print + upCount++; + continue; + } + } + } + + if (upCount) { + //unregister "../" and not print + upCount--; + continue; + } + + // print + Y_ASSERT(p < p_rd); + Y_ASSERT(!p_prev || *(p - 1) == '/'); + if (p_wr == p_rd) { //just skip + p_wr = p; + } else { //copy + int l = p_rd - p + 1; + p_wr -= l; + memmove(p_wr, p, l); + } + } + + if (upCount) { + if (*pathPtr != '/') { + if 
(pathEnd == p_wr && *(p_wr - 1) == '.') { + Y_ASSERT(*(p_wr - 2) == '.'); + p_wr -= 2; + upCount--; + } + for (; upCount > 0; upCount--) { + *(--p_wr) = '/'; + *(--p_wr) = '.'; + *(--p_wr) = '.'; + } + } else { + if (correctAbs > 0) + return false; + if (correctAbs == 0) { + //Bad path but present in RFC: + // "Similarly, parsers must avoid treating "." and ".." + // as special when they are not complete components of + // a relative path. " + for (; upCount > 0; upCount--) { + *(--p_wr) = '.'; + *(--p_wr) = '.'; + *(--p_wr) = '/'; + } + } else { + upCount = false; + } + } + } + + Y_ASSERT(p_wr >= pathPtr); + + if (upCount) + return false; + pathPtr = p_wr; + return true; + } + + /********************************************************/ + const char* LinkTypeToString(const TUri::TLinkType& t) { + switch (t) { + case TUri::LinkIsBad: + return "LinkIsBad"; + case TUri::LinkBadAbs: + return "LinkBadAbs"; + case TUri::LinkIsFragment: + return "LinkIsFragment"; + case TUri::LinkIsLocal: + return "LinkIsLocal"; + case TUri::LinkIsGlobal: + return "LinkIsGlobal"; + } + Y_ASSERT(0); + return ""; + } + +} diff --git a/library/cpp/uri/uri.h b/library/cpp/uri/uri.h new file mode 100644 index 00000000000..3b6c19fe4a8 --- /dev/null +++ b/library/cpp/uri/uri.h @@ -0,0 +1,626 @@ +#pragma once + +#include "common.h" +#include "encode.h" + +#include <library/cpp/charset/doccodes.h> +#include <util/generic/buffer.h> +#include <util/generic/ptr.h> +#include <util/generic/singleton.h> +#include <util/generic/string.h> +#include <util/memory/alloc.h> +#include <util/stream/mem.h> +#include <util/stream/output.h> +#include <util/stream/str.h> +#include <util/system/yassert.h> + +#include <cstdlib> + +namespace NUri { + /********************************************************/ + class TUri + : public TFeature, + public TField, + public TScheme, + public TState { + public: + enum TLinkType { + LinkIsBad, + LinkBadAbs, + LinkIsFragment, + LinkIsLocal, + LinkIsGlobal + }; + + private: + 
TBuffer Buffer; + TStringBuf Fields[FieldAllMAX]; + ui32 FieldsSet; + ui16 Port; + ui16 DefaultPort; + TScheme::EKind Scheme; + /// contains fields out of buffer (and possibly not null-terminated) + ui32 FieldsDirty; + + private: + void Alloc(size_t len) { + Dealloc(); // to prevent copy below + Buffer.Resize(len); + } + void Dealloc() { + Buffer.Clear(); + } + + void ClearImpl() { + Port = 0; + FieldsSet = 0; + Scheme = SchemeEmpty; + FieldsDirty = 0; + } + + void CopyData(const TUri& url) { + FieldsSet = url.FieldsSet; + Port = url.Port; + DefaultPort = url.DefaultPort; + Scheme = url.Scheme; + FieldsDirty = url.FieldsDirty; + } + + void CopyImpl(const TUri& url) { + for (int i = 0; i < FieldAllMAX; ++i) + Fields[i] = url.Fields[i]; + + RewriteImpl(); + } + + private: + static ui32 FldFlag(EField fld) { + return 1 << fld; + } + + public: + static bool FldIsValid(EField fld) { + return 0 <= fld && FieldAllMAX > fld; + } + + bool FldSetCmp(ui32 chk, ui32 exp) const { + return (FieldsSet & chk) == exp; + } + + bool FldSetCmp(ui32 chk) const { + return FldSetCmp(chk, chk); + } + + bool FldIsSet(EField fld) const { + return !FldSetCmp(FldFlag(fld), 0); + } + + private: + void FldMarkSet(EField fld) { + FieldsSet |= FldFlag(fld); + } + + void FldMarkUnset(EField fld) { + FieldsSet &= ~FldFlag(fld); + } + + // use when we know the field is dirty or RewriteImpl will be called + void FldSetNoDirty(EField fld, const TStringBuf& value) { + Fields[fld] = value; + FldMarkSet(fld); + } + + void FldSet(EField fld, const TStringBuf& value) { + FldSetNoDirty(fld, value); + FldMarkDirty(fld); + } + + const TStringBuf& FldGet(EField fld) const { + return Fields[fld]; + } + + private: + /// depending on value, clears or sets it + void FldChkSet(EField fld, const TStringBuf& value) { + if (value.IsInited()) + FldSet(fld, value); + else + FldClr(fld); + } + void FldChkSet(EField fld, const TUri& other) { + FldChkSet(fld, other.GetField(fld)); + } + + /// set only if initialized + bool 
FldTrySet(EField fld, const TStringBuf& value) { + const bool ok = value.IsInited(); + if (ok) + FldSet(fld, value); + return ok; + } + bool FldTrySet(EField fld, const TUri& other) { + return FldTrySet(fld, other.GetField(fld)); + } + + private: + /// copies the value if it fits + bool FldTryCpy(EField fld, const TStringBuf& value); + + // main method: sets the field value, possibly copies, etc. + bool FldSetImpl(EField fld, TStringBuf value, bool strconst = false, bool nocopy = false); + + public: // clear a field + void FldClr(EField fld) { + Fields[fld].Clear(); + FldMarkUnset(fld); + FldMarkClean(fld); + } + + bool FldTryClr(EField field) { + const bool ok = FldIsSet(field); + if (ok) + FldClr(field); + return ok; + } + + public: // set a field value: might leave state dirty and require a Rewrite() + // copies if fits and not dirty, sets and marks dirty otherwise + bool FldMemCpy(EField field, const TStringBuf& value) { + return FldSetImpl(field, value, false); + } + + // uses directly, marks dirty + /// @note client MUST guarantee value will be alive until Rewrite is called + bool FldMemSet(EField field, const TStringBuf& value) { + return FldSetImpl(field, value, false, true); + } + + // uses directly, doesn't mark dirty (value scope exceeds "this") + bool FldMemUse(EField field, const TStringBuf& value) { + return FldSetImpl(field, value, true); + } + + // uses directly, doesn't mark dirty + template <size_t size> + bool FldMemSet(EField field, const char (&value)[size]) { + static_assert(size > 0); + return FldSetImpl(field, TStringBuf(value, size - 1), true); + } + + // duplicate one field to another + bool FldDup(EField src, EField dst) { + if (!FldIsSet(src) || !FldIsValid(dst)) + return false; + FldSetNoDirty(dst, FldGet(src)); + if (FldIsDirty(src)) + FldMarkDirty(dst); + else + FldMarkClean(dst); + return true; + } + + // move one field to another + bool FldMov(EField src, EField dst) { + if (!FldDup(src, dst)) + return false; + FldClr(src); + return 
true; + } + + private: + bool IsInBuffer(const char* buf) const { + return buf >= Buffer.data() && buf < Buffer.data() + Buffer.size(); + } + + public: + bool FldIsDirty() const { + return 0 != FieldsDirty; + } + + bool FldIsDirty(EField fld) const { + return 0 != (FieldsDirty & FldFlag(fld)); + } + + private: + void FldMarkDirty(EField fld) { + FieldsDirty |= FldFlag(fld); + } + + void FldMarkClean(EField fld) { + FieldsDirty &= ~FldFlag(fld); + } + + void RewriteImpl(); + + public: + static TState::EParsed CheckHost(const TStringBuf& host); + + // convert a [potential] IDN to ascii + static TMallocPtr<char> IDNToAscii(const wchar32* idna); + static TMallocPtr<char> IDNToAscii(const TStringBuf& host, ECharset enc = CODES_UTF8); + + // convert hosts with percent-encoded or extended chars + + // returns non-empty string if host can be converted to ASCII with given parameters + static TStringBuf HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc = CODES_UTF8); + + // returns host if already ascii, or non-empty if it can be converted + static TStringBuf HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc = CODES_UTF8); + + public: + explicit TUri(unsigned defaultPort = 0) + : FieldsSet(0) + , Port(0) + , DefaultPort(static_cast<ui16>(defaultPort)) + , Scheme(SchemeEmpty) + , FieldsDirty(0) + { + } + + TUri(const TStringBuf& host, ui16 port, const TStringBuf& path, const TStringBuf& query = TStringBuf(), const TStringBuf& scheme = "http", unsigned defaultPort = 0); + + TUri(const TUri& url) + : FieldsSet(url.FieldsSet) + , Port(url.Port) + , DefaultPort(url.DefaultPort) + , Scheme(url.Scheme) + , FieldsDirty(url.FieldsDirty) + { + CopyImpl(url); + } + + ~TUri() { + Clear(); + } + + void Copy(const TUri& url) { + if (&url != this) { + CopyData(url); + CopyImpl(url); + } + } + + void Clear() { + Dealloc(); + ClearImpl(); + } + + ui32 GetFieldMask() const { + return FieldsSet; + } + + ui32 
GetUrlFieldMask() const { + return GetFieldMask() & FlagUrlFields; + } + + ui32 GetDirtyMask() const { + return FieldsDirty; + } + + void CheckMissingFields(); + + // Process methods + + void Rewrite() { + if (FldIsDirty()) + RewriteImpl(); + } + + private: + TState::EParsed AssignImpl(const TParser& parser, TScheme::EKind defscheme = SchemeEmpty); + + TState::EParsed ParseImpl(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, TScheme::EKind defscheme = SchemeEmpty, ECharset enc = CODES_UTF8); + + public: + TState::EParsed Assign(const TParser& parser, TScheme::EKind defscheme = SchemeEmpty) { + const TState::EParsed ret = AssignImpl(parser, defscheme); + if (ParsedOK == ret) + Rewrite(); + return ret; + } + + TState::EParsed ParseUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8) { + const TState::EParsed ret = ParseImpl(url, flags, maxlen, SchemeEmpty, enc); + if (ParsedOK == ret) + Rewrite(); + return ret; + } + + // parses absolute URIs + // prepends default scheme (unless unknown) if URI has none + TState::EParsed ParseAbsUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, TScheme::EKind defscheme = SchemeUnknown, ECharset enc = CODES_UTF8); + + TState::EParsed ParseAbsOrHttpUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8) { + return ParseAbsUri(url, flags, maxlen, SchemeHTTP, enc); + } + + TState::EParsed Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8); + + TState::EParsed Parse(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault) { + return ParseUri(url, flags); + } + + TState::EParsed Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& base_url, ui32 maxlen = 0, ECharset enc = CODES_UTF8); + + TState::EParsed ParseAbs(const TStringBuf& url, 
const TParseFlags& flags = FeaturesDefault, const TStringBuf& base_url = TStringBuf(), ui32 maxlen = 0, ECharset enc = CODES_UTF8) { + const TState::EParsed result = Parse(url, flags, base_url, maxlen, enc); + return ParsedOK != result || IsValidGlobal() ? result : ParsedBadFormat; + } + + // correctAbs works with head "/.." portions: + // 1 - reject URL + // 0 - keep portions + // -1 - ignore portions + + void Merge(const TUri& base, int correctAbs = -1); + + TLinkType Normalize(const TUri& base, const TStringBuf& link, const TStringBuf& codebase = TStringBuf(), long careFlags = FeaturesDefault, ECharset enc = CODES_UTF8); + + private: + int PrintFlags(int flags) const { + if (0 == (FlagUrlFields & flags)) + flags |= FlagUrlFields; + return flags; + } + + protected: + size_t PrintSize(ui32 flags) const; + + // Output method, prints to stream + IOutputStream& PrintImpl(IOutputStream& out, int flags) const; + + char* PrintImpl(char* str, size_t size, int flags) const { + TMemoryOutput out(str, size); + PrintImpl(out, flags) << '\0'; + return str; + } + + static bool IsAbsPath(const TStringBuf& path) { + return 1 <= path.length() && path[0] == '/'; + } + + bool IsAbsPathImpl() const { + return IsAbsPath(GetField(FieldPath)); + } + + public: + // Output method, prints to stream + IOutputStream& Print(IOutputStream& out, int flags = FlagUrlFields) const { + return PrintImpl(out, PrintFlags(flags)); + } + + // Output method, print to str, allocate memory if str is NULL + // Should be deprecated + char* Print(char* str, size_t size, int flags = FlagUrlFields) const { + return nullptr == str ? Serialize(flags) : Serialize(str, size, flags); + } + + char* Serialize(char* str, size_t size, int flags = FlagUrlFields) const { + Y_ASSERT(str); + flags = PrintFlags(flags); + const size_t printSize = PrintSize(flags) + 1; + return printSize > size ? 
nullptr : PrintImpl(str, size, flags); + } + + // Serializes the URI into a freshly malloc()-ed, NUL-terminated buffer + // sized from PrintSize(flags); the caller releases it with free(). + // Returns nullptr if the allocation fails. + char* Serialize(int flags = FlagUrlFields) const { + flags = PrintFlags(flags); + const size_t size = PrintSize(flags) + 1; + char* const buf = static_cast<char*>(malloc(size)); + // never hand a failed allocation to PrintImpl: it writes through the pointer + if (nullptr == buf) + return nullptr; + return PrintImpl(buf, size, flags); + } + + // Output method to str + void Print(TString& str, int flags = FlagUrlFields) const { + flags = PrintFlags(flags); + str.reserve(str.length() + PrintSize(flags)); + TStringOutput out(str); + PrintImpl(out, flags); + } + + TString PrintS(int flags = FlagUrlFields) const { + TString str; + Print(str, flags); + return str; + } + + // Only non-default scheme and port are printed + char* PrintHost(char* str, size_t size) const { + return Print(str, size, (Scheme != SchemeHTTP ? FlagScheme : 0) | FlagHostPort); + } + TString PrintHostS() const { + return PrintS((Scheme != SchemeHTTP ? FlagScheme : 0) | FlagHostPort); + } + + // Info methods + int Compare(const TUri& A, int flags = FlagUrlFields) const; + + int CompareField(EField fld, const TUri& url) const; + + const TStringBuf& GetField(EField fld) const { + return FldIsValid(fld) && FldIsSet(fld) ? FldGet(fld) : Default<TStringBuf>(); + } + + ui16 GetPort() const { + return 0 == Port ? 
DefaultPort : Port; + } + + const TStringBuf& GetHost() const { + if (GetFieldMask() & FlagHostAscii) + return FldGet(FieldHostAscii); + if (GetFieldMask() & FlagHost) + return FldGet(FieldHost); + return Default<TStringBuf>(); + } + + bool UseHostAscii() { + return FldMov(FieldHostAscii, FieldHost); + } + + TScheme::EKind GetScheme() const { + return Scheme; + } + const TSchemeInfo& GetSchemeInfo() const { + return TSchemeInfo::Get(Scheme); + } + + bool IsNull(ui32 flags = FlagScheme | FlagHost | FlagPath) const { + return !FldSetCmp(flags); + } + + bool IsNull(EField fld) const { + return !FldIsSet(fld); + } + + bool IsValidAbs() const { + if (IsNull(FlagScheme | FlagHost | FlagPath)) + return false; + return IsAbsPathImpl(); + } + + bool IsValidGlobal() const { + if (IsNull(FlagScheme | FlagHost)) + return false; + if (IsNull(FlagPath)) + return true; + return IsAbsPathImpl(); + } + + bool IsRootless() const { + return FldSetCmp(FlagScheme | FlagHost | FlagPath, FlagScheme | FlagPath) && !IsAbsPathImpl(); + } + + // for RFC 2396 compatibility + bool IsOpaque() const { + return IsRootless(); + } + + // Inline helpers + TUri& operator=(const TUri& u) { + Copy(u); + return *this; + } + + bool operator!() const { + return IsNull(); + } + + bool Equal(const TUri& A, int flags = FlagUrlFields) const { + return (Compare(A, flags) == 0); + } + + bool Less(const TUri& A, int flags = FlagUrlFields) const { + return (Compare(A, flags) < 0); + } + + bool operator==(const TUri& A) const { + return Equal(A, FlagNoFrag); + } + + bool operator!=(const TUri& A) const { + return !Equal(A, FlagNoFrag); + } + + bool operator<(const TUri& A) const { + return Less(A, FlagNoFrag); + } + + bool IsSameDocument(const TUri& other) const { + // pre: both *this and 'other' should be normalized to valid abs + Y_ASSERT(IsValidAbs()); + return Equal(other, FlagNoFrag); + } + + bool IsLocal(const TUri& other) const { + // pre: both *this and 'other' should be normalized to valid abs + 
Y_ASSERT(IsValidAbs() && other.IsValidAbs()); + return Equal(other, FlagScheme | FlagHostPort); + } + + TLinkType Locality(const TUri& other) const { + if (IsSameDocument(other)) + return LinkIsFragment; + else if (IsLocal(other)) + return LinkIsLocal; + return LinkIsGlobal; + } + + static IOutputStream& ReEncodeField(IOutputStream& out, const TStringBuf& val, EField fld, long flags = FeaturesEncodeDecode) { + return NEncode::TEncoder::ReEncode(out, val, NEncode::TEncodeMapper(flags, fld)); + } + + static IOutputStream& ReEncodeToField(IOutputStream& out, const TStringBuf& val, EField srcfld, long srcflags, EField dstfld, long dstflags) { + return NEncode::TEncoder::ReEncodeTo(out, val, NEncode::TEncodeMapper(srcflags, srcfld), NEncode::TEncodeToMapper(dstflags, dstfld)); + } + + static IOutputStream& ReEncode(IOutputStream& out, const TStringBuf& val, long flags = FeaturesEncodeDecode) { + return ReEncodeField(out, val, FieldAllMAX, flags); + } + + static int PathOperationFlag(const TParseFlags& flags) { + return flags & FeaturePathDenyRootParent ? 1 + : flags & FeaturePathStripRootParent ? 
-1 : 0; + } + + static bool PathOperation(char*& pathBeg, char*& pathEnd, int correctAbs); + + private: + const TSchemeInfo& SetSchemeImpl(const TSchemeInfo& info) { + Scheme = info.Kind; + DefaultPort = info.Port; + if (!info.Str.empty()) + FldSetNoDirty(FieldScheme, info.Str); + return info; + } + const TSchemeInfo& SetSchemeImpl(TScheme::EKind scheme) { + return SetSchemeImpl(TSchemeInfo::Get(scheme)); + } + + public: + const TSchemeInfo& SetScheme(const TSchemeInfo& info) { + SetSchemeImpl(info); + if (!info.Str.empty()) + FldMarkClean(FieldScheme); + return info; + } + const TSchemeInfo& SetScheme(TScheme::EKind scheme) { + return SetScheme(TSchemeInfo::Get(scheme)); + } + }; + + class TUriUpdate { + TUri& Uri_; + + public: + TUriUpdate(TUri& uri) + : Uri_(uri) + { + } + ~TUriUpdate() { + Uri_.Rewrite(); + } + + public: + bool Set(TField::EField field, const TStringBuf& value) { + return Uri_.FldMemSet(field, value); + } + + template <size_t size> + bool Set(TField::EField field, const char (&value)[size]) { + return Uri_.FldMemSet(field, value); + } + + void Clr(TField::EField field) { + Uri_.FldClr(field); + } + }; + + const char* LinkTypeToString(const TUri::TLinkType& t); + +} + +Y_DECLARE_OUT_SPEC(inline, NUri::TUri, out, url) { + url.Print(out); +} + +Y_DECLARE_OUT_SPEC(inline, NUri::TUri::TLinkType, out, t) { + out << NUri::LinkTypeToString(t); +} diff --git a/library/cpp/uri/uri_ut.cpp b/library/cpp/uri/uri_ut.cpp new file mode 100644 index 00000000000..2ebd83fc93a --- /dev/null +++ b/library/cpp/uri/uri_ut.cpp @@ -0,0 +1,1022 @@ +#include "uri_ut.h" +#include "other.h" +#include "qargs.h" +#include <library/cpp/html/entity/htmlentity.h> + +#include <util/system/maxlen.h> + +namespace NUri { + Y_UNIT_TEST_SUITE(URLTest) { + static const char* urls[] = { + "http://a/b/c/d;p?q#r", + "g", "http://a/b/c/g", + "./g", "http://a/b/c/g", + "g/", "http://a/b/c/g/", + "/g", "http://a/g", + "//g", "http://g/", + "?y", "http://a/b/c/d;p?y", + "g?y", 
"http://a/b/c/g?y", + "#s", "http://a/b/c/d;p?q#s", + "g#s", "http://a/b/c/g#s", + "g?y#s", "http://a/b/c/g?y#s", + ";x", "http://a/b/c/;x", + "g;x", "http://a/b/c/g;x", + "g;x?y#s", "http://a/b/c/g;x?y#s", + ".", "http://a/b/c/", + "./", "http://a/b/c/", + "./.", "http://a/b/c/", + "././", "http://a/b/c/", + "././.", "http://a/b/c/", + "..", "http://a/b/", + "../", "http://a/b/", + "../.", "http://a/b/", + "../g", "http://a/b/g", + "../..", "http://a/", + "../../", "http://a/", + "../../.", "http://a/", + "../../g", "http://a/g", + "../../../g", "http://a/g", + "../../../../g", "http://a/g", + "/./g", "http://a/g", + "g.", "http://a/b/c/g.", + ".g", "http://a/b/c/.g", + "g..", "http://a/b/c/g..", + "..g", "http://a/b/c/..g", + "./../g", "http://a/b/g", + "./g/.", "http://a/b/c/g/", + "g/./h", "http://a/b/c/g/h", + "g/../h", "http://a/b/c/h", + "g;x=1/./y", "http://a/b/c/g;x=1/y", + "g;x=1/../y", "http://a/b/c/y", + "g?y/./x", "http://a/b/c/g?y/./x", + "g?y/../x", "http://a/b/c/g?y/../x", + "g#s/./x", "http://a/b/c/g#s/./x", + "g#s/../x", "http://a/b/c/g#s/../x", + "?", "http://a/b/c/d;p?", + "/?", "http://a/?", + "x?", "http://a/b/c/x?", + "x%20y", "http://a/b/c/x%20y", + "%20y", "http://a/b/c/%20y", + // "%2zy", "http://a/b/c/%2zy", + nullptr}; + + Y_UNIT_TEST(test_httpURL) { + TUri rel, base, abs; + TState::EParsed er = base.Parse(urls[0]); + UNIT_ASSERT_VALUES_EQUAL(er, TState::ParsedOK); + UNIT_ASSERT(base.IsValidAbs()); + UNIT_ASSERT_VALUES_EQUAL(base.PrintS(), urls[0]); + + TString errbuf; + TStringOutput out(errbuf); + const long mflag = TFeature::FeaturesAll; + for (int i = 1; urls[i]; i += 2) { + er = rel.Parse(urls[i]); + UNIT_ASSERT_VALUES_EQUAL_C(er, TState::ParsedOK, urls[i]); + rel.Merge(base); + UNIT_ASSERT_VALUES_EQUAL_C(rel.PrintS(), urls[i + 1], urls[i]); + + // try the same thing differently + er = rel.Parse(urls[i], mflag, urls[0]); + UNIT_ASSERT_VALUES_EQUAL_C(er, TState::ParsedOK, urls[i]); + UNIT_ASSERT_VALUES_EQUAL_C(rel.PrintS(), urls[i + 
1], urls[i]); + + // lastly... + er = abs.Parse(urls[i + 1], mflag); + UNIT_ASSERT_VALUES_EQUAL(er, TState::ParsedOK); + errbuf.clear(); + out << '[' << rel.PrintS() + << "] != [" << abs.PrintS() << ']'; + UNIT_ASSERT_EQUAL_C(rel, abs, errbuf); + } + } + + Y_UNIT_TEST(test_Schemes) { + TUri url; + UNIT_ASSERT_VALUES_EQUAL(url.Parse("www.ya.ru/index.html"), TState::ParsedOK); + UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeEmpty); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("http://www.ya.ru"), TState::ParsedOK); + UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeHTTP); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("https://www.ya.ru"), TState::ParsedBadScheme); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("https://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeKnown), TState::ParsedOK); + UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeHTTPS); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("httpwhatever://www.ya.ru"), TState::ParsedBadScheme); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("httpwhatever://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeFlexible), TState::ParsedOK); + UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeUnknown); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("httpswhatever://www.ya.ru"), TState::ParsedBadScheme); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("httpswhatever://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeFlexible), TState::ParsedOK); + UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeUnknown); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("ftp://www.ya.ru"), TState::ParsedBadScheme); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("ftp://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeFlexible), TState::ParsedOK); + UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeFTP); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("httpsssss://www.ya.ru"), TState::ParsedBadScheme); + UNIT_ASSERT_VALUES_EQUAL(url.Parse("httpsssss://www.ya.ru", TFeature::FeaturesDefault | TFeature::FeatureSchemeFlexible), TState::ParsedOK); + 
UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeUnknown); + } + + struct Link4Norm { + const char* const base; + const char* const link; + const char* const result; + TUri::TLinkType ltype; + }; + + static const Link4Norm link4Norm[] = { + {"http://www.alltest.ru/all.php?a=aberporth", "http://www.alltest.ru/all.php?a=domestic jobs", "", TUri::LinkIsBad}, + {"http://www.alltest.ru/all.php?a=aberporth", "http://www.alltest.ru/all.php?a=domestic%20jobs", "http://www.alltest.ru/all.php?a=domestic%20jobs", TUri::LinkIsLocal}, + {"http://president.rf/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8", "http://president.rf/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/1024", "http://president.rf/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/1024", TUri::LinkIsLocal}, + {nullptr, nullptr, nullptr, TUri::LinkIsBad}, + }; + + Y_UNIT_TEST(test_httpURLNormalize) { + TUri normalizedLink; + + for (int i = 0; link4Norm[i].link; i++) { + TUri base; + TState::EParsed er = base.Parse(link4Norm[i].base); + UNIT_ASSERT_VALUES_EQUAL_C(er, TState::ParsedOK, link4Norm[i].base); + TUri::TLinkType ltype = normalizedLink.Normalize(base, link4Norm[i].link); + UNIT_ASSERT_VALUES_EQUAL_C(ltype, link4Norm[i].ltype, link4Norm[i].link); + TString s = TUri::LinkIsBad == ltype ? 
"" : normalizedLink.PrintS(); + UNIT_ASSERT_VALUES_EQUAL_C(s, link4Norm[i].result, link4Norm[i].link); + } + } + + static const char* urlsWithMultipleSlash[] = { + "http://a/http://b", "http://a/http://b", + "http://a/https://b", "http://a/https://b", + "http://a/b://c", "http://a/b:/c", + "http://a/b//c", "http://a/b/c", + nullptr, nullptr}; + + Y_UNIT_TEST(test_httpURLPathOperation) { + char copyUrl[URL_MAXLEN]; + for (int i = 0; urlsWithMultipleSlash[i]; i += 2) { + const TStringBuf url(urlsWithMultipleSlash[i]); + const TStringBuf normurl(urlsWithMultipleSlash[i + 1]); + memcpy(copyUrl, url.data(), url.length()); + char* p = copyUrl; + char* e = copyUrl + url.length(); + TUri::PathOperation(p, e, 1); + UNIT_ASSERT_VALUES_EQUAL(TStringBuf(p, e), normurl); + TUri uri; + UNIT_ASSERT_VALUES_EQUAL(TState::ParsedOK, uri.Parse(url)); + UNIT_ASSERT_VALUES_EQUAL_C(uri.PrintS(), normurl, url); + } + } + + static const char* hostsForCheckHost[] = { + "simplehost.ru", + "third_level.host.ru", + "_ok.somewhere.ru", + "a.b", + "second_level.ru", + "_bad.ru", + "_", + "yandex.ru:443", + nullptr}; + + static TState::EParsed answersForCheckHost[] = { + TState::ParsedOK, + TState::ParsedOK, + TState::ParsedOK, + TState::ParsedOK, + TState::ParsedBadHost, + TState::ParsedBadHost, + TState::ParsedBadHost, + TState::ParsedBadHost, + }; + + Y_UNIT_TEST(test_httpURLCheckHost) { + for (size_t index = 0; hostsForCheckHost[index]; ++index) { + TState::EParsed state = TUri::CheckHost(hostsForCheckHost[index]); + UNIT_ASSERT_VALUES_EQUAL(state, answersForCheckHost[index]); + } + } + + // Setting or clearing the port/scheme fields via FldMemSet must be + // reflected by both GetPort() and the printed URL + Y_UNIT_TEST(test_httpURLSet) { + // set port + { + TUri parsedUrl; + parsedUrl.Parse("http://www.host.com/script.cgi?param1=value1&param2=value2"); + parsedUrl.FldMemSet(TField::FieldPort, "8080"); + UNIT_ASSERT_VALUES_EQUAL(parsedUrl.GetPort(), 8080); + UNIT_ASSERT_VALUES_EQUAL(parsedUrl.PrintS(), "http://www.host.com:8080/script.cgi?param1=value1&param2=value2"); + } + + // clear port + { + TUri parsedUrl; + 
parsedUrl.Parse("http://www.host.com:8080/script.cgi?param1=value1&param2=value2"); + parsedUrl.FldMemSet(TField::FieldPort, nullptr); + UNIT_ASSERT_VALUES_EQUAL(parsedUrl.GetPort(), 80); + UNIT_ASSERT_VALUES_EQUAL(parsedUrl.PrintS(), "http://www.host.com/script.cgi?param1=value1&param2=value2"); + } + + // change scheme with default port + { + TUri parsedUrl; + parsedUrl.Parse("http://www.host.com/script.cgi?param1=value1&param2=value2"); + parsedUrl.FldMemSet(TField::FieldScheme, "https"); + UNIT_ASSERT_VALUES_EQUAL(parsedUrl.GetPort(), 443); + UNIT_ASSERT_VALUES_EQUAL(parsedUrl.PrintS(), "https://www.host.com/script.cgi?param1=value1&param2=value2"); + } + + // change scheme with non-default port + { + TUri parsedUrl; + parsedUrl.Parse("http://www.host.com:8080/script.cgi?param1=value1&param2=value2"); + parsedUrl.FldMemSet(TField::FieldScheme, "https"); + UNIT_ASSERT_VALUES_EQUAL(parsedUrl.GetPort(), 8080); + UNIT_ASSERT_VALUES_EQUAL(parsedUrl.PrintS(), "https://www.host.com:8080/script.cgi?param1=value1&param2=value2"); + } + } + + Y_UNIT_TEST(test_httpURLAuth) { + { + TUri parsedUrl; + TState::EParsed st = parsedUrl.Parse("http://@www.host.com/path", TFeature::FeaturesRobot); + UNIT_ASSERT_VALUES_EQUAL(st, TState::ParsedBadAuth); + } + + { + TUri parsedUrl; + TState::EParsed st = parsedUrl.Parse("http://loginwithnopass@www.host.com/path", TFeature::FeatureAuthSupported); + UNIT_ASSERT_VALUES_EQUAL(st, TState::ParsedOK); + UNIT_ASSERT_EQUAL(parsedUrl.GetField(TField::FieldHost), "www.host.com"); + UNIT_ASSERT_EQUAL(parsedUrl.GetField(TField::FieldUser), "loginwithnopass"); + UNIT_ASSERT_EQUAL(parsedUrl.GetField(TField::FieldPass), ""); + } + + { + TUri parsedUrl; + TState::EParsed st = parsedUrl.Parse("http://login:pass@www.host.com/path", TFeature::FeatureAuthSupported); + UNIT_ASSERT_VALUES_EQUAL(st, TState::ParsedOK); + UNIT_ASSERT_EQUAL(parsedUrl.GetField(TField::FieldHost), "www.host.com"); + UNIT_ASSERT_EQUAL(parsedUrl.GetField(TField::FieldUser), "login"); + 
UNIT_ASSERT_EQUAL(parsedUrl.GetField(TField::FieldPass), "pass"); + } + } + + Y_UNIT_TEST(test01) { + TTest test = { + "user:pass@host:8080", TFeature::FeaturesAll, TState::ParsedRootless, "user", "", "", "", 0, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + + Y_UNIT_TEST(test02) { + TTest test = { + "http://host", TFeature::FeaturesAll, TState::ParsedOK, "http", "", "", "host", 80, "/", "", ""}; + TUri url; + URL_TEST(url, test); + } + + Y_UNIT_TEST(test03) { + TTest test = { + "https://host", TFeature::FeatureSchemeFlexible | TFeature::FeatureAllowHostIDN, TState::ParsedOK, "https", "", "", "host", 443, "/", "", ""}; + TUri url; + URL_TEST(url, test); + } + + Y_UNIT_TEST(test04) { + TTest test = { + "user:pass@host:8080", TFeature::FeaturesAll | TFeature::FeatureNoRelPath | TFeature::FeatureAllowRootless, TState::ParsedOK, "user", "", "", "", 0, "pass@host:8080", "", ""}; + TUri url; + URL_TEST(url, test); + TUri url2(url); + CMP_URL(url2, test); + URL_EQ(url, url2); + } + + Y_UNIT_TEST(test05) { + TTest test = { + "host:8080", TFeature::FeaturesAll | TFeature::FeatureNoRelPath | TFeature::FeatureAllowRootless, TState::ParsedOK, "host", "", "", "", 0, "8080", "", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "host:8080"); + } + + Y_UNIT_TEST(test06) { + TTest test = { + "http://user:pass@host?q", TFeature::FeaturesAll, TState::ParsedOK, "http", "user", "pass", "host", 80, "/", "q", ""}; + TUri url; + URL_TEST(url, test); + url.FldMemSet(TField::FieldScheme, "https"); + UNIT_ASSERT(!url.FldIsDirty()); + UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::FieldScheme), "https"); + UNIT_ASSERT_VALUES_EQUAL(url.GetPort(), 443); + + // test copying + TUri url2(url); + // make sure strings are equal... + UNIT_ASSERT_VALUES_EQUAL( + url.GetField(TField::FieldUser), + url2.GetField(TField::FieldUser)); + // ... 
and memory locations are the same + UNIT_ASSERT_EQUAL( + url.GetField(TField::FieldUser), + url2.GetField(TField::FieldUser)); + // and urls compare the same + URL_EQ(url, url2); + + // cause a dirty field + url.FldMemSet(TField::FieldUser, "use"); // it is now shorter + UNIT_ASSERT(!url.FldIsDirty()); + url.FldMemSet(TField::FieldUser, TStringBuf("user")); + UNIT_ASSERT(url.FldIsDirty()); + + // copy again + url2 = url; + UNIT_ASSERT(url.FldIsDirty()); + UNIT_ASSERT(!url2.FldIsDirty()); + URL_EQ(url, url2); + // make sure strings are equal... + UNIT_ASSERT_VALUES_EQUAL( + url.GetField(TField::FieldUser), + url2.GetField(TField::FieldUser)); + // ... but memory locations are different + UNIT_ASSERT_UNEQUAL( + url.GetField(TField::FieldUser).data(), + url2.GetField(TField::FieldUser).data()); + URL_EQ(url, url2); + + // make query empty + url.FldMemSet(TField::FieldQuery, ""); + url2 = url; + URL_EQ(url, url2); + // set query to null value (should clear it) + url2.FldMemSet(TField::FieldQuery, TStringBuf()); + // make sure they are no longer equal + URL_NEQ(url, url2); + // reset query + url.FldClr(TField::FieldQuery); + // equal again + URL_EQ(url, url2); + // reset port and set the other to default + url.FldClr(TField::FieldPort); + url2.FldMemSet(TField::FieldPort, "443"); + URL_EQ(url, url2); + } + + Y_UNIT_TEST(test07) { + { + TTest test = { + "http://host/path//", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "http", "", "", "host", 80, "/path/", "", ""}; + TUri url; + URL_TEST(url, test); + url.FldMemSet(TField::FieldScheme, "HTTPs"); + UNIT_ASSERT_EQUAL(TScheme::SchemeHTTPS, url.GetScheme()); + UNIT_ASSERT_EQUAL("https", url.GetField(TField::FieldScheme)); + url.FldMemSet(TField::FieldScheme, "HtTP"); + UNIT_ASSERT_EQUAL(TScheme::SchemeHTTP, url.GetScheme()); + UNIT_ASSERT_EQUAL("http", url.GetField(TField::FieldScheme)); + } + + { + const TString scheme = "http"; + const TString host = "host.com"; + const TString urlstr = scheme + 
"://" + host; + TTest test = { + urlstr, TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, scheme, "", "", host, 80, "/", "", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), urlstr + "/"); + } + } + + Y_UNIT_TEST(test08) { + { + TTest test = { + "mailto://user@host.com", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "mailto", "user", "", "host.com", 0, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "host:/path/.path/.", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "host", "", "", "", 0, "/path/.path/", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "host:1/path/.path/.", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "host", 1, "/path/.path/", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "host:1/path/.path/.", TFeature::FeaturesAll | TFeature::FeatureAllowRootless, TState::ParsedOK, "host", "", "", "", 0, "1/path/.path/.", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "/[foo]:bar", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "/[foo]:bar", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + ".", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + ".", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "././.", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "././.", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "./path", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", 
"", 0, "path", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "./path", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "path", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "../path", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "../path", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "../path", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "", "", "", "", 0, "../path", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "/../path", TFeature::FeaturesAll, TState::ParsedOK, "", "", "", "", 0, "/path", "", ""}; + TUri url; + URL_TEST(url, test); + } + } + + Y_UNIT_TEST(test09) { + { + TTest test = { + "mailto:user@host.com", TFeature::FeaturesAll | TFeature::FeatureAllowRootless, TState::ParsedOK, "mailto", "", "", "", 0, "user@host.com", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "scheme:", TFeature::FeaturesAll | TFeature::FeatureNoRelPath | TFeature::FeatureAllowRootless, TState::ParsedOK, "scheme", "", "", "", 0, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + { + TTest test = { + "scheme:", TFeature::FeaturesAll | TFeature::FeatureAllowRootless, TState::ParsedOK, "scheme", "", "", "", 0, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + } + + Y_UNIT_TEST(test10) { + // test some escaping madness, note the ehost vs host + { + TString host = "президент.рф"; + TString ehost = "%D0%BF%D1%80%D0%B5%D0%B7%D0%B8%D0%B4%D0%B5%D0%BD%D1%82.%D1%80%D1%84"; + const TString urlstr = TString::Join("http://", host, "/"); + TTest test = { + urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeaturesDefault | TFeature::FeatureCheckHost, TState::ParsedBadHost, "http", "", "", ehost, 80, "/", "", ""}; + TUri url; + URL_TEST(url, test); + } + + { + TString host = "%D0%BF%D1%80%D0%B5%D0%B7%D0%B8%D0%B4%D0%B5%D0%BD%D1%82.%D1%80%D1%84"; + const TString urlstr = 
TString::Join("http://", host, "/"); + TTest test = { + urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeaturesDefault | TFeature::FeatureCheckHost, TState::ParsedBadHost, "http", "", "", host, 80, "/", "", ""}; + TUri url; + URL_TEST(url, test); + } + + { + TString host = "Фilip.ru"; + TString ehost = "%D0%A4ilip.ru"; + const TString urlstr = TString::Join("http://", host); + TTest test = { + urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeaturesDefault, TState::ParsedBadHost, "http", "", "", ehost, 80, "/", "", ""}; + TUri url; + URL_TEST(url, test); + } + + { + TString host = "%D0%A4ilip.ru"; + const TString urlstr = TString::Join("http://", host); + TTest test = { + urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeaturesDefault, TState::ParsedBadHost, "http", "", "", host, 80, "/", "", ""}; + TUri url; + URL_TEST(url, test); + } + + { + TString host = "Filip%90.rЯ"; + TString ehost = "Filip%90.r%D0%AF"; + const TString urlstr = TString::Join(host, ":8080"); + TTest test = { + urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeatureDecodeAllowed | TFeature::FeaturesDefault | TFeature::FeatureNoRelPath, TState::ParsedBadHost, "", "", "", ehost, 8080, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + + { + TString host = "Filip%90.r%D0%AF"; + const TString urlstr = TString::Join(host, ":8080"); + TTest test = { + urlstr, TFeature::FeatureEncodeExtendedASCII | TFeature::FeatureDecodeAllowed | TFeature::FeaturesDefault | TFeature::FeatureNoRelPath, TState::ParsedBadHost, "", "", "", host, 8080, "", "", ""}; + TUri url; + URL_TEST(url, test); + } + } + + Y_UNIT_TEST(test11) { + { + TTest test = { + "HtTp://HoSt/%50aTh/?Query#Frag", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedOK, "http", "", "", "host", 80, "/PaTh/", "Query", "Frag"}; + TUri url; + URL_TEST(url, test); + } + + { + TTest test = { + "HtTp://HoSt/%50a%54h/?Query#Frag", TParseFlags(TFeature::FeaturesAll | TFeature::FeatureNoRelPath, 
TFeature::FeatureToLower), TState::ParsedOK, "http", "", "", "host", 80, "/path/", "query", "frag"}; + TUri url; + URL_TEST(url, test); + } + } + + Y_UNIT_TEST(test12) { + // test characters which are not always safe + { +#define RAW "/:" +#define DEC "%2F:" +#define ENC "%2F%3A" + TTest test = { + "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC, TFeature::FeaturesAll, TState::ParsedOK, "http", RAW, RAW, "host", 80, "/" DEC, RAW, RAW}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" ENC ":" ENC "@host/" DEC "?" RAW "#" RAW); +#undef RAW +#undef DEC +#undef ENC + } + { +#define RAW "?@" +#define DEC "%3F@" +#define ENC "%3F%40" + TTest test = { + "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC, TFeature::FeaturesAll, TState::ParsedOK, "http", RAW, RAW, "host", 80, "/" DEC, RAW, RAW}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" ENC ":" ENC "@host/" DEC "?" RAW "#" RAW); +#undef RAW +#undef DEC +#undef ENC + } + { +#define RAW "%&;=" +#define DEC "%25&;=" +#define ENC "%25%26%3B%3D" + TTest test = { + "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC, TFeature::FeaturesAll, TState::ParsedOK, "http", RAW, RAW, "host", 80, "/" ENC, ENC, ENC}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC); +#undef RAW +#undef DEC +#undef ENC + } + { +#define RAW "!$'()*," +#define DEC "!$%27()*," +#define ENC "%21%24%27%28%29%2A%2C" + TTest test = { + "http://" ENC ":" ENC "@host/" ENC "?" ENC "#" ENC, TFeature::FeaturesAll, TState::ParsedOK, "http", RAW, RAW, "host", 80, "/" ENC, DEC, DEC}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" ENC ":" ENC "@host/" ENC "?" 
DEC "#" DEC); +#undef RAW +#undef DEC +#undef ENC + } + { +#define DEC "Череповец。рф" +#define ENC "%D0%A7%D0%B5%D1%80%D0%B5%D0%BF%D0%BE%D0%B2%D0%B5%D1%86%E3%80%82%D1%80%D1%84" +// punycode corresponds to lowercase +#define PNC "xn--b1afab7bff7cb.xn--p1ai" + TTest test = { + "http://" ENC "/" ENC "?" ENC "#" ENC, TParseFlags(TFeature::FeaturesAll | TFeature::FeatureAllowHostIDN, TFeature::FeatureDecodeExtendedASCII), TState::ParsedOK, "http", "", "", DEC, 80, "/" ENC, ENC, ENC}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::FieldHostAscii), PNC); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" DEC "/" ENC "?" ENC "#" ENC); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(TField::FlagHostAscii), "http://" PNC "/" ENC "?" ENC "#" ENC); +#undef PNC +#undef DEC +#undef ENC + } + { +#define DEC "Череповец。рф" +#define ENC "%D0%A7%D0%B5%D1%80%D0%B5%D0%BF%D0%BE%D0%B2%D0%B5%D1%86%E3%80%82%D1%80%D1%84" +// punycode corresponds to lowercase +#define PNC "xn--b1afab7bff7cb.xn--p1ai" + TTest test = { + "http://" DEC "/" DEC "?" DEC "#" DEC, TParseFlags(TFeature::FeaturesRobot | TFeature::FeatureEncodeExtendedASCII), TState::ParsedOK, "http", "", "", PNC, 80, "/" ENC, ENC, ENC}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" PNC "/" ENC "?" 
ENC "#" ENC); +#undef PNC +#undef DEC +#undef ENC + } + { +#define DEC "независимая-экспертиза-оценка-ущерба-авто-дтп.рф" +#define PNC "xn--------3veabbbbjgk5abecc3afsad2cg8bvq2alouolqf5brd3a4jzftgqd.xn--p1ai" + TTest test = { + "http://" DEC "/", TParseFlags(TFeature::FeaturesRobot), TState::ParsedOK, "http", "", "", PNC, 80, "/", "", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" PNC "/"); +#undef PNC +#undef DEC + } + } + + Y_UNIT_TEST(testFlexibleAuthority) { + TUri uri; + UNIT_ASSERT_EQUAL(uri.Parse("http://hello_world", TFeature::FeatureCheckHost), TState::ParsedBadHost); + UNIT_ASSERT_EQUAL(uri.Parse("http://hello_world", TFeature::FeatureSchemeFlexible), TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(uri.GetHost(), "hello_world"); + + UNIT_ASSERT_EQUAL(uri.Parse("httpzzzzz://)(*&^$!\\][';<>`~,q?./index.html", TFeature::FeatureSchemeFlexible), TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(uri.GetHost(), ")(*&^$!\\][';<>`~,q"); + UNIT_ASSERT_VALUES_EQUAL(uri.GetField(TField::FieldPath), ""); + UNIT_ASSERT_VALUES_EQUAL(uri.GetField(TField::FieldQuery), "./index.html"); + + UNIT_ASSERT_EQUAL(uri.Parse("htttttttp://)(*&^%45$!\\][';<>`~,.q/index.html", TFeature::FeatureSchemeFlexible), TState::ParsedOK); + UNIT_ASSERT_VALUES_EQUAL(uri.GetHost(), ")(*&^e$!\\][';<>`~,.q"); + UNIT_ASSERT_VALUES_EQUAL(uri.GetField(TField::FieldPath), "/index.html"); + UNIT_ASSERT_VALUES_EQUAL(uri.GetField(TField::FieldQuery), ""); + } + + Y_UNIT_TEST(testSpecialChar) { + // test characters which are not always allowed + { + TTest test = { + "http://host/pa th", TFeature::FeaturesAll | TFeature::FeatureEncodeSpace, TState::ParsedOK, "http", "", "", "host", 80, "/pa%20th", "", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/pa%20th"); + } + { + TTest test = { + "http://host/pa th", TFeature::FeaturesAll, TState::ParsedBadFormat, "http", "", "", "host", 80, "/pa th", "", ""}; + TUri url; + 
URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/pa th"); + } + { + TTest test = { + "http://host/pa%th%41", TFeature::FeaturesAll | TFeature::FeatureEncodePercent, TState::ParsedOK, "http", "", "", "host", 80, "/pa%25thA", "", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/pa%25thA"); + } + { + TTest test = { + "http://host/invalid_second_char%az%1G", TFeature::FeaturesAll | TFeature::FeatureEncodePercent, TState::ParsedOK, "http", "", "", "host", 80, "/invalid_second_char%25az%251G", "", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/invalid_second_char%25az%251G"); + } + { + TTest test = { + "http://host/border%2", TFeature::FeaturesAll | TFeature::FeatureEncodePercent, TState::ParsedOK, "http", "", "", "host", 80, "/border%252", "", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/border%252"); + } + { + TTest test = { + "http://host/pa%th%41", TFeature::FeaturesAll, TState::ParsedBadFormat, "http", "", "", "host", 80, "/pa%thA", "", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host/pa%thA"); + } + } + + Y_UNIT_TEST(testIPv6) { + { +#define RAW "[1080:0:0:0:8:800:200C:417A]" +#define DEC "[1080:0:0:0:8:800:200c:417a]" + TTest test = { + "http://" RAW "/" RAW "?" RAW "#" RAW, TParseFlags(TFeature::FeaturesAll), TState::ParsedOK, "http", "", "", DEC, 80, "/" RAW, RAW, RAW}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://" DEC "/" RAW "?" 
RAW "#" RAW); +#undef DEC +#undef RAW + } + } + + Y_UNIT_TEST(testEscapedFragment) { + { + TTest test = { + "http://host.com#!a=b&c=d#e+g%41%25", TParseFlags(TFeature::FeaturesAll | TFeature::FeatureHashBangToEscapedFragment), TState::ParsedOK, "http", "", "", "host.com", 80, "/", "_escaped_fragment_=a=b%26c=d%23e%2BgA%2525", ""}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host.com/?_escaped_fragment_=a=b%26c=d%23e%2BgA%2525"); + } + { + TTest test = { + "http://host.com?_escaped_fragment_=a=b%26c=d%23e%2bg%2525", TParseFlags(TFeature::FeaturesAll | TFeature::FeatureEscapedToHashBangFragment), TState::ParsedOK, "http", "", "", "host.com", 80, "/", "", "!a=b&c=d#e+g%25"}; + TUri url; + URL_TEST(url, test); + UNIT_ASSERT_VALUES_EQUAL(url.PrintS(), "http://host.com/#!a=b&c=d#e+g%25"); + } + } + + Y_UNIT_TEST(testReEncode) { + { + TStringStream out; + TUri::ReEncode(out, "foo bar"); + UNIT_ASSERT_VALUES_EQUAL(out.Str(), "foo%20bar"); + } + } + + static const TStringBuf NonRfcUrls[] = { + "http://deshevle.ru/price/price=&SrchTp=1&clID=24&BL=SrchTp=0|clID=24&frmID=75&SortBy=P&PreSort=&NmDir=0&VndDir=0&PrDir=0&SPP=44", + "http://secure.rollerwarehouse.com/skates/aggressive/skates/c/11[03]/tx/$$$+11[03][a-z]", + "http://secure.rollerwarehouse.com/skates/aggressive/skates/tx/$$$+110[a-z]", + "http://translate.google.com/translate_t?langpair=en|ru", + "http://www.garnier.com.ru/_ru/_ru/our_products/products_trade.aspx?tpcode=OUR_PRODUCTS^PRD_BODYCARE^EXTRA_SKIN^EXTRA_SKIN_BENEFITS", + "http://www.km.ru/magazin/view_print.asp?id={1846295A-223B-41DC-9F51-90D5D6236C49}", + "http://www.manutd.com/default.sps?pagegid={78F24B85-702C-4DC8-A5D4-2F67252C28AA}&itype=12977&pagebuildpageid=2716&bg=1", + "http://www.pokupay.ru/price/price=&SrchTp=1&clID=24&BL=SrchTp=0|clID=24&frmID=75&SPP=35&SortBy=N&PreSort=V&NmDir=0&VndDir=1&PrDir=0", + "http://www.rodnoyspb.ru/rest/plager/page[0].html", + 
"http://www.trinity.by/?section_id=46,47,48&cat=1&filters[]=2^_^Sony", + "http://translate.yandex.net/api/v1/tr.json/translate?lang=en-ru&text=>", + nullptr}; + + Y_UNIT_TEST(test_NonRfcUrls) { + TUri url; + const long flags = TFeature::FeaturesRobot; + for (size_t i = 0;; ++i) { + const TStringBuf& buf = NonRfcUrls[i]; + if (!buf.IsInited()) + break; + UNIT_ASSERT_VALUES_EQUAL(TState::ParsedOK, url.Parse(buf, flags)); + } + } + + static const TStringBuf CheckParseException[] = { + "http://www.'>'.com/?.net/", + nullptr}; + + Y_UNIT_TEST(test_CheckParseException) { + TUri url; + const long flags = TFeature::FeaturesRobot | TFeature::FeaturesEncode; + for (size_t i = 0;; ++i) { + const TStringBuf& buf = CheckParseException[i]; + if (!buf.IsInited()) + break; + TString what; + try { + // we care only about exceptions, not whether it parses correctly + url.Parse(buf, flags); + continue; + } catch (const std::exception& exc) { + what = exc.what(); + } catch (...) { + what = "exception thrown"; + } + ythrow yexception() << "failed to parse URL [" << buf << "]: " << what; + } + } + + Y_UNIT_TEST(test_PrintPort) { + TUri uri; + { + uri.Parse("http://srv.net:9100/print", TFeature::FeaturesRecommended); + TString s = uri.PrintS(TUri::FlagPort); + Cdbg << uri.PrintS() << ',' << uri.PrintS(TUri::FlagPort) << Endl; + UNIT_ASSERT_VALUES_EQUAL(9100, FromString<ui32>(s)); + } + { + uri.Parse("http://srv.net:80/print", TFeature::FeaturesRecommended); + TString s = uri.PrintS(TUri::FlagPort); + Cdbg << uri.PrintS() << ',' << uri.PrintS(TUri::FlagPort) << Endl; + UNIT_ASSERT(s.Empty()); + } + } + + Y_UNIT_TEST(test_ParseFailures) { + { + TTest test = { + "http://host:port", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""}; + TUri url(-1); + URL_TEST(url, test); + } + { + TTest test = { + "http://javascript:alert(hi)", TFeature::FeaturesRobot, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""}; + TUri 
url(-1); + URL_TEST(url, test); + } + { + TTest test = { + "http://host::0", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""}; + TUri url(-1); + URL_TEST(url, test); + } + { + TTest test = { + "http://host ", TFeature::FeaturesAll, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""}; + TUri url(-1); + URL_TEST(url, test); + } + { + TTest test = { + "http:00..03", TFeature::FeaturesAll | TFeature::FeatureNoRelPath, TState::ParsedBadFormat, "", "", "", "", Max<ui16>(), "", "", ""}; + TUri url(-1); + URL_TEST(url, test); + } + { + TTest test = { + "host:00..03", TFeature::FeaturesAll, TState::ParsedRootless, "host", "", "", "", 0, "", "", ""}; + TUri url(-1); + URL_TEST(url, test); + } + { + TTest test = { + "http://roduct;isbn,0307371549;at,aid4c00179ab018www.mcnamarasband.wordpress.com/", TFeature::FeaturesAll, TState::ParsedBadHost, "http", "", "", "roduct;isbn,0307371549;at,aid4c00179ab018www.mcnamarasband.wordpress.com", 80, "/", "", ""}; + TUri url(-1); + URL_TEST(url, test); + } + { + TTest test = { + "invalid url", TFeature::FeaturesDefault, TState::ParsedBadFormat, "", "", "", "", 0, "invalid url", "", ""}; + TUri url(-1); + URL_TEST(url, test); + } + } + Y_UNIT_TEST(test_scheme_related_url) { + TUri url; + UNIT_ASSERT_VALUES_EQUAL(url.Parse("//www.hostname.ru/path", TFeature::FeaturesRobot), TState::ParsedOK); + UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeEmpty); + UNIT_ASSERT_VALUES_EQUAL(url.GetHost(), "www.hostname.ru"); + UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::FieldPath), "/path"); + + TUri baseUrl; + UNIT_ASSERT_VALUES_EQUAL(baseUrl.Parse("https://trololo.com", TFeature::FeaturesRobot), TState::ParsedOK); + UNIT_ASSERT_EQUAL(baseUrl.GetScheme(), TScheme::SchemeHTTPS); + url.Merge(baseUrl); + UNIT_ASSERT_EQUAL(url.GetScheme(), TScheme::SchemeHTTPS); + UNIT_ASSERT_VALUES_EQUAL(url.GetHost(), "www.hostname.ru"); + 
UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::FieldPath), "/path"); + } + } + + Y_UNIT_TEST_SUITE(TInvertDomainTest) { + Y_UNIT_TEST(TestInvert) { + TString a; + UNIT_ASSERT_EQUAL(InvertDomain(a), ""); + TString aa(".:/foo"); + UNIT_ASSERT_EQUAL(InvertDomain(aa), ".:/foo"); + TString aaa("/foo.bar:"); + UNIT_ASSERT_EQUAL(InvertDomain(aaa), "/foo.bar:"); + TString b("ru"); + UNIT_ASSERT_EQUAL(InvertDomain(b), "ru"); + TString c(".ru"); + UNIT_ASSERT_EQUAL(InvertDomain(c), "ru."); + TString d("ru."); + UNIT_ASSERT_EQUAL(InvertDomain(d), ".ru"); + TString e("www.yandex.ru:80/yandsearch?text=foo"); + UNIT_ASSERT_EQUAL(InvertDomain(e), "ru.yandex.www:80/yandsearch?text=foo"); + TString f("www.yandex.ru:80/yandsearch?text=foo"); + InvertDomain(f.begin(), f.begin() + 10); + UNIT_ASSERT_EQUAL(f, "yandex.www.ru:80/yandsearch?text=foo"); + TString g("https://www.yandex.ru:80//"); + UNIT_ASSERT_EQUAL(InvertDomain(g), "https://ru.yandex.www:80//"); + TString h("www.yandex.ru:8080/redir.pl?url=https://google.com/"); + UNIT_ASSERT_EQUAL(InvertDomain(h), "ru.yandex.www:8080/redir.pl?url=https://google.com/"); + } + } + + TQueryArg::EProcessed ProcessQargs(TString url, TString& processed, TQueryArgFilter filter = nullptr, void* filterData = nullptr) { /* parses url, runs query-arg processing (sort by name, rewrite; filtering only when a filter is given), stores the printed URI in processed and returns the processing status */ + TUri uri; + uri.Parse(url, NUri::TFeature::FeaturesRecommended); + + TQueryArgProcessing processing(TQueryArg::FeatureSortByName | (filter ? 
TQueryArg::FeatureFilter : 0) | TQueryArg::FeatureRewriteDirty, filter, filterData); + auto result = processing.Process(uri); + processed = uri.PrintS(); + return result; + } + + TString SortQargs(TString url) { + TString r; + ProcessQargs(url, r); + return r; + } + + bool QueryArgsFilter(const TQueryArg& arg, void* filterData) { + const char* skipName = static_cast<const char*>(filterData); + return arg.Name != skipName; + } + + TString FilterQargs(TString url, const char* name) { + TString r; + ProcessQargs(url, r, &QueryArgsFilter, const_cast<char*>(name)); + return r; + } + + Y_UNIT_TEST_SUITE(QargsTest) { + Y_UNIT_TEST(TestSorting) { + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/"), "http://ya.ru/"); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?"), "http://ya.ru/?"); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?some=value"), "http://ya.ru/?some=value"); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?b=1&a=2"), "http://ya.ru/?a=2&b=1"); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?b=1&a=2&a=3"), "http://ya.ru/?a=3&a=2&b=1"); + + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?aaa=3&b=b&a=1&aa=2"), "http://ya.ru/?a=1&aa=2&aaa=3&b=b"); + + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?a=1&b=1&c=1"), "http://ya.ru/?a=1&b=1&c=1"); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?b=1&a=1&c=1"), "http://ya.ru/?a=1&b=1&c=1"); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?c=1&a=1&b=1"), "http://ya.ru/?a=1&b=1&c=1"); + + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?c=1&a=1&a=1&b=1&c=1&b=1"), "http://ya.ru/?a=1&a=1&b=1&b=1&c=1&c=1"); + + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?b==&a=&&c="), "http://ya.ru/?a=&b==&c="); + } + + Y_UNIT_TEST(TestParsingCorners) { + TString s; + + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?=", s), TQueryArg::ProcessedOK); + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some", s), TQueryArg::ProcessedOK); + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some=", s), 
TQueryArg::ProcessedOK); + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/", s), TQueryArg::ProcessedOK); + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?&", s), TQueryArg::ProcessedOK); + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?&&", s), TQueryArg::ProcessedOK); + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some=", s), TQueryArg::ProcessedOK); + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some==", s), TQueryArg::ProcessedOK); + UNIT_ASSERT_EQUAL(ProcessQargs("http://ya.ru/?some=&&", s), TQueryArg::ProcessedOK); + + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?="), "http://ya.ru/?="); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?some=="), "http://ya.ru/?some=="); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?&&"), "http://ya.ru/?&&"); + UNIT_ASSERT_STRINGS_EQUAL(SortQargs("http://ya.ru/?a"), "http://ya.ru/?a"); + } + + Y_UNIT_TEST(TestFiltering) { + UNIT_ASSERT_STRINGS_EQUAL(FilterQargs("http://ya.ru/?some=value", "missing"), "http://ya.ru/?some=value"); + UNIT_ASSERT_STRINGS_EQUAL(FilterQargs("http://ya.ru/?b=1&a=2", "b"), "http://ya.ru/?a=2"); + UNIT_ASSERT_STRINGS_EQUAL(FilterQargs("http://ya.ru/?b=1&a=2&a=3", "a"), "http://ya.ru/?b=1"); + UNIT_ASSERT_STRINGS_EQUAL(FilterQargs("http://ya.ru/?some=&another=", "another"), "http://ya.ru/?some="); + } + + Y_UNIT_TEST(TestRemoveEmptyFeature) { + TUri uri; + uri.Parse("http://ya.ru/?", NUri::TFeature::FeaturesRecommended); + + TQueryArgProcessing processing(TQueryArg::FeatureRemoveEmptyQuery | TQueryArg::FeatureRewriteDirty); + auto result = processing.Process(uri); + UNIT_ASSERT_EQUAL(result, TQueryArg::ProcessedOK); + UNIT_ASSERT_STRINGS_EQUAL(uri.PrintS(), "http://ya.ru/"); + } + + Y_UNIT_TEST(TestNoRemoveEmptyFeature) { + TUri uri; + uri.Parse("http://ya.ru/?", NUri::TFeature::FeaturesRecommended); + + TQueryArgProcessing processing(0); + auto result = processing.Process(uri); + UNIT_ASSERT_EQUAL(result, TQueryArg::ProcessedOK); + UNIT_ASSERT_STRINGS_EQUAL(uri.PrintS(), 
"http://ya.ru/?"); + } + } +} diff --git a/library/cpp/uri/uri_ut.h b/library/cpp/uri/uri_ut.h new file mode 100644 index 00000000000..f8ac6e40927 --- /dev/null +++ b/library/cpp/uri/uri_ut.h @@ -0,0 +1,81 @@ +#pragma once + +#include "uri.h" + +#include <library/cpp/testing/unittest/registar.h> + +namespace NUri { + struct TTest { + TStringBuf Val; + TParseFlags Flags; + TState::EParsed State; + TStringBuf Scheme; + TStringBuf User; + TStringBuf Pass; + TStringBuf Host; + ui16 Port; + TStringBuf Path; + TStringBuf Query; + TStringBuf Frag; + }; + +} + +#define URL_MSG(url1, url2, cmp) \ + (TString("[") + url1.PrintS() + ("] " cmp " [") + url2.PrintS() + "]") +#define URL_EQ(url1, url2) /* asserts url1 == url2; the failure message prints both URLs */ \ + UNIT_ASSERT_EQUAL_C(url1, url2, URL_MSG(url1, url2, "!=")) +#define URL_NEQ(url1, url2) /* asserts url1 != url2; the failure message prints both URLs */ \ + UNIT_ASSERT_UNEQUAL_C(url1, url2, URL_MSG(url1, url2, "==")) + +#define CMP_FLD(url, test, fld) \ + UNIT_ASSERT_VALUES_EQUAL(url.GetField(TField::Field##fld), test.fld) + +#define CMP_URL(url, test) \ + do { \ + CMP_FLD(url, test, Scheme); \ + CMP_FLD(url, test, User); \ + CMP_FLD(url, test, Pass); \ + CMP_FLD(url, test, Host); \ + UNIT_ASSERT_VALUES_EQUAL(url.GetPort(), test.Port); \ + CMP_FLD(url, test, Path); \ + CMP_FLD(url, test, Query); \ + CMP_FLD(url, test, Frag); \ + } while (false) + +#define URL_TEST_ENC(url, test, enc) \ + do { \ + TState::EParsed st = url.ParseUri(test.Val, test.Flags, 0, enc); \ + UNIT_ASSERT_VALUES_EQUAL(st, test.State); \ + CMP_URL(url, test); \ + if (TState::ParsedOK != st) \ + break; \ + TUri _url; \ + TString urlstr, urlstr2; \ + urlstr = url.PrintS(); \ + TState::EParsed st2 = _url.ParseUri(urlstr, \ + (test.Flags & ~TFeature::FeatureNoRelPath) | TFeature::FeatureAllowRootless, 0, enc); \ + if (TState::ParsedEmpty != st2) \ + UNIT_ASSERT_VALUES_EQUAL(st2, test.State); \ + urlstr2 = _url.PrintS(); \ + UNIT_ASSERT_VALUES_EQUAL(urlstr, urlstr2); \ + CMP_URL(_url, test); \ + UNIT_ASSERT_VALUES_EQUAL(url.GetUrlFieldMask(), _url.GetUrlFieldMask()); \ + 
URL_EQ(url, _url); \ + const TStringBuf hostascii = url.GetField(TField::FieldHostAscii); \ + if (hostascii.Empty()) \ + break; \ + urlstr = url.PrintS(TField::FlagHostAscii); \ + st2 = _url.ParseUri(urlstr, \ + (test.Flags & ~TFeature::FeatureNoRelPath) | TFeature::FeatureAllowRootless, 0, enc); \ + UNIT_ASSERT_VALUES_EQUAL(st2, test.State); \ + urlstr2 = _url.PrintS(); \ + UNIT_ASSERT_VALUES_EQUAL(urlstr, urlstr2); \ + TTest test2 = test; \ + test2.Host = hostascii; \ + CMP_URL(_url, test2); \ + UNIT_ASSERT_VALUES_EQUAL(url.GetUrlFieldMask(), _url.GetUrlFieldMask()); \ + } while (false) + +#define URL_TEST(url, test) \ + URL_TEST_ENC(url, test, CODES_UTF8) diff --git a/library/cpp/uri/ut/ya.make b/library/cpp/uri/ut/ya.make new file mode 100644 index 00000000000..b2b2c1291a9 --- /dev/null +++ b/library/cpp/uri/ut/ya.make @@ -0,0 +1,19 @@ +UNITTEST_FOR(library/cpp/uri) + +OWNER(leo) + +NO_OPTIMIZE() + +NO_WSHADOW() + +PEERDIR( + library/cpp/html/entity +) + +SRCS( + location_ut.cpp + uri-ru_ut.cpp + uri_ut.cpp +) + +END() diff --git a/library/cpp/uri/ya.make b/library/cpp/uri/ya.make new file mode 100644 index 00000000000..8fc808a6af7 --- /dev/null +++ b/library/cpp/uri/ya.make @@ -0,0 +1,32 @@ +LIBRARY() + +OWNER( + mvel + g:base +) + +SRCS( + assign.cpp + common.cpp + encode.cpp + http_url.h + location.cpp + other.cpp + parse.cpp + qargs.cpp + uri.cpp + encodefsm.rl6 + parsefsm.rl6 +) + +PEERDIR( + contrib/libs/libidn + library/cpp/charset +) + +END() + +RECURSE( + benchmark + ut +) |