aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/uri/assign.cpp
diff options
context:
space:
mode:
author: Devtools Arcadia <arcadia-devtools@yandex-team.ru> 2022-02-07 18:08:42 +0300
committer: Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> 2022-02-07 18:08:42 +0300
commit: 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree: e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/uri/assign.cpp
download: ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/uri/assign.cpp')
-rw-r--r--library/cpp/uri/assign.cpp426
1 files changed, 426 insertions, 0 deletions
diff --git a/library/cpp/uri/assign.cpp b/library/cpp/uri/assign.cpp
new file mode 100644
index 0000000000..ea1955a9e9
--- /dev/null
+++ b/library/cpp/uri/assign.cpp
@@ -0,0 +1,426 @@
+#include "uri.h"
+#include "parse.h"
+
+#include <contrib/libs/libidn/idna.h>
+
+#include <library/cpp/charset/recyr.hh>
+#include <util/charset/wide.h>
+#include <util/memory/tempbuf.h>
+#include <util/string/cast.h>
+#include <util/system/maxlen.h>
+#include <util/system/yassert.h>
+#include <util/system/sys_alloc.h>
+
+namespace NUri {
+ // Converts a wchar32 host name to ASCII through libidn's idna_to_ascii_4z.
+ // Returns the string produced by libidn, or a null pointer when the call
+ // reports anything other than IDNA_SUCCESS.
+ TMallocPtr<char> TUri::IDNToAscii(const wchar32* idna) {
+     // XXX: punycode_encode is deliberately not called directly, since it
+     // includes neither proper stringprep nor splitting on dot-equivalent
+     // characters
+     static_assert(sizeof(*idna) == sizeof(ui32), "fixme");
+     char* ascii;
+     const int rc = idna_to_ascii_4z(reinterpret_cast<const uint32_t*>(idna), &ascii, 0);
+     // hand back the libidn output only on success
+     char* const result = (IDNA_SUCCESS == rc) ? ascii : nullptr;
+     return result;
+ }
+
+ // Recodes host from charset enc into a zero-terminated wchar32 string kept
+ // in temporary storage, then delegates to the wchar32 overload.
+ TMallocPtr<char> TUri::IDNToAscii(const TStringBuf& host, ECharset enc) {
+     // one wchar32 slot per input byte plus one for the terminator
+     const size_t capacity = 1 + host.length();
+     TTempBuf storage(sizeof(wchar32) * capacity);
+     wchar32* const wide = reinterpret_cast<wchar32*>(storage.Data());
+
+     const size_t len = NDetail::NBaseOps::Recode(host, wide, enc).length();
+     wide[len] = 0;
+
+     return IDNToAscii(wide);
+ }
+
+ // Produces an ASCII form of host, using buf for any memory that backs the result.
+ //
+ // host        - host text; taken by value because it is re-pointed at
+ //               intermediate results while processing
+ // buf         - reset to a freshly allocated buffer here and, when IDN
+ //               conversion succeeds, replaced by the converted string
+ // hasExtended - caller's statement that host contains extended-ASCII chars
+ // allowIDN    - whether extended chars may be converted through IDNToAscii
+ // enc         - charset of host; a recode to UTF-8 happens only when it is not
+ //               CODES_UTF8 and hasExtended is set
+ //
+ // Returns an empty TStringBuf when conversion is impossible (extended chars
+ // without allowIDN, recode failure, IDN conversion failure or exception);
+ // otherwise the decoded/lowercased host or its IDN-converted form.
+ TStringBuf TUri::HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc) {
+ TStringBuf outhost; // store the result here before returning it, to get RVO
+
+ // stays 0 unless the recode step below allocates buf
+ size_t buflen = 0;
+
+ if (hasExtended && !allowIDN)
+ return outhost; // definitely can't convert
+
+ // charset-recode: RFC 3986, 3.2.2, requires percent-encoded non-ASCII
+ // chars in reg-name to be UTF-8 so convert to UTF-8 prior to decoding
+ const bool recoding = CODES_UTF8 != enc && hasExtended;
+ if (recoding) {
+ size_t nrd, nwr;
+ // reserve four output bytes per input byte for the recoded text
+ buflen = host.length() * 4;
+ buf.Reset(static_cast<char*>(y_allocate(buflen)));
+ if (RECODE_OK != Recode(enc, CODES_UTF8, host.data(), buf.Get(), host.length(), buflen, nrd, nwr))
+ return outhost;
+ // from here on host refers to the recoded bytes inside buf
+ host = TStringBuf(buf.Get(), nwr);
+ }
+
+ // percent-decode
+ if (0 == buflen) {
+ // no recode happened, so buf has not been allocated yet
+ buflen = host.length();
+ buf.Reset(static_cast<char*>(y_allocate(buflen)));
+ }
+ // decoding shortens so writing over host in buf is OK
+ TMemoryWriteBuffer out(buf.Get(), buflen);
+ TEncoder decoder(out, FeatureDecodeANY | FeatureToLower);
+ const long outFlags = decoder.ReEncode(host);
+ // re-derive hasExtended from what the encoder reported for the decoded text
+ hasExtended = 0 != (outFlags & FeatureEncodeExtendedASCII);
+
+ // check again
+ if (hasExtended && !allowIDN)
+ return outhost;
+
+ host = out.Str();
+
+ // convert to punycode if needed
+ if (!hasExtended) {
+ outhost = host;
+ return outhost;
+ }
+
+ TMallocPtr<char> puny;
+ // a yexception from the conversion is treated like a null result
+ try {
+ puny = IDNToAscii(host);
+ } catch (const yexception& /* exc */) {
+ }
+
+ if (!puny) {
+ // XXX: try user charset unless UTF8 or converted to it
+ if (CODES_UTF8 == enc || recoding)
+ return outhost;
+ try {
+ puny = IDNToAscii(host, enc);
+ } catch (const yexception& /* exc */) {
+ return outhost;
+ }
+ if (!puny)
+ return outhost;
+ }
+
+ // buf now holds the converted string; the result is built from its raw pointer
+ buf = puny;
+ outhost = buf.Get();
+
+ return outhost;
+ }
+
+ // Scans host for encoder feature flags and picks the cheapest path:
+ //  - extended-ASCII present but IDN not allowed: empty result;
+ //  - nothing extended and nothing decodable: host is returned as is;
+ //  - otherwise: the five-argument overload does the conversion into buf.
+ TStringBuf TUri::HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc) {
+     // accumulate the feature flags of every character
+     long seen = 0;
+     for (const char ch : host)
+         seen |= TEncoder::GetFlags(ch).FeatFlags;
+
+     const bool extended = seen & FeatureEncodeExtendedASCII;
+
+     // extended characters cannot be represented without IDN
+     if (extended && !allowIDN)
+         return TStringBuf();
+
+     // plain host: no conversion required
+     const bool decodable = 0 != (seen & FeatureDecodeANY);
+     if (!extended && !decodable)
+         return host;
+
+     return HostToAscii(host, buf, extended, allowIDN, enc);
+ }
+
+ // Writes val to out, re-encoding it for field fld when flags contains any of
+ // TFeature::FeaturesAllEncoder; otherwise copies it verbatim.
+ // Returns false (writing nothing) for an empty val, true otherwise.
+ static inline bool AppendField(TMemoryWriteBuffer& out, TField::EField fld, const TStringBuf& val, long flags) {
+     const bool hasData = !val.empty();
+     if (hasData) {
+         const bool verbatim = 0 == (flags & TFeature::FeaturesAllEncoder);
+         if (verbatim)
+             out << val;
+         else
+             TUri::ReEncodeField(out, val, fld, flags);
+     }
+     return hasData;
+ }
+
+ // Populates this TUri from the results held by parser.
+ //
+ // Order of work: clear current state; return early on parser errors; try to
+ // obtain an ASCII form of the host; compute the buffer size for all fields;
+ // optionally translate between "#!" fragments and "_escaped_fragment_="
+ // queries; write each field into Buffer; apply defscheme if no scheme was
+ // parsed; parse the port; optionally run CheckHost on the host.
+ //
+ // parser    - supplies State, Scheme, Flags, Enc and the per-field sections
+ // defscheme - used only when the parsed scheme is SchemeEmpty: SchemeUnknown
+ //             yields ParsedBadScheme, SchemeEmpty leaves the scheme unset,
+ //             anything else is installed via SetSchemeImpl
+ //
+ // Returns ParsedOK on success, otherwise the error status recorded along the way.
+ TState::EParsed TUri::AssignImpl(const TParser& parser, TScheme::EKind defscheme) {
+ Clear();
+
+ TState::EParsed ret = parser.State;
+ if (ParsedBadFormat <= ret)
+ return ret;
+
+ const TSection& scheme = parser.Get(FieldScheme);
+ const TSchemeInfo& schemeInfo = SetSchemeImpl(parser.Scheme);
+
+ // set the scheme always if available
+ if (schemeInfo.Str.empty() && scheme.IsSet())
+ FldSet(FieldScheme, scheme.Get());
+
+ if (ParsedOK != ret)
+ return ret;
+
+ // running total of bytes needed in Buffer for all fields written below
+ size_t buflen = 0;
+
+ // special processing for fields
+
+ const bool convertIDN = parser.Flags & FeatureConvertHostIDN;
+ long flags = parser.Flags.Allow;
+ // converting the host implies both allowing IDN and checking the host
+ if (convertIDN)
+ flags |= FeatureAllowHostIDN | FeatureCheckHost;
+
+ // process non-ASCII host for punycode
+
+ TMallocPtr<char> hostptr;
+ TStringBuf hostascii; // empty: use host field; non-empty: ascii
+ bool hostConverted = false; // hostascii is empty or the original
+ const TSection& host = parser.Get(FieldHost);
+ if (host.IsSet() && !FldIsSet(FieldHost)) {
+ const bool allowIDN = (flags & FeatureAllowHostIDN);
+ const TStringBuf hostbuf = host.Get();
+
+ // if we know we have and allow extended-ASCII chars, no need to check further
+ if (allowIDN && (host.GetFlagsAllPlaintext() & FeatureEncodeExtendedASCII))
+ hostascii = HostToAscii(hostbuf, hostptr, true, true, parser.Enc);
+ else
+ hostascii = HostToAscii(hostbuf, hostptr, allowIDN, parser.Enc);
+
+ if (hostascii.empty())
+ ret = ParsedBadHost; // exists but cannot be converted
+ // a different data pointer means HostToAscii returned something other than its input
+ else if (hostbuf.data() != hostascii.data()) {
+ hostConverted = true;
+ buflen += 1 + hostascii.length();
+ if (convertIDN)
+ FldMarkSet(FieldHost); // so that we don't process host below
+ }
+ }
+
+ // add unprocessed fields
+
+ for (int idx = 0; idx < FieldUrlMAX; ++idx) {
+ const EField fld = EField(idx);
+ const TSection& section = parser.Get(fld);
+ if (section.IsSet() && !FldIsSet(fld))
+ buflen += 1 + section.EncodedLen(); // includes null
+ }
+ if (0 == buflen) // no more sections set?
+ return ret;
+
+ // process #! fragments
+ // https://developers.google.com/webmasters/ajax-crawling/docs/specification
+
+ static const TStringBuf escFragPrefix(TStringBuf("_escaped_fragment_="));
+
+ bool encHashBangFrag = false;
+ TStringBuf qryBeforeEscapedFragment;
+ TStringBuf qryEscapedFragment;
+ // do { ... } while (false) is used so that `break` can leave the block early
+ do {
+ if (FldIsSet(FieldFrag) || FldIsSet(FieldQuery))
+ break;
+
+ const TSection& frag = parser.Get(FieldFrag);
+ if (frag.IsSet()) {
+ if (0 == (parser.Flags & FeatureHashBangToEscapedFragment))
+ break;
+ const TStringBuf fragbuf = frag.Get();
+ if (fragbuf.empty() || '!' != fragbuf[0])
+ break;
+ encHashBangFrag = true;
+ // '!' will make space for '&' or '\0' if needed
+ buflen += escFragPrefix.length();
+ buflen += 2 * fragbuf.length(); // we don't know how many will be encoded
+ } else {
+ const TSection& qry = parser.Get(FieldQuery);
+ if (!qry.IsSet())
+ break;
+ // FeatureHashBangToEscapedFragment has preference
+ if (FeatureEscapedToHashBangFragment != (parser.Flags & FeaturesEscapedFragment))
+ break;
+ // split at the last '&': left part stays the query, right part is the candidate
+ qry.Get().RSplit('&', qryBeforeEscapedFragment, qryEscapedFragment);
+ if (!qryEscapedFragment.StartsWith(escFragPrefix)) {
+ qryEscapedFragment.Clear();
+ break;
+ }
+ qryEscapedFragment.Skip(escFragPrefix.length());
+ buflen += 2; // for '!' and '\0' in fragment
+ buflen -= escFragPrefix.length();
+ }
+ } while (false);
+
+ // now set all fields prior to validating
+
+ Alloc(buflen);
+
+ TMemoryWriteBuffer out(Buffer.data(), Buffer.size());
+ for (int idx = 0; idx < FieldUrlMAX; ++idx) {
+ const EField fld = EField(idx);
+
+ const TSection& section = parser.Get(fld);
+ if (!section.IsSet() || FldIsSet(fld))
+ continue;
+
+ // the query is emitted together with the fragment in the FieldFrag case below
+ if (FieldQuery == fld && encHashBangFrag)
+ continue;
+
+ // the fragment is produced from the query in the FieldQuery case below
+ if (FieldFrag == fld && qryEscapedFragment.IsInited())
+ continue;
+
+ char* beg = out.Buf();
+ TStringBuf val = section.Get();
+ long careFlags = section.GetFlagsEncode();
+
+ switch (fld) {
+ default:
+ break;
+
+ case FieldQuery:
+ if (qryEscapedFragment.IsInited()) {
+ const EField dstfld = FieldFrag; // that's where we will store
+ out << '!';
+ if (!qryEscapedFragment.empty())
+ ReEncodeToField(out, qryEscapedFragment, fld, FeatureDecodeANY | careFlags, dstfld, FeatureDecodeANY | parser.GetFieldFlags(dstfld));
+ FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
+ if (qryBeforeEscapedFragment.empty())
+ continue;
+ // terminate the fragment, then fall out of the switch to write the remaining query
+ out << '\0';
+ beg = out.Buf();
+ val = qryBeforeEscapedFragment;
+ }
+ break;
+
+ case FieldFrag:
+ if (encHashBangFrag) {
+ const EField dstfld = FieldQuery; // that's where we will store
+ const TSection& qry = parser.Get(dstfld);
+ if (qry.IsSet())
+ if (AppendField(out, dstfld, qry.Get(), qry.GetFlagsEncode()))
+ out << '&';
+ out << escFragPrefix;
+ val.Skip(1); // skip '!'
+ ReEncodeToField(out, val, fld, careFlags, dstfld, parser.GetFieldFlags(dstfld));
+ FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
+ continue;
+ }
+ break;
+ }
+
+ AppendField(out, fld, val, careFlags);
+ char* end = out.Buf();
+
+ if (careFlags & FeaturePathOperation) {
+ if (!PathOperation(beg, end, PathOperationFlag(parser.Flags)))
+ return ParsedBadPath;
+
+ Y_ASSERT(beg >= out.Beg());
+ // move the write position to `end` as left by PathOperation
+ out.SetPos(end);
+ }
+
+ FldSetNoDirty(fld, TStringBuf(beg, end));
+
+ // special character case
+ const long checkChars = section.GetFlagsAllPlaintext() & FeaturesCheckSpecialChar;
+ if (0 != checkChars) { // has unencoded special chars: check permission
+ const long allowChars = parser.GetFieldFlags(fld) & checkChars;
+ if (checkChars != allowChars)
+ ret = ParsedBadFormat;
+ }
+
+ out << '\0';
+ }
+
+ if (hostConverted) {
+ char* beg = out.Buf();
+ out << hostascii;
+ char* end = out.Buf();
+ // with convertIDN the ASCII form replaces the host; otherwise it is stored separately
+ const EField fld = convertIDN ? FieldHost : FieldHostAscii;
+ FldSetNoDirty(fld, TStringBuf(beg, end));
+ out << '\0';
+ }
+
+ Buffer.Resize(out.Len());
+
+ if (GetScheme() == SchemeEmpty && SchemeEmpty != defscheme) {
+ if (SchemeUnknown == defscheme)
+ ret = ParsedBadScheme;
+ else
+ SetSchemeImpl(defscheme);
+ }
+
+ if (0 == (parser.Flags & FeatureAllowEmptyPath))
+ CheckMissingFields();
+
+ const TStringBuf& port = GetField(FieldPort);
+ if (!port.empty()) {
+ // the port must parse as a ui16
+ if (!TryFromString<ui16>(port, Port))
+ ret = ParsedBadPort;
+ }
+
+ if (ParsedOK != ret)
+ return ret;
+
+ // run validity checks now that all fields are set
+
+ // check the host for DNS compliance
+ do {
+ if (0 == (flags & FeatureCheckHost))
+ break;
+ if (hostascii.empty())
+ hostascii = GetField(FieldHost);
+ if (hostascii.empty())
+ break;
+ // IP literal
+ if ('[' == hostascii[0] && ']' == hostascii.back())
+ break;
+ ret = CheckHost(hostascii);
+ if (ParsedOK != ret)
+ return ret;
+ } while (false);
+
+ return ret;
+ }
+
+ // Resets this object, rejects empty or over-long input, then parses url and
+ // hands the parser to AssignImpl.
+ // A maxlen of 0 disables the length check.
+ TState::EParsed TUri::ParseImpl(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
+     Clear();
+
+     if (url.empty())
+         return ParsedEmpty;
+
+     const bool lengthLimited = 0 != maxlen;
+     if (lengthLimited && maxlen < url.length())
+         return ParsedTooLong;
+
+     const TParser parsed(flags, url, enc);
+     return AssignImpl(parsed, defscheme);
+ }
+
+ // Parses url; when url_base is non-empty and the result is not a valid
+ // absolute URI, parses url_base as well and merges it in.
+ // With a base present, url itself is parsed with FeatureNoRelPath excluded;
+ // the base is parsed with the caller's flags unchanged.
+ // Returns the first non-OK status from either parse, otherwise ParsedOK.
+ TState::EParsed TUri::Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& url_base, ui32 maxlen, ECharset enc) {
+     const bool haveBase = !url_base.empty();
+     const TParseFlags relFlags = flags.Exclude(FeatureNoRelPath);
+
+     const TState::EParsed own = ParseImpl(url, haveBase ? relFlags : flags, maxlen, SchemeEmpty, enc);
+     if (ParsedOK != own)
+         return own;
+
+     if (haveBase && !IsValidAbs()) {
+         TUri base;
+         const TState::EParsed baseStatus = base.ParseImpl(url_base, flags, maxlen, SchemeEmpty, enc);
+         if (ParsedOK != baseStatus)
+             return baseStatus;
+         Merge(base, PathOperationFlag(flags));
+     }
+
+     Rewrite();
+     // both parses succeeded if we got here
+     return ParsedOK;
+ }
+
+ // Parses url and, if the result is not a valid absolute URI, merges it with
+ // the already-parsed base. Rewrite() runs only after a successful parse.
+ TState::EParsed TUri::Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags, ui32 maxlen, ECharset enc) {
+     const TState::EParsed status = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);
+     if (ParsedOK == status) {
+         const bool needsBase = !IsValidAbs();
+         if (needsBase)
+             Merge(base, PathOperationFlag(flags));
+         Rewrite();
+     }
+     return status;
+ }
+
+ // Parses url with FeatureNoRelPath added to flags and requires a host.
+ // Returns the parse status on failure, ParsedBadHost when IsNull(FlagHost)
+ // holds after a successful parse, and ParsedOK otherwise (after Rewrite()).
+ TState::EParsed TUri::ParseAbsUri(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
+     const TState::EParsed status = ParseImpl(url, flags | FeatureNoRelPath, maxlen, defscheme, enc);
+     if (ParsedOK == status) {
+         if (IsNull(FlagHost))
+             return ParsedBadHost;
+         Rewrite();
+     }
+     return status;
+ }
+
+}