diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/uri/assign.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/uri/assign.cpp')
-rw-r--r-- | library/cpp/uri/assign.cpp | 426 |
1 files changed, 426 insertions, 0 deletions
diff --git a/library/cpp/uri/assign.cpp b/library/cpp/uri/assign.cpp
new file mode 100644
index 0000000000..ea1955a9e9
--- /dev/null
+++ b/library/cpp/uri/assign.cpp
@@ -0,0 +1,426 @@
+#include "uri.h"
+#include "parse.h"
+
+#include <contrib/libs/libidn/idna.h>
+
+#include <library/cpp/charset/recyr.hh>
+#include <util/charset/wide.h>
+#include <util/memory/tempbuf.h>
+#include <util/string/cast.h>
+#include <util/system/maxlen.h>
+#include <util/system/yassert.h>
+#include <util/system/sys_alloc.h>
+
+namespace NUri {
+    // Converts a zero-terminated wchar32 host name to its ASCII form using
+    // libidn's idna_to_ascii_4z (called with flags == 0).
+    // Returns the buffer produced by libidn wrapped in a TMallocPtr, or a
+    // null pointer if the conversion did not report IDNA_SUCCESS.
+    TMallocPtr<char> TUri::IDNToAscii(const wchar32* idna) {
+        // XXX: don't use punycode_encode directly as it doesn't include
+        // proper stringprep and splitting on dot-equivalent characters
+        char* buf;
+        // the cast below is only valid if wchar32 and ui32 have the same size
+        static_assert(sizeof(*idna) == sizeof(ui32), "fixme");
+        if (IDNA_SUCCESS != idna_to_ascii_4z((const uint32_t*)idna, &buf, 0))
+            buf = nullptr;
+        return buf;
+    }
+
+    // Recodes `host` from charset `enc` into a temporary wchar32 buffer,
+    // zero-terminates it and delegates to the wchar32 overload above.
+    // The temporary buffer has room for host.length() characters plus the
+    // terminator.
+    TMallocPtr<char> TUri::IDNToAscii(const TStringBuf& host, ECharset enc) {
+        TTempBuf buf(sizeof(wchar32) * (1 + host.length()));
+        wchar32* wbuf = reinterpret_cast<wchar32*>(buf.Data());
+
+        const size_t written = NDetail::NBaseOps::Recode(host, wbuf, enc).length();
+        wbuf[written] = 0;
+
+        return IDNToAscii(wbuf);
+    }
+
+    // Converts `host` to an ASCII host name.
+    //
+    // host        - the host as found in the URL
+    // buf         - receives ownership of whatever memory the result needs
+    // hasExtended - caller's knowledge that host contains extended-ASCII chars
+    // allowIDN    - whether extended chars may be converted (punycode)
+    // enc         - charset of the input host
+    //
+    // Steps: optional recode to UTF-8, percent-decode + lowercase, then
+    // (only if extended chars remain) IDN conversion.
+    // Returns an empty TStringBuf when the host cannot be converted.
+    TStringBuf TUri::HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc) {
+        TStringBuf outhost; // store the result here before returning it, to get RVO
+
+        size_t buflen = 0;
+
+        if (hasExtended && !allowIDN)
+            return outhost; // definitely can't convert
+
+        // charset-recode: RFC 3986, 3.2.2, requires percent-encoded non-ASCII
+        // chars in reg-name to be UTF-8 so convert to UTF-8 prior to decoding
+        const bool recoding = CODES_UTF8 != enc && hasExtended;
+        if (recoding) {
+            size_t nrd, nwr;
+            // output buffer is sized at four bytes per input byte
+            buflen = host.length() * 4;
+            buf.Reset(static_cast<char*>(y_allocate(buflen)));
+            if (RECODE_OK != Recode(enc, CODES_UTF8, host.data(), buf.Get(), host.length(), buflen, nrd, nwr))
+                return outhost;
+            // from here on `host` refers to the recoded bytes inside buf
+            host = TStringBuf(buf.Get(), nwr);
+        }
+
+        // percent-decode
+        if (0 == buflen) {
+            // no recoding happened, so buf has not been allocated yet
+            buflen = host.length();
+            buf.Reset(static_cast<char*>(y_allocate(buflen)));
+        }
+        // decoding shortens so writing over host in buf is OK
+        TMemoryWriteBuffer out(buf.Get(), buflen);
+        TEncoder decoder(out, FeatureDecodeANY | FeatureToLower);
+        const long outFlags = decoder.ReEncode(host);
+        // re-evaluate: decoding may have produced extended-ASCII chars
+        hasExtended = 0 != (outFlags & FeatureEncodeExtendedASCII);
+
+        // check again
+        if (hasExtended && !allowIDN)
+            return outhost;
+
+        // `out` was constructed over buf, so host presumably now views the
+        // decoded bytes stored in buf
+        host = out.Str();
+
+        // convert to punycode if needed
+        if (!hasExtended) {
+            outhost = host;
+            return outhost;
+        }
+
+        TMallocPtr<char> puny;
+        try {
+            puny = IDNToAscii(host);
+        } catch (const yexception& /* exc */) {
+            // deliberately ignored: a null puny triggers the fallback below
+        }
+
+        if (!puny) {
+            // XXX: try user charset unless UTF8 or converted to it
+            if (CODES_UTF8 == enc || recoding)
+                return outhost;
+            try {
+                puny = IDNToAscii(host, enc);
+            } catch (const yexception& /* exc */) {
+                return outhost;
+            }
+            if (!puny)
+                return outhost;
+        }
+
+        // hand the converted buffer to the caller and return a view of it
+        buf = puny;
+        outhost = buf.Get();
+
+        return outhost;
+    }
+
+    // Convenience overload: inspects the characters of `host` to find out
+    // whether any conversion is needed at all.
+    // - no extended chars and nothing to decode: returns `host` itself and
+    //   leaves `buf` untouched;
+    // - extended chars present but IDN not allowed: returns an empty buffer;
+    // - otherwise delegates to the five-argument overload.
+    TStringBuf TUri::HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc) {
+        // find what we have
+        long haveFlags = 0;
+        for (size_t i = 0; i != host.length(); ++i)
+            haveFlags |= TEncoder::GetFlags(host[i]).FeatFlags;
+
+        // interested in encoded characters or (if IDN is allowed) extended ascii
+        TStringBuf outhost;
+        const bool haveExtended = haveFlags & FeatureEncodeExtendedASCII;
+
+        if (!haveExtended || allowIDN) {
+            if (!haveExtended && 0 == (haveFlags & FeatureDecodeANY))
+                outhost = host;
+            else
+                outhost = HostToAscii(host, buf, haveExtended, allowIDN, enc);
+        }
+
+        return outhost;
+    }
+
+    // Writes `val` for field `fld` to `out`: re-encoded through
+    // TUri::ReEncodeField when any bit of TFeature::FeaturesAllEncoder is set
+    // in `flags`, verbatim otherwise.
+    // Returns false (writing nothing) if `val` is empty, true otherwise.
+    static inline bool AppendField(TMemoryWriteBuffer& out, TField::EField fld, const TStringBuf& val, long flags) {
+        if (val.empty())
+            return false;
+        if (flags & TFeature::FeaturesAllEncoder)
+            TUri::ReEncodeField(out, val, fld, flags);
+        else
+            out << val;
+        return true;
+    }
+
+    // Fills this URI from the result held by `parser`.
+    //
+    // Phases, in order: reset the object; record the scheme; convert the host
+    // to ASCII if required; compute the buffer size needed for all fields;
+    // copy (and possibly re-encode) every field into Buffer, translating
+    // between "#!" fragments and "_escaped_fragment_=" query parameters when
+    // the corresponding features are enabled; apply `defscheme` if no scheme
+    // was found; parse the port; finally run the host check if requested.
+    //
+    // Returns parser.State or the first error detected along the way.
+    TState::EParsed TUri::AssignImpl(const TParser& parser, TScheme::EKind defscheme) {
+        Clear();
+
+        TState::EParsed ret = parser.State;
+        // states at or above ParsedBadFormat abort before anything is stored
+        if (ParsedBadFormat <= ret)
+            return ret;
+
+        const TSection& scheme = parser.Get(FieldScheme);
+        const TSchemeInfo& schemeInfo = SetSchemeImpl(parser.Scheme);
+
+        // set the scheme always if available
+        if (schemeInfo.Str.empty() && scheme.IsSet())
+            FldSet(FieldScheme, scheme.Get());
+
+        if (ParsedOK != ret)
+            return ret;
+
+        // number of bytes to allocate for all fields, terminators included
+        size_t buflen = 0;
+
+        // special processing for fields
+
+        const bool convertIDN = parser.Flags & FeatureConvertHostIDN;
+        long flags = parser.Flags.Allow;
+        // converting IDN hosts implies allowing them and checking the result
+        if (convertIDN)
+            flags |= FeatureAllowHostIDN | FeatureCheckHost;
+
+        // process non-ASCII host for punycode
+
+        TMallocPtr<char> hostptr;
+        TStringBuf hostascii; // empty: use host field; non-empty: ascii
+        bool hostConverted = false; // hostascii is empty or the original
+        const TSection& host = parser.Get(FieldHost);
+        if (host.IsSet() && !FldIsSet(FieldHost)) {
+            const bool allowIDN = (flags & FeatureAllowHostIDN);
+            const TStringBuf hostbuf = host.Get();
+
+            // if we know we have and allow extended-ASCII chars, no need to check further
+            if (allowIDN && (host.GetFlagsAllPlaintext() & FeatureEncodeExtendedASCII))
+                hostascii = HostToAscii(hostbuf, hostptr, true, true, parser.Enc);
+            else
+                hostascii = HostToAscii(hostbuf, hostptr, allowIDN, parser.Enc);
+
+            if (hostascii.empty())
+                ret = ParsedBadHost; // exists but cannot be converted
+            else if (hostbuf.data() != hostascii.data()) {
+                // a different data pointer means HostToAscii produced new text
+                hostConverted = true;
+                buflen += 1 + hostascii.length();
+                if (convertIDN)
+                    FldMarkSet(FieldHost); // so that we don't process host below
+            }
+        }
+
+        // add unprocessed fields
+
+        for (int idx = 0; idx < FieldUrlMAX; ++idx) {
+            const EField fld = EField(idx);
+            const TSection& section = parser.Get(fld);
+            if (section.IsSet() && !FldIsSet(fld))
+                buflen += 1 + section.EncodedLen(); // includes null
+        }
+        if (0 == buflen) // no more sections set?
+            return ret;
+
+        // process #! fragments
+        // https://developers.google.com/webmasters/ajax-crawling/docs/specification
+
+        static const TStringBuf escFragPrefix(TStringBuf("_escaped_fragment_="));
+
+        // encHashBangFrag: the "#!" fragment will be stored as a query parameter
+        // qryEscapedFragment: set when the query's last parameter will be
+        // stored as a "!" fragment; qryBeforeEscapedFragment is the rest
+        bool encHashBangFrag = false;
+        TStringBuf qryBeforeEscapedFragment;
+        TStringBuf qryEscapedFragment;
+        // do { ... } while (false) is used so that `break` can leave early
+        do {
+            if (FldIsSet(FieldFrag) || FldIsSet(FieldQuery))
+                break;
+
+            const TSection& frag = parser.Get(FieldFrag);
+            if (frag.IsSet()) {
+                if (0 == (parser.Flags & FeatureHashBangToEscapedFragment))
+                    break;
+                const TStringBuf fragbuf = frag.Get();
+                if (fragbuf.empty() || '!' != fragbuf[0])
+                    break;
+                encHashBangFrag = true;
+                // '!' will make space for '&' or '\0' if needed
+                buflen += escFragPrefix.length();
+                buflen += 2 * fragbuf.length(); // we don't know how many will be encoded
+            } else {
+                const TSection& qry = parser.Get(FieldQuery);
+                if (!qry.IsSet())
+                    break;
+                // FeatureHashBangToEscapedFragment has preference
+                if (FeatureEscapedToHashBangFragment != (parser.Flags & FeaturesEscapedFragment))
+                    break;
+                // only the part after the last '&' is examined
+                qry.Get().RSplit('&', qryBeforeEscapedFragment, qryEscapedFragment);
+                if (!qryEscapedFragment.StartsWith(escFragPrefix)) {
+                    qryEscapedFragment.Clear();
+                    break;
+                }
+                qryEscapedFragment.Skip(escFragPrefix.length());
+                buflen += 2; // for '!' and '\0' in fragment
+                buflen -= escFragPrefix.length();
+            }
+        } while (false);
+
+        // now set all fields prior to validating
+
+        Alloc(buflen);
+
+        TMemoryWriteBuffer out(Buffer.data(), Buffer.size());
+        for (int idx = 0; idx < FieldUrlMAX; ++idx) {
+            const EField fld = EField(idx);
+
+            const TSection& section = parser.Get(fld);
+            if (!section.IsSet() || FldIsSet(fld))
+                continue;
+
+            // the query is written together with the fragment (FieldFrag case)
+            if (FieldQuery == fld && encHashBangFrag)
+                continue;
+
+            // the fragment is produced from the query (FieldQuery case)
+            if (FieldFrag == fld && qryEscapedFragment.IsInited())
+                continue;
+
+            // beg marks where this field's text starts in the output buffer
+            char* beg = out.Buf();
+            TStringBuf val = section.Get();
+            long careFlags = section.GetFlagsEncode();
+
+            switch (fld) {
+                default:
+                    break;
+
+                case FieldQuery:
+                    if (qryEscapedFragment.IsInited()) {
+                        const EField dstfld = FieldFrag; // that's where we will store
+                        out << '!';
+                        if (!qryEscapedFragment.empty())
+                            ReEncodeToField(out, qryEscapedFragment, fld, FeatureDecodeANY | careFlags, dstfld, FeatureDecodeANY | parser.GetFieldFlags(dstfld));
+                        FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
+                        // nothing left for the query itself
+                        if (qryBeforeEscapedFragment.empty())
+                            continue;
+                        // terminate the fragment; the remaining query follows
+                        out << '\0';
+                        beg = out.Buf();
+                        val = qryBeforeEscapedFragment;
+                    }
+                    break;
+
+                case FieldFrag:
+                    if (encHashBangFrag) {
+                        const EField dstfld = FieldQuery; // that's where we will store
+                        const TSection& qry = parser.Get(dstfld);
+                        // existing query first, separated from the new parameter by '&'
+                        if (qry.IsSet())
+                            if (AppendField(out, dstfld, qry.Get(), qry.GetFlagsEncode()))
+                                out << '&';
+                        out << escFragPrefix;
+                        val.Skip(1); // skip '!'
+                        ReEncodeToField(out, val, fld, careFlags, dstfld, parser.GetFieldFlags(dstfld));
+                        FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
+                        continue;
+                    }
+                    break;
+            }
+
+            AppendField(out, fld, val, careFlags);
+            char* end = out.Buf();
+
+            if (careFlags & FeaturePathOperation) {
+                // PathOperation receives beg/end and may adjust them; the
+                // write position is moved to the resulting end afterwards
+                if (!PathOperation(beg, end, PathOperationFlag(parser.Flags)))
+                    return ParsedBadPath;
+
+                Y_ASSERT(beg >= out.Beg());
+                out.SetPos(end);
+            }
+
+            FldSetNoDirty(fld, TStringBuf(beg, end));
+
+            // special character case
+            const long checkChars = section.GetFlagsAllPlaintext() & FeaturesCheckSpecialChar;
+            if (0 != checkChars) { // has unencoded special chars: check permission
+                const long allowChars = parser.GetFieldFlags(fld) & checkChars;
+                if (checkChars != allowChars)
+                    ret = ParsedBadFormat;
+            }
+
+            out << '\0';
+        }
+
+        if (hostConverted) {
+            // store the converted host: it replaces the host field when
+            // convertIDN is set, otherwise it goes to FieldHostAscii
+            char* beg = out.Buf();
+            out << hostascii;
+            char* end = out.Buf();
+            const EField fld = convertIDN ? FieldHost : FieldHostAscii;
+            FldSetNoDirty(fld, TStringBuf(beg, end));
+            out << '\0';
+        }
+
+        // shrink the buffer to what was actually written
+        Buffer.Resize(out.Len());
+
+        if (GetScheme() == SchemeEmpty && SchemeEmpty != defscheme) {
+            if (SchemeUnknown == defscheme)
+                ret = ParsedBadScheme;
+            else
+                SetSchemeImpl(defscheme);
+        }
+
+        if (0 == (parser.Flags & FeatureAllowEmptyPath))
+            CheckMissingFields();
+
+        const TStringBuf& port = GetField(FieldPort);
+        if (!port.empty()) {
+            // the port must parse as ui16
+            if (!TryFromString<ui16>(port, Port))
+                ret = ParsedBadPort;
+        }
+
+        if (ParsedOK != ret)
+            return ret;
+
+        // run validity checks now that all fields are set
+
+        // check the host for DNS compliance
+        do {
+            if (0 == (flags & FeatureCheckHost))
+                break;
+            if (hostascii.empty())
+                hostascii = GetField(FieldHost);
+            if (hostascii.empty())
+                break;
+            // IP literal
+            if ('[' == hostascii[0] && ']' == hostascii.back())
+                break;
+            ret = CheckHost(hostascii);
+            if (ParsedOK != ret)
+                return ret;
+        } while (false);
+
+        return ret;
+    }
+
+    // Parses `url` and assigns the result to this object.
+    // Returns ParsedEmpty for an empty url and ParsedTooLong when `maxlen`
+    // is non-zero and the url is longer than that; otherwise the result of
+    // AssignImpl.
+    TState::EParsed TUri::ParseImpl(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
+        Clear();
+
+        if (url.empty())
+            return ParsedEmpty;
+
+        if (maxlen > 0 && url.length() > maxlen)
+            return ParsedTooLong;
+
+        const TParser parser(flags, url, enc);
+
+        return AssignImpl(parser, defscheme);
+    }
+
+    // Parses `url`, resolving it against the textual base `url_base`.
+    // When a base is given, FeatureNoRelPath is excluded from the flags used
+    // for `url`; the base itself is parsed with the original flags.
+    // The base is parsed and merged only if the parsed url is not already a
+    // valid absolute URI.
+    TState::EParsed TUri::Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& url_base, ui32 maxlen, ECharset enc) {
+        const TParseFlags flags1 = flags.Exclude(FeatureNoRelPath);
+        TState::EParsed ret = ParseImpl(url, url_base.empty() ? flags : flags1, maxlen, SchemeEmpty, enc);
+        if (ParsedOK != ret)
+            return ret;
+
+        if (!url_base.empty() && !IsValidAbs()) {
+            TUri base;
+            ret = base.ParseImpl(url_base, flags, maxlen, SchemeEmpty, enc);
+            if (ParsedOK != ret)
+                return ret;
+            Merge(base, PathOperationFlag(flags));
+        }
+
+        Rewrite();
+        return ret;
+    }
+
+    // Parses `url`, resolving it against an already parsed `base`.
+    // The base is merged in only if the parsed url is not a valid absolute URI.
+    TState::EParsed TUri::Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags, ui32 maxlen, ECharset enc) {
+        const TState::EParsed ret = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);
+        if (ParsedOK != ret)
+            return ret;
+
+        if (!IsValidAbs())
+            Merge(base, PathOperationFlag(flags));
+
+        Rewrite();
+        return ret;
+    }
+
+    // Parses `url` with FeatureNoRelPath added to `flags`.
+    // Returns ParsedBadHost if the host is null after a successful parse.
+    TState::EParsed TUri::ParseAbsUri(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
+        const TState::EParsed ret = ParseImpl(
+            url, flags | FeatureNoRelPath, maxlen, defscheme, enc);
+        if (ParsedOK != ret)
+            return ret;
+
+        if (IsNull(FlagHost))
+            return ParsedBadHost;
+
+        Rewrite();
+        return ParsedOK;
+    }
+
+} |