#include "uri.h"
#include "parse.h"
#include <contrib/libs/libidn/idna.h>
#include <library/cpp/charset/recyr.hh>
#include <util/charset/wide.h>
#include <util/memory/tempbuf.h>
#include <util/string/cast.h>
#include <util/system/yassert.h>
#include <util/system/sys_alloc.h>
namespace NUri {
/// Converts a zero-terminated sequence of 32-bit code points to its ASCII
/// form by calling libidn's idna_to_ascii_4z.
///
/// @param idna  zero-terminated wide host name.
/// @return the string produced by libidn wrapped in a TMallocPtr, or a null
///         pointer when the library reports anything but IDNA_SUCCESS.
TMallocPtr<char> TUri::IDNToAscii(const wchar32* idna) {
    // The pointer is reinterpreted below, so the element widths must agree.
    static_assert(sizeof(*idna) == sizeof(ui32), "fixme");

    // XXX: punycode_encode is deliberately not called directly: it includes
    // neither proper stringprep nor splitting on dot-equivalent characters.
    char* ascii;
    const bool converted =
        IDNA_SUCCESS == idna_to_ascii_4z(reinterpret_cast<const uint32_t*>(idna), &ascii, 0);
    if (!converted) {
        ascii = nullptr;
    }
    return ascii;
}
/// Recodes `host` from charset `enc` into 32-bit characters and converts the
/// result to ASCII via the wide-character overload.
///
/// @param host  host name bytes in charset `enc`.
/// @param enc   charset of `host`.
/// @return whatever IDNToAscii(const wchar32*) returns for the recoded text.
TMallocPtr<char> TUri::IDNToAscii(const TStringBuf& host, ECharset enc) {
    // Room for one wide character per input byte plus the terminating zero.
    const size_t capacity = 1 + host.length();
    TTempBuf storage(sizeof(wchar32) * capacity);
    wchar32* wide = reinterpret_cast<wchar32*>(storage.Data());

    const size_t count = NDetail::NBaseOps::Recode(host, wide, enc).length();
    wide[count] = 0; // the wide overload expects a zero-terminated string

    return IDNToAscii(wide);
}
/// Produces the decoded, lower-cased ASCII form of a host name, going through
/// IDN conversion when extended (non-ASCII) characters are present.
///
/// @param host         host text as it appeared in the URL.
/// @param buf          receives the storage used for the conversion; on the
///                     punycode path it is replaced by the converted string.
/// @param hasExtended  caller's knowledge that `host` has extended-ASCII bytes.
/// @param allowIDN     whether extended characters may be converted.
/// @param enc          charset of `host`.
/// @return the converted host, or an empty buffer when conversion is not
///         possible or not permitted.
TStringBuf TUri::HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc) {
    TStringBuf result; // stays empty on every failure path

    // Extended characters without IDN permission can never be converted.
    if (hasExtended && !allowIDN) {
        return result;
    }

    // RFC 3986, 3.2.2 requires percent-encoded non-ASCII characters in a
    // reg-name to be UTF-8, so recode to UTF-8 before percent-decoding.
    const bool recoding = hasExtended && CODES_UTF8 != enc;
    size_t capacity = 0;
    if (recoding) {
        capacity = host.length() * 4; // four output bytes reserved per input byte
        buf.Reset(static_cast<char*>(y_allocate(capacity)));
        size_t consumed;
        size_t produced;
        if (RECODE_OK != Recode(enc, CODES_UTF8, host.data(), buf.Get(), host.length(), capacity, consumed, produced)) {
            return result;
        }
        host = TStringBuf(buf.Get(), produced);
    }

    // No buffer yet (or a zero-sized one): allocate one the size of the host.
    if (0 == capacity) {
        capacity = host.length();
        buf.Reset(static_cast<char*>(y_allocate(capacity)));
    }

    // Percent-decode and lower-case. Decoding only shortens the text, so it
    // is fine for the output to overwrite a host that already lives in buf.
    TMemoryWriteBuffer decoded(buf.Get(), capacity);
    TEncoder decoder(decoded, FeatureDecodeANY | FeatureToLower);
    const long seenFlags = decoder.ReEncode(host);
    hasExtended = 0 != (seenFlags & FeatureEncodeExtendedASCII);

    // Decoding may have revealed extended characters: check permission again.
    if (hasExtended && !allowIDN) {
        return result;
    }

    host = decoded.Str();

    // Pure ASCII needs no punycode.
    if (!hasExtended) {
        result = host;
        return result;
    }

    TMallocPtr<char> puny;
    try {
        puny = IDNToAscii(host);
    } catch (const yexception& /* exc */) {
        // best effort: fall through to the retry below
    }

    if (!puny) {
        // XXX: retry with the user charset, unless the text is UTF-8 already
        // or has been converted to it above.
        const bool canRetry = CODES_UTF8 != enc && !recoding;
        if (!canRetry) {
            return result;
        }
        try {
            puny = IDNToAscii(host, enc);
        } catch (const yexception& /* exc */) {
            return result;
        }
        if (!puny) {
            return result;
        }
    }

    // Hand the punycode string over to the caller's buffer and return it.
    buf = puny;
    result = buf.Get();
    return result;
}
/// Scans `host` to find out which character classes it contains and converts
/// it to ASCII only when that is necessary.
///
/// @param host      host text as it appeared in the URL.
/// @param buf       storage for a converted result, filled by the five-argument
///                  overload when a conversion takes place.
/// @param allowIDN  whether extended characters may be converted.
/// @param enc       charset of `host`.
/// @return `host` itself when nothing has to be decoded, the converted host
///         otherwise, or an empty buffer when extended characters are present
///         but IDN is not allowed (or the conversion fails).
TStringBuf TUri::HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc) {
    // Collect the feature flags of every character in the host.
    long seen = 0;
    for (const char ch : host) {
        seen |= TEncoder::GetFlags(ch).FeatFlags;
    }

    const bool extended = 0 != (seen & FeatureEncodeExtendedASCII);

    // Extended characters can only be handled when IDN is allowed.
    if (extended && !allowIDN) {
        return TStringBuf();
    }

    // Nothing extended and nothing to decode: the input is already fine.
    if (!extended && 0 == (seen & FeatureDecodeANY)) {
        return host;
    }

    return HostToAscii(host, buf, extended, allowIDN, enc);
}
/// Writes a field value to `out`, re-encoding it when any encoder feature is
/// requested in `flags`.
///
/// @return false (nothing written) for an empty value, true otherwise.
static inline bool AppendField(TMemoryWriteBuffer& out, TField::EField fld, const TStringBuf& val, long flags) {
    if (!val.empty()) {
        const bool wantsEncoder = 0 != (flags & TFeature::FeaturesAllEncoder);
        if (!wantsEncoder) {
            out << val; // no encoder feature requested: copy verbatim
        } else {
            TUri::ReEncodeField(out, val, fld, flags);
        }
        return true;
    }
    return false;
}
// Fills this URI from the sections of an already-run parser.
//
// Outline:
//   1. Clear() the object; give up at once if the parser state is
//      ParsedBadFormat or beyond, and after setting the scheme if the state
//      is anything but ParsedOK.
//   2. Convert the host to ASCII (punycode) where needed and allowed.
//   3. Add up the buffer space needed by all fields that are not set yet.
//   4. Detect the "#!" <-> "_escaped_fragment_=" rewriting cases.
//   5. Write every field into Buffer, zero-separated, and register it.
//   6. Apply the default scheme, check missing fields, parse the port.
//   7. Optionally check the host with CheckHost().
//
// Parameters:
//   parser    - source of the parsed sections, flags, scheme and charset.
//   defscheme - scheme to apply when the URL has none (GetScheme() ==
//               SchemeEmpty). SchemeEmpty means "leave as is"; SchemeUnknown
//               results in ParsedBadScheme.
//
// Returns parser.State when it is not ParsedOK; otherwise ParsedOK or one of
// ParsedBadHost, ParsedBadPath, ParsedBadFormat, ParsedBadScheme,
// ParsedBadPort, or the result of CheckHost().
TState::EParsed TUri::AssignImpl(const TParser& parser, TScheme::EKind defscheme) {
Clear();
TState::EParsed ret = parser.State;
// states at or beyond ParsedBadFormat leave the object cleared
if (ParsedBadFormat <= ret)
return ret;
const TSection& scheme = parser.Get(FieldScheme);
const TSchemeInfo& schemeInfo = SetSchemeImpl(parser.Scheme);
// set the scheme always if available: when the scheme info carries no
// string, keep the scheme text exactly as it was parsed
if (schemeInfo.Str.empty() && scheme.IsSet())
FldSet(FieldScheme, scheme.Get());
if (ParsedOK != ret)
return ret;
// number of bytes needed in Buffer for all fields written below
size_t buflen = 0;
// special processing for fields
const bool convertIDN = parser.Flags & FeatureConvertHostIDN;
long flags = parser.Flags.Allow;
// converting the host implies that IDN is allowed and the host is checked
if (convertIDN)
flags |= FeatureAllowHostIDN | FeatureCheckHost;
// process non-ASCII host for punycode
TMallocPtr<char> hostptr; // storage for hostascii when a conversion happened
TStringBuf hostascii; // empty: use host field; non-empty: ascii
bool hostConverted = false; // hostascii is empty or the original
const TSection& host = parser.Get(FieldHost);
if (host.IsSet() && !FldIsSet(FieldHost)) {
const bool allowIDN = (flags & FeatureAllowHostIDN);
const TStringBuf hostbuf = host.Get();
// if we know we have and allow extended-ASCII chars, no need to check further
if (allowIDN && (host.GetFlagsAllPlaintext() & FeatureEncodeExtendedASCII))
hostascii = HostToAscii(hostbuf, hostptr, true, true, parser.Enc);
else
hostascii = HostToAscii(hostbuf, hostptr, allowIDN, parser.Enc);
if (hostascii.empty())
ret = ParsedBadHost; // exists but cannot be converted
// a different data pointer means HostToAscii returned a converted copy
else if (hostbuf.data() != hostascii.data()) {
hostConverted = true;
buflen += 1 + hostascii.length();
if (convertIDN)
FldMarkSet(FieldHost); // so that we don't process host below
}
}
// add unprocessed fields
for (int idx = 0; idx < FieldUrlMAX; ++idx) {
const EField fld = EField(idx);
const TSection& section = parser.Get(fld);
if (section.IsSet() && !FldIsSet(fld))
buflen += 1 + section.EncodedLen(); // includes null
}
if (0 == buflen) // no more sections set?
return ret;
// process #! fragments
// https://developers.google.com/webmasters/ajax-crawling/docs/specification
static const TStringBuf escFragPrefix(TStringBuf("_escaped_fragment_="));
bool encHashBangFrag = false; // true: "#!frag" is to be stored in the query
TStringBuf qryBeforeEscapedFragment; // query part before the last '&'
TStringBuf qryEscapedFragment; // inited: escaped fragment goes to the fragment
// do { ... } while (false) is used only so that 'break' can leave the block
do {
if (FldIsSet(FieldFrag) || FldIsSet(FieldQuery))
break;
const TSection& frag = parser.Get(FieldFrag);
if (frag.IsSet()) {
if (0 == (parser.Flags & FeatureHashBangToEscapedFragment))
break;
const TStringBuf fragbuf = frag.Get();
if (fragbuf.empty() || '!' != fragbuf[0])
break;
encHashBangFrag = true;
// '!' will make space for '&' or '\0' if needed
buflen += escFragPrefix.length();
buflen += 2 * fragbuf.length(); // we don't know how many will be encoded
} else {
const TSection& qry = parser.Get(FieldQuery);
if (!qry.IsSet())
break;
// FeatureHashBangToEscapedFragment has preference
if (FeatureEscapedToHashBangFragment != (parser.Flags & FeaturesEscapedFragment))
break;
// only the last '&'-separated query parameter is considered
qry.Get().RSplit('&', qryBeforeEscapedFragment, qryEscapedFragment);
if (!qryEscapedFragment.StartsWith(escFragPrefix)) {
qryEscapedFragment.Clear();
break;
}
qryEscapedFragment.Skip(escFragPrefix.length());
buflen += 2; // for '!' and '\0' in fragment
// the prefix itself is not written to the output
buflen -= escFragPrefix.length();
}
} while (false);
// now set all fields prior to validating
Alloc(buflen);
TMemoryWriteBuffer out(Buffer.data(), Buffer.size());
for (int idx = 0; idx < FieldUrlMAX; ++idx) {
const EField fld = EField(idx);
const TSection& section = parser.Get(fld);
if (!section.IsSet() || FldIsSet(fld))
continue;
// the query is written together with the fragment (see FieldFrag below)
if (FieldQuery == fld && encHashBangFrag)
continue;
// the fragment is replaced by the escaped one (see FieldQuery below)
if (FieldFrag == fld && qryEscapedFragment.IsInited())
continue;
char* beg = out.Buf(); // start of the value about to be written
TStringBuf val = section.Get();
long careFlags = section.GetFlagsEncode();
switch (fld) {
default:
break;
case FieldQuery:
// "...&_escaped_fragment_=X" -> fragment "!X"; the rest stays the query
if (qryEscapedFragment.IsInited()) {
const EField dstfld = FieldFrag; // that's where we will store
out << '!';
if (!qryEscapedFragment.empty())
ReEncodeToField(out, qryEscapedFragment, fld, FeatureDecodeANY | careFlags, dstfld, FeatureDecodeANY | parser.GetFieldFlags(dstfld));
FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
// nothing left for the query itself
if (qryBeforeEscapedFragment.empty())
continue;
out << '\0';
beg = out.Buf();
val = qryBeforeEscapedFragment;
}
break;
case FieldFrag:
// "#!X" -> query "[<query>&]_escaped_fragment_=X", no fragment
if (encHashBangFrag) {
const EField dstfld = FieldQuery; // that's where we will store
const TSection& qry = parser.Get(dstfld);
if (qry.IsSet())
if (AppendField(out, dstfld, qry.Get(), qry.GetFlagsEncode()))
out << '&';
out << escFragPrefix;
val.Skip(1); // skip '!'
ReEncodeToField(out, val, fld, careFlags, dstfld, parser.GetFieldFlags(dstfld));
FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
continue;
}
break;
}
AppendField(out, fld, val, careFlags);
char* end = out.Buf();
if (careFlags & FeaturePathOperation) {
// NOTE(review): PathOperation receives beg/end and the write position is
// then moved to 'end', so it presumably rewrites the path in place and
// updates 'end' - confirm against its declaration
if (!PathOperation(beg, end, PathOperationFlag(parser.Flags)))
return ParsedBadPath;
Y_ASSERT(beg >= out.Beg());
out.SetPos(end);
}
FldSetNoDirty(fld, TStringBuf(beg, end));
// special character case
const long checkChars = section.GetFlagsAllPlaintext() & FeaturesCheckSpecialChar;
if (0 != checkChars) { // has unencoded special chars: check permission
const long allowChars = parser.GetFieldFlags(fld) & checkChars;
// every special character class present must be allowed for the field
if (checkChars != allowChars)
ret = ParsedBadFormat;
}
out << '\0';
}
// the converted host replaces the host field when conversion was requested,
// otherwise it is stored next to it as FieldHostAscii
if (hostConverted) {
char* beg = out.Buf();
out << hostascii;
char* end = out.Buf();
const EField fld = convertIDN ? FieldHost : FieldHostAscii;
FldSetNoDirty(fld, TStringBuf(beg, end));
out << '\0';
}
// buflen was an estimate: shrink to what was actually written
Buffer.Resize(out.Len());
// apply the default scheme if the URL did not have one
if (GetScheme() == SchemeEmpty && SchemeEmpty != defscheme) {
if (SchemeUnknown == defscheme)
ret = ParsedBadScheme;
else
SetSchemeImpl(defscheme);
}
if (0 == (parser.Flags & FeatureAllowEmptyPath))
CheckMissingFields();
// the port must fit into ui16
const TStringBuf& port = GetField(FieldPort);
if (!port.empty()) {
if (!TryFromString<ui16>(port, Port))
ret = ParsedBadPort;
}
if (ParsedOK != ret)
return ret;
// run validity checks now that all fields are set
// check the host for DNS compliance
do {
if (0 == (flags & FeatureCheckHost))
break;
// no ascii form was produced above: check the host field itself
if (hostascii.empty())
hostascii = GetField(FieldHost);
if (hostascii.empty())
break;
// IP literal: enclosed in brackets, not checked
if ('[' == hostascii[0] && ']' == hostascii.back())
break;
ret = CheckHost(hostascii);
if (ParsedOK != ret)
return ret;
} while (false);
return ret;
}
/// Clears this URI, then parses `url` and assigns the parsed parts to it.
///
/// @param url        text to parse.
/// @param flags      parse flags handed to the parser.
/// @param maxlen     maximum accepted length of `url`; 0 disables the limit.
/// @param defscheme  default scheme forwarded to AssignImpl.
/// @param enc        charset of `url`.
/// @return ParsedEmpty for an empty url, ParsedTooLong when the limit is
///         exceeded, otherwise the result of AssignImpl.
TState::EParsed TUri::ParseImpl(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
    Clear();

    if (url.empty()) {
        return ParsedEmpty;
    }

    const bool limited = maxlen > 0;
    if (limited && url.length() > maxlen) {
        return ParsedTooLong;
    }

    const TParser parsed(flags, url, enc);
    return AssignImpl(parsed, defscheme);
}
/// Parses `url` and, if a base URL is given and the result is not a valid
/// absolute URI, merges it with the parsed base.
///
/// @param url       text to parse.
/// @param flags     parse flags; FeatureNoRelPath is excluded for `url` when
///                  a base is supplied.
/// @param url_base  base URL text; empty means "no base".
/// @param maxlen    maximum accepted length, applied to both `url` and the base.
/// @param enc       charset of both strings.
/// @return the first non-OK status from parsing `url` or the base, else ParsedOK.
TState::EParsed TUri::Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& url_base, ui32 maxlen, ECharset enc) {
    const bool haveBase = !url_base.empty();
    const TParseFlags relativeFlags = flags.Exclude(FeatureNoRelPath);

    TState::EParsed status = ParseImpl(url, haveBase ? relativeFlags : flags, maxlen, SchemeEmpty, enc);
    if (ParsedOK != status) {
        return status;
    }

    if (haveBase && !IsValidAbs()) {
        // The base itself is parsed with the caller's original flags.
        TUri baseUri;
        status = baseUri.ParseImpl(url_base, flags, maxlen, SchemeEmpty, enc);
        if (ParsedOK != status) {
            return status;
        }
        Merge(baseUri, PathOperationFlag(flags));
    }

    Rewrite();
    return status;
}
/// Parses `url` and merges it with an already-parsed `base` when the result
/// is not a valid absolute URI.
///
/// @param url     text to parse.
/// @param base    base URI used for the merge.
/// @param flags   parse flags; also the source of the path-operation flag
///                used for the merge.
/// @param maxlen  maximum accepted length of `url`.
/// @param enc     charset of `url`.
/// @return the status of parsing `url`.
TState::EParsed TUri::Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags, ui32 maxlen, ECharset enc) {
    const TState::EParsed status = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);
    if (ParsedOK == status) {
        if (!IsValidAbs()) {
            Merge(base, PathOperationFlag(flags));
        }
        Rewrite();
    }
    return status;
}
/// Parses `url` with FeatureNoRelPath added to the flags and requires the
/// result to have a host.
///
/// @param url        text to parse.
/// @param flags      parse flags; FeatureNoRelPath is added to them.
/// @param maxlen     maximum accepted length of `url`.
/// @param defscheme  default scheme forwarded to ParseImpl.
/// @param enc        charset of `url`.
/// @return the parse status if it is not ParsedOK, ParsedBadHost when the
///         host is null, ParsedOK otherwise.
TState::EParsed TUri::ParseAbsUri(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
    const TState::EParsed status = ParseImpl(url, flags | FeatureNoRelPath, maxlen, defscheme, enc);
    if (ParsedOK != status) {
        return status;
    }

    const bool hostMissing = IsNull(FlagHost);
    if (hostMissing) {
        return ParsedBadHost;
    }

    Rewrite();
    return ParsedOK;
}
}