diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/url/url.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils/url/url.cpp')
-rw-r--r-- | library/cpp/string_utils/url/url.cpp | 421 |
1 files changed, 421 insertions, 0 deletions
diff --git a/library/cpp/string_utils/url/url.cpp b/library/cpp/string_utils/url/url.cpp new file mode 100644 index 0000000000..85f4ac5d69 --- /dev/null +++ b/library/cpp/string_utils/url/url.cpp @@ -0,0 +1,421 @@ +#include "url.h" + +#include <util/string/cast.h> +#include <util/string/util.h> +#include <util/string/cstriter.h> +#include <util/string/ascii.h> +#include <util/string/strip.h> + +#include <util/charset/unidata.h> // for ToLower +#include <util/system/defaults.h> +#include <util/generic/algorithm.h> +#include <util/generic/hash_set.h> +#include <util/generic/yexception.h> +#include <util/generic/singleton.h> + +#include <cstdlib> + +namespace { + struct TUncheckedSize { + static bool Has(size_t) { + return true; + } + }; + + struct TKnownSize { + size_t MySize; + explicit TKnownSize(size_t sz) + : MySize(sz) + { + } + bool Has(size_t sz) const { + return sz <= MySize; + } + }; + + template <typename TChar1, typename TChar2> + int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) { + for (size_t i = 0; i < n; ++i) { + if ((TChar1)ToLower(s1[i]) != s2[i]) + return (TChar1)ToLower(s1[i]) < s2[i] ? -1 : 1; + } + return 0; + } + + template <typename TChar, typename TBounds> + inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) { + const TChar httpPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0}; + const TChar httpsPrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0}; + if (urlSize.Has(7) && Compare1Case2(url, httpPrefix, 7) == 0) + return 7; + if (!ignorehttps && urlSize.Has(8) && Compare1Case2(url, httpsPrefix, 8) == 0) + return 8; + return 0; + } + + template <typename T> + inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) { + size_t prefixSize = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), TKnownSize(url.size()), ignorehttps); + if (prefixSize) + return url.substr(prefixSize); + return url; + } +} + +namespace NUrl { + + TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) { + TStringBuf host = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false); + TStringBuf path = url; + path.SkipPrefix(host); + return {host, path}; + } + +} // namespace NUrl + +size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept { + return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps); +} + +size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept { + return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps); +} + +size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept { + return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps); +} + +size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept { + return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps); +} + +TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept { + return CutHttpPrefixImpl(url, ignorehttps); +} + +TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept { + return CutHttpPrefixImpl(url, ignorehttps); +} + +size_t GetSchemePrefixSize(const TStringBuf url) noexcept { + struct TDelim: public str_spn { + inline TDelim() + : str_spn("!-/:-@[-`{|}", true) + { + } + }; + + const auto& delim = *Singleton<TDelim>(); + const char* n = delim.brk(url.data(), url.end()); + + if (n + 2 >= url.end() || *n != ':' || n[1] != '/' || n[2] != '/') { + return 0; + } + + return n + 3 - url.begin(); +} + +TStringBuf GetSchemePrefix(const TStringBuf url) noexcept { + return url.Head(GetSchemePrefixSize(url)); +} + +TStringBuf CutSchemePrefix(const TStringBuf url) noexcept { + return url.Tail(GetSchemePrefixSize(url)); +} + +template <bool KeepPort> +static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) { + TStringBuf urlNoScheme = url; + + urlNoScheme.Skip(GetHttpPrefixSize(url)); + + struct TDelim: public str_spn { + inline TDelim() + : str_spn(KeepPort ? "/;?#" : "/:;?#") + { + } + }; + + const auto& nonHostCharacters = *Singleton<TDelim>(); + const char* firstNonHostCharacter = nonHostCharacters.brk(urlNoScheme.begin(), urlNoScheme.end()); + + if (firstNonHostCharacter != urlNoScheme.end()) { + return urlNoScheme.substr(0, firstNonHostCharacter - urlNoScheme.data()); + } + + return urlNoScheme; +} + +TStringBuf GetHost(const TStringBuf url) noexcept { + return GetHostAndPortImpl<false>(url); +} + +TStringBuf GetHostAndPort(const TStringBuf url) noexcept { + return GetHostAndPortImpl<true>(url); +} + +TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept { + const size_t schemeSize = GetSchemePrefixSize(url); + const TStringBuf scheme = url.Head(schemeSize); + + const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://")); + + TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize)); + + if (trimDefaultPort) { + const size_t pos = hostAndPort.find(':'); + if (pos != TStringBuf::npos) { + const bool isHttps = (scheme == TStringBuf("https://")); + + const TStringBuf port = hostAndPort.Tail(pos + 1); + if ((isHttp && port == TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) { + // trimming default port + hostAndPort = hostAndPort.Head(pos); + } + } + } + + if (isHttp && trimHttp) { + return hostAndPort; + } else { + return TStringBuf(scheme.begin(), hostAndPort.end()); + } +} + +void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) { + auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url); + host = hostBuf; + path = pathBuf; +} + +void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) { + auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url); + host = hostBuf; + path = pathBuf; +} + +void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) { + TStringBuf urlWithoutFragment; + if (!url.TrySplit('#', urlWithoutFragment, fragment)) { + fragment = ""; + urlWithoutFragment = url; + } + if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) { + query = ""; + sanitizedUrl = urlWithoutFragment; + } +} + +bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) { + const size_t schemeSize = GetSchemePrefixSize(url); + if (schemeSize != 0) { + scheme = url.Head(schemeSize); + } + + TStringBuf portStr; + TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize)); + if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) { + // URL has port + if (!TryFromString(portStr, port)) { + return false; + } + } else { + host = hostAndPort; + if (scheme == TStringBuf("https://")) { + port = 443; + } else if (scheme == TStringBuf("http://")) { + port = 80; + } + } + return true; +} + +void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) { + bool isOk = TryGetSchemeHostAndPort(url, scheme, host, port); + Y_ENSURE(isOk, "cannot parse port number from URL: " << url); +} + +TStringBuf GetOnlyHost(const TStringBuf url) noexcept { + return GetHost(CutSchemePrefix(url)); +} + +TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept { + const size_t off = url.find('/', GetHttpPrefixSize(url)); + TStringBuf hostUnused, path; + if (!url.TrySplitAt(off, hostUnused, path)) + return "/"; + + return trimFragment ? path.Before('#') : path; +} + +// this strange creature returns 2nd level domain, possibly with port +TStringBuf GetDomain(const TStringBuf host) noexcept { + const char* c = !host ? host.data() : host.end() - 1; + for (bool wasPoint = false; c != host.data(); --c) { + if (*c == '.') { + if (wasPoint) { + ++c; + break; + } + wasPoint = true; + } + } + return TStringBuf(c, host.end()); +} + +TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept { + size_t pos = host.size(); + for (size_t i = 0; i < level; ++i) { + pos = host.rfind('.', pos); + if (pos == TString::npos) + return host; + } + return host.SubStr(pos + 1); +} + +TStringBuf GetZone(const TStringBuf host) noexcept { + return GetParentDomain(host, 1); +} + +TStringBuf CutWWWPrefix(const TStringBuf url) noexcept { + if (url.size() >= 4 && url[3] == '.' && !strnicmp(url.data(), "www", 3)) + return url.substr(4); + return url; +} + +TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept { + auto it = url.begin(); + + StripRangeBegin(it, url.end(), [](auto& it){ return *it == 'w' || *it == 'W'; }); + if (it == url.begin()) { + return url; + } + + StripRangeBegin(it, url.end(), [](auto& it){ return IsAsciiDigit(*it); }); + if (it == url.end()) { + return url; + } + + if (*it++ == '.') { + return url.Tail(it - url.begin()); + } + + return url; +} + +TStringBuf CutMPrefix(const TStringBuf url) noexcept { + if (url.size() >= 2 && url[1] == '.' && (url[0] == 'm' || url[0] == 'M')) { + return url.substr(2); + } + return url; +} + +static inline bool IsSchemeChar(char c) noexcept { + return IsAsciiAlnum(c); //what about '+' ?.. +} + +static bool HasPrefix(const TStringBuf url) noexcept { + TStringBuf scheme, unused; + if (!url.TrySplit(TStringBuf("://"), scheme, unused)) + return false; + + return AllOf(scheme, IsSchemeChar); +} + +TString AddSchemePrefix(const TString& url) { + return AddSchemePrefix(url, TStringBuf("http")); +} + +TString AddSchemePrefix(const TString& url, TStringBuf scheme) { + if (HasPrefix(url)) { + return url; + } + + return TString::Join(scheme, TStringBuf("://"), url); +} + +#define X(c) (c >= 'A' ? ((c & 0xdf) - 'A') + 10 : (c - '0')) + +static inline int x2c(unsigned char* x) { + if (!IsAsciiHex(x[0]) || !IsAsciiHex(x[1])) + return -1; + return X(x[0]) * 16 + X(x[1]); +} + +#undef X + +static inline int Unescape(char* str) { + char *to, *from; + int dlen = 0; + if ((str = strchr(str, '%')) == nullptr) + return dlen; + for (to = str, from = str; *from; from++, to++) { + if ((*to = *from) == '%') { + int c = x2c((unsigned char*)from + 1); + *to = char((c > 0) ? c : '0'); + from += 2; + dlen += 2; + } + } + *to = 0; /* terminate it at the new length */ + return dlen; +} + +size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) { + if (source.empty() || source[0] == '?') + return strlcpy(dest, "/", dest_size); + size_t len = Min(dest_size - 1, source.length()); + memcpy(dest, source.data(), len); + dest[len] = 0; + len -= Unescape(dest); + strlwr(dest); + return len; +} + +size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) { + size_t len = Min(dest_size - 1, source.length()); + memcpy(dest, source.data(), len); + dest[len] = 0; + char buf[8] = ":"; + size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2); + buf[buflen] = '\0'; + char* ptr = strstr(dest, buf); + if (ptr && ptr[buflen] == 0) { + len -= buflen; + *ptr = 0; + } + strlwr(dest); + return len; +} + +TStringBuf RemoveFinalSlash(TStringBuf str) noexcept { + if (str.EndsWith('/')) { + str.Chop(1); + } + return str; +} + +TStringBuf CutUrlPrefixes(TStringBuf url) noexcept { + url = CutSchemePrefix(url); + url = CutWWWPrefix(url); + return url; +} + +bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept { + url = CutSchemePrefix(url); + const TStringBuf noHostSuffix = url.After('/'); + if (noHostSuffix == url) { + // no slash => no suffix with token info + return false; + } + const bool suffixHasPrefix = noHostSuffix.StartsWith(token); + if (!suffixHasPrefix) { + return false; + } + const bool slashAfterPrefix = noHostSuffix.find("/", token.length()) == token.length(); + const bool qMarkAfterPrefix = noHostSuffix.find("?", token.length()) == token.length(); + const bool nothingAfterPrefix = noHostSuffix.length() <= token.length(); + const bool prefixIsToken = slashAfterPrefix || qMarkAfterPrefix || nothingAfterPrefix; + return prefixIsToken; +} + |