diff options
| author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
|---|---|---|
| committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
| commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
| tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/url | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/string_utils/url')
| -rw-r--r-- | library/cpp/string_utils/url/url.cpp | 421 | ||||
| -rw-r--r-- | library/cpp/string_utils/url/url.h | 170 | ||||
| -rw-r--r-- | library/cpp/string_utils/url/url_ut.cpp | 281 | ||||
| -rw-r--r-- | library/cpp/string_utils/url/ut/ya.make | 9 | ||||
| -rw-r--r-- | library/cpp/string_utils/url/ya.make | 10 | 
5 files changed, 891 insertions, 0 deletions
diff --git a/library/cpp/string_utils/url/url.cpp b/library/cpp/string_utils/url/url.cpp new file mode 100644 index 00000000000..85f4ac5d693 --- /dev/null +++ b/library/cpp/string_utils/url/url.cpp @@ -0,0 +1,421 @@ +#include "url.h" + +#include <util/string/cast.h> +#include <util/string/util.h> +#include <util/string/cstriter.h> +#include <util/string/ascii.h> +#include <util/string/strip.h> + +#include <util/charset/unidata.h> // for ToLower +#include <util/system/defaults.h> +#include <util/generic/algorithm.h> +#include <util/generic/hash_set.h> +#include <util/generic/yexception.h> +#include <util/generic/singleton.h> + +#include <cstdlib> + +namespace { +    struct TUncheckedSize { +        static bool Has(size_t) { +            return true; +        } +    }; + +    struct TKnownSize { +        size_t MySize; +        explicit TKnownSize(size_t sz) +            : MySize(sz) +        { +        } +        bool Has(size_t sz) const { +            return sz <= MySize; +        } +    }; + +    template <typename TChar1, typename TChar2> +    int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) { +        for (size_t i = 0; i < n; ++i) { +            if ((TChar1)ToLower(s1[i]) != s2[i]) +                return (TChar1)ToLower(s1[i]) < s2[i] ? 
-1 : 1; +        } +        return 0; +    } + +    template <typename TChar, typename TBounds> +    inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) { +        const TChar httpPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0}; +        const TChar httpsPrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0}; +        if (urlSize.Has(7) && Compare1Case2(url, httpPrefix, 7) == 0) +            return 7; +        if (!ignorehttps && urlSize.Has(8) && Compare1Case2(url, httpsPrefix, 8) == 0) +            return 8; +        return 0; +    } + +    template <typename T> +    inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) { +        size_t prefixSize = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), TKnownSize(url.size()), ignorehttps); +        if (prefixSize) +            return url.substr(prefixSize); +        return url; +    } +} + +namespace NUrl { + +    TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) { +        TStringBuf host = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false); +        TStringBuf path = url; +        path.SkipPrefix(host); +        return {host, path}; +    } + +} // namespace NUrl + +size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept { +    return GetHttpPrefixSizeImpl<char>(url, TUncheckedSize(), ignorehttps); +} + +size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept { +    return GetHttpPrefixSizeImpl<wchar16>(url, TUncheckedSize(), ignorehttps); +} + +size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept { +    return GetHttpPrefixSizeImpl<char>(url.data(), TKnownSize(url.size()), ignorehttps); +} + +size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept { +    return GetHttpPrefixSizeImpl<wchar16>(url.data(), TKnownSize(url.size()), ignorehttps); +} + +TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept { +    return 
CutHttpPrefixImpl(url, ignorehttps); +} + +TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept { +    return CutHttpPrefixImpl(url, ignorehttps); +} + +size_t GetSchemePrefixSize(const TStringBuf url) noexcept { +    struct TDelim: public str_spn { +        inline TDelim() +            : str_spn("!-/:-@[-`{|}", true) +        { +        } +    }; + +    const auto& delim = *Singleton<TDelim>(); +    const char* n = delim.brk(url.data(), url.end()); + +    if (n + 2 >= url.end() || *n != ':' || n[1] != '/' || n[2] != '/') { +        return 0; +    } + +    return n + 3 - url.begin(); +} + +TStringBuf GetSchemePrefix(const TStringBuf url) noexcept { +    return url.Head(GetSchemePrefixSize(url)); +} + +TStringBuf CutSchemePrefix(const TStringBuf url) noexcept { +    return url.Tail(GetSchemePrefixSize(url)); +} + +template <bool KeepPort> +static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) { +    TStringBuf urlNoScheme = url; + +    urlNoScheme.Skip(GetHttpPrefixSize(url)); + +    struct TDelim: public str_spn { +        inline TDelim() +            : str_spn(KeepPort ? 
"/;?#" : "/:;?#") +        { +        } +    }; + +    const auto& nonHostCharacters = *Singleton<TDelim>(); +    const char* firstNonHostCharacter = nonHostCharacters.brk(urlNoScheme.begin(), urlNoScheme.end()); + +    if (firstNonHostCharacter != urlNoScheme.end()) { +        return urlNoScheme.substr(0, firstNonHostCharacter - urlNoScheme.data()); +    } + +    return urlNoScheme; +} + +TStringBuf GetHost(const TStringBuf url) noexcept { +    return GetHostAndPortImpl<false>(url); +} + +TStringBuf GetHostAndPort(const TStringBuf url) noexcept { +    return GetHostAndPortImpl<true>(url); +} + +TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept { +    const size_t schemeSize = GetSchemePrefixSize(url); +    const TStringBuf scheme = url.Head(schemeSize); + +    const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://")); + +    TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize)); + +    if (trimDefaultPort) { +        const size_t pos = hostAndPort.find(':'); +        if (pos != TStringBuf::npos) { +            const bool isHttps = (scheme == TStringBuf("https://")); + +            const TStringBuf port = hostAndPort.Tail(pos + 1); +            if ((isHttp && port == TStringBuf("80")) || (isHttps && port == TStringBuf("443"))) { +                // trimming default port +                hostAndPort = hostAndPort.Head(pos); +            } +        } +    } + +    if (isHttp && trimHttp) { +        return hostAndPort; +    } else { +        return TStringBuf(scheme.begin(), hostAndPort.end()); +    } +} + +void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) { +    auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url); +    host = hostBuf; +    path = pathBuf; +} + +void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) { +    auto [hostBuf, pathBuf] = NUrl::SplitUrlToHostAndPath(url); +    host = hostBuf; +    path = pathBuf; +} + 
+void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) { +    TStringBuf urlWithoutFragment; +    if (!url.TrySplit('#', urlWithoutFragment, fragment)) { +        fragment = ""; +        urlWithoutFragment = url; +    } +    if (!urlWithoutFragment.TrySplit('?', sanitizedUrl, query)) { +        query = ""; +        sanitizedUrl = urlWithoutFragment; +    } +} + +bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) { +    const size_t schemeSize = GetSchemePrefixSize(url); +    if (schemeSize != 0) { +        scheme = url.Head(schemeSize); +    } + +    TStringBuf portStr; +    TStringBuf hostAndPort = GetHostAndPort(url.Tail(schemeSize)); +    if (hostAndPort && hostAndPort.back() != ']' && hostAndPort.TryRSplit(':', host, portStr)) { +        // URL has port +        if (!TryFromString(portStr, port)) { +            return false; +        } +    } else { +        host = hostAndPort; +        if (scheme == TStringBuf("https://")) { +            port = 443; +        } else if (scheme == TStringBuf("http://")) { +            port = 80; +        } +    } +    return true; +} + +void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) { +    bool isOk = TryGetSchemeHostAndPort(url, scheme, host, port); +    Y_ENSURE(isOk, "cannot parse port number from URL: " << url); +} + +TStringBuf GetOnlyHost(const TStringBuf url) noexcept { +    return GetHost(CutSchemePrefix(url)); +} + +TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept { +    const size_t off = url.find('/', GetHttpPrefixSize(url)); +    TStringBuf hostUnused, path; +    if (!url.TrySplitAt(off, hostUnused, path)) +        return "/"; + +    return trimFragment ? 
path.Before('#') : path; +} + +// this strange creature returns 2nd level domain, possibly with port +TStringBuf GetDomain(const TStringBuf host) noexcept { +    const char* c = !host ? host.data() : host.end() - 1; +    for (bool wasPoint = false; c != host.data(); --c) { +        if (*c == '.') { +            if (wasPoint) { +                ++c; +                break; +            } +            wasPoint = true; +        } +    } +    return TStringBuf(c, host.end()); +} + +TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept { +    size_t pos = host.size(); +    for (size_t i = 0; i < level; ++i) { +        pos = host.rfind('.', pos); +        if (pos == TString::npos) +            return host; +    } +    return host.SubStr(pos + 1); +} + +TStringBuf GetZone(const TStringBuf host) noexcept { +    return GetParentDomain(host, 1); +} + +TStringBuf CutWWWPrefix(const TStringBuf url) noexcept { +    if (url.size() >= 4 && url[3] == '.' && !strnicmp(url.data(), "www", 3)) +        return url.substr(4); +    return url; +} + +TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept { +    auto it = url.begin(); + +    StripRangeBegin(it, url.end(), [](auto& it){ return *it == 'w' || *it == 'W'; }); +    if (it == url.begin()) { +        return url; +    } + +    StripRangeBegin(it, url.end(), [](auto& it){ return IsAsciiDigit(*it); }); +    if (it == url.end()) { +        return url; +    } + +    if (*it++ == '.') { +        return url.Tail(it - url.begin()); +    } + +    return url; +} + +TStringBuf CutMPrefix(const TStringBuf url) noexcept { +    if (url.size() >= 2 && url[1] == '.' && (url[0] == 'm' || url[0] == 'M')) { +        return url.substr(2); +    } +    return url; +} + +static inline bool IsSchemeChar(char c) noexcept { +    return IsAsciiAlnum(c); //what about '+' ?.. 
+} + +static bool HasPrefix(const TStringBuf url) noexcept { +    TStringBuf scheme, unused; +    if (!url.TrySplit(TStringBuf("://"), scheme, unused)) +        return false; + +    return AllOf(scheme, IsSchemeChar); +} + +TString AddSchemePrefix(const TString& url) { +    return AddSchemePrefix(url, TStringBuf("http")); +} + +TString AddSchemePrefix(const TString& url, TStringBuf scheme) { +    if (HasPrefix(url)) { +        return url; +    } + +    return TString::Join(scheme, TStringBuf("://"), url); +} + +#define X(c) (c >= 'A' ? ((c & 0xdf) - 'A') + 10 : (c - '0')) + +static inline int x2c(unsigned char* x) { +    if (!IsAsciiHex(x[0]) || !IsAsciiHex(x[1])) +        return -1; +    return X(x[0]) * 16 + X(x[1]); +} + +#undef X + +static inline int Unescape(char* str) { +    char *to, *from; +    int dlen = 0; +    if ((str = strchr(str, '%')) == nullptr) +        return dlen; +    for (to = str, from = str; *from; from++, to++) { +        if ((*to = *from) == '%') { +            int c = x2c((unsigned char*)from + 1); +            *to = char((c > 0) ? 
c : '0'); +            from += 2; +            dlen += 2; +        } +    } +    *to = 0; /* terminate it at the new length */ +    return dlen; +} + +size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) { +    if (source.empty() || source[0] == '?') +        return strlcpy(dest, "/", dest_size); +    size_t len = Min(dest_size - 1, source.length()); +    memcpy(dest, source.data(), len); +    dest[len] = 0; +    len -= Unescape(dest); +    strlwr(dest); +    return len; +} + +size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) { +    size_t len = Min(dest_size - 1, source.length()); +    memcpy(dest, source.data(), len); +    dest[len] = 0; +    char buf[8] = ":"; +    size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2); +    buf[buflen] = '\0'; +    char* ptr = strstr(dest, buf); +    if (ptr && ptr[buflen] == 0) { +        len -= buflen; +        *ptr = 0; +    } +    strlwr(dest); +    return len; +} + +TStringBuf RemoveFinalSlash(TStringBuf str) noexcept { +    if (str.EndsWith('/')) { +        str.Chop(1); +    } +    return str; +} + +TStringBuf CutUrlPrefixes(TStringBuf url) noexcept { +    url = CutSchemePrefix(url); +    url = CutWWWPrefix(url); +    return url; +} + +bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept { +    url = CutSchemePrefix(url); +    const TStringBuf noHostSuffix = url.After('/'); +    if (noHostSuffix == url) { +        // no slash => no suffix with token info +        return false; +    } +    const bool suffixHasPrefix = noHostSuffix.StartsWith(token); +    if (!suffixHasPrefix) { +        return false; +    } +    const bool slashAfterPrefix = noHostSuffix.find("/", token.length()) == token.length(); +    const bool qMarkAfterPrefix = noHostSuffix.find("?", token.length()) == token.length(); +    const bool nothingAfterPrefix = noHostSuffix.length() <= token.length(); +    const bool prefixIsToken = slashAfterPrefix || 
qMarkAfterPrefix || nothingAfterPrefix; +    return prefixIsToken; +} + diff --git a/library/cpp/string_utils/url/url.h b/library/cpp/string_utils/url/url.h new file mode 100644 index 00000000000..84137ccc57d --- /dev/null +++ b/library/cpp/string_utils/url/url.h @@ -0,0 +1,170 @@ +#pragma once + +#include <util/generic/fwd.h> +#include <util/generic/strbuf.h> + +namespace NUrl { + +    /** +     * Splits URL to host and path +     * Example: +     * auto [host, path] = SplitUrlToHostAndPath(url); +     * +     * @param[in] url                   any URL +     * @param[out] <host, path>     parsed host and path +     */ +    struct TSplitUrlToHostAndPathResult { +        TStringBuf host; +        TStringBuf path; +    }; + +    Y_PURE_FUNCTION +    TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url); + +} // namespace NUrl + +Y_PURE_FUNCTION +size_t GetHttpPrefixSize(const char* url, bool ignorehttps = false) noexcept; +Y_PURE_FUNCTION +size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps = false) noexcept; + +Y_PURE_FUNCTION +size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps = false) noexcept; + +Y_PURE_FUNCTION +size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps = false) noexcept; + +/** BEWARE of TStringBuf! You can not use operator ~ or c_str() like in TString +    !!!!!!!!!!!! */ +Y_PURE_FUNCTION +size_t GetSchemePrefixSize(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetSchemePrefix(const TStringBuf url) noexcept; + +//! removes protocol prefixes 'http://' and 'https://' from given URL +//! @note if URL has no prefix or some other prefix the function does nothing +//! @param url    URL from which the prefix should be removed +//! @param ignorehttps if true, leaves https:// +//! 
@return a new URL without protocol prefix +Y_PURE_FUNCTION +TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps = false) noexcept; + +Y_PURE_FUNCTION +TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps = false) noexcept; + +Y_PURE_FUNCTION +TStringBuf CutSchemePrefix(const TStringBuf url) noexcept; + +//! adds specified scheme prefix if URL has no scheme +//! @note if URL has scheme prefix already the function returns unchanged URL +TString AddSchemePrefix(const TString& url, const TStringBuf scheme); + +//! Same as `AddSchemePrefix(url, "http")`. +TString AddSchemePrefix(const TString& url); + +Y_PURE_FUNCTION +TStringBuf GetHost(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetHostAndPort(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp = true, bool trimDefaultPort = true) noexcept; + +/** + * Splits URL to host and path + * + * @param[in] url       any URL + * @param[out] host     parsed host + * @param[out] path     parsed path + */ +void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path); +void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path); + +/** + * Separates URL into url prefix, query (aka cgi params list), and fragment (aka part after #) + * + * @param[in] url               any URL + * @param[out] sanitizedUrl     parsed URL without query and fragment parts + * @param[out] query            parsed query + * @param[out] fragment         parsed fragment + */ +void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment); + +/** + * Extracts scheme, host and port from URL. + * + * Port will be parsed from URL with checks against ui16 overflow. If URL doesn't + * contain port it will be determined by one of the known schemes (currently + * https:// and http:// only). 
+ * Given parameters will not be modified if URL has no appropriate components. + * + * @param[in] url       any URL + * @param[out] scheme   URL scheme + * @param[out] host     host name + * @param[out] port     parsed port number + * @return false if present port number cannot be parsed into ui16 + *         true  otherwise. + */ +bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port); + +/** + * Extracts scheme, host and port from URL. + * + * This function perform the same actions as TryGetSchemeHostAndPort(), but in + * case of impossibility to parse port number throws yexception. + * + * @param[in] url       any URL + * @param[out] scheme   URL scheme + * @param[out] host     host name + * @param[out] port     parsed port number + * @throws yexception  if present port number cannot be parsed into ui16. + */ +void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port); + +Y_PURE_FUNCTION +TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment = true) noexcept; +/** + * Extracts host from url and cuts http(https) protocol prefix and port if any. + * @param[in] url   any URL + * @return          host without port and http(https) prefix. + */ +Y_PURE_FUNCTION +TStringBuf GetOnlyHost(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept; // ("www.ya.ru", 2) -> "ya.ru" + +Y_PURE_FUNCTION +TStringBuf GetZone(const TStringBuf host) noexcept; + +Y_PURE_FUNCTION +TStringBuf CutWWWPrefix(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept; + +/** + * Cuts 'm.' prefix from url if and only if the url starts with it + * Example: 'm.some-domain.com' -> 'some-domain.com'. + * 'http://m.some-domain.com' is not changed + * + * @param[in] url   any URL + * @return          url without 'm.' or 'M.' prefix. 
+ */ +Y_PURE_FUNCTION +TStringBuf CutMPrefix(const TStringBuf url) noexcept; + +Y_PURE_FUNCTION +TStringBuf GetDomain(const TStringBuf host) noexcept; // should not be used + +size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size); +size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport = 80); + +Y_PURE_FUNCTION +TStringBuf RemoveFinalSlash(TStringBuf str) noexcept; + +TStringBuf CutUrlPrefixes(TStringBuf url) noexcept; +bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept; + diff --git a/library/cpp/string_utils/url/url_ut.cpp b/library/cpp/string_utils/url/url_ut.cpp new file mode 100644 index 00000000000..15880138939 --- /dev/null +++ b/library/cpp/string_utils/url/url_ut.cpp @@ -0,0 +1,281 @@ +#include "url.h" + +#include <util/string/cast.h> + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(TUtilUrlTest) { +    Y_UNIT_TEST(TestGetHostAndGetHostAndPort) { +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru:8080")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru:8080", GetHostAndPort("ya.ru:8080")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru/bebe:8080")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru/bebe:8080")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru:8080/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("https://ya.ru:8080/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetHost("www.ya.ru:8080/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetHost("https://www.ya.ru:8080/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru:8080", GetHostAndPort("ya.ru:8080/bebe")); +        // irl RFC3986 sometimes gets 
ignored +        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetHost("pravda-kmv.ru?page=news&id=6973")); +        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetHostAndPort("pravda-kmv.ru?page=news&id=6973")); +        // check simple string +        UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetHost("some_blender_url")); +        UNIT_ASSERT_VALUES_EQUAL("", GetHost("")); +    } + +    Y_UNIT_TEST(TestGetPathAndQuery) { +        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org")); +        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/")); +        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org:8080")); +        UNIT_ASSERT_VALUES_EQUAL("/index.php?123/", GetPathAndQuery("ru.wikipedia.org/index.php?123/")); +        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("http://ru.wikipedia.org:8080")); +        UNIT_ASSERT_VALUES_EQUAL("/index.php?123/", GetPathAndQuery("https://ru.wikipedia.org/index.php?123/")); +        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/#comment")); +        UNIT_ASSERT_VALUES_EQUAL("/?1", GetPathAndQuery("ru.wikipedia.org/?1#comment")); +        UNIT_ASSERT_VALUES_EQUAL("/?1#comment", GetPathAndQuery("ru.wikipedia.org/?1#comment", false)); +    } + +    Y_UNIT_TEST(TestGetDomain) { +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("www.ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("a.b.ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya", GetDomain("ya")); +        UNIT_ASSERT_VALUES_EQUAL("", GetDomain("")); +    } + +    Y_UNIT_TEST(TestGetParentDomain) { +        UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("www.ya.ru", 0)); +        UNIT_ASSERT_VALUES_EQUAL("ru", GetParentDomain("www.ya.ru", 1)); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetParentDomain("www.ya.ru", 2)); +        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", 
GetParentDomain("www.ya.ru", 3)); +        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetParentDomain("www.ya.ru", 4)); +        UNIT_ASSERT_VALUES_EQUAL("com", GetParentDomain("ya.com", 1)); +        UNIT_ASSERT_VALUES_EQUAL("ya.com", GetParentDomain("ya.com", 2)); +        UNIT_ASSERT_VALUES_EQUAL("RU", GetParentDomain("RU", 1)); +        UNIT_ASSERT_VALUES_EQUAL("RU", GetParentDomain("RU", 2)); +        UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("", 0)); +        UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("", 1)); +    } + +    Y_UNIT_TEST(TestGetZone) { +        UNIT_ASSERT_VALUES_EQUAL("ru", GetZone("www.ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("com", GetZone("ya.com")); +        UNIT_ASSERT_VALUES_EQUAL("RU", GetZone("RU")); +        UNIT_ASSERT_VALUES_EQUAL("FHFBN", GetZone("ya.FHFBN")); +        UNIT_ASSERT_VALUES_EQUAL("", GetZone("")); +    } + +    Y_UNIT_TEST(TestAddSchemePrefix) { +        UNIT_ASSERT_VALUES_EQUAL("http://yandex.ru", AddSchemePrefix("yandex.ru")); +        UNIT_ASSERT_VALUES_EQUAL("http://yandex.ru", AddSchemePrefix("http://yandex.ru")); +        UNIT_ASSERT_VALUES_EQUAL("https://yandex.ru", AddSchemePrefix("https://yandex.ru")); +        UNIT_ASSERT_VALUES_EQUAL("file://yandex.ru", AddSchemePrefix("file://yandex.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", AddSchemePrefix("ya.ru", "ftp")); +    } + +    Y_UNIT_TEST(TestSchemeGet) { +        UNIT_ASSERT_VALUES_EQUAL("http://", GetSchemePrefix("http://ya.ru/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("", GetSchemePrefix("yaru")); +        UNIT_ASSERT_VALUES_EQUAL("yaru://", GetSchemePrefix("yaru://ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("", GetSchemePrefix("ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("ftp://", GetSchemePrefix("ftp://ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("https://", GetSchemePrefix("https://")); // is that right? 
+    } + +    Y_UNIT_TEST(TestSchemeCut) { +        UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutSchemePrefix("http://ya.ru/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("yaru", CutSchemePrefix("yaru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("yaru://ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("ftp://ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("", CutSchemePrefix("https://")); // is that right? + +        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", CutHttpPrefix("ftp://ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("http://ya.ru/zzz")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("http://ya.ru/zzz", true)); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("https://ya.ru/zzz")); +        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru/zzz", CutHttpPrefix("https://ya.ru/zzz", true)); +        UNIT_ASSERT_VALUES_EQUAL("", CutHttpPrefix("https://"));               // is that right? +        UNIT_ASSERT_VALUES_EQUAL("https://", CutHttpPrefix("https://", true)); // is that right? 
+    } + +    Y_UNIT_TEST(TestMisc) { +        UNIT_ASSERT_VALUES_EQUAL("", CutWWWPrefix("www.")); +        UNIT_ASSERT_VALUES_EQUAL("", CutWWWPrefix("WwW.")); +        UNIT_ASSERT_VALUES_EQUAL("www", CutWWWPrefix("www")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWPrefix("www.ya.ru")); + +        UNIT_ASSERT_VALUES_EQUAL("", CutWWWNumberedPrefix("www.")); +        UNIT_ASSERT_VALUES_EQUAL("www", CutWWWNumberedPrefix("www")); +        UNIT_ASSERT_VALUES_EQUAL("www27", CutWWWNumberedPrefix("www27")); +        UNIT_ASSERT_VALUES_EQUAL("", CutWWWNumberedPrefix("www27.")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www.ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www2.ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www12.ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("ww2.ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("w1w2w3.ya.ru", CutWWWNumberedPrefix("w1w2w3.ya.ru")); +        UNIT_ASSERT_VALUES_EQUAL("123.ya.ru", CutWWWNumberedPrefix("123.ya.ru")); + +        UNIT_ASSERT_VALUES_EQUAL("", CutMPrefix("m.")); +        UNIT_ASSERT_VALUES_EQUAL("", CutMPrefix("M.")); +        UNIT_ASSERT_VALUES_EQUAL("m", CutMPrefix("m")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutMPrefix("m.ya.ru")); +    } + +    Y_UNIT_TEST(TestSplitUrlToHostAndPath) { +        TStringBuf host, path; + +        SplitUrlToHostAndPath("https://yandex.ru/yandsearch", host, path); +        UNIT_ASSERT_STRINGS_EQUAL(host, "https://yandex.ru"); +        UNIT_ASSERT_STRINGS_EQUAL(path, "/yandsearch"); + +        SplitUrlToHostAndPath("yandex.ru/yandsearch", host, path); +        UNIT_ASSERT_STRINGS_EQUAL(host, "yandex.ru"); +        UNIT_ASSERT_STRINGS_EQUAL(path, "/yandsearch"); + +        SplitUrlToHostAndPath("https://yandex.ru", host, path); +        UNIT_ASSERT_STRINGS_EQUAL(host, "https://yandex.ru"); +        UNIT_ASSERT_STRINGS_EQUAL(path, ""); + +        SplitUrlToHostAndPath("invalid 
url /", host, path); +        UNIT_ASSERT_STRINGS_EQUAL(host, "invalid url "); +        UNIT_ASSERT_STRINGS_EQUAL(path, "/"); + +        SplitUrlToHostAndPath("some_blender_url", host, path); +        UNIT_ASSERT_STRINGS_EQUAL(host, "some_blender_url"); +        UNIT_ASSERT_STRINGS_EQUAL(path, ""); +    } + +    Y_UNIT_TEST(TestSeparateUrlFromQueryAndFragment) { +        TStringBuf sanitizedUrl, query, fragment; + +        SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch", sanitizedUrl, query, fragment); +        UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch"); +        UNIT_ASSERT_STRINGS_EQUAL(query, ""); +        UNIT_ASSERT_STRINGS_EQUAL(fragment, ""); + +        SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch?param1=val1&param2=val2", sanitizedUrl, query, fragment); +        UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch"); +        UNIT_ASSERT_STRINGS_EQUAL(query, "param1=val1&param2=val2"); +        UNIT_ASSERT_STRINGS_EQUAL(fragment, ""); + +        SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch#fragment", sanitizedUrl, query, fragment); +        UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch"); +        UNIT_ASSERT_STRINGS_EQUAL(query, ""); +        UNIT_ASSERT_STRINGS_EQUAL(fragment, "fragment"); + +        SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch?param1=val1&param2=val2#fragment", sanitizedUrl, query, fragment); +        UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch"); +        UNIT_ASSERT_STRINGS_EQUAL(query, "param1=val1&param2=val2"); +        UNIT_ASSERT_STRINGS_EQUAL(fragment, "fragment"); +    } + +    Y_UNIT_TEST(TestGetSchemeHostAndPort) { +        { // all components are present +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("https://ya.ru:8080/bebe", scheme, host, port); +            UNIT_ASSERT_VALUES_EQUAL(scheme, "https://"); +   
         UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); +            UNIT_ASSERT_VALUES_EQUAL(port, 8080); +        } +        { // scheme is absent +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("ya.ru:8080/bebe", scheme, host, port); +            UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown"); +            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); +            UNIT_ASSERT_VALUES_EQUAL(port, 8080); +        } +        { // scheme and port are absent +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("ya.ru/bebe", scheme, host, port); +            UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown"); +            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); +            UNIT_ASSERT_VALUES_EQUAL(port, 0); +        } +        { // port is absent, but returned its default value for HTTP +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("http://ya.ru/bebe", scheme, host, port); +            UNIT_ASSERT_VALUES_EQUAL(scheme, "http://"); +            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); +            UNIT_ASSERT_VALUES_EQUAL(port, 80); +        } +        { // port is absent, but returned its default value for HTTPS +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("https://ya.ru/bebe", scheme, host, port); +            UNIT_ASSERT_VALUES_EQUAL(scheme, "https://"); +            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru"); +            UNIT_ASSERT_VALUES_EQUAL(port, 443); +        } +        { // ipv6 +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("https://[1080:0:0:0:8:800:200C:417A]:443/bebe", scheme, host, port); +            UNIT_ASSERT_VALUES_EQUAL(scheme, "https://"); +            UNIT_ASSERT_VALUES_EQUAL(host, 
"[1080:0:0:0:8:800:200C:417A]"); +            UNIT_ASSERT_VALUES_EQUAL(port, 443); +        } +        { // ipv6 +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("[::1]/bebe", scheme, host, port); +            UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown"); +            UNIT_ASSERT_VALUES_EQUAL(host, "[::1]"); +            UNIT_ASSERT_VALUES_EQUAL(port, 0); +        } +        { // ipv6 +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("unknown:///bebe", scheme, host, port); +            UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown://"); +            UNIT_ASSERT_VALUES_EQUAL(host, ""); +            UNIT_ASSERT_VALUES_EQUAL(port, 0); +        } +        // port overflow +        auto testCase = []() { +            TStringBuf scheme("unknown"), host("unknown"); +            ui16 port = 0; +            GetSchemeHostAndPort("https://ya.ru:65536/bebe", scheme, host, port); +        }; +        UNIT_ASSERT_EXCEPTION(testCase(), yexception); +    } + +    Y_UNIT_TEST(TestCutUrlPrefixes) { +        UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutUrlPrefixes("http://ya.ru/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("yaru", CutUrlPrefixes("yaru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("yaru://ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ftp://ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("", CutUrlPrefixes("https://")); + +        UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutUrlPrefixes("https://www.ya.ru/bebe")); +        UNIT_ASSERT_VALUES_EQUAL("yaru", CutUrlPrefixes("www.yaru")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("yaru://www.ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("www.ya.ru://zzz")); +        
UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ftp://www.ya.ru://zzz")); +        UNIT_ASSERT_VALUES_EQUAL("", CutUrlPrefixes("http://www.")); +    } + +    Y_UNIT_TEST(TestUrlPathStartWithToken) { +        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe/zzz", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe?zzz", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe/", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe?", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("https://ya.ru/bebe", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/bebezzz", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/bebe.zzz", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://bebe", "bebe")); +        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("https://bebe/", "bebe")); +    } +} diff --git a/library/cpp/string_utils/url/ut/ya.make b/library/cpp/string_utils/url/ut/ya.make new file mode 100644 index 00000000000..0efa30e4d2c --- /dev/null +++ b/library/cpp/string_utils/url/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/string_utils/url) + +OWNER(g:util) + +SRCS( +    url_ut.cpp +) + +END() diff --git a/library/cpp/string_utils/url/ya.make b/library/cpp/string_utils/url/ya.make new file mode 100644 index 00000000000..b08d69ec83d --- /dev/null +++ b/library/cpp/string_utils/url/ya.make @@ -0,0 +1,10 @@ +LIBRARY() + +OWNER(g:util) + +SRCS( +    url.cpp +    url.h +) + +END()  | 
