intermediate changes

ref:cde9a383711a11544ce7e107a78147fb96cc4029
author: Devtools Arcadia <[email protected]> 2022-02-07 18:08:42 +0300
committer: Devtools Arcadia <[email protected]> 2022-02-07 18:08:42 +0300
commit: 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree: e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/url
5 files changed, 891 insertions, 0 deletions
diff --git a/library/cpp/string_utils/url/url.cpp b/library/cpp/string_utils/url/url.cpp
new file mode 100644
index 00000000000..85f4ac5d693
--- /dev/null
+++ b/library/cpp/string_utils/url/url.cpp
@@ -0,0 +1,421 @@
+#include "url.h"
+
+#include <util/string/cast.h>
+#include <util/string/util.h>
+#include <util/string/cstriter.h>
+#include <util/string/ascii.h>
+#include <util/string/strip.h>
+
+#include <util/charset/unidata.h> // for ToLower
+#include <util/system/defaults.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/hash_set.h>
+#include <util/generic/yexception.h>
+#include <util/generic/singleton.h>
+
+#include <cstdlib>
+#include <cstring>
+
+namespace {
+    // Bounds policy for input whose length is not known (a bare character
+    // pointer): every length query succeeds, so the caller relies on the
+    // comparison itself stopping at the first mismatching character.
+    struct TUncheckedSize {
+        static bool Has(size_t /*count*/) {
+            return true;
+        }
+    };
+
+    // Bounds policy for input of known length: Has(sz) tells whether at
+    // least sz characters are available.
+    struct TKnownSize {
+        size_t MySize;
+
+        explicit TKnownSize(size_t sz)
+            : MySize(sz)
+        {
+        }
+
+        bool Has(size_t sz) const {
+            return !(MySize < sz);
+        }
+    };
+
+    // Compares the first n characters of s1, each passed through ToLower,
+    // with s2 taken as is (the callers in this file pass lower-case literals).
+    // Returns 0 when all n positions match, otherwise -1 or 1 according to
+    // the first position that differs.
+    template <typename TChar1, typename TChar2>
+    int Compare1Case2(const TChar1* s1, const TChar2* s2, size_t n) {
+        for (size_t pos = 0; pos != n; ++pos) {
+            const TChar1 lowered = static_cast<TChar1>(ToLower(s1[pos]));
+            if (lowered == s2[pos]) {
+                continue;
+            }
+            return lowered < s2[pos] ? -1 : 1;
+        }
+        return 0;
+    }
+
+    // Returns 7 when url starts with "http://", 8 when it starts with
+    // "https://" (only checked if ignorehttps is false), and 0 otherwise.
+    // urlSize is one of the bounds policies above and is asked before each
+    // comparison whether enough characters are available.
+    template <typename TChar, typename TBounds>
+    inline size_t GetHttpPrefixSizeImpl(const TChar* url, const TBounds& urlSize, bool ignorehttps) {
+        const TChar httpPrefix[] = {'h', 't', 't', 'p', ':', '/', '/', 0};
+        const TChar httpsPrefix[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0};
+        const size_t httpLen = 7;
+        const size_t httpsLen = 8;
+
+        if (urlSize.Has(httpLen) && Compare1Case2(url, httpPrefix, httpLen) == 0) {
+            return httpLen;
+        }
+        if (ignorehttps) {
+            return 0;
+        }
+        if (urlSize.Has(httpsLen) && Compare1Case2(url, httpsPrefix, httpsLen) == 0) {
+            return httpsLen;
+        }
+        return 0;
+    }
+
+    // Returns url without its leading http(s) prefix, or url itself when
+    // GetHttpPrefixSizeImpl finds no such prefix.
+    template <typename T>
+    inline T CutHttpPrefixImpl(const T& url, bool ignorehttps) {
+        const TKnownSize bounds(url.size());
+        const size_t skip = GetHttpPrefixSizeImpl<typename T::char_type>(url.data(), bounds, ignorehttps);
+        if (skip == 0) {
+            return url;
+        }
+        return url.substr(skip);
+    }
+}
+
+namespace NUrl {
+
+    // Splits url into the "scheme + host + port" part (nothing is trimmed
+    // from it) and the remainder of the url that follows that part.
+    TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url) {
+        const TStringBuf schemeHostPort = GetSchemeHostAndPort(url, /*trimHttp=*/false, /*trimDefaultPort=*/false);
+        TStringBuf remainder = url;
+        remainder.SkipPrefix(schemeHostPort);
+        return {schemeHostPort, remainder};
+    }
+
+} // namespace NUrl
+
+// Length of a leading "http://" (or "https://" unless ignorehttps is set) in
+// the C string url, matched case-insensitively; 0 when there is no such prefix.
+size_t GetHttpPrefixSize(const char* url, bool ignorehttps) noexcept {
+    const TUncheckedSize noBounds{};
+    return GetHttpPrefixSizeImpl<char>(url, noBounds, ignorehttps);
+}
+
+// Wide-character counterpart of GetHttpPrefixSize(const char*, bool): the
+// length of the input is not known, so no bounds are checked.
+size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps) noexcept {
+    const TUncheckedSize noBounds{};
+    return GetHttpPrefixSizeImpl<wchar16>(url, noBounds, ignorehttps);
+}
+
+// Length of a leading "http://" (or "https://" unless ignorehttps is set) in
+// url; the comparison never looks beyond url.size() characters.
+size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps) noexcept {
+    const TKnownSize bounds(url.size());
+    return GetHttpPrefixSizeImpl<char>(url.data(), bounds, ignorehttps);
+}
+
+// Wide-character counterpart of GetHttpPrefixSize(TStringBuf, bool); bounded
+// by url.size().
+size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps) noexcept {
+    const TKnownSize bounds(url.size());
+    return GetHttpPrefixSizeImpl<wchar16>(url.data(), bounds, ignorehttps);
+}
+
+// Returns url without a leading "http://" / "https://" ("https://" is kept
+// when ignorehttps is true); url is returned unchanged if it has neither.
+TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps) noexcept {
+    const TStringBuf stripped = CutHttpPrefixImpl(url, ignorehttps);
+    return stripped;
+}
+
+// Wide-character counterpart of CutHttpPrefix(TStringBuf, bool).
+TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps) noexcept {
+    const TWtringBuf stripped = CutHttpPrefixImpl(url, ignorehttps);
+    return stripped;
+}
+
+// Returns the length of a leading "<scheme>://" in url, or 0 if there is none.
+//
+// The scheme is everything before the first delimiter character; that
+// delimiter has to be ':' and must be immediately followed by "//".
+// Delimiters are described by the str_spn pattern below (presumably the
+// ASCII ranges '!'..'/', ':'..'@', '['..'`' plus '{', '|', '}' - confirm
+// against the str_spn range syntax).
+size_t GetSchemePrefixSize(const TStringBuf url) noexcept {
+    struct TDelim: public str_spn {
+        inline TDelim()
+            : str_spn("!-/:-@[-`{|}", true)
+        {
+        }
+    };
+
+    const auto& delim = *Singleton<TDelim>();
+    const char* n = delim.brk(url.data(), url.end());
+
+    // Compare distances rather than computing n + 2: when fewer than two
+    // characters remain that sum would point past one-past-the-end, which is
+    // undefined behaviour.
+    if (url.end() - n < 3 || *n != ':' || n[1] != '/' || n[2] != '/') {
+        return 0;
+    }
+
+    return n + 3 - url.begin();
+}
+
+// Returns the leading "<scheme>://" of url, or an empty string if
+// GetSchemePrefixSize finds none.
+TStringBuf GetSchemePrefix(const TStringBuf url) noexcept {
+    const size_t prefixLen = GetSchemePrefixSize(url);
+    return url.Head(prefixLen);
+}
+
+// Returns url without its leading "<scheme>://"; url is returned whole when
+// GetSchemePrefixSize finds no scheme.
+TStringBuf CutSchemePrefix(const TStringBuf url) noexcept {
+    const size_t prefixLen = GetSchemePrefixSize(url);
+    return url.Tail(prefixLen);
+}
+
+// Skips a leading http(s) prefix and returns everything up to (not
+// including) the first of '/', ';', '?', '#'. When KeepPort is false, ':'
+// ends the host as well, so the port is dropped.
+template <bool KeepPort>
+static inline TStringBuf GetHostAndPortImpl(const TStringBuf url) {
+    struct THostEnd: public str_spn {
+        inline THostEnd()
+            : str_spn(KeepPort ? "/;?#" : "/:;?#")
+        {
+        }
+    };
+
+    const TStringBuf rest = url.Tail(GetHttpPrefixSize(url));
+
+    const auto& hostEnd = *Singleton<THostEnd>();
+    const char* stop = hostEnd.brk(rest.begin(), rest.end());
+
+    // When no terminator is found stop equals rest.end(), and the head
+    // taken below is the whole of rest.
+    return rest.Head(stop - rest.begin());
+}
+
+// Host part of url without the port (see GetHostAndPortImpl<false>).
+TStringBuf GetHost(const TStringBuf url) noexcept {
+    constexpr bool keepPort = false;
+    return GetHostAndPortImpl<keepPort>(url);
+}
+
+// Host part of url including ":port" if present (see GetHostAndPortImpl<true>).
+TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
+    constexpr bool keepPort = true;
+    return GetHostAndPortImpl<keepPort>(url);
+}
+
+// Returns the "scheme://host:port" part of url.
+//
+// trimDefaultPort: drop ":80" for http (or scheme-less) urls and ":443" for
+//                  https urls.
+// trimHttp:        drop the scheme when it is "http://" or absent.
+// Scheme comparison is exact (case-sensitive).
+TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
+    const size_t schemeLen = GetSchemePrefixSize(url);
+    const TStringBuf scheme = url.Head(schemeLen);
+    const bool httpLike = (schemeLen == 0 || scheme == TStringBuf("http://"));
+
+    TStringBuf authority = GetHostAndPort(url.Tail(schemeLen));
+
+    const size_t colon = trimDefaultPort ? authority.find(':') : TStringBuf::npos;
+    if (colon != TStringBuf::npos) {
+        const TStringBuf port = authority.Tail(colon + 1);
+        const bool defaultHttpPort = httpLike && port == TStringBuf("80");
+        const bool defaultHttpsPort = scheme == TStringBuf("https://") && port == TStringBuf("443");
+        if (defaultHttpPort || defaultHttpsPort) {
+            // trimming default port
+            authority = authority.Head(colon);
+        }
+    }
+
+    if (trimHttp && httpLike) {
+        return authority;
+    }
+    // scheme and authority are adjacent pieces of url, so one view spans both
+    return TStringBuf(scheme.begin(), authority.end());
+}
+
+// Out-parameter wrapper over NUrl::SplitUrlToHostAndPath: host and path are
+// assigned views of the corresponding parts of url.
+void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path) {
+    const NUrl::TSplitUrlToHostAndPathResult parts = NUrl::SplitUrlToHostAndPath(url);
+    host = parts.host;
+    path = parts.path;
+}
+
+// Out-parameter wrapper over NUrl::SplitUrlToHostAndPath that stores the
+// two parts into TString objects.
+void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path) {
+    const NUrl::TSplitUrlToHostAndPathResult parts = NUrl::SplitUrlToHostAndPath(url);
+    host = parts.host;
+    path = parts.path;
+}
+
+// Splits url at the first '#' into "everything before" and fragment, then
+// splits the former at the first '?' into sanitizedUrl and query. A missing
+// part is reported as an empty string.
+void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment) {
+    TStringBuf beforeFragment;
+    const bool hasFragment = url.TrySplit('#', beforeFragment, fragment);
+    if (!hasFragment) {
+        beforeFragment = url;
+        fragment = "";
+    }
+
+    const bool hasQuery = beforeFragment.TrySplit('?', sanitizedUrl, query);
+    if (!hasQuery) {
+        sanitizedUrl = beforeFragment;
+        query = "";
+    }
+}
+
+// Extracts scheme, host and port from url.
+//
+// scheme is assigned only when url has a "<scheme>://" prefix. When the
+// host part has an explicit ":port" it is parsed into port (returning false
+// if that fails); otherwise port is set to 443 / 80 for the "https://" /
+// "http://" schemes and left untouched for any other scheme.
+// A host part ending in ']' is never split at ':', which keeps bracketed
+// IPv6 literals intact.
+bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
+    const size_t schemeLen = GetSchemePrefixSize(url);
+    if (schemeLen != 0) {
+        scheme = url.Head(schemeLen);
+    }
+
+    const TStringBuf authority = GetHostAndPort(url.Tail(schemeLen));
+
+    TStringBuf portText;
+    const bool hasExplicitPort = !authority.empty() && authority.back() != ']' && authority.TryRSplit(':', host, portText);
+    if (hasExplicitPort) {
+        // URL has port
+        return TryFromString(portText, port);
+    }
+
+    host = authority;
+    if (scheme == TStringBuf("https://")) {
+        port = 443;
+    } else if (scheme == TStringBuf("http://")) {
+        port = 80;
+    }
+    return true;
+}
+
+// Same as TryGetSchemeHostAndPort, but a port that cannot be parsed is
+// reported through Y_ENSURE instead of a false return value.
+void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port) {
+    const bool parsed = TryGetSchemeHostAndPort(url, scheme, host, port);
+    Y_ENSURE(parsed, "cannot parse port number from URL: " << url);
+}
+
+// Host of url with any "<scheme>://" prefix and the port removed.
+TStringBuf GetOnlyHost(const TStringBuf url) noexcept {
+    const TStringBuf schemeless = CutSchemePrefix(url);
+    return GetHost(schemeless);
+}
+
+// Returns the part of url starting at the first '/' that follows the
+// http(s) prefix, or "/" when there is no such slash. With trimFragment the
+// result is cut before the first '#'.
+TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment) noexcept {
+    const size_t slash = url.find('/', GetHttpPrefixSize(url));
+    if (slash == TStringBuf::npos) {
+        return "/";
+    }
+
+    const TStringBuf path = url.Tail(slash);
+    if (trimFragment) {
+        return path.Before('#');
+    }
+    return path;
+}
+
+// this strange creature returns 2nd level domain, possibly with port
+// this strange creature returns 2nd level domain, possibly with port
+//
+// Scans host backwards and returns the tail that starts right after the
+// second '.' seen from the end; the whole host is returned when fewer than
+// two dots are found. The first character of host is never inspected.
+TStringBuf GetDomain(const TStringBuf host) noexcept {
+    const char* const first = host.data();
+    const char* cursor = host.empty() ? first : host.end() - 1;
+
+    bool seenDot = false;
+    while (cursor != first) {
+        if (*cursor == '.') {
+            if (seenDot) {
+                // second dot from the end: the domain starts just after it
+                ++cursor;
+                break;
+            }
+            seenDot = true;
+        }
+        --cursor;
+    }
+    return TStringBuf(cursor, host.end());
+}
+
+// Returns the last `level` dot-separated labels of host, e.g.
+// ("www.ya.ru", 2) -> "ya.ru". The whole host is returned when it has fewer
+// than `level` dots; level 0 yields an empty string.
+TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept {
+    size_t dot = host.size();
+    for (size_t remaining = level; remaining > 0; --remaining) {
+        dot = host.rfind('.', dot);
+        if (dot == TString::npos) {
+            return host;
+        }
+    }
+    return host.SubStr(dot + 1);
+}
+
+// Last dot-separated label of host (GetParentDomain with level 1).
+TStringBuf GetZone(const TStringBuf host) noexcept {
+    const size_t zoneLevel = 1;
+    return GetParentDomain(host, zoneLevel);
+}
+
+// Removes a leading "www." (letters compared case-insensitively) from url;
+// url is returned unchanged when it does not start with it.
+TStringBuf CutWWWPrefix(const TStringBuf url) noexcept {
+    if (url.size() < 4) {
+        return url;
+    }
+    if (url[3] != '.') {
+        return url;
+    }
+    if (strnicmp(url.data(), "www", 3) != 0) {
+        return url;
+    }
+    return url.substr(4);
+}
+
+// Removes a leading prefix made of one or more 'w'/'W', then zero or more
+// ASCII digits, then a '.', e.g. "www2.ya.ru" -> "ya.ru". url is returned
+// unchanged when it does not start with such a prefix.
+TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept {
+    const auto first = url.begin();
+    const auto last = url.end();
+    auto cursor = first;
+
+    // run of 'w' / 'W'; at least one is required
+    while (cursor < last && (*cursor == 'w' || *cursor == 'W')) {
+        ++cursor;
+    }
+    if (cursor == first) {
+        return url;
+    }
+
+    // optional run of digits
+    while (cursor < last && IsAsciiDigit(*cursor)) {
+        ++cursor;
+    }
+    if (cursor == last) {
+        return url;
+    }
+
+    if (*cursor != '.') {
+        return url;
+    }
+    ++cursor;
+    return url.Tail(cursor - first);
+}
+
+// Removes a leading "m." or "M." from url; url is returned unchanged when
+// it does not start with it.
+TStringBuf CutMPrefix(const TStringBuf url) noexcept {
+    if (url.size() < 2 || url[1] != '.') {
+        return url;
+    }
+    const char lead = url[0];
+    if (lead != 'm' && lead != 'M') {
+        return url;
+    }
+    return url.substr(2);
+}
+
+// Tells whether c may appear in a URL scheme as far as HasPrefix is
+// concerned: ASCII letters and digits only.
+static inline bool IsSchemeChar(char c) noexcept {
+    const bool alnum = IsAsciiAlnum(c);
+    return alnum; //what about '+' ?..
+}
+
+// True when url contains "://" and everything before its first occurrence
+// consists of scheme characters (see IsSchemeChar).
+static bool HasPrefix(const TStringBuf url) noexcept {
+    TStringBuf scheme, unused;
+    const bool hasSeparator = url.TrySplit(TStringBuf("://"), scheme, unused);
+    return hasSeparator && AllOf(scheme, IsSchemeChar);
+}
+
+// Same as AddSchemePrefix(url, "http").
+TString AddSchemePrefix(const TString& url) {
+    const TStringBuf defaultScheme("http");
+    return AddSchemePrefix(url, defaultScheme);
+}
+
+// Returns url prefixed with "<scheme>://", or url unchanged when HasPrefix
+// reports that it already carries a scheme.
+TString AddSchemePrefix(const TString& url, TStringBuf scheme) {
+    if (!HasPrefix(url)) {
+        return TString::Join(scheme, TStringBuf("://"), url);
+    }
+
+    return url;
+}
+
+// Decodes the two hexadecimal digits at x[0], x[1] into their byte value.
+// Returns -1 when either character is not a hex digit; x[1] is not read
+// when x[0] already fails the check.
+static inline int x2c(unsigned char* x) {
+    if (!(IsAsciiHex(x[0]) && IsAsciiHex(x[1]))) {
+        return -1;
+    }
+
+    // value of one hex digit; (c & 0xdf) folds lower-case letters to upper case
+    const auto nibble = [](int c) {
+        return c >= 'A' ? ((c & 0xdf) - 'A') + 10 : (c - '0');
+    };
+    return nibble(x[0]) * 16 + nibble(x[1]);
+}
+
+// Decodes %XX escapes of the NUL-terminated string str in place.
+//
+// An escape whose two digits are not valid hex, or which decodes to zero,
+// is replaced by the character '0'. Returns the number of characters by
+// which the string became shorter.
+//
+// A '%' followed by fewer than two characters (a truncated escape at the
+// end of the string) consumes only the characters that actually exist, so
+// the scan never steps over the terminating NUL.
+static inline int Unescape(char* str) {
+    int dlen = 0;
+    str = strchr(str, '%');
+    if (str == nullptr) {
+        // nothing to decode
+        return dlen;
+    }
+
+    char* to = str;
+    char* from = str;
+    while (*from) {
+        if (*from != '%') {
+            *to++ = *from++;
+            continue;
+        }
+
+        // x2c stops at the first non-hex character, hence it cannot read
+        // past the terminator either
+        const int c = x2c((unsigned char*)from + 1);
+        *to++ = char((c > 0) ? c : '0');
+        ++from; // the '%' itself
+
+        // skip the (up to) two characters that belonged to the escape
+        for (int skipped = 0; skipped < 2 && *from; ++skipped) {
+            ++from;
+            ++dlen;
+        }
+    }
+    *to = 0; /* terminate it at the new length */
+    return dlen;
+}
+
+// Writes a normalized copy of source into dest (a buffer of dest_size
+// bytes): the text is truncated to fit, %XX escapes are decoded and the
+// result is lower-cased with strlwr. An empty source, or one starting with
+// '?', produces "/".
+//
+// Returns the value of strlcpy for the "/" case, otherwise the length of the
+// copied text minus the number of characters removed by unescaping.
+// With dest_size == 0 nothing is written and 0 is returned.
+size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size) {
+    if (source.empty() || source[0] == '?')
+        return strlcpy(dest, "/", dest_size);
+    if (dest_size == 0) {
+        // dest_size - 1 below would wrap around and overrun the buffer
+        return 0;
+    }
+    size_t len = Min(dest_size - 1, source.length());
+    memcpy(dest, source.data(), len);
+    dest[len] = 0;
+    len -= Unescape(dest);
+    strlwr(dest);
+    return len;
+}
+
+// Writes a normalized copy of source into dest (a buffer of dest_size
+// bytes): the text is truncated to fit, a trailing ":<defport>" is removed
+// and the result is lower-cased with strlwr.
+//
+// Returns the length of the copied text, reduced by the length of the
+// removed port suffix. With dest_size == 0 nothing is written and 0 is
+// returned.
+size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport) {
+    if (dest_size == 0) {
+        // dest_size - 1 below would wrap around and overrun the buffer
+        return 0;
+    }
+    size_t len = Min(dest_size - 1, source.length());
+    memcpy(dest, source.data(), len);
+    dest[len] = 0;
+
+    // buf = ":" followed by the decimal default port
+    char buf[8] = ":";
+    size_t buflen = 1 + ToString(defport, buf + 1, sizeof(buf) - 2);
+    buf[buflen] = '\0';
+
+    // Look at the end of the string only: searching for the first
+    // occurrence misses the suffix when the same text also appears earlier,
+    // e.g. in "[::80]:80".
+    const size_t textLen = strlen(dest);
+    if (textLen >= buflen && memcmp(dest + textLen - buflen, buf, buflen) == 0) {
+        dest[textLen - buflen] = 0;
+        len -= buflen;
+    }
+    strlwr(dest);
+    return len;
+}
+
+// Returns str without one trailing '/', or str unchanged if it does not
+// end with a slash.
+TStringBuf RemoveFinalSlash(TStringBuf str) noexcept {
+    if (!str.EndsWith('/')) {
+        return str;
+    }
+    return str.Head(str.size() - 1);
+}
+
+// Removes the "<scheme>://" prefix and then a leading "www." from url.
+TStringBuf CutUrlPrefixes(TStringBuf url) noexcept {
+    const TStringBuf schemeless = CutSchemePrefix(url);
+    return CutWWWPrefix(schemeless);
+}
+
+// Tells whether the text after the first '/' of the scheme-less url starts
+// with token as a whole path element, i.e. token is followed by '/', by '?'
+// or by the end of the url. A url without any '/' after the scheme yields
+// false.
+bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept {
+    const TStringBuf schemeless = CutSchemePrefix(url);
+    const TStringBuf pathTail = schemeless.After('/');
+    if (pathTail == schemeless) {
+        // no slash => no suffix with token info
+        return false;
+    }
+    if (!pathTail.StartsWith(token)) {
+        return false;
+    }
+
+    const size_t tokenLen = token.length();
+    if (pathTail.length() <= tokenLen) {
+        // nothing follows the token
+        return true;
+    }
+    const char next = pathTail[tokenLen];
+    return next == '/' || next == '?';
+}
+
diff --git a/library/cpp/string_utils/url/url.h b/library/cpp/string_utils/url/url.h
new file mode 100644
index 00000000000..84137ccc57d
--- /dev/null
+++ b/library/cpp/string_utils/url/url.h
@@ -0,0 +1,170 @@
+#pragma once
+
+#include <util/generic/fwd.h>
+#include <util/generic/strbuf.h>
+
+namespace NUrl {
+
+    /**
+     * Result of SplitUrlToHostAndPath(): a URL split into host and path.
+     * Example:
+     * auto [host, path] = SplitUrlToHostAndPath(url);
+     *
+     * @param[in] url                   any URL
+     * @return <host, path>             parsed host and path
+     */
+    struct TSplitUrlToHostAndPathResult {
+        TStringBuf host; // scheme, host and port part of the URL, e.g. "https://yandex.ru"
+        TStringBuf path; // remainder of the URL after the host part, e.g. "/yandsearch"; may be empty
+    };
+
+    Y_PURE_FUNCTION
+    TSplitUrlToHostAndPathResult SplitUrlToHostAndPath(const TStringBuf url);
+
+} // namespace NUrl
+
+Y_PURE_FUNCTION
+size_t GetHttpPrefixSize(const char* url, bool ignorehttps = false) noexcept;
+Y_PURE_FUNCTION
+size_t GetHttpPrefixSize(const wchar16* url, bool ignorehttps = false) noexcept;
+
+Y_PURE_FUNCTION
+size_t GetHttpPrefixSize(const TStringBuf url, bool ignorehttps = false) noexcept;
+
+Y_PURE_FUNCTION
+size_t GetHttpPrefixSize(const TWtringBuf url, bool ignorehttps = false) noexcept;
+
+/** BEWARE: these functions take and return TStringBuf. Unlike TString it offers
+    no operator ~ or c_str() (its data is not guaranteed to be null-terminated). */
+Y_PURE_FUNCTION
+size_t GetSchemePrefixSize(const TStringBuf url) noexcept;
+
+Y_PURE_FUNCTION
+TStringBuf GetSchemePrefix(const TStringBuf url) noexcept;
+
+//! removes protocol prefixes 'http://' and 'https://' from given URL
+//! @note if URL has no prefix or some other prefix the function does nothing
+//! @param url    URL from which the prefix should be removed
+//! @param ignorehttps if true, leaves https://
+//! @return a new URL without protocol prefix
+Y_PURE_FUNCTION
+TStringBuf CutHttpPrefix(const TStringBuf url, bool ignorehttps = false) noexcept;
+
+Y_PURE_FUNCTION
+TWtringBuf CutHttpPrefix(const TWtringBuf url, bool ignorehttps = false) noexcept;
+
+Y_PURE_FUNCTION
+TStringBuf CutSchemePrefix(const TStringBuf url) noexcept;
+
+//! adds specified scheme prefix if URL has no scheme
+//! @note if URL has scheme prefix already the function returns unchanged URL
+TString AddSchemePrefix(const TString& url, const TStringBuf scheme);
+
+//! Same as `AddSchemePrefix(url, "http")`.
+TString AddSchemePrefix(const TString& url);
+
+Y_PURE_FUNCTION
+TStringBuf GetHost(const TStringBuf url) noexcept;
+
+Y_PURE_FUNCTION
+TStringBuf GetHostAndPort(const TStringBuf url) noexcept;
+
+Y_PURE_FUNCTION
+TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp = true, bool trimDefaultPort = true) noexcept;
+
+/**
+ * Splits URL to host and path
+ *
+ * @param[in] url       any URL
+ * @param[out] host     parsed host
+ * @param[out] path     parsed path
+ */
+void SplitUrlToHostAndPath(const TStringBuf url, TStringBuf& host, TStringBuf& path);
+void SplitUrlToHostAndPath(const TStringBuf url, TString& host, TString& path);
+
+/**
+ * Separates URL into url prefix, query (aka cgi params list), and fragment (aka part after #)
+ *
+ * @param[in] url               any URL
+ * @param[out] sanitizedUrl     parsed URL without query and fragment parts
+ * @param[out] query            parsed query
+ * @param[out] fragment         parsed fragment
+ */
+void SeparateUrlFromQueryAndFragment(const TStringBuf url, TStringBuf& sanitizedUrl, TStringBuf& query, TStringBuf& fragment);
+
+/**
+ * Extracts scheme, host and port from URL.
+ *
+ * Port will be parsed from URL with checks against ui16 overflow. If URL doesn't
+ * contain port it will be determined by one of the known schemes (currently
+ * https:// and http:// only).
+ * Given parameters will not be modified if URL has no appropriate components.
+ *
+ * @param[in] url       any URL
+ * @param[out] scheme   URL scheme
+ * @param[out] host     host name
+ * @param[out] port     parsed port number
+ * @return false if present port number cannot be parsed into ui16
+ *         true  otherwise.
+ */
+bool TryGetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port);
+
+/**
+ * Extracts scheme, host and port from URL.
+ *
+ * This function perform the same actions as TryGetSchemeHostAndPort(), but in
+ * case of impossibility to parse port number throws yexception.
+ *
+ * @param[in] url       any URL
+ * @param[out] scheme   URL scheme
+ * @param[out] host     host name
+ * @param[out] port     parsed port number
+ * @throws yexception  if present port number cannot be parsed into ui16.
+ */
+void GetSchemeHostAndPort(const TStringBuf url, TStringBuf& scheme, TStringBuf& host, ui16& port);
+
+Y_PURE_FUNCTION
+TStringBuf GetPathAndQuery(const TStringBuf url, bool trimFragment = true) noexcept;
+/**
+ * Extracts host from url and cuts the scheme prefix (any "scheme://", not only http/https) and port if any.
+ * @param[in] url   any URL
+ * @return          host without port and scheme prefix.
+ */
+Y_PURE_FUNCTION
+TStringBuf GetOnlyHost(const TStringBuf url) noexcept;
+
+Y_PURE_FUNCTION
+TStringBuf GetParentDomain(const TStringBuf host, size_t level) noexcept; // ("www.ya.ru", 2) -> "ya.ru"
+
+Y_PURE_FUNCTION
+TStringBuf GetZone(const TStringBuf host) noexcept;
+
+Y_PURE_FUNCTION
+TStringBuf CutWWWPrefix(const TStringBuf url) noexcept;
+
+Y_PURE_FUNCTION
+TStringBuf CutWWWNumberedPrefix(const TStringBuf url) noexcept;
+
+/**
+ * Cuts 'm.' prefix from url if and only if the url starts with it
+ * Example: 'm.some-domain.com' -> 'some-domain.com'.
+ * 'http://m.some-domain.com' is not changed
+ *
+ * @param[in] url   any URL
+ * @return          url without 'm.' or 'M.' prefix.
+ */
+Y_PURE_FUNCTION
+TStringBuf CutMPrefix(const TStringBuf url) noexcept;
+
+Y_PURE_FUNCTION
+TStringBuf GetDomain(const TStringBuf host) noexcept; // should not be used
+
+size_t NormalizeUrlName(char* dest, const TStringBuf source, size_t dest_size);
+size_t NormalizeHostName(char* dest, const TStringBuf source, size_t dest_size, ui16 defport = 80);
+
+Y_PURE_FUNCTION
+TStringBuf RemoveFinalSlash(TStringBuf str) noexcept;
+
+TStringBuf CutUrlPrefixes(TStringBuf url) noexcept;
+bool DoesUrlPathStartWithToken(TStringBuf url, const TStringBuf& token) noexcept;
+
diff --git a/library/cpp/string_utils/url/url_ut.cpp b/library/cpp/string_utils/url/url_ut.cpp
new file mode 100644
index 00000000000..15880138939
--- /dev/null
+++ b/library/cpp/string_utils/url/url_ut.cpp
@@ -0,0 +1,281 @@
+#include "url.h"
+
+#include <util/string/cast.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+Y_UNIT_TEST_SUITE(TUtilUrlTest) {
+    Y_UNIT_TEST(TestGetHostAndGetHostAndPort) { // GetHost drops the port, GetHostAndPort keeps it; both drop the http(s) prefix, path and query
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru:8080"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru:8080", GetHostAndPort("ya.ru:8080"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru/bebe:8080"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHostAndPort("ya.ru/bebe:8080"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("ya.ru:8080/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetHost("https://ya.ru:8080/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetHost("www.ya.ru:8080/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetHost("https://www.ya.ru:8080/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru:8080", GetHostAndPort("ya.ru:8080/bebe"));
+        // in real life RFC 3986 is sometimes ignored: the query may follow the host directly
+        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetHost("pravda-kmv.ru?page=news&id=6973"));
+        UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetHostAndPort("pravda-kmv.ru?page=news&id=6973"));
+        // a plain string without any URL delimiters is returned as is
+        UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetHost("some_blender_url"));
+        UNIT_ASSERT_VALUES_EQUAL("", GetHost(""));
+    }
+
+    Y_UNIT_TEST(TestGetPathAndQuery) { // the path defaults to "/"; the fragment is cut unless trimFragment is false
+        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org"));
+        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/"));
+        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org:8080"));
+        UNIT_ASSERT_VALUES_EQUAL("/index.php?123/", GetPathAndQuery("ru.wikipedia.org/index.php?123/"));
+        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("http://ru.wikipedia.org:8080"));
+        UNIT_ASSERT_VALUES_EQUAL("/index.php?123/", GetPathAndQuery("https://ru.wikipedia.org/index.php?123/"));
+        UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/#comment"));
+        UNIT_ASSERT_VALUES_EQUAL("/?1", GetPathAndQuery("ru.wikipedia.org/?1#comment"));
+        UNIT_ASSERT_VALUES_EQUAL("/?1#comment", GetPathAndQuery("ru.wikipedia.org/?1#comment", false));
+    }
+
+    Y_UNIT_TEST(TestGetDomain) { // GetDomain keeps the last two dot-separated labels of the host
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("www.ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("a.b.ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetDomain("ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya", GetDomain("ya"));
+        UNIT_ASSERT_VALUES_EQUAL("", GetDomain(""));
+    }
+
+    Y_UNIT_TEST(TestGetParentDomain) { // level N keeps the last N labels; a level beyond the label count returns the whole host
+        UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("www.ya.ru", 0));
+        UNIT_ASSERT_VALUES_EQUAL("ru", GetParentDomain("www.ya.ru", 1));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetParentDomain("www.ya.ru", 2));
+        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetParentDomain("www.ya.ru", 3));
+        UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetParentDomain("www.ya.ru", 4));
+        UNIT_ASSERT_VALUES_EQUAL("com", GetParentDomain("ya.com", 1));
+        UNIT_ASSERT_VALUES_EQUAL("ya.com", GetParentDomain("ya.com", 2));
+        UNIT_ASSERT_VALUES_EQUAL("RU", GetParentDomain("RU", 1));
+        UNIT_ASSERT_VALUES_EQUAL("RU", GetParentDomain("RU", 2));
+        UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("", 0));
+        UNIT_ASSERT_VALUES_EQUAL("", GetParentDomain("", 1));
+    }
+
+    Y_UNIT_TEST(TestGetZone) { // the zone is the last dot-separated label, returned with its case untouched
+        UNIT_ASSERT_VALUES_EQUAL("ru", GetZone("www.ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("com", GetZone("ya.com"));
+        UNIT_ASSERT_VALUES_EQUAL("RU", GetZone("RU"));
+        UNIT_ASSERT_VALUES_EQUAL("FHFBN", GetZone("ya.FHFBN"));
+        UNIT_ASSERT_VALUES_EQUAL("", GetZone(""));
+    }
+
+    Y_UNIT_TEST(TestAddSchemePrefix) { // "http" is the default scheme; a URL that already has a scheme is left unchanged
+        UNIT_ASSERT_VALUES_EQUAL("http://yandex.ru", AddSchemePrefix("yandex.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("http://yandex.ru", AddSchemePrefix("http://yandex.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("https://yandex.ru", AddSchemePrefix("https://yandex.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("file://yandex.ru", AddSchemePrefix("file://yandex.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", AddSchemePrefix("ya.ru", "ftp"));
+    }
+
+    Y_UNIT_TEST(TestSchemeGet) { // only a "://" preceded by a delimiter-free scheme at the very start counts as a scheme prefix
+        UNIT_ASSERT_VALUES_EQUAL("http://", GetSchemePrefix("http://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("", GetSchemePrefix("yaru"));
+        UNIT_ASSERT_VALUES_EQUAL("yaru://", GetSchemePrefix("yaru://ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("", GetSchemePrefix("ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("ftp://", GetSchemePrefix("ftp://ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("https://", GetSchemePrefix("https://")); // is that right?
+    }
+
+    Y_UNIT_TEST(TestSchemeCut) { // CutSchemePrefix removes any scheme; CutHttpPrefix removes only http:// and (unless ignorehttps) https://
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutSchemePrefix("http://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("yaru", CutSchemePrefix("yaru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("yaru://ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutSchemePrefix("ftp://ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("", CutSchemePrefix("https://")); // is that right?
+
+        UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", CutHttpPrefix("ftp://ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("http://ya.ru/zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("http://ya.ru/zzz", true));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru/zzz", CutHttpPrefix("https://ya.ru/zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("https://ya.ru/zzz", CutHttpPrefix("https://ya.ru/zzz", true));
+        UNIT_ASSERT_VALUES_EQUAL("", CutHttpPrefix("https://"));               // is that right?
+        UNIT_ASSERT_VALUES_EQUAL("https://", CutHttpPrefix("https://", true)); // is that right?
+    }
+
+    Y_UNIT_TEST(TestMisc) { // prefix cutters: CutWWWPrefix, CutWWWNumberedPrefix and CutMPrefix
+        UNIT_ASSERT_VALUES_EQUAL("", CutWWWPrefix("www."));
+        UNIT_ASSERT_VALUES_EQUAL("", CutWWWPrefix("WwW."));
+        UNIT_ASSERT_VALUES_EQUAL("www", CutWWWPrefix("www"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWPrefix("www.ya.ru"));
+
+        UNIT_ASSERT_VALUES_EQUAL("", CutWWWNumberedPrefix("www."));
+        UNIT_ASSERT_VALUES_EQUAL("www", CutWWWNumberedPrefix("www"));
+        UNIT_ASSERT_VALUES_EQUAL("www27", CutWWWNumberedPrefix("www27"));
+        UNIT_ASSERT_VALUES_EQUAL("", CutWWWNumberedPrefix("www27."));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www.ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www2.ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("www12.ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutWWWNumberedPrefix("ww2.ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("w1w2w3.ya.ru", CutWWWNumberedPrefix("w1w2w3.ya.ru"));
+        UNIT_ASSERT_VALUES_EQUAL("123.ya.ru", CutWWWNumberedPrefix("123.ya.ru"));
+
+        UNIT_ASSERT_VALUES_EQUAL("", CutMPrefix("m."));
+        UNIT_ASSERT_VALUES_EQUAL("", CutMPrefix("M."));
+        UNIT_ASSERT_VALUES_EQUAL("m", CutMPrefix("m"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru", CutMPrefix("m.ya.ru"));
+    }
+
+    Y_UNIT_TEST(TestSplitUrlToHostAndPath) { // host keeps the scheme; path is the rest and may be empty
+        TStringBuf host, path;
+
+        SplitUrlToHostAndPath("https://yandex.ru/yandsearch", host, path);
+        UNIT_ASSERT_STRINGS_EQUAL(host, "https://yandex.ru");
+        UNIT_ASSERT_STRINGS_EQUAL(path, "/yandsearch");
+
+        SplitUrlToHostAndPath("yandex.ru/yandsearch", host, path);
+        UNIT_ASSERT_STRINGS_EQUAL(host, "yandex.ru");
+        UNIT_ASSERT_STRINGS_EQUAL(path, "/yandsearch");
+
+        SplitUrlToHostAndPath("https://yandex.ru", host, path);
+        UNIT_ASSERT_STRINGS_EQUAL(host, "https://yandex.ru");
+        UNIT_ASSERT_STRINGS_EQUAL(path, "");
+
+        SplitUrlToHostAndPath("invalid url /", host, path);
+        UNIT_ASSERT_STRINGS_EQUAL(host, "invalid url ");
+        UNIT_ASSERT_STRINGS_EQUAL(path, "/");
+
+        SplitUrlToHostAndPath("some_blender_url", host, path);
+        UNIT_ASSERT_STRINGS_EQUAL(host, "some_blender_url");
+        UNIT_ASSERT_STRINGS_EQUAL(path, "");
+    }
+
+    Y_UNIT_TEST(TestSeparateUrlFromQueryAndFragment) { // covers all four combinations of query / fragment being present or absent
+        TStringBuf sanitizedUrl, query, fragment;
+
+        SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch", sanitizedUrl, query, fragment);
+        UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch");
+        UNIT_ASSERT_STRINGS_EQUAL(query, "");
+        UNIT_ASSERT_STRINGS_EQUAL(fragment, "");
+
+        SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch?param1=val1&param2=val2", sanitizedUrl, query, fragment);
+        UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch");
+        UNIT_ASSERT_STRINGS_EQUAL(query, "param1=val1&param2=val2");
+        UNIT_ASSERT_STRINGS_EQUAL(fragment, "");
+
+        SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch#fragment", sanitizedUrl, query, fragment);
+        UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch");
+        UNIT_ASSERT_STRINGS_EQUAL(query, "");
+        UNIT_ASSERT_STRINGS_EQUAL(fragment, "fragment");
+
+        SeparateUrlFromQueryAndFragment("https://yandex.ru/yandsearch?param1=val1&param2=val2#fragment", sanitizedUrl, query, fragment);
+        UNIT_ASSERT_STRINGS_EQUAL(sanitizedUrl, "https://yandex.ru/yandsearch");
+        UNIT_ASSERT_STRINGS_EQUAL(query, "param1=val1&param2=val2");
+        UNIT_ASSERT_STRINGS_EQUAL(fragment, "fragment");
+    }
+
+    Y_UNIT_TEST(TestGetSchemeHostAndPort) { // outputs start as "unknown" / 0 so that untouched out-parameters can be detected
+        { // all components are present
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("https://ya.ru:8080/bebe", scheme, host, port);
+            UNIT_ASSERT_VALUES_EQUAL(scheme, "https://");
+            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru");
+            UNIT_ASSERT_VALUES_EQUAL(port, 8080);
+        }
+        { // scheme is absent
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("ya.ru:8080/bebe", scheme, host, port);
+            UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown");
+            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru");
+            UNIT_ASSERT_VALUES_EQUAL(port, 8080);
+        }
+        { // scheme and port are absent
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("ya.ru/bebe", scheme, host, port);
+            UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown");
+            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru");
+            UNIT_ASSERT_VALUES_EQUAL(port, 0);
+        }
+        { // port is absent, but returned its default value for HTTP
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("http://ya.ru/bebe", scheme, host, port);
+            UNIT_ASSERT_VALUES_EQUAL(scheme, "http://");
+            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru");
+            UNIT_ASSERT_VALUES_EQUAL(port, 80);
+        }
+        { // port is absent, but returned its default value for HTTPS
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("https://ya.ru/bebe", scheme, host, port);
+            UNIT_ASSERT_VALUES_EQUAL(scheme, "https://");
+            UNIT_ASSERT_VALUES_EQUAL(host, "ya.ru");
+            UNIT_ASSERT_VALUES_EQUAL(port, 443);
+        }
+        { // ipv6 with scheme and explicit port
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("https://[1080:0:0:0:8:800:200C:417A]:443/bebe", scheme, host, port);
+            UNIT_ASSERT_VALUES_EQUAL(scheme, "https://");
+            UNIT_ASSERT_VALUES_EQUAL(host, "[1080:0:0:0:8:800:200C:417A]");
+            UNIT_ASSERT_VALUES_EQUAL(port, 443);
+        }
+        { // ipv6 without scheme and port
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("[::1]/bebe", scheme, host, port);
+            UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown");
+            UNIT_ASSERT_VALUES_EQUAL(host, "[::1]");
+            UNIT_ASSERT_VALUES_EQUAL(port, 0);
+        }
+        { // unrecognized scheme with an empty host: port is left untouched
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("unknown:///bebe", scheme, host, port);
+            UNIT_ASSERT_VALUES_EQUAL(scheme, "unknown://");
+            UNIT_ASSERT_VALUES_EQUAL(host, "");
+            UNIT_ASSERT_VALUES_EQUAL(port, 0);
+        }
+        // port overflow
+        auto testCase = []() {
+            TStringBuf scheme("unknown"), host("unknown");
+            ui16 port = 0;
+            GetSchemeHostAndPort("https://ya.ru:65536/bebe", scheme, host, port);
+        };
+        UNIT_ASSERT_EXCEPTION(testCase(), yexception);
+    }
+
+    Y_UNIT_TEST(TestCutUrlPrefixes) { // CutUrlPrefixes removes the scheme first and then a leading "www."
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutUrlPrefixes("http://ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("yaru", CutUrlPrefixes("yaru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("yaru://ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ftp://ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("", CutUrlPrefixes("https://"));
+
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru/bebe", CutUrlPrefixes("https://www.ya.ru/bebe"));
+        UNIT_ASSERT_VALUES_EQUAL("yaru", CutUrlPrefixes("www.yaru"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("yaru://www.ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("www.ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("ya.ru://zzz", CutUrlPrefixes("ftp://www.ya.ru://zzz"));
+        UNIT_ASSERT_VALUES_EQUAL("", CutUrlPrefixes("http://www."));
+    }
+
+    Y_UNIT_TEST(TestUrlPathStartWithToken) { // the token must be the whole first path element: followed by '/', '?' or the end of the URL
+        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe/zzz", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe?zzz", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe/", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("http://ya.ru/bebe?", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(true, DoesUrlPathStartWithToken("https://ya.ru/bebe", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/bebezzz", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/bebe.zzz", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru/", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://ya.ru", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("http://bebe", "bebe"));
+        UNIT_ASSERT_VALUES_EQUAL(false, DoesUrlPathStartWithToken("https://bebe/", "bebe"));
+    }
+}
diff --git a/library/cpp/string_utils/url/ut/ya.make b/library/cpp/string_utils/url/ut/ya.make
new file mode 100644
index 00000000000..0efa30e4d2c
--- /dev/null
+++ b/library/cpp/string_utils/url/ut/ya.make
@@ -0,0 +1,9 @@
+UNITTEST_FOR(library/cpp/string_utils/url)
+
+OWNER(g:util)
+
+SRCS(
+    url_ut.cpp
+)
+
+END()
diff --git a/library/cpp/string_utils/url/ya.make b/library/cpp/string_utils/url/ya.make
new file mode 100644
index 00000000000..b08d69ec83d
--- /dev/null
+++ b/library/cpp/string_utils/url/ya.make
@@ -0,0 +1,10 @@
+LIBRARY()
+
+OWNER(g:util)
+
+SRCS(
+    url.cpp
+    url.h
+)
+
+END()
author	Devtools Arcadia <[email protected]>	2022-02-07 18:08:42 +0300
committer	Devtools Arcadia <[email protected]>	2022-02-07 18:08:42 +0300
commit	1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree	e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/string_utils/url