diff options
author | alexromanov <alexromanov@yandex-team.com> | 2022-12-15 15:16:46 +0300 |
---|---|---|
committer | alexromanov <alexromanov@yandex-team.com> | 2022-12-15 15:16:46 +0300 |
commit | fa3668fcff4c1ff0fd81e75a84cb1b0c8cb9f856 (patch) | |
tree | 25dbcd352b04506a6b80c5156a872095439a0175 | |
parent | 56b9fdade84190c6f3f457ff268dea573f58b0d8 (diff) | |
download | ydb-fa3668fcff4c1ff0fd81e75a84cb1b0c8cb9f856.tar.gz |
Support non-http scheme in Url::GetSchemeHostPort and Url::GetSchemeHost
-rw-r--r-- | library/cpp/string_utils/url/url.cpp | 15 | ||||
-rw-r--r-- | library/cpp/string_utils/url/url.h | 3 | ||||
-rw-r--r-- | library/cpp/string_utils/url/url_ut.cpp | 87 | ||||
-rw-r--r-- | ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h | 4 |
4 files changed, 107 insertions, 2 deletions
diff --git a/library/cpp/string_utils/url/url.cpp b/library/cpp/string_utils/url/url.cpp index 85f4ac5d69..44e6cb671a 100644 --- a/library/cpp/string_utils/url/url.cpp +++ b/library/cpp/string_utils/url/url.cpp @@ -154,6 +154,21 @@ TStringBuf GetHostAndPort(const TStringBuf url) noexcept { return GetHostAndPortImpl<true>(url); } +TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp) noexcept { + const size_t schemeSize = GetSchemePrefixSize(url); + const TStringBuf scheme = url.Head(schemeSize); + + const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://")); + + const TStringBuf host = GetHost(url.Tail(schemeSize)); + + if (isHttp && trimHttp) { + return host; + } else { + return TStringBuf(scheme.begin(), host.end()); + } +} + TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept { const size_t schemeSize = GetSchemePrefixSize(url); const TStringBuf scheme = url.Head(schemeSize); diff --git a/library/cpp/string_utils/url/url.h b/library/cpp/string_utils/url/url.h index 84137ccc57..68fa23d64c 100644 --- a/library/cpp/string_utils/url/url.h +++ b/library/cpp/string_utils/url/url.h @@ -70,6 +70,9 @@ Y_PURE_FUNCTION TStringBuf GetHostAndPort(const TStringBuf url) noexcept; Y_PURE_FUNCTION +TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp = true) noexcept; + +Y_PURE_FUNCTION TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp = true, bool trimDefaultPort = true) noexcept; /** diff --git a/library/cpp/string_utils/url/url_ut.cpp b/library/cpp/string_utils/url/url_ut.cpp index 1588013893..7980a36e99 100644 --- a/library/cpp/string_utils/url/url_ut.cpp +++ b/library/cpp/string_utils/url/url_ut.cpp @@ -27,6 +27,93 @@ Y_UNIT_TEST_SUITE(TUtilUrlTest) { UNIT_ASSERT_VALUES_EQUAL("", GetHost("")); } + Y_UNIT_TEST(TestGetSchemeHostAndPortWithoutSplit) { + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false)); + UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false)); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false)); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false)); + + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru:80/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru:80/bebe")); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81/bebe", /*trimHttp*/false, /*trimDefaultPort*/false)); + UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru:80/bebe")); + UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru:80/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("https://www.ya.ru", GetSchemeHostAndPort("https://www.ya.ru:443/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru:80", GetSchemeHostAndPort("http://www.ya.ru:80/bebe", /*trimHttp*/false, /*trimDefaultPort*/false)); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru:443/bebe")); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru:444", GetSchemeHostAndPort("https://ya.ru:444/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru:444", GetSchemeHostAndPort("https://ya.ru:444/bebe", /*trimHttp*/false, /*trimDefaultPort*/false)); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:8080", GetSchemeHostAndPort("ftp://ya.ru:8080/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:1234", GetSchemeHostAndPort("ftp://ya.ru:1234/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:80", GetSchemeHostAndPort("ftp://ya.ru:80/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:80", GetSchemeHostAndPort("ftp://ya.ru:80/bebe", /*trimHttp*/false, /*trimDefaultPort*/false)); + + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru:80")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru:80")); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru:80", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81", /*trimHttp*/false, /*trimDefaultPort*/false)); + + // irl RFC3986 sometimes gets ignored + UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973")); + UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/false, /*trimDefaultPort*/false)); + // check simple string + UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url")); + UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url", /*trimHttp*/false, /*trimDefaultPort*/false)); + UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort("")); + UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort("", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort("", /*trimHttp*/false, /*trimDefaultPort*/false)); + } + + Y_UNIT_TEST(TestGetSchemeHost) { + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHost("http://www.ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHost("http://www.ya.ru/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru/bebe", /*trimHttp*/false)); + + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru:80/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru:80/bebe")); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru:81/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHost("http://www.ya.ru:80/bebe")); + UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHost("http://www.ya.ru:80/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru:443/bebe")); + UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru:444/bebe", /*trimHttp*/false)); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru:8080/bebe")); + UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru:1234/bebe", /*trimHttp*/false)); + + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru:80")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru")); + UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru:80")); + UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru:81", /*trimHttp*/false)); + + // irl RFC3986 sometimes gets ignored + UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHost("pravda-kmv.ru?page=news&id=6973")); + UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHost("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/ false)); + // check simple string + UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHost("some_blender_url")); + UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHost("some_blender_url", /*trimHttp*/ false)); + UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHost("")); + UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHost("", /*trimHttp*/ false)); + } + Y_UNIT_TEST(TestGetPathAndQuery) { UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org")); UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/")); diff --git a/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h b/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h index 3fc2dd6164..29e757976d 100644 --- a/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h +++ b/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h @@ -59,7 +59,7 @@ namespace { SIMPLE_UDF(TGetSchemeHost, TOptional<char*>(TOptional<char*>)) { EMPTY_RESULT_ON_EMPTY_ARG(0); const std::string_view url(args[0].AsStringRef()); - const std::string_view host(GetHost(url)); + const std::string_view host(GetSchemeHost(url, /* trimHttp */ false)); return host.empty() ? TUnboxedValue() : valueBuilder->SubString(args[0], 0U, std::distance(url.begin(), host.end())); } @@ -67,7 +67,7 @@ namespace { SIMPLE_UDF(TGetSchemeHostPort, TOptional<char*>(TOptional<char*>)) { EMPTY_RESULT_ON_EMPTY_ARG(0); const std::string_view url(args[0].AsStringRef()); - const std::string_view host(GetHostAndPort(url)); + const std::string_view host(GetSchemeHostAndPort(url, /* trimHttp */ false, /* trimDefaultPort */ false)); return host.empty() ? TUnboxedValue() : valueBuilder->SubString(args[0], 0U, std::distance(url.begin(), host.end())); } |