aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoralexromanov <alexromanov@yandex-team.com>2022-12-15 15:16:46 +0300
committeralexromanov <alexromanov@yandex-team.com>2022-12-15 15:16:46 +0300
commitfa3668fcff4c1ff0fd81e75a84cb1b0c8cb9f856 (patch)
tree25dbcd352b04506a6b80c5156a872095439a0175
parent56b9fdade84190c6f3f457ff268dea573f58b0d8 (diff)
downloadydb-fa3668fcff4c1ff0fd81e75a84cb1b0c8cb9f856.tar.gz
Support non-http scheme in Url::GetSchemeHostPort and Url::GetSchemeHost
-rw-r--r--library/cpp/string_utils/url/url.cpp15
-rw-r--r--library/cpp/string_utils/url/url.h3
-rw-r--r--library/cpp/string_utils/url/url_ut.cpp87
-rw-r--r--ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h4
4 files changed, 107 insertions, 2 deletions
diff --git a/library/cpp/string_utils/url/url.cpp b/library/cpp/string_utils/url/url.cpp
index 85f4ac5d69..44e6cb671a 100644
--- a/library/cpp/string_utils/url/url.cpp
+++ b/library/cpp/string_utils/url/url.cpp
@@ -154,6 +154,21 @@ TStringBuf GetHostAndPort(const TStringBuf url) noexcept {
return GetHostAndPortImpl<true>(url);
}
+TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp) noexcept {
+ const size_t schemeSize = GetSchemePrefixSize(url);
+ const TStringBuf scheme = url.Head(schemeSize);
+
+ const bool isHttp = (schemeSize == 0 || scheme == TStringBuf("http://"));
+
+ const TStringBuf host = GetHost(url.Tail(schemeSize));
+
+ if (isHttp && trimHttp) {
+ return host;
+ } else {
+ return TStringBuf(scheme.begin(), host.end());
+ }
+}
+
TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp, bool trimDefaultPort) noexcept {
const size_t schemeSize = GetSchemePrefixSize(url);
const TStringBuf scheme = url.Head(schemeSize);
diff --git a/library/cpp/string_utils/url/url.h b/library/cpp/string_utils/url/url.h
index 84137ccc57..68fa23d64c 100644
--- a/library/cpp/string_utils/url/url.h
+++ b/library/cpp/string_utils/url/url.h
@@ -70,6 +70,9 @@ Y_PURE_FUNCTION
TStringBuf GetHostAndPort(const TStringBuf url) noexcept;
Y_PURE_FUNCTION
+TStringBuf GetSchemeHost(const TStringBuf url, bool trimHttp = true) noexcept;
+
+Y_PURE_FUNCTION
TStringBuf GetSchemeHostAndPort(const TStringBuf url, bool trimHttp = true, bool trimDefaultPort = true) noexcept;
/**
diff --git a/library/cpp/string_utils/url/url_ut.cpp b/library/cpp/string_utils/url/url_ut.cpp
index 1588013893..7980a36e99 100644
--- a/library/cpp/string_utils/url/url_ut.cpp
+++ b/library/cpp/string_utils/url/url_ut.cpp
@@ -27,6 +27,93 @@ Y_UNIT_TEST_SUITE(TUtilUrlTest) {
UNIT_ASSERT_VALUES_EQUAL("", GetHost(""));
}
+ Y_UNIT_TEST(TestGetSchemeHostAndPortWithoutSplit) {
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+ UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHostAndPort("ftp://ya.ru/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru:80/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru:80/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+ UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru:80/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHostAndPort("http://www.ya.ru:80/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("https://www.ya.ru", GetSchemeHostAndPort("https://www.ya.ru:443/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru:80", GetSchemeHostAndPort("http://www.ya.ru:80/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHostAndPort("https://ya.ru:443/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru:444", GetSchemeHostAndPort("https://ya.ru:444/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru:444", GetSchemeHostAndPort("https://ya.ru:444/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:8080", GetSchemeHostAndPort("ftp://ya.ru:8080/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:1234", GetSchemeHostAndPort("ftp://ya.ru:1234/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:80", GetSchemeHostAndPort("ftp://ya.ru:80/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru:80", GetSchemeHostAndPort("ftp://ya.ru:80/bebe", /*trimHttp*/false, /*trimDefaultPort*/false));
+
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru:80"));
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("ya.ru"));
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHostAndPort("http://ya.ru:80"));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHostAndPort("http://ya.ru:80", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru:81", GetSchemeHostAndPort("http://ya.ru:81", /*trimHttp*/false, /*trimDefaultPort*/false));
+
+ // irl RFC3986 sometimes gets ignored
+ UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973"));
+ UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHostAndPort("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/false, /*trimDefaultPort*/false));
+ // check simple string
+ UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url"));
+ UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHostAndPort("some_blender_url", /*trimHttp*/false, /*trimDefaultPort*/false));
+ UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort(""));
+ UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort("", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHostAndPort("", /*trimHttp*/false, /*trimDefaultPort*/false));
+ }
+
+ Y_UNIT_TEST(TestGetSchemeHost) {
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHost("http://www.ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHost("http://www.ya.ru/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru/bebe", /*trimHttp*/false));
+
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru:80/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru:80/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru:81/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("www.ya.ru", GetSchemeHost("http://www.ya.ru:80/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("http://www.ya.ru", GetSchemeHost("http://www.ya.ru:80/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru:443/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("https://ya.ru", GetSchemeHost("https://ya.ru:444/bebe", /*trimHttp*/false));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru:8080/bebe"));
+ UNIT_ASSERT_VALUES_EQUAL("ftp://ya.ru", GetSchemeHost("ftp://ya.ru:1234/bebe", /*trimHttp*/false));
+
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru:80"));
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("ya.ru"));
+ UNIT_ASSERT_VALUES_EQUAL("ya.ru", GetSchemeHost("http://ya.ru:80"));
+ UNIT_ASSERT_VALUES_EQUAL("http://ya.ru", GetSchemeHost("http://ya.ru:81", /*trimHttp*/false));
+
+ // irl RFC3986 sometimes gets ignored
+ UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHost("pravda-kmv.ru?page=news&id=6973"));
+ UNIT_ASSERT_VALUES_EQUAL("pravda-kmv.ru", GetSchemeHost("pravda-kmv.ru?page=news&id=6973", /*trimHttp*/ false));
+ // check simple string
+ UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHost("some_blender_url"));
+ UNIT_ASSERT_VALUES_EQUAL("some_blender_url", GetSchemeHost("some_blender_url", /*trimHttp*/ false));
+ UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHost(""));
+ UNIT_ASSERT_VALUES_EQUAL("", GetSchemeHost("", /*trimHttp*/ false));
+ }
+
Y_UNIT_TEST(TestGetPathAndQuery) {
UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org"));
UNIT_ASSERT_VALUES_EQUAL("/", GetPathAndQuery("ru.wikipedia.org/"));
diff --git a/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h b/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h
index 3fc2dd6164..29e757976d 100644
--- a/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h
+++ b/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h
@@ -59,7 +59,7 @@ namespace {
SIMPLE_UDF(TGetSchemeHost, TOptional<char*>(TOptional<char*>)) {
EMPTY_RESULT_ON_EMPTY_ARG(0);
const std::string_view url(args[0].AsStringRef());
- const std::string_view host(GetHost(url));
+ const std::string_view host(GetSchemeHost(url, /* trimHttp */ false));
return host.empty() ? TUnboxedValue() :
valueBuilder->SubString(args[0], 0U, std::distance(url.begin(), host.end()));
}
@@ -67,7 +67,7 @@ namespace {
SIMPLE_UDF(TGetSchemeHostPort, TOptional<char*>(TOptional<char*>)) {
EMPTY_RESULT_ON_EMPTY_ARG(0);
const std::string_view url(args[0].AsStringRef());
- const std::string_view host(GetHostAndPort(url));
+ const std::string_view host(GetSchemeHostAndPort(url, /* trimHttp */ false, /* trimDefaultPort */ false));
return host.empty() ? TUnboxedValue() :
valueBuilder->SubString(args[0], 0U, std::distance(url.begin(), host.end()));
}