diff options
author | epbugaev <epbugaev@yandex-team.com> | 2023-03-02 16:48:33 +0300 |
---|---|---|
committer | epbugaev <epbugaev@yandex-team.com> | 2023-03-02 16:48:33 +0300 |
commit | 46106987bf9ad142884f9f94b1326ded3e96f96c (patch) | |
tree | f2a662b7708d269f0475ac72fe8cea37643638d3 | |
parent | 753d7fc890774ea3fcaec6eaf6d448ac2910e363 (diff) | |
download | ydb-46106987bf9ad142884f9f94b1326ded3e96f96c.tar.gz |
New block url_base UDFs
Add block (Arrow kernel) implementations for 4 existing url_base UDFs:
Encode
Decode
GetTail
GetPath
-rw-r--r-- | ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h | 79 |
1 files changed, 75 insertions, 4 deletions
diff --git a/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h b/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h index 8e3de1aca1..8479fcbcbc 100644 --- a/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h +++ b/ydb/library/yql/udfs/common/url_base/lib/url_base_udf.h @@ -98,7 +98,7 @@ SIMPLE_UDF(TGetPort, TOptional<ui64>(TOptional<char*>)) { : TUnboxedValuePod(); } -SIMPLE_UDF(TGetTail, TOptional<char*>(TOptional<char*>)) { +BEGIN_SIMPLE_ARROW_UDF(TGetTail, TOptional<char*>(TOptional<char*>)) { EMPTY_RESULT_ON_EMPTY_ARG(0); const TStringBuf url(args[0].AsStringRef()); TStringBuf host, tail; @@ -107,8 +107,24 @@ SIMPLE_UDF(TGetTail, TOptional<char*>(TOptional<char*>)) { ? valueBuilder->NewString(tail) : valueBuilder->NewString(TString('/').append(tail)); } +struct TGetTailKernelExec : public TUnaryKernelExec<TGetTailKernelExec> { + template <typename TSink> + static void Process(TBlockItem arg, const TSink& sink) { + if (!arg) { + return sink(TBlockItem()); + } + const TStringBuf url(arg.AsStringRef()); + TStringBuf host, tail; + SplitUrlToHostAndPath(url, host, tail); + if (tail.StartsWith('/')) { + return sink(TBlockItem(TStringRef(tail))); + } + sink(TBlockItem(TStringRef(TString('/').append(tail)))); + } +}; +END_SIMPLE_ARROW_UDF(TGetTail, TGetTailKernelExec::Do); -SIMPLE_UDF(TGetPath, TOptional<char*>(TOptional<char*>)) { +BEGIN_SIMPLE_ARROW_UDF(TGetPath, TOptional<char*>(TOptional<char*>)) { EMPTY_RESULT_ON_EMPTY_ARG(0); const std::string_view url(args[0].AsStringRef()); std::string_view cut(CutSchemePrefix(url)); @@ -125,6 +141,28 @@ SIMPLE_UDF(TGetPath, TOptional<char*>(TOptional<char*>)) { return valueBuilder->SubString(args[0], std::distance(url.begin(), cut.begin()), cut.length()); } +struct TGetPathKernelExec : public TUnaryKernelExec<TGetPathKernelExec> { + template <typename TSink> + static void Process(TBlockItem arg, const TSink& sink) { + if (!arg) { + return sink(TBlockItem()); + } + const std::string_view url(arg.AsStringRef()); 
+ std::string_view cut(CutSchemePrefix(url)); + const auto s = cut.find('/'); + if (s == std::string_view::npos) { + return sink(TBlockItem(TStringRef("/"))); + } + + cut.remove_prefix(s); + const auto end = cut.find_first_of("?#"); + if (std::string_view::npos != end) { + cut.remove_suffix(cut.size() - end); + } + sink(TBlockItem(TStringRef(cut))); + } +}; +END_SIMPLE_ARROW_UDF(TGetPath, TGetPathKernelExec::Do); SIMPLE_UDF(TGetFragment, TOptional<char*>(TOptional<char*>)) { EMPTY_RESULT_ON_EMPTY_ARG(0); @@ -229,7 +267,7 @@ SIMPLE_UDF(TCutQueryStringAndFragment, char*(TAutoMap<char*>)) { return std::string_view::npos == cut ? NUdf::TUnboxedValue(args[0]) : valueBuilder->SubString(args[0], 0U, cut); } -SIMPLE_UDF(TEncode, TOptional<char*>(TOptional<char*>)) { +BEGIN_SIMPLE_ARROW_UDF(TEncode, TOptional<char*>(TOptional<char*>)) { EMPTY_RESULT_ON_EMPTY_ARG(0); const std::string_view input(args[0].AsStringRef()); if (input.empty()) { @@ -239,8 +277,24 @@ SIMPLE_UDF(TEncode, TOptional<char*>(TOptional<char*>)) { UrlEscape(url); return input == url ? NUdf::TUnboxedValue(args[0]) : valueBuilder->NewString(url); } +struct TEncodeKernelExec : public TUnaryKernelExec<TEncodeKernelExec> { + template <typename TSink> + static void Process(TBlockItem arg, const TSink& sink) { + if (!arg) { + return sink(TBlockItem()); + } + const std::string_view input(arg.AsStringRef()); + if (input.empty()) { + return sink(TBlockItem()); + } + TString url(input); + UrlEscape(url); + sink(TBlockItem(TStringRef(url))); + } +}; +END_SIMPLE_ARROW_UDF(TEncode, TEncodeKernelExec::Do); -SIMPLE_UDF(TDecode, TOptional<char*>(TOptional<char*>)) { +BEGIN_SIMPLE_ARROW_UDF(TDecode, TOptional<char*>(TOptional<char*>)) { EMPTY_RESULT_ON_EMPTY_ARG(0); const std::string_view input(args[0].AsStringRef()); if (input.empty()) { @@ -251,6 +305,23 @@ SIMPLE_UDF(TDecode, TOptional<char*>(TOptional<char*>)) { UrlUnescape(url); return input == url ? 
NUdf::TUnboxedValue(args[0]) : valueBuilder->NewString(url); } +struct TDecodeKernelExec : public TUnaryKernelExec<TDecodeKernelExec> { + template <typename TSink> + static void Process(TBlockItem arg, const TSink& sink) { + if (!arg) { + return sink(TBlockItem()); + } + const std::string_view input(arg.AsStringRef()); + if (input.empty()) { + return sink(TBlockItem()); + } + TString url(input); + SubstGlobal(url, '+', ' '); + UrlUnescape(url); + sink(TBlockItem(TStringRef(url))); + } +}; +END_SIMPLE_ARROW_UDF(TDecode, TDecodeKernelExec::Do); SIMPLE_UDF(TIsKnownTLD, bool(TAutoMap<char*>)) { Y_UNUSED(valueBuilder); |