diff options
author | imunkin <[email protected]> | 2024-11-08 10:00:23 +0300 |
---|---|---|
committer | imunkin <[email protected]> | 2024-11-08 10:12:13 +0300 |
commit | a784a2f943d6e15caa6241e2e96d80aac6dbf375 (patch) | |
tree | 05f1e5366c916b988a8afb75bdab8ddeee0f6e6d /yql/essentials/udfs/common/string | |
parent | d70137a7b530ccaa52834274913bbb5a3d1ca06e (diff) |
Move yql/udfs/common/ to /yql/essentials YQL-19206
Except the following directories:
* clickhouse/client
* datetime
* knn
* roaring
commit_hash:c7da95636144d28db109d6b17ddc762e9bacb59f
Diffstat (limited to 'yql/essentials/udfs/common/string')
66 files changed, 4654 insertions, 0 deletions
diff --git a/yql/essentials/udfs/common/string/string_udf.cpp b/yql/essentials/udfs/common/string/string_udf.cpp new file mode 100644 index 00000000000..d621e92582d --- /dev/null +++ b/yql/essentials/udfs/common/string/string_udf.cpp @@ -0,0 +1,926 @@ +#include <yql/essentials/public/udf/udf_allocator.h> +#include <yql/essentials/public/udf/udf_helpers.h> +#include <yql/essentials/public/udf/udf_value_builder.h> + +#include <library/cpp/charset/codepage.h> +#include <library/cpp/deprecated/split/split_iterator.h> +#include <library/cpp/html/pcdata/pcdata.h> +#include <library/cpp/string_utils/base32/base32.h> +#include <library/cpp/string_utils/base64/base64.h> +#include <library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h> +#include <library/cpp/string_utils/quote/quote.h> + +#include <yql/essentials/public/udf/arrow/udf_arrow_helpers.h> + +#include <util/charset/wide.h> +#include <util/generic/vector.h> +#include <util/stream/format.h> +#include <util/string/ascii.h> +#include <util/string/escape.h> +#include <util/string/hex.h> +#include <util/string/join.h> +#include <util/string/reverse.h> +#include <util/string/split.h> +#include <util/string/strip.h> +#include <util/string/subst.h> +#include <util/string/util.h> +#include <util/string/vector.h> + +using namespace NKikimr; +using namespace NUdf; + +namespace { + +#define STRING_UDF(udfName, function) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, char*(TAutoMap<char*>)) { \ + const TString input(args[0].AsStringRef()); \ + const auto& result = function(input); \ + return valueBuilder->NewString(result); \ + } \ + \ + struct T##udfName##KernelExec \ + : public TUnaryKernelExec<T##udfName##KernelExec> \ + { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ + const TString input(arg1.AsStringRef()); \ + const auto& result = function(input); \ + sink(TBlockItem(result)); \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) \ + + +// 'unsafe' udf is actually strict - it returns null on any exception +#define STRING_UNSAFE_UDF(udfName, function) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \ + EMPTY_RESULT_ON_EMPTY_ARG(0); \ + const TString input(args[0].AsStringRef()); \ + try { \ + const auto& result = function(input); \ + return valueBuilder->NewString(result); \ + } catch (yexception&) { \ + return TUnboxedValue(); \ + } \ + } \ + \ + struct T##udfName##KernelExec \ + : public TUnaryKernelExec<T##udfName##KernelExec> \ + { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ + if (!arg1) { \ + return sink(TBlockItem()); \ + } \ + \ + const TString input(arg1.AsStringRef()); \ + try { \ + const auto& result = function(input); \ + sink(TBlockItem(result)); \ + } catch (yexception&) { \ + return sink(TBlockItem()); \ + } \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) + +#define STROKA_UDF(udfName, function) \ + SIMPLE_STRICT_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \ + EMPTY_RESULT_ON_EMPTY_ARG(0) \ + const TString input(args[0].AsStringRef()); \ + try { \ + TUtf16String wide = UTF8ToWide(input); \ + function(wide); \ + return valueBuilder->NewString(WideToUTF8(wide)); \ + } catch (yexception&) { \ + return TUnboxedValue(); \ + } \ + } + +#define STROKA_CASE_UDF(udfName, function) \ + SIMPLE_STRICT_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \ + EMPTY_RESULT_ON_EMPTY_ARG(0) \ + const TString input(args[0].AsStringRef()); \ + try { \ + TUtf16String wide = UTF8ToWide(input); \ + function(wide.begin(), wide.size()); \ + return valueBuilder->NewString(WideToUTF8(wide)); \ + } catch (yexception&) { \ + return TUnboxedValue(); \ + } \ + } + +#define STROKA_ASCII_CASE_UDF(udfName, function) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, char*(TAutoMap<char*>)) { \ + TString input(args[0].AsStringRef()); \ + if (input.function()) { \ + return valueBuilder->NewString(input); \ + } else { \ + return args[0]; \ + } \ + } \ + \ + struct T##udfName##KernelExec \ + : public TUnaryKernelExec<T##udfName##KernelExec> \ + { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ + TString input(arg1.AsStringRef()); \ + if (input.function()) { \ + sink(TBlockItem(input)); \ + } else { \ + sink(arg1); \ + } \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) + + +#define STROKA_FIND_UDF(udfName, function) \ + SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \ + Y_UNUSED(valueBuilder); \ + if (args[0]) { \ + const TString haystack(args[0].AsStringRef()); \ + const TString needle(args[1].AsStringRef()); \ + return TUnboxedValuePod(haystack.function(needle)); \ + } else { \ + return TUnboxedValuePod(false); \ + } \ + } + +#define STRING_TWO_ARGS_UDF(udfName, function) \ + SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \ + Y_UNUSED(valueBuilder); \ + if (args[0]) { \ + const TString haystack(args[0].AsStringRef()); \ + const TString needle(args[1].AsStringRef()); \ + return TUnboxedValuePod(function(haystack, needle)); \ + } else { \ + return TUnboxedValuePod(false); \ + } \ + } + +#define IS_ASCII_UDF(function) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, bool(TOptional<char*>)) { \ + Y_UNUSED(valueBuilder); \ + if (args[0]) { \ + const TStringBuf input(args[0].AsStringRef()); \ + bool result = true; \ + for (auto c : input) { \ + if (!function(c)) { \ + result = false; \ + break; \ + } \ + } \ + return TUnboxedValuePod(result); \ + } else { \ + return TUnboxedValuePod(false); \ + } \ + } \ + \ + struct T##function##KernelExec \ + : public TUnaryKernelExec<T##function##KernelExec> \ + { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ + if (arg1) { \ + const TStringBuf input(arg1.AsStringRef()); \ + bool result = true; \ + for (auto c : input) { \ + if (!function(c)) { \ + result = false; \ + break; \ + } \ + } \ + sink(TBlockItem(result)); \ + } else { \ + sink(TBlockItem(false)); \ + } \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do) + + + +#define STRING_STREAM_PAD_FORMATTER_UDF(function) \ + BEGIN_SIMPLE_ARROW_UDF_WITH_OPTIONAL_ARGS(T##function, \ + char*(TAutoMap<char*>, ui64, TOptional<char*>), 1) \ + { \ + TStringStream result; \ + const TStringBuf input(args[0].AsStringRef()); \ + char paddingSymbol = ' '; \ + if (args[2]) { \ + if (args[2].AsStringRef().Size() != 1) { \ + ythrow yexception() << "Not 1 symbol in paddingSymbol"; \ + } \ + paddingSymbol = TString(args[2].AsStringRef())[0]; \ + } \ + const ui64 padLen = args[1].Get<ui64>(); \ + if (padLen > padLim) { \ + ythrow yexception() << "Padding length (" << padLen << ") exceeds maximum: " << padLim; \ + } \ + result << function(input, padLen, paddingSymbol); \ + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); \ + } \ + \ + struct T##function##KernelExec \ + : public TGenericKernelExec<T##function##KernelExec, 3> \ + { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { \ + TStringStream result; \ + const TStringBuf input(args.GetElement(0).AsStringRef()); \ + char paddingSymbol = ' '; \ + if (args.GetElement(2)) { \ + if (args.GetElement(2).AsStringRef().Size() != 1) { \ + ythrow yexception() << "Not 1 symbol in paddingSymbol"; \ + } \ + paddingSymbol = TString(args.GetElement(2).AsStringRef())[0]; \ + } \ + const ui64 padLen = args.GetElement(1).Get<ui64>(); \ + if (padLen > padLim) { \ + ythrow yexception() << "Padding length (" << padLen \ + << ") exceeds maximum: " << padLim; \ + } \ + result << function(input, padLen, paddingSymbol); \ + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do) + +#define STRING_STREAM_NUM_FORMATTER_UDF(function, argType) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, char*(TAutoMap<argType>)) { \ + TStringStream result; \ + result << function(args[0].Get<argType>()); \ + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); \ + } \ + \ + struct T##function##KernelExec \ + : public TUnaryKernelExec<T##function##KernelExec> \ + { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ + TStringStream result; \ + result << function(arg1.Get<argType>()); \ + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do) + +#define STRING_STREAM_TEXT_FORMATTER_UDF(function) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, char*(TAutoMap<char*>)) { \ + TStringStream result; \ + const TStringBuf input(args[0].AsStringRef()); \ + result << function(input); \ + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); \ + } \ + \ + struct T##function##KernelExec \ + : public TUnaryKernelExec<T##function##KernelExec> \ + { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ + TStringStream result; \ + const TStringBuf input(arg1.AsStringRef()); \ + result << function(input); \ + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do) + + +#define STRING_STREAM_HRSZ_FORMATTER_UDF(udfName, hrSize) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, char*(TAutoMap<ui64>)) { \ + TStringStream result; \ + result << HumanReadableSize(args[0].Get<ui64>(), hrSize); \ + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); \ + } \ + \ + struct T##udfName##KernelExec \ + : public TUnaryKernelExec<T##udfName##KernelExec> \ + { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ + TStringStream result; \ + result << HumanReadableSize(arg1.Get<ui64>(), hrSize); \ + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) + +#define STRING_UDF_MAP(XX) \ + XX(Base32Encode, Base32Encode) \ + XX(Base64Encode, Base64Encode) \ + XX(Base64EncodeUrl, Base64EncodeUrl) \ + XX(EscapeC, EscapeC) \ + XX(UnescapeC, UnescapeC) \ + XX(HexEncode, HexEncode) \ + XX(EncodeHtml, EncodeHtmlPcdata) \ + XX(DecodeHtml, DecodeHtmlPcdata) \ + XX(CgiEscape, CGIEscapeRet) \ + XX(CgiUnescape, CGIUnescapeRet) \ + XX(Strip, Strip) \ + XX(Collapse, Collapse) + +#define STRING_UNSAFE_UDF_MAP(XX) \ + XX(Base32Decode, Base32Decode) \ + XX(Base32StrictDecode, Base32StrictDecode) \ + XX(Base64Decode, Base64Decode) \ + XX(Base64StrictDecode, Base64StrictDecode) \ + XX(HexDecode, HexDecode) + +// NOTE: The functions below are marked as deprecated, so block implementation +// is not required for them. Hence, STROKA_CASE_UDF provides only the scalar +// one at the moment. +#define STROKA_CASE_UDF_MAP(XX) \ + XX(ToLower, ToLower) \ + XX(ToUpper, ToUpper) \ + XX(ToTitle, ToTitle) + +#define STROKA_ASCII_CASE_UDF_MAP(XX) \ + XX(AsciiToLower, to_lower) \ + XX(AsciiToUpper, to_upper) \ + XX(AsciiToTitle, to_title) + +// NOTE: The functions below are marked as deprecated, so block implementation +// is not required for them. Hence, STROKA_FIND_UDF provides only the scalar +// one at the moment. +#define STROKA_FIND_UDF_MAP(XX) \ + XX(StartsWith, StartsWith) \ + XX(EndsWith, EndsWith) \ + XX(HasPrefix, StartsWith) \ + XX(HasSuffix, EndsWith) + +// NOTE: The functions below are marked as deprecated, so block implementation +// is not required for them. Hence, STRING_TWO_ARGS_UDF provides only the +// scalar one at the moment. +#define STRING_TWO_ARGS_UDF_MAP(XX) \ + XX(StartsWithIgnoreCase, AsciiHasPrefixIgnoreCase) \ + XX(EndsWithIgnoreCase, AsciiHasSuffixIgnoreCase) \ + XX(HasPrefixIgnoreCase, AsciiHasPrefixIgnoreCase) \ + XX(HasSuffixIgnoreCase, AsciiHasSuffixIgnoreCase) + +// NOTE: The functions below are marked as deprecated, so block implementation +// is not required for them. Hence, STROKA_UDF provides only the scalar one at +// the moment. +#define STROKA_UDF_MAP(XX) \ + XX(Reverse, ReverseInPlace) + +#define IS_ASCII_UDF_MAP(XX) \ + XX(IsAscii) \ + XX(IsAsciiSpace) \ + XX(IsAsciiUpper) \ + XX(IsAsciiLower) \ + XX(IsAsciiDigit) \ + XX(IsAsciiAlpha) \ + XX(IsAsciiAlnum) \ + XX(IsAsciiHex) + +#define STRING_STREAM_PAD_FORMATTER_UDF_MAP(XX) \ + XX(LeftPad) \ + XX(RightPad) + +#define STRING_STREAM_NUM_FORMATTER_UDF_MAP(XX) \ + XX(Hex, ui64) \ + XX(SHex, i64) \ + XX(Bin, ui64) \ + XX(SBin, i64) + +#define STRING_STREAM_TEXT_FORMATTER_UDF_MAP(XX) \ + XX(HexText) \ + XX(BinText) + +#define STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(XX) \ + XX(HumanReadableQuantity, SF_QUANTITY) \ + XX(HumanReadableBytes, SF_BYTES) + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TCollapseText, char*(TAutoMap<char*>, ui64)) { + TString input(args[0].AsStringRef()); + ui64 maxLength = args[1].Get<ui64>(); + CollapseText(input, maxLength); + return valueBuilder->NewString(input); + } + + struct TCollapseTextKernelExec + : public TBinaryKernelExec<TCollapseTextKernelExec> + { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + TString input(arg1.AsStringRef()); + ui64 maxLength = arg2.Get<ui64>(); + CollapseText(input, maxLength); + return sink(TBlockItem(input)); + } + }; + + END_SIMPLE_ARROW_UDF(TCollapseText, TCollapseTextKernelExec::Do); + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TContains, bool(TOptional<char*>, char*)) { + Y_UNUSED(valueBuilder); + if (!args[0]) + return TUnboxedValuePod(false); + + const TString haystack(args[0].AsStringRef()); + const TString needle(args[1].AsStringRef()); + return TUnboxedValuePod(haystack.Contains(needle)); + } + + struct TContainsKernelExec : public TBinaryKernelExec<TContainsKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + if (!arg1) + return sink(TBlockItem(false)); + + const TString haystack(arg1.AsStringRef()); + const TString needle(arg2.AsStringRef()); + sink(TBlockItem(haystack.Contains(needle))); + } + }; + + END_SIMPLE_ARROW_UDF(TContains, TContainsKernelExec::Do); + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) { + if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef())) + return valueBuilder->NewString(result); + else + return args[0]; + } + + struct TReplaceAllKernelExec + : public TGenericKernelExec<TReplaceAllKernelExec, 3> + { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { + TString result(args.GetElement(0).AsStringRef()); + const TStringBuf what(args.GetElement(1).AsStringRef()); + const TStringBuf with(args.GetElement(2).AsStringRef()); + if (SubstGlobal(result, what, with)) { + return sink(TBlockItem(result)); + } else { + return sink(args.GetElement(0)); + } + } + }; + + END_SIMPLE_ARROW_UDF(TReplaceAll, TReplaceAllKernelExec::Do) + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceFirst, char*(TAutoMap<char*>, char*, char*)) { + std::string result(args[0].AsStringRef()); + const std::string_view what(args[1].AsStringRef()); + if (const auto index = result.find(what); index != std::string::npos) { + result.replace(index, what.size(), std::string_view(args[2].AsStringRef())); + return valueBuilder->NewString(result); + } + return args[0]; + } + + struct TReplaceFirstKernelExec + : public TGenericKernelExec<TReplaceFirstKernelExec, 3> + { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { + std::string result(args.GetElement(0).AsStringRef()); + const std::string_view what(args.GetElement(1).AsStringRef()); + const std::string_view with(args.GetElement(2).AsStringRef()); + if (const auto index = result.find(what); index != std::string::npos) { + result.replace(index, what.size(), with); + return sink(TBlockItem(result)); + } + return sink(args.GetElement(0)); + } + }; + + END_SIMPLE_ARROW_UDF(TReplaceFirst, TReplaceFirstKernelExec::Do) + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceLast, char*(TAutoMap<char*>, char*, char*)) { + std::string result(args[0].AsStringRef()); + const std::string_view what(args[1].AsStringRef()); + if (const auto index = result.rfind(what); index != std::string::npos) { + result.replace(index, what.size(), std::string_view(args[2].AsStringRef())); + return valueBuilder->NewString(result); + } + return args[0]; + } + + struct TReplaceLastKernelExec + : public TGenericKernelExec<TReplaceLastKernelExec, 3> + { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { + std::string result(args.GetElement(0).AsStringRef()); + const std::string_view what(args.GetElement(1).AsStringRef()); + const std::string_view with(args.GetElement(2).AsStringRef()); + if (const auto index = result.rfind(what); index != std::string::npos) { + result.replace(index, what.size(), with); + return sink(TBlockItem(result)); + } + return sink(args.GetElement(0)); + } + }; + + END_SIMPLE_ARROW_UDF(TReplaceLast, TReplaceLastKernelExec::Do) + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveAll, char*(TAutoMap<char*>, char*)) { + std::string input(args[0].AsStringRef()); + const std::string_view remove(args[1].AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + size_t tpos = 0; + for (const ui8 c : input) { + if (!chars[c]) { + input[tpos++] = c; + } + } + if (tpos != input.size()) { + input.resize(tpos); + return valueBuilder->NewString(input); + } + return args[0]; + } + + struct TRemoveAllKernelExec + : public TBinaryKernelExec<TRemoveAllKernelExec> + { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + std::string input(arg1.AsStringRef()); + const std::string_view remove(arg2.AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + size_t tpos = 0; + for (const ui8 c : input) { + if (!chars[c]) { + input[tpos++] = c; + } + } + if (tpos != input.size()) { + input.resize(tpos); + return sink(TBlockItem(input)); + } + sink(arg1); + } + }; + + END_SIMPLE_ARROW_UDF(TRemoveAll, TRemoveAllKernelExec::Do) + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveFirst, char*(TAutoMap<char*>, char*)) { + std::string input(args[0].AsStringRef()); + const std::string_view remove(args[1].AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + for (auto it = input.cbegin(); it != input.cend(); ++it) { + if (chars[static_cast<ui8>(*it)]) { + input.erase(it); + return valueBuilder->NewString(input); + } + } + return args[0]; + } + + struct TRemoveFirstKernelExec + : public TBinaryKernelExec<TRemoveFirstKernelExec> + { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + std::string input(arg1.AsStringRef()); + const std::string_view remove(arg2.AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + for (auto it = input.cbegin(); it != input.cend(); ++it) { + if (chars[static_cast<ui8>(*it)]) { + input.erase(it); + return sink(TBlockItem(input)); + } + } + sink(arg1); + } + }; + + END_SIMPLE_ARROW_UDF(TRemoveFirst, TRemoveFirstKernelExec::Do) + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveLast, char*(TAutoMap<char*>, char*)) { + std::string input(args[0].AsStringRef()); + const std::string_view remove(args[1].AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + for (auto it = input.crbegin(); it != input.crend(); ++it) { + if (chars[static_cast<ui8>(*it)]) { + input.erase(input.crend() - it - 1, 1); + return valueBuilder->NewString(input); + } + } + return args[0]; + } + + struct TRemoveLastKernelExec + : public TBinaryKernelExec<TRemoveLastKernelExec> + { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + std::string input(arg1.AsStringRef()); + const std::string_view remove(arg2.AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + for (auto it = input.crbegin(); it != input.crend(); ++it) { + if (chars[static_cast<ui8>(*it)]) { + input.erase(input.crend() - it - 1, 1); + return sink(TBlockItem(input)); + } + } + sink(arg1); + } + }; + + END_SIMPLE_ARROW_UDF(TRemoveLast, TRemoveLastKernelExec::Do) + + + // NOTE: String::Find is marked as deprecated, so block implementation is + // not required for them. Hence, only the scalar one is provided. + SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) { + Y_UNUSED(valueBuilder); + const TString haystack(args[0].AsStringRef()); + const TString needle(args[1].AsStringRef()); + const ui64 pos = args[2].GetOrDefault<ui64>(0); + return TUnboxedValuePod(haystack.find(needle, pos)); + } + + // NOTE: String::ReverseFind is marked as deprecated, so block + // implementation is not required for them. Hence, only the scalar one is + // provided. + SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TReverseFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) { + Y_UNUSED(valueBuilder); + const TString haystack(args[0].AsStringRef()); + const TString needle(args[1].AsStringRef()); + const ui64 pos = args[2].GetOrDefault<ui64>(TString::npos); + return TUnboxedValuePod(haystack.rfind(needle, pos)); + } + + // NOTE: String::Substring is marked as deprecated, so block implementation + // is not required for them. Hence, only the scalar one is provided. + SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSubstring, char*(TAutoMap<char*>, TOptional<ui64>, TOptional<ui64>), 1) { + const TString input(args[0].AsStringRef()); + const ui64 from = args[1].GetOrDefault<ui64>(0); + const ui64 count = args[2].GetOrDefault<ui64>(TString::npos); + return valueBuilder->NewString(input.substr(from, count)); + } + + using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>; + + template <typename TIt> + static void SplitToListImpl( + const IValueBuilder* valueBuilder, + const TUnboxedValue& input, + const std::string_view::const_iterator from, + const TIt& it, + TTmpVector& result) { + for (const auto& elem : it) { + result.emplace_back(valueBuilder->SubString(input, std::distance(from, elem.TokenStart()), std::distance(elem.TokenStart(), elem.TokenDelim()))); + } + } + template <typename TIt> + static void SplitToListImpl( + const IValueBuilder* valueBuilder, + const TUnboxedValue& input, + const std::string_view::const_iterator from, + TIt& it, + bool skipEmpty, + TTmpVector& result) { + if (skipEmpty) { + SplitToListImpl(valueBuilder, input, from, it.SkipEmpty(), result); + } else { + SplitToListImpl(valueBuilder, input, from, it, result); + } + } + + constexpr char delimeterStringName[] = "DelimeterString"; + constexpr char skipEmptyName[] = "SkipEmpty"; + constexpr char limitName[] = "Limit"; + using TDelimeterStringArg = TNamedArg<bool, delimeterStringName>; + using TSkipEmptyArg = TNamedArg<bool, skipEmptyName>; + using TLimitArg = TNamedArg<ui64, limitName>; + + + SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSplitToList, TListType<char*>( + TOptional<char*>, + char*, + TDelimeterStringArg, + TSkipEmptyArg, + TLimitArg + ), + 3) { + TTmpVector result; + if (args[0]) { + const std::string_view input(args[0].AsStringRef()); + const std::string_view delimeter(args[1].AsStringRef()); + const bool delimiterString = args[2].GetOrDefault<bool>(true); + const bool skipEmpty = args[3].GetOrDefault<bool>(false); + const auto limit = args[4].GetOrDefault<ui64>(0); + if (delimiterString) { + if (limit) { + auto it = StringSplitter(input).SplitByString(delimeter).Limit(limit + 1); + SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); + } else { + auto it = StringSplitter(input).SplitByString(delimeter); + SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); + } + } else { + if (limit) { + auto it = StringSplitter(input).SplitBySet(TString(delimeter).c_str()).Limit(limit + 1); + SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); + } else { + auto it = StringSplitter(input).SplitBySet(TString(delimeter).c_str()); + SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); + } + } + } + return valueBuilder->NewList(result.data(), result.size()); + } + + SIMPLE_STRICT_UDF(TJoinFromList, char*(TAutoMap<TListType<TOptional<char*>>>, char*)) { + auto input = args[0].GetListIterator(); + const TString delimeter(args[1].AsStringRef()); + TVector<TString> items; + + for (TUnboxedValue current; input.Next(current);) { + if (current) { + TString item(current.AsStringRef()); + items.push_back(std::move(item)); + } + } + + return valueBuilder->NewString(JoinSeq(delimeter, items)); + } + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TLevensteinDistance, ui64(TAutoMap<char*>, TAutoMap<char*>)) { + Y_UNUSED(valueBuilder); + const TStringBuf left(args[0].AsStringRef()); + const TStringBuf right(args[1].AsStringRef()); + const ui64 result = NLevenshtein::Distance(left, right); + return TUnboxedValuePod(result); + } + + struct TLevensteinDistanceKernelExec : public TBinaryKernelExec<TLevensteinDistanceKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + const std::string_view left(arg1.AsStringRef()); + const std::string_view right(arg2.AsStringRef()); + const ui64 result = NLevenshtein::Distance(left, right); + sink(TBlockItem(result)); + } + }; + + END_SIMPLE_ARROW_UDF(TLevensteinDistance, TLevensteinDistanceKernelExec::Do); + + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(THumanReadableDuration, char*(TAutoMap<ui64>)) { + TStringStream result; + result << HumanReadable(TDuration::MicroSeconds(args[0].Get<ui64>())); + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); + } + + struct THumanReadableDurationKernelExec + : public TUnaryKernelExec<THumanReadableDurationKernelExec> + { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { + TStringStream result; + result << HumanReadable(TDuration::MicroSeconds(arg1.Get<ui64>())); + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); + } + }; + + END_SIMPLE_ARROW_UDF(THumanReadableDuration, THumanReadableDurationKernelExec::Do) + + + BEGIN_SIMPLE_STRICT_ARROW_UDF(TPrec, char*(TAutoMap<double>, ui64)) { + TStringStream result; + result << Prec(args[0].Get<double>(), args[1].Get<ui64>()); + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); + } + + struct TPrecKernelExec : public TBinaryKernelExec<TPrecKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + TStringStream result; + result << Prec(arg1.Get<double>(), arg2.Get<ui64>()); + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); + } + }; + + END_SIMPLE_ARROW_UDF(TPrec, TPrecKernelExec::Do) + + + SIMPLE_STRICT_UDF(TToByteList, TListType<ui8>(char*)) { + const TStringBuf input(args[0].AsStringRef()); + TUnboxedValue* items = nullptr; + TUnboxedValue result = valueBuilder->NewArray(input.size(), items); + for (const unsigned char c : input) { + *items++ = TUnboxedValuePod(c); + } + return result; + } + + SIMPLE_STRICT_UDF(TFromByteList, char*(TListType<ui8>)) { + auto input = args[0]; + + if (auto elems = input.GetElements()) { + const auto elemCount = input.GetListLength(); + TUnboxedValue result = valueBuilder->NewStringNotFilled(input.GetListLength()); + auto bufferPtr = result.AsStringRef().Data(); + for (ui64 i = 0; i != elemCount; ++i) { + *(bufferPtr++) = elems[i].Get<ui8>(); + } + return result; + } + + std::vector<char, NKikimr::NUdf::TStdAllocatorForUdf<char>> buffer; + buffer.reserve(TUnboxedValuePod::InternalBufferSize); + + const auto& iter = input.GetListIterator(); + for (NUdf::TUnboxedValue item; iter.Next(item); ) { + buffer.push_back(item.Get<ui8>()); + } + + return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size())); + } + +#define STRING_REGISTER_UDF(udfName, ...) T##udfName, + + STRING_UDF_MAP(STRING_UDF) + STRING_UNSAFE_UDF_MAP(STRING_UNSAFE_UDF) + STROKA_UDF_MAP(STROKA_UDF) + STROKA_CASE_UDF_MAP(STROKA_CASE_UDF) + STROKA_ASCII_CASE_UDF_MAP(STROKA_ASCII_CASE_UDF) + STROKA_FIND_UDF_MAP(STROKA_FIND_UDF) + STRING_TWO_ARGS_UDF_MAP(STRING_TWO_ARGS_UDF) + IS_ASCII_UDF_MAP(IS_ASCII_UDF) + + static constexpr ui64 padLim = 1000000; + STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_STREAM_PAD_FORMATTER_UDF) + STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_STREAM_NUM_FORMATTER_UDF) + STRING_STREAM_TEXT_FORMATTER_UDF_MAP(STRING_STREAM_TEXT_FORMATTER_UDF) + STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(STRING_STREAM_HRSZ_FORMATTER_UDF) + + SIMPLE_MODULE(TStringModule, + STRING_UDF_MAP(STRING_REGISTER_UDF) + STRING_UNSAFE_UDF_MAP(STRING_REGISTER_UDF) + STROKA_UDF_MAP(STRING_REGISTER_UDF) + STROKA_CASE_UDF_MAP(STRING_REGISTER_UDF) + STROKA_ASCII_CASE_UDF_MAP(STRING_REGISTER_UDF) + STROKA_FIND_UDF_MAP(STRING_REGISTER_UDF) + STRING_TWO_ARGS_UDF_MAP(STRING_REGISTER_UDF) + IS_ASCII_UDF_MAP(STRING_REGISTER_UDF) + STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) + STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) + STRING_STREAM_TEXT_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) + STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) + TCollapseText, + TReplaceAll, + TReplaceFirst, + TReplaceLast, + TRemoveAll, + TRemoveFirst, + TRemoveLast, + TContains, + TFind, + TReverseFind, + TSubstring, + TSplitToList, + TJoinFromList, + TLevensteinDistance, + THumanReadableDuration, + TPrec, + TToByteList, + TFromByteList) +} + +REGISTER_MODULES(TStringModule) diff --git a/yql/essentials/udfs/common/string/test/canondata/result.json b/yql/essentials/udfs/common/string/test/canondata/result.json new file mode 100644 index 00000000000..f9e3a670c2c --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/result.json @@ -0,0 +1,112 @@ +{ + "test.test[AsciiChecks]": [ + { + "uri": "file://test.test_AsciiChecks_/results.txt" + } + ], + "test.test[Base32Decode]": [ + { + "uri": "file://test.test_Base32Decode_/results.txt" + } + ], + "test.test[Base32Encode]": [ + { + "uri": "file://test.test_Base32Encode_/results.txt" + } + ], + "test.test[BlockAsciiChecks]": [ + { + "uri": "file://test.test_BlockAsciiChecks_/results.txt" + } + ], + "test.test[BlockFind]": [ + { + "uri": "file://test.test_BlockFind_/results.txt" + } + ], + "test.test[BlockRemove]": [ + { + "uri": "file://test.test_BlockRemove_/results.txt" + } + ], + "test.test[BlockReplace]": [ + { + "uri": "file://test.test_BlockReplace_/results.txt" + } + ], + "test.test[BlockStreamFormat]": [ + { + "uri": "file://test.test_BlockStreamFormat_/results.txt" + } + ], + "test.test[BlockStringUDF]": [ + { + "uri": "file://test.test_BlockStringUDF_/results.txt" + } + ], + "test.test[BlockStringUnsafeUDF]": [ + { + "uri": "file://test.test_BlockStringUnsafeUDF_/results.txt" + } + ], + "test.test[BlockTo]": [ + { + "uri": "file://test.test_BlockTo_/results.txt" + } + ], + "test.test[ExtendAndTake]": [ + { + "uri": "file://test.test_ExtendAndTake_/results.txt" + } + ], + "test.test[Find]": [ + { + "uri": "file://test.test_Find_/results.txt" + } + ], + "test.test[List]": [ + { + "uri": "file://test.test_List_/results.txt" + } + ], + "test.test[List_v0]": [ + { + "uri": "file://test.test_List_v0_/results.txt" + } + ], + "test.test[Remove]": [ + { + "uri": "file://test.test_Remove_/results.txt" + } + ], + "test.test[ReplaceFirstLast]": [ + { + "uri": "file://test.test_ReplaceFirstLast_/results.txt" + } + ], + "test.test[Replace]": [ + { + "uri": "file://test.test_Replace_/results.txt" + } + ], + "test.test[StreamFormat]": [ + { + "uri": "file://test.test_StreamFormat_/results.txt" + } + ], + "test.test[StringUDF]": [ + { + "uri": "file://test.test_StringUDF_/results.txt" + } + ], + "test.test[StringUnsafeUDF]": [ + { + "uri": "file://test.test_StringUnsafeUDF_/results.txt" + } + ], + "test.test[To]": [ + { + "uri": "file://test.test_To_/results.txt" + } + ] +} diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_AsciiChecks_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_AsciiChecks_/results.txt new file mode 100644 index 00000000000..944b17d4c1e --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_AsciiChecks_/results.txt @@ -0,0 +1,124 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "isascii"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isspace"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isupper"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "islower"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isdigit"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isalpha"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isalnum"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "ishex"; + [ + "DataType"; + "Bool" + ] + ] + ] + ] + ]; + "Data" = [ + [ + %true; + %false; + %false; + %false; + %false; + %false; + %false; + %false + ]; + [ + %true; + %false; + %false; + %false; + %false; + %false; + %false; + %false + ]; + [ + %true; + %false; + %false; + %true; + %false; + %true; + %true; + %false + ]; + [ + %true; + %false; + %false; + %false; + %true; + %false; + %true; + %true + ]; + [ + %true; + %false; + %false; + %false; + %false; + %false; + %false; + %false + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_Base32Decode_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_Base32Decode_/results.txt new file mode 100644 index 00000000000..bf4aa56fa93 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_Base32Decode_/results.txt @@ -0,0 +1,79 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "strict_decoded"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "decoded"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + "ORSXG5A="; + [ + "test" + ]; + [ + "test" + ] + ]; + [ + "KRSXG5CUMVZXI==="; + [ + "TestTest" + ]; + [ + "TestTest" + ] + ]; + [ + "MFYHA3DF"; + [ + "apple" + ]; + [ + "apple" + ] + ]; + [ + "hmmmm===hmmmm"; + #; + [ + "\0\0\0" + ] + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_Base32Encode_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_Base32Encode_/results.txt new file mode 100644 index 00000000000..51c74759fc7 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_Base32Encode_/results.txt @@ -0,0 +1,44 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "encoded"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "test"; + "ORSXG5A=" + ]; + [ + "TestTest"; + "KRSXG5CUMVZXI===" + ]; + [ + "apple"; + "MFYHA3DF" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_BlockAsciiChecks_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockAsciiChecks_/results.txt new file mode 100644 index 00000000000..944b17d4c1e --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockAsciiChecks_/results.txt @@ -0,0 +1,124 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "isascii"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isspace"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isupper"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "islower"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isdigit"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isalpha"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "isalnum"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "ishex"; + [ + "DataType"; + "Bool" + ] + ] + ] + ] + ]; + "Data" = [ + [ + %true; + %false; + %false; + %false; + %false; + %false; + %false; + %false + ]; + [ + %true; + %false; + %false; + %false; + %false; + %false; + %false; + %false + ]; + [ + %true; + %false; + %false; + %true; + %false; + %true; + %true; + %false + ]; + [ + %true; + %false; + %false; + %false; + %true; + %false; + %true; + %true + ]; + [ + %true; + %false; + %false; + %false; + %false; + %false; + %false; + %false + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_BlockFind_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockFind_/results.txt new file mode 100644 index 00000000000..f6374e682e5 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockFind_/results.txt @@ -0,0 +1,69 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "contains"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "levenstein"; + [ + "DataType"; + "Uint64" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "fdsa"; + %false; + "3" + ]; + [ + "aswedfg"; + %true; + "5" + ]; + [ + "asdadsaasd"; + %true; + "8" + ]; + [ + "gdsfsassas"; + %true; + "8" + ]; + [ + ""; + %false; + "2" + ]; + [ + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + %false; + "23" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_BlockRemove_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockRemove_/results.txt new file mode 100644 index 00000000000..6fbf37a9f9b --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockRemove_/results.txt @@ -0,0 +1,173 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "all"; + [ + "DataType"; + "String" + ] + ]; + [ + "first"; + [ + "DataType"; + "String" + ] + ]; + [ + "last"; + [ + "DataType"; + "String" + ] + ]; + [ + "first2"; + [ + "DataType"; + "String" + ] + ]; + [ + "last2"; + [ + "DataType"; + "String" + ] + ]; + [ + "first3"; + [ + "DataType"; + "String" + ] + ]; + [ + "last3"; + [ + "DataType"; + "String" + ] + ]; + [ + "hwruall"; + [ + "DataType"; + "String" + ] + ]; + [ + "hwrufirst"; + [ + "DataType"; + "String" + ] + ]; + [ + "hwrulast"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "fdsa"; + "fd"; + "fds"; + "fds"; + "fda"; + "fds"; + "fdsa"; + "fdsa"; + "fdsa"; + "fdsa"; + "fdsa" + ]; + [ + "aswedfg"; + "wedfg"; + "swedfg"; + "swedfg"; + "swedfg"; + "awedfg"; + "aswedfg"; + "aswedfg"; + "aswedfg"; + "aswedfg"; + "aswedfg" + ]; + [ + "asdadsaasd"; + "ddd"; + "sdadsaasd"; + "asdadsasd"; + "sdadsaasd"; + "asdadsaad"; + "asdadsaasd"; + "asdadsaasd"; + "asdadsaasd"; + "asdadsaasd"; + "asdadsaasd" + ]; + [ + "gdsfsassas"; + "gdf"; + "gdsfsssas"; + "gdsfsasss"; + "gdfsassas"; + "gdsfsassa"; + "gdsfsassas"; + "gdsfsassas"; + "gdsfsassas"; + "gdsfsassas"; + "gdsfsassas" + ]; + [ + ""; + ""; + ""; + ""; + ""; + ""; + ""; + ""; + ""; + ""; + "" + ]; + [ + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!"; + "\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_BlockReplace_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockReplace_/results.txt new file mode 100644 index 00000000000..2ac3566c61d --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockReplace_/results.txt @@ -0,0 +1,134 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "all"; + [ + "DataType"; + "String" + ] + ]; + [ + "first"; + [ + "DataType"; + "String" + ] + ]; + [ + "last"; + [ + "DataType"; + "String" + ] + ]; + [ + "first2"; + [ + "DataType"; + "String" + ] + ]; + [ + "last2"; + [ + "DataType"; + "String" + ] + ]; + [ + "first3"; + [ + "DataType"; + "String" + ] + ]; + [ + "last3"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "fdsa"; + "fdsa"; + "fdsz"; + "fdsz"; + "fdszz"; + "fdszz"; + "fds"; + "fds" + ]; + [ + "aswedfg"; + "zzzwedfg"; + "zswedfg"; + "zswedfg"; + "zzswedfg"; + "zzswedfg"; + "swedfg"; + "swedfg" + ]; + [ + "asdadsaasd"; + "zzzdadsazzzd"; + "zsdadsaasd"; + "asdadsazsd"; + "zzsdadsaasd"; + "asdadsazzsd"; + "sdadsaasd"; + "asdadsasd" + ]; + [ + "gdsfsassas"; + "gdsfszzzszzz"; + "gdsfszssas"; + "gdsfsasszs"; + "gdsfszzssas"; + "gdsfsasszzs"; + "gdsfsssas"; + "gdsfsasss" + ]; + [ + ""; + ""; + ""; + ""; + ""; + ""; + ""; + "" + ]; + [ + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStreamFormat_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStreamFormat_/results.txt new file mode 100644 index 00000000000..b1bff8a57b8 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStreamFormat_/results.txt @@ -0,0 +1,208 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "right_pad"; + [ + "DataType"; + "String" + ] + ]; + [ + "left_pad"; + [ + "DataType"; + "String" + ] + ]; + [ + "right_pad_zero"; + [ + "DataType"; + "String" + ] + ]; + [ + "left_pad_zero"; + [ + "DataType"; + "String" + ] + ]; + [ + "hex"; + [ + "DataType"; + "String" + ] + ]; + [ + "shex"; + [ + "DataType"; + "String" + ] + ]; + [ + "bin"; + [ + "DataType"; + "String" + ] + ]; + [ + "sbin"; + [ + "DataType"; + "String" + ] + ]; + [ + "hex_text"; + [ + "DataType"; + "String" + ] + ]; + [ + "bin_text"; + [ + "DataType"; + "String" + ] + ]; + [ + "duration"; + [ + "DataType"; + "String" + ] + ]; + [ + "quantity"; + [ + "DataType"; + "String" + ] + ]; + [ + "bytes"; + [ + "DataType"; + "String" + ] + ]; + [ + "prec"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "qwertyui"; + "qwertyui "; + " qwertyui"; + "qwertyui000000000000"; + "000000000000qwertyui"; + "0x00000000499602D2"; + "-0x000000000000007B"; + "0b0000000000000000000000000000000001001001100101100000001011010010"; + "-0b0000000000000000000000000000000000000000000000000000000001111011"; + "71 77 65 72 74 79 75 69"; + "01110001 01110111 01100101 01110010 01110100 01111001 01110101 01101001"; + "20m 34s"; + "1.23G"; + "1.15GiB"; + "-0.009963" + ]; + [ + "asdfghjl"; + "asdfghjl "; + " asdfghjl"; + "asdfghjl000000000000"; + "000000000000asdfghjl"; + "0x000000024CB016EA"; + "-0x00000000000001C8"; + "0b0000000000000000000000000000001001001100101100000001011011101010"; + "-0b0000000000000000000000000000000000000000000000000000000111001000"; + "61 73 64 66 67 68 6A 6C"; + "01100001 01110011 01100100 01100110 01100111 01101000 01101010 01101100"; + "2h 44m 36s"; + "9.88G"; + "9.2GiB"; + "-0.03694" + ]; + [ + "zxcvbnm?"; + "zxcvbnm? "; + " zxcvbnm?"; + "zxcvbnm?000000000000"; + "000000000000zxcvbnm?"; + "0x00000002540BE3FF"; + "-0x0000000000000315"; + "0b0000000000000000000000000000001001010100000010111110001111111111"; + "-0b0000000000000000000000000000000000000000000000000000001100010101"; + "7A 78 63 76 62 6E 6D 3F"; + "01111010 01111000 01100011 01110110 01100010 01101110 01101101 00111111"; + "2h 46m 40s"; + "10G"; + "9.31GiB"; + "-0.06391" + ]; + [ + "12345678"; + "12345678 "; + " 12345678"; + "12345678000000000000"; + "00000000000012345678"; + "0x0000000000000000"; + "0x0000000000000000"; + "0b0000000000000000000000000000000000000000000000000000000000000000"; + "0b0000000000000000000000000000000000000000000000000000000000000000"; + "31 32 33 34 35 36 37 38"; + "00110001 00110010 00110011 00110100 00110101 00110110 00110111 00111000"; + "0us"; + "0"; + "0B"; + "0" + ]; + [ + "!@#$%^&*"; + "!@#$%^&* "; + " !@#$%^&*"; + "!@#$%^&*000000000000"; + "000000000000!@#$%^&*"; + "0x0000000223557439"; + "-0x00000000000003E7"; + "0b0000000000000000000000000000001000100011010101010111010000111001"; + "-0b0000000000000000000000000000000000000000000000000000001111100111"; + "21 40 23 24 25 5E 26 2A"; + "00100001 01000000 00100011 00100100 00100101 01011110 00100110 00101010"; + "2h 33m 2s"; + "9.18G"; + "8.55GiB"; + "-0.08092" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStringUDF_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStringUDF_/results.txt new file mode 100644 index 00000000000..a665105224f --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStringUDF_/results.txt @@ -0,0 +1,169 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "b32enc"; + [ + "DataType"; + "String" + ] + ]; + [ + "b64enc"; + [ + "DataType"; + "String" + ] + ]; + [ + "b64encu"; + [ + "DataType"; + "String" + ] + ]; + [ + "cesc"; + [ + "DataType"; + "String" + ] + ]; + [ + "cunesc"; + [ + "DataType"; + "String" + ] + ]; + [ + "xenc"; + [ + "DataType"; + "String" + ] + ]; + [ + "henc"; + [ + "DataType"; + "String" + ] + ]; + [ + "hdec"; + [ + "DataType"; + "String" + ] + ]; + [ + "cgesc"; + [ + "DataType"; + "String" + ] + ]; + [ + "cgunesc"; + [ + "DataType"; + "String" + ] + ]; + [ + "clps"; + [ + "DataType"; + "String" + ] + ]; + [ + "strp"; + [ + "DataType"; + "String" + ] + ]; + [ + "clpst"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "EAQCAILRO5SSA4TUPEQCAIDVNFXXAIC3EBOSI==="; + "ICAgIXF3ZSBydHkgICB1aW9wIFsgXSQ="; + "ICAgIXF3ZSBydHkgICB1aW9wIFsgXSQ,"; + " !qwe rty uiop [ ]$"; + " !qwe rty uiop [ ]$"; + "202020217177652072747920202075696F70205B205D24"; + " !qwe rty uiop [ ]$"; + " !qwe rty uiop [ ]$"; + "+++!qwe+rty+++uiop+%5B+%5D$"; + " !qwe rty uiop [ ]$"; + " !qwe rty uiop [ ]$"; + "!qwe rty uiop [ ]$"; + "!qwe ..." + ]; + [ + "IBQXGIBAEAQCAIBAMRTGO2BANJVWYXDOHMTSKIBA"; + "QGFzICAgICAgIGRmZ2ggamtsXG47JyUgIA=="; + "QGFzICAgICAgIGRmZ2ggamtsXG47JyUgIA,,"; + "@as dfgh jkl\\\\n;'% "; + "@as dfgh jkl\n;'% "; + "4061732020202020202064666768206A6B6C5C6E3B27252020"; + "@as dfgh jkl\\n;'% "; + "@as dfgh jkl\\n;'% "; + "@as+++++++dfgh+jkl%5Cn;%27%25++"; + "@as dfgh jkl\\n;'% "; + "@as dfgh jkl\\n;'% "; + "@as dfgh jkl\\n;'%"; + "@as ..." + ]; + [ + "EAQCAI32PBRQS5TCNYQASCQIEBWSYLRPH5PCAIBA"; + "ICAgI3p4Ywl2Ym4gCQoIIG0sLi8/XiAgIA=="; + "ICAgI3p4Ywl2Ym4gCQoIIG0sLi8_XiAgIA,,"; + " #zxc\\tvbn \\t\\n\\x08 m,./?^ "; + " #zxc\tvbn \t\n\x08 m,./?^ "; + "202020237A78630976626E20090A08206D2C2E2F3F5E202020"; + " #zxc\tvbn \t\n\x08 m,./?^ "; + " #zxc\tvbn \t\n\x08 m,./?^ "; + "+++%23zxc%09vbn+%09%0A%08+m%2C./%3F%5E+++"; + " #zxc\tvbn \t\n\x08 m,./?^ "; + " #zxc vbn \x08 m,./?^ "; + "#zxc\tvbn \t\n\x08 m,./?^"; + "#zxc ..." + ]; + [ + "GEQTEQBTEM2CINJFGZPDOJRYFI4SQMBJFVPT2KZMHQXD4==="; + "MSEyQDMjNCQ1JTZeNyY4KjkoMCktXz0rLDwuPg=="; + "MSEyQDMjNCQ1JTZeNyY4KjkoMCktXz0rLDwuPg,,"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "31213240332334243525365E3726382A392830292D5F3D2B2C3C2E3E"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@3%234$5%256%5E7%268*9%280%29-_%3D%2B%2C%3C.%3E"; + "1!2@3#4$5%6^7&8*9(0)-_= ,<.>"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@ ..." + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStringUnsafeUDF_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStringUnsafeUDF_/results.txt new file mode 100644 index 00000000000..26b182f9343 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockStringUnsafeUDF_/results.txt @@ -0,0 +1,158 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "b32dec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "b32sdec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "b64dec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "b64sdec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "xdec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + " !qwe rty uiop [ ]$" + ]; + [ + " !qwe rty uiop [ ]$" + ]; + [ + [ + "EAQCAILRO5SSA4TUPEQCAIDVNFXXAIC3EBOS" + ] + ]; + #; + # + ]; + [ + [ + [ + "QIAEXLvMggAcAECCAFgAQUALyg==" + ] + ]; + #; + [ + " !qwe rty uiop [ ]$" + ]; + [ + " !qwe rty uiop [ ]$" + ]; + # + ]; + [ + [ + [ + "0DQNA0D4P/93QP6/z4NA0DQP98Dxfg0DodA6PQ==" + ] + ]; + #; + #; + #; + [ + " !qwe rty uiop [ ]$" + ] + ]; + [ + [ + "@as dfgh jkl\\n;'% " + ]; + [ + "@as dfgh jkl\\n;'% " + ]; + [ + [ + "IBQXGIBAEAQCAIBAMRTGO2BANJVWYXDOHMTSKIBA" + ] + ]; + [ + [ + "IBQXGIBAEAQCAIBAMRTGO2BANJVWYXDOHMTSKIBA" + ] + ]; + # + ]; + [ + [ + [ + "gYoECABAgAQaIM6AAAAAubn0goBAAA==" + ] + ]; + #; + [ + "@as dfgh jkl\\n;'% " + ]; + [ + "@as dfgh jkl\\n;'% " + ]; + # + ]; + [ + [ + [ + "4DwP70DQNA0DQNA0D3Pe9/wNA8DwfC6LxNh1/XdA0A==" + ] + ]; + #; + #; + #; + [ + "@as dfgh jkl\\n;'% " + ] + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_BlockTo_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockTo_/results.txt new file mode 100644 index 00000000000..143cfb76417 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_BlockTo_/results.txt @@ -0,0 +1,88 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "ascii_lower"; + [ + "DataType"; + "String" + ] + ]; + [ + "ascii_upper"; + [ + "DataType"; + "String" + ] + ]; + [ + "ascii_title"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "test"; + "test"; + "TEST"; + "Test" + ]; + [ + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" + ]; + [ + "TeSt"; + "test"; + "TEST"; + "Test" + ]; + [ + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2" + ]; + [ + "Eyl\xC3\xBCl"; + "eyl\xC3\xBCl"; + "EYL\xC3\xBCL"; + "Eyl\xC3\xBCl" + ]; + [ + "6"; + "6"; + "6"; + "6" + ]; + [ + ""; + ""; + ""; + "" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_ExtendAndTake_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_ExtendAndTake_/results.txt new file mode 100644 index 00000000000..81269c68153 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_ExtendAndTake_/results.txt @@ -0,0 +1,60 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "column0"; + [ + "ListType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "column1"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + "a"; + "b"; + "c" + ]; + [ + "b" + ] + ]; + [ + [ + "d" + ]; + [ + "d" + ] + ]; + [ + []; + # + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_Find_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_Find_/results.txt new file mode 100644 index 00000000000..cec53212501 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_Find_/results.txt @@ -0,0 +1,147 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "contains"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "prefix"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "starts"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "suffix"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "ends"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "find"; + [ + "DataType"; + "Int64" + ] + ]; + [ + "rfind"; + [ + "DataType"; + "Int64" + ] + ]; + [ + "levenstein"; + [ + "DataType"; + "Uint64" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "fdsa"; + %false; + %false; + %false; + %false; + %false; + "-1"; + "-1"; + "3" + ]; + [ + "aswedfg"; + %true; + %true; + %true; + %false; + %false; + "0"; + "0"; + "5" + ]; + [ + "asdadsaasd"; + %true; + %true; + %true; + %false; + %false; + "0"; + "7"; + "8" + ]; + [ + "gdsfsassas"; + %true; + %false; + %false; + %true; + %true; + "5"; + "8"; + "8" + ]; + [ + ""; + %false; + %false; + %false; + %false; + %false; + "-1"; + "-1"; + "2" + ]; + [ + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + %false; + %false; + %false; + %false; + %false; + "-1"; + "-1"; + "23" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_List_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_List_/results.txt new file mode 100644 index 00000000000..dac9a135756 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_List_/results.txt @@ -0,0 +1,265 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "equals_to_original"; + [ + "DataType"; + "String" + ] + ]; + [ + "replace_delimeter"; + [ + "DataType"; + "String" + ] + ]; + [ + "just_split"; + [ + "ListType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "first"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "skip_empty"; + [ + "ListType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "multichar_delim_set"; + [ + "ListType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "multichar_delim_string"; + [ + "ListType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "limited"; + [ + "ListType"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + "a@b@c"; + "a@b@c"; + "a#b#c"; + [ + "a"; + "b"; + "c" + ]; + [ + "a" + ]; + [ + "a"; + "b"; + "c" + ]; + [ + "a"; + ""; + ""; + "c" + ]; + [ + "a@"; + "c" + ]; + [ + "a"; + "b@c" + ] + ]; + [ + "@a@b@c"; + "@a@b@c"; + "#a#b#c"; + [ + ""; + "a"; + "b"; + "c" + ]; + [ + "" + ]; + [ + "a"; + "b"; + "c" + ]; + [ + ""; + "a"; + ""; + ""; + "c" + ]; + [ + "@a@"; + "c" + ]; + [ + ""; + "a@b@c" + ] + ]; + [ + "@@@a@a"; + "@@@a@a"; + "###a#a"; + [ + ""; + ""; + ""; + "a"; + "a" + ]; + [ + "" + ]; + [ + "a"; + "a" + ]; + [ + ""; + ""; + ""; + "a"; + "a" + ]; + [ + "@@@a@a" + ]; + [ + ""; + "@@a@a" + ] + ]; + [ + "d#e#f"; + "d#e#f"; + "d#e#f"; + [ + "d#e#f" + ]; + [ + "d#e#f" + ]; + [ + "d#e#f" + ]; + [ + "d#e#f" + ]; + [ + "d#e#f" + ]; + [ + "d#e#f" + ] + ]; + [ + "d"; + "d"; + "d"; + [ + "d" + ]; + [ + "d" + ]; + [ + "d" + ]; + [ + "d" + ]; + [ + "d" + ]; + [ + "d" + ] + ]; + [ + ""; + ""; + ""; + [ + "" + ]; + [ + "" + ]; + []; + [ + "" + ]; + [ + "" + ]; + [ + "" + ] + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_List_v0_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_List_v0_/results.txt new file mode 100644 index 00000000000..b149ad38a60 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_List_v0_/results.txt @@ -0,0 +1,125 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "not_equals_to_original"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "not_equals_to_original_skip_empty"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "equals_to_original"; + [ + "DataType"; + "String" + ] + ]; + [ + "multichar"; + [ + "ListType"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + "a@b@c"; + #; + #; + "a@b@c"; + [ + "a"; + "b"; + "c" + ] + ]; + [ + "@a@b@c"; + #; + #; + "@a@b@c"; + [ + "a"; + "b"; + "c" + ] + ]; + [ + "@@@a@a"; + [ + "@@@a@a" + ]; + [ + "@@@a@a" + ]; + "@@@a@a"; + [ + "a"; + "a" + ] + ]; + [ + "d#e#f"; + #; + #; + "d#e#f"; + [ + "d"; + "e"; + "f" + ] + ]; + [ + "d"; + #; + #; + "d"; + [ + "d" + ] + ]; + [ + ""; + #; + #; + ""; + [] + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_Remove_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_Remove_/results.txt new file mode 100644 index 00000000000..6fbf37a9f9b --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_Remove_/results.txt @@ -0,0 +1,173 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "all"; + [ + "DataType"; + "String" + ] + ]; + [ + "first"; + [ + "DataType"; + "String" + ] + ]; + [ + "last"; + [ + "DataType"; + "String" + ] + ]; + [ + "first2"; + [ + "DataType"; + "String" + ] + ]; + [ + "last2"; + [ + "DataType"; + "String" + ] + ]; + [ + "first3"; + [ + "DataType"; + "String" + ] + ]; + [ + "last3"; + [ + "DataType"; + "String" + ] + ]; + [ + "hwruall"; + [ + "DataType"; + "String" + ] + ]; + [ + "hwrufirst"; + [ + "DataType"; + "String" + ] + ]; + [ + "hwrulast"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "fdsa"; + "fd"; + "fds"; + "fds"; + "fda"; + "fds"; + "fdsa"; + "fdsa"; + "fdsa"; + "fdsa"; + "fdsa" + ]; + [ + "aswedfg"; + "wedfg"; + "swedfg"; + "swedfg"; + "swedfg"; + "awedfg"; + "aswedfg"; + "aswedfg"; + "aswedfg"; + "aswedfg"; + "aswedfg" + ]; + [ + "asdadsaasd"; + "ddd"; + "sdadsaasd"; + "asdadsasd"; + "sdadsaasd"; + "asdadsaad"; + "asdadsaasd"; + "asdadsaasd"; + "asdadsaasd"; + "asdadsaasd"; + "asdadsaasd" + ]; + [ + "gdsfsassas"; + "gdf"; + "gdsfsssas"; + "gdsfsasss"; + "gdfsassas"; + "gdsfsassa"; + "gdsfsassas"; + "gdsfsassas"; + "gdsfsassas"; + "gdsfsassas"; + "gdsfsassas" + ]; + [ + ""; + ""; + ""; + ""; + ""; + ""; + ""; + ""; + ""; + ""; + "" + ]; + [ + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!"; + "\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_ReplaceFirstLast_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_ReplaceFirstLast_/results.txt new file mode 100644 index 00000000000..9320ac1c18a --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_ReplaceFirstLast_/results.txt @@ -0,0 +1,84 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "column0"; + [ + "DataType"; + "String" + ] + ]; + [ + "column1"; + [ + "DataType"; + "String" + ] + ]; + [ + "column2"; + [ + "DataType"; + "String" + ] + ]; + [ + "column3"; + [ + "DataType"; + "String" + ] + ]; + [ + "column4"; + [ + "DataType"; + "String" + ] + ]; + [ + "column5"; + [ + "DataType"; + "String" + ] + ]; + [ + "column6"; + [ + "DataType"; + "String" + ] + ]; + [ + "column7"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "gzas"; + "gzzzsas"; + "gsas"; + "gasas"; + "gasz"; + "gaszzzs"; + "gass"; + "gasas" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_Replace_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_Replace_/results.txt new file mode 100644 index 00000000000..2ac3566c61d --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_Replace_/results.txt @@ -0,0 +1,134 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "all"; + [ + "DataType"; + "String" + ] + ]; + [ + "first"; + [ + "DataType"; + "String" + ] + ]; + [ + "last"; + [ + "DataType"; + "String" + ] + ]; + [ + "first2"; + [ + "DataType"; + "String" + ] + ]; + [ + "last2"; + [ + "DataType"; + "String" + ] + ]; + [ + "first3"; + [ + "DataType"; + "String" + ] + ]; + [ + "last3"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "fdsa"; + "fdsa"; + "fdsz"; + "fdsz"; + "fdszz"; + "fdszz"; + "fds"; + "fds" + ]; + [ + "aswedfg"; + "zzzwedfg"; + "zswedfg"; + "zswedfg"; + "zzswedfg"; + "zzswedfg"; + "swedfg"; + "swedfg" + ]; + [ + "asdadsaasd"; + "zzzdadsazzzd"; + "zsdadsaasd"; + "asdadsazsd"; + "zzsdadsaasd"; + "asdadsazzsd"; + "sdadsaasd"; + "asdadsasd" + ]; + [ + "gdsfsassas"; + "gdsfszzzszzz"; + "gdsfszssas"; + "gdsfsasszs"; + "gdsfszzssas"; + "gdsfsasszzs"; + "gdsfsssas"; + "gdsfsasss" + ]; + [ + ""; + ""; + ""; + ""; + ""; + ""; + ""; + "" + ]; + [ + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`"; + "`\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80!`" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_StreamFormat_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_StreamFormat_/results.txt new file mode 100644 index 00000000000..b1bff8a57b8 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_StreamFormat_/results.txt @@ -0,0 +1,208 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "right_pad"; + [ + "DataType"; + "String" + ] + ]; + [ + "left_pad"; + [ + "DataType"; + "String" + ] + ]; + [ + "right_pad_zero"; + [ + "DataType"; + "String" + ] + ]; + [ + "left_pad_zero"; + [ + "DataType"; + "String" + ] + ]; + [ + "hex"; + [ + "DataType"; + "String" + ] + ]; + [ + "shex"; + [ + "DataType"; + "String" + ] + ]; + [ + "bin"; + [ + "DataType"; + "String" + ] + ]; + [ + "sbin"; + [ + "DataType"; + "String" + ] + ]; + [ + "hex_text"; + [ + "DataType"; + "String" + ] + ]; + [ + "bin_text"; + [ + "DataType"; + "String" + ] + ]; + [ + "duration"; + [ + "DataType"; + "String" + ] + ]; + [ + "quantity"; + [ + "DataType"; + "String" + ] + ]; + [ + "bytes"; + [ + "DataType"; + "String" + ] + ]; + [ + "prec"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "qwertyui"; + "qwertyui "; + " qwertyui"; + "qwertyui000000000000"; + "000000000000qwertyui"; + "0x00000000499602D2"; + "-0x000000000000007B"; + "0b0000000000000000000000000000000001001001100101100000001011010010"; + "-0b0000000000000000000000000000000000000000000000000000000001111011"; + "71 77 65 72 74 79 75 69"; + "01110001 01110111 01100101 01110010 01110100 01111001 01110101 01101001"; + "20m 34s"; + "1.23G"; + "1.15GiB"; + "-0.009963" + ]; + [ + "asdfghjl"; + "asdfghjl "; + " asdfghjl"; + "asdfghjl000000000000"; + "000000000000asdfghjl"; + "0x000000024CB016EA"; + "-0x00000000000001C8"; + "0b0000000000000000000000000000001001001100101100000001011011101010"; + "-0b0000000000000000000000000000000000000000000000000000000111001000"; + "61 73 64 66 67 68 6A 6C"; + "01100001 01110011 01100100 01100110 01100111 01101000 01101010 01101100"; + "2h 44m 36s"; + "9.88G"; + "9.2GiB"; + "-0.03694" + ]; + [ + "zxcvbnm?"; + "zxcvbnm? "; + " zxcvbnm?"; + "zxcvbnm?000000000000"; + "000000000000zxcvbnm?"; + "0x00000002540BE3FF"; + "-0x0000000000000315"; + "0b0000000000000000000000000000001001010100000010111110001111111111"; + "-0b0000000000000000000000000000000000000000000000000000001100010101"; + "7A 78 63 76 62 6E 6D 3F"; + "01111010 01111000 01100011 01110110 01100010 01101110 01101101 00111111"; + "2h 46m 40s"; + "10G"; + "9.31GiB"; + "-0.06391" + ]; + [ + "12345678"; + "12345678 "; + " 12345678"; + "12345678000000000000"; + "00000000000012345678"; + "0x0000000000000000"; + "0x0000000000000000"; + "0b0000000000000000000000000000000000000000000000000000000000000000"; + "0b0000000000000000000000000000000000000000000000000000000000000000"; + "31 32 33 34 35 36 37 38"; + "00110001 00110010 00110011 00110100 00110101 00110110 00110111 00111000"; + "0us"; + "0"; + "0B"; + "0" + ]; + [ + "!@#$%^&*"; + "!@#$%^&* "; + " !@#$%^&*"; + "!@#$%^&*000000000000"; + "000000000000!@#$%^&*"; + "0x0000000223557439"; + "-0x00000000000003E7"; + "0b0000000000000000000000000000001000100011010101010111010000111001"; + "-0b0000000000000000000000000000000000000000000000000000001111100111"; + "21 40 23 24 25 5E 26 2A"; + "00100001 01000000 00100011 00100100 00100101 01011110 00100110 00101010"; + "2h 33m 2s"; + "9.18G"; + "8.55GiB"; + "-0.08092" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_StringUDF_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_StringUDF_/results.txt new file mode 100644 index 00000000000..a665105224f --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_StringUDF_/results.txt @@ -0,0 +1,169 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "b32enc"; + [ + "DataType"; + "String" + ] + ]; + [ + "b64enc"; + [ + "DataType"; + "String" + ] + ]; + [ + "b64encu"; + [ + "DataType"; + "String" + ] + ]; + [ + "cesc"; + [ + "DataType"; + "String" + ] + ]; + [ + "cunesc"; + [ + "DataType"; + "String" + ] + ]; + [ + "xenc"; + [ + "DataType"; + "String" + ] + ]; + [ + "henc"; + [ + "DataType"; + "String" + ] + ]; + [ + "hdec"; + [ + "DataType"; + "String" + ] + ]; + [ + "cgesc"; + [ + "DataType"; + "String" + ] + ]; + [ + "cgunesc"; + [ + "DataType"; + "String" + ] + ]; + [ + "clps"; + [ + "DataType"; + "String" + ] + ]; + [ + "strp"; + [ + "DataType"; + "String" + ] + ]; + [ + "clpst"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "EAQCAILRO5SSA4TUPEQCAIDVNFXXAIC3EBOSI==="; + "ICAgIXF3ZSBydHkgICB1aW9wIFsgXSQ="; + "ICAgIXF3ZSBydHkgICB1aW9wIFsgXSQ,"; + " !qwe rty uiop [ ]$"; + " !qwe rty uiop [ ]$"; + "202020217177652072747920202075696F70205B205D24"; + " !qwe rty uiop [ ]$"; + " !qwe rty uiop [ ]$"; + "+++!qwe+rty+++uiop+%5B+%5D$"; + " !qwe rty uiop [ ]$"; + " !qwe rty uiop [ ]$"; + "!qwe rty uiop [ ]$"; + "!qwe ..." + ]; + [ + "IBQXGIBAEAQCAIBAMRTGO2BANJVWYXDOHMTSKIBA"; + "QGFzICAgICAgIGRmZ2ggamtsXG47JyUgIA=="; + "QGFzICAgICAgIGRmZ2ggamtsXG47JyUgIA,,"; + "@as dfgh jkl\\\\n;'% "; + "@as dfgh jkl\n;'% "; + "4061732020202020202064666768206A6B6C5C6E3B27252020"; + "@as dfgh jkl\\n;'% "; + "@as dfgh jkl\\n;'% "; + "@as+++++++dfgh+jkl%5Cn;%27%25++"; + "@as dfgh jkl\\n;'% "; + "@as dfgh jkl\\n;'% "; + "@as dfgh jkl\\n;'%"; + "@as ..." + ]; + [ + "EAQCAI32PBRQS5TCNYQASCQIEBWSYLRPH5PCAIBA"; + "ICAgI3p4Ywl2Ym4gCQoIIG0sLi8/XiAgIA=="; + "ICAgI3p4Ywl2Ym4gCQoIIG0sLi8_XiAgIA,,"; + " #zxc\\tvbn \\t\\n\\x08 m,./?^ "; + " #zxc\tvbn \t\n\x08 m,./?^ "; + "202020237A78630976626E20090A08206D2C2E2F3F5E202020"; + " #zxc\tvbn \t\n\x08 m,./?^ "; + " #zxc\tvbn \t\n\x08 m,./?^ "; + "+++%23zxc%09vbn+%09%0A%08+m%2C./%3F%5E+++"; + " #zxc\tvbn \t\n\x08 m,./?^ "; + " #zxc vbn \x08 m,./?^ "; + "#zxc\tvbn \t\n\x08 m,./?^"; + "#zxc ..." + ]; + [ + "GEQTEQBTEM2CINJFGZPDOJRYFI4SQMBJFVPT2KZMHQXD4==="; + "MSEyQDMjNCQ1JTZeNyY4KjkoMCktXz0rLDwuPg=="; + "MSEyQDMjNCQ1JTZeNyY4KjkoMCktXz0rLDwuPg,,"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "31213240332334243525365E3726382A392830292D5F3D2B2C3C2E3E"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@3%234$5%256%5E7%268*9%280%29-_%3D%2B%2C%3C.%3E"; + "1!2@3#4$5%6^7&8*9(0)-_= ,<.>"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@3#4$5%6^7&8*9(0)-_=+,<.>"; + "1!2@ ..." + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_StringUnsafeUDF_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_StringUnsafeUDF_/results.txt new file mode 100644 index 00000000000..26b182f9343 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_StringUnsafeUDF_/results.txt @@ -0,0 +1,158 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "b32dec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "b32sdec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "b64dec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "b64sdec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "xdec"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + " !qwe rty uiop [ ]$" + ]; + [ + " !qwe rty uiop [ ]$" + ]; + [ + [ + "EAQCAILRO5SSA4TUPEQCAIDVNFXXAIC3EBOS" + ] + ]; + #; + # + ]; + [ + [ + [ + "QIAEXLvMggAcAECCAFgAQUALyg==" + ] + ]; + #; + [ + " !qwe rty uiop [ ]$" + ]; + [ + " !qwe rty uiop [ ]$" + ]; + # + ]; + [ + [ + [ + "0DQNA0D4P/93QP6/z4NA0DQP98Dxfg0DodA6PQ==" + ] + ]; + #; + #; + #; + [ + " !qwe rty uiop [ ]$" + ] + ]; + [ + [ + "@as dfgh jkl\\n;'% " + ]; + [ + "@as dfgh jkl\\n;'% " + ]; + [ + [ + "IBQXGIBAEAQCAIBAMRTGO2BANJVWYXDOHMTSKIBA" + ] + ]; + [ + [ + "IBQXGIBAEAQCAIBAMRTGO2BANJVWYXDOHMTSKIBA" + ] + ]; + # + ]; + [ + [ + [ + "gYoECABAgAQaIM6AAAAAubn0goBAAA==" + ] + ]; + #; + [ + "@as dfgh jkl\\n;'% " + ]; + [ + "@as dfgh jkl\\n;'% " + ]; + # + ]; + [ + [ + [ + "4DwP70DQNA0DQNA0D3Pe9/wNA8DwfC6LxNh1/XdA0A==" + ] + ]; + #; + #; + #; + [ + "@as dfgh jkl\\n;'% " + ] + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/canondata/test.test_To_/results.txt b/yql/essentials/udfs/common/string/test/canondata/test.test_To_/results.txt new file mode 100644 index 00000000000..441e62fd21b --- /dev/null +++ b/yql/essentials/udfs/common/string/test/canondata/test.test_To_/results.txt @@ -0,0 +1,294 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "String" + ] + ]; + [ + "ascii_lower"; + [ + "DataType"; + "String" + ] + ]; + [ + "ascii_upper"; + [ + "DataType"; + "String" + ] + ]; + [ + "ascii_title"; + [ + "DataType"; + "String" + ] + ]; + [ + "lower"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "upper"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "title"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "reverse"; + [ + "OptionalType"; + [ + "DataType"; + "String" + ] + ] + ]; + [ + "byte_list"; + [ + "ListType"; + [ + "DataType"; + "Uint8" + ] + ] + ]; + [ + "from_byte_list"; + [ + "DataType"; + "String" + ] + ]; + [ + "from_lazy_byte_list"; + [ + "DataType"; + "String" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "test"; + "test"; + "TEST"; + "Test"; + [ + "test" + ]; + [ + "TEST" + ]; + [ + "Test" + ]; + [ + "tset" + ]; + [ + "116"; + "101"; + "115"; + "116" + ]; + "test"; + "test" + ]; + [ + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; + [ + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" + ]; + [ + "\xD0\xA2\xD0\x95\xD0\xA1\xD0\xA2" + ]; + [ + "\xD0\xA2\xD0\xB5\xD1\x81\xD1\x82" + ]; + [ + "\xD1\x82\xD1\x81\xD0\xB5\xD1\x82" + ]; + [ + "209"; + "130"; + "208"; + "181"; + "209"; + "129"; + "209"; + "130" + ]; + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" + ]; + [ + "TeSt"; + "test"; + "TEST"; + "Test"; + [ + "test" + ]; + [ + "TEST" + ]; + [ + "Test" + ]; + [ + "tSeT" + ]; + [ + "84"; + "101"; + "83"; + "116" + ]; + "TeSt"; + "TeSt" + ]; + [ + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; + [ + "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82" + ]; + [ + "\xD0\xA2\xD0\x95\xD0\xA1\xD0\xA2" + ]; + [ + "\xD0\xA2\xD0\xB5\xD1\x81\xD1\x82" + ]; + [ + "\xD0\xA2\xD1\x81\xD0\x95\xD1\x82" + ]; + [ + "209"; + "130"; + "208"; + "149"; + "209"; + "129"; + "208"; + "162" + ]; + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; + "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2" + ]; + [ + "Eyl\xC3\xBCl"; + "eyl\xC3\xBCl"; + "EYL\xC3\xBCL"; + "Eyl\xC3\xBCl"; + [ + "eyl\xC3\xBCl" + ]; + [ + "EYL\xC3\x9CL" + ]; + [ + "Eyl\xC3\xBCl" + ]; + [ + "l\xC3\xBClyE" + ]; + [ + "69"; + "121"; + "108"; + "195"; + "188"; + "108" + ]; + "Eyl\xC3\xBCl"; + "Eyl\xC3\xBCl" + ]; + [ + "6"; + "6"; + "6"; + "6"; + [ + "6" + ]; + [ + "6" + ]; + [ + "6" + ]; + [ + "6" + ]; + [ + "54" + ]; + "6"; + "6" + ]; + [ + ""; + ""; + ""; + ""; + [ + "" + ]; + [ + "" + ]; + [ + "" + ]; + [ + "" + ]; + []; + ""; + "" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/string/test/cases/AsciiChecks.in b/yql/essentials/udfs/common/string/test/cases/AsciiChecks.in new file mode 100644 index 00000000000..26a46b0f6c6 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/AsciiChecks.in @@ -0,0 +1,5 @@ +{"value"="qweRTY123$%?"}; +{"value"="asdFGHjkl:'|"}; +{"value"="zxcvbnm"}; +{"value"="1234567890"}; +{"value"="!@#$%^&*()_+{}"}; diff --git a/yql/essentials/udfs/common/string/test/cases/AsciiChecks.sql b/yql/essentials/udfs/common/string/test/cases/AsciiChecks.sql new file mode 100644 index 00000000000..f6e74d87462 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/AsciiChecks.sql @@ -0,0 +1,10 @@ +SELECT + String::IsAscii(value) as isascii, + String::IsAsciiSpace(value) as isspace, + String::IsAsciiUpper(value) as isupper, + String::IsAsciiLower(value) as islower, + String::IsAsciiDigit(value) as isdigit, + String::IsAsciiAlpha(value) as isalpha, + String::IsAsciiAlnum(value) as isalnum, + String::IsAsciiHex(value) as ishex +FROM Input diff --git a/yql/essentials/udfs/common/string/test/cases/Base32Decode.in b/yql/essentials/udfs/common/string/test/cases/Base32Decode.in new file mode 100644 index 00000000000..34af8b23d47 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/Base32Decode.in @@ -0,0 +1,4 @@ +{"key"="1";subkey="";"value"="ORSXG5A="}; +{"key"="2";subkey="";"value"="KRSXG5CUMVZXI==="}; +{"key"="3";subkey="";"value"="MFYHA3DF"}; +{"key"="4";subkey="";"value"="hmmmm===hmmmm"}; diff --git a/yql/essentials/udfs/common/string/test/cases/Base32Decode.sql b/yql/essentials/udfs/common/string/test/cases/Base32Decode.sql new file mode 100644 index 00000000000..51b47ec1665 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/Base32Decode.sql @@ -0,0 +1,6 @@ +/* syntax version 1 */ +SELECT + value, + String::Base32StrictDecode(value) AS strict_decoded, + String::Base32Decode(value) AS decoded +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/Base32Encode.in b/yql/essentials/udfs/common/string/test/cases/Base32Encode.in new file mode 100644 index 00000000000..c0051d04efd --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/Base32Encode.in @@ -0,0 +1,3 @@ +{"key"="1";subkey="";"value"="test"}; +{"key"="2";subkey="";"value"="TestTest"}; +{"key"="3";subkey="";"value"="apple"}; diff --git a/yql/essentials/udfs/common/string/test/cases/Base32Encode.sql b/yql/essentials/udfs/common/string/test/cases/Base32Encode.sql new file mode 100644 index 00000000000..1ff9e3e4078 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/Base32Encode.sql @@ -0,0 +1,5 @@ +/* syntax version 1 */ +SELECT + value, + String::Base32Encode(value) AS encoded +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockAsciiChecks.in b/yql/essentials/udfs/common/string/test/cases/BlockAsciiChecks.in new file mode 100644 index 00000000000..26a46b0f6c6 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockAsciiChecks.in @@ -0,0 +1,5 @@ +{"value"="qweRTY123$%?"}; +{"value"="asdFGHjkl:'|"}; +{"value"="zxcvbnm"}; +{"value"="1234567890"}; +{"value"="!@#$%^&*()_+{}"}; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockAsciiChecks.sql b/yql/essentials/udfs/common/string/test/cases/BlockAsciiChecks.sql new file mode 100644 index 00000000000..d8bf9e942be --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockAsciiChecks.sql @@ -0,0 +1,13 @@ +/* XXX: Enable UseBlocks pragma and provide input to trigger block execution. */ +PRAGMA UseBlocks; + +SELECT + String::IsAscii(value) as isascii, + String::IsAsciiSpace(value) as isspace, + String::IsAsciiUpper(value) as isupper, + String::IsAsciiLower(value) as islower, + String::IsAsciiDigit(value) as isdigit, + String::IsAsciiAlpha(value) as isalpha, + String::IsAsciiAlnum(value) as isalnum, + String::IsAsciiHex(value) as ishex +FROM Input diff --git a/yql/essentials/udfs/common/string/test/cases/BlockFind.sql b/yql/essentials/udfs/common/string/test/cases/BlockFind.sql new file mode 100644 index 00000000000..f1c855bcc11 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockFind.sql @@ -0,0 +1,7 @@ +/* syntax version 1 */ +pragma UseBlocks; +SELECT + value, + String::Contains(value, "as") AS contains, + String::LevensteinDistance(value, "as") AS levenstein +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockRemove.sql b/yql/essentials/udfs/common/string/test/cases/BlockRemove.sql new file mode 100644 index 00000000000..4c285b78d07 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockRemove.sql @@ -0,0 +1,16 @@ +/* XXX: Enable UseBlocks pragma and provide input to trigger block execution. */ +PRAGMA UseBlocks; + +SELECT + value, + String::RemoveAll(value, "as") AS all, + String::RemoveFirst(value, "a") AS first, + String::RemoveLast(value, "a") AS last, + String::RemoveFirst(value, "as") AS first2, + String::RemoveLast(value, "as") AS last2, + String::RemoveFirst(value, "") AS first3, + String::RemoveLast(value, "") AS last3, + String::RemoveAll(value, "`") AS hwruall, + String::RemoveFirst(value, "`") AS hwrufirst, + String::RemoveLast(value, "`") AS hwrulast, +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockReplace.sql b/yql/essentials/udfs/common/string/test/cases/BlockReplace.sql new file mode 100644 index 00000000000..030e36050cd --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockReplace.sql @@ -0,0 +1,13 @@ +/* XXX: Enable UseBlocks pragma and provide input to trigger block execution. */ +PRAGMA UseBlocks; + +SELECT + value, + String::ReplaceAll(value, "as", "zzz") AS all, + String::ReplaceFirst(value, "a", "z") AS first, + String::ReplaceLast(value, "a", "z") AS last, + String::ReplaceFirst(value, "a", "zz") AS first2, + String::ReplaceLast(value, "a", "zz") AS last2, + String::ReplaceFirst(value, "a", "") AS first3, + String::ReplaceLast(value, "a", "") AS last3 +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.in b/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.in new file mode 100644 index 00000000000..1a446c4e488 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.in @@ -0,0 +1,5 @@ +{"key"="1";"subkey"="1";"value"="qwertyui";"biguint"=1234567890u;"negint"=-123}; +{"key"="2";"subkey"="2";"value"="asdfghjl";"biguint"=9876543210u;"negint"=-456}; +{"key"="3";"subkey"="3";"value"="zxcvbnm?";"biguint"=9999999999u;"negint"=-789}; +{"key"="4";"subkey"="4";"value"="12345678";"biguint"=0000000000u;"negint"=-000}; +{"key"="5";"subkey"="5";"value"="!@#$%^&*";"biguint"=9182737465u;"negint"=-999}; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.in.attr b/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.in.attr new file mode 100644 index 00000000000..bbc040040c8 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.in.attr @@ -0,0 +1,9 @@ +{"_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"String"]]; + ["subkey";["DataType";"String"]]; + ["value";["DataType";"String"]]; + ["biguint";["DataType";"Uint64"]]; + ["negint";["DataType";"Int64"]] + ]]; +}} diff --git a/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.sql b/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.sql new file mode 100644 index 00000000000..8b61758a964 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockStreamFormat.sql @@ -0,0 +1,20 @@ +/* XXX: Enable UseBlocks pragma and provide input to trigger block execution. */ +PRAGMA UseBlocks; + +SELECT + value, + String::RightPad(value, 20) AS right_pad, + String::LeftPad(value, 20) AS left_pad, + String::RightPad(value, 20, "0") AS right_pad_zero, + String::LeftPad(value, 20, "0") AS left_pad_zero, + String::Hex(biguint) AS hex, + String::SHex(negint) AS shex, + String::Bin(biguint) AS bin, + String::SBin(negint) AS sbin, + String::HexText(value) AS hex_text, + String::BinText(value) AS bin_text, + String::HumanReadableDuration(biguint) AS duration, + String::HumanReadableQuantity(biguint) AS quantity, + String::HumanReadableBytes(biguint) AS bytes, + String::Prec(negint / 12345.6789, 4) AS prec +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockStringUDF.in b/yql/essentials/udfs/common/string/test/cases/BlockStringUDF.in new file mode 100644 index 00000000000..a9d378e0590 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockStringUDF.in @@ -0,0 +1,4 @@ +{"value"=" !qwe rty uiop [ ]$"}; +{"value"="@as dfgh jkl\\n;'\% "}; +{"value"=" #zxc\tvbn \t\n\b m,./?^ "}; +{"value"="1!2@3#4$5%6^7&8*9(0)-_=+,<.>"}; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockStringUDF.sql b/yql/essentials/udfs/common/string/test/cases/BlockStringUDF.sql new file mode 100644 index 00000000000..1f96f5d62b0 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockStringUDF.sql @@ -0,0 +1,18 @@ +/* XXX: Enable UseBlocks pragma and provide input to trigger block execution. */ +PRAGMA UseBlocks; + +SELECT + String::Base32Encode(value) as b32enc, + String::Base64Encode(value) as b64enc, + String::Base64EncodeUrl(value) as b64encu, + String::EscapeC(value) as cesc, + String::UnescapeC(value) as cunesc, + String::HexEncode(value) as xenc, + String::EncodeHtml(value) as henc, + String::DecodeHtml(value) as hdec, + String::CgiEscape(value) as cgesc, + String::CgiUnescape(value) as cgunesc, + String::Collapse(value) as clps, + String::Strip(value) as strp, + String::CollapseText(value, 9) as clpst, +FROM Input diff --git a/yql/essentials/udfs/common/string/test/cases/BlockStringUnsafeUDF.in b/yql/essentials/udfs/common/string/test/cases/BlockStringUnsafeUDF.in new file mode 100644 index 00000000000..2c15dd67ac6 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockStringUnsafeUDF.in @@ -0,0 +1,6 @@ +{"value"="EAQCAILRO5SSA4TUPEQCAIDVNFXXAIC3EBOSI==="}; +{"value"="ICAgIXF3ZSBydHkgICB1aW9wIFsgXSQ="}; +{"value"="202020217177652072747920202075696F70205B205D24"}; +{"value"="IBQXGIBAEAQCAIBAMRTGO2BANJVWYXDOHMTSKIBA"}; +{"value"="QGFzICAgICAgIGRmZ2ggamtsXG47JyUgIA,,"}; +{"value"="4061732020202020202064666768206A6B6C5C6E3B27252020"}; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockStringUnsafeUDF.sql b/yql/essentials/udfs/common/string/test/cases/BlockStringUnsafeUDF.sql new file mode 100644 index 00000000000..82f82f50d9d --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockStringUnsafeUDF.sql @@ -0,0 +1,10 @@ +/* XXX: Enable UseBlocks pragma and provide input to trigger block execution. */ +PRAGMA UseBlocks; + +SELECT + String::Base32Decode(value) as b32dec, + String::Base32StrictDecode(value) AS b32sdec, + String::Base64Decode(value) as b64dec, + String::Base64StrictDecode(value) AS b64sdec, + String::HexDecode(value) as xdec, +FROM Input diff --git a/yql/essentials/udfs/common/string/test/cases/BlockTo.in b/yql/essentials/udfs/common/string/test/cases/BlockTo.in new file mode 100644 index 00000000000..93a00f7db8d --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockTo.in @@ -0,0 +1,7 @@ +{"key"="1";"subkey"="1";"value"="test"}; +{"key"="2";"subkey"="2";"value"="\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"}; +{"key"="3";"subkey"="3";"value"="TeSt"}; +{"key"="4";"subkey"="4";"value"="\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"}; +{"key"="5";"subkey"="5";"value"="Eyl\xC3\xBCl"}; +{"key"="6";"subkey"="6";"value"="6"}; +{"key"="4";"subkey"="4";"value"=""}; diff --git a/yql/essentials/udfs/common/string/test/cases/BlockTo.sql b/yql/essentials/udfs/common/string/test/cases/BlockTo.sql new file mode 100644 index 00000000000..628febe899e --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/BlockTo.sql @@ -0,0 +1,9 @@ +/* XXX: Enable UseBlocks pragma and provide input to trigger block execution. */ +PRAGMA UseBlocks; + +SELECT + value, + String::AsciiToLower(value) AS ascii_lower, + String::AsciiToUpper(value) AS ascii_upper, + String::AsciiToTitle(value) AS ascii_title, +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/ExtendAndTake.in b/yql/essentials/udfs/common/string/test/cases/ExtendAndTake.in new file mode 100644 index 00000000000..27fc322b1ae --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/ExtendAndTake.in @@ -0,0 +1,3 @@ +{"key"="1";"subkey"="1";"value"="a b c"}; +{"key"="2";"subkey"="2";"value"="d"}; +{"key"="3";"subkey"="3";"value"=""}; diff --git a/yql/essentials/udfs/common/string/test/cases/ExtendAndTake.sql b/yql/essentials/udfs/common/string/test/cases/ExtendAndTake.sql new file mode 100644 index 00000000000..2dab551eb1c --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/ExtendAndTake.sql @@ -0,0 +1,10 @@ +/* syntax version 1 */ + +$split = ($row) -> { + return String::SplitToList($row.value, " ", true AS SkipEmpty, false AS DelimeterString); +}; + +SELECT + $split(TableRow()), + ListExtend($split(TableRow()), $split(TableRow()))[1] +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/Find.sql b/yql/essentials/udfs/common/string/test/cases/Find.sql new file mode 100644 index 00000000000..273553dcf9e --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/Find.sql @@ -0,0 +1,12 @@ +/* syntax version 1 */ +SELECT + value, + String::Contains(value, "as") AS contains, + String::HasPrefix(value, "as") AS prefix, + String::StartsWith(value, "as") AS starts, + String::HasSuffix(value, "as") AS suffix, + String::EndsWith(value, "as") AS ends, + String::Find(value, "as") AS find, + String::ReverseFind(value, "as") AS rfind, + String::LevensteinDistance(value, "as") AS levenstein +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/List.in b/yql/essentials/udfs/common/string/test/cases/List.in new file mode 100644 index 00000000000..949cf26c776 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/List.in @@ -0,0 +1,6 @@ +{"key"="1";"subkey"="1";"value"="a@b@c"}; +{"key"="1";"subkey"="1";"value"="@a@b@c"}; +{"key"="1";"subkey"="1";"value"="@@@a@a"}; +{"key"="2";"subkey"="2";"value"="d#e#f"}; +{"key"="3";"subkey"="3";"value"="d"}; +{"key"="4";"subkey"="4";"value"=""}; diff --git a/yql/essentials/udfs/common/string/test/cases/List.sql b/yql/essentials/udfs/common/string/test/cases/List.sql new file mode 100644 index 00000000000..42b983074e5 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/List.sql @@ -0,0 +1,12 @@ +/* syntax version 1 */ +SELECT + value, + Ensure(value, String::JoinFromList(String::SplitToList(value, "@"), "@") == value) AS equals_to_original, + String::JoinFromList(String::SplitToList(value, "@"), "#") AS replace_delimeter, + String::SplitToList(value, "@") AS just_split, + String::SplitToList(value, "@")[0] as first, + String::SplitToList(value, "@", true AS SkipEmpty) AS skip_empty, + String::SplitToList(value, "b@", false AS DelimeterString) AS multichar_delim_set, + String::SplitToList(value, "b@", true AS DelimeterString) AS multichar_delim_string, + String::SplitToList(value, "@", 1 AS Limit) AS limited +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/List_v0.in b/yql/essentials/udfs/common/string/test/cases/List_v0.in new file mode 100644 index 00000000000..949cf26c776 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/List_v0.in @@ -0,0 +1,6 @@ +{"key"="1";"subkey"="1";"value"="a@b@c"}; +{"key"="1";"subkey"="1";"value"="@a@b@c"}; +{"key"="1";"subkey"="1";"value"="@@@a@a"}; +{"key"="2";"subkey"="2";"value"="d#e#f"}; +{"key"="3";"subkey"="3";"value"="d"}; +{"key"="4";"subkey"="4";"value"=""}; diff --git a/yql/essentials/udfs/common/string/test/cases/List_v0.sql b/yql/essentials/udfs/common/string/test/cases/List_v0.sql new file mode 100644 index 00000000000..36d984dc6a8 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/List_v0.sql @@ -0,0 +1,27 @@ +/* syntax version 1 */ +-- use SplitToList settings which are used as defaults in v0 syntax +SELECT + value, + IF ( + String::Contains(value, "@@"), + Ensure( + value, + String::JoinFromList(String::SplitToList(value, "@", true AS SkipEmpty, false AS DelimeterString), "@") != value, + value + ) + ) AS not_equals_to_original, + IF ( + String::Contains(value, "@@"), + Ensure( + value, + String::JoinFromList(String::SplitToList(value, "@", true AS SkipEmpty, false AS DelimeterString), "@") != value, + value + ) + ) AS not_equals_to_original_skip_empty, + Ensure( + value, + String::JoinFromList(String::SplitToList(value, "@", false AS SkipEmpty, false AS DelimeterString), "@") == value, + value + ) AS equals_to_original, + String::SplitToList(value, "@#", true AS SkipEmpty, false AS DelimeterString) AS multichar +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/Remove.sql b/yql/essentials/udfs/common/string/test/cases/Remove.sql new file mode 100644 index 00000000000..8bfe2c92e26 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/Remove.sql @@ -0,0 +1,14 @@ +/* syntax version 1 */ +SELECT + value, + String::RemoveAll(value, "as") AS all, + String::RemoveFirst(value, "a") AS first, + String::RemoveLast(value, "a") AS last, + String::RemoveFirst(value, "as") AS first2, + String::RemoveLast(value, "as") AS last2, + String::RemoveFirst(value, "") AS first3, + String::RemoveLast(value, "") AS last3, + String::RemoveAll(value, "`") AS hwruall, + String::RemoveFirst(value, "`") AS hwrufirst, + String::RemoveLast(value, "`") AS hwrulast, +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/Replace.sql b/yql/essentials/udfs/common/string/test/cases/Replace.sql new file mode 100644 index 00000000000..0eea32a3e41 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/Replace.sql @@ -0,0 +1,11 @@ +/* syntax version 1 */ +SELECT + value, + String::ReplaceAll(value, "as", "zzz") AS all, + String::ReplaceFirst(value, "a", "z") AS first, + String::ReplaceLast(value, "a", "z") AS last, + String::ReplaceFirst(value, "a", "zz") AS first2, + String::ReplaceLast(value, "a", "zz") AS last2, + String::ReplaceFirst(value, "a", "") AS first3, + String::ReplaceLast(value, "a", "") AS last3 +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/ReplaceFirstLast.sql b/yql/essentials/udfs/common/string/test/cases/ReplaceFirstLast.sql new file mode 100644 index 00000000000..6a83400d424 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/ReplaceFirstLast.sql @@ -0,0 +1,10 @@ +SELECT + String::ReplaceFirst("gasas", "as", "z"), + String::ReplaceFirst("gasas", "a", "zzz"), + String::ReplaceFirst("gasas", "a", ""), + String::ReplaceFirst("gasas", "e", "z"), + String::ReplaceLast("gasas", "as", "z"), + String::ReplaceLast("gasas", "a", "zzz"), + String::ReplaceLast("gasas", "a", ""), + String::ReplaceLast("gasas", "k", "ey"); + diff --git a/yql/essentials/udfs/common/string/test/cases/StreamFormat.in b/yql/essentials/udfs/common/string/test/cases/StreamFormat.in new file mode 100644 index 00000000000..1a446c4e488 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/StreamFormat.in @@ -0,0 +1,5 @@ +{"key"="1";"subkey"="1";"value"="qwertyui";"biguint"=1234567890u;"negint"=-123}; +{"key"="2";"subkey"="2";"value"="asdfghjl";"biguint"=9876543210u;"negint"=-456}; +{"key"="3";"subkey"="3";"value"="zxcvbnm?";"biguint"=9999999999u;"negint"=-789}; +{"key"="4";"subkey"="4";"value"="12345678";"biguint"=0000000000u;"negint"=-000}; +{"key"="5";"subkey"="5";"value"="!@#$%^&*";"biguint"=9182737465u;"negint"=-999}; diff --git a/yql/essentials/udfs/common/string/test/cases/StreamFormat.in.attr b/yql/essentials/udfs/common/string/test/cases/StreamFormat.in.attr new file mode 100644 index 00000000000..bbc040040c8 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/StreamFormat.in.attr @@ -0,0 +1,9 @@ +{"_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"String"]]; + ["subkey";["DataType";"String"]]; + ["value";["DataType";"String"]]; + ["biguint";["DataType";"Uint64"]]; + ["negint";["DataType";"Int64"]] + ]]; +}} diff --git a/yql/essentials/udfs/common/string/test/cases/StreamFormat.sql b/yql/essentials/udfs/common/string/test/cases/StreamFormat.sql new file mode 100644 index 00000000000..46ee9a7c688 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/StreamFormat.sql @@ -0,0 +1,19 @@ +/* syntax version 1 */ + +SELECT + value, + String::RightPad(value, 20) AS right_pad, + String::LeftPad(value, 20) AS left_pad, + String::RightPad(value, 20, "0") AS right_pad_zero, + String::LeftPad(value, 20, "0") AS left_pad_zero, + String::Hex(biguint) AS hex, + String::SHex(negint) AS shex, + String::Bin(biguint) AS bin, + String::SBin(negint) AS sbin, + String::HexText(value) AS hex_text, + String::BinText(value) AS bin_text, + String::HumanReadableDuration(biguint) AS duration, + String::HumanReadableQuantity(biguint) AS quantity, + String::HumanReadableBytes(biguint) AS bytes, + String::Prec(negint / 12345.6789, 4) AS prec +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/StringUDF.in b/yql/essentials/udfs/common/string/test/cases/StringUDF.in new file mode 100644 index 00000000000..a9d378e0590 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/StringUDF.in @@ -0,0 +1,4 @@ +{"value"=" !qwe rty uiop [ ]$"}; +{"value"="@as dfgh jkl\\n;'\% "}; +{"value"=" #zxc\tvbn \t\n\b m,./?^ "}; +{"value"="1!2@3#4$5%6^7&8*9(0)-_=+,<.>"}; diff --git a/yql/essentials/udfs/common/string/test/cases/StringUDF.sql b/yql/essentials/udfs/common/string/test/cases/StringUDF.sql new file mode 100644 index 00000000000..77af707acb0 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/StringUDF.sql @@ -0,0 +1,15 @@ +SELECT + String::Base32Encode(value) as b32enc, + String::Base64Encode(value) as b64enc, + String::Base64EncodeUrl(value) as b64encu, + String::EscapeC(value) as cesc, + String::UnescapeC(value) as cunesc, + String::HexEncode(value) as xenc, + String::EncodeHtml(value) as henc, + String::DecodeHtml(value) as hdec, + String::CgiEscape(value) as cgesc, + String::CgiUnescape(value) as cgunesc, + String::Collapse(value) as clps, + String::Strip(value) as strp, + String::CollapseText(value, 9) as clpst, +FROM Input diff --git a/yql/essentials/udfs/common/string/test/cases/StringUnsafeUDF.in b/yql/essentials/udfs/common/string/test/cases/StringUnsafeUDF.in new file mode 100644 index 00000000000..2c15dd67ac6 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/StringUnsafeUDF.in @@ -0,0 +1,6 @@ +{"value"="EAQCAILRO5SSA4TUPEQCAIDVNFXXAIC3EBOSI==="}; +{"value"="ICAgIXF3ZSBydHkgICB1aW9wIFsgXSQ="}; +{"value"="202020217177652072747920202075696F70205B205D24"}; +{"value"="IBQXGIBAEAQCAIBAMRTGO2BANJVWYXDOHMTSKIBA"}; +{"value"="QGFzICAgICAgIGRmZ2ggamtsXG47JyUgIA,,"}; +{"value"="4061732020202020202064666768206A6B6C5C6E3B27252020"}; diff --git a/yql/essentials/udfs/common/string/test/cases/StringUnsafeUDF.sql b/yql/essentials/udfs/common/string/test/cases/StringUnsafeUDF.sql new file mode 100644 index 00000000000..dab39cbd391 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/StringUnsafeUDF.sql @@ -0,0 +1,7 @@ +SELECT + String::Base32Decode(value) as b32dec, + String::Base32StrictDecode(value) AS b32sdec, + String::Base64Decode(value) as b64dec, + String::Base64StrictDecode(value) AS b64sdec, + String::HexDecode(value) as xdec, +FROM Input diff --git a/yql/essentials/udfs/common/string/test/cases/To.in b/yql/essentials/udfs/common/string/test/cases/To.in new file mode 100644 index 00000000000..93a00f7db8d --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/To.in @@ -0,0 +1,7 @@ +{"key"="1";"subkey"="1";"value"="test"}; +{"key"="2";"subkey"="2";"value"="\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"}; +{"key"="3";"subkey"="3";"value"="TeSt"}; +{"key"="4";"subkey"="4";"value"="\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"}; +{"key"="5";"subkey"="5";"value"="Eyl\xC3\xBCl"}; +{"key"="6";"subkey"="6";"value"="6"}; +{"key"="4";"subkey"="4";"value"=""}; diff --git a/yql/essentials/udfs/common/string/test/cases/To.sql b/yql/essentials/udfs/common/string/test/cases/To.sql new file mode 100644 index 00000000000..a7faf41efe6 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/To.sql @@ -0,0 +1,14 @@ +/* syntax version 1 */ +SELECT + value, + String::AsciiToLower(value) AS ascii_lower, + String::AsciiToUpper(value) AS ascii_upper, + String::AsciiToTitle(value) AS ascii_title, + String::ToLower(value) AS lower, + String::ToUpper(value) AS upper, + String::ToTitle(value) AS title, + String::Reverse(value) AS reverse, + String::ToByteList(value) AS byte_list, + String::FromByteList(String::ToByteList(value)) AS from_byte_list, + String::FromByteList(YQL::LazyList(String::ToByteList(value))) AS from_lazy_byte_list +FROM Input; diff --git a/yql/essentials/udfs/common/string/test/cases/default.in b/yql/essentials/udfs/common/string/test/cases/default.in new file mode 100644 index 00000000000..182158fdf67 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/cases/default.in @@ -0,0 +1,6 @@ +{"key"="1";"subkey"="1";"value"="fdsa"}; +{"key"="2";"subkey"="2";"value"="aswedfg"}; +{"key"="3";"subkey"="3";"value"="asdadsaasd"}; +{"key"="4";"subkey"="4";"value"="gdsfsassas"}; +{"key"="5";"subkey"="5";"value"=""}; +{"key"="6";"subkey"="6";"value"="`Привет, мир!`"}; diff --git a/yql/essentials/udfs/common/string/test/ya.make b/yql/essentials/udfs/common/string/test/ya.make new file mode 100644 index 00000000000..87d8b667780 --- /dev/null +++ b/yql/essentials/udfs/common/string/test/ya.make @@ -0,0 +1,13 @@ +YQL_UDF_TEST_CONTRIB() + +DEPENDS(yql/essentials/udfs/common/string) + +TIMEOUT(300) + +SIZE(MEDIUM) + +IF (SANITIZER_TYPE == "memory") + TAG(ya:not_autocheck) # YQL-15385 +ENDIF() + +END() diff --git a/yql/essentials/udfs/common/string/ya.make b/yql/essentials/udfs/common/string/ya.make new file mode 100644 index 00000000000..12ae827ad17 --- /dev/null +++ b/yql/essentials/udfs/common/string/ya.make @@ -0,0 +1,38 @@ +IF (YQL_PACKAGED) + PACKAGE() + FROM_SANDBOX(FILE 7319905679 OUT_NOAUTO libstring_udf.so + ) + END() +ELSE () +YQL_UDF_CONTRIB(string_udf) + + YQL_ABI_VERSION( + 2 + 37 + 0 + ) + + SRCS( + string_udf.cpp + ) + + PEERDIR( + yql/essentials/public/udf/arrow + library/cpp/charset + library/cpp/deprecated/split + library/cpp/html/pcdata + library/cpp/string_utils/base32 + library/cpp/string_utils/base64 + library/cpp/string_utils/levenshtein_diff + library/cpp/string_utils/quote + ) + + END() +ENDIF () + + +RECURSE_FOR_TESTS( + test +) + + |