diff options
author | vvvv <[email protected]> | 2025-10-06 13:26:25 +0300 |
---|---|---|
committer | vvvv <[email protected]> | 2025-10-06 14:06:25 +0300 |
commit | eca8ce9cb1613d5c983185c4e43c20651a9638aa (patch) | |
tree | 61ee5ae779948e61af9a7691d19eaa2c09869121 /yql/essentials/udfs/common/string/string_udf.cpp | |
parent | 4adf7eecae16a9b228b28cc5f64c27ef69ad5ec2 (diff) |
YQL-20086 udfs
init
commit_hash:f9684778bf1ea956965f2360b80b91edb7d4ffbe
Diffstat (limited to 'yql/essentials/udfs/common/string/string_udf.cpp')
-rw-r--r-- | yql/essentials/udfs/common/string/string_udf.cpp | 1152 |
1 files changed, 557 insertions, 595 deletions
diff --git a/yql/essentials/udfs/common/string/string_udf.cpp b/yql/essentials/udfs/common/string/string_udf.cpp index b1dbb528cbb..6574bacbeea 100644 --- a/yql/essentials/udfs/common/string/string_udf.cpp +++ b/yql/essentials/udfs/common/string/string_udf.cpp @@ -83,7 +83,7 @@ TString ReverseBits(const TStringRef input) { END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) // 'unsafe' udf is actually strict - it returns null on any exception -#define STRING_UNSAFE_UDF(udfName, function) \ +#define STRING_UNSAFE_UDF(udfName, function) \ BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \ EMPTY_RESULT_ON_EMPTY_ARG(0); \ const TStringBuf input(args[0].AsStringRef()); \ @@ -96,8 +96,7 @@ TString ReverseBits(const TStringRef input) { } \ \ struct T##udfName##KernelExec \ - : public TUnaryKernelExec<T##udfName##KernelExec> \ - { \ + : public TUnaryKernelExec<T##udfName##KernelExec> { \ template <typename TSink> \ static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ if (!arg1) { \ @@ -119,7 +118,7 @@ TString ReverseBits(const TStringRef input) { // NOTE: The functions below are marked as deprecated, so block implementation // is not required for them SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), - builder.SetMaxLangVer(NYql::MakeLangVersion(2025, 1))) { + builder.SetMaxLangVer(NYql::MakeLangVersion(2025, 1))) { EMPTY_RESULT_ON_EMPTY_ARG(0) const TStringBuf input(args[0].AsStringRef()); try { @@ -144,7 +143,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } \ } -#define STROKA_ASCII_CASE_UDF(udfName, function) \ +#define STROKA_ASCII_CASE_UDF(udfName, function) \ BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, char*(TAutoMap<char*>)) { \ TString input(args[0].AsStringRef()); \ if (input.function()) { \ @@ -155,8 +154,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } \ \ struct T##udfName##KernelExec \ - : public TUnaryKernelExec<T##udfName##KernelExec> \ - { \ + : public TUnaryKernelExec<T##udfName##KernelExec> { \ template <typename TSink> \ static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ TString input(arg1.AsStringRef()); \ @@ -170,31 +168,29 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), \ END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) - -#define STROKA_FIND_UDF(udfName, function) \ - SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \ - Y_UNUSED(valueBuilder); \ - if (args[0]) { \ - const TStringBuf haystack(args[0].AsStringRef()); \ - const TStringBuf needle(args[1].AsStringRef()); \ - return TUnboxedValuePod(haystack.function(needle)); \ - } else { \ - return TUnboxedValuePod(false); \ - } \ +#define STROKA_FIND_UDF(udfName, function) \ + SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \ + Y_UNUSED(valueBuilder); \ + if (args[0]) { \ + const TStringBuf haystack(args[0].AsStringRef()); \ + const TStringBuf needle(args[1].AsStringRef()); \ + return TUnboxedValuePod(haystack.function(needle)); \ + } else { \ + return TUnboxedValuePod(false); \ + } \ } -#define STRING_TWO_ARGS_UDF_DEPRECATED_2025_02(udfName, function) \ - SIMPLE_STRICT_UDF_OPTIONS(T##udfName, bool(TOptional<char*>, char*), \ - builder.SetMaxLangVer(NYql::MakeLangVersion(2025, 1))) \ - { \ - Y_UNUSED(valueBuilder); \ - if (args[0]) { \ - const TStringBuf haystack(args[0].AsStringRef()); \ - const TStringBuf needle(args[1].AsStringRef()); \ - return TUnboxedValuePod(function(haystack, needle)); \ - } else { \ - return TUnboxedValuePod(false); \ - } \ +#define STRING_TWO_ARGS_UDF_DEPRECATED_2025_02(udfName, function) \ + SIMPLE_STRICT_UDF_OPTIONS(T##udfName, bool(TOptional<char*>, char*), \ + builder.SetMaxLangVer(NYql::MakeLangVersion(2025, 1))) { \ + Y_UNUSED(valueBuilder); \ + if (args[0]) { \ + const TStringBuf haystack(args[0].AsStringRef()); \ + const TStringBuf needle(args[1].AsStringRef()); \ + return TUnboxedValuePod(function(haystack, needle)); \ + } else { \ + return TUnboxedValuePod(false); \ + } \ } #define STRING_ASCII_CMP_IGNORE_CASE_UDF(udfName, function, minVersion) \ @@ -209,12 +205,10 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } \ \ struct T##udfName##KernelExec \ - : public TBinaryKernelExec<T##udfName##KernelExec> \ - { \ + : public TBinaryKernelExec<T##udfName##KernelExec> { \ template <typename TSink> \ static void Process(const IValueBuilder*, TBlockItem arg1, \ - TBlockItem arg2, const TSink& sink) \ - { \ + TBlockItem arg2, const TSink& sink) { \ if (arg1) { \ const TStringBuf haystack(arg1.AsStringRef()); \ const TStringBuf needle(arg2.AsStringRef()); \ @@ -226,9 +220,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), }; \ \ BEGIN_SIMPLE_STRICT_ARROW_UDF_OPTIONS(T##udfName, \ - bool(TOptional<char*>, char*), \ - builder.SetMinLangVer(minVersion)) \ - { \ + bool(TOptional<char*>, char*), \ + builder.SetMinLangVer(minVersion)) { \ Y_UNUSED(valueBuilder); \ return udfName##Impl(args); \ } \ @@ -236,8 +229,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) \ \ BEGIN_SIMPLE_STRICT_ARROW_UDF(T_yql_##udfName, \ - bool(TOptional<char*>, char*)) \ - { \ + bool(TOptional<char*>, char*)) { \ Y_UNUSED(valueBuilder); \ return udfName##Impl(args); \ } \ @@ -263,8 +255,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } \ \ struct T##function##KernelExec \ - : public TUnaryKernelExec<T##function##KernelExec> \ - { \ + : public TUnaryKernelExec<T##function##KernelExec> { \ template <typename TSink> \ static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ if (arg1) { \ @@ -285,58 +276,54 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), \ END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do) - - -#define STRING_STREAM_PAD_FORMATTER_UDF(function) \ - BEGIN_SIMPLE_ARROW_UDF_WITH_OPTIONAL_ARGS(T##function, \ - char*(TAutoMap<char*>, ui64, TOptional<char*>), 1) \ - { \ - TStringStream result; \ - const TStringBuf input(args[0].AsStringRef()); \ - char paddingSymbol = ' '; \ - if (args[2]) { \ - TStringBuf filler = args[2].AsStringRef(); \ - if (filler.Size() != 1) { \ - ythrow yexception() << "Not 1 symbol in paddingSymbol"; \ - } \ - paddingSymbol = filler[0]; \ - } \ - const ui64 padLen = args[1].Get<ui64>(); \ - if (padLen > padLim) { \ - ythrow yexception() << "Padding length (" << padLen << ") exceeds maximum: " << padLim; \ - } \ - result << function(input, padLen, paddingSymbol); \ - return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); \ - } \ - \ - struct T##function##KernelExec \ - : public TGenericKernelExec<T##function##KernelExec, 3> \ - { \ - template <typename TSink> \ - static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { \ - TStringStream result; \ - const TStringBuf input(args.GetElement(0).AsStringRef()); \ - char paddingSymbol = ' '; \ - if (args.GetElement(2)) { \ - TStringBuf filler = args.GetElement(2).AsStringRef(); \ - if (filler.Size() != 1) { \ - ythrow yexception() << "Not 1 symbol in paddingSymbol"; \ - } \ - paddingSymbol = filler[0]; \ - } \ - const ui64 padLen = args.GetElement(1).Get<ui64>(); \ - if (padLen > padLim) { \ - ythrow yexception() << "Padding length (" << padLen \ - << ") exceeds maximum: " << padLim; \ - } \ - result << function(input, padLen, paddingSymbol); \ - sink(TBlockItem(TStringRef(result.Data(), result.Size()))); \ - } \ - }; \ - \ +#define STRING_STREAM_PAD_FORMATTER_UDF(function) \ + BEGIN_SIMPLE_ARROW_UDF_WITH_OPTIONAL_ARGS(T##function, \ + char*(TAutoMap<char*>, ui64, TOptional<char*>), 1) { \ + TStringStream result; \ + const TStringBuf input(args[0].AsStringRef()); \ + char paddingSymbol = ' '; \ + if (args[2]) { \ + TStringBuf filler = args[2].AsStringRef(); \ + if (filler.Size() != 1) { \ + ythrow yexception() << "Not 1 symbol in paddingSymbol"; \ + } \ + paddingSymbol = filler[0]; \ + } \ + const ui64 padLen = args[1].Get<ui64>(); \ + if (padLen > padLim) { \ + ythrow yexception() << "Padding length (" << padLen << ") exceeds maximum: " << padLim; \ + } \ + result << function(input, padLen, paddingSymbol); \ + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); \ + } \ + \ + struct T##function##KernelExec \ + : public TGenericKernelExec<T##function##KernelExec, 3> { \ + template <typename TSink> \ + static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { \ + TStringStream result; \ + const TStringBuf input(args.GetElement(0).AsStringRef()); \ + char paddingSymbol = ' '; \ + if (args.GetElement(2)) { \ + TStringBuf filler = args.GetElement(2).AsStringRef(); \ + if (filler.Size() != 1) { \ + ythrow yexception() << "Not 1 symbol in paddingSymbol"; \ + } \ + paddingSymbol = filler[0]; \ + } \ + const ui64 padLen = args.GetElement(1).Get<ui64>(); \ + if (padLen > padLim) { \ + ythrow yexception() << "Padding length (" << padLen \ + << ") exceeds maximum: " << padLim; \ + } \ + result << function(input, padLen, paddingSymbol); \ + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); \ + } \ + }; \ + \ END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do) -#define STRING_STREAM_NUM_FORMATTER_UDF(function, argType) \ +#define STRING_STREAM_NUM_FORMATTER_UDF(function, argType) \ BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, char*(TAutoMap<argType>)) { \ TStringStream result; \ result << function(args[0].Get<argType>()); \ @@ -344,8 +331,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } \ \ struct T##function##KernelExec \ - : public TUnaryKernelExec<T##function##KernelExec> \ - { \ + : public TUnaryKernelExec<T##function##KernelExec> { \ template <typename TSink> \ static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ TStringStream result; \ @@ -356,7 +342,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), \ END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do) -#define STRING_STREAM_TEXT_FORMATTER_UDF(function) \ +#define STRING_STREAM_TEXT_FORMATTER_UDF(function) \ BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, char*(TAutoMap<char*>)) { \ TStringStream result; \ const TStringBuf input(args[0].AsStringRef()); \ @@ -365,8 +351,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } \ \ struct T##function##KernelExec \ - : public TUnaryKernelExec<T##function##KernelExec> \ - { \ + : public TUnaryKernelExec<T##function##KernelExec> { \ template <typename TSink> \ static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ TStringStream result; \ @@ -378,8 +363,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), \ END_SIMPLE_ARROW_UDF(T##function, T##function##KernelExec::Do) - -#define STRING_STREAM_HRSZ_FORMATTER_UDF(udfName, hrSize) \ +#define STRING_STREAM_HRSZ_FORMATTER_UDF(udfName, hrSize) \ BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, char*(TAutoMap<ui64>)) { \ TStringStream result; \ result << HumanReadableSize(args[0].Get<ui64>(), hrSize); \ @@ -387,8 +371,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } \ \ struct T##udfName##KernelExec \ - : public TUnaryKernelExec<T##udfName##KernelExec> \ - { \ + : public TUnaryKernelExec<T##udfName##KernelExec> { \ template <typename TSink> \ static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ TStringStream result; \ @@ -415,11 +398,11 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), XX(ReverseBytes, ReverseBytes, NYql::MakeLangVersion(2025, 2)) \ XX(ReverseBits, ReverseBits, NYql::MakeLangVersion(2025, 2)) -#define STRING_UNSAFE_UDF_MAP(XX) \ - XX(Base32Decode, Base32Decode) \ - XX(Base32StrictDecode, Base32StrictDecode) \ - XX(Base64Decode, Base64Decode) \ - XX(Base64StrictDecode, Base64StrictDecode) \ +#define STRING_UNSAFE_UDF_MAP(XX) \ + XX(Base32Decode, Base32Decode) \ + XX(Base32StrictDecode, Base32StrictDecode) \ + XX(Base64Decode, Base64Decode) \ + XX(Base64StrictDecode, Base64StrictDecode) \ XX(HexDecode, HexDecode) // NOTE: The functions below are marked as deprecated, so block implementation @@ -492,196 +475,212 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), XX(HumanReadableQuantity, SF_QUANTITY) \ XX(HumanReadableBytes, SF_BYTES) +BEGIN_SIMPLE_STRICT_ARROW_UDF(TCollapseText, char*(TAutoMap<char*>, ui64)) { + TString input(args[0].AsStringRef()); + ui64 maxLength = args[1].Get<ui64>(); + CollapseText(input, maxLength); + return valueBuilder->NewString(input); +} - BEGIN_SIMPLE_STRICT_ARROW_UDF(TCollapseText, char*(TAutoMap<char*>, ui64)) { - TString input(args[0].AsStringRef()); - ui64 maxLength = args[1].Get<ui64>(); +struct TCollapseTextKernelExec + : public TBinaryKernelExec<TCollapseTextKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + TString input(arg1.AsStringRef()); + ui64 maxLength = arg2.Get<ui64>(); CollapseText(input, maxLength); - return valueBuilder->NewString(input); + return sink(TBlockItem(input)); } +}; - struct TCollapseTextKernelExec - : public TBinaryKernelExec<TCollapseTextKernelExec> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { - TString input(arg1.AsStringRef()); - ui64 maxLength = arg2.Get<ui64>(); - CollapseText(input, maxLength); - return sink(TBlockItem(input)); - } - }; - - END_SIMPLE_ARROW_UDF(TCollapseText, TCollapseTextKernelExec::Do); +END_SIMPLE_ARROW_UDF(TCollapseText, TCollapseTextKernelExec::Do); - - BEGIN_SIMPLE_STRICT_ARROW_UDF(TContains, bool(TOptional<char*>, char*)) { - Y_UNUSED(valueBuilder); - if (!args[0]) - return TUnboxedValuePod(false); - - const TStringBuf haystack(args[0].AsStringRef()); - const TStringBuf needle(args[1].AsStringRef()); - return TUnboxedValuePod(haystack.Contains(needle)); +BEGIN_SIMPLE_STRICT_ARROW_UDF(TContains, bool(TOptional<char*>, char*)) { + Y_UNUSED(valueBuilder); + if (!args[0]) { + return TUnboxedValuePod(false); } - struct TContainsKernelExec : public TBinaryKernelExec<TContainsKernelExec> { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { - if (!arg1) - return sink(TBlockItem(false)); + const TStringBuf haystack(args[0].AsStringRef()); + const TStringBuf needle(args[1].AsStringRef()); + return TUnboxedValuePod(haystack.Contains(needle)); +} - const TStringBuf haystack(arg1.AsStringRef()); - const TStringBuf needle(arg2.AsStringRef()); - sink(TBlockItem(haystack.Contains(needle))); +struct TContainsKernelExec: public TBinaryKernelExec<TContainsKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + if (!arg1) { + return sink(TBlockItem(false)); } - }; - - END_SIMPLE_ARROW_UDF(TContains, TContainsKernelExec::Do); - static bool IgnoreCaseComparator(char a, char b) { - return AsciiToUpper(a) == AsciiToUpper(b); + const TStringBuf haystack(arg1.AsStringRef()); + const TStringBuf needle(arg2.AsStringRef()); + sink(TBlockItem(haystack.Contains(needle))); } +}; - struct TAsciiContainsIgnoreCaseKernelExec - : public TBinaryKernelExec<TAsciiContainsIgnoreCaseKernelExec> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { - if (!arg1) { - return sink(TBlockItem(arg2 ? false : true)); - } +END_SIMPLE_ARROW_UDF(TContains, TContainsKernelExec::Do); - const TStringBuf haystack(arg1.AsStringRef()); - const TStringBuf needle(arg2.AsStringRef()); - if (haystack.empty()) { - return sink(TBlockItem((needle.empty()))); - } - const auto found = std::search(haystack.cbegin(), haystack.cend(), - needle.cbegin(), needle.cend(), IgnoreCaseComparator); - sink(TBlockItem(found != haystack.cend())); - } - }; +static bool IgnoreCaseComparator(char a, char b) { + return AsciiToUpper(a) == AsciiToUpper(b); +} - TUnboxedValuePod AsciiContainsIgnoreCaseImpl(const TUnboxedValuePod* args) { - if (!args[0]) { - return TUnboxedValuePod(false); +struct TAsciiContainsIgnoreCaseKernelExec + : public TBinaryKernelExec<TAsciiContainsIgnoreCaseKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + if (!arg1) { + return sink(TBlockItem(arg2 ? false : true)); } - const TStringBuf haystack(args[0].AsStringRef()); - const TStringBuf needle(args[1].AsStringRef()); + const TStringBuf haystack(arg1.AsStringRef()); + const TStringBuf needle(arg2.AsStringRef()); if (haystack.empty()) { - return TUnboxedValuePod(needle.empty()); + return sink(TBlockItem((needle.empty()))); } const auto found = std::search(haystack.cbegin(), haystack.cend(), needle.cbegin(), needle.cend(), IgnoreCaseComparator); - return TUnboxedValuePod(found != haystack.cend()); + sink(TBlockItem(found != haystack.cend())); } +}; - BEGIN_SIMPLE_STRICT_ARROW_UDF_OPTIONS(TAsciiContainsIgnoreCase, bool(TOptional<char*>, char*), - builder.SetMinLangVer(NYql::MakeLangVersion(2025, 2))) - { - Y_UNUSED(valueBuilder); - return AsciiContainsIgnoreCaseImpl(args); +TUnboxedValuePod AsciiContainsIgnoreCaseImpl(const TUnboxedValuePod* args) { + if (!args[0]) { + return TUnboxedValuePod(false); } - END_SIMPLE_ARROW_UDF(TAsciiContainsIgnoreCase, TAsciiContainsIgnoreCaseKernelExec::Do); - - BEGIN_SIMPLE_STRICT_ARROW_UDF(T_yql_AsciiContainsIgnoreCase, bool(TOptional<char*>, char*)) - { - Y_UNUSED(valueBuilder); - return AsciiContainsIgnoreCaseImpl(args); + const TStringBuf haystack(args[0].AsStringRef()); + const TStringBuf needle(args[1].AsStringRef()); + if (haystack.empty()) { + return TUnboxedValuePod(needle.empty()); } + const auto found = std::search(haystack.cbegin(), haystack.cend(), + needle.cbegin(), needle.cend(), IgnoreCaseComparator); + return TUnboxedValuePod(found != haystack.cend()); +} - END_SIMPLE_ARROW_UDF(T_yql_AsciiContainsIgnoreCase, TAsciiContainsIgnoreCaseKernelExec::Do); - - BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) { - if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef())) - return valueBuilder->NewString(result); - else - return args[0]; - } +BEGIN_SIMPLE_STRICT_ARROW_UDF_OPTIONS(TAsciiContainsIgnoreCase, bool(TOptional<char*>, char*), + builder.SetMinLangVer(NYql::MakeLangVersion(2025, 2))) +{ + Y_UNUSED(valueBuilder); + return AsciiContainsIgnoreCaseImpl(args); +} - struct TReplaceAllKernelExec - : public TGenericKernelExec<TReplaceAllKernelExec, 3> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { - TString result(args.GetElement(0).AsStringRef()); - const TStringBuf what(args.GetElement(1).AsStringRef()); - const TStringBuf with(args.GetElement(2).AsStringRef()); - if (SubstGlobal(result, what, with)) { - return sink(TBlockItem(result)); - } else { - return sink(args.GetElement(0)); - } - } - }; +END_SIMPLE_ARROW_UDF(TAsciiContainsIgnoreCase, TAsciiContainsIgnoreCaseKernelExec::Do); - END_SIMPLE_ARROW_UDF(TReplaceAll, TReplaceAllKernelExec::Do) +BEGIN_SIMPLE_STRICT_ARROW_UDF(T_yql_AsciiContainsIgnoreCase, bool(TOptional<char*>, char*)) +{ + Y_UNUSED(valueBuilder); + return AsciiContainsIgnoreCaseImpl(args); +} +END_SIMPLE_ARROW_UDF(T_yql_AsciiContainsIgnoreCase, TAsciiContainsIgnoreCaseKernelExec::Do); - BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceFirst, char*(TAutoMap<char*>, char*, char*)) { - std::string result(args[0].AsStringRef()); - const std::string_view what(args[1].AsStringRef()); - if (const auto index = result.find(what); index != std::string::npos) { - result.replace(index, what.size(), std::string_view(args[2].AsStringRef())); - return valueBuilder->NewString(result); - } +BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) { + if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef())) { + return valueBuilder->NewString(result); + } else { return args[0]; } +} - struct TReplaceFirstKernelExec - : public TGenericKernelExec<TReplaceFirstKernelExec, 3> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { - std::string result(args.GetElement(0).AsStringRef()); - const std::string_view what(args.GetElement(1).AsStringRef()); - const std::string_view with(args.GetElement(2).AsStringRef()); - if (const auto index = result.find(what); index != std::string::npos) { - result.replace(index, what.size(), with); - return sink(TBlockItem(result)); - } +struct TReplaceAllKernelExec + : public TGenericKernelExec<TReplaceAllKernelExec, 3> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { + TString result(args.GetElement(0).AsStringRef()); + const TStringBuf what(args.GetElement(1).AsStringRef()); + const TStringBuf with(args.GetElement(2).AsStringRef()); + if (SubstGlobal(result, what, with)) { + return sink(TBlockItem(result)); + } else { return sink(args.GetElement(0)); } - }; + } +}; - END_SIMPLE_ARROW_UDF(TReplaceFirst, TReplaceFirstKernelExec::Do) +END_SIMPLE_ARROW_UDF(TReplaceAll, TReplaceAllKernelExec::Do) +BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceFirst, char*(TAutoMap<char*>, char*, char*)) { + std::string result(args[0].AsStringRef()); + const std::string_view what(args[1].AsStringRef()); + if (const auto index = result.find(what); index != std::string::npos) { + result.replace(index, what.size(), std::string_view(args[2].AsStringRef())); + return valueBuilder->NewString(result); + } + return args[0]; +} - BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceLast, char*(TAutoMap<char*>, char*, char*)) { - std::string result(args[0].AsStringRef()); - const std::string_view what(args[1].AsStringRef()); - if (const auto index = result.rfind(what); index != std::string::npos) { - result.replace(index, what.size(), std::string_view(args[2].AsStringRef())); - return valueBuilder->NewString(result); +struct TReplaceFirstKernelExec + : public TGenericKernelExec<TReplaceFirstKernelExec, 3> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { + std::string result(args.GetElement(0).AsStringRef()); + const std::string_view what(args.GetElement(1).AsStringRef()); + const std::string_view with(args.GetElement(2).AsStringRef()); + if (const auto index = result.find(what); index != std::string::npos) { + result.replace(index, what.size(), with); + return sink(TBlockItem(result)); } - return args[0]; + return sink(args.GetElement(0)); } +}; - struct TReplaceLastKernelExec - : public TGenericKernelExec<TReplaceLastKernelExec, 3> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { - std::string result(args.GetElement(0).AsStringRef()); - const std::string_view what(args.GetElement(1).AsStringRef()); - const std::string_view with(args.GetElement(2).AsStringRef()); - if (const auto index = result.rfind(what); index != std::string::npos) { - result.replace(index, what.size(), with); - return sink(TBlockItem(result)); - } - return sink(args.GetElement(0)); +END_SIMPLE_ARROW_UDF(TReplaceFirst, TReplaceFirstKernelExec::Do) + +BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceLast, char*(TAutoMap<char*>, char*, char*)) { + std::string result(args[0].AsStringRef()); + const std::string_view what(args[1].AsStringRef()); + if (const auto index = result.rfind(what); index != std::string::npos) { + result.replace(index, what.size(), std::string_view(args[2].AsStringRef())); + return valueBuilder->NewString(result); + } + return args[0]; +} + +struct TReplaceLastKernelExec + : public TGenericKernelExec<TReplaceLastKernelExec, 3> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem args, const TSink& sink) { + std::string result(args.GetElement(0).AsStringRef()); + const std::string_view what(args.GetElement(1).AsStringRef()); + const std::string_view with(args.GetElement(2).AsStringRef()); + if (const auto index = result.rfind(what); index != std::string::npos) { + result.replace(index, what.size(), with); + return sink(TBlockItem(result)); } - }; + return sink(args.GetElement(0)); + } +}; - END_SIMPLE_ARROW_UDF(TReplaceLast, TReplaceLastKernelExec::Do) +END_SIMPLE_ARROW_UDF(TReplaceLast, TReplaceLastKernelExec::Do) +BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveAll, char*(TAutoMap<char*>, char*)) { + std::string input(args[0].AsStringRef()); + const std::string_view remove(args[1].AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + size_t tpos = 0; + for (const ui8 c : input) { + if (!chars[c]) { + input[tpos++] = c; + } + } + if (tpos != input.size()) { + input.resize(tpos); + return valueBuilder->NewString(input); + } + return args[0]; +} - BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveAll, char*(TAutoMap<char*>, char*)) { - std::string input(args[0].AsStringRef()); - const std::string_view remove(args[1].AsStringRef()); +struct TRemoveAllKernelExec + : public TBinaryKernelExec<TRemoveAllKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + std::string input(arg1.AsStringRef()); + const std::string_view remove(arg2.AsStringRef()); std::array<bool, 256> chars{}; for (const ui8 c : remove) { chars[c] = true; @@ -694,42 +693,36 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } if (tpos != input.size()) { input.resize(tpos); - return valueBuilder->NewString(input); + return sink(TBlockItem(input)); } - return args[0]; + sink(arg1); } +}; - struct TRemoveAllKernelExec - : public TBinaryKernelExec<TRemoveAllKernelExec> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { - std::string input(arg1.AsStringRef()); - const std::string_view remove(arg2.AsStringRef()); - std::array<bool, 256> chars{}; - for (const ui8 c : remove) { - chars[c] = true; - } - size_t tpos = 0; - for (const ui8 c : input) { - if (!chars[c]) { - input[tpos++] = c; - } - } - if (tpos != input.size()) { - input.resize(tpos); - return sink(TBlockItem(input)); - } - sink(arg1); - } - }; - - END_SIMPLE_ARROW_UDF(TRemoveAll, TRemoveAllKernelExec::Do) +END_SIMPLE_ARROW_UDF(TRemoveAll, TRemoveAllKernelExec::Do) +BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveFirst, char*(TAutoMap<char*>, char*)) { + std::string input(args[0].AsStringRef()); + const std::string_view remove(args[1].AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + for (auto it = input.cbegin(); it != input.cend(); ++it) { + if (chars[static_cast<ui8>(*it)]) { + input.erase(it); + return valueBuilder->NewString(input); + } + } + return args[0]; +} - BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveFirst, char*(TAutoMap<char*>, char*)) { - std::string input(args[0].AsStringRef()); - const std::string_view remove(args[1].AsStringRef()); +struct TRemoveFirstKernelExec + : public TBinaryKernelExec<TRemoveFirstKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + std::string input(arg1.AsStringRef()); + const std::string_view remove(arg2.AsStringRef()); std::array<bool, 256> chars{}; for (const ui8 c : remove) { chars[c] = true; @@ -737,39 +730,37 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), for (auto it = input.cbegin(); it != input.cend(); ++it) { if (chars[static_cast<ui8>(*it)]) { input.erase(it); - return valueBuilder->NewString(input); + return sink(TBlockItem(input)); } } - return args[0]; + sink(arg1); } +}; - struct TRemoveFirstKernelExec - : public TBinaryKernelExec<TRemoveFirstKernelExec> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { - std::string input(arg1.AsStringRef()); - const std::string_view remove(arg2.AsStringRef()); - std::array<bool, 256> chars{}; - for (const ui8 c : remove) { - chars[c] = true; - } - for (auto it = input.cbegin(); it != input.cend(); ++it) { - if (chars[static_cast<ui8>(*it)]) { - input.erase(it); - return sink(TBlockItem(input)); - } - } - sink(arg1); - } - }; - - END_SIMPLE_ARROW_UDF(TRemoveFirst, TRemoveFirstKernelExec::Do) +END_SIMPLE_ARROW_UDF(TRemoveFirst, TRemoveFirstKernelExec::Do) +BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveLast, char*(TAutoMap<char*>, char*)) { + std::string input(args[0].AsStringRef()); + const std::string_view remove(args[1].AsStringRef()); + std::array<bool, 256> chars{}; + for (const ui8 c : remove) { + chars[c] = true; + } + for (auto it = input.crbegin(); it != input.crend(); ++it) { + if (chars[static_cast<ui8>(*it)]) { + input.erase(input.crend() - it - 1, 1); + return valueBuilder->NewString(input); + } + } + return args[0]; +} - BEGIN_SIMPLE_STRICT_ARROW_UDF(TRemoveLast, char*(TAutoMap<char*>, char*)) { - std::string input(args[0].AsStringRef()); - const std::string_view remove(args[1].AsStringRef()); +struct TRemoveLastKernelExec + : public TBinaryKernelExec<TRemoveLastKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + std::string input(arg1.AsStringRef()); + const std::string_view remove(arg2.AsStringRef()); std::array<bool, 256> chars{}; for (const ui8 c : remove) { chars[c] = true; @@ -777,347 +768,318 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), for (auto it = input.crbegin(); it != input.crend(); ++it) { if (chars[static_cast<ui8>(*it)]) { input.erase(input.crend() - it - 1, 1); - return valueBuilder->NewString(input); + return sink(TBlockItem(input)); } } - return args[0]; + sink(arg1); } +}; + +END_SIMPLE_ARROW_UDF(TRemoveLast, TRemoveLastKernelExec::Do) + +// NOTE: String::Find is marked as deprecated, so block implementation is +// not required for them. Hence, only the scalar one is provided. +SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) { + Y_UNUSED(valueBuilder); + const TStringBuf haystack(args[0].AsStringRef()); + const TStringBuf needle(args[1].AsStringRef()); + const ui64 pos = args[2].GetOrDefault<ui64>(0); + return TUnboxedValuePod(haystack.find(needle, pos)); +} - struct TRemoveLastKernelExec - : public TBinaryKernelExec<TRemoveLastKernelExec> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { - std::string input(arg1.AsStringRef()); - const std::string_view remove(arg2.AsStringRef()); - std::array<bool, 256> chars{}; - for (const ui8 c : remove) { - chars[c] = true; - } - for (auto it = input.crbegin(); it != input.crend(); ++it) { - if (chars[static_cast<ui8>(*it)]) { - input.erase(input.crend() - it - 1, 1); - return sink(TBlockItem(input)); - } - } - sink(arg1); - } - }; - - END_SIMPLE_ARROW_UDF(TRemoveLast, TRemoveLastKernelExec::Do) - +// NOTE: String::ReverseFind is marked as deprecated, so block +// implementation is not required for them. Hence, only the scalar one is +// provided. +SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TReverseFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) { + Y_UNUSED(valueBuilder); + const TStringBuf haystack(args[0].AsStringRef()); + const TStringBuf needle(args[1].AsStringRef()); + const ui64 pos = args[2].GetOrDefault<ui64>(TStringBuf::npos); + return TUnboxedValuePod(haystack.rfind(needle, pos)); +} - // NOTE: String::Find is marked as deprecated, so block implementation is - // not required for them. Hence, only the scalar one is provided. - SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) { - Y_UNUSED(valueBuilder); - const TStringBuf haystack(args[0].AsStringRef()); - const TStringBuf needle(args[1].AsStringRef()); - const ui64 pos = args[2].GetOrDefault<ui64>(0); - return TUnboxedValuePod(haystack.find(needle, pos)); - } +// NOTE: String::Substring is marked as deprecated, so block implementation +// is not required for them. Hence, only the scalar one is provided. +SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSubstring, char*(TAutoMap<char*>, TOptional<ui64>, TOptional<ui64>), 1) { + const TStringBuf input(args[0].AsStringRef()); + const ui64 from = args[1].GetOrDefault<ui64>(0); + const ui64 count = args[2].GetOrDefault<ui64>(TStringBuf::npos); + return valueBuilder->NewString(input.substr(from, count)); +} - // NOTE: String::ReverseFind is marked as deprecated, so block - // implementation is not required for them. Hence, only the scalar one is - // provided. - SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TReverseFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) { - Y_UNUSED(valueBuilder); - const TStringBuf haystack(args[0].AsStringRef()); - const TStringBuf needle(args[1].AsStringRef()); - const ui64 pos = args[2].GetOrDefault<ui64>(TStringBuf::npos); - return TUnboxedValuePod(haystack.rfind(needle, pos)); +using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>; + +template <typename TIt> +static void SplitToListImpl( + const IValueBuilder* valueBuilder, + const TUnboxedValue& input, + const std::string_view::const_iterator from, + const TIt& it, + TTmpVector& result) { + for (const auto& elem : it) { + result.emplace_back(valueBuilder->SubString(input, std::distance(from, elem.TokenStart()), std::distance(elem.TokenStart(), elem.TokenDelim()))); } - - // NOTE: String::Substring is marked as deprecated, so block implementation - // is not required for them. Hence, only the scalar one is provided. - SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSubstring, char*(TAutoMap<char*>, TOptional<ui64>, TOptional<ui64>), 1) { - const TStringBuf input(args[0].AsStringRef()); - const ui64 from = args[1].GetOrDefault<ui64>(0); - const ui64 count = args[2].GetOrDefault<ui64>(TStringBuf::npos); - return valueBuilder->NewString(input.substr(from, count)); +} +template <typename TIt> +static void SplitToListImpl( + const IValueBuilder* valueBuilder, + const TUnboxedValue& input, + const std::string_view::const_iterator from, + TIt& it, + bool skipEmpty, + TTmpVector& result) { + if (skipEmpty) { + SplitToListImpl(valueBuilder, input, from, it.SkipEmpty(), result); + } else { + SplitToListImpl(valueBuilder, input, from, it, result); } +} - using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>; - - template <typename TIt> - static void SplitToListImpl( - const IValueBuilder* valueBuilder, - const TUnboxedValue& input, - const std::string_view::const_iterator from, - const TIt& it, - TTmpVector& result) { - for (const auto& elem : it) { - result.emplace_back(valueBuilder->SubString(input, std::distance(from, elem.TokenStart()), std::distance(elem.TokenStart(), elem.TokenDelim()))); - } - } - template <typename TIt> - static void SplitToListImpl( - const IValueBuilder* valueBuilder, - const TUnboxedValue& input, - const std::string_view::const_iterator from, - TIt& it, - bool skipEmpty, - TTmpVector& result) { - if (skipEmpty) { - SplitToListImpl(valueBuilder, input, from, it.SkipEmpty(), result); +constexpr char delimeterStringName[] = "DelimeterString"; +constexpr char skipEmptyName[] = "SkipEmpty"; +constexpr char limitName[] = "Limit"; +using TDelimeterStringArg = TNamedArg<bool, delimeterStringName>; +using TSkipEmptyArg = TNamedArg<bool, skipEmptyName>; +using TLimitArg = TNamedArg<ui64, limitName>; + +SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSplitToList, TListType<char*>(TOptional<char*>, + char*, + TDelimeterStringArg, + TSkipEmptyArg, + TLimitArg), + 3) { + TTmpVector result; + if (args[0]) { + const std::string_view input(args[0].AsStringRef()); + const std::string_view delimeter(args[1].AsStringRef()); + const bool delimiterString = args[2].GetOrDefault<bool>(true); + const bool skipEmpty = args[3].GetOrDefault<bool>(false); + const auto limit = args[4].GetOrDefault<ui64>(0); + if (delimiterString) { + if (limit) { + auto it = StringSplitter(input).SplitByString(delimeter).Limit(limit + 1); + SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); + } else { + auto it = StringSplitter(input).SplitByString(delimeter); + SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); + } } else { - SplitToListImpl(valueBuilder, input, from, it, result); - } - } - - constexpr char delimeterStringName[] = "DelimeterString"; - constexpr char skipEmptyName[] = "SkipEmpty"; - constexpr char limitName[] = "Limit"; - using TDelimeterStringArg = TNamedArg<bool, delimeterStringName>; - using TSkipEmptyArg = TNamedArg<bool, skipEmptyName>; - using TLimitArg = TNamedArg<ui64, limitName>; - - - SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSplitToList, TListType<char*>( - TOptional<char*>, - char*, - TDelimeterStringArg, - TSkipEmptyArg, - TLimitArg - ), - 3) { - TTmpVector result; - if (args[0]) { - const std::string_view input(args[0].AsStringRef()); - const std::string_view delimeter(args[1].AsStringRef()); - const bool delimiterString = args[2].GetOrDefault<bool>(true); - const bool skipEmpty = args[3].GetOrDefault<bool>(false); - const auto limit = args[4].GetOrDefault<ui64>(0); - if (delimiterString) { - if (limit) { - auto it = StringSplitter(input).SplitByString(delimeter).Limit(limit + 1); - SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); - } else { - auto it = StringSplitter(input).SplitByString(delimeter); - SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); - } + if (limit) { + auto it = StringSplitter(input).SplitBySet(TString(delimeter).c_str()).Limit(limit + 1); + SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); } else { - if (limit) { - auto it = StringSplitter(input).SplitBySet(TString(delimeter).c_str()).Limit(limit + 1); - SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); - } else { - auto it = StringSplitter(input).SplitBySet(TString(delimeter).c_str()); - SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); - } + auto it = StringSplitter(input).SplitBySet(TString(delimeter).c_str()); + SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); } } - return valueBuilder->NewList(result.data(), result.size()); } + return valueBuilder->NewList(result.data(), result.size()); +} - SIMPLE_STRICT_UDF(TJoinFromList, char*(TAutoMap<TListType<TOptional<char*>>>, char*)) { - const TStringBuf delimeter(args[1].AsStringRef()); +SIMPLE_STRICT_UDF(TJoinFromList, char*(TAutoMap<TListType<TOptional<char*>>>, char*)) { + const TStringBuf delimeter(args[1].AsStringRef()); - // Construct the string in-place if the list is eager. - if (auto elems = args[0].GetElements()) { - ui64 elemCount = args[0].GetListLength(); - ui64 valueCount = 0; - ui64 resultLength = 0; + // Construct the string in-place if the list is eager. + if (auto elems = args[0].GetElements()) { + ui64 elemCount = args[0].GetListLength(); + ui64 valueCount = 0; + ui64 resultLength = 0; - for (ui64 i = 0; i != elemCount; ++i) { - if (elems[i]) { - resultLength += elems[i].AsStringRef().Size(); - ++valueCount; - } - } - if (valueCount > 0) { - resultLength += (valueCount - 1) * delimeter.size(); + for (ui64 i = 0; i != elemCount; ++i) { + if (elems[i]) { + resultLength += elems[i].AsStringRef().Size(); + ++valueCount; } + } + if (valueCount > 0) { + resultLength += (valueCount - 1) * delimeter.size(); + } - TUnboxedValue result = valueBuilder->NewStringNotFilled(resultLength); - if (!resultLength) { - return result; - } + TUnboxedValue result = valueBuilder->NewStringNotFilled(resultLength); + if (!resultLength) { + return result; + } - const auto buffer = result.AsStringRef(); - auto it = buffer.Data(); - const auto bufferEnd = buffer.Data() + buffer.Size(); - for (ui64 i = 0; i != elemCount; ++i) { - if (elems[i]) { - TStringBuf curStr = elems[i].AsStringRef(); - memcpy(it, curStr.data(), curStr.size()); - it += curStr.size(); - - // Last element just has been written. - if (it == bufferEnd) { - break; - } - memcpy(it, delimeter.data(), delimeter.size()); - it += delimeter.size(); + const auto buffer = result.AsStringRef(); + auto it = buffer.Data(); + const auto bufferEnd = buffer.Data() + buffer.Size(); + for (ui64 i = 0; i != elemCount; ++i) { + if (elems[i]) { + TStringBuf curStr = elems[i].AsStringRef(); + memcpy(it, curStr.data(), curStr.size()); + it += curStr.size(); + + // Last element just has been written. + if (it == bufferEnd) { + break; } + memcpy(it, delimeter.data(), delimeter.size()); + it += delimeter.size(); } - return result; } + return result; + } - auto input = args[0].GetListIterator(); + auto input = args[0].GetListIterator(); - // Since UnboxedValue can embed small strings, iterating over the list may invalidate StringRefs, thus a copy is required. - TVector<TString, TStdAllocatorForUdf<TString>> items; - if (args[0].HasFastListLength()) { - items.reserve(args[0].GetListLength()); - } + // Since UnboxedValue can embed small strings, iterating over the list may invalidate StringRefs, thus a copy is required. + TVector<TString, TStdAllocatorForUdf<TString>> items; + if (args[0].HasFastListLength()) { + items.reserve(args[0].GetListLength()); + } - for (TUnboxedValue current; input.Next(current);) { - if (current) { - items.emplace_back(current.AsStringRef()); - } + for (TUnboxedValue current; input.Next(current);) { + if (current) { + items.emplace_back(current.AsStringRef()); } - - return valueBuilder->NewString(JoinSeq(delimeter, items)); } - BEGIN_SIMPLE_STRICT_ARROW_UDF(TLevensteinDistance, ui64(TAutoMap<char*>, TAutoMap<char*>)) { - Y_UNUSED(valueBuilder); - const TStringBuf left(args[0].AsStringRef()); - const TStringBuf right(args[1].AsStringRef()); - const ui64 result = NLevenshtein::Distance(left, right); - return TUnboxedValuePod(result); - } + return valueBuilder->NewString(JoinSeq(delimeter, items)); +} - struct TLevensteinDistanceKernelExec : public TBinaryKernelExec<TLevensteinDistanceKernelExec> { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { - const std::string_view left(arg1.AsStringRef()); - const std::string_view right(arg2.AsStringRef()); - const ui64 result = NLevenshtein::Distance(left, right); - sink(TBlockItem(result)); - } - }; +BEGIN_SIMPLE_STRICT_ARROW_UDF(TLevensteinDistance, ui64(TAutoMap<char*>, TAutoMap<char*>)) { + Y_UNUSED(valueBuilder); + const TStringBuf left(args[0].AsStringRef()); + const TStringBuf right(args[1].AsStringRef()); + const ui64 result = NLevenshtein::Distance(left, right); + return TUnboxedValuePod(result); +} - END_SIMPLE_ARROW_UDF(TLevensteinDistance, TLevensteinDistanceKernelExec::Do); +struct TLevensteinDistanceKernelExec: public TBinaryKernelExec<TLevensteinDistanceKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { + const std::string_view left(arg1.AsStringRef()); + const std::string_view right(arg2.AsStringRef()); + const ui64 result = NLevenshtein::Distance(left, right); + sink(TBlockItem(result)); + } +}; +END_SIMPLE_ARROW_UDF(TLevensteinDistance, TLevensteinDistanceKernelExec::Do); +BEGIN_SIMPLE_STRICT_ARROW_UDF(THumanReadableDuration, char*(TAutoMap<ui64>)) { + TStringStream result; + result << HumanReadable(TDuration::MicroSeconds(args[0].Get<ui64>())); + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); +} - BEGIN_SIMPLE_STRICT_ARROW_UDF(THumanReadableDuration, char*(TAutoMap<ui64>)) { +struct THumanReadableDurationKernelExec + : public TUnaryKernelExec<THumanReadableDurationKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { TStringStream result; - result << HumanReadable(TDuration::MicroSeconds(args[0].Get<ui64>())); - return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); + result << HumanReadable(TDuration::MicroSeconds(arg1.Get<ui64>())); + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); } +}; - struct THumanReadableDurationKernelExec - : public TUnaryKernelExec<THumanReadableDurationKernelExec> - { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { - TStringStream result; - result << HumanReadable(TDuration::MicroSeconds(arg1.Get<ui64>())); - sink(TBlockItem(TStringRef(result.Data(), result.Size()))); - } - }; - - END_SIMPLE_ARROW_UDF(THumanReadableDuration, THumanReadableDurationKernelExec::Do) +END_SIMPLE_ARROW_UDF(THumanReadableDuration, THumanReadableDurationKernelExec::Do) +BEGIN_SIMPLE_STRICT_ARROW_UDF(TPrec, char*(TAutoMap<double>, ui64)) { + TStringStream result; + result << Prec(args[0].Get<double>(), args[1].Get<ui64>()); + return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); +} - BEGIN_SIMPLE_STRICT_ARROW_UDF(TPrec, char*(TAutoMap<double>, ui64)) { +struct TPrecKernelExec: public TBinaryKernelExec<TPrecKernelExec> { + template <typename TSink> + static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { TStringStream result; - result << Prec(args[0].Get<double>(), args[1].Get<ui64>()); - return valueBuilder->NewString(TStringRef(result.Data(), result.Size())); + result << Prec(arg1.Get<double>(), arg2.Get<ui64>()); + sink(TBlockItem(TStringRef(result.Data(), result.Size()))); } +}; - struct TPrecKernelExec : public TBinaryKernelExec<TPrecKernelExec> { - template <typename TSink> - static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { - TStringStream result; - result << Prec(arg1.Get<double>(), arg2.Get<ui64>()); - sink(TBlockItem(TStringRef(result.Data(), result.Size()))); - } - }; +END_SIMPLE_ARROW_UDF(TPrec, TPrecKernelExec::Do) - END_SIMPLE_ARROW_UDF(TPrec, TPrecKernelExec::Do) +SIMPLE_STRICT_UDF(TToByteList, TListType<ui8>(char*)) { + const TStringBuf input(args[0].AsStringRef()); + TUnboxedValue* items = nullptr; + TUnboxedValue result = valueBuilder->NewArray(input.size(), items); + for (const unsigned char c : input) { + *items++ = TUnboxedValuePod(c); + } + return result; +} +SIMPLE_STRICT_UDF(TFromByteList, char*(TListType<ui8>)) { + auto input = args[0]; - SIMPLE_STRICT_UDF(TToByteList, TListType<ui8>(char*)) { - const TStringBuf input(args[0].AsStringRef()); - TUnboxedValue* items = nullptr; - TUnboxedValue result = valueBuilder->NewArray(input.size(), items); - for (const unsigned char c : input) { - *items++ = TUnboxedValuePod(c); + if (auto elems = input.GetElements()) { + const auto elemCount = input.GetListLength(); + TUnboxedValue result = valueBuilder->NewStringNotFilled(input.GetListLength()); + auto bufferPtr = result.AsStringRef().Data(); + for (ui64 i = 0; i != elemCount; ++i) { + *(bufferPtr++) = elems[i].Get<ui8>(); } return result; } - SIMPLE_STRICT_UDF(TFromByteList, char*(TListType<ui8>)) { - auto input = args[0]; + std::vector<char, NKikimr::NUdf::TStdAllocatorForUdf<char>> buffer; + buffer.reserve(TUnboxedValuePod::InternalBufferSize); - if (auto elems = input.GetElements()) { - const auto elemCount = input.GetListLength(); - TUnboxedValue result = valueBuilder->NewStringNotFilled(input.GetListLength()); - auto bufferPtr = result.AsStringRef().Data(); - for (ui64 i = 0; i != elemCount; ++i) { - *(bufferPtr++) = elems[i].Get<ui8>(); - } - return result; - } - - std::vector<char, NKikimr::NUdf::TStdAllocatorForUdf<char>> buffer; - buffer.reserve(TUnboxedValuePod::InternalBufferSize); - - const auto& iter = input.GetListIterator(); - for (NUdf::TUnboxedValue item; iter.Next(item); ) { - buffer.push_back(item.Get<ui8>()); - } - - return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size())); + const auto& iter = input.GetListIterator(); + for (NUdf::TUnboxedValue item; iter.Next(item);) { + buffer.push_back(item.Get<ui8>()); } + return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size())); +} + #define STRING_REGISTER_UDF(udfName, ...) T##udfName, #define STRING_OPT_REGISTER_UDF(udfName, ...) T_yql_##udfName, - STRING_UDF_MAP(STRING_UDF) - STRING_UNSAFE_UDF_MAP(STRING_UNSAFE_UDF) - STROKA_CASE_UDF_MAP(STROKA_CASE_UDF) - STROKA_ASCII_CASE_UDF_MAP(STROKA_ASCII_CASE_UDF) - STROKA_FIND_UDF_MAP(STROKA_FIND_UDF) - STRING_TWO_ARGS_UDF_MAP_DEPRECATED_2025_02(STRING_TWO_ARGS_UDF_DEPRECATED_2025_02) - STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_ASCII_CMP_IGNORE_CASE_UDF) - IS_ASCII_UDF_MAP(IS_ASCII_UDF) - - static constexpr ui64 padLim = 1000000; - STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_STREAM_PAD_FORMATTER_UDF) - STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_STREAM_NUM_FORMATTER_UDF) - STRING_STREAM_TEXT_FORMATTER_UDF_MAP(STRING_STREAM_TEXT_FORMATTER_UDF) - STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(STRING_STREAM_HRSZ_FORMATTER_UDF) - - SIMPLE_MODULE(TStringModule, - STRING_UDF_MAP(STRING_REGISTER_UDF) - STRING_UNSAFE_UDF_MAP(STRING_REGISTER_UDF) - STROKA_UDF_MAP(STRING_REGISTER_UDF) - STROKA_CASE_UDF_MAP(STRING_REGISTER_UDF) - STROKA_ASCII_CASE_UDF_MAP(STRING_REGISTER_UDF) - STROKA_FIND_UDF_MAP(STRING_REGISTER_UDF) - STRING_TWO_ARGS_UDF_MAP_DEPRECATED_2025_02(STRING_REGISTER_UDF) - STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_REGISTER_UDF) - STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_OPT_REGISTER_UDF) - IS_ASCII_UDF_MAP(STRING_REGISTER_UDF) - STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) - STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) - STRING_STREAM_TEXT_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) - STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) - TReverse, - TCollapseText, - TReplaceAll, - TReplaceFirst, - TReplaceLast, - TRemoveAll, - TRemoveFirst, - TRemoveLast, - TContains, - TAsciiContainsIgnoreCase, - T_yql_AsciiContainsIgnoreCase, - TFind, - TReverseFind, - TSubstring, - TSplitToList, - TJoinFromList, - TLevensteinDistance, - THumanReadableDuration, - TPrec, - TToByteList, - TFromByteList) - } // namespace +STRING_UDF_MAP(STRING_UDF) +STRING_UNSAFE_UDF_MAP(STRING_UNSAFE_UDF) +STROKA_CASE_UDF_MAP(STROKA_CASE_UDF) +STROKA_ASCII_CASE_UDF_MAP(STROKA_ASCII_CASE_UDF) +STROKA_FIND_UDF_MAP(STROKA_FIND_UDF) +STRING_TWO_ARGS_UDF_MAP_DEPRECATED_2025_02(STRING_TWO_ARGS_UDF_DEPRECATED_2025_02) +STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_ASCII_CMP_IGNORE_CASE_UDF) +IS_ASCII_UDF_MAP(IS_ASCII_UDF) + +static constexpr ui64 padLim = 1000000; +STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_STREAM_PAD_FORMATTER_UDF) +STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_STREAM_NUM_FORMATTER_UDF) +STRING_STREAM_TEXT_FORMATTER_UDF_MAP(STRING_STREAM_TEXT_FORMATTER_UDF) +STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(STRING_STREAM_HRSZ_FORMATTER_UDF) + +SIMPLE_MODULE(TStringModule, + STRING_UDF_MAP(STRING_REGISTER_UDF) + STRING_UNSAFE_UDF_MAP(STRING_REGISTER_UDF) + STROKA_UDF_MAP(STRING_REGISTER_UDF) + STROKA_CASE_UDF_MAP(STRING_REGISTER_UDF) + STROKA_ASCII_CASE_UDF_MAP(STRING_REGISTER_UDF) + STROKA_FIND_UDF_MAP(STRING_REGISTER_UDF) + STRING_TWO_ARGS_UDF_MAP_DEPRECATED_2025_02(STRING_REGISTER_UDF) + STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_REGISTER_UDF) + STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_OPT_REGISTER_UDF) + IS_ASCII_UDF_MAP(STRING_REGISTER_UDF) + STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) + STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) + STRING_STREAM_TEXT_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) + STRING_STREAM_HRSZ_FORMATTER_UDF_MAP(STRING_REGISTER_UDF) + TReverse, + TCollapseText, + TReplaceAll, + TReplaceFirst, + TReplaceLast, + TRemoveAll, + TRemoveFirst, + TRemoveLast, + TContains, + TAsciiContainsIgnoreCase, + T_yql_AsciiContainsIgnoreCase, + TFind, + TReverseFind, + TSubstring, + TSplitToList, + TJoinFromList, + TLevensteinDistance, + THumanReadableDuration, + TPrec, + TToByteList, + TFromByteList) +} // namespace REGISTER_MODULES(TStringModule) |