diff options
author | lambda-delta <[email protected]> | 2025-07-30 15:17:02 +0300 |
---|---|---|
committer | lambda-delta <[email protected]> | 2025-07-30 15:58:48 +0300 |
commit | 87b8e32728794ca5b09fad0bcd6a440b50c5de1b (patch) | |
tree | 54a0dbda4a7252501f4bf2674a4707b048f50e29 /yql/essentials/udfs | |
parent | 9fcfb259c684ccad981aa200666ae60fc662a25e (diff) |
YQL-20184: Avoid redundant data copies in the String UDF
Avoid unnecessary data copies in the String UDF
commit_hash:958810269bf3a7d54b71005bdd8e55fffcfa8538
Diffstat (limited to 'yql/essentials/udfs')
-rw-r--r-- | yql/essentials/udfs/common/string/string_udf.cpp | 136 |
1 file changed, 98 insertions, 38 deletions
diff --git a/yql/essentials/udfs/common/string/string_udf.cpp b/yql/essentials/udfs/common/string/string_udf.cpp index f42fcfd2479..b1dbb528cbb 100644 --- a/yql/essentials/udfs/common/string/string_udf.cpp +++ b/yql/essentials/udfs/common/string/string_udf.cpp @@ -34,6 +34,18 @@ using namespace NUdf; namespace { +// Wrapper around a library/cpp/html/pcdata function that requires a TString reference. +TString DecodeHtmlPcdata(TStringBuf sz) { + return DecodeHtmlPcdata(TString{sz}); +} + +// Wrapper around a util/string/strip.h function that requires a TString reference. +TString Collapse(TStringBuf s, size_t maxLen = 0) { + TString ret{s}; + Collapse(ret, ret, maxLen); + return ret; +} + TString ReverseBytes(const TStringRef input) { TString result; result.ReserveAndResize(input.Size()); @@ -54,7 +66,7 @@ TString ReverseBits(const TStringRef input) { #define STRING_UDF(udfName, function, minVersion) \ BEGIN_SIMPLE_STRICT_ARROW_UDF_OPTIONS(T##udfName, char*(TAutoMap<char*>), builder.SetMinLangVer(minVersion)) { \ - const TString input(args[0].AsStringRef()); \ + const TStringBuf input(args[0].AsStringRef()); \ const auto& result = function(input); \ return valueBuilder->NewString(result); \ } \ @@ -62,7 +74,7 @@ TString ReverseBits(const TStringRef input) { struct T##udfName##KernelExec: public TUnaryKernelExec<T##udfName##KernelExec> { \ template <typename TSink> \ static void Process(const IValueBuilder*, TBlockItem arg1, const TSink& sink) { \ - const TString input(arg1.AsStringRef()); \ + const TStringBuf input(arg1.AsStringRef()); \ const auto& result = function(input); \ sink(TBlockItem(result)); \ } \ @@ -74,7 +86,7 @@ TString ReverseBits(const TStringRef input) { #define STRING_UNSAFE_UDF(udfName, function) \ BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \ EMPTY_RESULT_ON_EMPTY_ARG(0); \ - const TString input(args[0].AsStringRef()); \ + const TStringBuf input(args[0].AsStringRef()); \ try { \ const auto& result = 
function(input); \ return valueBuilder->NewString(result); \ @@ -92,7 +104,7 @@ TString ReverseBits(const TStringRef input) { return sink(TBlockItem()); \ } \ \ - const TString input(arg1.AsStringRef()); \ + const TStringBuf input(arg1.AsStringRef()); \ try { \ const auto& result = function(input); \ sink(TBlockItem(result)); \ @@ -109,7 +121,7 @@ TString ReverseBits(const TStringRef input) { SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), builder.SetMaxLangVer(NYql::MakeLangVersion(2025, 1))) { EMPTY_RESULT_ON_EMPTY_ARG(0) - const TString input(args[0].AsStringRef()); + const TStringBuf input(args[0].AsStringRef()); try { TUtf16String wide = UTF8ToWide(input); ReverseInPlace(wide); @@ -122,7 +134,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), #define STROKA_CASE_UDF(udfName, function) \ SIMPLE_STRICT_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \ EMPTY_RESULT_ON_EMPTY_ARG(0) \ - const TString input(args[0].AsStringRef()); \ + const TStringBuf input(args[0].AsStringRef()); \ try { \ TUtf16String wide = UTF8ToWide(input); \ function(wide.begin(), wide.size()); \ @@ -163,8 +175,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \ Y_UNUSED(valueBuilder); \ if (args[0]) { \ - const TString haystack(args[0].AsStringRef()); \ - const TString needle(args[1].AsStringRef()); \ + const TStringBuf haystack(args[0].AsStringRef()); \ + const TStringBuf needle(args[1].AsStringRef()); \ return TUnboxedValuePod(haystack.function(needle)); \ } else { \ return TUnboxedValuePod(false); \ @@ -177,8 +189,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), { \ Y_UNUSED(valueBuilder); \ if (args[0]) { \ - const TString haystack(args[0].AsStringRef()); \ - const TString needle(args[1].AsStringRef()); \ + const TStringBuf haystack(args[0].AsStringRef()); \ + const TStringBuf needle(args[1].AsStringRef()); \ return 
TUnboxedValuePod(function(haystack, needle)); \ } else { \ return TUnboxedValuePod(false); \ @@ -188,8 +200,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), #define STRING_ASCII_CMP_IGNORE_CASE_UDF(udfName, function, minVersion) \ TUnboxedValuePod udfName##Impl(const TUnboxedValuePod* args) { \ if (args[0]) { \ - const TString haystack(args[0].AsStringRef()); \ - const TString needle(args[1].AsStringRef()); \ + const TStringBuf haystack(args[0].AsStringRef()); \ + const TStringBuf needle(args[1].AsStringRef()); \ return TUnboxedValuePod(function(haystack, needle)); \ } else { \ return TUnboxedValuePod(false); \ @@ -204,8 +216,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), TBlockItem arg2, const TSink& sink) \ { \ if (arg1) { \ - const TString haystack(arg1.AsStringRef()); \ - const TString needle(arg2.AsStringRef()); \ + const TStringBuf haystack(arg1.AsStringRef()); \ + const TStringBuf needle(arg2.AsStringRef()); \ sink(TBlockItem(function(haystack, needle))); \ } else { \ sink(TBlockItem(false)); \ @@ -283,10 +295,11 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), const TStringBuf input(args[0].AsStringRef()); \ char paddingSymbol = ' '; \ if (args[2]) { \ - if (args[2].AsStringRef().Size() != 1) { \ + TStringBuf filler = args[2].AsStringRef(); \ + if (filler.Size() != 1) { \ ythrow yexception() << "Not 1 symbol in paddingSymbol"; \ } \ - paddingSymbol = TString(args[2].AsStringRef())[0]; \ + paddingSymbol = filler[0]; \ } \ const ui64 padLen = args[1].Get<ui64>(); \ if (padLen > padLim) { \ @@ -305,10 +318,11 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), const TStringBuf input(args.GetElement(0).AsStringRef()); \ char paddingSymbol = ' '; \ if (args.GetElement(2)) { \ - if (args.GetElement(2).AsStringRef().Size() != 1) { \ + TStringBuf filler = args.GetElement(2).AsStringRef(); \ + if (filler.Size() != 1) { \ ythrow yexception() << "Not 1 symbol in 
paddingSymbol"; \ } \ - paddingSymbol = TString(args.GetElement(2).AsStringRef())[0]; \ + paddingSymbol = filler[0]; \ } \ const ui64 padLen = args.GetElement(1).Get<ui64>(); \ if (padLen > padLim) { \ @@ -396,7 +410,7 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), XX(DecodeHtml, DecodeHtmlPcdata, NYql::UnknownLangVersion) \ XX(CgiEscape, CGIEscapeRet, NYql::UnknownLangVersion) \ XX(CgiUnescape, CGIUnescapeRet, NYql::UnknownLangVersion) \ - XX(Strip, Strip, NYql::UnknownLangVersion) \ + XX(Strip, StripString, NYql::UnknownLangVersion) \ XX(Collapse, Collapse, NYql::UnknownLangVersion) \ XX(ReverseBytes, ReverseBytes, NYql::MakeLangVersion(2025, 2)) \ XX(ReverseBits, ReverseBits, NYql::MakeLangVersion(2025, 2)) @@ -506,8 +520,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), if (!args[0]) return TUnboxedValuePod(false); - const TString haystack(args[0].AsStringRef()); - const TString needle(args[1].AsStringRef()); + const TStringBuf haystack(args[0].AsStringRef()); + const TStringBuf needle(args[1].AsStringRef()); return TUnboxedValuePod(haystack.Contains(needle)); } @@ -517,8 +531,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), if (!arg1) return sink(TBlockItem(false)); - const TString haystack(arg1.AsStringRef()); - const TString needle(arg2.AsStringRef()); + const TStringBuf haystack(arg1.AsStringRef()); + const TStringBuf needle(arg2.AsStringRef()); sink(TBlockItem(haystack.Contains(needle))); } }; @@ -538,8 +552,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), return sink(TBlockItem(arg2 ? 
false : true)); } - const TString haystack(arg1.AsStringRef()); - const TString needle(arg2.AsStringRef()); + const TStringBuf haystack(arg1.AsStringRef()); + const TStringBuf needle(arg2.AsStringRef()); if (haystack.empty()) { return sink(TBlockItem((needle.empty()))); } @@ -554,8 +568,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), return TUnboxedValuePod(false); } - const TString haystack(args[0].AsStringRef()); - const TString needle(args[1].AsStringRef()); + const TStringBuf haystack(args[0].AsStringRef()); + const TStringBuf needle(args[1].AsStringRef()); if (haystack.empty()) { return TUnboxedValuePod(needle.empty()); } @@ -797,8 +811,8 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), // not required for them. Hence, only the scalar one is provided. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) { Y_UNUSED(valueBuilder); - const TString haystack(args[0].AsStringRef()); - const TString needle(args[1].AsStringRef()); + const TStringBuf haystack(args[0].AsStringRef()); + const TStringBuf needle(args[1].AsStringRef()); const ui64 pos = args[2].GetOrDefault<ui64>(0); return TUnboxedValuePod(haystack.find(needle, pos)); } @@ -808,18 +822,18 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), // provided. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TReverseFind, i64(TAutoMap<char*>, char*, TOptional<ui64>), 1) { Y_UNUSED(valueBuilder); - const TString haystack(args[0].AsStringRef()); - const TString needle(args[1].AsStringRef()); - const ui64 pos = args[2].GetOrDefault<ui64>(TString::npos); + const TStringBuf haystack(args[0].AsStringRef()); + const TStringBuf needle(args[1].AsStringRef()); + const ui64 pos = args[2].GetOrDefault<ui64>(TStringBuf::npos); return TUnboxedValuePod(haystack.rfind(needle, pos)); } // NOTE: String::Substring is marked as deprecated, so block implementation // is not required for them. 
Hence, only the scalar one is provided. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TSubstring, char*(TAutoMap<char*>, TOptional<ui64>, TOptional<ui64>), 1) { - const TString input(args[0].AsStringRef()); + const TStringBuf input(args[0].AsStringRef()); const ui64 from = args[1].GetOrDefault<ui64>(0); - const ui64 count = args[2].GetOrDefault<ui64>(TString::npos); + const ui64 count = args[2].GetOrDefault<ui64>(TStringBuf::npos); return valueBuilder->NewString(input.substr(from, count)); } @@ -896,14 +910,60 @@ SIMPLE_STRICT_UDF_OPTIONS(TReverse, TOptional<char*>(TOptional<char*>), } SIMPLE_STRICT_UDF(TJoinFromList, char*(TAutoMap<TListType<TOptional<char*>>>, char*)) { + const TStringBuf delimeter(args[1].AsStringRef()); + + // Construct the string in-place if the list is eager. + if (auto elems = args[0].GetElements()) { + ui64 elemCount = args[0].GetListLength(); + ui64 valueCount = 0; + ui64 resultLength = 0; + + for (ui64 i = 0; i != elemCount; ++i) { + if (elems[i]) { + resultLength += elems[i].AsStringRef().Size(); + ++valueCount; + } + } + if (valueCount > 0) { + resultLength += (valueCount - 1) * delimeter.size(); + } + + TUnboxedValue result = valueBuilder->NewStringNotFilled(resultLength); + if (!resultLength) { + return result; + } + + const auto buffer = result.AsStringRef(); + auto it = buffer.Data(); + const auto bufferEnd = buffer.Data() + buffer.Size(); + for (ui64 i = 0; i != elemCount; ++i) { + if (elems[i]) { + TStringBuf curStr = elems[i].AsStringRef(); + memcpy(it, curStr.data(), curStr.size()); + it += curStr.size(); + + // Last element just has been written. + if (it == bufferEnd) { + break; + } + memcpy(it, delimeter.data(), delimeter.size()); + it += delimeter.size(); + } + } + return result; + } + auto input = args[0].GetListIterator(); - const TString delimeter(args[1].AsStringRef()); - TVector<TString> items; + + // Since UnboxedValue can embed small strings, iterating over the list may invalidate StringRefs, thus a copy is required. 
+ TVector<TString, TStdAllocatorForUdf<TString>> items; + if (args[0].HasFastListLength()) { + items.reserve(args[0].GetListLength()); + } for (TUnboxedValue current; input.Next(current);) { if (current) { - TString item(current.AsStringRef()); - items.push_back(std::move(item)); + items.emplace_back(current.AsStringRef()); } } |