aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoraneporada <aneporada@ydb.tech>2023-01-23 11:36:09 +0300
committeraneporada <aneporada@ydb.tech>2023-01-23 11:36:09 +0300
commit93918c7defc46cd2776b2b9229f6d6cd2c28ca93 (patch)
tree87a9958da2257cea0e1d210fce948f6010909a6f
parentbf11eefa37a047706d6f807847aef142a22dd447 (diff)
downloadydb-93918c7defc46cd2776b2b9229f6d6cd2c28ca93.tar.gz
Use UDF strict flag in filter pushdown optimizer. Mark String UDFs as strict where appropriate
Everything is strict in String UDFs except for String::LeftPad and String::RightPad
-rw-r--r--ydb/library/yql/core/common_opt/yql_flatmap_over_join.cpp4
-rw-r--r--ydb/library/yql/public/udf/udf_helpers.h3
-rw-r--r--ydb/library/yql/udfs/common/string/CMakeLists.darwin.txt2
-rw-r--r--ydb/library/yql/udfs/common/string/CMakeLists.linux-aarch64.txt2
-rw-r--r--ydb/library/yql/udfs/common/string/CMakeLists.linux.txt2
-rw-r--r--ydb/library/yql/udfs/common/string/string_udf.cpp223
6 files changed, 121 insertions, 115 deletions
diff --git a/ydb/library/yql/core/common_opt/yql_flatmap_over_join.cpp b/ydb/library/yql/core/common_opt/yql_flatmap_over_join.cpp
index def2bf130e..34afad5254 100644
--- a/ydb/library/yql/core/common_opt/yql_flatmap_over_join.cpp
+++ b/ydb/library/yql/core/common_opt/yql_flatmap_over_join.cpp
@@ -148,7 +148,9 @@ TExprNode::TPtr SingleInputPredicatePushdownOverEquiJoin(TExprNode::TPtr equiJoi
} else if (node->IsCallable("DependsOn")) {
++insideDependsOn;
} else if (isStrict && !insideAssumeStrict && node->IsCallable({"Udf", "ScriptUdf", "Unwrap", "Ensure"})) {
- isStrict = false;
+ if (!node->IsCallable("Udf") || !HasSetting(*node->Child(TCoUdf::idx_Settings), "strict")) {
+ isStrict = false;
+ }
} else if (insideDependsOn && node.Get() == args->Child(0)) {
withDependsOn = true;
}
diff --git a/ydb/library/yql/public/udf/udf_helpers.h b/ydb/library/yql/public/udf/udf_helpers.h
index 8d7e812772..5112ce22fe 100644
--- a/ydb/library/yql/public/udf/udf_helpers.h
+++ b/ydb/library/yql/public/udf/udf_helpers.h
@@ -189,6 +189,9 @@ namespace NUdf {
#define SIMPLE_UDF_OPTIONS(udfName, signature, options) \
UDF(udfName, builder.SimpleSignature<signature>(); options;)
+#define SIMPLE_STRICT_UDF_OPTIONS(udfName, signature, options) \
+ UDF(udfName, builder.SimpleSignature<signature>().IsStrict(); options;)
+
#define SIMPLE_UDF_RUN_OPTIONS(udfName, signature, options) \
UDF_RUN(udfName, builder.SimpleSignature<signature>(); options;)
diff --git a/ydb/library/yql/udfs/common/string/CMakeLists.darwin.txt b/ydb/library/yql/udfs/common/string/CMakeLists.darwin.txt
index 9be6f63db3..443f7fdf0c 100644
--- a/ydb/library/yql/udfs/common/string/CMakeLists.darwin.txt
+++ b/ydb/library/yql/udfs/common/string/CMakeLists.darwin.txt
@@ -24,7 +24,7 @@ target_link_libraries(string_udf INTERFACE
add_global_library_for(string_udf.global string_udf)
target_compile_options(string_udf.global PRIVATE
-DUDF_ABI_VERSION_MAJOR=2
- -DUDF_ABI_VERSION_MINOR=27
+ -DUDF_ABI_VERSION_MINOR=28
-DUDF_ABI_VERSION_PATCH=0
)
target_link_libraries(string_udf.global PUBLIC
diff --git a/ydb/library/yql/udfs/common/string/CMakeLists.linux-aarch64.txt b/ydb/library/yql/udfs/common/string/CMakeLists.linux-aarch64.txt
index 1b7d56adfd..2548da1b23 100644
--- a/ydb/library/yql/udfs/common/string/CMakeLists.linux-aarch64.txt
+++ b/ydb/library/yql/udfs/common/string/CMakeLists.linux-aarch64.txt
@@ -25,7 +25,7 @@ target_link_libraries(string_udf INTERFACE
add_global_library_for(string_udf.global string_udf)
target_compile_options(string_udf.global PRIVATE
-DUDF_ABI_VERSION_MAJOR=2
- -DUDF_ABI_VERSION_MINOR=27
+ -DUDF_ABI_VERSION_MINOR=28
-DUDF_ABI_VERSION_PATCH=0
)
target_link_libraries(string_udf.global PUBLIC
diff --git a/ydb/library/yql/udfs/common/string/CMakeLists.linux.txt b/ydb/library/yql/udfs/common/string/CMakeLists.linux.txt
index 1b7d56adfd..2548da1b23 100644
--- a/ydb/library/yql/udfs/common/string/CMakeLists.linux.txt
+++ b/ydb/library/yql/udfs/common/string/CMakeLists.linux.txt
@@ -25,7 +25,7 @@ target_link_libraries(string_udf INTERFACE
add_global_library_for(string_udf.global string_udf)
target_compile_options(string_udf.global PRIVATE
-DUDF_ABI_VERSION_MAJOR=2
- -DUDF_ABI_VERSION_MINOR=27
+ -DUDF_ABI_VERSION_MINOR=28
-DUDF_ABI_VERSION_PATCH=0
)
target_link_libraries(string_udf.global PUBLIC
diff --git a/ydb/library/yql/udfs/common/string/string_udf.cpp b/ydb/library/yql/udfs/common/string/string_udf.cpp
index 170413e224..c2bce4ae42 100644
--- a/ydb/library/yql/udfs/common/string/string_udf.cpp
+++ b/ydb/library/yql/udfs/common/string/string_udf.cpp
@@ -27,53 +27,54 @@ using namespace NKikimr;
using namespace NUdf;
namespace {
-#define STRING_UDF(udfName, function) \
- SIMPLE_UDF(T##udfName, char*(TAutoMap<char*>)) { \
- const TString input(args[0].AsStringRef()); \
- const auto& result = function(input); \
- return valueBuilder->NewString(result); \
- }
-
-#define STRING_UNSAFE_UDF(udfName, function) \
- SIMPLE_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \
- EMPTY_RESULT_ON_EMPTY_ARG(0); \
- const TString input(args[0].AsStringRef()); \
- try { \
- const auto& result = function(input); \
- return valueBuilder->NewString(result); \
- } catch (yexception&) { \
- return TUnboxedValue(); \
- } \
- }
-
-#define STROKA_UDF(udfName, function) \
- SIMPLE_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \
- EMPTY_RESULT_ON_EMPTY_ARG(0) \
- const TString input(args[0].AsStringRef()); \
- try { \
- TUtf16String wide = UTF8ToWide(input); \
- function(wide); \
- return valueBuilder->NewString(WideToUTF8(wide)); \
- } catch (yexception&) { \
- return TUnboxedValue(); \
- } \
- }
-
-#define STROKA_CASE_UDF(udfName, function) \
- SIMPLE_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \
- EMPTY_RESULT_ON_EMPTY_ARG(0) \
- const TString input(args[0].AsStringRef()); \
- try { \
- TUtf16String wide = UTF8ToWide(input); \
- function(wide.begin(), wide.size()); \
- return valueBuilder->NewString(WideToUTF8(wide)); \
- } catch (yexception&) { \
- return TUnboxedValue(); \
- } \
+#define STRING_UDF(udfName, function) \
+ SIMPLE_STRICT_UDF(T##udfName, char*(TAutoMap<char*>)) { \
+ const TString input(args[0].AsStringRef()); \
+ const auto& result = function(input); \
+ return valueBuilder->NewString(result); \
+ }
+
+// 'unsafe' udf is actually strict - it returns null on any exception
+#define STRING_UNSAFE_UDF(udfName, function) \
+ SIMPLE_STRICT_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) {\
+ EMPTY_RESULT_ON_EMPTY_ARG(0); \
+ const TString input(args[0].AsStringRef()); \
+ try { \
+ const auto& result = function(input); \
+ return valueBuilder->NewString(result); \
+ } catch (yexception&) { \
+ return TUnboxedValue(); \
+ } \
+ }
+
+#define STROKA_UDF(udfName, function) \
+ SIMPLE_STRICT_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \
+ EMPTY_RESULT_ON_EMPTY_ARG(0) \
+ const TString input(args[0].AsStringRef()); \
+ try { \
+ TUtf16String wide = UTF8ToWide(input); \
+ function(wide); \
+ return valueBuilder->NewString(WideToUTF8(wide)); \
+ } catch (yexception&) { \
+ return TUnboxedValue(); \
+ } \
+ }
+
+#define STROKA_CASE_UDF(udfName, function) \
+ SIMPLE_STRICT_UDF(T##udfName, TOptional<char*>(TOptional<char*>)) { \
+ EMPTY_RESULT_ON_EMPTY_ARG(0) \
+ const TString input(args[0].AsStringRef()); \
+ try { \
+ TUtf16String wide = UTF8ToWide(input); \
+ function(wide.begin(), wide.size()); \
+ return valueBuilder->NewString(WideToUTF8(wide)); \
+ } catch (yexception&) { \
+ return TUnboxedValue(); \
+ } \
}
#define STROKA_ASCII_CASE_UDF(udfName, function) \
- SIMPLE_UDF(T##udfName, char*(TAutoMap<char*>)) { \
+ SIMPLE_STRICT_UDF(T##udfName, char*(TAutoMap<char*>)) { \
TString input(args[0].AsStringRef()); \
if (input.function()) { \
return valueBuilder->NewString(input); \
@@ -82,46 +83,46 @@ namespace {
} \
}
-#define STROKA_FIND_UDF(udfName, function) \
- SIMPLE_UDF(T##udfName, bool(TOptional<char*>, char*)) { \
- Y_UNUSED(valueBuilder); \
- if (args[0]) { \
- const TString haystack(args[0].AsStringRef()); \
- const TString needle(args[1].AsStringRef()); \
- return TUnboxedValuePod(haystack.function(needle)); \
- } else { \
- return TUnboxedValuePod(false); \
- } \
- }
-
-#define STRING_TWO_ARGS_UDF(udfName, function) \
- SIMPLE_UDF(T##udfName, bool(TOptional<char*>, char*)) { \
- Y_UNUSED(valueBuilder); \
- if (args[0]) { \
- const TString haystack(args[0].AsStringRef()); \
- const TString needle(args[1].AsStringRef()); \
- return TUnboxedValuePod(function(haystack, needle)); \
- } else { \
- return TUnboxedValuePod(false); \
- } \
- }
-
-#define IS_ASCII_UDF(function) \
- SIMPLE_UDF(T##function, bool(TOptional<char*>)) { \
- Y_UNUSED(valueBuilder); \
- if (args[0]) { \
- const TStringBuf input(args[0].AsStringRef()); \
- bool result = true; \
- for (auto c : input) { \
- if (!function(c)) { \
- result = false; \
- break; \
- } \
- } \
- return TUnboxedValuePod(result); \
- } else { \
- return TUnboxedValuePod(false); \
- } \
+#define STROKA_FIND_UDF(udfName, function) \
+ SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \
+ Y_UNUSED(valueBuilder); \
+ if (args[0]) { \
+ const TString haystack(args[0].AsStringRef()); \
+ const TString needle(args[1].AsStringRef()); \
+ return TUnboxedValuePod(haystack.function(needle)); \
+ } else { \
+ return TUnboxedValuePod(false); \
+ } \
+ }
+
+#define STRING_TWO_ARGS_UDF(udfName, function) \
+ SIMPLE_STRICT_UDF(T##udfName, bool(TOptional<char*>, char*)) { \
+ Y_UNUSED(valueBuilder); \
+ if (args[0]) { \
+ const TString haystack(args[0].AsStringRef()); \
+ const TString needle(args[1].AsStringRef()); \
+ return TUnboxedValuePod(function(haystack, needle)); \
+ } else { \
+ return TUnboxedValuePod(false); \
+ } \
+ }
+
+#define IS_ASCII_UDF(function) \
+ SIMPLE_STRICT_UDF(T##function, bool(TOptional<char*>)) { \
+ Y_UNUSED(valueBuilder); \
+ if (args[0]) { \
+ const TStringBuf input(args[0].AsStringRef()); \
+ bool result = true; \
+ for (auto c : input) { \
+ if (!function(c)) { \
+ result = false; \
+ break; \
+ } \
+ } \
+ return TUnboxedValuePod(result); \
+ } else { \
+ return TUnboxedValuePod(false); \
+ } \
}
#define STRING_UDF_MAP(XX) \
@@ -178,21 +179,21 @@ namespace {
XX(IsAsciiAlnum) \
XX(IsAsciiHex)
- SIMPLE_UDF(TCollapseText, char*(TAutoMap<char*>, ui64)) {
+ SIMPLE_STRICT_UDF(TCollapseText, char*(TAutoMap<char*>, ui64)) {
TString input(args[0].AsStringRef());
ui64 maxLength = args[1].Get<ui64>();
CollapseText(input, maxLength);
return valueBuilder->NewString(input);
}
- SIMPLE_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) {
+ SIMPLE_STRICT_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) {
if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef()))
return valueBuilder->NewString(result);
else
return args[0];
}
- SIMPLE_UDF(TReplaceFirst, char*(TAutoMap<char*>, char*, char*)) {
+ SIMPLE_STRICT_UDF(TReplaceFirst, char*(TAutoMap<char*>, char*, char*)) {
std::string result(args[0].AsStringRef());
const std::string_view what(args[1].AsStringRef());
if (const auto index = result.find(what); index != std::string::npos) {
@@ -202,7 +203,7 @@ namespace {
return args[0];
}
- SIMPLE_UDF(TReplaceLast, char*(TAutoMap<char*>, char*, char*)) {
+ SIMPLE_STRICT_UDF(TReplaceLast, char*(TAutoMap<char*>, char*, char*)) {
std::string result(args[0].AsStringRef());
const std::string_view what(args[1].AsStringRef());
if (const auto index = result.rfind(what); index != std::string::npos) {
@@ -212,7 +213,7 @@ namespace {
return args[0];
}
- SIMPLE_UDF(TRemoveAll, char*(TAutoMap<char*>, char*)) {
+ SIMPLE_STRICT_UDF(TRemoveAll, char*(TAutoMap<char*>, char*)) {
std::string input(args[0].AsStringRef());
const std::string_view remove(args[1].AsStringRef());
const std::unordered_set<char> chars(remove.cbegin(), remove.cend());
@@ -229,7 +230,7 @@ namespace {
return args[0];
}
- SIMPLE_UDF(TRemoveFirst, char*(TAutoMap<char*>, char*)) {
+ SIMPLE_STRICT_UDF(TRemoveFirst, char*(TAutoMap<char*>, char*)) {
std::string input(args[0].AsStringRef());
const std::string_view remove(args[1].AsStringRef());
std::unordered_set<char> chars(remove.cbegin(), remove.cend());
@@ -242,7 +243,7 @@ namespace {
return args[0];
}
- SIMPLE_UDF(TRemoveLast, char*(TAutoMap<char*>, char*)) {
+ SIMPLE_STRICT_UDF(TRemoveLast, char*(TAutoMap<char*>, char*)) {
std::string input(args[0].AsStringRef());
const std::string_view remove(args[1].AsStringRef());
std::unordered_set<char> chars(remove.cbegin(), remove.cend());
@@ -255,7 +256,7 @@ namespace {
return args[0];
}
- SIMPLE_UDF_OPTIONS(TFind, i64(TAutoMap<char*>, char*, TOptional<ui64>),
+ SIMPLE_STRICT_UDF_OPTIONS(TFind, i64(TAutoMap<char*>, char*, TOptional<ui64>),
builder.OptionalArgs(1)) {
Y_UNUSED(valueBuilder);
const TString haystack(args[0].AsStringRef());
@@ -264,8 +265,8 @@ namespace {
return TUnboxedValuePod(haystack.find(needle, pos));
}
- SIMPLE_UDF_OPTIONS(TReverseFind, i64(TAutoMap<char*>, char*, TOptional<ui64>),
- builder.OptionalArgs(1)) {
+ SIMPLE_STRICT_UDF_OPTIONS(TReverseFind, i64(TAutoMap<char*>, char*, TOptional<ui64>),
+ builder.OptionalArgs(1)) {
Y_UNUSED(valueBuilder);
const TString haystack(args[0].AsStringRef());
const TString needle(args[1].AsStringRef());
@@ -273,8 +274,8 @@ namespace {
return TUnboxedValuePod(haystack.rfind(needle, pos));
}
- SIMPLE_UDF_OPTIONS(TSubstring, char*(TAutoMap<char*>, TOptional<ui64>, TOptional<ui64>),
- builder.OptionalArgs(1)) {
+ SIMPLE_STRICT_UDF_OPTIONS(TSubstring, char*(TAutoMap<char*>, TOptional<ui64>, TOptional<ui64>),
+ builder.OptionalArgs(1)) {
const TString input(args[0].AsStringRef());
const ui64 from = args[1].GetOrDefault<ui64>(0);
const ui64 count = args[2].GetOrDefault<ui64>(TString::npos);
@@ -317,7 +318,7 @@ namespace {
using TLimitArg = TNamedArg<ui64, limitName>;
- SIMPLE_UDF_OPTIONS(TSplitToList, TListType<char*>(
+ SIMPLE_STRICT_UDF_OPTIONS(TSplitToList, TListType<char*>(
TOptional<char*>,
char*,
TDelimeterStringArg,
@@ -353,7 +354,7 @@ namespace {
return valueBuilder->NewList(result.data(), result.size());
}
- SIMPLE_UDF(TJoinFromList, char*(TAutoMap<TListType<TOptional<char*>>>, char*)) {
+ SIMPLE_STRICT_UDF(TJoinFromList, char*(TAutoMap<TListType<TOptional<char*>>>, char*)) {
auto input = args[0].GetListIterator();
const TString delimeter(args[1].AsStringRef());
TVector<TString> items;
@@ -368,7 +369,7 @@ namespace {
return valueBuilder->NewString(JoinSeq(delimeter, items));
}
- SIMPLE_UDF(TLevensteinDistance, ui64(TAutoMap<char*>, TAutoMap<char*>)) {
+ SIMPLE_STRICT_UDF(TLevensteinDistance, ui64(TAutoMap<char*>, TAutoMap<char*>)) {
Y_UNUSED(valueBuilder);
const TStringBuf left(args[0].AsStringRef());
const TStringBuf right(args[1].AsStringRef());
@@ -414,69 +415,69 @@ namespace {
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(THex, char*(TAutoMap<ui64>)) {
+ SIMPLE_STRICT_UDF(THex, char*(TAutoMap<ui64>)) {
TStringStream result;
result << Hex(args[0].Get<ui64>());
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(TSHex, char*(TAutoMap<i64>)) {
+ SIMPLE_STRICT_UDF(TSHex, char*(TAutoMap<i64>)) {
TStringStream result;
result << SHex(args[0].Get<i64>());
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(TBin, char*(TAutoMap<ui64>)) {
+ SIMPLE_STRICT_UDF(TBin, char*(TAutoMap<ui64>)) {
TStringStream result;
result << Bin(args[0].Get<ui64>());
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(TSBin, char*(TAutoMap<i64>)) {
+ SIMPLE_STRICT_UDF(TSBin, char*(TAutoMap<i64>)) {
TStringStream result;
result << SBin(args[0].Get<i64>());
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(THexText, char*(TAutoMap<char*>)) {
+ SIMPLE_STRICT_UDF(THexText, char*(TAutoMap<char*>)) {
TStringStream result;
const TStringBuf input(args[0].AsStringRef());
result << HexText(input);
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(TBinText, char*(TAutoMap<char*>)) {
+ SIMPLE_STRICT_UDF(TBinText, char*(TAutoMap<char*>)) {
TStringStream result;
const TStringBuf input(args[0].AsStringRef());
result << BinText(input);
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(THumanReadableDuration, char*(TAutoMap<ui64>)) {
+ SIMPLE_STRICT_UDF(THumanReadableDuration, char*(TAutoMap<ui64>)) {
TStringStream result;
result << HumanReadable(TDuration::MicroSeconds(args[0].Get<ui64>()));
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(THumanReadableQuantity, char*(TAutoMap<ui64>)) {
+ SIMPLE_STRICT_UDF(THumanReadableQuantity, char*(TAutoMap<ui64>)) {
TStringStream result;
result << HumanReadableSize(args[0].Get<ui64>(), SF_QUANTITY);
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(THumanReadableBytes, char*(TAutoMap<ui64>)) {
+ SIMPLE_STRICT_UDF(THumanReadableBytes, char*(TAutoMap<ui64>)) {
TStringStream result;
result << HumanReadableSize(args[0].Get<ui64>(), SF_BYTES);
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(TPrec, char*(TAutoMap<double>, ui64)) {
+ SIMPLE_STRICT_UDF(TPrec, char*(TAutoMap<double>, ui64)) {
TStringStream result;
result << Prec(args[0].Get<double>(), args[1].Get<ui64>());
return valueBuilder->NewString(TStringRef(result.Data(), result.Size()));
}
- SIMPLE_UDF(TToByteList, TListType<ui8>(char*)) {
+ SIMPLE_STRICT_UDF(TToByteList, TListType<ui8>(char*)) {
const TStringBuf input(args[0].AsStringRef());
TUnboxedValue* items = nullptr;
TUnboxedValue result = valueBuilder->NewArray(input.size(), items);
@@ -486,7 +487,7 @@ namespace {
return result;
}
- SIMPLE_UDF(TFromByteList, char*(TListType<ui8>)) {
+ SIMPLE_STRICT_UDF(TFromByteList, char*(TListType<ui8>)) {
auto input = args[0];
if (auto elems = input.GetElements()) {