summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author atarasov5 <[email protected]> 2025-02-10 13:41:59 +0300
committer atarasov5 <[email protected]> 2025-02-10 14:40:42 +0300
commit e46bed95ee43ea70afccfa413ea7e9f9e088cc33 (patch)
tree dd3f39b014a9f1aacc2c132b7fccf6b770e411b3
parent d5a7416eb3d3b6e73c97d2511781875814cb7045 (diff)
YQL-19535: Provide block implementations for some functions
YQL-19535: Provide block operations
YQL-19535: Specify tests for block operations
commit_hash:032aa58fc3f44f0eba3d9b38def021178da949ce
-rw-r--r--yql/essentials/tests/sql/minirun/part0/canondata/result.json6
-rw-r--r--yql/essentials/tests/sql/minirun/part1/canondata/result.json12
-rw-r--r--yql/essentials/tests/sql/minirun/part4/canondata/result.json6
-rw-r--r--yql/essentials/tests/sql/minirun/part5/canondata/result.json12
-rw-r--r--yql/essentials/tests/sql/minirun/part8/canondata/result.json6
-rw-r--r--yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h269
-rw-r--r--yql/essentials/udfs/common/unicode_base/lib/ya.make3
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/canondata/result.json20
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockIsCategory_/results.txt160
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockNormalize_/results.txt92
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockTo_/results.txt102
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt114
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Normalize_/results.txt92
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt31
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in6
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in.attr8
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.sql16
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in4
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in.attr8
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.sql13
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in7
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in.attr8
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.sql12
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in6
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in.attr8
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql30
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in4
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in.attr8
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/Normalize.sql9
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/To.in15
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr20
-rw-r--r--yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql1
-rw-r--r--yql/essentials/udfs/common/unicode_base/ya.make2
33 files changed, 865 insertions, 245 deletions
diff --git a/yql/essentials/tests/sql/minirun/part0/canondata/result.json b/yql/essentials/tests/sql/minirun/part0/canondata/result.json
index 564c44698bf..f7aa9bc69d3 100644
--- a/yql/essentials/tests/sql/minirun/part0/canondata/result.json
+++ b/yql/essentials/tests/sql/minirun/part0/canondata/result.json
@@ -588,9 +588,9 @@
],
"test.test[expr-struct_literal--Debug]": [
{
- "checksum": "7013abbd2487b4c5c0783bd9d8e8773e",
- "size": 582,
- "uri": "https://{canondata_backend}/1942525/ede9d81525f3cde3c09402fe9435fdbba85f47bc/resource.tar.gz#test.test_expr-struct_literal--Debug_/opt.yql"
+ "checksum": "32fb9ad7f0ff99f13245971fde9c9e44",
+ "size": 607,
+ "uri": "https://{canondata_backend}/1600758/668d9612baf2b806cdbf57a4a5626576611cb0c8/resource.tar.gz#test.test_expr-struct_literal--Debug_/opt.yql"
}
],
"test.test[expr-struct_literal--Results]": [
diff --git a/yql/essentials/tests/sql/minirun/part1/canondata/result.json b/yql/essentials/tests/sql/minirun/part1/canondata/result.json
index e299c4d970d..9338973cc71 100644
--- a/yql/essentials/tests/sql/minirun/part1/canondata/result.json
+++ b/yql/essentials/tests/sql/minirun/part1/canondata/result.json
@@ -15,9 +15,9 @@
],
"test.test[action-eval_capture--Debug]": [
{
- "checksum": "addd79d812135465fc39c6ede76b5b00",
- "size": 1065,
- "uri": "https://{canondata_backend}/1925821/e00f3e167890c5f5da97383429fa618c17c22f4b/resource.tar.gz#test.test_action-eval_capture--Debug_/opt.yql"
+ "checksum": "11fa4fe28d1d33bfbe682131dba7ccdf",
+ "size": 1090,
+ "uri": "https://{canondata_backend}/1600758/8128a043e648302a268bf13245bc303a361f75b9/resource.tar.gz#test.test_action-eval_capture--Debug_/opt.yql"
}
],
"test.test[action-eval_capture--Results]": [
@@ -1302,9 +1302,9 @@
],
"test.test[udf-trivial_udf--Debug]": [
{
- "checksum": "e30ef93274f818b56638089fa4a0513e",
- "size": 400,
- "uri": "https://{canondata_backend}/995452/57f8b127ed5fa9fae2dd5ebb0f5870d86a7fcd2f/resource.tar.gz#test.test_udf-trivial_udf--Debug_/opt.yql"
+ "checksum": "8a826f54ac3877f855e9d5f4039f1957",
+ "size": 425,
+ "uri": "https://{canondata_backend}/1937001/da32717675dd7b959b82585f5fe8b8f1d2542461/resource.tar.gz#test.test_udf-trivial_udf--Debug_/opt.yql"
}
],
"test.test[udf-trivial_udf--Results]": [
diff --git a/yql/essentials/tests/sql/minirun/part4/canondata/result.json b/yql/essentials/tests/sql/minirun/part4/canondata/result.json
index 97c6fb14dd7..97562f4708a 100644
--- a/yql/essentials/tests/sql/minirun/part4/canondata/result.json
+++ b/yql/essentials/tests/sql/minirun/part4/canondata/result.json
@@ -281,9 +281,9 @@
],
"test.test[binding-compact_named_with_subq_contexts--Debug]": [
{
- "checksum": "6ae122590fa1a3740afe06c807082844",
- "size": 1096,
- "uri": "https://{canondata_backend}/1925821/db505909f0fb5dcb9a1c2635b652923e2e5d33c8/resource.tar.gz#test.test_binding-compact_named_with_subq_contexts--Debug_/opt.yql"
+ "checksum": "5fbd0bbbfed9dceb14486930c58f6d2a",
+ "size": 1135,
+ "uri": "https://{canondata_backend}/1889210/9d6331356a8b5731f25d9bf2d510824a0256baa0/resource.tar.gz#test.test_binding-compact_named_with_subq_contexts--Debug_/opt.yql"
}
],
"test.test[binding-compact_named_with_subq_contexts--Results]": [
diff --git a/yql/essentials/tests/sql/minirun/part5/canondata/result.json b/yql/essentials/tests/sql/minirun/part5/canondata/result.json
index 148c9d957ee..d109287d0f3 100644
--- a/yql/essentials/tests/sql/minirun/part5/canondata/result.json
+++ b/yql/essentials/tests/sql/minirun/part5/canondata/result.json
@@ -1,9 +1,9 @@
{
"test.test[action-action_udf_args--Debug]": [
{
- "checksum": "8f84413764bb8e1f2b44fbc31956d7ec",
- "size": 430,
- "uri": "https://{canondata_backend}/1925821/6007882aec2e7b1330cc057157b466b121eec1eb/resource.tar.gz#test.test_action-action_udf_args--Debug_/opt.yql"
+ "checksum": "460a745ac85e95986996b9d1aa9379ae",
+ "size": 455,
+ "uri": "https://{canondata_backend}/1809005/41147930b57b9f7a31e613bdd3a9f9eaef9009f6/resource.tar.gz#test.test_action-action_udf_args--Debug_/opt.yql"
}
],
"test.test[action-action_udf_args--Results]": [
@@ -1091,9 +1091,9 @@
],
"test.test[library-library_udf--Debug]": [
{
- "checksum": "e30ef93274f818b56638089fa4a0513e",
- "size": 400,
- "uri": "https://{canondata_backend}/1942100/1466d7e49a6dc5a8df761a5ac92539095e1a14a0/resource.tar.gz#test.test_library-library_udf--Debug_/opt.yql"
+ "checksum": "8a826f54ac3877f855e9d5f4039f1957",
+ "size": 425,
+ "uri": "https://{canondata_backend}/1942671/adb6336095b48aab4fed8e97a973ecc6eb2c7004/resource.tar.gz#test.test_library-library_udf--Debug_/opt.yql"
}
],
"test.test[library-library_udf--Results]": [
diff --git a/yql/essentials/tests/sql/minirun/part8/canondata/result.json b/yql/essentials/tests/sql/minirun/part8/canondata/result.json
index 99429e3c851..76dd1b302e7 100644
--- a/yql/essentials/tests/sql/minirun/part8/canondata/result.json
+++ b/yql/essentials/tests/sql/minirun/part8/canondata/result.json
@@ -1,9 +1,9 @@
{
"test.test[action-eval_percentile-default.txt-Debug]": [
{
- "checksum": "f1d59477f03e0fa8b684f5c6db6a2aca",
- "size": 2173,
- "uri": "https://{canondata_backend}/1925821/dbb639c652a305ac0d22d675f471bfcd73848bae/resource.tar.gz#test.test_action-eval_percentile-default.txt-Debug_/opt.yql"
+ "checksum": "132b473519fcf8576fbbc8a1ecdfd6bd",
+ "size": 2202,
+ "uri": "https://{canondata_backend}/1903280/9b009523486ad950a7d921352a60f1c892f4f1cc/resource.tar.gz#test.test_action-eval_percentile-default.txt-Debug_/opt.yql"
}
],
"test.test[action-eval_percentile-default.txt-Results]": [
diff --git a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
index 4a852a5a6f6..a16582fb4e3 100644
--- a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
+++ b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
@@ -3,6 +3,7 @@
#include <yql/essentials/public/udf/udf_allocator.h>
#include <yql/essentials/public/udf/udf_helpers.h>
#include <yql/essentials/utils/utf8.h>
+#include <yql/essentials/public/udf/arrow/udf_arrow_helpers.h>
#include <library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h>
#include <library/cpp/unicode/normalization/normalization.h>
@@ -24,6 +25,9 @@ using namespace NUdf;
using namespace NUnicode;
namespace {
+ // Local wrapper with the exact signature bool(wchar32) that forwards to the
+ // global ::IsAscii. It lets the name `IsAscii` be used as the function-pointer
+ // template argument of TCheckAllChars (see the IsAscii UDF definition).
+ // NOTE(review): presumably ::IsAscii is a template or an overload set that
+ // cannot be passed as such an argument directly -- confirm.
+ inline constexpr bool IsAscii(wchar32 c) noexcept {
+ return ::IsAscii(c);
+ }
template <class It>
struct TIsUnicodeSpaceAdapter {
@@ -37,51 +41,144 @@ namespace {
return {};
}
-#define NORMALIZE_UDF_MAP(XX) \
- XX(Normalize, NFC) \
- XX(NormalizeNFD, NFD) \
- XX(NormalizeNFC, NFC) \
- XX(NormalizeNFKD, NFKD) \
- XX(NormalizeNFKC, NFKC)
-
-#define IS_CATEGORY_UDF_MAP(XX) \
- XX(IsAscii, IsAscii) \
- XX(IsSpace, IsSpace) \
- XX(IsUpper, IsUpper) \
- XX(IsLower, IsLower) \
- XX(IsDigit, IsDigit) \
- XX(IsAlpha, IsAlpha) \
- XX(IsAlnum, IsAlnum) \
- XX(IsHex, IsHexdigit)
-
-#define NORMALIZE_UDF(name, mode) \
- SIMPLE_UDF(T##name, TUtf8(TAutoMap<TUtf8>)) { \
- const auto& inputRef = args[0].AsStringRef(); \
- const TUtf16String& input = UTF8ToWide(inputRef.Data(), inputRef.Size()); \
- const TString& output = WideToUTF8(Normalize<mode>(input)); \
- return valueBuilder->NewString(output); \
- }
+ // Tag alternative an operation returns (inside a std::variant) to signal that
+ // the input needed no modification; callers then hand back the original
+ // argument instead of building a new value.
+ struct TNoChangesTag {};
-#define IS_CATEGORY_UDF(udfName, function) \
- SIMPLE_UDF(T##udfName, bool(TAutoMap<TUtf8>)) { \
- Y_UNUSED(valueBuilder); \
- const TStringBuf input(args[0].AsStringRef()); \
- bool result = true; \
- wchar32 rune; \
- const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin()); \
- const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end()); \
- while (cur != last) { \
- ReadUTF8CharAndAdvance(rune, cur, last); \
- if (!function(rune)) { \
- result = false; \
- break; \
- } \
- } \
- return TUnboxedValuePod(result); \
- }
+ // CRTP mixin providing the scalar (TUnboxedValue) entry point of a unary UTF-8
+ // operation. TDerived must expose a static Execute(TStringRef) whose result is
+ // a TString, a bool, or a std::variant<TNoChangesTag, T> of one of those.
+ template <typename TDerived>
+ struct TScalarOperationMixin {
+ // Runs TDerived::Execute on args[0] and converts the result to a
+ // TUnboxedValue. The input is only checked to be valid UTF-8 in debug builds.
+ static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) {
+ Y_DEBUG_ABORT_UNLESS(IsUtf8(args[0].AsStringRef()));
+ auto&& executeResult = TDerived::Execute(args[0].AsStringRef());
+ return ProcessResult(builder, std::move(executeResult), args);
+ }
+
+ private:
+ // String result: build a new string value through the value builder.
+ static TUnboxedValue ProcessResult(const IValueBuilder* builder, TString&& newString, const TUnboxedValuePod*) {
+ return builder->NewString(std::move(newString));
+ }
+
+ // Optional-change result: if the variant holds a T, convert it with the
+ // matching overload; if it holds TNoChangesTag, return the original first
+ // argument untouched.
+ template <typename T>
+ static TUnboxedValue ProcessResult(const IValueBuilder* builder, std::variant<TNoChangesTag, T> newValue, const TUnboxedValuePod* initialArg) {
+ if (std::holds_alternative<T>(newValue)) {
+ return ProcessResult(builder, std::move(std::get<T>(newValue)), initialArg);
+ } else {
+ return initialArg[0];
+ }
+ }
+
+ // Boolean result: wrapped directly; the builder is not needed.
+ static TUnboxedValue ProcessResult(const IValueBuilder* builder, bool result, const TUnboxedValuePod*) {
+ Y_UNUSED(builder);
+ return TUnboxedValuePod(result);
+ }
+ };
+
+ // CRTP mixin providing the block (TBlockItem) entry point of a unary UTF-8
+ // operation. TDerived must expose a static Execute(TStringRef) whose result is
+ // a TString, a bool, or a std::variant<TNoChangesTag, T> of one of those.
+ template <typename TDerived>
+ struct TBlockOperationMixin {
+     // Runs TDerived::Execute on `arg`, converts the result to a TBlockItem and
+     // passes it to `sync`. `executeResult` is a local that outlives the call to
+     // `sync`, so an item built from it stays usable for that whole call.
+     // The input is only checked to be valid UTF-8 in debug builds.
+     template <typename Sync>
+     static void DoExecute(const TBlockItem arg, const Sync& sync) {
+         Y_DEBUG_ABORT_UNLESS(IsUtf8(arg.AsStringRef()));
+         auto&& executeResult = TDerived::Execute(arg.AsStringRef());
+         TBlockItem boxedValue = ProcessResult(std::move(executeResult), arg);
+         sync(boxedValue);
+     }
+
+ private:
+     // String result: wrap the produced string in a TBlockItem.
+     // The parameter is a const reference, so it cannot be moved from; it is
+     // passed as is (a std::move here would be a no-op that only suggests an
+     // ownership transfer).
+     // NOTE(review): TBlockItem presumably keeps a non-owning view of the
+     // string, valid only while `newString` is alive -- confirm.
+     static TBlockItem ProcessResult(const TString& newString, const TBlockItem arg) {
+         Y_UNUSED(arg);
+         return TBlockItem(newString);
+     }
+
+     // Optional-change result: if the variant holds a T, convert it with the
+     // matching overload; if it holds TNoChangesTag, return the input item.
+     template <typename T>
+     static TBlockItem ProcessResult(const std::variant<TNoChangesTag, T>& newValue, const TBlockItem arg) {
+         if (std::holds_alternative<T>(newValue)) {
+             return ProcessResult(std::get<T>(newValue), arg);
+         } else {
+             return arg;
+         }
+     }
+
+     // Boolean result: wrapped directly; the input item is not needed.
+     static TBlockItem ProcessResult(bool result, const TBlockItem arg) {
+         Y_UNUSED(arg);
+         return TBlockItem(result);
+     }
+ };
- NORMALIZE_UDF_MAP(NORMALIZE_UDF)
- IS_CATEGORY_UDF_MAP(IS_CATEGORY_UDF)
+ // Combines the block and scalar mixins so that one executor type offers both
+ // DoExecute overloads. The using-declarations bring the two same-named members
+ // from different bases into one overload set.
+ template <typename TDerived>
+ struct TOperationMixin: public TBlockOperationMixin<TDerived>, public TScalarOperationMixin<TDerived> {
+ using TBlockOperationMixin<TDerived>::DoExecute;
+ using TScalarOperationMixin<TDerived>::DoExecute;
+ };
+
+ // Executor that normalizes a UTF-8 string with the normalization form given
+ // by the `mode` template argument.
+ template <auto mode>
+ struct TNormalizeUTF8: public TOperationMixin<TNormalizeUTF8<mode>> {
+     // Converts `arg` to UTF-16, applies Normalize<mode>, and returns the
+     // result converted back to UTF-8.
+     static TString Execute(TStringRef arg) {
+         const TUtf16String wide = UTF8ToWide(arg.Data(), arg.Size());
+         return WideToUTF8(Normalize<mode>(wide));
+     }
+ };
+
+ // Executor that reports whether every code point of a UTF-8 string satisfies
+ // the predicate `Function`. An empty string yields true.
+ template <bool (*Function)(wchar32)>
+ struct TCheckAllChars: public TOperationMixin<TCheckAllChars<Function>> {
+     // Decodes `arg` one code point at a time and stops at the first one the
+     // predicate rejects.
+     static bool Execute(TStringRef arg) {
+         const TStringBuf text(arg);
+         const unsigned char* pos = reinterpret_cast<const unsigned char*>(text.begin());
+         const unsigned char* const end = reinterpret_cast<const unsigned char*>(text.end());
+         wchar32 codePoint;
+         while (pos != end) {
+             // Reads one code point into `codePoint` and moves `pos` past it.
+             ReadUTF8CharAndAdvance(codePoint, pos, end);
+             if (Function(codePoint)) {
+                 continue;
+             }
+             return false;
+         }
+         return true;
+     }
+ };
+
+ // Executor for in-place UTF-16 string transforms. `Function` modifies the
+ // string over the range [pos, pos + count) and returns true when it changed
+ // anything.
+ template <bool (*Function)(TUtf16String&, size_t pos, size_t count)>
+ struct TStringToStringMapper: public TOperationMixin<TStringToStringMapper<Function>> {
+     // Applies `Function` to the whole string. Returns the transformed UTF-8
+     // string, or TNoChangesTag when the transform reported no modification.
+     static std::variant<TNoChangesTag, TString> Execute(TStringRef arg) {
+         auto wide = UTF8ToWide(arg);
+         const bool changed = Function(wide, 0, TUtf16String::npos);
+         if (!changed) {
+             return TNoChangesTag{};
+         }
+         return WideToUTF8(std::move(wide));
+     }
+ };
+
+// Defines the UDF class T<udfName> with both a scalar and a block implementation.
+//   udfName   - UDF name without the leading T.
+//   Executor  - type providing DoExecute(valueBuilder, args) for the scalar path
+//               and DoExecute(blockItem, sink) for the block path.
+//   signature - UDF signature, e.g. TUtf8(TAutoMap<TUtf8>).
+// The scalar body forwards to Executor::DoExecute(valueBuilder, args). The block
+// path goes through T<udfName>KernelExec, a TUnaryKernelExec whose Process
+// forwards each item to Executor::DoExecute(arg1, sink).
+#define DEFINE_UTF8_OPERATION(udfName, Executor, signature) \
+ BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, signature) { \
+ return Executor::DoExecute(valueBuilder, args); \
+ } \
+ \
+ struct T##udfName##KernelExec \
+ : public TUnaryKernelExec<T##udfName##KernelExec> { \
+ template <typename TSink> \
+ static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, const TSink& sink) { \
+ Y_UNUSED(valueBuilder); \
+ Executor::DoExecute(arg1, sink); \
+ } \
+ }; \
+ \
+ END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
+
+ // Normalization UDFs; plain Normalize uses the same form (NFC) as NormalizeNFC.
+ DEFINE_UTF8_OPERATION(Normalize, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(NormalizeNFD, TNormalizeUTF8<NFD>, TUtf8(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(NormalizeNFC, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(NormalizeNFKD, TNormalizeUTF8<NFKD>, TUtf8(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(NormalizeNFKC, TNormalizeUTF8<NFKC>, TUtf8(TAutoMap<TUtf8>));
+
+ // Category predicates: true when every code point of the string matches.
+ DEFINE_UTF8_OPERATION(IsAscii, TCheckAllChars<IsAscii>, bool(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(IsSpace, TCheckAllChars<IsSpace>, bool(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(IsUpper, TCheckAllChars<IsUpper>, bool(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(IsLower, TCheckAllChars<IsLower>, bool(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(IsDigit, TCheckAllChars<IsDigit>, bool(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(IsAlpha, TCheckAllChars<IsAlpha>, bool(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(IsAlnum, TCheckAllChars<IsAlnum>, bool(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(IsHex, TCheckAllChars<IsHexdigit>, bool(TAutoMap<TUtf8>));
+
+ // Case conversions; the input value is returned as is when nothing changes.
+ DEFINE_UTF8_OPERATION(ToTitle, TStringToStringMapper<ToTitle>, TUtf8(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(ToUpper, TStringToStringMapper<ToUpper>, TUtf8(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION(ToLower, TStringToStringMapper<ToLower>, TUtf8(TAutoMap<TUtf8>));
SIMPLE_UDF(TIsUtf, bool(TOptional<char*>)) {
Y_UNUSED(valueBuilder);
@@ -461,27 +558,6 @@ namespace {
return valueBuilder->NewString(WideToUTF8(wide));
}
- SIMPLE_UDF(TToLower, TUtf8(TAutoMap<TUtf8>)) {
- if (auto wide = UTF8ToWide(args->AsStringRef()); ToLower(wide))
- return valueBuilder->NewString(WideToUTF8(wide));
- else
- return *args;
- }
-
- SIMPLE_UDF(TToUpper, TUtf8(TAutoMap<TUtf8>)) {
- if (auto wide = UTF8ToWide(args->AsStringRef()); ToUpper(wide))
- return valueBuilder->NewString(WideToUTF8(wide));
- else
- return *args;
- }
-
- SIMPLE_UDF(TToTitle, TUtf8(TAutoMap<TUtf8>)) {
- if (auto wide = UTF8ToWide(args->AsStringRef()); ToTitle(wide))
- return valueBuilder->NewString(WideToUTF8(wide));
- else
- return *args;
- }
-
SIMPLE_UDF(TStrip, TUtf8(TAutoMap<TUtf8>)) {
const TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin()));
@@ -512,33 +588,42 @@ namespace {
return TUnboxedValuePod(result);
}
-#define REGISTER_NORMALIZE_UDF(name, mode) T##name,
-#define REGISTER_IS_CATEGORY_UDF(name, function) T##name,
#define EXPORTED_UNICODE_BASE_UDF \
- NORMALIZE_UDF_MAP(REGISTER_NORMALIZE_UDF) \
- IS_CATEGORY_UDF_MAP(REGISTER_IS_CATEGORY_UDF) \
- TIsUtf, \
- TGetLength, \
- TSubstring, \
- TFind, \
- TRFind, \
- TSplitToList, \
- TJoinFromList, \
- TLevensteinDistance, \
- TReplaceAll, \
- TReplaceFirst, \
- TReplaceLast, \
- TRemoveAll, \
- TRemoveFirst, \
- TRemoveLast, \
- TToCodePointList, \
- TFromCodePointList, \
- TReverse, \
- TToLower, \
- TToUpper, \
- TToTitle, \
- TToUint64, \
- TTryToUint64, \
- TStrip, \
- TIsUnicodeSet
+ TIsUtf, \
+ TGetLength, \
+ TSubstring, \
+ TFind, \
+ TRFind, \
+ TSplitToList, \
+ TJoinFromList, \
+ TLevensteinDistance, \
+ TReplaceAll, \
+ TReplaceFirst, \
+ TReplaceLast, \
+ TRemoveAll, \
+ TRemoveFirst, \
+ TRemoveLast, \
+ TToCodePointList, \
+ TFromCodePointList, \
+ TReverse, \
+ TToLower, \
+ TToUpper, \
+ TToTitle, \
+ TToUint64, \
+ TTryToUint64, \
+ TStrip, \
+ TIsUnicodeSet, \
+ TNormalize, \
+ TNormalizeNFD, \
+ TNormalizeNFC, \
+ TNormalizeNFKD, \
+ TNormalizeNFKC, \
+ TIsAscii, \
+ TIsSpace, \
+ TIsUpper, \
+ TIsLower, \
+ TIsDigit, \
+ TIsAlpha, \
+ TIsAlnum, \
+ TIsHex
}
diff --git a/yql/essentials/udfs/common/unicode_base/lib/ya.make b/yql/essentials/udfs/common/unicode_base/lib/ya.make
index f50858d02ae..2fda0829667 100644
--- a/yql/essentials/udfs/common/unicode_base/lib/ya.make
+++ b/yql/essentials/udfs/common/unicode_base/lib/ya.make
@@ -2,7 +2,7 @@ LIBRARY()
YQL_ABI_VERSION(
2
- 27
+ 37
0
)
@@ -16,6 +16,7 @@ PEERDIR(
library/cpp/unicode/normalization
library/cpp/unicode/set
yql/essentials/public/udf
+ yql/essentials/public/udf/arrow
yql/essentials/utils
)
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
index 8d19afc4281..8189dd16e08 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
@@ -1,4 +1,19 @@
{
+ "test.test[BlockIsCategory]": [
+ {
+ "uri": "file://test.test_BlockIsCategory_/results.txt"
+ }
+ ],
+ "test.test[BlockNormalize]": [
+ {
+ "uri": "file://test.test_BlockNormalize_/results.txt"
+ }
+ ],
+ "test.test[BlockTo]": [
+ {
+ "uri": "file://test.test_BlockTo_/results.txt"
+ }
+ ],
"test.test[Find]": [
{
"uri": "file://test.test_Find_/results.txt"
@@ -19,6 +34,11 @@
"uri": "file://test.test_List_/results.txt"
}
],
+ "test.test[Normalize]": [
+ {
+ "uri": "file://test.test_Normalize_/results.txt"
+ }
+ ],
"test.test[Remove]": [
{
"uri": "file://test.test_Remove_/results.txt"
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockIsCategory_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockIsCategory_/results.txt
new file mode 100644
index 00000000000..e95de9fe1d2
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockIsCategory_/results.txt
@@ -0,0 +1,160 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "column1";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "column2";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "column3";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "column4";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "column5";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "column6";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "column7";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "column8";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "column9";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "0F3A4E";
+ %true;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %true;
+ %true;
+ %false
+ ];
+ [
+ "\xD0\xB2\xD0\x92\xD0\xB0\xD0\x92\xD1\x8B\xD0\xB0";
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %true;
+ %true;
+ %false;
+ %false
+ ];
+ [
+ "\xD1\x84\xD1\x8B\xD0\xB2";
+ %false;
+ %false;
+ %false;
+ %true;
+ %false;
+ %true;
+ %true;
+ %false;
+ %false
+ ];
+ [
+ "1234";
+ %true;
+ %false;
+ %false;
+ %false;
+ %true;
+ %false;
+ %true;
+ %true;
+ %false
+ ];
+ [
+ "\xD0\xB2\xD1\2132\xD0\xB2-\xD0\xB0";
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false
+ ];
+ [
+ "\xD0\xB2\xD1\x8B\xD0\2601-!\xD1\x8B\xD0\xB2";
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockNormalize_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockNormalize_/results.txt
new file mode 100644
index 00000000000..2fc20b07f1e
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockNormalize_/results.txt
@@ -0,0 +1,92 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize_nfd";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize_nfc";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize_nfkd";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize_nfkc";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "\xC3\xA9";
+ "\xC3\xA9";
+ "e\xCC\x81";
+ "\xC3\xA9";
+ "e\xCC\x81";
+ "\xC3\xA9"
+ ];
+ [
+ "e\xCC\x81";
+ "\xC3\xA9";
+ "e\xCC\x81";
+ "\xC3\xA9";
+ "e\xCC\x81";
+ "\xC3\xA9"
+ ];
+ [
+ "\xC2\xB5";
+ "\xC2\xB5";
+ "\xC2\xB5";
+ "\xC2\xB5";
+ "\xCE\xBC";
+ "\xCE\xBC"
+ ];
+ [
+ "\xE2\x84\x8C";
+ "\xE2\x84\x8C";
+ "\xE2\x84\x8C";
+ "\xE2\x84\x8C";
+ "H";
+ "H"
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockTo_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockTo_/results.txt
new file mode 100644
index 00000000000..7f7b2525d78
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockTo_/results.txt
@@ -0,0 +1,102 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "lower";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "upper";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "title";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "reverse";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "test";
+ "test";
+ "TEST";
+ "Test";
+ "tset"
+ ];
+ [
+ "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82";
+ "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82";
+ "\xD0\xA2\xD0\x95\xD0\xA1\xD0\xA2";
+ "\xD0\xA2\xD0\xB5\xD1\x81\xD1\x82";
+ "\xD1\x82\xD1\x81\xD0\xB5\xD1\x82"
+ ];
+ [
+ "TeSt";
+ "test";
+ "TEST";
+ "Test";
+ "tSeT"
+ ];
+ [
+ "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2";
+ "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82";
+ "\xD0\xA2\xD0\x95\xD0\xA1\xD0\xA2";
+ "\xD0\xA2\xD0\xB5\xD1\x81\xD1\x82";
+ "\xD0\xA2\xD1\x81\xD0\x95\xD1\x82"
+ ];
+ [
+ "Eyl\xC3\xBCl";
+ "eyl\xC3\xBCl";
+ "EYL\xC3\x9CL";
+ "Eyl\xC3\xBCl";
+ "l\xC3\xBClyE"
+ ];
+ [
+ "6";
+ "6";
+ "6";
+ "6";
+ "6"
+ ];
+ [
+ "";
+ "";
+ "";
+ "";
+ ""
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt
index a6fd861c645..e95de9fe1d2 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt
@@ -8,10 +8,10 @@
"StructType";
[
[
- "column0";
+ "value";
[
"DataType";
- "Bool"
+ "Utf8"
]
];
[
@@ -76,85 +76,81 @@
"DataType";
"Bool"
]
- ];
- [
- "column10";
- [
- "DataType";
- "Bool"
- ]
- ];
- [
- "column11";
- [
- "DataType";
- "Bool"
- ]
- ];
- [
- "column12";
- [
- "DataType";
- "Bool"
- ]
- ];
- [
- "column13";
- [
- "DataType";
- "Bool"
- ]
- ];
- [
- "column14";
- [
- "DataType";
- "Bool"
- ]
- ];
- [
- "column15";
- [
- "DataType";
- "Bool"
- ]
- ];
- [
- "column16";
- [
- "DataType";
- "Bool"
- ]
- ];
- [
- "column17";
- [
- "DataType";
- "Bool"
- ]
]
]
]
];
"Data" = [
[
+ "0F3A4E";
%true;
%false;
- %true;
%false;
+ %false;
+ %false;
+ %false;
+ %true;
%true;
+ %false
+ ];
+ [
+ "\xD0\xB2\xD0\x92\xD0\xB0\xD0\x92\xD1\x8B\xD0\xB0";
+ %false;
+ %false;
%false;
+ %false;
+ %false;
+ %true;
%true;
%false;
+ %false
+ ];
+ [
+ "\xD1\x84\xD1\x8B\xD0\xB2";
+ %false;
+ %false;
+ %false;
%true;
%false;
%true;
+ %true;
%false;
+ %false
+ ];
+ [
+ "1234";
%true;
%false;
+ %false;
+ %false;
%true;
%false;
%true;
+ %true;
+ %false
+ ];
+ [
+ "\xD0\xB2\xD1\2132\xD0\xB2-\xD0\xB0";
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false
+ ];
+ [
+ "\xD0\xB2\xD1\x8B\xD0\2601-!\xD1\x8B\xD0\xB2";
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
+ %false;
%false
]
]
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Normalize_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Normalize_/results.txt
new file mode 100644
index 00000000000..2fc20b07f1e
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Normalize_/results.txt
@@ -0,0 +1,92 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize_nfd";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize_nfc";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize_nfkd";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "normalize_nfkc";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "\xC3\xA9";
+ "\xC3\xA9";
+ "e\xCC\x81";
+ "\xC3\xA9";
+ "e\xCC\x81";
+ "\xC3\xA9"
+ ];
+ [
+ "e\xCC\x81";
+ "\xC3\xA9";
+ "e\xCC\x81";
+ "\xC3\xA9";
+ "e\xCC\x81";
+ "\xC3\xA9"
+ ];
+ [
+ "\xC2\xB5";
+ "\xC2\xB5";
+ "\xC2\xB5";
+ "\xC2\xB5";
+ "\xCE\xBC";
+ "\xCE\xBC"
+ ];
+ [
+ "\xE2\x84\x8C";
+ "\xE2\x84\x8C";
+ "\xE2\x84\x8C";
+ "\xE2\x84\x8C";
+ "H";
+ "H"
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt
index 465ad350553..502cea3fd0f 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt
@@ -18,16 +18,6 @@
]
];
[
- "normalize";
- [
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
- ]
- ];
- [
"is";
[
"DataType";
@@ -175,9 +165,6 @@
[
"Eyl\xC3\xBCl"
];
- [
- "Eyl\xC3\xBCl"
- ];
%true;
[
"5"
@@ -221,9 +208,6 @@
[
"\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"
];
- [
- "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"
- ];
%true;
[
"6"
@@ -268,9 +252,6 @@
[
"\xC3\xBAnora"
];
- [
- "\xC3\xBAnora"
- ];
%true;
[
"5"
@@ -314,9 +295,6 @@
[
"Ci\xD1\x87 Ci\xD1\x87"
];
- [
- "Ci\xD1\x87 Ci\xD1\x87"
- ];
%true;
[
"7"
@@ -362,9 +340,6 @@
[
"\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"
];
- [
- "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"
- ];
%true;
[
"13"
@@ -424,9 +399,6 @@
[
"6"
];
- [
- "6"
- ];
%true;
[
"1"
@@ -466,9 +438,6 @@
[
""
];
- [
- ""
- ];
%true;
[
"0"
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in
new file mode 100644
index 00000000000..4aba89386b4
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in
@@ -0,0 +1,6 @@
+{"key"="1";"value"="0F3A4E"};
+{"key"="2";"value"="вВаВыа"};
+{"key"="3";"value"="фыв"};
+{"key"="4";"value"="1234"};
+{"key"="5";"value"="вы2в-а"};
+{"key"="6";"value"="выа1-!ыв"};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in.attr
new file mode 100644
index 00000000000..d5e5b2ca484
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.sql
new file mode 100644
index 00000000000..3a2b3d0c214
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.sql
@@ -0,0 +1,16 @@
+/* syntax version 1 */
+
+/*
+ * Runs the Unicode::Is* category predicates over the Utf8 column `value` of
+ * Input with `pragma UseBlocks` set.
+ * The predicate columns carry no aliases, so the canonized result lists them
+ * under the generated names column1 .. column9; keep the projection order.
+ */
+pragma UseBlocks;
+
+SELECT
+    value AS value,
+    Unicode::IsAscii(value),
+    Unicode::IsSpace(value),
+    Unicode::IsUpper(value),
+    Unicode::IsLower(value),
+    Unicode::IsDigit(value),
+    Unicode::IsAlpha(value),
+    Unicode::IsAlnum(value),
+    Unicode::IsHex(value),
+    Unicode::IsUnicodeSet(value, "[вао]"u)
+FROM Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in
new file mode 100644
index 00000000000..2e56f171a4b
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in
@@ -0,0 +1,4 @@
+{"key"="1";"value"="\xC3\xA9"};
+{"key"="2";"value"="e\xCC\x81"};
+{"key"="3";"value"="\xC2\xB5"};
+{"key"="4";"value"="\xE2\x84\x8C"};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in.attr
new file mode 100644
index 00000000000..d5e5b2ca484
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.sql
new file mode 100644
index 00000000000..c0e063acd6b
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.sql
@@ -0,0 +1,13 @@
+/* syntax version 1 */
+
+/*
+ * Runs Unicode::Normalize and its NFD / NFC / NFKD / NFKC variants over the
+ * Utf8 column `value` of Input with `pragma UseBlocks` set.
+ * NOTE(review): presumably UseBlocks routes the calls through the block
+ * implementations of these UDFs -- confirm against the engine documentation.
+ */
+pragma UseBlocks;
+
+SELECT
+ value AS value,
+ Unicode::Normalize(value) AS normalize,
+ Unicode::NormalizeNFD(value) AS normalize_nfd,
+ Unicode::NormalizeNFC(value) AS normalize_nfc,
+ Unicode::NormalizeNFKD(value) AS normalize_nfkd,
+ Unicode::NormalizeNFKC(value) AS normalize_nfkc
+FROM Input
+
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in
new file mode 100644
index 00000000000..82d72f16711
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in
@@ -0,0 +1,7 @@
+{"key"="1";"value"="test"};
+{"key"="2";"value"="\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"};
+{"key"="3";"value"="TeSt"};
+{"key"="4";"value"="\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"};
+{"key"="5";"value"="Eyl\xC3\xBCl"};
+{"key"="6";"value"="6"};
+{"key"="4";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in.attr
new file mode 100644
index 00000000000..d5e5b2ca484
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.sql
new file mode 100644
index 00000000000..a4d546ca6dd
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.sql
@@ -0,0 +1,12 @@
+/* syntax version 1 */
+
+pragma UseBlocks;
+
+SELECT
+ value,
+ Unicode::ToLower(value) AS lower,
+ Unicode::ToUpper(value) AS upper,
+ Unicode::ToTitle(value) AS title,
+ Unicode::Reverse(value) AS reverse,
+FROM Input;
+
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in
new file mode 100644
index 00000000000..4aba89386b4
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in
@@ -0,0 +1,6 @@
+{"key"="1";"value"="0F3A4E"};
+{"key"="2";"value"="вВаВыа"};
+{"key"="3";"value"="фыв"};
+{"key"="4";"value"="1234"};
+{"key"="5";"value"="вы2в-а"};
+{"key"="6";"value"="выа1-!ыв"};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in.attr
new file mode 100644
index 00000000000..d5e5b2ca484
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql
index 2effa23221e..bd933f911bf 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql
@@ -1,21 +1,13 @@
/* syntax version 1 */
SELECT
- Unicode::IsAscii("sdf"u),
- Unicode::IsAscii("выавыа"u),
- Unicode::IsSpace(" \u2002\u200a"u),
- Unicode::IsSpace("выавыа"u),
- Unicode::IsUpper("ФЫВ"u),
- Unicode::IsUpper("вВаВыа"u),
- Unicode::IsLower("фыв"u),
- Unicode::IsLower("вВаВыа"u),
- Unicode::IsDigit("1234"u),
- Unicode::IsDigit("выавыа"u),
- Unicode::IsAlpha("фвфы"u),
- Unicode::IsAlpha("вы2в-а"u),
- Unicode::IsAlnum("фыв13в"u),
- Unicode::IsAlnum("выа1-}ыв"u),
- Unicode::IsHex("0F3A4E"u),
- Unicode::IsHex("ваоао"u),
- Unicode::IsUnicodeSet("ваоао"u, "[вао]"u),
- Unicode::IsUnicodeSet("ваоао"u, "[ваб]"u)
-
+ value as value,
+ Unicode::IsAscii(value),
+ Unicode::IsSpace(value),
+ Unicode::IsUpper(value),
+ Unicode::IsLower(value),
+ Unicode::IsDigit(value),
+ Unicode::IsAlpha(value),
+ Unicode::IsAlnum(value),
+ Unicode::IsHex(value),
+ Unicode::IsUnicodeSet(value, "[вао]"u)
+FROM Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in
new file mode 100644
index 00000000000..2e56f171a4b
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in
@@ -0,0 +1,4 @@
+{"key"="1";"value"="\xC3\xA9"};
+{"key"="2";"value"="e\xCC\x81"};
+{"key"="3";"value"="\xC2\xB5"};
+{"key"="4";"value"="\xE2\x84\x8C"};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in.attr
new file mode 100644
index 00000000000..d5e5b2ca484
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.sql
new file mode 100644
index 00000000000..c0c8b053894
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.sql
@@ -0,0 +1,9 @@
+/* syntax version 1 */
+SELECT
+ value AS value,
+ Unicode::Normalize(value) AS normalize,
+ Unicode::NormalizeNFD(value) AS normalize_nfd,
+ Unicode::NormalizeNFC(value) AS normalize_nfc,
+ Unicode::NormalizeNFKD(value) AS normalize_nfkd,
+ Unicode::NormalizeNFKC(value) AS normalize_nfkc
+FROM Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/To.in b/yql/essentials/udfs/common/unicode_base/test/cases/To.in
index 5effdb9971b..82d72f16711 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/To.in
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/To.in
@@ -1,8 +1,7 @@
-{"key"="1";"subkey"="1";"value"="test"};
-{"key"="2";"subkey"="2";"value"="\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"};
-{"key"="3";"subkey"="3";"value"="TeSt"};
-{"key"="4";"subkey"="4";"value"="\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"};
-{"key"="5";"subkey"="5";"value"="Eyl\xC3\xBCl"};
-{"key"="6";"subkey"="6";"value"="6"};
-{"key"="4";"subkey"="4";"value"=""};
-
+{"key"="1";"value"="test"};
+{"key"="2";"value"="\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"};
+{"key"="3";"value"="TeSt"};
+{"key"="4";"value"="\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"};
+{"key"="5";"value"="Eyl\xC3\xBCl"};
+{"key"="6";"value"="6"};
+{"key"="4";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr
index 990efb1ff2c..d5e5b2ca484 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr
@@ -1,12 +1,8 @@
-{"_yql_row_spec"={
- "Type"=["StructType";[
- ["key";["DataType";"Utf8"]];
- ["subkey";["DataType";"Utf8"]];
- ["value";["DataType";"Utf8"]]
- ]];
- "SortDirections"=[1;1;];
- "SortedBy"=["key";"subkey";];
- "SortedByTypes"=[["DataType";"Utf8";];["DataType";"Utf8";];];
- "SortMembers"=["key";"subkey";];
-}}
-
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql
index b330682b6ed..cdff12f352b 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql
@@ -1,7 +1,6 @@
/* syntax version 1 */
SELECT
value AS value,
- Unicode::Normalize(value) AS normalize,
Unicode::IsUtf(value) AS is,
Unicode::GetLength(value) AS length,
Unicode::Substring(value, 1) AS one_end_substring,
diff --git a/yql/essentials/udfs/common/unicode_base/ya.make b/yql/essentials/udfs/common/unicode_base/ya.make
index 53a8f3af45b..4ec872e2495 100644
--- a/yql/essentials/udfs/common/unicode_base/ya.make
+++ b/yql/essentials/udfs/common/unicode_base/ya.make
@@ -2,7 +2,7 @@ YQL_UDF_CONTRIB(unicode_udf)
YQL_ABI_VERSION(
2
- 27
+ 37
0
)