about summary refs log tree commit diff stats
path: root/yql
diff options
context:
space:
mode:
author atarasov5 <atarasov5@yandex-team.com> 2025-02-12 17:13:07 +0300
committer atarasov5 <atarasov5@yandex-team.com> 2025-02-12 17:28:04 +0300
commit d9d2e3122cc237885d24563de584dce2504d0385 (patch)
tree 5410acbdefb823bf23b27bbe732a06ff6afba003 /yql
parent c0fd4debea162d972c2a8dba88269faab10bc3fb (diff)
download ydb-d9d2e3122cc237885d24563de584dce2504d0385.tar.gz
YQL-19535: Add utf8 udf block implementations
commit_hash:5eac5390db34d1ca89f96441c1cfcff9c5853587
Diffstat (limited to 'yql')
-rw-r--r-- yql/essentials/tests/sql/minirun/part1/canondata/result.json 6
-rw-r--r-- yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h 577
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/result.json 30
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockFind_/results.txt 134
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockRemove_/results.txt 100
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockReplace_/results.txt 124
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockStrip_/results.txt 56
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockUnicode_/results.txt 220
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt 130
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt 164
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt 218
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt 64
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_UnicodeCodePoint_/results.txt 120
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt 384
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in 2
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.sql 12
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in 5
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.sql 12
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.sql 14
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in 6
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.sql 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in 7
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.sql 18
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Find.in 2
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Find.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Find.sql 20
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Remove.in 5
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Remove.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql 12
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Replace.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql 18
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Strip.in 6
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Strip.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql 10
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in 14
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql 5
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in 7
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in.attr 8
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.sql 6
45 files changed, 1663 insertions, 941 deletions
diff --git a/yql/essentials/tests/sql/minirun/part1/canondata/result.json b/yql/essentials/tests/sql/minirun/part1/canondata/result.json
index bd0f9512c7..cb5503d6f3 100644
--- a/yql/essentials/tests/sql/minirun/part1/canondata/result.json
+++ b/yql/essentials/tests/sql/minirun/part1/canondata/result.json
@@ -837,9 +837,9 @@
],
"test.test[params-primitives--Debug]": [
{
- "checksum": "e232122561df92b9658b6e0e81770672",
- "size": 4184,
- "uri": "https://{canondata_backend}/1847551/378b3bd63cdb48c05228707db1486e34df62729c/resource.tar.gz#test.test_params-primitives--Debug_/opt.yql"
+ "checksum": "fc3e867d5a88ffa4e9b513f55347f2e4",
+ "size": 4214,
+ "uri": "https://{canondata_backend}/1936997/7e6197348eba6c6070acf6ee875cc5f6dc62f616/resource.tar.gz#test.test_params-primitives--Debug_/opt.yql"
}
],
"test.test[params-primitives--Results]": [
diff --git a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
index a16582fb4e..6982dbe162 100644
--- a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
+++ b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
@@ -45,19 +45,42 @@ namespace {
template <typename TDerived>
struct TScalarOperationMixin {
- static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) {
+ static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
+ requires requires { TDerived::Execute(TStringRef()); }
+ {
Y_DEBUG_ABORT_UNLESS(IsUtf8(args[0].AsStringRef()));
- auto&& executeResult = TDerived::Execute(args[0].AsStringRef());
+ auto executeResult = TDerived::Execute(args[0].AsStringRef());
+ return ProcessResult(builder, std::move(executeResult), args);
+ }
+
+ static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
+ requires requires { TDerived::Execute(TStringRef(), TStringRef()); }
+ {
+ auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef());
+ return ProcessResult(builder, std::move(executeResult), args);
+ }
+
+ static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
+ requires requires { TDerived::Execute(TStringRef(), TStringRef(), TStringRef()); }
+ {
+ auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef(), args[2].AsStringRef());
+ return ProcessResult(builder, std::move(executeResult), args);
+ }
+
+ static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args)
+ requires requires { TDerived::Execute(TStringRef(), TStringRef(), TMaybe<ui64>()); }
+ {
+ auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef(), args[2] ? TMaybe<ui64>(args[2].Get<ui64>()) : Nothing());
return ProcessResult(builder, std::move(executeResult), args);
}
private:
- static TUnboxedValue ProcessResult(const IValueBuilder* builder, TString&& newString, const TUnboxedValuePod*) {
- return builder->NewString(std::move(newString));
+ static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TString& newString, const TUnboxedValuePod*) {
+ return builder->NewString(newString);
}
template <typename T>
- static TUnboxedValue ProcessResult(const IValueBuilder* builder, std::variant<TNoChangesTag, T> newValue, const TUnboxedValuePod* initialArg) {
+ static TUnboxedValue ProcessResult(const IValueBuilder* builder, const std::variant<TNoChangesTag, T>& newValue, const TUnboxedValuePod* initialArg) {
if (std::holds_alternative<T>(newValue)) {
return ProcessResult(builder, std::move(std::get<T>(newValue)), initialArg);
} else {
@@ -65,7 +88,17 @@ namespace {
}
}
- static TUnboxedValue ProcessResult(const IValueBuilder* builder, bool result, const TUnboxedValuePod*) {
+ template <typename T>
+ static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TMaybe<T>& newValue, const TUnboxedValuePod* initialArg) {
+ if (newValue.Defined()) {
+ return ProcessResult(builder, *newValue, initialArg);
+ } else {
+ return TUnboxedValuePod();
+ }
+ }
+
+ template <typename T, typename = std::enable_if_t<TPrimitiveDataType<T>::Result>>
+ static TUnboxedValue ProcessResult(const IValueBuilder* builder, T result, const TUnboxedValuePod*) {
Y_UNUSED(builder);
return TUnboxedValuePod(result);
}
@@ -73,18 +106,61 @@ namespace {
template <typename TDerived>
struct TBlockOperationMixin {
- template <typename Sync>
- static void DoExecute(const TBlockItem arg, const Sync& sync) {
+ template <typename TSink>
+ static void BlockDoExecute(const TBlockItem arg, const TSink& sink)
+ requires requires { TDerived::Execute(TStringRef()); }
+ {
Y_DEBUG_ABORT_UNLESS(IsUtf8(arg.AsStringRef()));
- auto&& executeResult = TDerived::Execute(arg.AsStringRef());
- TBlockItem boxedValue = ProcessResult(std::move(executeResult), arg);
- sync(boxedValue);
+ auto executeResult = TDerived::Execute(arg.AsStringRef());
+ TBlockItem boxedValue = ProcessResult(executeResult, arg);
+ sink(boxedValue);
+ }
+
+ template <typename TSink>
+ static void BlockDoExecute(const TBlockItem arg1, const TBlockItem arg2, const TSink& sink)
+ requires requires { TDerived::Execute(TStringRef(), TStringRef()); }
+ {
+ auto executeResult = TDerived::Execute(arg1.AsStringRef(),
+ arg2.AsStringRef());
+ TBlockItem boxedValue = ProcessResult(executeResult, arg1);
+ sink(boxedValue);
+ }
+
+ template <typename TSink>
+ static void BlockDoExecute(const TBlockItem args, const TSink& sink)
+ requires(requires { TDerived::Execute(TStringRef(), TStringRef(), TStringRef()); })
+ {
+ auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(),
+ args.GetElement(1).AsStringRef(),
+ args.GetElement(2).AsStringRef());
+ TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0));
+ sink(boxedValue);
+ }
+
+ template <typename TSink>
+ static void BlockDoExecute(const TBlockItem args, const TSink& sink)
+ requires(requires { TDerived::Execute(TStringRef(), TStringRef(), TMaybe<ui64>(0ULL)); })
+ {
+ auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(),
+ args.GetElement(1).AsStringRef(),
+ (args.GetElement(2) ? TMaybe<ui64>(args.GetElement(2).Get<ui64>()) : Nothing()));
+ TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0));
+ sink(boxedValue);
}
private:
static TBlockItem ProcessResult(const TString& newString, const TBlockItem arg) {
Y_UNUSED(arg);
- return TBlockItem(std::move(newString));
+ return TBlockItem(newString);
+ }
+
+ template <typename T>
+ static TBlockItem ProcessResult(const TMaybe<T>& newValue, const TBlockItem arg) {
+ if (newValue.Defined()) {
+ return ProcessResult(*newValue, arg);
+ } else {
+ return TBlockItem();
+ }
}
template <typename T>
@@ -96,17 +172,15 @@ namespace {
}
}
- static TBlockItem ProcessResult(bool result, const TBlockItem arg) {
+ template <typename T, typename = std::enable_if_t<TPrimitiveDataType<T>::Result>>
+ static TBlockItem ProcessResult(T result, const TBlockItem arg) {
Y_UNUSED(arg);
return TBlockItem(result);
}
};
template <typename TDerived>
- struct TOperationMixin: public TBlockOperationMixin<TDerived>, public TScalarOperationMixin<TDerived> {
- using TBlockOperationMixin<TDerived>::DoExecute;
- using TScalarOperationMixin<TDerived>::DoExecute;
- };
+ struct TOperationMixin: public TBlockOperationMixin<TDerived>, public TScalarOperationMixin<TDerived> {};
template <auto mode>
struct TNormalizeUTF8: public TOperationMixin<TNormalizeUTF8<mode>> {
@@ -145,8 +219,210 @@ namespace {
}
};
-#define DEFINE_UTF8_OPERATION(udfName, Executor, signature) \
- BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, signature) { \
+ struct TLengthGetter: public TOperationMixin<TLengthGetter> {
+ static ui64 Execute(TStringRef inputRef) {
+ size_t result;
+ GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), result);
+ return static_cast<ui64>(result);
+ }
+ };
+
+ struct TReverser: public TOperationMixin<TReverser> {
+ static TString Execute(TStringRef inputRef) {
+ auto wide = UTF8ToWide(inputRef);
+ ReverseInPlace(wide);
+ return WideToUTF8(wide);
+ }
+ };
+
+ struct TStripper: public TOperationMixin<TStripper> {
+ static TString Execute(TStringRef inputRef) {
+ const TUtf32String input = UTF8ToUTF32<true>(inputRef);
+ const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin()));
+ return WideToUTF8(result);
+ }
+ };
+
+ struct TAllRemover: public TOperationMixin<TAllRemover> {
+ static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) {
+ TUtf32String input = UTF8ToUTF32<true>(inputRef);
+ const TUtf32String remove = UTF8ToUTF32<true>(removeRef);
+ const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
+ size_t tpos = 0;
+ for (const wchar32 c : input) {
+ if (!chars.contains(c)) {
+ input[tpos++] = c;
+ }
+ }
+ if (tpos != input.size()) {
+ input.resize(tpos);
+ return WideToUTF8(input);
+ }
+ return TNoChangesTag{};
+ }
+ };
+
+ struct TFirstRemover: public TOperationMixin<TFirstRemover> {
+ static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) {
+ TUtf32String input = UTF8ToUTF32<true>(inputRef);
+ const auto remove = UTF8ToUTF32<true>(removeRef);
+ const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
+ for (auto it = input.cbegin(); it != input.cend(); ++it) {
+ if (chars.contains(*it)) {
+ input.erase(it);
+ return WideToUTF8(input);
+ }
+ }
+ return TNoChangesTag{};
+ }
+ };
+
+ struct TUnicodeSetMatcher: public TOperationMixin<TUnicodeSetMatcher> {
+ static bool Execute(TStringRef inputRef, TStringRef customCategoryRef) {
+ const TStringBuf input(inputRef);
+ const TUtf16String& customCategory = UTF8ToWide(customCategoryRef);
+ TUnicodeSet unicodeSet;
+ try {
+ unicodeSet.Parse(customCategory);
+ } catch (...) {
+ UdfTerminate((TStringBuilder() << "Failed to parse unicode set: " << CurrentExceptionMessage()).c_str());
+ }
+ wchar32 rune;
+ const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin());
+ const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end());
+ while (cur != last) {
+ ReadUTF8CharAndAdvance(rune, cur, last);
+ if (!unicodeSet.Has(rune)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ };
+
+ struct TLevensteinDistanceFinder: public TOperationMixin<TLevensteinDistanceFinder> {
+ static ui64 Execute(TStringRef leftRef, TStringRef rightRef) {
+ const TStringBuf left(leftRef);
+ const TStringBuf right(rightRef);
+ const auto& leftUtf32 = UTF8ToUTF32<true>(left);
+ const auto& rightUtf32 = UTF8ToUTF32<true>(right);
+ return NLevenshtein::Distance(leftUtf32, rightUtf32);
+ }
+ };
+
+ struct TLastRemoval: public TOperationMixin<TLastRemoval> {
+ static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) {
+ TUtf32String input = UTF8ToUTF32<true>(inputRef);
+ const TUtf32String remove = UTF8ToUTF32<true>(removeRef);
+ const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
+ for (auto it = input.crbegin(); it != input.crend(); ++it) {
+ if (chars.contains(*it)) {
+ input.erase(input.crend() - it - 1, 1);
+ return WideToUTF8(input);
+ }
+ }
+ return TNoChangesTag{};
+ }
+ };
+
+ struct TAllReplacer: public TOperationMixin<TAllReplacer> {
+ static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
+ if (TString result(inputRef); SubstGlobal(result, whatReplace, toReplace)) {
+ return result;
+ } else {
+ return TNoChangesTag{};
+ }
+ }
+ // Disable implicit casts for arguments.
+ template <typename... Args>
+ static auto Execute(Args&&... args) = delete;
+ };
+
+ struct TFirstReplacer: public TOperationMixin<TFirstReplacer> {
+ static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
+ std::string result(inputRef);
+ const std::string_view what(whatReplace);
+ if (const auto index = result.find(what); index != std::string::npos) {
+ result.replace(index, what.size(), std::string_view(toReplace));
+ return result;
+ }
+ return TNoChangesTag{};
+ }
+ // Disable implicit casts for arguments.
+ template <typename... Args>
+ static auto Execute(Args&&... args) = delete;
+ };
+
+ struct TLastReplacer: public TOperationMixin<TLastReplacer> {
+ static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) {
+ std::string result(inputRef);
+ const std::string_view what(whatReplace);
+ if (const auto index = result.rfind(what); index != std::string::npos) {
+ result.replace(index, what.size(), std::string_view(toReplace));
+ return result;
+ }
+ return TNoChangesTag{};
+ }
+ // Disable implicit casts for arguments.
+ template <typename... Args>
+ static auto Execute(Args&&... args) = delete;
+ };
+
+ struct TFinder: public TOperationMixin<TFinder> {
+ static TMaybe<ui64> Execute(TStringRef inputRef, TStringRef whatFind, TMaybe<ui64> whereFind) {
+ const std::string_view string(inputRef);
+ const std::string_view needle(whatFind);
+ std::string_view::size_type pos = 0U;
+
+ if (auto p = whereFind.GetOrElse(0ULL)) {
+ for (auto ptr = string.data(); p && pos < string.size(); --p) {
+ const auto width = WideCharSize(*ptr);
+ pos += width;
+ ptr += width;
+ }
+ }
+
+ if (const auto find = string.find(needle, pos); std::string_view::npos != find) {
+ size_t result;
+ GetNumberOfUTF8Chars(string.data(), find, result);
+ return static_cast<ui64>(result);
+ }
+ return Nothing();
+ }
+ // Disable implicit casts for arguments.
+ template <typename... Args>
+ static auto Execute(Args&&... args) = delete;
+ };
+
+ struct TRFinder: public TOperationMixin<TRFinder> {
+ static TMaybe<ui64> Execute(TStringRef inputRef, TStringRef whatFind, TMaybe<ui64> whereFind) {
+ const std::string_view string(inputRef);
+ const std::string_view needle(whatFind);
+ std::string_view::size_type pos = std::string_view::npos;
+
+ if (auto p = whereFind.GetOrElse(std::string_view::npos); std::string_view::npos != p) {
+ pos = 0ULL;
+ for (auto ptr = string.data(); p && pos < string.size(); --p) {
+ const auto width = WideCharSize(*ptr);
+ pos += width;
+ ptr += width;
+ }
+ }
+
+ if (const auto find = string.rfind(needle, pos); std::string_view::npos != find) {
+ size_t result;
+ GetNumberOfUTF8Chars(string.data(), find, result);
+ return static_cast<ui64>(result);
+ }
+ return Nothing();
+ }
+ // Disable implicit casts for arguments.
+ template <typename... Args>
+ static auto Execute(Args&&... args) = delete;
+ };
+
+#define DEFINE_UTF8_OPERATION_STRICT(udfName, Executor, signature, optArgs) \
+ BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS(T##udfName, signature, optArgs) { \
return Executor::DoExecute(valueBuilder, args); \
} \
\
@@ -155,30 +431,86 @@ namespace {
template <typename TSink> \
static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, const TSink& sink) { \
Y_UNUSED(valueBuilder); \
- Executor::DoExecute(arg1, sink); \
+ Executor::BlockDoExecute(arg1, sink); \
+ } \
+ }; \
+ \
+ END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
+
+#define DEFINE_UTF8_OPERATION_BIN_BASE(macro, udfName, Executor, signature, optArgs) \
+ macro(T##udfName, signature, optArgs) { \
+ return Executor::DoExecute(valueBuilder, args); \
+ } \
+ \
+ struct T##udfName##KernelExec \
+ : public TBinaryKernelExec<T##udfName##KernelExec> { \
+ template <typename TSink> \
+ static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { \
+ Y_UNUSED(valueBuilder); \
+ Executor::BlockDoExecute(arg1, arg2, sink); \
+ } \
+ }; \
+ \
+ END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
+
+#define DEFINE_UTF8_OPERATION_BIN_STRICT(udfName, Executor, signature, optArgs) \
+ DEFINE_UTF8_OPERATION_BIN_BASE(BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS, udfName, Executor, signature, optArgs)
+
+#define DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(udfName, Executor, signature, optArgs) \
+ DEFINE_UTF8_OPERATION_BIN_BASE(BEGIN_SIMPLE_ARROW_UDF_WITH_OPTIONAL_ARGS, udfName, Executor, signature, optArgs)
+
+#define DEFINE_UTF8_OPERATION_MANY_STRICT(udfName, Executor, signature, argsCount, optArgsCount) \
+ BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS(T##udfName, signature, optArgsCount) { \
+ return Executor::DoExecute(valueBuilder, args); \
+ } \
+ \
+ struct T##udfName##KernelExec \
+ : public TGenericKernelExec<T##udfName##KernelExec, argsCount> { \
+ template <typename TSink> \
+ static void Process(const IValueBuilder* valueBuilder, TBlockItem args, const TSink& sink) { \
+ Y_UNUSED(valueBuilder); \
+ Executor::BlockDoExecute(args, sink); \
} \
}; \
\
END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
- DEFINE_UTF8_OPERATION(Normalize, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(NormalizeNFD, TNormalizeUTF8<NFD>, TUtf8(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(NormalizeNFC, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(NormalizeNFKD, TNormalizeUTF8<NFKD>, TUtf8(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(NormalizeNFKC, TNormalizeUTF8<NFKC>, TUtf8(TAutoMap<TUtf8>));
-
- DEFINE_UTF8_OPERATION(IsAscii, TCheckAllChars<IsAscii>, bool(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(IsSpace, TCheckAllChars<IsSpace>, bool(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(IsUpper, TCheckAllChars<IsUpper>, bool(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(IsLower, TCheckAllChars<IsLower>, bool(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(IsDigit, TCheckAllChars<IsDigit>, bool(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(IsAlpha, TCheckAllChars<IsAlpha>, bool(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(IsAlnum, TCheckAllChars<IsAlnum>, bool(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(IsHex, TCheckAllChars<IsHexdigit>, bool(TAutoMap<TUtf8>));
-
- DEFINE_UTF8_OPERATION(ToTitle, TStringToStringMapper<ToTitle>, TUtf8(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(ToUpper, TStringToStringMapper<ToUpper>, TUtf8(TAutoMap<TUtf8>));
- DEFINE_UTF8_OPERATION(ToLower, TStringToStringMapper<ToLower>, TUtf8(TAutoMap<TUtf8>));
+ DEFINE_UTF8_OPERATION_STRICT(Normalize, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(NormalizeNFD, TNormalizeUTF8<NFD>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(NormalizeNFC, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(NormalizeNFKD, TNormalizeUTF8<NFKD>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(NormalizeNFKC, TNormalizeUTF8<NFKC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+
+ DEFINE_UTF8_OPERATION_STRICT(IsAscii, TCheckAllChars<IsAscii>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(IsSpace, TCheckAllChars<IsSpace>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(IsUpper, TCheckAllChars<IsUpper>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(IsLower, TCheckAllChars<IsLower>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(IsDigit, TCheckAllChars<IsDigit>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(IsAlpha, TCheckAllChars<IsAlpha>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(IsAlnum, TCheckAllChars<IsAlnum>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(IsHex, TCheckAllChars<IsHexdigit>, bool(TAutoMap<TUtf8>), /*optArgs=*/0);
+
+ DEFINE_UTF8_OPERATION_STRICT(ToTitle, TStringToStringMapper<ToTitle>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(ToUpper, TStringToStringMapper<ToUpper>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(ToLower, TStringToStringMapper<ToLower>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+
+ DEFINE_UTF8_OPERATION_STRICT(GetLength, TLengthGetter, ui64(TAutoMap<TUtf8>), /*optArgs=*/0);
+
+ DEFINE_UTF8_OPERATION_STRICT(Reverse, TReverser, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_STRICT(Strip, TStripper, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0);
+
+ DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveAll, TAllRemover, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveFirst, TFirstRemover, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(IsUnicodeSet, TUnicodeSetMatcher, bool(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_BIN_STRICT(LevensteinDistance, TLevensteinDistanceFinder, ui64(TAutoMap<TUtf8>, TAutoMap<TUtf8>), /*optArgs=*/0);
+ DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveLast, TLastRemoval, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0);
+
+ DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceAll, TAllReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0);
+ DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceFirst, TFirstReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0);
+ DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceLast, TLastReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0);
+
+ DEFINE_UTF8_OPERATION_MANY_STRICT(Find, TFinder, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), /*argsCount=*/3, /*optionalArgs=*/1);
+ DEFINE_UTF8_OPERATION_MANY_STRICT(RFind, TRFinder, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), /*argsCount=*/3, /*optionalArgs=*/1);
SIMPLE_UDF(TIsUtf, bool(TOptional<char*>)) {
Y_UNUSED(valueBuilder);
@@ -189,14 +521,6 @@ namespace {
}
}
- SIMPLE_UDF(TGetLength, ui64(TAutoMap<TUtf8>)) {
- Y_UNUSED(valueBuilder);
- const auto& inputRef = args[0].AsStringRef();
- size_t result;
- GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), result);
- return TUnboxedValuePod(static_cast<ui64>(result));
- }
-
SIMPLE_UDF_WITH_OPTIONAL_ARGS(TToUint64, ui64(TAutoMap<TUtf8>, TOptional<ui16>), 1) {
Y_UNUSED(valueBuilder);
const TString inputStr(args[0].AsStringRef());
@@ -252,51 +576,6 @@ namespace {
return valueBuilder->NewString(SubstrUTF8(input, from, len));
}
- SIMPLE_UDF_WITH_OPTIONAL_ARGS(TFind, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), 1) {
- Y_UNUSED(valueBuilder);
- const std::string_view string(args[0].AsStringRef());
- const std::string_view needle(args[1].AsStringRef());
- std::string_view::size_type pos = 0U;
-
- if (auto p = args[2].GetOrDefault<ui64>(0ULL)) {
- for (auto ptr = string.data(); p && pos < string.size(); --p) {
- const auto width = WideCharSize(*ptr);
- pos += width;
- ptr += width;
- }
- }
-
- if (const auto find = string.find(needle, pos); std::string_view::npos != find) {
- size_t result;
- GetNumberOfUTF8Chars(string.data(), find, result);
- return TUnboxedValuePod(static_cast<ui64>(result));
- }
- return TUnboxedValuePod();
- }
-
- SIMPLE_UDF_WITH_OPTIONAL_ARGS(TRFind, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), 1) {
- Y_UNUSED(valueBuilder);
- const std::string_view string(args[0].AsStringRef());
- const std::string_view needle(args[1].AsStringRef());
- std::string_view::size_type pos = std::string_view::npos;
-
- if (auto p = args[2].GetOrDefault<ui64>(std::string_view::npos); std::string_view::npos != p) {
- pos = 0ULL;
- for (auto ptr = string.data(); p && pos < string.size(); --p) {
- const auto width = WideCharSize(*ptr);
- pos += width;
- ptr += width;
- }
- }
-
- if (const auto find = string.rfind(needle, pos); std::string_view::npos != find) {
- size_t result;
- GetNumberOfUTF8Chars(string.data(), find, result);
- return TUnboxedValuePod(static_cast<ui64>(result));
- }
- return TUnboxedValuePod();
- }
-
using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>;
template <typename TIt>
@@ -406,86 +685,6 @@ namespace {
return valueBuilder->NewString(JoinSeq(delimeter, items));
}
- SIMPLE_UDF(TLevensteinDistance, ui64(TAutoMap<TUtf8>, TAutoMap<TUtf8>)) {
- Y_UNUSED(valueBuilder);
- const TStringBuf left(args[0].AsStringRef());
- const TStringBuf right(args[1].AsStringRef());
- const auto& leftUtf32 = UTF8ToUTF32<true>(left);
- const auto& rightUtf32 = UTF8ToUTF32<true>(right);
- const ui64 result = NLevenshtein::Distance(leftUtf32, rightUtf32);
- return TUnboxedValuePod(result);
- }
-
- SIMPLE_UDF(TReplaceAll, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
- if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef()))
- return valueBuilder->NewString(result);
- else
- return args[0];
- }
-
- SIMPLE_UDF(TReplaceFirst, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
- std::string result(args[0].AsStringRef());
- const std::string_view what(args[1].AsStringRef());
- if (const auto index = result.find(what); index != std::string::npos) {
- result.replace(index, what.size(), std::string_view(args[2].AsStringRef()));
- return valueBuilder->NewString(result);
- }
- return args[0];
- }
-
- SIMPLE_UDF(TReplaceLast, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) {
- std::string result(args[0].AsStringRef());
- const std::string_view what(args[1].AsStringRef());
- if (const auto index = result.rfind(what); index != std::string::npos) {
- result.replace(index, what.size(), std::string_view(args[2].AsStringRef()));
- return valueBuilder->NewString(result);
- }
- return args[0];
- }
-
- SIMPLE_UDF(TRemoveAll, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
- TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
- const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
- const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
- size_t tpos = 0;
- for (const wchar32 c : input) {
- if (!chars.contains(c)) {
- input[tpos++] = c;
- }
- }
- if (tpos != input.size()) {
- input.resize(tpos);
- return valueBuilder->NewString(WideToUTF8(input));
- }
- return args[0];
- }
-
- SIMPLE_UDF(TRemoveFirst, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
- TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
- const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
- const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
- for (auto it = input.cbegin(); it != input.cend(); ++it) {
- if (chars.contains(*it)) {
- input.erase(it);
- return valueBuilder->NewString(WideToUTF8(input));
- }
- }
- return args[0];
- }
-
- SIMPLE_UDF(TRemoveLast, TUtf8(TAutoMap<TUtf8>, TUtf8)) {
- TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
- const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef());
- const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend());
- for (auto it = input.crbegin(); it != input.crend(); ++it) {
- if (chars.contains(*it)) {
- input.erase(input.crend() - it - 1, 1);
- return valueBuilder->NewString(WideToUTF8(input));
- }
- }
- return args[0];
- }
-
SIMPLE_UDF(TToCodePointList, TListType<ui32>(TAutoMap<TUtf8>)) {
size_t codePointCount = 0;
const auto& inputRef = args[0].AsStringRef();
@@ -552,42 +751,6 @@ namespace {
return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size()));
}
- SIMPLE_UDF(TReverse, TUtf8(TAutoMap<TUtf8>)) {
- auto wide = UTF8ToWide(args[0].AsStringRef());
- ReverseInPlace(wide);
- return valueBuilder->NewString(WideToUTF8(wide));
- }
-
- SIMPLE_UDF(TStrip, TUtf8(TAutoMap<TUtf8>)) {
- const TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());
- const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin()));
- return valueBuilder->NewString(WideToUTF8(result));
- }
-
- SIMPLE_UDF(TIsUnicodeSet, bool(TAutoMap<TUtf8>, TUtf8)) {
- Y_UNUSED(valueBuilder);
- const TStringBuf input(args[0].AsStringRef());
- const TUtf16String& customCategory = UTF8ToWide(args[1].AsStringRef());
- TUnicodeSet unicodeSet;
- try {
- unicodeSet.Parse(customCategory);
- } catch (...) {
- UdfTerminate((TStringBuilder() << "Failed to parse unicode set: " << CurrentExceptionMessage()).c_str());
- }
- bool result = true;
- wchar32 rune;
- const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin());
- const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end());
- while (cur != last) {
- ReadUTF8CharAndAdvance(rune, cur, last);
- if (!unicodeSet.Has(rune)) {
- result = false;
- break;
- }
- }
- return TUnboxedValuePod(result);
- }
-
#define EXPORTED_UNICODE_BASE_UDF \
TIsUtf, \
TGetLength, \
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
index 8189dd16e0..bac6e1ebc4 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
@@ -1,4 +1,9 @@
{
+ "test.test[BlockFind]": [
+ {
+ "uri": "file://test.test_BlockFind_/results.txt"
+ }
+ ],
"test.test[BlockIsCategory]": [
{
"uri": "file://test.test_BlockIsCategory_/results.txt"
@@ -9,11 +14,31 @@
"uri": "file://test.test_BlockNormalize_/results.txt"
}
],
+ "test.test[BlockRemove]": [
+ {
+ "uri": "file://test.test_BlockRemove_/results.txt"
+ }
+ ],
+ "test.test[BlockReplace]": [
+ {
+ "uri": "file://test.test_BlockReplace_/results.txt"
+ }
+ ],
+ "test.test[BlockStrip]": [
+ {
+ "uri": "file://test.test_BlockStrip_/results.txt"
+ }
+ ],
"test.test[BlockTo]": [
{
"uri": "file://test.test_BlockTo_/results.txt"
}
],
+ "test.test[BlockUnicode]": [
+ {
+ "uri": "file://test.test_BlockUnicode_/results.txt"
+ }
+ ],
"test.test[Find]": [
{
"uri": "file://test.test_Find_/results.txt"
@@ -94,6 +119,11 @@
"uri": "file://test.test_TryToUint64_/results.txt"
}
],
+ "test.test[UnicodeCodePoint]": [
+ {
+ "uri": "file://test.test_UnicodeCodePoint_/results.txt"
+ }
+ ],
"test.test[Unicode]": [
{
"uri": "file://test.test_Unicode_/results.txt"
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockFind_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockFind_/results.txt
new file mode 100644
index 0000000000..4ee0b05ad2
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockFind_/results.txt
@@ -0,0 +1,134 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ];
+ [
+ "column1";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ];
+ [
+ "column2";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ];
+ [
+ "column3";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ];
+ [
+ "column4";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ];
+ [
+ "column5";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ];
+ [
+ "column6";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ [
+ "l\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm\xC3\xA4\xC3\x9Fig"
+ ];
+ [
+ "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm"
+ ];
+ [
+ "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"
+ ];
+ [
+ "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, "
+ ];
+ [
+ "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k"
+ ];
+ [
+ "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"
+ ];
+ [
+ "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r"
+ ]
+ ];
+ [
+ [
+ "l\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm\xC3\xA4\xC3\x9Fig"
+ ];
+ [
+ "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm"
+ ];
+ [
+ "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"
+ ];
+ [
+ "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, "
+ ];
+ [
+ "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k"
+ ];
+ [
+ "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"
+ ];
+ [
+ "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r"
+ ]
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockRemove_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockRemove_/results.txt
new file mode 100644
index 0000000000..4004c75199
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockRemove_/results.txt
@@ -0,0 +1,100 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "all";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "first";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "last";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "first2";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "last2";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2";
+ "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2";
+ "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
+ ];
+ [
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"
+ ];
+ [
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB2\xD1\x8B\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2";
+ "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2";
+ "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
+ ];
+ [
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x8B"
+ ];
+ [
+ "";
+ "";
+ "";
+ "";
+ "";
+ ""
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockReplace_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockReplace_/results.txt
new file mode 100644
index 0000000000..bdb61e7f5c
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockReplace_/results.txt
@@ -0,0 +1,124 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "all";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "first";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "last";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "first2";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "last2";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "first3";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "last3";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2z\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z";
+ "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2";
+ "\xD1\x8B\xD0\xB2zzz\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz"
+ ];
+ [
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fz\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0z\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fzzz\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0zzz\xD1\x87\xD1\x8B"
+ ];
+ [
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "z\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z";
+ "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2";
+ "zzz\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz"
+ ];
+ [
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2z\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0z\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2zzz\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x84\xD1\x8B"
+ ];
+ [
+ "";
+ "";
+ "";
+ "";
+ "";
+ "";
+ "";
+ ""
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockStrip_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockStrip_/results.txt
new file mode 100644
index 0000000000..22df398114
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockStrip_/results.txt
@@ -0,0 +1,56 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "column1";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD0\xB1\xD0\xB5\xD0\xB7 \xD0\xB2\xD0\xBD\xD0\xB5\xD1\x88\xD0\xBD\xD0\xB8\xD1\x85 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2";
+ "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD0\xB1\xD0\xB5\xD0\xB7 \xD0\xB2\xD0\xBD\xD0\xB5\xD1\x88\xD0\xBD\xD0\xB8\xD1\x85 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2"
+ ];
+ [
+ " \xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBB\xD0\xB5\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB";
+ "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBB\xD0\xB5\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB"
+ ];
+ [
+ "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBF\xD1\x80\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB ";
+ "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBF\xD1\x80\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB"
+ ];
+ [
+ "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0_\xD1\x81\xD0\xBE\xD0\xB2\xD1\x81\xD0\xB5\xD0\xBC_\xD0\xB1\xD0\xB5\xD0\xB7_\xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2";
+ "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0_\xD1\x81\xD0\xBE\xD0\xB2\xD1\x81\xD0\xB5\xD0\xBC_\xD0\xB1\xD0\xB5\xD0\xB7_\xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2"
+ ];
+ [
+ "\xE2\x80\x89\xD1\x8E\xD0\xBD\xD0\xB8\xD0\xBA\xD0\xBE\xD0\xB4+\xD0\xBF\xD0\xB5\xD1\x80\xD0\xB5\xD0\xB2\xD0\xBE\xD0\xB4 \xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB8\n";
+ "\xD1\x8E\xD0\xBD\xD0\xB8\xD0\xBA\xD0\xBE\xD0\xB4+\xD0\xBF\xD0\xB5\xD1\x80\xD0\xB5\xD0\xB2\xD0\xBE\xD0\xB4 \xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB8"
+ ];
+ [
+ "";
+ ""
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockUnicode_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockUnicode_/results.txt
new file mode 100644
index 0000000000..76cdb42446
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockUnicode_/results.txt
@@ -0,0 +1,220 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "value";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "is";
+ [
+ "DataType";
+ "Bool"
+ ]
+ ];
+ [
+ "length";
+ [
+ "DataType";
+ "Uint64"
+ ]
+ ];
+ [
+ "one_end_substring";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "two_end_substring";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "remove_all";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "levenstein";
+ [
+ "DataType";
+ "Uint64"
+ ]
+ ];
+ [
+ "reverse";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "find";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Uint64"
+ ]
+ ]
+ ];
+ [
+ "rfind";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Uint64"
+ ]
+ ]
+ ];
+ [
+ "find_from";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Uint64"
+ ]
+ ]
+ ];
+ [
+ "rfind_from";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Uint64"
+ ]
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "Eyl\xC3\xBCl";
+ %true;
+ "5";
+ "yl\xC3\xBCl";
+ "Ey";
+ "Eyl\xC3\xBCl";
+ "5";
+ "l\xC3\xBClyE";
+ #;
+ #;
+ #;
+ #
+ ];
+ [
+ "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F";
+ %true;
+ "6";
+ "\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F";
+ "\xD0\xB6\xD0\xBD";
+ "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F";
+ "5";
+ "\xD1\x8F\xD0\xBD\xD1\x9E\xD1\x96\xD0\xBD\xD0\xB6";
+ #;
+ #;
+ #;
+ #
+ ];
+ [
+ "\xC3\xBAnora";
+ %true;
+ "5";
+ "nora";
+ "\xC3\xBAn";
+ "\xC3\xBAnoa";
+ "5";
+ "aron\xC3\xBA";
+ #;
+ #;
+ #;
+ #
+ ];
+ [
+ "Ci\xD1\x87 Ci\xD1\x87";
+ %true;
+ "7";
+ "i\xD1\x87 Ci\xD1\x87";
+ "Ci";
+ "Ci Ci";
+ "5";
+ "\xD1\x87iC \xD1\x87iC";
+ #;
+ #;
+ #;
+ #
+ ];
+ [
+ "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82";
+ %true;
+ "13";
+ "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82";
+ "\xD0\xBF\xD1\x80";
+ "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82";
+ "5";
+ "\xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF \xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF";
+ [
+ "4"
+ ];
+ [
+ "11"
+ ];
+ [
+ "11"
+ ];
+ [
+ "4"
+ ]
+ ];
+ [
+ "6";
+ %true;
+ "1";
+ "";
+ "6";
+ "6";
+ "1";
+ "6";
+ #;
+ #;
+ #;
+ #
+ ];
+ [
+ "";
+ %true;
+ "0";
+ "";
+ "";
+ "";
+ "0";
+ "";
+ #;
+ #;
+ #;
+ #
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt
index bcccb2b511..4ee0b05ad2 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt
@@ -8,66 +8,73 @@
"StructType";
[
[
- "column0";
+ "value";
[
- "DataType";
- "Utf8"
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
]
];
[
"column1";
[
- "DataType";
- "Utf8"
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
]
];
[
"column2";
[
- "DataType";
- "Utf8"
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
]
- ]
- ]
- ]
- ];
- "Data" = [
- [
- "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm";
- "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m";
- "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, "
- ]
- ]
- }
- ]
- };
- {
- "Write" = [
- {
- "Type" = [
- "ListType";
- [
- "StructType";
- [
+ ];
[
- "column0";
+ "column3";
[
- "DataType";
- "Utf8"
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
]
];
[
- "column1";
+ "column4";
[
- "DataType";
- "Utf8"
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
]
];
[
- "column2";
+ "column5";
[
- "DataType";
- "Utf8"
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ];
+ [
+ "column6";
+ [
+ "OptionalType";
+ [
+ "DataType";
+ "Utf8"
+ ]
]
]
]
@@ -75,9 +82,50 @@
];
"Data" = [
[
- "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k";
- "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m";
- "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r"
+ [
+ "l\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm\xC3\xA4\xC3\x9Fig"
+ ];
+ [
+ "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm"
+ ];
+ [
+ "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"
+ ];
+ [
+ "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, "
+ ];
+ [
+ "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k"
+ ];
+ [
+ "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"
+ ];
+ [
+ "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r"
+ ]
+ ];
+ [
+ [
+ "l\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm\xC3\xA4\xC3\x9Fig"
+ ];
+ [
+ "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm"
+ ];
+ [
+ "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"
+ ];
+ [
+ "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, "
+ ];
+ [
+ "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k"
+ ];
+ [
+ "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"
+ ];
+ [
+ "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r"
+ ]
]
]
}
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt
index 11bcb15a2f..4004c75199 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt
@@ -8,63 +8,45 @@
"StructType";
[
[
- "column0";
+ "value";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"all";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"first";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"last";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"first2";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"last2";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
]
]
@@ -72,104 +54,44 @@
];
"Data" = [
[
- [
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
- ]
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2";
+ "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2";
+ "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
];
[
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"
- ]
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"
];
[
- [
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD0\xB2\xD1\x8B\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2"
- ];
- [
- "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
- ];
- [
- "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
- ]
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB2\xD1\x8B\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2";
+ "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2";
+ "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
];
[
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x8B"
- ]
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x8B"
];
[
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ]
+ "";
+ "";
+ "";
+ "";
+ "";
+ ""
]
]
}
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt
index 7390dbdbc3..bdb61e7f5c 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt
@@ -8,83 +8,59 @@
"StructType";
[
[
- "column0";
+ "value";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"all";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"first";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"last";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"first2";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"last2";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"first3";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"last3";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
]
]
@@ -92,134 +68,54 @@
];
"Data" = [
[
- [
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD1\x8B\xD0\xB2z\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
- ];
- [
- "\xD1\x8B\xD0\xB2zzz\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz"
- ]
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2z\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z";
+ "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2";
+ "\xD1\x8B\xD0\xB2zzz\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz"
];
[
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fz\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0z\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fzzz\xD0\xB0\xD1\x87\xD1\x8B"
- ];
- [
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0zzz\xD1\x87\xD1\x8B"
- ]
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fz\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0z\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fzzz\xD0\xB0\xD1\x87\xD1\x8B";
+ "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0zzz\xD1\x87\xD1\x8B"
];
[
- [
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "z\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z"
- ];
- [
- "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"
- ];
- [
- "zzz\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"
- ];
- [
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz"
- ]
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "z\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z";
+ "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2";
+ "zzz\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
+ "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz"
];
[
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2z\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0z\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2zzz\xD0\xB0\xD1\x84\xD1\x8B"
- ];
- [
- "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x84\xD1\x8B"
- ]
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2z\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0z\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2zzz\xD0\xB0\xD1\x84\xD1\x8B";
+ "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x84\xD1\x8B"
];
[
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ]
+ "";
+ "";
+ "";
+ "";
+ "";
+ "";
+ "";
+ ""
]
]
}
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt
index 613b639ed0..22df398114 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt
@@ -8,7 +8,7 @@
"StructType";
[
[
- "column0";
+ "value";
[
"DataType";
"Utf8"
@@ -20,53 +20,33 @@
"DataType";
"Utf8"
]
- ];
- [
- "column2";
- [
- "DataType";
- "Utf8"
- ]
- ];
- [
- "column3";
- [
- "DataType";
- "Utf8"
- ]
- ];
- [
- "column4";
- [
- "DataType";
- "Utf8"
- ]
- ];
- [
- "column5";
- [
- "DataType";
- "Utf8"
- ]
- ];
- [
- "column6";
- [
- "DataType";
- "Utf8"
- ]
]
]
]
];
"Data" = [
[
- "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
- "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B";
- "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0";
- "\xD0\xB0\xD0\xB0\xD0\xB2 \xD1\x8B\xD0\xB0 \xD1\x8B\xD0\xB2\xD0\xB0 \xD1\x8B\xD0\xB2\xD0\xB0";
- "\xD1\x8B\xD0\xB2\xD0\xB0";
- "\xD0\xB2\xD0\xB0\xD0\xBE\xD0\xB0\xD0\xBE";
+ "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD0\xB1\xD0\xB5\xD0\xB7 \xD0\xB2\xD0\xBD\xD0\xB5\xD1\x88\xD0\xBD\xD0\xB8\xD1\x85 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2";
+ "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD0\xB1\xD0\xB5\xD0\xB7 \xD0\xB2\xD0\xBD\xD0\xB5\xD1\x88\xD0\xBD\xD0\xB8\xD1\x85 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2"
+ ];
+ [
+ " \xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBB\xD0\xB5\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB";
+ "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBB\xD0\xB5\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB"
+ ];
+ [
+ "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBF\xD1\x80\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB ";
+ "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBF\xD1\x80\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB"
+ ];
+ [
+ "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0_\xD1\x81\xD0\xBE\xD0\xB2\xD1\x81\xD0\xB5\xD0\xBC_\xD0\xB1\xD0\xB5\xD0\xB7_\xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2";
+ "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0_\xD1\x81\xD0\xBE\xD0\xB2\xD1\x81\xD0\xB5\xD0\xBC_\xD0\xB1\xD0\xB5\xD0\xB7_\xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2"
+ ];
+ [
+ "\xE2\x80\x89\xD1\x8E\xD0\xBD\xD0\xB8\xD0\xBA\xD0\xBE\xD0\xB4+\xD0\xBF\xD0\xB5\xD1\x80\xD0\xB5\xD0\xB2\xD0\xBE\xD0\xB4 \xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB8\n";
+ "\xD1\x8E\xD0\xBD\xD0\xB8\xD0\xBA\xD0\xBE\xD0\xB4+\xD0\xBF\xD0\xB5\xD1\x80\xD0\xB5\xD0\xB2\xD0\xBE\xD0\xB4 \xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB8"
+ ];
+ [
+ "";
""
]
]
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_UnicodeCodePoint_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_UnicodeCodePoint_/results.txt
new file mode 100644
index 0000000000..cab1fc79ef
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_UnicodeCodePoint_/results.txt
@@ -0,0 +1,120 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "code_point_list";
+ [
+ "ListType";
+ [
+ "DataType";
+ "Uint32"
+ ]
+ ]
+ ];
+ [
+ "from_code_point_list";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ];
+ [
+ "from_lazy_code_point_list";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ [
+ "69";
+ "121";
+ "108";
+ "252";
+ "108"
+ ];
+ "Eyl\xC3\xBCl";
+ "Eyl\xC3\xBCl"
+ ];
+ [
+ [
+ "1078";
+ "1085";
+ "1110";
+ "1118";
+ "1085";
+ "1103"
+ ];
+ "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F";
+ "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"
+ ];
+ [
+ [
+ "250";
+ "110";
+ "111";
+ "114";
+ "97"
+ ];
+ "\xC3\xBAnora";
+ "\xC3\xBAnora"
+ ];
+ [
+ [
+ "67";
+ "105";
+ "1095";
+ "32";
+ "67";
+ "105";
+ "1095"
+ ];
+ "Ci\xD1\x87 Ci\xD1\x87";
+ "Ci\xD1\x87 Ci\xD1\x87"
+ ];
+ [
+ [
+ "1087";
+ "1088";
+ "1080";
+ "1074";
+ "1077";
+ "1090";
+ "32";
+ "1087";
+ "1088";
+ "1080";
+ "1074";
+ "1077";
+ "1090"
+ ];
+ "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82";
+ "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"
+ ];
+ [
+ [
+ "54"
+ ];
+ "6";
+ "6"
+ ];
+ [
+ [];
+ "";
+ ""
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt
index 502cea3fd0..76cdb42446 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt
@@ -10,11 +10,8 @@
[
"value";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
@@ -27,94 +24,43 @@
[
"length";
[
- "OptionalType";
- [
- "DataType";
- "Uint64"
- ]
+ "DataType";
+ "Uint64"
]
];
[
"one_end_substring";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"two_end_substring";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"remove_all";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
"levenstein";
[
- "OptionalType";
- [
- "DataType";
- "Uint64"
- ]
- ]
- ];
- [
- "code_point_list";
- [
- "OptionalType";
- [
- "ListType";
- [
- "DataType";
- "Uint32"
- ]
- ]
- ]
- ];
- [
- "from_code_point_list";
- [
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
- ]
- ];
- [
- "from_lazy_code_point_list";
- [
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Uint64"
]
];
[
"reverse";
[
- "OptionalType";
- [
- "DataType";
- "Utf8"
- ]
+ "DataType";
+ "Utf8"
]
];
[
@@ -162,226 +108,70 @@
];
"Data" = [
[
- [
- "Eyl\xC3\xBCl"
- ];
+ "Eyl\xC3\xBCl";
%true;
- [
- "5"
- ];
- [
- "yl\xC3\xBCl"
- ];
- [
- "Ey"
- ];
- [
- "Eyl\xC3\xBCl"
- ];
- [
- "5"
- ];
- [
- [
- "69";
- "121";
- "108";
- "252";
- "108"
- ]
- ];
- [
- "Eyl\xC3\xBCl"
- ];
- [
- "Eyl\xC3\xBCl"
- ];
- [
- "l\xC3\xBClyE"
- ];
+ "5";
+ "yl\xC3\xBCl";
+ "Ey";
+ "Eyl\xC3\xBCl";
+ "5";
+ "l\xC3\xBClyE";
#;
#;
#;
#
];
[
- [
- "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"
- ];
+ "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F";
%true;
- [
- "6"
- ];
- [
- "\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"
- ];
- [
- "\xD0\xB6\xD0\xBD"
- ];
- [
- "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"
- ];
- [
- "5"
- ];
- [
- [
- "1078";
- "1085";
- "1110";
- "1118";
- "1085";
- "1103"
- ]
- ];
- [
- "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"
- ];
- [
- "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"
- ];
- [
- "\xD1\x8F\xD0\xBD\xD1\x9E\xD1\x96\xD0\xBD\xD0\xB6"
- ];
+ "6";
+ "\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F";
+ "\xD0\xB6\xD0\xBD";
+ "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F";
+ "5";
+ "\xD1\x8F\xD0\xBD\xD1\x9E\xD1\x96\xD0\xBD\xD0\xB6";
#;
#;
#;
#
];
[
- [
- "\xC3\xBAnora"
- ];
+ "\xC3\xBAnora";
%true;
- [
- "5"
- ];
- [
- "nora"
- ];
- [
- "\xC3\xBAn"
- ];
- [
- "\xC3\xBAnoa"
- ];
- [
- "5"
- ];
- [
- [
- "250";
- "110";
- "111";
- "114";
- "97"
- ]
- ];
- [
- "\xC3\xBAnora"
- ];
- [
- "\xC3\xBAnora"
- ];
- [
- "aron\xC3\xBA"
- ];
+ "5";
+ "nora";
+ "\xC3\xBAn";
+ "\xC3\xBAnoa";
+ "5";
+ "aron\xC3\xBA";
#;
#;
#;
#
];
[
- [
- "Ci\xD1\x87 Ci\xD1\x87"
- ];
+ "Ci\xD1\x87 Ci\xD1\x87";
%true;
- [
- "7"
- ];
- [
- "i\xD1\x87 Ci\xD1\x87"
- ];
- [
- "Ci"
- ];
- [
- "Ci Ci"
- ];
- [
- "5"
- ];
- [
- [
- "67";
- "105";
- "1095";
- "32";
- "67";
- "105";
- "1095"
- ]
- ];
- [
- "Ci\xD1\x87 Ci\xD1\x87"
- ];
- [
- "Ci\xD1\x87 Ci\xD1\x87"
- ];
- [
- "\xD1\x87iC \xD1\x87iC"
- ];
+ "7";
+ "i\xD1\x87 Ci\xD1\x87";
+ "Ci";
+ "Ci Ci";
+ "5";
+ "\xD1\x87iC \xD1\x87iC";
#;
#;
#;
#
];
[
- [
- "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"
- ];
+ "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82";
%true;
- [
- "13"
- ];
- [
- "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"
- ];
- [
- "\xD0\xBF\xD1\x80"
- ];
- [
- "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"
- ];
- [
- "5"
- ];
- [
- [
- "1087";
- "1088";
- "1080";
- "1074";
- "1077";
- "1090";
- "32";
- "1087";
- "1088";
- "1080";
- "1074";
- "1077";
- "1090"
- ]
- ];
- [
- "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"
- ];
- [
- "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"
- ];
- [
- "\xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF \xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF"
- ];
+ "13";
+ "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82";
+ "\xD0\xBF\xD1\x80";
+ "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82";
+ "5";
+ "\xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF \xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF";
[
"4"
];
@@ -396,76 +186,28 @@
]
];
[
- [
- "6"
- ];
+ "6";
%true;
- [
- "1"
- ];
- [
- ""
- ];
- [
- "6"
- ];
- [
- "6"
- ];
- [
- "1"
- ];
- [
- [
- "54"
- ]
- ];
- [
- "6"
- ];
- [
- "6"
- ];
- [
- "6"
- ];
+ "1";
+ "";
+ "6";
+ "6";
+ "1";
+ "6";
#;
#;
#;
#
];
[
- [
- ""
- ];
+ "";
%true;
- [
- "0"
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- "0"
- ];
- [
- []
- ];
- [
- ""
- ];
- [
- ""
- ];
- [
- ""
- ];
+ "0";
+ "";
+ "";
+ "";
+ "0";
+ "";
#;
#;
#;
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in
new file mode 100644
index 0000000000..c40336b0e2
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in
@@ -0,0 +1,2 @@
+{"key"="1";"value"="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"};
+{"key"="2";"value"="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in.attr
new file mode 100644
index 0000000000..ea891bb344
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["OptionalType";["DataType";"Utf8"]]];
+ ["value";["OptionalType";["DataType";"Utf8"]]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.sql
new file mode 100644
index 0000000000..0954e77c94
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.sql
@@ -0,0 +1,12 @@
+
+pragma UseBlocks;
+
+SELECT
+ value as value,
+ Unicode::Substring(value, Unicode::Find(value, "ä"u), Unicode::RFind(value, "ä"u) - Unicode::Find(value, "ä"u)),
+ Unicode::Substring(value, Unicode::Find(value, "ö"u), Unicode::RFind(value, "ö"u) - Unicode::Find(value, "ö"u)),
+ Unicode::Substring(value, Unicode::Find(value, "ü"u), Unicode::RFind(value, "ü"u) - Unicode::Find(value, "ü"u)),
+ Unicode::Substring(value, Unicode::Find(value, "ä"u, 30ul), Unicode::RFind(value, "ä"u, 123ul) - Unicode::Find(value, "ä"u, 30ul)),
+ Unicode::Substring(value, Unicode::Find(value, "ö"u, 9ul), Unicode::RFind(value, "ö"u, 103ul) - Unicode::Find(value, "ö"u, 9ul)),
+ Unicode::Substring(value, Unicode::Find(value, "ü"u, 45ul), Unicode::RFind(value, "ü"u, 83ul) - Unicode::Find(value, "ü"u, 45ul))
+from Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in
new file mode 100644
index 0000000000..95262ac2b9
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in
@@ -0,0 +1,5 @@
+{"key"="1";"value"="ываыва"};
+{"key"="2";"value"="ячсячсяаачы"};
+{"key"="3";"value"="аавыаываыва"};
+{"key"="4";"value"="gd2цй3ываафы"};
+{"key"="5";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in.attr
new file mode 100644
index 0000000000..d5e5b2ca48
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.sql
new file mode 100644
index 0000000000..04a8593148
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.sql
@@ -0,0 +1,12 @@
+/* syntax version 1 */
+
+pragma UseBlocks;
+
+SELECT
+ value as value,
+ Unicode::RemoveAll(value, "фа"u) AS all,
+ Unicode::RemoveFirst(value, "а"u) AS first,
+ Unicode::RemoveLast(value, "а"u) AS last,
+ Unicode::RemoveFirst(value, "фа"u) AS first2,
+ Unicode::RemoveLast(value, "фа"u) AS last2
+FROM Input;
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.in.attr
new file mode 100644
index 0000000000..d5e5b2ca48
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.sql
new file mode 100644
index 0000000000..c50f01c184
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.sql
@@ -0,0 +1,14 @@
+/* syntax version 1 */
+
+pragma UseBlocks;
+
+SELECT
+ value,
+ Unicode::ReplaceAll(value, Utf8("аф"), Utf8("zzz")) AS all,
+ Unicode::ReplaceFirst(value, Utf8("а"), Utf8("z")) AS first,
+ Unicode::ReplaceLast(value, Utf8("а"), Utf8("z")) AS last,
+ Unicode::ReplaceFirst(value, Utf8("а"), Utf8("")) AS first2,
+ Unicode::ReplaceLast(value, Utf8("а"), Utf8("")) AS last2,
+ Unicode::ReplaceFirst(value, Utf8("а"), Utf8("zzz")) AS first3,
+ Unicode::ReplaceLast(value, Utf8("а"), Utf8("zzz")) AS last3
+FROM Input \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in
new file mode 100644
index 0000000000..d8e23353ed
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in
@@ -0,0 +1,6 @@
+{"key"="1";"value"="строка без внешних пробелов"};
+{"key"="2";"value"=" только левый пробел"};
+{"key"="3";"value"="только правый пробел "};
+{"key"="4";"value"="строка_совсем_без_пробелов"};
+{"key"="5";"value"="\u2009юникод+перевод строки\n"};
+{"key"="6";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in.attr
new file mode 100644
index 0000000000..d5e5b2ca48
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.sql
new file mode 100644
index 0000000000..04e1b04764
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.sql
@@ -0,0 +1,8 @@
+/* syntax version 1 */
+
+pragma UseBlocks;
+
+SELECT
+ value as value,
+ Unicode::Strip(value)
+From Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in
new file mode 100644
index 0000000000..d9c36c855a
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in
@@ -0,0 +1,7 @@
+{"key"="";"value"="Eyl\xC3\xBCl"};
+{"key"="";"value"="\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"};
+{"key"="";"value"="\xC3\xBAnora"};
+{"key"="";"value"="Ci\xD1\x87 Ci\xD1\x87"};
+{"key"="";"value"="\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"};
+{"key"="";"value"="6"};
+{"key"="";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in.attr
new file mode 100644
index 0000000000..5f1b009fbf
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+} \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.sql
new file mode 100644
index 0000000000..3ab0ffc96f
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.sql
@@ -0,0 +1,18 @@
+/* syntax version 1 */
+
+pragma UseBlocks;
+
+SELECT
+ value AS value,
+ Unicode::IsUtf(value) AS is,
+ Unicode::GetLength(value) AS length,
+ Unicode::Substring(value, 1) AS one_end_substring,
+ Unicode::Substring(value, 0, 2) AS two_end_substring,
+ Unicode::RemoveAll(value, "\xD1\x87пr") AS remove_all,
+ Unicode::LevensteinDistance(value, value || Unicode::Substring(value, 0, 5)) AS levenstein,
+ Unicode::Reverse(value) AS reverse,
+ Unicode::Find(value, "ет"u) AS find,
+ Unicode::RFind(value, "ет"u) AS rfind,
+ Unicode::Find(value, "ет"u, 7ul) AS find_from,
+ Unicode::RFind(value, "ет"u, 7ul) AS rfind_from
+FROM Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Find.in b/yql/essentials/udfs/common/unicode_base/test/cases/Find.in
new file mode 100644
index 0000000000..c40336b0e2
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Find.in
@@ -0,0 +1,2 @@
+{"key"="1";"value"="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"};
+{"key"="2";"value"="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Find.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Find.in.attr
new file mode 100644
index 0000000000..ea891bb344
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Find.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["OptionalType";["DataType";"Utf8"]]];
+ ["value";["OptionalType";["DataType";"Utf8"]]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Find.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Find.sql
index 9a9a58752e..1515be76c6 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/Find.sql
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Find.sql
@@ -1,13 +1,9 @@
-$text ="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"u;
-
SELECT
- Unicode::Substring($text, Unicode::Find($text, "ä"u), Unicode::RFind($text, "ä"u) - Unicode::Find($text, "ä"u)),
- Unicode::Substring($text, Unicode::Find($text, "ö"u), Unicode::RFind($text, "ö"u) - Unicode::Find($text, "ö"u)),
- Unicode::Substring($text, Unicode::Find($text, "ü"u), Unicode::RFind($text, "ü"u) - Unicode::Find($text, "ü"u));
-
-
-SELECT
- Unicode::Substring($text, Unicode::Find($text, "ä"u, 30ul), Unicode::RFind($text, "ä"u, 123ul) - Unicode::Find($text, "ä"u, 30ul)),
- Unicode::Substring($text, Unicode::Find($text, "ö"u, 9ul), Unicode::RFind($text, "ö"u, 103ul) - Unicode::Find($text, "ö"u, 9ul)),
- Unicode::Substring($text, Unicode::Find($text, "ü"u, 45ul), Unicode::RFind($text, "ü"u, 83ul) - Unicode::Find($text, "ü"u, 45ul));
-
+ value as value,
+ Unicode::Substring(value, Unicode::Find(value, "ä"u), Unicode::RFind(value, "ä"u) - Unicode::Find(value, "ä"u)),
+ Unicode::Substring(value, Unicode::Find(value, "ö"u), Unicode::RFind(value, "ö"u) - Unicode::Find(value, "ö"u)),
+ Unicode::Substring(value, Unicode::Find(value, "ü"u), Unicode::RFind(value, "ü"u) - Unicode::Find(value, "ü"u)),
+ Unicode::Substring(value, Unicode::Find(value, "ä"u, 30ul), Unicode::RFind(value, "ä"u, 123ul) - Unicode::Find(value, "ä"u, 30ul)),
+ Unicode::Substring(value, Unicode::Find(value, "ö"u, 9ul), Unicode::RFind(value, "ö"u, 103ul) - Unicode::Find(value, "ö"u, 9ul)),
+ Unicode::Substring(value, Unicode::Find(value, "ü"u, 45ul), Unicode::RFind(value, "ü"u, 83ul) - Unicode::Find(value, "ü"u, 45ul))
+from Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in
new file mode 100644
index 0000000000..95262ac2b9
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in
@@ -0,0 +1,5 @@
+{"key"="1";"value"="ываыва"};
+{"key"="2";"value"="ячсячсяаачы"};
+{"key"="3";"value"="аавыаываыва"};
+{"key"="4";"value"="gd2цй3ываафы"};
+{"key"="5";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in.attr
new file mode 100644
index 0000000000..d5e5b2ca48
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql
index ee96037f79..eae1d678a3 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql
@@ -1,9 +1,9 @@
/* syntax version 1 */
SELECT
- CAST(value AS Utf8),
- Unicode::RemoveAll(CAST(value AS Utf8), Utf8("фа")) AS all,
- Unicode::RemoveFirst(CAST(value AS Utf8), Utf8("а")) AS first,
- Unicode::RemoveLast(CAST(value AS Utf8), Utf8("а")) AS last,
- Unicode::RemoveFirst(CAST(value AS Utf8), Utf8("фа")) AS first2,
- Unicode::RemoveLast(CAST(value AS Utf8), Utf8("фа")) AS last2
+ value as value,
+ Unicode::RemoveAll(value, "фа"u) AS all,
+ Unicode::RemoveFirst(value, "а"u) AS first,
+ Unicode::RemoveLast(value, "а"u) AS last,
+ Unicode::RemoveFirst(value, "фа"u) AS first2,
+ Unicode::RemoveLast(value, "фа"u) AS last2
FROM Input;
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Replace.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Replace.in.attr
new file mode 100644
index 0000000000..d5e5b2ca48
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Replace.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql
index d623984413..9f875627ed 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql
@@ -1,11 +1,11 @@
/* syntax version 1 */
SELECT
- CAST(value AS Utf8),
- Unicode::ReplaceAll(CAST(value AS Utf8), Utf8("аф"), Utf8("zzz")) AS all,
- Unicode::ReplaceFirst(CAST(value AS Utf8), Utf8("а"), Utf8("z")) AS first,
- Unicode::ReplaceLast(CAST(value AS Utf8), Utf8("а"), Utf8("z")) AS last,
- Unicode::ReplaceFirst(CAST(value AS Utf8), Utf8("а"), Utf8("")) AS first2,
- Unicode::ReplaceLast(CAST(value AS Utf8), Utf8("а"), Utf8("")) AS last2,
- Unicode::ReplaceFirst(CAST(value AS Utf8), Utf8("а"), Utf8("zzz")) AS first3,
- Unicode::ReplaceLast(CAST(value AS Utf8), Utf8("а"), Utf8("zzz")) AS last3
-FROM Input;
+ value,
+ Unicode::ReplaceAll(value, Utf8("аф"), Utf8("zzz")) AS all,
+ Unicode::ReplaceFirst(value, Utf8("а"), Utf8("z")) AS first,
+ Unicode::ReplaceLast(value, Utf8("а"), Utf8("z")) AS last,
+ Unicode::ReplaceFirst(value, Utf8("а"), Utf8("")) AS first2,
+ Unicode::ReplaceLast(value, Utf8("а"), Utf8("")) AS last2,
+ Unicode::ReplaceFirst(value, Utf8("а"), Utf8("zzz")) AS first3,
+ Unicode::ReplaceLast(value, Utf8("а"), Utf8("zzz")) AS last3
+FROM Input \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in
new file mode 100644
index 0000000000..d8e23353ed
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in
@@ -0,0 +1,6 @@
+{"key"="1";"value"="строка без внешних пробелов"};
+{"key"="2";"value"=" только левый пробел"};
+{"key"="3";"value"="только правый пробел "};
+{"key"="4";"value"="строка_совсем_без_пробелов"};
+{"key"="5";"value"="\u2009юникод+перевод строки\n"};
+{"key"="6";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in.attr
new file mode 100644
index 0000000000..d5e5b2ca48
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql
index 45bde163e0..48f9498b8e 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql
@@ -1,9 +1,5 @@
/* syntax version 1 */
SELECT
- Unicode::Strip("ываыва"u),
- Unicode::Strip(" ячсячсяаачы"u),
- Unicode::Strip("аавыаываыва "u),
- Unicode::Strip("аав ыа ыва ыва "u),
- Unicode::Strip("\u2009ыва\n"u),
- Unicode::Strip("\u200aваоао\u2002"u),
- Unicode::Strip(""u)
+ value as value,
+ Unicode::Strip(value)
+From Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in
index 55f0307e35..d9c36c855a 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in
@@ -1,7 +1,7 @@
-{"key"="";"subkey"="";"value"="Eyl\xC3\xBCl"};
-{"key"="";"subkey"="";"value"="\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"};
-{"key"="";"subkey"="";"value"="\xC3\xBAnora"};
-{"key"="";"subkey"="";"value"="Ci\xD1\x87 Ci\xD1\x87"};
-{"key"="";"subkey"="";"value"="\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"};
-{"key"="";"subkey"="";"value"="6"};
-{"key"="";"subkey"="";"value"=""};
+{"key"="";"value"="Eyl\xC3\xBCl"};
+{"key"="";"value"="\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"};
+{"key"="";"value"="\xC3\xBAnora"};
+{"key"="";"value"="Ci\xD1\x87 Ci\xD1\x87"};
+{"key"="";"value"="\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"};
+{"key"="";"value"="6"};
+{"key"="";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in.attr
new file mode 100644
index 0000000000..5f1b009fbf
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+} \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql
index cdff12f352..6cbaededb6 100644
--- a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql
@@ -7,12 +7,9 @@ SELECT
Unicode::Substring(value, 0, 2) AS two_end_substring,
Unicode::RemoveAll(value, "\xD1\x87пr") AS remove_all,
Unicode::LevensteinDistance(value, value || Unicode::Substring(value, 0, 5)) AS levenstein,
- Unicode::ToCodePointList(value) AS code_point_list,
- Unicode::FromCodePointList(Unicode::ToCodePointList(value)) AS from_code_point_list,
- Unicode::FromCodePointList(YQL::LazyList(Unicode::ToCodePointList(value))) AS from_lazy_code_point_list,
Unicode::Reverse(value) AS reverse,
Unicode::Find(value, "ет"u) AS find,
Unicode::RFind(value, "ет"u) AS rfind,
Unicode::Find(value, "ет"u, 7ul) AS find_from,
Unicode::RFind(value, "ет"u, 7ul) AS rfind_from
-FROM (SELECT CAST(value AS Utf8) AS value FROM Input);
+FROM Input
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in
new file mode 100644
index 0000000000..d9c36c855a
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in
@@ -0,0 +1,7 @@
+{"key"="";"value"="Eyl\xC3\xBCl"};
+{"key"="";"value"="\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"};
+{"key"="";"value"="\xC3\xBAnora"};
+{"key"="";"value"="Ci\xD1\x87 Ci\xD1\x87"};
+{"key"="";"value"="\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"};
+{"key"="";"value"="6"};
+{"key"="";"value"=""};
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in.attr
new file mode 100644
index 0000000000..d5e5b2ca48
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in.attr
@@ -0,0 +1,8 @@
+{
+ "_yql_row_spec"={
+ "Type"=["StructType";[
+ ["key";["DataType";"Utf8"]];
+ ["value";["DataType";"Utf8"]]
+ ]];
+ }
+}
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.sql b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.sql
new file mode 100644
index 0000000000..cc26378317
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.sql
@@ -0,0 +1,6 @@
+/* syntax version 1 */
+SELECT
+ Unicode::ToCodePointList(value) AS code_point_list,
+ Unicode::FromCodePointList(Unicode::ToCodePointList(value)) AS from_code_point_list,
+ Unicode::FromCodePointList(YQL::LazyList(Unicode::ToCodePointList(value))) AS from_lazy_code_point_list,
+FROM Input