diff options
author | atarasov5 <atarasov5@yandex-team.com> | 2025-02-12 17:13:07 +0300 |
---|---|---|
committer | atarasov5 <atarasov5@yandex-team.com> | 2025-02-12 17:28:04 +0300 |
commit | d9d2e3122cc237885d24563de584dce2504d0385 (patch) | |
tree | 5410acbdefb823bf23b27bbe732a06ff6afba003 /yql | |
parent | c0fd4debea162d972c2a8dba88269faab10bc3fb (diff) | |
download | ydb-d9d2e3122cc237885d24563de584dce2504d0385.tar.gz |
YQL-19535: Add utf8 udf block implementations
commit_hash:5eac5390db34d1ca89f96441c1cfcff9c5853587
Diffstat (limited to 'yql')
45 files changed, 1663 insertions, 941 deletions
diff --git a/yql/essentials/tests/sql/minirun/part1/canondata/result.json b/yql/essentials/tests/sql/minirun/part1/canondata/result.json index bd0f9512c7..cb5503d6f3 100644 --- a/yql/essentials/tests/sql/minirun/part1/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part1/canondata/result.json @@ -837,9 +837,9 @@ ], "test.test[params-primitives--Debug]": [ { - "checksum": "e232122561df92b9658b6e0e81770672", - "size": 4184, - "uri": "https://{canondata_backend}/1847551/378b3bd63cdb48c05228707db1486e34df62729c/resource.tar.gz#test.test_params-primitives--Debug_/opt.yql" + "checksum": "fc3e867d5a88ffa4e9b513f55347f2e4", + "size": 4214, + "uri": "https://{canondata_backend}/1936997/7e6197348eba6c6070acf6ee875cc5f6dc62f616/resource.tar.gz#test.test_params-primitives--Debug_/opt.yql" } ], "test.test[params-primitives--Results]": [ diff --git a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h index a16582fb4e..6982dbe162 100644 --- a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h +++ b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h @@ -45,19 +45,42 @@ namespace { template <typename TDerived> struct TScalarOperationMixin { - static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) { + static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) + requires requires { TDerived::Execute(TStringRef()); } + { Y_DEBUG_ABORT_UNLESS(IsUtf8(args[0].AsStringRef())); - auto&& executeResult = TDerived::Execute(args[0].AsStringRef()); + auto executeResult = TDerived::Execute(args[0].AsStringRef()); + return ProcessResult(builder, std::move(executeResult), args); + } + + static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) + requires requires { TDerived::Execute(TStringRef(), TStringRef()); } + { + auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef()); + return ProcessResult(builder, std::move(executeResult), args); + } + + static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) + requires requires { TDerived::Execute(TStringRef(), TStringRef(), TStringRef()); } + { + auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef(), args[2].AsStringRef()); + return ProcessResult(builder, std::move(executeResult), args); + } + + static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) + requires requires { TDerived::Execute(TStringRef(), TStringRef(), TMaybe<ui64>()); } + { + auto executeResult = TDerived::Execute(args[0].AsStringRef(), args[1].AsStringRef(), args[2] ? TMaybe<ui64>(args[2].Get<ui64>()) : Nothing()); return ProcessResult(builder, std::move(executeResult), args); } private: - static TUnboxedValue ProcessResult(const IValueBuilder* builder, TString&& newString, const TUnboxedValuePod*) { - return builder->NewString(std::move(newString)); + static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TString& newString, const TUnboxedValuePod*) { + return builder->NewString(newString); } template <typename T> - static TUnboxedValue ProcessResult(const IValueBuilder* builder, std::variant<TNoChangesTag, T> newValue, const TUnboxedValuePod* initialArg) { + static TUnboxedValue ProcessResult(const IValueBuilder* builder, const std::variant<TNoChangesTag, T>& newValue, const TUnboxedValuePod* initialArg) { if (std::holds_alternative<T>(newValue)) { return ProcessResult(builder, std::move(std::get<T>(newValue)), initialArg); } else { @@ -65,7 +88,17 @@ namespace { } } - static TUnboxedValue ProcessResult(const IValueBuilder* builder, bool result, const TUnboxedValuePod*) { + template <typename T> + static TUnboxedValue ProcessResult(const IValueBuilder* builder, const TMaybe<T>& newValue, const TUnboxedValuePod* initialArg) { + if (newValue.Defined()) { + return ProcessResult(builder, *newValue, initialArg); + } else { + return TUnboxedValuePod(); + } + } + + template <typename T, typename = std::enable_if_t<TPrimitiveDataType<T>::Result>> + static TUnboxedValue ProcessResult(const IValueBuilder* builder, T result, const TUnboxedValuePod*) { Y_UNUSED(builder); return TUnboxedValuePod(result); } @@ -73,18 +106,61 @@ namespace { template <typename TDerived> struct TBlockOperationMixin { - template <typename Sync> - static void DoExecute(const TBlockItem arg, const Sync& sync) { + template <typename TSink> + static void BlockDoExecute(const TBlockItem arg, const TSink& sink) + requires requires { TDerived::Execute(TStringRef()); } + { Y_DEBUG_ABORT_UNLESS(IsUtf8(arg.AsStringRef())); - auto&& executeResult = TDerived::Execute(arg.AsStringRef()); - TBlockItem boxedValue = ProcessResult(std::move(executeResult), arg); - sync(boxedValue); + auto executeResult = TDerived::Execute(arg.AsStringRef()); + TBlockItem boxedValue = ProcessResult(executeResult, arg); + sink(boxedValue); + } + + template <typename TSink> + static void BlockDoExecute(const TBlockItem arg1, const TBlockItem arg2, const TSink& sink) + requires requires { TDerived::Execute(TStringRef(), TStringRef()); } + { + auto executeResult = TDerived::Execute(arg1.AsStringRef(), + arg2.AsStringRef()); + TBlockItem boxedValue = ProcessResult(executeResult, arg1); + sink(boxedValue); + } + + template <typename TSink> + static void BlockDoExecute(const TBlockItem args, const TSink& sink) + requires(requires { TDerived::Execute(TStringRef(), TStringRef(), TStringRef()); }) + { + auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(), + args.GetElement(1).AsStringRef(), + args.GetElement(2).AsStringRef()); + TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0)); + sink(boxedValue); + } + + template <typename TSink> + static void BlockDoExecute(const TBlockItem args, const TSink& sink) + requires(requires { TDerived::Execute(TStringRef(), TStringRef(), TMaybe<ui64>(0ULL)); }) + { + auto executeResult = TDerived::Execute(args.GetElement(0).AsStringRef(), + args.GetElement(1).AsStringRef(), + (args.GetElement(2) ? TMaybe<ui64>(args.GetElement(2).Get<ui64>()) : Nothing())); + TBlockItem boxedValue = ProcessResult(executeResult, args.GetElement(0)); + sink(boxedValue); } private: static TBlockItem ProcessResult(const TString& newString, const TBlockItem arg) { Y_UNUSED(arg); - return TBlockItem(std::move(newString)); + return TBlockItem(newString); + } + + template <typename T> + static TBlockItem ProcessResult(const TMaybe<T>& newValue, const TBlockItem arg) { + if (newValue.Defined()) { + return ProcessResult(*newValue, arg); + } else { + return TBlockItem(); + } } template <typename T> @@ -96,17 +172,15 @@ namespace { } } - static TBlockItem ProcessResult(bool result, const TBlockItem arg) { + template <typename T, typename = std::enable_if_t<TPrimitiveDataType<T>::Result>> + static TBlockItem ProcessResult(T result, const TBlockItem arg) { Y_UNUSED(arg); return TBlockItem(result); } }; template <typename TDerived> - struct TOperationMixin: public TBlockOperationMixin<TDerived>, public TScalarOperationMixin<TDerived> { - using TBlockOperationMixin<TDerived>::DoExecute; - using TScalarOperationMixin<TDerived>::DoExecute; - }; + struct TOperationMixin: public TBlockOperationMixin<TDerived>, public TScalarOperationMixin<TDerived> {}; template <auto mode> struct TNormalizeUTF8: public TOperationMixin<TNormalizeUTF8<mode>> { @@ -145,8 +219,210 @@ namespace { } }; -#define DEFINE_UTF8_OPERATION(udfName, Executor, signature) \ - BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, signature) { \ + struct TLengthGetter: public TOperationMixin<TLengthGetter> { + static ui64 Execute(TStringRef inputRef) { + size_t result; + GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), result); + return static_cast<ui64>(result); + } + }; + + struct TReverser: public TOperationMixin<TReverser> { + static TString Execute(TStringRef inputRef) { + auto wide = UTF8ToWide(inputRef); + ReverseInPlace(wide); + return WideToUTF8(wide); + } + }; + + struct TStripper: public TOperationMixin<TStripper> { + static TString Execute(TStringRef inputRef) { + const TUtf32String input = UTF8ToUTF32<true>(inputRef); + const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin())); + return WideToUTF8(result); + } + }; + + struct TAllRemover: public TOperationMixin<TAllRemover> { + static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) { + TUtf32String input = UTF8ToUTF32<true>(inputRef); + const TUtf32String remove = UTF8ToUTF32<true>(removeRef); + const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend()); + size_t tpos = 0; + for (const wchar32 c : input) { + if (!chars.contains(c)) { + input[tpos++] = c; + } + } + if (tpos != input.size()) { + input.resize(tpos); + return WideToUTF8(input); + } + return TNoChangesTag{}; + } + }; + + struct TFirstRemover: public TOperationMixin<TFirstRemover> { + static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) { + TUtf32String input = UTF8ToUTF32<true>(inputRef); + const auto remove = UTF8ToUTF32<true>(removeRef); + const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend()); + for (auto it = input.cbegin(); it != input.cend(); ++it) { + if (chars.contains(*it)) { + input.erase(it); + return WideToUTF8(input); + } + } + return TNoChangesTag{}; + } + }; + + struct TUnicodeSetMatcher: public TOperationMixin<TUnicodeSetMatcher> { + static bool Execute(TStringRef inputRef, TStringRef customCategoryRef) { + const TStringBuf input(inputRef); + const TUtf16String& customCategory = UTF8ToWide(customCategoryRef); + TUnicodeSet unicodeSet; + try { + unicodeSet.Parse(customCategory); + } catch (...) { + UdfTerminate((TStringBuilder() << "Failed to parse unicode set: " << CurrentExceptionMessage()).c_str()); + } + wchar32 rune; + const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin()); + const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end()); + while (cur != last) { + ReadUTF8CharAndAdvance(rune, cur, last); + if (!unicodeSet.Has(rune)) { + return false; + } + } + return true; + } + }; + + struct TLevensteinDistanceFinder: public TOperationMixin<TLevensteinDistanceFinder> { + static ui64 Execute(TStringRef leftRef, TStringRef rightRef) { + const TStringBuf left(leftRef); + const TStringBuf right(rightRef); + const auto& leftUtf32 = UTF8ToUTF32<true>(left); + const auto& rightUtf32 = UTF8ToUTF32<true>(right); + return NLevenshtein::Distance(leftUtf32, rightUtf32); + } + }; + + struct TLastRemoval: public TOperationMixin<TLastRemoval> { + static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef removeRef) { + TUtf32String input = UTF8ToUTF32<true>(inputRef); + const TUtf32String remove = UTF8ToUTF32<true>(removeRef); + const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend()); + for (auto it = input.crbegin(); it != input.crend(); ++it) { + if (chars.contains(*it)) { + input.erase(input.crend() - it - 1, 1); + return WideToUTF8(input); + } + } + return TNoChangesTag{}; + } + }; + + struct TAllReplacer: public TOperationMixin<TAllReplacer> { + static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) { + if (TString result(inputRef); SubstGlobal(result, whatReplace, toReplace)) { + return result; + } else { + return TNoChangesTag{}; + } + } + // Disable implict casts for arguments. + template <typename... Args> + static auto Execute(Args&&... args) = delete; + }; + + struct TFirstReplacer: public TOperationMixin<TFirstReplacer> { + static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) { + std::string result(inputRef); + const std::string_view what(whatReplace); + if (const auto index = result.find(what); index != std::string::npos) { + result.replace(index, what.size(), std::string_view(toReplace)); + return result; + } + return TNoChangesTag{}; + } + // Disable implict casts for arguments. + template <typename... Args> + static auto Execute(Args&&... args) = delete; + }; + + struct TLastReplacer: public TOperationMixin<TLastReplacer> { + static std::variant<TNoChangesTag, TString> Execute(TStringRef inputRef, TStringRef whatReplace, TStringRef toReplace) { + std::string result(inputRef); + const std::string_view what(whatReplace); + if (const auto index = result.rfind(what); index != std::string::npos) { + result.replace(index, what.size(), std::string_view(toReplace)); + return result; + } + return TNoChangesTag{}; + } + // Disable implict casts for arguments. + template <typename... Args> + static auto Execute(Args&&... args) = delete; + }; + + struct TFinder: public TOperationMixin<TFinder> { + static TMaybe<ui64> Execute(TStringRef inputRef, TStringRef whatFind, TMaybe<ui64> whereFind) { + const std::string_view string(inputRef); + const std::string_view needle(whatFind); + std::string_view::size_type pos = 0U; + + if (auto p = whereFind.GetOrElse(0ULL)) { + for (auto ptr = string.data(); p && pos < string.size(); --p) { + const auto width = WideCharSize(*ptr); + pos += width; + ptr += width; + } + } + + if (const auto find = string.find(needle, pos); std::string_view::npos != find) { + size_t result; + GetNumberOfUTF8Chars(string.data(), find, result); + return static_cast<ui64>(result); + } + return Nothing(); + } + // Disable implict casts for arguments. + template <typename... Args> + static auto Execute(Args&&... args) = delete; + }; + + struct TRFinder: public TOperationMixin<TRFinder> { + static TMaybe<ui64> Execute(TStringRef inputRef, TStringRef whatFind, TMaybe<ui64> whereFind) { + const std::string_view string(inputRef); + const std::string_view needle(whatFind); + std::string_view::size_type pos = std::string_view::npos; + + if (auto p = whereFind.GetOrElse(std::string_view::npos); std::string_view::npos != p) { + pos = 0ULL; + for (auto ptr = string.data(); p && pos < string.size(); --p) { + const auto width = WideCharSize(*ptr); + pos += width; + ptr += width; + } + } + + if (const auto find = string.rfind(needle, pos); std::string_view::npos != find) { + size_t result; + GetNumberOfUTF8Chars(string.data(), find, result); + return static_cast<ui64>(result); + } + return Nothing(); + } + // Disable implict casts for arguments. + template <typename... Args> + static auto Execute(Args&&... args) = delete; + }; + +#define DEFINE_UTF8_OPERATION_STRICT(udfName, Executor, signature, optArgs) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS(T##udfName, signature, optArgs) { \ return Executor::DoExecute(valueBuilder, args); \ } \ \ @@ -155,30 +431,86 @@ namespace { template <typename TSink> \ static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, const TSink& sink) { \ Y_UNUSED(valueBuilder); \ - Executor::DoExecute(arg1, sink); \ + Executor::BlockDoExecute(arg1, sink); \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) + +#define DEFINE_UTF8_OPERATION_BIN_BASE(macro, udfName, Executor, signature, optArgs) \ + macro(T##udfName, signature, optArgs) { \ + return Executor::DoExecute(valueBuilder, args); \ + } \ + \ + struct T##udfName##KernelExec \ + : public TBinaryKernelExec<T##udfName##KernelExec> { \ + template <typename TSink> \ + static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, TBlockItem arg2, const TSink& sink) { \ + Y_UNUSED(valueBuilder); \ + Executor::BlockDoExecute(arg1, arg2, sink); \ + } \ + }; \ + \ + END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) + +#define DEFINE_UTF8_OPERATION_BIN_STRICT(udfName, Executor, signature, optArgs) \ + DEFINE_UTF8_OPERATION_BIN_BASE(BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS, udfName, Executor, signature, optArgs) + +#define DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(udfName, Executor, signature, optArgs) \ + DEFINE_UTF8_OPERATION_BIN_BASE(BEGIN_SIMPLE_ARROW_UDF_WITH_OPTIONAL_ARGS, udfName, Executor, signature, optArgs) + +#define DEFINE_UTF8_OPERATION_MANY_STRICT(udfName, Executor, signature, argsCount, optArgsCount) \ + BEGIN_SIMPLE_STRICT_ARROW_UDF_WITH_OPTIONAL_ARGS(T##udfName, signature, optArgsCount) { \ + return Executor::DoExecute(valueBuilder, args); \ + } \ + \ + struct T##udfName##KernelExec \ + : public TGenericKernelExec<T##udfName##KernelExec, argsCount> { \ + template <typename TSink> \ + static void Process(const IValueBuilder* valueBuilder, TBlockItem args, const TSink& sink) { \ + Y_UNUSED(valueBuilder); \ + Executor::BlockDoExecute(args, sink); \ } \ }; \ \ END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) - DEFINE_UTF8_OPERATION(Normalize, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(NormalizeNFD, TNormalizeUTF8<NFD>, TUtf8(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(NormalizeNFC, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(NormalizeNFKD, TNormalizeUTF8<NFKD>, TUtf8(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(NormalizeNFKC, TNormalizeUTF8<NFKC>, TUtf8(TAutoMap<TUtf8>)); - - DEFINE_UTF8_OPERATION(IsAscii, TCheckAllChars<IsAscii>, bool(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(IsSpace, TCheckAllChars<IsSpace>, bool(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(IsUpper, TCheckAllChars<IsUpper>, bool(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(IsLower, TCheckAllChars<IsLower>, bool(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(IsDigit, TCheckAllChars<IsDigit>, bool(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(IsAlpha, TCheckAllChars<IsAlpha>, bool(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(IsAlnum, TCheckAllChars<IsAlnum>, bool(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(IsHex, TCheckAllChars<IsHexdigit>, bool(TAutoMap<TUtf8>)); - - DEFINE_UTF8_OPERATION(ToTitle, TStringToStringMapper<ToTitle>, TUtf8(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(ToUpper, TStringToStringMapper<ToUpper>, TUtf8(TAutoMap<TUtf8>)); - DEFINE_UTF8_OPERATION(ToLower, TStringToStringMapper<ToLower>, TUtf8(TAutoMap<TUtf8>)); + DEFINE_UTF8_OPERATION_STRICT(Normalize, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(NormalizeNFD, TNormalizeUTF8<NFD>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(NormalizeNFC, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(NormalizeNFKD, TNormalizeUTF8<NFKD>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(NormalizeNFKC, TNormalizeUTF8<NFKC>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + + DEFINE_UTF8_OPERATION_STRICT(IsAscii, TCheckAllChars<IsAscii>, bool(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(IsSpace, TCheckAllChars<IsSpace>, bool(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(IsUpper, TCheckAllChars<IsUpper>, bool(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(IsLower, TCheckAllChars<IsLower>, bool(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(IsDigit, TCheckAllChars<IsDigit>, bool(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(IsAlpha, TCheckAllChars<IsAlpha>, bool(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(IsAlnum, TCheckAllChars<IsAlnum>, bool(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(IsHex, TCheckAllChars<IsHexdigit>, bool(TAutoMap<TUtf8>), /*optArgs=*/0); + + DEFINE_UTF8_OPERATION_STRICT(ToTitle, TStringToStringMapper<ToTitle>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(ToUpper, TStringToStringMapper<ToUpper>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(ToLower, TStringToStringMapper<ToLower>, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + + DEFINE_UTF8_OPERATION_STRICT(GetLength, TLengthGetter, ui64(TAutoMap<TUtf8>), /*optArgs=*/0); + + DEFINE_UTF8_OPERATION_STRICT(Reverse, TReverser, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_STRICT(Strip, TStripper, TUtf8(TAutoMap<TUtf8>), /*optArgs=*/0); + + DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveAll, TAllRemover, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveFirst, TFirstRemover, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_BIN_NOT_STRICT(IsUnicodeSet, TUnicodeSetMatcher, bool(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_BIN_STRICT(LevensteinDistance, TLevensteinDistanceFinder, ui64(TAutoMap<TUtf8>, TAutoMap<TUtf8>), /*optArgs=*/0); + DEFINE_UTF8_OPERATION_BIN_STRICT(RemoveLast, TLastRemoval, TUtf8(TAutoMap<TUtf8>, TUtf8), /*optArgs=*/0); + + DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceAll, TAllReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0); + DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceFirst, TFirstReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0); + DEFINE_UTF8_OPERATION_MANY_STRICT(ReplaceLast, TLastReplacer, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8), /*argsCount=*/3, /*optionalArgs=*/0); + + DEFINE_UTF8_OPERATION_MANY_STRICT(Find, TFinder, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), /*argsCount=*/3, /*optionalArgs=*/1); + DEFINE_UTF8_OPERATION_MANY_STRICT(RFind, TRFinder, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), /*argsCount=*/3, /*optionalArgs=*/1); SIMPLE_UDF(TIsUtf, bool(TOptional<char*>)) { Y_UNUSED(valueBuilder); @@ -189,14 +521,6 @@ namespace { } } - SIMPLE_UDF(TGetLength, ui64(TAutoMap<TUtf8>)) { - Y_UNUSED(valueBuilder); - const auto& inputRef = args[0].AsStringRef(); - size_t result; - GetNumberOfUTF8Chars(inputRef.Data(), inputRef.Size(), result); - return TUnboxedValuePod(static_cast<ui64>(result)); - } - SIMPLE_UDF_WITH_OPTIONAL_ARGS(TToUint64, ui64(TAutoMap<TUtf8>, TOptional<ui16>), 1) { Y_UNUSED(valueBuilder); const TString inputStr(args[0].AsStringRef()); @@ -252,51 +576,6 @@ namespace { return valueBuilder->NewString(SubstrUTF8(input, from, len)); } - SIMPLE_UDF_WITH_OPTIONAL_ARGS(TFind, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), 1) { - Y_UNUSED(valueBuilder); - const std::string_view string(args[0].AsStringRef()); - const std::string_view needle(args[1].AsStringRef()); - std::string_view::size_type pos = 0U; - - if (auto p = args[2].GetOrDefault<ui64>(0ULL)) { - for (auto ptr = string.data(); p && pos < string.size(); --p) { - const auto width = WideCharSize(*ptr); - pos += width; - ptr += width; - } - } - - if (const auto find = string.find(needle, pos); std::string_view::npos != find) { - size_t result; - GetNumberOfUTF8Chars(string.data(), find, result); - return TUnboxedValuePod(static_cast<ui64>(result)); - } - return TUnboxedValuePod(); - } - - SIMPLE_UDF_WITH_OPTIONAL_ARGS(TRFind, TOptional<ui64>(TAutoMap<TUtf8>, TUtf8, TOptional<ui64>), 1) { - Y_UNUSED(valueBuilder); - const std::string_view string(args[0].AsStringRef()); - const std::string_view needle(args[1].AsStringRef()); - std::string_view::size_type pos = std::string_view::npos; - - if (auto p = args[2].GetOrDefault<ui64>(std::string_view::npos); std::string_view::npos != p) { - pos = 0ULL; - for (auto ptr = string.data(); p && pos < string.size(); --p) { - const auto width = WideCharSize(*ptr); - pos += width; - ptr += width; - } - } - - if (const auto find = string.rfind(needle, pos); std::string_view::npos != find) { - size_t result; - GetNumberOfUTF8Chars(string.data(), find, result); - return TUnboxedValuePod(static_cast<ui64>(result)); - } - return TUnboxedValuePod(); - } - using TTmpVector = TSmallVec<TUnboxedValue, TUnboxedValue::TAllocator>; template <typename TIt> @@ -406,86 +685,6 @@ namespace { return valueBuilder->NewString(JoinSeq(delimeter, items)); } - SIMPLE_UDF(TLevensteinDistance, ui64(TAutoMap<TUtf8>, TAutoMap<TUtf8>)) { - Y_UNUSED(valueBuilder); - const TStringBuf left(args[0].AsStringRef()); - const TStringBuf right(args[1].AsStringRef()); - const auto& leftUtf32 = UTF8ToUTF32<true>(left); - const auto& rightUtf32 = UTF8ToUTF32<true>(right); - const ui64 result = NLevenshtein::Distance(leftUtf32, rightUtf32); - return TUnboxedValuePod(result); - } - - SIMPLE_UDF(TReplaceAll, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) { - if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef())) - return valueBuilder->NewString(result); - else - return args[0]; - } - - SIMPLE_UDF(TReplaceFirst, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) { - std::string result(args[0].AsStringRef()); - const std::string_view what(args[1].AsStringRef()); - if (const auto index = result.find(what); index != std::string::npos) { - result.replace(index, what.size(), std::string_view(args[2].AsStringRef())); - return valueBuilder->NewString(result); - } - return args[0]; - } - - SIMPLE_UDF(TReplaceLast, TUtf8(TAutoMap<TUtf8>, TUtf8, TUtf8)) { - std::string result(args[0].AsStringRef()); - const std::string_view what(args[1].AsStringRef()); - if (const auto index = result.rfind(what); index != std::string::npos) { - result.replace(index, what.size(), std::string_view(args[2].AsStringRef())); - return valueBuilder->NewString(result); - } - return args[0]; - } - - SIMPLE_UDF(TRemoveAll, TUtf8(TAutoMap<TUtf8>, TUtf8)) { - TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef()); - const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef()); - const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend()); - size_t tpos = 0; - for (const wchar32 c : input) { - if (!chars.contains(c)) { - input[tpos++] = c; - } - } - if (tpos != input.size()) { - input.resize(tpos); - return valueBuilder->NewString(WideToUTF8(input)); - } - return args[0]; - } - - SIMPLE_UDF(TRemoveFirst, TUtf8(TAutoMap<TUtf8>, TUtf8)) { - TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef()); - const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef()); - const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend()); - for (auto it = input.cbegin(); it != input.cend(); ++it) { - if (chars.contains(*it)) { - input.erase(it); - return valueBuilder->NewString(WideToUTF8(input)); - } - } - return args[0]; - } - - SIMPLE_UDF(TRemoveLast, TUtf8(TAutoMap<TUtf8>, TUtf8)) { - TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef()); - const TUtf32String remove = UTF8ToUTF32<true>(args[1].AsStringRef()); - const std::unordered_set<wchar32> chars(remove.cbegin(), remove.cend()); - for (auto it = input.crbegin(); it != input.crend(); ++it) { - if (chars.contains(*it)) { - input.erase(input.crend() - it - 1, 1); - return valueBuilder->NewString(WideToUTF8(input)); - } - } - return args[0]; - } - SIMPLE_UDF(TToCodePointList, TListType<ui32>(TAutoMap<TUtf8>)) { size_t codePointCount = 0; const auto& inputRef = args[0].AsStringRef(); @@ -552,42 +751,6 @@ namespace { return valueBuilder->NewString(TStringRef(buffer.data(), buffer.size())); } - SIMPLE_UDF(TReverse, TUtf8(TAutoMap<TUtf8>)) { - auto wide = UTF8ToWide(args[0].AsStringRef()); - ReverseInPlace(wide); - return valueBuilder->NewString(WideToUTF8(wide)); - } - - SIMPLE_UDF(TStrip, TUtf8(TAutoMap<TUtf8>)) { - const TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef()); - const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin())); - return valueBuilder->NewString(WideToUTF8(result)); - } - - SIMPLE_UDF(TIsUnicodeSet, bool(TAutoMap<TUtf8>, TUtf8)) { - Y_UNUSED(valueBuilder); - const TStringBuf input(args[0].AsStringRef()); - const TUtf16String& customCategory = UTF8ToWide(args[1].AsStringRef()); - TUnicodeSet unicodeSet; - try { - unicodeSet.Parse(customCategory); - } catch (...) { - UdfTerminate((TStringBuilder() << "Failed to parse unicode set: " << CurrentExceptionMessage()).c_str()); - } - bool result = true; - wchar32 rune; - const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin()); - const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end()); - while (cur != last) { - ReadUTF8CharAndAdvance(rune, cur, last); - if (!unicodeSet.Has(rune)) { - result = false; - break; - } - } - return TUnboxedValuePod(result); - } - #define EXPORTED_UNICODE_BASE_UDF \ TIsUtf, \ TGetLength, \ diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json index 8189dd16e0..bac6e1ebc4 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json @@ -1,4 +1,9 @@ { + "test.test[BlockFind]": [ + { + "uri": "file://test.test_BlockFind_/results.txt" + } + ], "test.test[BlockIsCategory]": [ { "uri": "file://test.test_BlockIsCategory_/results.txt" @@ -9,11 +14,31 @@ "uri": "file://test.test_BlockNormalize_/results.txt" } ], + "test.test[BlockRemove]": [ + { + "uri": "file://test.test_BlockRemove_/results.txt" + } + ], + "test.test[BlockReplace]": [ + { + "uri": "file://test.test_BlockReplace_/results.txt" + } + ], + "test.test[BlockStrip]": [ + { + "uri": "file://test.test_BlockStrip_/results.txt" + } + ], "test.test[BlockTo]": [ { "uri": "file://test.test_BlockTo_/results.txt" } ], + "test.test[BlockUnicode]": [ + { + "uri": "file://test.test_BlockUnicode_/results.txt" + } + ], "test.test[Find]": [ { "uri": "file://test.test_Find_/results.txt" @@ -94,6 +119,11 @@ "uri": "file://test.test_TryToUint64_/results.txt" } ], + "test.test[UnicodeCodePoint]": [ + { + "uri": "file://test.test_UnicodeCodePoint_/results.txt" + } + ], "test.test[Unicode]": [ { "uri": "file://test.test_Unicode_/results.txt" diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockFind_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockFind_/results.txt new file mode 100644 index 0000000000..4ee0b05ad2 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockFind_/results.txt @@ -0,0 +1,134 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "OptionalType"; + [ + "DataType"; + "Utf8" + ] + ] + ]; + [ + "column1"; + [ + "OptionalType"; + [ + "DataType"; + "Utf8" + ] + ] + ]; + [ + "column2"; + [ + "OptionalType"; + [ + "DataType"; + "Utf8" + ] + ] + ]; + [ + "column3"; + [ + "OptionalType"; + [ + "DataType"; + "Utf8" + ] + ] + ]; + [ + "column4"; + [ + "OptionalType"; + [ + "DataType"; + "Utf8" + ] + ] + ]; + [ + "column5"; + [ + "OptionalType"; + [ + "DataType"; + "Utf8" + ] + ] + ]; + [ + "column6"; + [ + "OptionalType"; + [ + "DataType"; + "Utf8" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + "l\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm\xC3\xA4\xC3\x9Fig" + ]; + [ + "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm" + ]; + [ + "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m" + ]; + [ + "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, " + ]; + [ + "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k" + ]; + [ + "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m" + ]; + [ + "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r" + ] + ]; + [ + [ + "l\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm\xC3\xA4\xC3\x9Fig" + ]; + [ + "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm" + ]; + [ + "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m" + ]; + [ + "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, " + ]; + [ + "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k" + ]; + [ + "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m" + ]; + [ + "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r" + ] + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockRemove_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockRemove_/results.txt new file mode 100644 index 0000000000..4004c75199 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockRemove_/results.txt @@ -0,0 +1,100 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "all"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "first"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "last"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "first2"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "last2"; + [ + "DataType"; + "Utf8" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2"; + "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"; + "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" + ]; + [ + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B" + ]; + [ + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB2\xD1\x8B\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2"; + "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"; + "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" + ]; + [ + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x8B" + ]; + [ + ""; + ""; + ""; + ""; + ""; + "" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockReplace_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockReplace_/results.txt new file mode 100644 index 0000000000..bdb61e7f5c --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockReplace_/results.txt @@ -0,0 +1,124 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "all"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "first"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "last"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "first2"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "last2"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "first3"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "last3"; + [ + "DataType"; + "Utf8" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2z\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z"; + "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"; + "\xD1\x8B\xD0\xB2zzz\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz" + ]; + [ + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fz\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0z\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fzzz\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0zzz\xD1\x87\xD1\x8B" + ]; + [ + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "z\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z"; + "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"; + "zzz\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz" + ]; + [ + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2z\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0z\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2zzz\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x84\xD1\x8B" + ]; + [ + ""; + ""; + ""; + ""; + ""; + ""; + ""; + "" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockStrip_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockStrip_/results.txt new file mode 100644 index 0000000000..22df398114 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockStrip_/results.txt @@ -0,0 +1,56 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "column1"; + [ + "DataType"; + "Utf8" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD0\xB1\xD0\xB5\xD0\xB7 \xD0\xB2\xD0\xBD\xD0\xB5\xD1\x88\xD0\xBD\xD0\xB8\xD1\x85 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2"; + "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD0\xB1\xD0\xB5\xD0\xB7 \xD0\xB2\xD0\xBD\xD0\xB5\xD1\x88\xD0\xBD\xD0\xB8\xD1\x85 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2" + ]; + [ + " \xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBB\xD0\xB5\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB"; + "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBB\xD0\xB5\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB" + ]; + [ + "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBF\xD1\x80\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB "; + "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBF\xD1\x80\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB" + ]; + [ + "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0_\xD1\x81\xD0\xBE\xD0\xB2\xD1\x81\xD0\xB5\xD0\xBC_\xD0\xB1\xD0\xB5\xD0\xB7_\xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2"; + "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0_\xD1\x81\xD0\xBE\xD0\xB2\xD1\x81\xD0\xB5\xD0\xBC_\xD0\xB1\xD0\xB5\xD0\xB7_\xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2" + ]; + [ + "\xE2\x80\x89\xD1\x8E\xD0\xBD\xD0\xB8\xD0\xBA\xD0\xBE\xD0\xB4+\xD0\xBF\xD0\xB5\xD1\x80\xD0\xB5\xD0\xB2\xD0\xBE\xD0\xB4 \xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB8\n"; + "\xD1\x8E\xD0\xBD\xD0\xB8\xD0\xBA\xD0\xBE\xD0\xB4+\xD0\xBF\xD0\xB5\xD1\x80\xD0\xB5\xD0\xB2\xD0\xBE\xD0\xB4 \xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB8" + ]; + [ + ""; + "" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockUnicode_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockUnicode_/results.txt new file mode 100644 index 0000000000..76cdb42446 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockUnicode_/results.txt @@ -0,0 +1,220 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "value"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "is"; + [ + "DataType"; + "Bool" + ] + ]; + [ + "length"; + [ + "DataType"; + "Uint64" + ] + ]; + [ + "one_end_substring"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "two_end_substring"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "remove_all"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "levenstein"; + [ + "DataType"; + "Uint64" + ] + ]; + [ + "reverse"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "find"; + [ + "OptionalType"; + [ + "DataType"; + "Uint64" + ] + ] + ]; + [ + "rfind"; + [ + "OptionalType"; + [ + "DataType"; + "Uint64" + ] + ] + ]; + [ + "find_from"; + [ + "OptionalType"; + [ + "DataType"; + "Uint64" + ] + ] + ]; + [ + "rfind_from"; + [ + "OptionalType"; + [ + "DataType"; + "Uint64" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + "Eyl\xC3\xBCl"; + %true; + "5"; + "yl\xC3\xBCl"; + "Ey"; + "Eyl\xC3\xBCl"; + "5"; + "l\xC3\xBClyE"; + #; + #; + #; + # + ]; + [ + "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"; + %true; + "6"; + "\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"; + "\xD0\xB6\xD0\xBD"; + "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"; + "5"; + "\xD1\x8F\xD0\xBD\xD1\x9E\xD1\x96\xD0\xBD\xD0\xB6"; + #; + #; + #; + # + ]; + [ + "\xC3\xBAnora"; + %true; + "5"; + "nora"; + "\xC3\xBAn"; + "\xC3\xBAnoa"; + "5"; + "aron\xC3\xBA"; + #; + #; + #; + # + ]; + [ + "Ci\xD1\x87 Ci\xD1\x87"; + %true; + "7"; + "i\xD1\x87 Ci\xD1\x87"; + "Ci"; + "Ci Ci"; + "5"; + "\xD1\x87iC \xD1\x87iC"; + #; + #; + #; + # + ]; + [ + "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"; + %true; + "13"; + "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"; + "\xD0\xBF\xD1\x80"; + "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"; + "5"; + "\xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF \xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF"; + [ + "4" + ]; + [ + "11" + ]; + [ + "11" + ]; + [ + "4" + ] + ]; + [ + "6"; + %true; + "1"; + ""; + "6"; + "6"; + "1"; + "6"; + #; + #; + #; + # + ]; + [ + ""; + %true; + "0"; + ""; + ""; + ""; + "0"; + ""; + #; + #; + #; + # + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt index bcccb2b511..4ee0b05ad2 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Find_/results.txt @@ -8,66 +8,73 @@ "StructType"; [ [ - "column0"; + "value"; [ - "DataType"; - "Utf8" + "OptionalType"; + [ + "DataType"; + "Utf8" + ] ] ]; [ "column1"; [ - "DataType"; - "Utf8" + "OptionalType"; + [ + "DataType"; + "Utf8" + ] ] ]; [ "column2"; [ - "DataType"; - "Utf8" + "OptionalType"; + [ + "DataType"; + "Utf8" + ] ] - ] - ] - ] - ]; - "Data" = [ - [ - "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm"; - "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"; - "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, " - ] - ] - } - ] - }; - { - "Write" = [ - { - "Type" = [ - "ListType"; - [ - "StructType"; - [ + ]; [ - "column0"; + "column3"; [ - "DataType"; - "Utf8" + "OptionalType"; + [ + "DataType"; + "Utf8" + ] ] ]; [ - "column1"; + "column4"; [ - "DataType"; - "Utf8" + "OptionalType"; + [ + "DataType"; + "Utf8" + ] ] ]; [ - "column2"; + "column5"; [ - "DataType"; - "Utf8" + "OptionalType"; + [ + "DataType"; + "Utf8" + ] + ] + ]; + [ + "column6"; + [ + "OptionalType"; + [ + "DataType"; + "Utf8" + ] ] ] ] @@ -75,9 +82,50 @@ ]; "Data" = [ [ - "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k"; - "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m"; - "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r" + [ + "l\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm\xC3\xA4\xC3\x9Fig" + ]; + [ + "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm" + ]; + [ + "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m" + ]; + [ + "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, " + ]; + [ + "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k" + ]; + [ + "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m" + ]; + [ + "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r" + ] + ]; + [ + [ + "l\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm\xC3\xA4\xC3\x9Fig" + ]; + [ + "\xC3\xA4stig, m\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, \xC3\274berall, regelm" + ]; + [ + "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m" + ]; + [ + "\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m\xC3\xB6gen, " + ]; + [ + "\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k" + ]; + [ + "\xC3\266chten, ausf\xC3\xBChrlich, sp\xC3\xA4ter, k\xC3\xB6nnen, nat\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r\xC3\274ckw\xC3\xA4rts, k\xC3\xA4mpfen, m" + ]; + [ + "\xC3\xBCrlich, universit\xC3\xA4t, \xC3\266ffentlich, r" + ] ] ] } diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt index 11bcb15a2f..4004c75199 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Remove_/results.txt @@ -8,63 +8,45 @@ "StructType"; [ [ - "column0"; + "value"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "all"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "first"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "last"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "first2"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "last2"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ] ] @@ -72,104 +54,44 @@ ]; "Data" = [ [ - [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2" - ]; - [ - "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" - ]; - [ - "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" - ] + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2"; + "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"; + "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" ]; [ - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B" - ] + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B" ]; [ - [ - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD0\xB2\xD1\x8B\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2" - ]; - [ - "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" - ]; - [ - "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" - ] + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB2\xD1\x8B\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2"; + "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"; + "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" ]; [ - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x8B" - ] + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x8B" ]; [ - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ] + ""; + ""; + ""; + ""; + ""; + "" ] ] } diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt index 7390dbdbc3..bdb61e7f5c 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Replace_/results.txt @@ -8,83 +8,59 @@ "StructType"; [ [ - "column0"; + "value"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "all"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "first"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "last"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "first2"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "last2"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "first3"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "last3"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ] ] @@ -92,134 +68,54 @@ ]; "Data" = [ [ - [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD1\x8B\xD0\xB2z\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z" - ]; - [ - "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" - ]; - [ - "\xD1\x8B\xD0\xB2zzz\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz" - ] + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2z\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z"; + "\xD1\x8B\xD0\xB2\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"; + "\xD1\x8B\xD0\xB2zzz\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz" ]; [ - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fz\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0z\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fzzz\xD0\xB0\xD1\x87\xD1\x8B" - ]; - [ - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0zzz\xD1\x87\xD1\x8B" - ] + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fz\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0z\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8Fzzz\xD0\xB0\xD1\x87\xD1\x8B"; + "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0zzz\xD1\x87\xD1\x8B" ]; [ - [ - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "z\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z" - ]; - [ - "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2" - ]; - [ - "zzz\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0" - ]; - [ - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz" - ] + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "z\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2z"; + "\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2"; + "zzz\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; + "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2zzz" ]; [ - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2z\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0z\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2zzz\xD0\xB0\xD1\x84\xD1\x8B" - ]; - [ - "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x84\xD1\x8B" - ] + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2z\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0z\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2zzz\xD0\xB0\xD1\x84\xD1\x8B"; + "gd2\xD1\x86\xD0\2713\xD1\x8B\xD0\xB2\xD0\xB0zzz\xD1\x84\xD1\x8B" ]; [ - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ] + ""; + ""; + ""; + ""; + ""; + ""; + ""; + "" ] ] } diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt index 613b639ed0..22df398114 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Strip_/results.txt @@ -8,7 +8,7 @@ "StructType"; [ [ - "column0"; + "value"; [ "DataType"; "Utf8" @@ -20,53 +20,33 @@ "DataType"; "Utf8" ] - ]; - [ - "column2"; - [ - "DataType"; - "Utf8" - ] - ]; - [ - "column3"; - [ - "DataType"; - "Utf8" - ] - ]; - [ - "column4"; - [ - "DataType"; - "Utf8" - ] - ]; - [ - "column5"; - [ - "DataType"; - "Utf8" - ] - ]; - [ - "column6"; - [ - "DataType"; - "Utf8" - ] ] ] ] ]; "Data" = [ [ - "\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; - "\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD1\x87\xD1\x81\xD1\x8F\xD0\xB0\xD0\xB0\xD1\x87\xD1\x8B"; - "\xD0\xB0\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0\xD1\x8B\xD0\xB2\xD0\xB0"; - "\xD0\xB0\xD0\xB0\xD0\xB2 \xD1\x8B\xD0\xB0 \xD1\x8B\xD0\xB2\xD0\xB0 \xD1\x8B\xD0\xB2\xD0\xB0"; - "\xD1\x8B\xD0\xB2\xD0\xB0"; - "\xD0\xB2\xD0\xB0\xD0\xBE\xD0\xB0\xD0\xBE"; + "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD0\xB1\xD0\xB5\xD0\xB7 \xD0\xB2\xD0\xBD\xD0\xB5\xD1\x88\xD0\xBD\xD0\xB8\xD1\x85 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2"; + "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD0\xB1\xD0\xB5\xD0\xB7 \xD0\xB2\xD0\xBD\xD0\xB5\xD1\x88\xD0\xBD\xD0\xB8\xD1\x85 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2" + ]; + [ + " \xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBB\xD0\xB5\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB"; + "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBB\xD0\xB5\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB" + ]; + [ + "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBF\xD1\x80\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB "; + "\xD1\x82\xD0\xBE\xD0\xBB\xD1\x8C\xD0\xBA\xD0\xBE \xD0\xBF\xD1\x80\xD0\xB0\xD0\xB2\xD1\x8B\xD0\xB9 \xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB" + ]; + [ + "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0_\xD1\x81\xD0\xBE\xD0\xB2\xD1\x81\xD0\xB5\xD0\xBC_\xD0\xB1\xD0\xB5\xD0\xB7_\xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2"; + "\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0_\xD1\x81\xD0\xBE\xD0\xB2\xD1\x81\xD0\xB5\xD0\xBC_\xD0\xB1\xD0\xB5\xD0\xB7_\xD0\xBF\xD1\x80\xD0\xBE\xD0\xB1\xD0\xB5\xD0\xBB\xD0\xBE\xD0\xB2" + ]; + [ + "\xE2\x80\x89\xD1\x8E\xD0\xBD\xD0\xB8\xD0\xBA\xD0\xBE\xD0\xB4+\xD0\xBF\xD0\xB5\xD1\x80\xD0\xB5\xD0\xB2\xD0\xBE\xD0\xB4 \xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB8\n"; + "\xD1\x8E\xD0\xBD\xD0\xB8\xD0\xBA\xD0\xBE\xD0\xB4+\xD0\xBF\xD0\xB5\xD1\x80\xD0\xB5\xD0\xB2\xD0\xBE\xD0\xB4 \xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB8" + ]; + [ + ""; "" ] ] diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_UnicodeCodePoint_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_UnicodeCodePoint_/results.txt new file mode 100644 index 0000000000..cab1fc79ef --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_UnicodeCodePoint_/results.txt @@ -0,0 +1,120 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "code_point_list"; + [ + "ListType"; + [ + "DataType"; + "Uint32" + ] + ] + ]; + [ + "from_code_point_list"; + [ + "DataType"; + "Utf8" + ] + ]; + [ + "from_lazy_code_point_list"; + [ + "DataType"; + "Utf8" + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + "69"; + "121"; + "108"; + "252"; + "108" + ]; + "Eyl\xC3\xBCl"; + "Eyl\xC3\xBCl" + ]; + [ + [ + "1078"; + "1085"; + "1110"; + "1118"; + "1085"; + "1103" + ]; + "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"; + "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F" + ]; + [ + [ + "250"; + "110"; + "111"; + "114"; + "97" + ]; + "\xC3\xBAnora"; + "\xC3\xBAnora" + ]; + [ + [ + "67"; + "105"; + "1095"; + "32"; + "67"; + "105"; + "1095" + ]; + "Ci\xD1\x87 Ci\xD1\x87"; + "Ci\xD1\x87 Ci\xD1\x87" + ]; + [ + [ + "1087"; + "1088"; + "1080"; + "1074"; + "1077"; + "1090"; + "32"; + "1087"; + "1088"; + "1080"; + "1074"; + "1077"; + "1090" + ]; + "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"; + "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82" + ]; + [ + [ + "54" + ]; + "6"; + "6" + ]; + [ + []; + ""; + "" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt index 502cea3fd0..76cdb42446 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt @@ -10,11 +10,8 @@ [ "value"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ @@ -27,94 +24,43 @@ [ "length"; [ - "OptionalType"; - [ - "DataType"; - "Uint64" - ] + "DataType"; + "Uint64" ] ]; [ "one_end_substring"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "two_end_substring"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "remove_all"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ "levenstein"; [ - "OptionalType"; - [ - "DataType"; - "Uint64" - ] - ] - ]; - [ - "code_point_list"; - [ - "OptionalType"; - [ - "ListType"; - [ - "DataType"; - "Uint32" - ] - ] - ] - ]; - [ - "from_code_point_list"; - [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] - ] - ]; - [ - "from_lazy_code_point_list"; - [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Uint64" ] ]; [ "reverse"; [ - "OptionalType"; - [ - "DataType"; - "Utf8" - ] + "DataType"; + "Utf8" ] ]; [ @@ -162,226 +108,70 @@ ]; "Data" = [ [ - [ - "Eyl\xC3\xBCl" - ]; + "Eyl\xC3\xBCl"; %true; - [ - "5" - ]; - [ - "yl\xC3\xBCl" - ]; - [ - "Ey" - ]; - [ - "Eyl\xC3\xBCl" - ]; - [ - "5" - ]; - [ - [ - "69"; - "121"; - "108"; - "252"; - "108" - ] - ]; - [ - "Eyl\xC3\xBCl" - ]; - [ - "Eyl\xC3\xBCl" - ]; - [ - "l\xC3\xBClyE" - ]; + "5"; + "yl\xC3\xBCl"; + "Ey"; + "Eyl\xC3\xBCl"; + "5"; + "l\xC3\xBClyE"; #; #; #; # ]; [ - [ - "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F" - ]; + "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"; %true; - [ - "6" - ]; - [ - "\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F" - ]; - [ - "\xD0\xB6\xD0\xBD" - ]; - [ - "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F" - ]; - [ - "5" - ]; - [ - [ - "1078"; - "1085"; - "1110"; - "1118"; - "1085"; - "1103" - ] - ]; - [ - "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F" - ]; - [ - "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F" - ]; - [ - "\xD1\x8F\xD0\xBD\xD1\x9E\xD1\x96\xD0\xBD\xD0\xB6" - ]; + "6"; + "\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"; + "\xD0\xB6\xD0\xBD"; + "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"; + "5"; + "\xD1\x8F\xD0\xBD\xD1\x9E\xD1\x96\xD0\xBD\xD0\xB6"; #; #; #; # ]; [ - [ - "\xC3\xBAnora" - ]; + "\xC3\xBAnora"; %true; - [ - "5" - ]; - [ - "nora" - ]; - [ - "\xC3\xBAn" - ]; - [ - "\xC3\xBAnoa" - ]; - [ - "5" - ]; - [ - [ - "250"; - "110"; - "111"; - "114"; - "97" - ] - ]; - [ - "\xC3\xBAnora" - ]; - [ - "\xC3\xBAnora" - ]; - [ - "aron\xC3\xBA" - ]; + "5"; + "nora"; + "\xC3\xBAn"; + "\xC3\xBAnoa"; + "5"; + "aron\xC3\xBA"; #; #; #; # ]; [ - [ - "Ci\xD1\x87 Ci\xD1\x87" - ]; + "Ci\xD1\x87 Ci\xD1\x87"; %true; - [ - "7" - ]; - [ - "i\xD1\x87 Ci\xD1\x87" - ]; - [ - "Ci" - ]; - [ - "Ci Ci" - ]; - [ - "5" - ]; - [ - [ - "67"; - "105"; - "1095"; - "32"; - "67"; - "105"; - "1095" - ] - ]; - [ - "Ci\xD1\x87 Ci\xD1\x87" - ]; - [ - "Ci\xD1\x87 Ci\xD1\x87" - ]; - [ - "\xD1\x87iC \xD1\x87iC" - ]; + "7"; + "i\xD1\x87 Ci\xD1\x87"; + "Ci"; + "Ci Ci"; + "5"; + "\xD1\x87iC \xD1\x87iC"; #; #; #; # ]; [ - [ - "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82" - ]; + "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"; %true; - [ - "13" - ]; - [ - "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82" - ]; - [ - "\xD0\xBF\xD1\x80" - ]; - [ - "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82" - ]; - [ - "5" - ]; - [ - [ - "1087"; - "1088"; - "1080"; - "1074"; - "1077"; - "1090"; - "32"; - "1087"; - "1088"; - "1080"; - "1074"; - "1077"; - "1090" - ] - ]; - [ - "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82" - ]; - [ - "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82" - ]; - [ - "\xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF \xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF" - ]; + "13"; + "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"; + "\xD0\xBF\xD1\x80"; + "\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"; + "5"; + "\xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF \xD1\x82\xD0\xB5\xD0\xB2\xD0\xB8\xD1\x80\xD0\xBF"; [ "4" ]; @@ -396,76 +186,28 @@ ] ]; [ - [ - "6" - ]; + "6"; %true; - [ - "1" - ]; - [ - "" - ]; - [ - "6" - ]; - [ - "6" - ]; - [ - "1" - ]; - [ - [ - "54" - ] - ]; - [ - "6" - ]; - [ - "6" - ]; - [ - "6" - ]; + "1"; + ""; + "6"; + "6"; + "1"; + "6"; #; #; #; # ]; [ - [ - "" - ]; + ""; %true; - [ - "0" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; - [ - "0" - ]; - [ - [] - ]; - [ - "" - ]; - [ - "" - ]; - [ - "" - ]; + "0"; + ""; + ""; + ""; + "0"; + ""; #; #; #; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in new file mode 100644 index 0000000000..c40336b0e2 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in @@ -0,0 +1,2 @@ +{"key"="1";"value"="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"}; +{"key"="2";"value"="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in.attr new file mode 100644 index 0000000000..ea891bb344 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["OptionalType";["DataType";"Utf8"]]]; + ["value";["OptionalType";["DataType";"Utf8"]]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.sql new file mode 100644 index 0000000000..0954e77c94 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockFind.sql @@ -0,0 +1,12 @@ + +pragma UseBlocks; + +SELECT + value as value, + Unicode::Substring(value, Unicode::Find(value, "ä"u), Unicode::RFind(value, "ä"u) - Unicode::Find(value, "ä"u)), + Unicode::Substring(value, Unicode::Find(value, "ö"u), Unicode::RFind(value, "ö"u) - Unicode::Find(value, "ö"u)), + Unicode::Substring(value, Unicode::Find(value, "ü"u), Unicode::RFind(value, "ü"u) - Unicode::Find(value, "ü"u)), + Unicode::Substring(value, Unicode::Find(value, "ä"u, 30ul), Unicode::RFind(value, "ä"u, 123ul) - Unicode::Find(value, "ä"u, 30ul)), + Unicode::Substring(value, Unicode::Find(value, "ö"u, 9ul), Unicode::RFind(value, "ö"u, 103ul) - Unicode::Find(value, "ö"u, 9ul)), + Unicode::Substring(value, Unicode::Find(value, "ü"u, 45ul), Unicode::RFind(value, "ü"u, 83ul) - Unicode::Find(value, "ü"u, 45ul)) +from Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in new file mode 100644 index 0000000000..95262ac2b9 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in @@ -0,0 +1,5 @@ +{"key"="1";"value"="ываыва"}; +{"key"="2";"value"="ячсячсяаачы"}; +{"key"="3";"value"="аавыаываыва"}; +{"key"="4";"value"="gd2цй3ываафы"}; +{"key"="5";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in.attr new file mode 100644 index 0000000000..d5e5b2ca48 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.sql new file mode 100644 index 0000000000..04a8593148 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockRemove.sql @@ -0,0 +1,12 @@ +/* syntax version 1 */ + +pragma UseBlocks; + +SELECT + value as value, + Unicode::RemoveAll(value, "фа"u) AS all, + Unicode::RemoveFirst(value, "а"u) AS first, + Unicode::RemoveLast(value, "а"u) AS last, + Unicode::RemoveFirst(value, "фа"u) AS first2, + Unicode::RemoveLast(value, "фа"u) AS last2 +FROM Input; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.in.attr new file mode 100644 index 0000000000..d5e5b2ca48 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.sql new file mode 100644 index 0000000000..c50f01c184 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockReplace.sql @@ -0,0 +1,14 @@ +/* syntax version 1 */ + +pragma UseBlocks; + +SELECT + value, + Unicode::ReplaceAll(value, Utf8("аф"), Utf8("zzz")) AS all, + Unicode::ReplaceFirst(value, Utf8("а"), Utf8("z")) AS first, + Unicode::ReplaceLast(value, Utf8("а"), Utf8("z")) AS last, + Unicode::ReplaceFirst(value, Utf8("а"), Utf8("")) AS first2, + Unicode::ReplaceLast(value, Utf8("а"), Utf8("")) AS last2, + Unicode::ReplaceFirst(value, Utf8("а"), Utf8("zzz")) AS first3, + Unicode::ReplaceLast(value, Utf8("а"), Utf8("zzz")) AS last3 +FROM Input
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in new file mode 100644 index 0000000000..d8e23353ed --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in @@ -0,0 +1,6 @@ +{"key"="1";"value"="строка без внешних пробелов"}; +{"key"="2";"value"=" только левый пробел"}; +{"key"="3";"value"="только правый пробел "}; +{"key"="4";"value"="строка_совсем_без_пробелов"}; +{"key"="5";"value"="\u2009юникод+перевод строки\n"}; +{"key"="6";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in.attr new file mode 100644 index 0000000000..d5e5b2ca48 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.sql new file mode 100644 index 0000000000..04e1b04764 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockStrip.sql @@ -0,0 +1,8 @@ +/* syntax version 1 */ + +pragma UseBlocks; + +SELECT + value as value, + Unicode::Strip(value) +From Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in new file mode 100644 index 0000000000..d9c36c855a --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in @@ -0,0 +1,7 @@ +{"key"="";"value"="Eyl\xC3\xBCl"}; +{"key"="";"value"="\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"}; +{"key"="";"value"="\xC3\xBAnora"}; +{"key"="";"value"="Ci\xD1\x87 Ci\xD1\x87"}; +{"key"="";"value"="\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"}; +{"key"="";"value"="6"}; +{"key"="";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in.attr new file mode 100644 index 0000000000..5f1b009fbf --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +}
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.sql new file mode 100644 index 0000000000..3ab0ffc96f --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockUnicode.sql @@ -0,0 +1,18 @@ +/* syntax version 1 */ + +pragma UseBlocks; + +SELECT + value AS value, + Unicode::IsUtf(value) AS is, + Unicode::GetLength(value) AS length, + Unicode::Substring(value, 1) AS one_end_substring, + Unicode::Substring(value, 0, 2) AS two_end_substring, + Unicode::RemoveAll(value, "\xD1\x87пr") AS remove_all, + Unicode::LevensteinDistance(value, value || Unicode::Substring(value, 0, 5)) AS levenstein, + Unicode::Reverse(value) AS reverse, + Unicode::Find(value, "ет"u) AS find, + Unicode::RFind(value, "ет"u) AS rfind, + Unicode::Find(value, "ет"u, 7ul) AS find_from, + Unicode::RFind(value, "ет"u, 7ul) AS rfind_from +FROM Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Find.in b/yql/essentials/udfs/common/unicode_base/test/cases/Find.in new file mode 100644 index 0000000000..c40336b0e2 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Find.in @@ -0,0 +1,2 @@ +{"key"="1";"value"="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"}; +{"key"="2";"value"="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Find.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Find.in.attr new file mode 100644 index 0000000000..ea891bb344 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Find.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["OptionalType";["DataType";"Utf8"]]]; + ["value";["OptionalType";["DataType";"Utf8"]]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Find.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Find.sql index 9a9a58752e..1515be76c6 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/Find.sql +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Find.sql @@ -1,13 +1,9 @@ -$text ="lästig, möchten, ausführlich, später, können, natürlich, universität, öffentlich, rückwärts, kämpfen, mögen, überall, regelmäßig"u; - SELECT - Unicode::Substring($text, Unicode::Find($text, "ä"u), Unicode::RFind($text, "ä"u) - Unicode::Find($text, "ä"u)), - Unicode::Substring($text, Unicode::Find($text, "ö"u), Unicode::RFind($text, "ö"u) - Unicode::Find($text, "ö"u)), - Unicode::Substring($text, Unicode::Find($text, "ü"u), Unicode::RFind($text, "ü"u) - Unicode::Find($text, "ü"u)); - - -SELECT - Unicode::Substring($text, Unicode::Find($text, "ä"u, 30ul), Unicode::RFind($text, "ä"u, 123ul) - Unicode::Find($text, "ä"u, 30ul)), - Unicode::Substring($text, Unicode::Find($text, "ö"u, 9ul), Unicode::RFind($text, "ö"u, 103ul) - Unicode::Find($text, "ö"u, 9ul)), - Unicode::Substring($text, Unicode::Find($text, "ü"u, 45ul), Unicode::RFind($text, "ü"u, 83ul) - Unicode::Find($text, "ü"u, 45ul)); - + value as value, + Unicode::Substring(value, Unicode::Find(value, "ä"u), Unicode::RFind(value, "ä"u) - Unicode::Find(value, "ä"u)), + Unicode::Substring(value, Unicode::Find(value, "ö"u), Unicode::RFind(value, "ö"u) - Unicode::Find(value, "ö"u)), + Unicode::Substring(value, Unicode::Find(value, "ü"u), Unicode::RFind(value, "ü"u) - Unicode::Find(value, "ü"u)), + Unicode::Substring(value, Unicode::Find(value, "ä"u, 30ul), Unicode::RFind(value, "ä"u, 123ul) - Unicode::Find(value, "ä"u, 30ul)), + Unicode::Substring(value, Unicode::Find(value, "ö"u, 9ul), Unicode::RFind(value, "ö"u, 103ul) - Unicode::Find(value, "ö"u, 9ul)), + Unicode::Substring(value, Unicode::Find(value, "ü"u, 45ul), Unicode::RFind(value, "ü"u, 83ul) - Unicode::Find(value, "ü"u, 45ul)) +from Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in new file mode 100644 index 0000000000..95262ac2b9 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in @@ -0,0 +1,5 @@ +{"key"="1";"value"="ываыва"}; +{"key"="2";"value"="ячсячсяаачы"}; +{"key"="3";"value"="аавыаываыва"}; +{"key"="4";"value"="gd2цй3ываафы"}; +{"key"="5";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in.attr new file mode 100644 index 0000000000..d5e5b2ca48 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql index ee96037f79..eae1d678a3 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Remove.sql @@ -1,9 +1,9 @@ /* syntax version 1 */ SELECT - CAST(value AS Utf8), - Unicode::RemoveAll(CAST(value AS Utf8), Utf8("фа")) AS all, - Unicode::RemoveFirst(CAST(value AS Utf8), Utf8("а")) AS first, - Unicode::RemoveLast(CAST(value AS Utf8), Utf8("а")) AS last, - Unicode::RemoveFirst(CAST(value AS Utf8), Utf8("фа")) AS first2, - Unicode::RemoveLast(CAST(value AS Utf8), Utf8("фа")) AS last2 + value as value, + Unicode::RemoveAll(value, "фа"u) AS all, + Unicode::RemoveFirst(value, "а"u) AS first, + Unicode::RemoveLast(value, "а"u) AS last, + Unicode::RemoveFirst(value, "фа"u) AS first2, + Unicode::RemoveLast(value, "фа"u) AS last2 FROM Input; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Replace.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Replace.in.attr new file mode 100644 index 0000000000..d5e5b2ca48 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Replace.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql index d623984413..9f875627ed 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Replace.sql @@ -1,11 +1,11 @@ /* syntax version 1 */ SELECT - CAST(value AS Utf8), - Unicode::ReplaceAll(CAST(value AS Utf8), Utf8("аф"), Utf8("zzz")) AS all, - Unicode::ReplaceFirst(CAST(value AS Utf8), Utf8("а"), Utf8("z")) AS first, - Unicode::ReplaceLast(CAST(value AS Utf8), Utf8("а"), Utf8("z")) AS last, - Unicode::ReplaceFirst(CAST(value AS Utf8), Utf8("а"), Utf8("")) AS first2, - Unicode::ReplaceLast(CAST(value AS Utf8), Utf8("а"), Utf8("")) AS last2, - Unicode::ReplaceFirst(CAST(value AS Utf8), Utf8("а"), Utf8("zzz")) AS first3, - Unicode::ReplaceLast(CAST(value AS Utf8), Utf8("а"), Utf8("zzz")) AS last3 -FROM Input; + value, + Unicode::ReplaceAll(value, Utf8("аф"), Utf8("zzz")) AS all, + Unicode::ReplaceFirst(value, Utf8("а"), Utf8("z")) AS first, + Unicode::ReplaceLast(value, Utf8("а"), Utf8("z")) AS last, + Unicode::ReplaceFirst(value, Utf8("а"), Utf8("")) AS first2, + Unicode::ReplaceLast(value, Utf8("а"), Utf8("")) AS last2, + Unicode::ReplaceFirst(value, Utf8("а"), Utf8("zzz")) AS first3, + Unicode::ReplaceLast(value, Utf8("а"), Utf8("zzz")) AS last3 +FROM Input
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in new file mode 100644 index 0000000000..d8e23353ed --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in @@ -0,0 +1,6 @@ +{"key"="1";"value"="строка без внешних пробелов"}; +{"key"="2";"value"=" только левый пробел"}; +{"key"="3";"value"="только правый пробел "}; +{"key"="4";"value"="строка_совсем_без_пробелов"}; +{"key"="5";"value"="\u2009юникод+перевод строки\n"}; +{"key"="6";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in.attr new file mode 100644 index 0000000000..d5e5b2ca48 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql index 45bde163e0..48f9498b8e 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Strip.sql @@ -1,9 +1,5 @@ /* syntax version 1 */ SELECT - Unicode::Strip("ываыва"u), - Unicode::Strip(" ячсячсяаачы"u), - Unicode::Strip("аавыаываыва "u), - Unicode::Strip("аав ыа ыва ыва "u), - Unicode::Strip("\u2009ыва\n"u), - Unicode::Strip("\u200aваоао\u2002"u), - Unicode::Strip(""u) + value as value, + Unicode::Strip(value) +From Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in index 55f0307e35..d9c36c855a 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in @@ -1,7 +1,7 @@ -{"key"="";"subkey"="";"value"="Eyl\xC3\xBCl"}; -{"key"="";"subkey"="";"value"="\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"}; -{"key"="";"subkey"="";"value"="\xC3\xBAnora"}; -{"key"="";"subkey"="";"value"="Ci\xD1\x87 Ci\xD1\x87"}; -{"key"="";"subkey"="";"value"="\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"}; -{"key"="";"subkey"="";"value"="6"}; -{"key"="";"subkey"="";"value"=""}; +{"key"="";"value"="Eyl\xC3\xBCl"}; +{"key"="";"value"="\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"}; +{"key"="";"value"="\xC3\xBAnora"}; +{"key"="";"value"="Ci\xD1\x87 Ci\xD1\x87"}; +{"key"="";"value"="\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"}; +{"key"="";"value"="6"}; +{"key"="";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in.attr new file mode 100644 index 0000000000..5f1b009fbf --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +}
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql index cdff12f352..6cbaededb6 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql @@ -7,12 +7,9 @@ SELECT Unicode::Substring(value, 0, 2) AS two_end_substring, Unicode::RemoveAll(value, "\xD1\x87пr") AS remove_all, Unicode::LevensteinDistance(value, value || Unicode::Substring(value, 0, 5)) AS levenstein, - Unicode::ToCodePointList(value) AS code_point_list, - Unicode::FromCodePointList(Unicode::ToCodePointList(value)) AS from_code_point_list, - Unicode::FromCodePointList(YQL::LazyList(Unicode::ToCodePointList(value))) AS from_lazy_code_point_list, Unicode::Reverse(value) AS reverse, Unicode::Find(value, "ет"u) AS find, Unicode::RFind(value, "ет"u) AS rfind, Unicode::Find(value, "ет"u, 7ul) AS find_from, Unicode::RFind(value, "ет"u, 7ul) AS rfind_from -FROM (SELECT CAST(value AS Utf8) AS value FROM Input); +FROM Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in new file mode 100644 index 0000000000..d9c36c855a --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in @@ -0,0 +1,7 @@ +{"key"="";"value"="Eyl\xC3\xBCl"}; +{"key"="";"value"="\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"}; +{"key"="";"value"="\xC3\xBAnora"}; +{"key"="";"value"="Ci\xD1\x87 Ci\xD1\x87"}; +{"key"="";"value"="\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"}; +{"key"="";"value"="6"}; +{"key"="";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in.attr new file mode 100644 index 0000000000..d5e5b2ca48 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.in.attr @@ -0,0 +1,8 @@ +{ + "_yql_row_spec"={ + "Type"=["StructType";[ + ["key";["DataType";"Utf8"]]; + ["value";["DataType";"Utf8"]] + ]]; + } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.sql b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.sql new file mode 100644 index 0000000000..cc26378317 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/UnicodeCodePoint.sql @@ -0,0 +1,6 @@ +/* syntax version 1 */ +SELECT + Unicode::ToCodePointList(value) AS code_point_list, + Unicode::FromCodePointList(Unicode::ToCodePointList(value)) AS from_code_point_list, + Unicode::FromCodePointList(YQL::LazyList(Unicode::ToCodePointList(value))) AS from_lazy_code_point_list, +FROM Input |