diff options
| author | atarasov5 <[email protected]> | 2025-02-10 13:41:59 +0300 | 
|---|---|---|
| committer | atarasov5 <[email protected]> | 2025-02-10 14:40:42 +0300 | 
| commit | e46bed95ee43ea70afccfa413ea7e9f9e088cc33 (patch) | |
| tree | dd3f39b014a9f1aacc2c132b7fccf6b770e411b3 /yql/essentials | |
| parent | d5a7416eb3d3b6e73c97d2511781875814cb7045 (diff) | |
YQL-19535: Provide block implementations for some functions
YQL-19535: Provide block operations
YQL-19535: Specify tests for blocked operations
commit_hash:032aa58fc3f44f0eba3d9b38def021178da949ce
Diffstat (limited to 'yql/essentials')
33 files changed, 865 insertions, 245 deletions
diff --git a/yql/essentials/tests/sql/minirun/part0/canondata/result.json b/yql/essentials/tests/sql/minirun/part0/canondata/result.json index 564c44698bf..f7aa9bc69d3 100644 --- a/yql/essentials/tests/sql/minirun/part0/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part0/canondata/result.json @@ -588,9 +588,9 @@      ],      "test.test[expr-struct_literal--Debug]": [          { -            "checksum": "7013abbd2487b4c5c0783bd9d8e8773e", -            "size": 582, -            "uri": "https://{canondata_backend}/1942525/ede9d81525f3cde3c09402fe9435fdbba85f47bc/resource.tar.gz#test.test_expr-struct_literal--Debug_/opt.yql" +            "checksum": "32fb9ad7f0ff99f13245971fde9c9e44", +            "size": 607, +            "uri": "https://{canondata_backend}/1600758/668d9612baf2b806cdbf57a4a5626576611cb0c8/resource.tar.gz#test.test_expr-struct_literal--Debug_/opt.yql"          }      ],      "test.test[expr-struct_literal--Results]": [ diff --git a/yql/essentials/tests/sql/minirun/part1/canondata/result.json b/yql/essentials/tests/sql/minirun/part1/canondata/result.json index e299c4d970d..9338973cc71 100644 --- a/yql/essentials/tests/sql/minirun/part1/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part1/canondata/result.json @@ -15,9 +15,9 @@      ],      "test.test[action-eval_capture--Debug]": [          { -            "checksum": "addd79d812135465fc39c6ede76b5b00", -            "size": 1065, -            "uri": "https://{canondata_backend}/1925821/e00f3e167890c5f5da97383429fa618c17c22f4b/resource.tar.gz#test.test_action-eval_capture--Debug_/opt.yql" +            "checksum": "11fa4fe28d1d33bfbe682131dba7ccdf", +            "size": 1090, +            "uri": "https://{canondata_backend}/1600758/8128a043e648302a268bf13245bc303a361f75b9/resource.tar.gz#test.test_action-eval_capture--Debug_/opt.yql"          }      ],      "test.test[action-eval_capture--Results]": [ @@ -1302,9 +1302,9 @@      ],      "test.test[udf-trivial_udf--Debug]": [          { -            "checksum": "e30ef93274f818b56638089fa4a0513e", -            "size": 400, -            "uri": "https://{canondata_backend}/995452/57f8b127ed5fa9fae2dd5ebb0f5870d86a7fcd2f/resource.tar.gz#test.test_udf-trivial_udf--Debug_/opt.yql" +            "checksum": "8a826f54ac3877f855e9d5f4039f1957", +            "size": 425, +            "uri": "https://{canondata_backend}/1937001/da32717675dd7b959b82585f5fe8b8f1d2542461/resource.tar.gz#test.test_udf-trivial_udf--Debug_/opt.yql"          }      ],      "test.test[udf-trivial_udf--Results]": [ diff --git a/yql/essentials/tests/sql/minirun/part4/canondata/result.json b/yql/essentials/tests/sql/minirun/part4/canondata/result.json index 97c6fb14dd7..97562f4708a 100644 --- a/yql/essentials/tests/sql/minirun/part4/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part4/canondata/result.json @@ -281,9 +281,9 @@      ],      "test.test[binding-compact_named_with_subq_contexts--Debug]": [          { -            "checksum": "6ae122590fa1a3740afe06c807082844", -            "size": 1096, -            "uri": "https://{canondata_backend}/1925821/db505909f0fb5dcb9a1c2635b652923e2e5d33c8/resource.tar.gz#test.test_binding-compact_named_with_subq_contexts--Debug_/opt.yql" +            "checksum": "5fbd0bbbfed9dceb14486930c58f6d2a", +            "size": 1135, +            "uri": "https://{canondata_backend}/1889210/9d6331356a8b5731f25d9bf2d510824a0256baa0/resource.tar.gz#test.test_binding-compact_named_with_subq_contexts--Debug_/opt.yql"          }      ],      "test.test[binding-compact_named_with_subq_contexts--Results]": [ diff --git a/yql/essentials/tests/sql/minirun/part5/canondata/result.json b/yql/essentials/tests/sql/minirun/part5/canondata/result.json index 148c9d957ee..d109287d0f3 100644 --- a/yql/essentials/tests/sql/minirun/part5/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part5/canondata/result.json @@ -1,9 +1,9 @@  {      "test.test[action-action_udf_args--Debug]": [          { -            "checksum": "8f84413764bb8e1f2b44fbc31956d7ec", -            "size": 430, -            "uri": "https://{canondata_backend}/1925821/6007882aec2e7b1330cc057157b466b121eec1eb/resource.tar.gz#test.test_action-action_udf_args--Debug_/opt.yql" +            "checksum": "460a745ac85e95986996b9d1aa9379ae", +            "size": 455, +            "uri": "https://{canondata_backend}/1809005/41147930b57b9f7a31e613bdd3a9f9eaef9009f6/resource.tar.gz#test.test_action-action_udf_args--Debug_/opt.yql"          }      ],      "test.test[action-action_udf_args--Results]": [ @@ -1091,9 +1091,9 @@      ],      "test.test[library-library_udf--Debug]": [          { -            "checksum": "e30ef93274f818b56638089fa4a0513e", -            "size": 400, -            "uri": "https://{canondata_backend}/1942100/1466d7e49a6dc5a8df761a5ac92539095e1a14a0/resource.tar.gz#test.test_library-library_udf--Debug_/opt.yql" +            "checksum": "8a826f54ac3877f855e9d5f4039f1957", +            "size": 425, +            "uri": "https://{canondata_backend}/1942671/adb6336095b48aab4fed8e97a973ecc6eb2c7004/resource.tar.gz#test.test_library-library_udf--Debug_/opt.yql"          }      ],      "test.test[library-library_udf--Results]": [ diff --git a/yql/essentials/tests/sql/minirun/part8/canondata/result.json b/yql/essentials/tests/sql/minirun/part8/canondata/result.json index 99429e3c851..76dd1b302e7 100644 --- a/yql/essentials/tests/sql/minirun/part8/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part8/canondata/result.json @@ -1,9 +1,9 @@  {      "test.test[action-eval_percentile-default.txt-Debug]": [          { -            "checksum": "f1d59477f03e0fa8b684f5c6db6a2aca", -            "size": 2173, -            "uri": "https://{canondata_backend}/1925821/dbb639c652a305ac0d22d675f471bfcd73848bae/resource.tar.gz#test.test_action-eval_percentile-default.txt-Debug_/opt.yql" +            "checksum": "132b473519fcf8576fbbc8a1ecdfd6bd", +            "size": 2202, +            "uri": "https://{canondata_backend}/1903280/9b009523486ad950a7d921352a60f1c892f4f1cc/resource.tar.gz#test.test_action-eval_percentile-default.txt-Debug_/opt.yql"          }      ],      "test.test[action-eval_percentile-default.txt-Results]": [ diff --git a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h index 4a852a5a6f6..a16582fb4e3 100644 --- a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h +++ b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h @@ -3,6 +3,7 @@  #include <yql/essentials/public/udf/udf_allocator.h>  #include <yql/essentials/public/udf/udf_helpers.h>  #include <yql/essentials/utils/utf8.h> +#include <yql/essentials/public/udf/arrow/udf_arrow_helpers.h>  #include <library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h>  #include <library/cpp/unicode/normalization/normalization.h> @@ -24,6 +25,9 @@ using namespace NUdf;  using namespace NUnicode;  namespace { +    inline constexpr bool IsAscii(wchar32 c) noexcept { +        return ::IsAscii(c); +    }      template <class It>      struct TIsUnicodeSpaceAdapter { @@ -37,51 +41,144 @@ namespace {          return {};      } -#define NORMALIZE_UDF_MAP(XX) \ -    XX(Normalize, NFC)        \ -    XX(NormalizeNFD, NFD)     \ -    XX(NormalizeNFC, NFC)     \ -    XX(NormalizeNFKD, NFKD)   \ -    XX(NormalizeNFKC, NFKC) - -#define IS_CATEGORY_UDF_MAP(XX) \ -    XX(IsAscii, IsAscii)   \ -    XX(IsSpace, IsSpace)        \ -    XX(IsUpper, IsUpper)        \ -    XX(IsLower, IsLower)        \ -    XX(IsDigit, IsDigit)        \ -    XX(IsAlpha, IsAlpha)        \ -    XX(IsAlnum, IsAlnum)        \ -    XX(IsHex, IsHexdigit) - -#define NORMALIZE_UDF(name, mode)                                                 \ -    SIMPLE_UDF(T##name, TUtf8(TAutoMap<TUtf8>)) {                                 \ -        const auto& inputRef = args[0].AsStringRef();                             \ -        const TUtf16String& input = UTF8ToWide(inputRef.Data(), inputRef.Size()); \ -        const TString& output = WideToUTF8(Normalize<mode>(input));               \ -        return valueBuilder->NewString(output);                                   \ -    } +    struct TNoChangesTag {}; -#define IS_CATEGORY_UDF(udfName, function)                                                \ -    SIMPLE_UDF(T##udfName, bool(TAutoMap<TUtf8>)) {                                       \ -        Y_UNUSED(valueBuilder);                                                           \ -        const TStringBuf input(args[0].AsStringRef());                                    \ -        bool result = true;                                                               \ -        wchar32 rune;                                                                     \ -        const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin()); \ -        const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end());  \ -        while (cur != last) {                                                             \ -            ReadUTF8CharAndAdvance(rune, cur, last);                                      \ -            if (!function(rune)) {                                                        \ -                result = false;                                                           \ -                break;                                                                    \ -            }                                                                             \ -        }                                                                                 \ -        return TUnboxedValuePod(result);                                                  \ -    } +    template <typename TDerived> +    struct TScalarOperationMixin { +        static TUnboxedValue DoExecute(const IValueBuilder* builder, const TUnboxedValuePod* args) { +            Y_DEBUG_ABORT_UNLESS(IsUtf8(args[0].AsStringRef())); +            auto&& executeResult = TDerived::Execute(args[0].AsStringRef()); +            return ProcessResult(builder, std::move(executeResult), args); +        } + +    private: +        static TUnboxedValue ProcessResult(const IValueBuilder* builder, TString&& newString, const TUnboxedValuePod*) { +            return builder->NewString(std::move(newString)); +        } + +        template <typename T> +        static TUnboxedValue ProcessResult(const IValueBuilder* builder, std::variant<TNoChangesTag, T> newValue, const TUnboxedValuePod* initialArg) { +            if (std::holds_alternative<T>(newValue)) { +                return ProcessResult(builder, std::move(std::get<T>(newValue)), initialArg); +            } else { +                return initialArg[0]; +            } +        } + +        static TUnboxedValue ProcessResult(const IValueBuilder* builder, bool result, const TUnboxedValuePod*) { +            Y_UNUSED(builder); +            return TUnboxedValuePod(result); +        } +    }; + +    template <typename TDerived> +    struct TBlockOperationMixin { +        template <typename Sync> +        static void DoExecute(const TBlockItem arg, const Sync& sync) { +            Y_DEBUG_ABORT_UNLESS(IsUtf8(arg.AsStringRef())); +            auto&& executeResult = TDerived::Execute(arg.AsStringRef()); +            TBlockItem boxedValue = ProcessResult(std::move(executeResult), arg); +            sync(boxedValue); +        } + +    private: +        static TBlockItem ProcessResult(const TString& newString, const TBlockItem arg) { +            Y_UNUSED(arg); +            return TBlockItem(std::move(newString)); +        } + +        template <typename T> +        static TBlockItem ProcessResult(const std::variant<TNoChangesTag, T>& newValue, const TBlockItem arg) { +            if (std::holds_alternative<T>(newValue)) { +                return ProcessResult(std::get<T>(newValue), arg); +            } else { +                return arg; +            } +        } + +        static TBlockItem ProcessResult(bool result, const TBlockItem arg) { +            Y_UNUSED(arg); +            return TBlockItem(result); +        } +    }; -    NORMALIZE_UDF_MAP(NORMALIZE_UDF) -    IS_CATEGORY_UDF_MAP(IS_CATEGORY_UDF) +    template <typename TDerived> +    struct TOperationMixin: public TBlockOperationMixin<TDerived>, public TScalarOperationMixin<TDerived> { +        using TBlockOperationMixin<TDerived>::DoExecute; +        using TScalarOperationMixin<TDerived>::DoExecute; +    }; + +    template <auto mode> +    struct TNormalizeUTF8: public TOperationMixin<TNormalizeUTF8<mode>> { +        static TString Execute(TStringRef arg) { +            const TUtf16String& input = UTF8ToWide(arg.Data(), arg.Size()); +            return WideToUTF8(Normalize<mode>(input)); +        } +    }; + +    template <bool (*Function)(wchar32)> +    struct TCheckAllChars: public TOperationMixin<TCheckAllChars<Function>> { +        static bool Execute(TStringRef arg) { +            const TStringBuf input(arg); +            wchar32 rune; +            const unsigned char* cur = reinterpret_cast<const unsigned char*>(input.begin()); +            const unsigned char* last = reinterpret_cast<const unsigned char*>(input.end()); +            while (cur != last) { +                ReadUTF8CharAndAdvance(rune, cur, last); +                if (!static_cast<bool (*)(wchar32)>(Function)(rune)) { +                    return false; +                } +            } +            return true; +        } +    }; + +    template <bool (*Function)(TUtf16String&, size_t pos, size_t count)> +    struct TStringToStringMapper: public TOperationMixin<TStringToStringMapper<Function>> { +        static std::variant<TNoChangesTag, TString> Execute(TStringRef arg) { +            if (auto wide = UTF8ToWide(arg); +                static_cast<bool (*)(TUtf16String&, size_t pos, size_t count)>(Function)(wide, 0, TUtf16String::npos)) { +                return WideToUTF8(std::move(wide)); +            } else { +                return TNoChangesTag{}; +            } +        } +    }; + +#define DEFINE_UTF8_OPERATION(udfName, Executor, signature)                                          \ +    BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName, signature) {                                           \ +        return Executor::DoExecute(valueBuilder, args);                                              \ +    }                                                                                                \ +                                                                                                     \ +    struct T##udfName##KernelExec                                                                    \ +        : public TUnaryKernelExec<T##udfName##KernelExec> {                                          \ +        template <typename TSink>                                                                    \ +        static void Process(const IValueBuilder* valueBuilder, TBlockItem arg1, const TSink& sink) { \ +            Y_UNUSED(valueBuilder);                                                                  \ +            Executor::DoExecute(arg1, sink);                                                         \ +        }                                                                                            \ +    };                                                                                               \ +                                                                                                     \ +    END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) + +    DEFINE_UTF8_OPERATION(Normalize, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(NormalizeNFD, TNormalizeUTF8<NFD>, TUtf8(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(NormalizeNFC, TNormalizeUTF8<NFC>, TUtf8(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(NormalizeNFKD, TNormalizeUTF8<NFKD>, TUtf8(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(NormalizeNFKC, TNormalizeUTF8<NFKC>, TUtf8(TAutoMap<TUtf8>)); + +    DEFINE_UTF8_OPERATION(IsAscii, TCheckAllChars<IsAscii>, bool(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(IsSpace, TCheckAllChars<IsSpace>, bool(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(IsUpper, TCheckAllChars<IsUpper>, bool(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(IsLower, TCheckAllChars<IsLower>, bool(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(IsDigit, TCheckAllChars<IsDigit>, bool(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(IsAlpha, TCheckAllChars<IsAlpha>, bool(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(IsAlnum, TCheckAllChars<IsAlnum>, bool(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(IsHex, TCheckAllChars<IsHexdigit>, bool(TAutoMap<TUtf8>)); + +    DEFINE_UTF8_OPERATION(ToTitle, TStringToStringMapper<ToTitle>, TUtf8(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(ToUpper, TStringToStringMapper<ToUpper>, TUtf8(TAutoMap<TUtf8>)); +    DEFINE_UTF8_OPERATION(ToLower, TStringToStringMapper<ToLower>, TUtf8(TAutoMap<TUtf8>));      SIMPLE_UDF(TIsUtf, bool(TOptional<char*>)) {          Y_UNUSED(valueBuilder); @@ -461,27 +558,6 @@ namespace {          return valueBuilder->NewString(WideToUTF8(wide));      } -    SIMPLE_UDF(TToLower, TUtf8(TAutoMap<TUtf8>)) { -        if (auto wide = UTF8ToWide(args->AsStringRef()); ToLower(wide)) -            return valueBuilder->NewString(WideToUTF8(wide)); -        else -            return *args; -    } - -    SIMPLE_UDF(TToUpper, TUtf8(TAutoMap<TUtf8>)) { -        if (auto wide = UTF8ToWide(args->AsStringRef()); ToUpper(wide)) -            return valueBuilder->NewString(WideToUTF8(wide)); -        else -            return *args; -    } - -    SIMPLE_UDF(TToTitle, TUtf8(TAutoMap<TUtf8>)) { -        if (auto wide = UTF8ToWide(args->AsStringRef()); ToTitle(wide)) -            return valueBuilder->NewString(WideToUTF8(wide)); -        else -            return *args; -    } -      SIMPLE_UDF(TStrip, TUtf8(TAutoMap<TUtf8>)) {          const TUtf32String input = UTF8ToUTF32<true>(args[0].AsStringRef());          const auto& result = StripString(input, IsUnicodeSpaceAdapter(input.begin())); @@ -512,33 +588,42 @@ namespace {          return TUnboxedValuePod(result);      } -#define REGISTER_NORMALIZE_UDF(name, mode) T##name, -#define REGISTER_IS_CATEGORY_UDF(name, function) T##name,  #define EXPORTED_UNICODE_BASE_UDF \ -    NORMALIZE_UDF_MAP(REGISTER_NORMALIZE_UDF) \ -    IS_CATEGORY_UDF_MAP(REGISTER_IS_CATEGORY_UDF) \ -    TIsUtf, \ -    TGetLength, \ -    TSubstring, \ -    TFind, \ -    TRFind, \ -    TSplitToList, \ -    TJoinFromList, \ -    TLevensteinDistance, \ -    TReplaceAll, \ -    TReplaceFirst, \ -    TReplaceLast, \ -    TRemoveAll, \ -    TRemoveFirst, \ -    TRemoveLast, \ -    TToCodePointList, \ -    TFromCodePointList, \ -    TReverse, \ -    TToLower, \ -    TToUpper, \ -    TToTitle, \ -    TToUint64, \ -    TTryToUint64, \ -    TStrip, \ -    TIsUnicodeSet +        TIsUtf,                   \ +        TGetLength,               \ +        TSubstring,               \ +        TFind,                    \ +        TRFind,                   \ +        TSplitToList,             \ +        TJoinFromList,            \ +        TLevensteinDistance,      \ +        TReplaceAll,              \ +        TReplaceFirst,            \ +        TReplaceLast,             \ +        TRemoveAll,               \ +        TRemoveFirst,             \ +        TRemoveLast,              \ +        TToCodePointList,         \ +        TFromCodePointList,       \ +        TReverse,                 \ +        TToLower,                 \ +        TToUpper,                 \ +        TToTitle,                 \ +        TToUint64,                \ +        TTryToUint64,             \ +        TStrip,                   \ +        TIsUnicodeSet,            \ +        TNormalize,               \ +        TNormalizeNFD,            \ +        TNormalizeNFC,            \ +        TNormalizeNFKD,           \ +        TNormalizeNFKC,           \ +        TIsAscii,                 \ +        TIsSpace,                 \ +        TIsUpper,                 \ +        TIsLower,                 \ +        TIsDigit,                 \ +        TIsAlpha,                 \ +        TIsAlnum,                 \ +        TIsHex  } diff --git a/yql/essentials/udfs/common/unicode_base/lib/ya.make b/yql/essentials/udfs/common/unicode_base/lib/ya.make index f50858d02ae..2fda0829667 100644 --- a/yql/essentials/udfs/common/unicode_base/lib/ya.make +++ b/yql/essentials/udfs/common/unicode_base/lib/ya.make @@ -2,7 +2,7 @@ LIBRARY()  YQL_ABI_VERSION(      2 -    27 +    37      0  ) @@ -16,6 +16,7 @@ PEERDIR(      library/cpp/unicode/normalization      library/cpp/unicode/set      yql/essentials/public/udf +    yql/essentials/public/udf/arrow      yql/essentials/utils  ) diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json index 8d19afc4281..8189dd16e08 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json @@ -1,4 +1,19 @@  { +    "test.test[BlockIsCategory]": [ +        { +            "uri": "file://test.test_BlockIsCategory_/results.txt" +        } +    ], +    "test.test[BlockNormalize]": [ +        { +            "uri": "file://test.test_BlockNormalize_/results.txt" +        } +    ], +    "test.test[BlockTo]": [ +        { +            "uri": "file://test.test_BlockTo_/results.txt" +        } +    ],      "test.test[Find]": [          {              "uri": "file://test.test_Find_/results.txt" @@ -19,6 +34,11 @@              "uri": "file://test.test_List_/results.txt"          }      ], +    "test.test[Normalize]": [ +        { +            "uri": "file://test.test_Normalize_/results.txt" +        } +    ],      "test.test[Remove]": [          {              "uri": "file://test.test_Remove_/results.txt" diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockIsCategory_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockIsCategory_/results.txt new file mode 100644 index 00000000000..e95de9fe1d2 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockIsCategory_/results.txt @@ -0,0 +1,160 @@ +[ +    { +        "Write" = [ +            { +                "Type" = [ +                    "ListType"; +                    [ +                        "StructType"; +                        [ +                            [ +                                "value"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "column1"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ]; +                            [ +                                "column2"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ]; +                            [ +                                "column3"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ]; +                            [ +                                "column4"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ]; +                            [ +                                "column5"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ]; +                            [ +                                "column6"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ]; +                            [ +                                "column7"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ]; +                            [ +                                "column8"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ]; +                            [ +                                "column9"; +                                [ +                                    "DataType"; +                                    "Bool" +                                ] +                            ] +                        ] +                    ] +                ]; +                "Data" = [ +                    [ +                        "0F3A4E"; +                        %true; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %true; +                        %true; +                        %false +                    ]; +                    [ +                        "\xD0\xB2\xD0\x92\xD0\xB0\xD0\x92\xD1\x8B\xD0\xB0"; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %true; +                        %true; +                        %false; +                        %false +                    ]; +                    [ +                        "\xD1\x84\xD1\x8B\xD0\xB2"; +                        %false; +                        %false; +                        %false; +                        %true; +                        %false; +                        %true; +                        %true; +                        %false; +                        %false +                    ]; +                    [ +                        "1234"; +                        %true; +                        %false; +                        %false; +                        %false; +                        %true; +                        %false; +                        %true; +                        %true; +                        %false +                    ]; +                    [ +                        "\xD0\xB2\xD1\2132\xD0\xB2-\xD0\xB0"; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false +                    ]; +                    [ +                        "\xD0\xB2\xD1\x8B\xD0\2601-!\xD1\x8B\xD0\xB2"; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false +                    ] +                ] +            } +        ] +    } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockNormalize_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockNormalize_/results.txt new file mode 100644 index 00000000000..2fc20b07f1e --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockNormalize_/results.txt @@ -0,0 +1,92 @@ +[ +    { +        "Write" = [ +            { +                "Type" = [ +                    "ListType"; +                    [ +                        "StructType"; +                        [ +                            [ +                                "value"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize_nfd"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize_nfc"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize_nfkd"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize_nfkc"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ] +                        ] +                    ] +                ]; +                "Data" = [ +                    [ +                        "\xC3\xA9"; +                        "\xC3\xA9"; +                        "e\xCC\x81"; +                        "\xC3\xA9"; +                        "e\xCC\x81"; +                        "\xC3\xA9" +                    ]; +                    [ +                        "e\xCC\x81"; +                        "\xC3\xA9"; +                        "e\xCC\x81"; +                        "\xC3\xA9"; +                        "e\xCC\x81"; +                        "\xC3\xA9" +                    ]; +                    [ +                        "\xC2\xB5"; +                        "\xC2\xB5"; +                        "\xC2\xB5"; +                        "\xC2\xB5"; +                        "\xCE\xBC"; +                        "\xCE\xBC" +                    ]; +                    [ +                        "\xE2\x84\x8C"; +                        "\xE2\x84\x8C"; +                        "\xE2\x84\x8C"; +                        "\xE2\x84\x8C"; +                        "H"; +                        "H" +                    ] +                ] +            } +        ] +    } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockTo_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockTo_/results.txt new file mode 100644 index 00000000000..7f7b2525d78 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_BlockTo_/results.txt @@ -0,0 +1,102 @@ +[ +    { +        "Write" = [ +            { +                "Type" = [ +                    "ListType"; +                    [ +                        "StructType"; +                        [ +                            [ +                                "value"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "lower"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "upper"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "title"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "reverse"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ] +                        ] +                    ] +                ]; +                "Data" = [ +                    [ +                        "test"; +                        "test"; +                        "TEST"; +                        "Test"; +                        "tset" +                    ]; +                    [ +                        "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; +                        "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; +                        "\xD0\xA2\xD0\x95\xD0\xA1\xD0\xA2"; +                        "\xD0\xA2\xD0\xB5\xD1\x81\xD1\x82"; +                        "\xD1\x82\xD1\x81\xD0\xB5\xD1\x82" +                    ]; +                    [ +                        "TeSt"; +                        "test"; +                        "TEST"; +                        "Test"; +                        "tSeT" +                    ]; +                    [ +                        "\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"; +                        "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"; +                        "\xD0\xA2\xD0\x95\xD0\xA1\xD0\xA2"; +                        "\xD0\xA2\xD0\xB5\xD1\x81\xD1\x82"; +                        "\xD0\xA2\xD1\x81\xD0\x95\xD1\x82" +                    ]; +                    [ +                        "Eyl\xC3\xBCl"; +                        "eyl\xC3\xBCl"; +                        "EYL\xC3\x9CL"; +                        "Eyl\xC3\xBCl"; +                        "l\xC3\xBClyE" +                    ]; +                    [ +                        "6"; +                        "6"; +                        "6"; +                        "6"; +                        "6" +                    ]; +                    [ +                        ""; +                        ""; +                        ""; +                        ""; +                        "" +                    ] +                ] +            } +        ] +    } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt index a6fd861c645..e95de9fe1d2 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_IsCategory_/results.txt @@ -8,10 +8,10 @@                          "StructType";                          [                              [ -                                "column0"; +                                "value";                                  [                                      "DataType"; -                                    "Bool" +                                    "Utf8"                                  ]                              ];                              [ @@ -76,85 +76,81 @@                                      "DataType";                                      "Bool"                                  ] -                            ]; -                            [ -                                "column10"; -                                [ -                                    "DataType"; -                                    "Bool" -                                ] -                            ]; -                            [ -                                "column11"; -                                [ -                                    "DataType"; -                                    "Bool" -                                ] -                            ]; -                            [ -                                "column12"; -                                [ -                                    "DataType"; -                                    "Bool" -                                ] -                            ]; -                            [ -                                "column13"; -                                [ -                                    "DataType"; -                                    "Bool" -                                ] -                            ]; -                            [ -                                "column14"; -                                [ -                                    "DataType"; -                                    "Bool" -                                ] -                            ]; -                            [ -                                "column15"; -                                [ -                                    "DataType"; -                                    "Bool" -                                ] -                            ]; -                            [ -                                "column16"; -                                [ -                                    "DataType"; -                                    "Bool" -                                ] -                            ]; -                            [ -                                "column17"; -                                [ -                                    "DataType"; -                                    "Bool" -                                ]                              ]                          ]                      ]                  ];                  "Data" = [                      [ +                        "0F3A4E";                          %true;                          %false; -                        %true;                          %false; +                        %false; +                        %false; +                        %false; +                        %true;                          %true; +                        %false +                    ]; +                    [ +                        "\xD0\xB2\xD0\x92\xD0\xB0\xD0\x92\xD1\x8B\xD0\xB0"; +                        %false; +                        %false;                          %false; +                        %false; +                        %false; +                        %true;                          %true;                          %false; +                        %false +                    ]; +                    [ +                        "\xD1\x84\xD1\x8B\xD0\xB2"; +                        %false; +                        %false; +                        %false;                          %true;                          %false;                          %true; +                        %true;                          %false; +                        %false +                    ]; +                    [ +                        "1234";                          %true;                          %false; +                        %false; +                        %false;                          %true;                          %false;                          %true; +                        %true; +                        %false +                    ]; +                    [ +                        "\xD0\xB2\xD1\2132\xD0\xB2-\xD0\xB0"; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false +                    ]; +                    [ +                        "\xD0\xB2\xD1\x8B\xD0\2601-!\xD1\x8B\xD0\xB2"; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false; +                        %false;                          %false                      ]                  ] diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Normalize_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Normalize_/results.txt new file mode 100644 index 00000000000..2fc20b07f1e --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Normalize_/results.txt @@ -0,0 +1,92 @@ +[ +    { +        "Write" = [ +            { +                "Type" = [ +                    "ListType"; +                    [ +                        "StructType"; +                        [ +                            [ +                                "value"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize_nfd"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize_nfc"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize_nfkd"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ]; +                            [ +                                "normalize_nfkc"; +                                [ +                                    "DataType"; +                                    "Utf8" +                                ] +                            ] +                        ] +                    ] +                ]; +                "Data" = [ +                    [ +                        "\xC3\xA9"; +                        "\xC3\xA9"; +                        "e\xCC\x81"; +                        "\xC3\xA9"; +                        "e\xCC\x81"; +                        "\xC3\xA9" +                    ]; +                    [ +                        "e\xCC\x81"; +                        "\xC3\xA9"; +                        "e\xCC\x81"; +                        "\xC3\xA9"; +                        "e\xCC\x81"; +                        "\xC3\xA9" +                    ]; +                    [ +                        "\xC2\xB5"; +                        "\xC2\xB5"; +                        "\xC2\xB5"; +                        "\xC2\xB5"; +                        "\xCE\xBC"; +                        "\xCE\xBC" +                    ]; +                    [ +                        "\xE2\x84\x8C"; +                        "\xE2\x84\x8C"; +                        "\xE2\x84\x8C"; +                        "\xE2\x84\x8C"; +                        "H"; +                        "H" +                    ] +                ] +            } +        ] +    } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt index 465ad350553..502cea3fd0f 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_Unicode_/results.txt @@ -18,16 +18,6 @@                                  ]                              ];                              [ -                                "normalize"; -                                [ -                                    "OptionalType"; -                                    [ -                                        "DataType"; -                                        "Utf8" -                                    ] -                                ] -                            ]; -                            [                                  "is";                                  [                                      "DataType"; @@ -175,9 +165,6 @@                          [                              "Eyl\xC3\xBCl"                          ]; -                        [ -                            "Eyl\xC3\xBCl" -                        ];                          %true;                          [                              "5" @@ -221,9 +208,6 @@                          [                              "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F"                          ]; -                        [ -                            "\xD0\xB6\xD0\xBD\xD1\x96\xD1\x9E\xD0\xBD\xD1\x8F" -                        ];                          %true;                          [                              "6" @@ -268,9 +252,6 @@                          [                              "\xC3\xBAnora"                          ]; -                        [ -                            "\xC3\xBAnora" -                        ];                          %true;                          [                              "5" @@ -314,9 +295,6 @@                          [                              "Ci\xD1\x87 Ci\xD1\x87"                          ]; -                        [ -                            "Ci\xD1\x87 Ci\xD1\x87" -                        ];                          %true;                          [                              "7" @@ -362,9 +340,6 @@                          [                              "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82"                          ]; -                        [ -                            "\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82 \xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82" -                        ];                          %true;                          [                              "13" @@ -424,9 +399,6 @@                          [                              "6"                          ]; -                        [ -                            "6" -                        ];                          %true;                          [                              "1" @@ -466,9 +438,6 @@                          [                              ""                          ]; -                        [ -                            "" -                        ];                          %true;                          [                              "0" diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in new file mode 100644 index 00000000000..4aba89386b4 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in @@ -0,0 +1,6 @@ +{"key"="1";"value"="0F3A4E"}; +{"key"="2";"value"="вВаВыа"}; +{"key"="3";"value"="фыв"}; +{"key"="4";"value"="1234"}; +{"key"="5";"value"="вы2в-а"}; +{"key"="6";"value"="выа1-!ыв"}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in.attr new file mode 100644 index 00000000000..d5e5b2ca484 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.in.attr @@ -0,0 +1,8 @@ +{ +    "_yql_row_spec"={ +        "Type"=["StructType";[ +            ["key";["DataType";"Utf8"]]; +            ["value";["DataType";"Utf8"]] +        ]]; +    } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.sql new file mode 100644 index 00000000000..3a2b3d0c214 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockIsCategory.sql @@ -0,0 +1,16 @@ +/* syntax version 1 */ + +pragma UseBlocks; + +SELECT +    value as value, +    Unicode::IsAscii(value), +    Unicode::IsSpace(value), +    Unicode::IsUpper(value), +    Unicode::IsLower(value), +    Unicode::IsDigit(value), +    Unicode::IsAlpha(value), +    Unicode::IsAlnum(value), +    Unicode::IsHex(value), +    Unicode::IsUnicodeSet(value, "[вао]"u) +FROM Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in new file mode 100644 index 00000000000..2e56f171a4b --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in @@ -0,0 +1,4 @@ +{"key"="1";"value"="\xC3\xA9"}; +{"key"="2";"value"="e\xCC\x81"}; +{"key"="3";"value"="\xC2\xB5"}; +{"key"="4";"value"="\xE2\x84\x8C"}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in.attr new file mode 100644 index 00000000000..d5e5b2ca484 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.in.attr @@ -0,0 +1,8 @@ +{ +    "_yql_row_spec"={ +        "Type"=["StructType";[ +            ["key";["DataType";"Utf8"]]; +            ["value";["DataType";"Utf8"]] +        ]]; +    } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.sql new file mode 100644 index 00000000000..c0e063acd6b --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockNormalize.sql @@ -0,0 +1,13 @@ +/* syntax version 1 */ + +pragma UseBlocks; + +SELECT +    value AS value, +    Unicode::Normalize(value) AS normalize, +    Unicode::NormalizeNFD(value) AS normalize_nfd, +    Unicode::NormalizeNFC(value) AS normalize_nfc, +    Unicode::NormalizeNFKD(value) AS normalize_nfkd, +    Unicode::NormalizeNFKC(value) AS normalize_nfkc +FROM Input + diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in new file mode 100644 index 00000000000..82d72f16711 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in @@ -0,0 +1,7 @@ +{"key"="1";"value"="test"}; +{"key"="2";"value"="\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"}; +{"key"="3";"value"="TeSt"}; +{"key"="4";"value"="\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"}; +{"key"="5";"value"="Eyl\xC3\xBCl"}; +{"key"="6";"value"="6"}; +{"key"="4";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in.attr new file mode 100644 index 00000000000..d5e5b2ca484 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.in.attr @@ -0,0 +1,8 @@ +{ +    "_yql_row_spec"={ +        "Type"=["StructType";[ +            ["key";["DataType";"Utf8"]]; +            ["value";["DataType";"Utf8"]] +        ]]; +    } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.sql b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.sql new file mode 100644 index 00000000000..a4d546ca6dd --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/BlockTo.sql @@ -0,0 +1,12 @@ +/* syntax version 1 */ + +pragma UseBlocks; + +SELECT +    value, +    Unicode::ToLower(value) AS lower, +    Unicode::ToUpper(value) AS upper, +    Unicode::ToTitle(value) AS title, +    Unicode::Reverse(value) AS reverse, +FROM Input; + diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in new file mode 100644 index 00000000000..4aba89386b4 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in @@ -0,0 +1,6 @@ +{"key"="1";"value"="0F3A4E"}; +{"key"="2";"value"="вВаВыа"}; +{"key"="3";"value"="фыв"}; +{"key"="4";"value"="1234"}; +{"key"="5";"value"="вы2в-а"}; +{"key"="6";"value"="выа1-!ыв"}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in.attr new file mode 100644 index 00000000000..d5e5b2ca484 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.in.attr @@ -0,0 +1,8 @@ +{ +    "_yql_row_spec"={ +        "Type"=["StructType";[ +            ["key";["DataType";"Utf8"]]; +            ["value";["DataType";"Utf8"]] +        ]]; +    } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql index 2effa23221e..bd933f911bf 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql +++ b/yql/essentials/udfs/common/unicode_base/test/cases/IsCategory.sql @@ -1,21 +1,13 @@  /* syntax version 1 */  SELECT -    Unicode::IsAscii("sdf"u), -    Unicode::IsAscii("выавыа"u), -    Unicode::IsSpace(" \u2002\u200a"u), -    Unicode::IsSpace("выавыа"u), -    Unicode::IsUpper("ФЫВ"u), -    Unicode::IsUpper("вВаВыа"u), -    Unicode::IsLower("фыв"u), -    Unicode::IsLower("вВаВыа"u), -    Unicode::IsDigit("1234"u), -    Unicode::IsDigit("выавыа"u), -    Unicode::IsAlpha("фвфы"u), -    Unicode::IsAlpha("вы2в-а"u), -    Unicode::IsAlnum("фыв13в"u), -    Unicode::IsAlnum("выа1-}ыв"u), -    Unicode::IsHex("0F3A4E"u), -    Unicode::IsHex("ваоао"u), -    Unicode::IsUnicodeSet("ваоао"u, "[вао]"u), -    Unicode::IsUnicodeSet("ваоао"u, "[ваб]"u) - +    value as value, +    Unicode::IsAscii(value), +    Unicode::IsSpace(value), +    Unicode::IsUpper(value), +    Unicode::IsLower(value), +    Unicode::IsDigit(value), +    Unicode::IsAlpha(value), +    Unicode::IsAlnum(value), +    Unicode::IsHex(value), +    Unicode::IsUnicodeSet(value, "[вао]"u) +FROM Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in new file mode 100644 index 00000000000..2e56f171a4b --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in @@ -0,0 +1,4 @@ +{"key"="1";"value"="\xC3\xA9"}; +{"key"="2";"value"="e\xCC\x81"}; +{"key"="3";"value"="\xC2\xB5"}; +{"key"="4";"value"="\xE2\x84\x8C"}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in.attr new file mode 100644 index 00000000000..d5e5b2ca484 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.in.attr @@ -0,0 +1,8 @@ +{ +    "_yql_row_spec"={ +        "Type"=["StructType";[ +            ["key";["DataType";"Utf8"]]; +            ["value";["DataType";"Utf8"]] +        ]]; +    } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.sql new file mode 100644 index 00000000000..c0c8b053894 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Normalize.sql @@ -0,0 +1,9 @@ +/* syntax version 1 */ +SELECT +    value AS value, +    Unicode::Normalize(value) AS normalize, +    Unicode::NormalizeNFD(value) AS normalize_nfd, +    Unicode::NormalizeNFC(value) AS normalize_nfc, +    Unicode::NormalizeNFKD(value) AS normalize_nfkd, +    Unicode::NormalizeNFKC(value) AS normalize_nfkc +FROM Input diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/To.in b/yql/essentials/udfs/common/unicode_base/test/cases/To.in index 5effdb9971b..82d72f16711 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/To.in +++ b/yql/essentials/udfs/common/unicode_base/test/cases/To.in @@ -1,8 +1,7 @@ -{"key"="1";"subkey"="1";"value"="test"}; -{"key"="2";"subkey"="2";"value"="\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"}; -{"key"="3";"subkey"="3";"value"="TeSt"}; -{"key"="4";"subkey"="4";"value"="\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"}; -{"key"="5";"subkey"="5";"value"="Eyl\xC3\xBCl"}; -{"key"="6";"subkey"="6";"value"="6"}; -{"key"="4";"subkey"="4";"value"=""}; - +{"key"="1";"value"="test"}; +{"key"="2";"value"="\xD1\x82\xD0\xB5\xD1\x81\xD1\x82"}; +{"key"="3";"value"="TeSt"}; +{"key"="4";"value"="\xD1\x82\xD0\x95\xD1\x81\xD0\xA2"}; +{"key"="5";"value"="Eyl\xC3\xBCl"}; +{"key"="6";"value"="6"}; +{"key"="4";"value"=""}; diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr b/yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr index 990efb1ff2c..d5e5b2ca484 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr +++ b/yql/essentials/udfs/common/unicode_base/test/cases/To.in.attr @@ -1,12 +1,8 @@ -{"_yql_row_spec"={ -	"Type"=["StructType";[ -		["key";["DataType";"Utf8"]]; -		["subkey";["DataType";"Utf8"]]; -		["value";["DataType";"Utf8"]] -	]]; -	"SortDirections"=[1;1;]; -	"SortedBy"=["key";"subkey";]; -	"SortedByTypes"=[["DataType";"Utf8";];["DataType";"Utf8";];]; -	"SortMembers"=["key";"subkey";]; -}} - +{ +    "_yql_row_spec"={ +        "Type"=["StructType";[ +            ["key";["DataType";"Utf8"]]; +            ["value";["DataType";"Utf8"]] +        ]]; +    } +} diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql index b330682b6ed..cdff12f352b 100644 --- a/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql +++ b/yql/essentials/udfs/common/unicode_base/test/cases/Unicode.sql @@ -1,7 +1,6 @@  /* syntax version 1 */  SELECT      value AS value, -    Unicode::Normalize(value) AS normalize,      Unicode::IsUtf(value) AS is,      Unicode::GetLength(value) AS length,      Unicode::Substring(value, 1) AS one_end_substring, diff --git a/yql/essentials/udfs/common/unicode_base/ya.make b/yql/essentials/udfs/common/unicode_base/ya.make index 53a8f3af45b..4ec872e2495 100644 --- a/yql/essentials/udfs/common/unicode_base/ya.make +++ b/yql/essentials/udfs/common/unicode_base/ya.make @@ -2,7 +2,7 @@ YQL_UDF_CONTRIB(unicode_udf)      YQL_ABI_VERSION(          2 -        27 +        37          0      )  | 
