diff options
author | atarasov5 <atarasov5@yandex-team.com> | 2025-02-05 15:33:55 +0300 |
---|---|---|
committer | atarasov5 <atarasov5@yandex-team.com> | 2025-02-05 15:57:41 +0300 |
commit | cb5cc9d6099ffd7e87ece77981034b0d3999baae (patch) | |
tree | 6405cfc862daa4adbc251569725188fc854ec1b2 | |
parent | b0d84c66847b173adfa4330121bef3a9150edf2c (diff) | |
download | ydb-cb5cc9d6099ffd7e87ece77981034b0d3999baae.tar.gz |
YQL-19551: Fix utf16 encoding problems
commit_hash:a55c7dadcacd5aac18465edf6e6cee8fb77dfcc0
6 files changed, 81 insertions, 6 deletions
diff --git a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h index 52bfbb2785..4a852a5a6f 100644 --- a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h +++ b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h @@ -218,7 +218,7 @@ namespace { static void SplitToListImpl( const IValueBuilder* valueBuilder, const TUnboxedValue& input, - const TUtf16String::const_iterator start, + const TUtf32String::const_iterator start, const TIt& it, TTmpVector& result) { const std::string_view& original = input.AsStringRef(); @@ -281,8 +281,8 @@ namespace { SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); } } else { - const auto& input = UTF8ToWide(args[0].AsStringRef()); - const auto& delimeter = UTF8ToWide(args[1].AsStringRef()); + const auto& input = UTF8ToUTF32<true>(args[0].AsStringRef()); + const auto& delimeter = UTF8ToUTF32<true>(args[1].AsStringRef()); if (limit) { auto it = StringSplitter(input).SplitBySet(delimeter.c_str()).Limit(limit + 1); SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result); @@ -313,9 +313,9 @@ namespace { Y_UNUSED(valueBuilder); const TStringBuf left(args[0].AsStringRef()); const TStringBuf right(args[1].AsStringRef()); - const TUtf16String& leftWide = UTF8ToWide(left); - const TUtf16String& rightWide = UTF8ToWide(right); - const ui64 result = NLevenshtein::Distance(leftWide, rightWide); + const auto& leftUtf32 = UTF8ToUTF32<true>(left); + const auto& rightUtf32 = UTF8ToUTF32<true>(right); + const ui64 result = NLevenshtein::Distance(leftUtf32, rightUtf32); return TUnboxedValuePod(result); } diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json index c9012f2333..8d19afc428 100644 --- a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json @@ -9,6 +9,11 @@ "uri": "file://test.test_IsCategory_/results.txt" } ], + "test.test[LevensteinDistanceCodePoints]": [ + { + "uri": "file://test.test_LevensteinDistanceCodePoints_/results.txt" + } + ], "test.test[List]": [ { "uri": "file://test.test_List_/results.txt" @@ -24,6 +29,11 @@ "uri": "file://test.test_Replace_/results.txt" } ], + "test.test[SplitToListNoCrash]": [ + { + "uri": "file://test.test_SplitToListNoCrash_/results.txt" + } + ], "test.test[Strip]": [ { "uri": "file://test.test_Strip_/results.txt" diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_LevensteinDistanceCodePoints_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_LevensteinDistanceCodePoints_/results.txt new file mode 100644 index 0000000000..0f35bd34bd --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_LevensteinDistanceCodePoints_/results.txt @@ -0,0 +1,28 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "column0"; + [ + "DataType"; + "Uint64" + ] + ] + ] + ] + ]; + "Data" = [ + [ + "1" + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_SplitToListNoCrash_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_SplitToListNoCrash_/results.txt new file mode 100644 index 0000000000..8516052a94 --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_SplitToListNoCrash_/results.txt @@ -0,0 +1,33 @@ +[ + { + "Write" = [ + { + "Type" = [ + "ListType"; + [ + "StructType"; + [ + [ + "column0"; + [ + "ListType"; + [ + "DataType"; + "Utf8" + ] + ] + ] + ] + ] + ]; + "Data" = [ + [ + [ + "\xF0\x90\x8E\x80" + ] + ] + ] + } + ] + } +]
\ No newline at end of file diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/LevensteinDistanceCodePoints.sql b/yql/essentials/udfs/common/unicode_base/test/cases/LevensteinDistanceCodePoints.sql new file mode 100644 index 0000000000..ec072b726a --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/LevensteinDistanceCodePoints.sql @@ -0,0 +1,2 @@ +SELECT + Unicode::LevensteinDistance("\xF0\x90\x8E\x80"u, "\xF0\x9B\x80\x80"u) diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/SplitToListNoCrash.sql b/yql/essentials/udfs/common/unicode_base/test/cases/SplitToListNoCrash.sql new file mode 100644 index 0000000000..643292ea0b --- /dev/null +++ b/yql/essentials/udfs/common/unicode_base/test/cases/SplitToListNoCrash.sql @@ -0,0 +1,2 @@ +SELECT + Unicode::SplitToList("\xF0\x90\x8E\x80"u, "\xF0\x90\x8E\x81"u, false AS DelimeterString) |