about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: atarasov5 <atarasov5@yandex-team.com>, 2025-02-05 15:33:55 +0300
committer: atarasov5 <atarasov5@yandex-team.com>, 2025-02-05 15:57:41 +0300
commit: cb5cc9d6099ffd7e87ece77981034b0d3999baae (patch)
tree: 6405cfc862daa4adbc251569725188fc854ec1b2
parent: b0d84c66847b173adfa4330121bef3a9150edf2c (diff)
download: ydb-cb5cc9d6099ffd7e87ece77981034b0d3999baae.tar.gz
YQL-19551: Fix utf16 encoding problems
commit_hash:a55c7dadcacd5aac18465edf6e6cee8fb77dfcc0
-rw-r--r-- yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h (12 lines changed)
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/result.json (10 lines changed)
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_LevensteinDistanceCodePoints_/results.txt (28 lines changed)
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/canondata/test.test_SplitToListNoCrash_/results.txt (33 lines changed)
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/LevensteinDistanceCodePoints.sql (2 lines changed)
-rw-r--r-- yql/essentials/udfs/common/unicode_base/test/cases/SplitToListNoCrash.sql (2 lines changed)
6 files changed, 81 insertions, 6 deletions
diff --git a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
index 52bfbb2785..4a852a5a6f 100644
--- a/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
+++ b/yql/essentials/udfs/common/unicode_base/lib/unicode_base_udf.h
@@ -218,7 +218,7 @@ namespace {
static void SplitToListImpl(
const IValueBuilder* valueBuilder,
const TUnboxedValue& input,
- const TUtf16String::const_iterator start,
+ const TUtf32String::const_iterator start,
const TIt& it,
TTmpVector& result) {
const std::string_view& original = input.AsStringRef();
@@ -281,8 +281,8 @@ namespace {
SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
}
} else {
- const auto& input = UTF8ToWide(args[0].AsStringRef());
- const auto& delimeter = UTF8ToWide(args[1].AsStringRef());
+ const auto& input = UTF8ToUTF32<true>(args[0].AsStringRef());
+ const auto& delimeter = UTF8ToUTF32<true>(args[1].AsStringRef());
if (limit) {
auto it = StringSplitter(input).SplitBySet(delimeter.c_str()).Limit(limit + 1);
SplitToListImpl(valueBuilder, args[0], input.cbegin(), it, skipEmpty, result);
@@ -313,9 +313,9 @@ namespace {
Y_UNUSED(valueBuilder);
const TStringBuf left(args[0].AsStringRef());
const TStringBuf right(args[1].AsStringRef());
- const TUtf16String& leftWide = UTF8ToWide(left);
- const TUtf16String& rightWide = UTF8ToWide(right);
- const ui64 result = NLevenshtein::Distance(leftWide, rightWide);
+ const auto& leftUtf32 = UTF8ToUTF32<true>(left);
+ const auto& rightUtf32 = UTF8ToUTF32<true>(right);
+ const ui64 result = NLevenshtein::Distance(leftUtf32, rightUtf32);
return TUnboxedValuePod(result);
}
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
index c9012f2333..8d19afc428 100644
--- a/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/result.json
@@ -9,6 +9,11 @@
"uri": "file://test.test_IsCategory_/results.txt"
}
],
+ "test.test[LevensteinDistanceCodePoints]": [
+ {
+ "uri": "file://test.test_LevensteinDistanceCodePoints_/results.txt"
+ }
+ ],
"test.test[List]": [
{
"uri": "file://test.test_List_/results.txt"
@@ -24,6 +29,11 @@
"uri": "file://test.test_Replace_/results.txt"
}
],
+ "test.test[SplitToListNoCrash]": [
+ {
+ "uri": "file://test.test_SplitToListNoCrash_/results.txt"
+ }
+ ],
"test.test[Strip]": [
{
"uri": "file://test.test_Strip_/results.txt"
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_LevensteinDistanceCodePoints_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_LevensteinDistanceCodePoints_/results.txt
new file mode 100644
index 0000000000..0f35bd34bd
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_LevensteinDistanceCodePoints_/results.txt
@@ -0,0 +1,28 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "column0";
+ [
+ "DataType";
+ "Uint64"
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ "1"
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_SplitToListNoCrash_/results.txt b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_SplitToListNoCrash_/results.txt
new file mode 100644
index 0000000000..8516052a94
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/canondata/test.test_SplitToListNoCrash_/results.txt
@@ -0,0 +1,33 @@
+[
+ {
+ "Write" = [
+ {
+ "Type" = [
+ "ListType";
+ [
+ "StructType";
+ [
+ [
+ "column0";
+ [
+ "ListType";
+ [
+ "DataType";
+ "Utf8"
+ ]
+ ]
+ ]
+ ]
+ ]
+ ];
+ "Data" = [
+ [
+ [
+ "\xF0\x90\x8E\x80"
+ ]
+ ]
+ ]
+ }
+ ]
+ }
+] \ No newline at end of file
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/LevensteinDistanceCodePoints.sql b/yql/essentials/udfs/common/unicode_base/test/cases/LevensteinDistanceCodePoints.sql
new file mode 100644
index 0000000000..ec072b726a
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/LevensteinDistanceCodePoints.sql
@@ -0,0 +1,2 @@
+SELECT
+ Unicode::LevensteinDistance("\xF0\x90\x8E\x80"u, "\xF0\x9B\x80\x80"u)
diff --git a/yql/essentials/udfs/common/unicode_base/test/cases/SplitToListNoCrash.sql b/yql/essentials/udfs/common/unicode_base/test/cases/SplitToListNoCrash.sql
new file mode 100644
index 0000000000..643292ea0b
--- /dev/null
+++ b/yql/essentials/udfs/common/unicode_base/test/cases/SplitToListNoCrash.sql
@@ -0,0 +1,2 @@
+SELECT
+ Unicode::SplitToList("\xF0\x90\x8E\x80"u, "\xF0\x90\x8E\x81"u, false AS DelimeterString)