summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorgalaxycrab <[email protected]>2022-11-08 19:35:31 +0300
committergalaxycrab <[email protected]>2022-11-08 19:35:31 +0300
commit136a1044ad7b5d26a8b48cf9c66a8b71e4a90bb5 (patch)
tree411dca85c3590e3da06783dfb2ddedac5a38f2e6
parent9e7da59d8ee6574244def13568865416e89a5a07 (diff)
Save unicode text during escaping of yql issues
-rw-r--r--ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt1
-rw-r--r--ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt1
-rw-r--r--ydb/library/yql/public/issue/ut/CMakeLists.linux.txt1
-rw-r--r--ydb/library/yql/public/issue/yql_issue.cpp19
-rw-r--r--ydb/library/yql/public/issue/yql_issue_ut.cpp63
5 files changed, 52 insertions, 33 deletions
diff --git a/ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt b/ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt
index 3412ddc0591..e4a5be17361 100644
--- a/ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt
+++ b/ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt
@@ -17,6 +17,7 @@ target_link_libraries(ydb-library-yql-public-issue-ut PUBLIC
library-cpp-cpuid_check
cpp-testing-unittest_main
yql-public-issue
+ cpp-unicode-normalization
)
target_link_options(ydb-library-yql-public-issue-ut PRIVATE
-Wl,-no_deduplicate
diff --git a/ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt b/ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt
index 58987d4623c..8bb65cc22b8 100644
--- a/ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt
+++ b/ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt
@@ -17,6 +17,7 @@ target_link_libraries(ydb-library-yql-public-issue-ut PUBLIC
library-cpp-lfalloc
cpp-testing-unittest_main
yql-public-issue
+ cpp-unicode-normalization
)
target_link_options(ydb-library-yql-public-issue-ut PRIVATE
-ldl
diff --git a/ydb/library/yql/public/issue/ut/CMakeLists.linux.txt b/ydb/library/yql/public/issue/ut/CMakeLists.linux.txt
index fd1415fd123..916bff23c76 100644
--- a/ydb/library/yql/public/issue/ut/CMakeLists.linux.txt
+++ b/ydb/library/yql/public/issue/ut/CMakeLists.linux.txt
@@ -19,6 +19,7 @@ target_link_libraries(ydb-library-yql-public-issue-ut PUBLIC
library-cpp-cpuid_check
cpp-testing-unittest_main
yql-public-issue
+ cpp-unicode-normalization
)
target_link_options(ydb-library-yql-public-issue-ut PRIVATE
-ldl
diff --git a/ydb/library/yql/public/issue/yql_issue.cpp b/ydb/library/yql/public/issue/yql_issue.cpp
index b8d76a09c41..3446ce56f1e 100644
--- a/ydb/library/yql/public/issue/yql_issue.cpp
+++ b/ydb/library/yql/public/issue/yql_issue.cpp
@@ -5,6 +5,7 @@
#include <library/cpp/colorizer/output.h>
+#include <util/charset/utf8.h>
#include <util/string/ascii.h>
#include <util/string/split.h>
#include <util/string/strip.h>
@@ -19,11 +20,23 @@ namespace NYql {
void SanitizeNonAscii(TString& s) {
if (!NYql::IsUtf8(s)) {
- for (size_t i = 0; i < s.size(); ++i) {
- if (!IsAscii(s[i])) {
- s[i] = '?';
+ TString escaped; // sanitized copy of s, built up piece by piece
+ escaped.reserve(s.size()); // every iteration below appends exactly as many bytes as it consumes
+ const unsigned char* i = reinterpret_cast<const unsigned char*>(s.data());
+ const unsigned char* end = i + s.size();
+ while (i < end) {
+ wchar32 rune;
+ size_t runeLen;
+ const RECODE_RESULT result = SafeReadUTF8Char(rune, runeLen, i, end); // presumably decodes one UTF-8 character from [i, end) and reports its byte length in runeLen
+ if (result == RECODE_OK) {
+ escaped.insert(escaped.end(), reinterpret_cast<const char*>(i), reinterpret_cast<const char*>(i + runeLen)); // keep the successfully read bytes verbatim
+ i += runeLen;
+ } else {
+ escaped.push_back('?'); // read failed at this position: emit '?' in place of this byte
+ ++i; // advance a single byte and retry from the next one
}
}
+ s = escaped;
}
}
diff --git a/ydb/library/yql/public/issue/yql_issue_ut.cpp b/ydb/library/yql/public/issue/yql_issue_ut.cpp
index 0b13e4c2fc2..5b14faf3151 100644
--- a/ydb/library/yql/public/issue/yql_issue_ut.cpp
+++ b/ydb/library/yql/public/issue/yql_issue_ut.cpp
@@ -6,7 +6,11 @@
#include <ydb/library/yql/public/issue/yql_issue_message.h>
#include <ydb/public/api/protos/ydb_issue_message.pb.h>
+#include <library/cpp/unicode/normalization/normalization.h>
+
#include <util/charset/utf8.h>
+#include <util/charset/wide.h>
+#include <util/string/builder.h>
#include <google/protobuf/message.h>
#include <google/protobuf/descriptor.h>
@@ -143,37 +147,10 @@ Y_UNIT_TEST_SUITE(ToOneLineStringTest) {
Y_UNIT_TEST_SUITE(ToMessage) {
Y_UNIT_TEST(NonUtf8) {
- TString s;
- int chars[] = {
- 0x7f,
- 0xf8,
- 0xf7,
- 0xff,
- 0xf8,
- 0x1f,
- 0xff,
- 0xf2,
- 0xaf,
- 0xbf,
- 0xfe,
- 0xfa,
- 0xf5,
- 0x7f,
- 0xfe,
- 0xfa,
- 0x27,
- 0x20,
- 0x7d,
- 0x20,
- 0x5d,
- 0x2e
- };
- for (int i : chars) {
- s.append(static_cast<char>(i));
- }
- UNIT_ASSERT(!IsUtf(s));
+ const TString nonUtf8String = "\x7f\xf8\xf7\xff\xf8\x1f\xff\xf2\xaf\xbf\xfe\xfa\xf5\x7f\xfe\xfa\x27\x20\x7d\x20\x5d\x2e";
+ UNIT_ASSERT(!IsUtf(nonUtf8String));
TIssue issue;
- issue.SetMessage(s);
+ issue.SetMessage(nonUtf8String);
Ydb::Issue::IssueMessage msg;
IssueToMessage(issue, &msg);
@@ -183,3 +160,29 @@ Y_UNIT_TEST_SUITE(ToMessage) {
UNIT_ASSERT(msg2.ParseFromString(serialized));
}
}
+
+Y_UNIT_TEST_SUITE(EscapeNonUtf8) {
+ Y_UNIT_TEST(Escape) {
+ const TString nonUtf8String = "\xfe\xfa\xf5\xc2";
+ UNIT_ASSERT(!IsUtf(nonUtf8String));
+
+ // Check that escaping leaves both normalization forms of the same text (NFD and NFC) untouched, alone and next to invalid bytes
+ const TString toNormalize = "Ёлка";
+ const TString nfd = WideToUTF8(Normalize<NUnicode::ENormalization::NFD>(UTF8ToWide(toNormalize))); // NFD: the dots over 'ё' will be a separate unicode symbol
+ const TString nfc = WideToUTF8(Normalize<NUnicode::ENormalization::NFC>(UTF8ToWide(toNormalize))); // NFC: the dots over 'ё' will stay together with their letter
+ UNIT_ASSERT_STRINGS_UNEQUAL(nfc, nfd);
+ std::pair<TString, TString> nonUtf8Messages[] = {
+ { nonUtf8String, "????" },
+ { TStringBuilder() << nonUtf8String << "Failed to parse file " << nonUtf8String << "עברית" << nonUtf8String, "????Failed to parse file ????עברית????" },
+ { nfd, nfd },
+ { nfc, nfc },
+ { TStringBuilder() << nfc << nonUtf8String << nfd, TStringBuilder() << nfc << "????" << nfd },
+ { TStringBuilder() << nfd << nonUtf8String << nfc, TStringBuilder() << nfd << "????" << nfc },
+ };
+
+ for (const auto& [src, dst] : nonUtf8Messages) {
+ TIssue issue(src);
+ UNIT_ASSERT_STRINGS_EQUAL(issue.GetMessage(), dst);
+ }
+ }
+}