diff options
author | galaxycrab <[email protected]> | 2022-11-08 19:35:31 +0300 |
---|---|---|
committer | galaxycrab <[email protected]> | 2022-11-08 19:35:31 +0300 |
commit | 136a1044ad7b5d26a8b48cf9c66a8b71e4a90bb5 (patch) | |
tree | 411dca85c3590e3da06783dfb2ddedac5a38f2e6 | |
parent | 9e7da59d8ee6574244def13568865416e89a5a07 (diff) |
Save unicode text during escaping of yql issues
5 files changed, 52 insertions, 33 deletions
diff --git a/ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt b/ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt index 3412ddc0591..e4a5be17361 100644 --- a/ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt +++ b/ydb/library/yql/public/issue/ut/CMakeLists.darwin.txt @@ -17,6 +17,7 @@ target_link_libraries(ydb-library-yql-public-issue-ut PUBLIC library-cpp-cpuid_check cpp-testing-unittest_main yql-public-issue + cpp-unicode-normalization ) target_link_options(ydb-library-yql-public-issue-ut PRIVATE -Wl,-no_deduplicate diff --git a/ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt b/ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt index 58987d4623c..8bb65cc22b8 100644 --- a/ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt +++ b/ydb/library/yql/public/issue/ut/CMakeLists.linux-aarch64.txt @@ -17,6 +17,7 @@ target_link_libraries(ydb-library-yql-public-issue-ut PUBLIC library-cpp-lfalloc cpp-testing-unittest_main yql-public-issue + cpp-unicode-normalization ) target_link_options(ydb-library-yql-public-issue-ut PRIVATE -ldl diff --git a/ydb/library/yql/public/issue/ut/CMakeLists.linux.txt b/ydb/library/yql/public/issue/ut/CMakeLists.linux.txt index fd1415fd123..916bff23c76 100644 --- a/ydb/library/yql/public/issue/ut/CMakeLists.linux.txt +++ b/ydb/library/yql/public/issue/ut/CMakeLists.linux.txt @@ -19,6 +19,7 @@ target_link_libraries(ydb-library-yql-public-issue-ut PUBLIC library-cpp-cpuid_check cpp-testing-unittest_main yql-public-issue + cpp-unicode-normalization ) target_link_options(ydb-library-yql-public-issue-ut PRIVATE -ldl diff --git a/ydb/library/yql/public/issue/yql_issue.cpp b/ydb/library/yql/public/issue/yql_issue.cpp index b8d76a09c41..3446ce56f1e 100644 --- a/ydb/library/yql/public/issue/yql_issue.cpp +++ b/ydb/library/yql/public/issue/yql_issue.cpp @@ -5,6 +5,7 @@ #include <library/cpp/colorizer/output.h> +#include <util/charset/utf8.h> #include <util/string/ascii.h> #include <util/string/split.h> #include 
<util/string/strip.h> @@ -19,11 +20,23 @@ namespace NYql { void SanitizeNonAscii(TString& s) { if (!NYql::IsUtf8(s)) { - for (size_t i = 0; i < s.size(); ++i) { - if (!IsAscii(s[i])) { - s[i] = '?'; + TString escaped; + escaped.reserve(s.size()); + const unsigned char* i = reinterpret_cast<const unsigned char*>(s.data()); + const unsigned char* end = i + s.size(); + while (i < end) { + wchar32 rune; + size_t runeLen; + const RECODE_RESULT result = SafeReadUTF8Char(rune, runeLen, i, end); + if (result == RECODE_OK) { + escaped.insert(escaped.end(), reinterpret_cast<const char*>(i), reinterpret_cast<const char*>(i + runeLen)); + i += runeLen; + } else { + escaped.push_back('?'); + ++i; } } + s = escaped; } } diff --git a/ydb/library/yql/public/issue/yql_issue_ut.cpp b/ydb/library/yql/public/issue/yql_issue_ut.cpp index 0b13e4c2fc2..5b14faf3151 100644 --- a/ydb/library/yql/public/issue/yql_issue_ut.cpp +++ b/ydb/library/yql/public/issue/yql_issue_ut.cpp @@ -6,7 +6,11 @@ #include <ydb/library/yql/public/issue/yql_issue_message.h> #include <ydb/public/api/protos/ydb_issue_message.pb.h> +#include <library/cpp/unicode/normalization/normalization.h> + #include <util/charset/utf8.h> +#include <util/charset/wide.h> +#include <util/string/builder.h> #include <google/protobuf/message.h> #include <google/protobuf/descriptor.h> @@ -143,37 +147,10 @@ Y_UNIT_TEST_SUITE(ToOneLineStringTest) { Y_UNIT_TEST_SUITE(ToMessage) { Y_UNIT_TEST(NonUtf8) { - TString s; - int chars[] = { - 0x7f, - 0xf8, - 0xf7, - 0xff, - 0xf8, - 0x1f, - 0xff, - 0xf2, - 0xaf, - 0xbf, - 0xfe, - 0xfa, - 0xf5, - 0x7f, - 0xfe, - 0xfa, - 0x27, - 0x20, - 0x7d, - 0x20, - 0x5d, - 0x2e - }; - for (int i : chars) { - s.append(static_cast<char>(i)); - } - UNIT_ASSERT(!IsUtf(s)); + const TString nonUtf8String = "\x7f\xf8\xf7\xff\xf8\x1f\xff\xf2\xaf\xbf\xfe\xfa\xf5\x7f\xfe\xfa\x27\x20\x7d\x20\x5d\x2e"; + UNIT_ASSERT(!IsUtf(nonUtf8String)); TIssue issue; - issue.SetMessage(s); + issue.SetMessage(nonUtf8String); 
Ydb::Issue::IssueMessage msg; IssueToMessage(issue, &msg); @@ -183,3 +160,29 @@ Y_UNIT_TEST_SUITE(ToMessage) { UNIT_ASSERT(msg2.ParseFromString(serialized)); } } + +Y_UNIT_TEST_SUITE(EscapeNonUtf8) { + Y_UNIT_TEST(Escape) { + const TString nonUtf8String = "\xfe\xfa\xf5\xc2"; + UNIT_ASSERT(!IsUtf(nonUtf8String)); + + // Check that our escaping correctly processes unicode pairs + const TString toNormalize = "Ёлка"; + const TString nfd = WideToUTF8(Normalize<NUnicode::ENormalization::NFD>(UTF8ToWide(toNormalize))); // dots over 'ё' will be a separate unicode symbol + const TString nfc = WideToUTF8(Normalize<NUnicode::ENormalization::NFC>(UTF8ToWide(toNormalize))); // dots over 'ё' will be combined with their letter + UNIT_ASSERT_STRINGS_UNEQUAL(nfc, nfd); + std::pair<TString, TString> nonUtf8Messages[] = { + { nonUtf8String, "????" }, + { TStringBuilder() << nonUtf8String << "Failed to parse file " << nonUtf8String << "עברית" << nonUtf8String, "????Failed to parse file ????עברית????" }, + { nfd, nfd }, + { nfc, nfc }, + { TStringBuilder() << nfc << nonUtf8String << nfd, TStringBuilder() << nfc << "????" << nfd }, + { TStringBuilder() << nfd << nonUtf8String << nfc, TStringBuilder() << nfd << "????" << nfc }, + }; + + for (const auto& [src, dst] : nonUtf8Messages) { + TIssue issue(src); + UNIT_ASSERT_STRINGS_EQUAL(issue.GetMessage(), dst); + } + } +} |