diff options
author | robot-piglet <robot-piglet@yandex-team.com> | 2023-10-23 15:27:33 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2023-10-23 15:53:51 +0300 |
commit | ecf3635d74967466437d56ee349273d7cbc28690 (patch) | |
tree | dfb66d45e23897ffd3a60b46af7b1a0dfabb80d5 /library | |
parent | 48560f4d9e8d6945601b2e6f4be16f55549e6a7d (diff) | |
download | ydb-ecf3635d74967466437d56ee349273d7cbc28690.tar.gz |
Intermediate changes
Diffstat (limited to 'library')
-rw-r--r-- | library/cpp/charset/codepage_ut.cpp | 227 | ||||
-rw-r--r-- | library/cpp/charset/lite/ut/ya.make | 1 | ||||
-rw-r--r-- | library/cpp/charset/recyr_int_ut.cpp | 160 | ||||
-rw-r--r-- | library/cpp/charset/ut/ya.make | 1 | ||||
-rw-r--r-- | library/cpp/charset/wide_ut.cpp | 82 |
5 files changed, 244 insertions, 227 deletions
diff --git a/library/cpp/charset/codepage_ut.cpp b/library/cpp/charset/codepage_ut.cpp index c3ac3ac478..3cb5bf3837 100644 --- a/library/cpp/charset/codepage_ut.cpp +++ b/library/cpp/charset/codepage_ut.cpp @@ -1,5 +1,4 @@ #include "codepage.h" -#include "recyr.hh" #include "wide.h" #include <library/cpp/testing/unittest/registar.h> @@ -33,28 +32,19 @@ class TCodepageTest: public TTestBase { private: UNIT_TEST_SUITE(TCodepageTest); UNIT_TEST(TestUTF); - UNIT_TEST(TestUTFFromUnknownPlane); - UNIT_TEST(TestBrokenMultibyte); - UNIT_TEST(TestSurrogatePairs); UNIT_TEST(TestEncodingHints); UNIT_TEST(TestToLower); UNIT_TEST(TestToUpper); UNIT_TEST(TestUpperLower); UNIT_TEST(TestBrokenRune); - UNIT_TEST(TestCanEncode); UNIT_TEST_SUITE_END(); public: void TestUTF(); - void TestUTFFromUnknownPlane(); - void TestBrokenMultibyte(); - void TestSurrogatePairs(); void TestEncodingHints(); void TestToLower(); void TestToUpper(); - void TestCanEncode(); - inline void TestUpperLower() { const CodePage* cp = CodePageByCharset(CODES_ASCII); char tmp[100]; @@ -160,144 +150,6 @@ void TCodepageTest::TestUTF() { } } -void TCodepageTest::TestBrokenMultibyte() { - const ECharset cp = CODES_EUC_JP; - - const char sampletext[] = {'\xe3'}; - wchar32 recodeResult[100]; - - size_t nwritten = 0; - size_t nread = 0; - - RECODE_RESULT res = RecodeToUnicode(cp, sampletext, recodeResult, Y_ARRAY_SIZE(sampletext), Y_ARRAY_SIZE(recodeResult), nread, nwritten); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(nread == 1); - UNIT_ASSERT(nwritten == 0); - - const char bigSample[] = {'\xC3', '\x87', '\xC3', '\x8E', '\xC2', '\xB0', '\xC3', '\x85', '\xC3', '\x85', '\xC3', '\xB8'}; - res = RecodeToUnicode(cp, bigSample, recodeResult, Y_ARRAY_SIZE(bigSample), Y_ARRAY_SIZE(recodeResult), nread, nwritten); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(nread == Y_ARRAY_SIZE(bigSample)); -} - -void TCodepageTest::TestUTFFromUnknownPlane() { - static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20, - 0x430, 0x431, 0x432, 0x20, - 0x1001, 0x1002, 0x1003, 0x20, - 0x10001, 0x10002, 0x10003}; - - static const size_t BUFFER_SIZE = 1024; - char bytebuffer[BUFFER_SIZE]; - - size_t readchars = 0; - size_t writtenbytes = 0; - size_t samplelen = Y_ARRAY_SIZE(sampletext); - - RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes); - - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(samplelen == readchars); - - size_t writtenbytes2 = 0; - char bytebuffer2[BUFFER_SIZE]; - for (size_t i = 0; i != samplelen; ++i) { - size_t nwr = 0; - const int res = RecodeFromUnicode(CODES_UTF8, sampletext[i], bytebuffer2 + writtenbytes2, BUFFER_SIZE - writtenbytes2, nwr); - UNIT_ASSERT_VALUES_EQUAL(res, int(RECODE_OK)); - writtenbytes2 += nwr; - UNIT_ASSERT(BUFFER_SIZE > writtenbytes2); - } - UNIT_ASSERT_VALUES_EQUAL(TStringBuf(bytebuffer, writtenbytes), TStringBuf(bytebuffer2, writtenbytes2)); - - wchar32 charbuffer[BUFFER_SIZE]; - size_t readbytes = 0; - size_t writtenchars = 0; - - res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars); - - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(readbytes == writtenbytes); - - wchar32* charbufferend = charbuffer + writtenchars; - DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8); - - UNIT_ASSERT(charbufferend == charbuffer + samplelen); - for (size_t i = 0; i < samplelen; ++i) - UNIT_ASSERT(sampletext[i] == charbuffer[i]); - - // Now, concatenate the thing with an explicit character and retest - res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(readbytes == writtenbytes); - - charbuffer[writtenchars] = 0x1234; - - size_t morewrittenchars = 0; - res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(readbytes == writtenbytes); - UNIT_ASSERT(writtenchars == morewrittenchars); - - charbuffer[2 * writtenchars + 1] = 0x5678; - - charbufferend = charbuffer + 2 * writtenchars + 2; - DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8); - - UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2); - for (size_t i = 0; i < samplelen; ++i) { - UNIT_ASSERT(sampletext[i] == charbuffer[i]); - UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]); - } - UNIT_ASSERT(0x1234 == charbuffer[samplelen]); - UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]); - - // test TChar version - // bytebuffer of len writtenbytes contains sampletext of len samplelen chars in utf8 - TUtf16String wtr = CharToWide(TStringBuf(bytebuffer, writtenbytes), CODES_UNKNOWNPLANE); - TChar* strend = wtr.begin() + wtr.size(); - DecodeUnknownPlane(wtr.begin(), strend, CODES_UTF8); - wtr.resize(strend - wtr.data(), 'Q'); - UNIT_ASSERT_VALUES_EQUAL(wtr.size(), samplelen); - for (size_t i = 0; i < wtr.size(); ++i) { - if (sampletext[i] >= 0x10000) { - UNIT_ASSERT_VALUES_EQUAL(wtr[i], ' '); - } else { - UNIT_ASSERT_VALUES_EQUAL(wtr[i], sampletext[i]); - } - } -} - -static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) { - size_t sSize = strlen(str); - size_t wSize = sSize * 2; - TArrayHolder<wchar16> w(new wchar16[wSize]); - - size_t read = 0; - size_t written = 0; - RECODE_RESULT res = RecodeToUnicode(CODES_UTF8, str, w.Get(), sSize, wSize, read, written); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(read == sSize); - UNIT_ASSERT(written == wideSize); - UNIT_ASSERT(!memcmp(w.Get(), wide, wideSize)); - - TArrayHolder<char> s(new char[sSize]); - res = RecodeFromUnicode(CODES_UTF8, w.Get(), s.Get(), wideSize, sSize, read, written); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(read == wideSize); - UNIT_ASSERT(written == sSize); - UNIT_ASSERT(!memcmp(s.Get(), str, sSize)); -} - -void TCodepageTest::TestSurrogatePairs() { - const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba"; - wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A}; - TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy)); - - const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n"; - wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'}; - TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2)); -} - void TCodepageTest::TestEncodingHints() { UNIT_ASSERT(CODES_WIN == EncodingHintByName("windows-1251")); UNIT_ASSERT(CODES_WIN == EncodingHintByName("Windows1251")); @@ -343,82 +195,3 @@ void TCodepageTest::TestToUpper() { ToUpper(data, n - 1); UNIT_ASSERT(strcmp(data, yandexUpperCase) == 0); } - -static void TestCanEncodeEmpty() { - TWtringBuf empty; - UNIT_ASSERT(CanBeEncoded(empty, CODES_WIN)); - UNIT_ASSERT(CanBeEncoded(empty, CODES_YANDEX)); - UNIT_ASSERT(CanBeEncoded(empty, CODES_UTF8)); -} - -static void TestCanEncodeEach(const TWtringBuf& text, ECharset encoding, bool expectedResult) { - // char by char - for (size_t i = 0; i < text.size(); ++i) { - if (CanBeEncoded(text.SubStr(i, 1), encoding) != expectedResult) - ythrow yexception() << "assertion failed: encoding " << NameByCharset(encoding) - << " on '" << text.SubStr(i, 1) << "' (expected " << expectedResult << ")"; - } - // whole text - UNIT_ASSERT_EQUAL(CanBeEncoded(text, encoding), expectedResult); -} - -void TCodepageTest::TestCanEncode() { - TestCanEncodeEmpty(); - - const TUtf16String lat = u"AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz"; - TestCanEncodeEach(lat, CODES_WIN, true); - TestCanEncodeEach(lat, CODES_YANDEX, true); - TestCanEncodeEach(lat, CODES_UTF8, true); - - const TUtf16String rus = u"АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"; - TestCanEncodeEach(rus, CODES_WIN, true); - TestCanEncodeEach(rus, CODES_YANDEX, true); - TestCanEncodeEach(rus, CODES_UTF8, true); - - const TUtf16String ukr = u"ҐґЄєІіЇї"; - TestCanEncodeEach(ukr, CODES_WIN, true); - TestCanEncodeEach(ukr, CODES_YANDEX, true); - TestCanEncodeEach(ukr, CODES_UTF8, true); - - const TUtf16String pol = u"ĄĆĘŁŃÓŚŹŻąćęłńóśźż"; - TestCanEncodeEach(pol, CODES_WIN, false); - TestCanEncodeEach(pol, CODES_YANDEX, true); - TestCanEncodeEach(pol, CODES_UTF_16BE, true); - - const TUtf16String ger = u"ÄäÖöÜüß"; - TestCanEncodeEach(ger, CODES_WIN, false); - TestCanEncodeEach(ger, CODES_YANDEX, true); - TestCanEncodeEach(ger, CODES_UTF_16LE, true); - - const TUtf16String fra1 = u"éàèùâêîôûëïç"; // supported in yandex cp - const TUtf16String fra2 = u"ÉÀÈÙÂÊÎÔÛËÏŸÿÇ"; - const TUtf16String fra3 = u"Æ挜"; - TestCanEncodeEach(fra1 + fra2 + fra3, CODES_WIN, false); - TestCanEncodeEach(fra1, CODES_YANDEX, true); - TestCanEncodeEach(fra2 + fra3, CODES_YANDEX, false); - TestCanEncodeEach(fra1 + fra2 + fra3, CODES_UTF8, true); - - const TUtf16String kaz = u"ӘәҒғҚқҢңӨөҰұҮүҺһ"; - TestCanEncodeEach(kaz, CODES_WIN, false); - TestCanEncodeEach(kaz, CODES_YANDEX, false); - TestCanEncodeEach(kaz, CODES_UTF8, true); - TestCanEncodeEach(kaz, CODES_KAZWIN, true); - - const TUtf16String tur1 = u"ĞİŞğş"; - const TUtf16String tur = tur1 + u"ı"; - TestCanEncodeEach(tur, CODES_WIN, false); - TestCanEncodeEach(tur, CODES_YANDEX, false); - TestCanEncodeEach(tur, CODES_UTF8, true); - - const TUtf16String chi = u"新隶体新隸體"; - TestCanEncodeEach(chi, CODES_WIN, false); - TestCanEncodeEach(chi, CODES_YANDEX, false); - TestCanEncodeEach(chi, CODES_UTF8, true); - TestCanEncodeEach(chi, CODES_UTF_16LE, true); - - const TUtf16String jap = u"漢字仮字交じり文"; - TestCanEncodeEach(jap, CODES_WIN, false); - TestCanEncodeEach(jap, CODES_YANDEX, false); - TestCanEncodeEach(jap, CODES_UTF8, true); - TestCanEncodeEach(jap, CODES_UTF_16BE, true); -} diff --git a/library/cpp/charset/lite/ut/ya.make b/library/cpp/charset/lite/ut/ya.make index 53c85fae98..b9c6b94382 100644 --- a/library/cpp/charset/lite/ut/ya.make +++ b/library/cpp/charset/lite/ut/ya.make @@ -2,6 +2,7 @@ UNITTEST_FOR(library/cpp/charset/lite) SRCDIR(library/cpp/charset) +CFLAGS(-DLIBRARY_CHARSET_WITHOUT_LIBICONV=yes) SRCS( ci_string_ut.cpp codepage_ut.cpp diff --git a/library/cpp/charset/recyr_int_ut.cpp b/library/cpp/charset/recyr_int_ut.cpp new file mode 100644 index 0000000000..019b7aa1d3 --- /dev/null +++ b/library/cpp/charset/recyr_int_ut.cpp @@ -0,0 +1,160 @@ +#include "codepage.h" +#include "recyr.hh" +#include "wide.h" + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/charset/utf8.h> +#include <util/system/yassert.h> + +class TRecyr_intTest: public TTestBase { +private: + UNIT_TEST_SUITE(TRecyr_intTest); + UNIT_TEST(TestUTFFromUnknownPlane); + UNIT_TEST(TestBrokenMultibyte); + UNIT_TEST(TestSurrogatePairs); + UNIT_TEST_SUITE_END(); + +public: + void TestUTFFromUnknownPlane(); + void TestBrokenMultibyte(); + void TestSurrogatePairs(); +}; + +void TRecyr_intTest::TestBrokenMultibyte() { + const ECharset cp = CODES_EUC_JP; + + const char sampletext[] = {'\xe3'}; + wchar32 recodeResult[100]; + + size_t nwritten = 0; + size_t nread = 0; + + RECODE_RESULT res = RecodeToUnicode(cp, sampletext, recodeResult, Y_ARRAY_SIZE(sampletext), Y_ARRAY_SIZE(recodeResult), nread, nwritten); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(nread == 1); + UNIT_ASSERT(nwritten == 0); + + const char bigSample[] = {'\xC3', '\x87', '\xC3', '\x8E', '\xC2', '\xB0', '\xC3', '\x85', '\xC3', '\x85', '\xC3', '\xB8'}; + res = RecodeToUnicode(cp, bigSample, recodeResult, Y_ARRAY_SIZE(bigSample), Y_ARRAY_SIZE(recodeResult), nread, nwritten); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(nread == Y_ARRAY_SIZE(bigSample)); +} + +void TRecyr_intTest::TestUTFFromUnknownPlane() { + static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20, + 0x430, 0x431, 0x432, 0x20, + 0x1001, 0x1002, 0x1003, 0x20, + 0x10001, 0x10002, 0x10003}; + + static const size_t BUFFER_SIZE = 1024; + char bytebuffer[BUFFER_SIZE]; + + size_t readchars = 0; + size_t writtenbytes = 0; + size_t samplelen = Y_ARRAY_SIZE(sampletext); + + RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes); + + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(samplelen == readchars); + + size_t writtenbytes2 = 0; + char bytebuffer2[BUFFER_SIZE]; + for (size_t i = 0; i != samplelen; ++i) { + size_t nwr = 0; + const int res = RecodeFromUnicode(CODES_UTF8, sampletext[i], bytebuffer2 + writtenbytes2, BUFFER_SIZE - writtenbytes2, nwr); + UNIT_ASSERT_VALUES_EQUAL(res, int(RECODE_OK)); + writtenbytes2 += nwr; + UNIT_ASSERT(BUFFER_SIZE > writtenbytes2); + } + UNIT_ASSERT_VALUES_EQUAL(TStringBuf(bytebuffer, writtenbytes), TStringBuf(bytebuffer2, writtenbytes2)); + + wchar32 charbuffer[BUFFER_SIZE]; + size_t readbytes = 0; + size_t writtenchars = 0; + + res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars); + + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(readbytes == writtenbytes); + + wchar32* charbufferend = charbuffer + writtenchars; + DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8); + + UNIT_ASSERT(charbufferend == charbuffer + samplelen); + for (size_t i = 0; i < samplelen; ++i) + UNIT_ASSERT(sampletext[i] == charbuffer[i]); + + // Now, concatenate the thing with an explicit character and retest + res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(readbytes == writtenbytes); + + charbuffer[writtenchars] = 0x1234; + + size_t morewrittenchars = 0; + res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(readbytes == writtenbytes); + UNIT_ASSERT(writtenchars == morewrittenchars); + + charbuffer[2 * writtenchars + 1] = 0x5678; + + charbufferend = charbuffer + 2 * writtenchars + 2; + DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8); + + UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2); + for (size_t i = 0; i < samplelen; ++i) { + UNIT_ASSERT(sampletext[i] == charbuffer[i]); + UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]); + } + UNIT_ASSERT(0x1234 == charbuffer[samplelen]); + UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]); + + // test TChar version + // bytebuffer of len writtenbytes contains sampletext of len samplelen chars in utf8 + TUtf16String wtr = CharToWide(TStringBuf(bytebuffer, writtenbytes), CODES_UNKNOWNPLANE); + TChar* strend = wtr.begin() + wtr.size(); + DecodeUnknownPlane(wtr.begin(), strend, CODES_UTF8); + wtr.resize(strend - wtr.data(), 'Q'); + UNIT_ASSERT_VALUES_EQUAL(wtr.size(), samplelen); + for (size_t i = 0; i < wtr.size(); ++i) { + if (sampletext[i] >= 0x10000) { + UNIT_ASSERT_VALUES_EQUAL(wtr[i], ' '); + } else { + UNIT_ASSERT_VALUES_EQUAL(wtr[i], sampletext[i]); + } + } +} + +static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) { + size_t sSize = strlen(str); + size_t wSize = sSize * 2; + TArrayHolder<wchar16> w(new wchar16[wSize]); + + size_t read = 0; + size_t written = 0; + RECODE_RESULT res = RecodeToUnicode(CODES_UTF8, str, w.Get(), sSize, wSize, read, written); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(read == sSize); + UNIT_ASSERT(written == wideSize); + UNIT_ASSERT(!memcmp(w.Get(), wide, wideSize)); + + TArrayHolder<char> s(new char[sSize]); + res = RecodeFromUnicode(CODES_UTF8, w.Get(), s.Get(), wideSize, sSize, read, written); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(read == wideSize); + UNIT_ASSERT(written == sSize); + UNIT_ASSERT(!memcmp(s.Get(), str, sSize)); +} + +void TRecyr_intTest::TestSurrogatePairs() { + const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba"; + wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A}; + TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy)); + + const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n"; + wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'}; + TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2)); +} diff --git a/library/cpp/charset/ut/ya.make b/library/cpp/charset/ut/ya.make index 38f029cafc..c22436f8ac 100644 --- a/library/cpp/charset/ut/ya.make +++ b/library/cpp/charset/ut/ya.make @@ -2,6 +2,7 @@ UNITTEST_FOR(library/cpp/charset) SRCS( iconv_ut.cpp + recyr_int_ut.cpp wide_ut.cpp ) diff --git a/library/cpp/charset/wide_ut.cpp b/library/cpp/charset/wide_ut.cpp index 78947d51ba..93567161ba 100644 --- a/library/cpp/charset/wide_ut.cpp +++ b/library/cpp/charset/wide_ut.cpp @@ -130,6 +130,7 @@ private: UNIT_TEST(TestRecodeAppend); UNIT_TEST(TestRecode); UNIT_TEST(TestUnicodeLimit); + UNIT_TEST(TestCanEncode); UNIT_TEST_SUITE_END(); public: @@ -147,6 +148,8 @@ public: void TestRecodeAppend(); void TestRecode(); void TestUnicodeLimit(); + + void TestCanEncode(); }; UNIT_TEST_SUITE_REGISTRATION(TConversionTest); @@ -397,3 +400,82 @@ void TConversionTest::TestUnicodeLimit() { } } } + +static void TestCanEncodeEmpty() { + TWtringBuf empty; + UNIT_ASSERT(CanBeEncoded(empty, CODES_WIN)); + UNIT_ASSERT(CanBeEncoded(empty, CODES_YANDEX)); + UNIT_ASSERT(CanBeEncoded(empty, CODES_UTF8)); +} + +static void TestCanEncodeEach(const TWtringBuf& text, ECharset encoding, bool expectedResult) { + // char by char + for (size_t i = 0; i < text.size(); ++i) { + if (CanBeEncoded(text.SubStr(i, 1), encoding) != expectedResult) + ythrow yexception() << "assertion failed: encoding " << NameByCharset(encoding) + << " on '" << text.SubStr(i, 1) << "' (expected " << expectedResult << ")"; + } + // whole text + UNIT_ASSERT_EQUAL(CanBeEncoded(text, encoding), expectedResult); +} + +void TConversionTest::TestCanEncode() { + TestCanEncodeEmpty(); + + const TUtf16String lat = u"AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz"; + TestCanEncodeEach(lat, CODES_WIN, true); + TestCanEncodeEach(lat, CODES_YANDEX, true); + TestCanEncodeEach(lat, CODES_UTF8, true); + + const TUtf16String rus = u"АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"; + TestCanEncodeEach(rus, CODES_WIN, true); + TestCanEncodeEach(rus, CODES_YANDEX, true); + TestCanEncodeEach(rus, CODES_UTF8, true); + + const TUtf16String ukr = u"ҐґЄєІіЇї"; + TestCanEncodeEach(ukr, CODES_WIN, true); + TestCanEncodeEach(ukr, CODES_YANDEX, true); + TestCanEncodeEach(ukr, CODES_UTF8, true); + + const TUtf16String pol = u"ĄĆĘŁŃÓŚŹŻąćęłńóśźż"; + TestCanEncodeEach(pol, CODES_WIN, false); + TestCanEncodeEach(pol, CODES_YANDEX, true); + TestCanEncodeEach(pol, CODES_UTF_16BE, true); + + const TUtf16String ger = u"ÄäÖöÜüß"; + TestCanEncodeEach(ger, CODES_WIN, false); + TestCanEncodeEach(ger, CODES_YANDEX, true); + TestCanEncodeEach(ger, CODES_UTF_16LE, true); + + const TUtf16String fra1 = u"éàèùâêîôûëïç"; // supported in yandex cp + const TUtf16String fra2 = u"ÉÀÈÙÂÊÎÔÛËÏŸÿÇ"; + const TUtf16String fra3 = u"Æ挜"; + TestCanEncodeEach(fra1 + fra2 + fra3, CODES_WIN, false); + TestCanEncodeEach(fra1, CODES_YANDEX, true); + TestCanEncodeEach(fra2 + fra3, CODES_YANDEX, false); + TestCanEncodeEach(fra1 + fra2 + fra3, CODES_UTF8, true); + + const TUtf16String kaz = u"ӘәҒғҚқҢңӨөҰұҮүҺһ"; + TestCanEncodeEach(kaz, CODES_WIN, false); + TestCanEncodeEach(kaz, CODES_YANDEX, false); + TestCanEncodeEach(kaz, CODES_UTF8, true); + TestCanEncodeEach(kaz, CODES_KAZWIN, true); + + const TUtf16String tur1 = u"ĞİŞğş"; + const TUtf16String tur = tur1 + u"ı"; + TestCanEncodeEach(tur, CODES_WIN, false); + TestCanEncodeEach(tur, CODES_YANDEX, false); + TestCanEncodeEach(tur, CODES_UTF8, true); + + const TUtf16String chi = u"新隶体新隸體"; + TestCanEncodeEach(chi, CODES_WIN, false); + TestCanEncodeEach(chi, CODES_YANDEX, false); + TestCanEncodeEach(chi, CODES_UTF8, true); + TestCanEncodeEach(chi, CODES_UTF_16LE, true); + + const TUtf16String jap = u"漢字仮字交じり文"; + TestCanEncodeEach(jap, CODES_WIN, false); + TestCanEncodeEach(jap, CODES_YANDEX, false); + TestCanEncodeEach(jap, CODES_UTF8, true); + TestCanEncodeEach(jap, CODES_UTF_16BE, true); +} |