diff options
author | art-snake <art-snake@yandex-team.ru> | 2022-02-10 16:50:35 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:50:35 +0300 |
commit | dc5517df41fe6319ff249956fe5650e4bbc3e660 (patch) | |
tree | 5d5cb817648f650d76cf1076100726fd9b8448e8 /util/charset | |
parent | 1700010e2088971894d12a7a16d6004866f986fd (diff) | |
download | ydb-dc5517df41fe6319ff249956fe5650e4bbc3e660.tar.gz |
Restoring authorship annotation for <art-snake@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'util/charset')
-rw-r--r-- | util/charset/utf8.cpp | 188 | ||||
-rw-r--r-- | util/charset/utf8.h | 16 | ||||
-rw-r--r-- | util/charset/utf8_ut.cpp | 74 |
3 files changed, 139 insertions, 139 deletions
diff --git a/util/charset/utf8.cpp b/util/charset/utf8.cpp index 21ed1adcc6..efe3a52f61 100644 --- a/util/charset/utf8.cpp +++ b/util/charset/utf8.cpp @@ -1,87 +1,87 @@ #include "unidata.h" #include "utf8.h" -namespace { - enum class ECaseConversion { - ToUpper, - ToLower, - }; - - wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) { - switch (conversion) { - case ECaseConversion::ToUpper: - return ToUpper(ch); - case ECaseConversion::ToLower: - return ToLower(ch); - } - Y_ASSERT(false); // NOTREACHED - return 0; - } - - bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n, - TString& newString) { - const unsigned char* p = (const unsigned char*)beg; - const unsigned char* const end = p + n; - - // first loop searches for the first character, which is changed by ConvertChar - // if there is no changed character, we don't need reallocation/copy - wchar32 cNew = 0; - size_t cLen = 0; - while (p < end) { - wchar32 c; - if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { - ythrow yexception() - << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); - } - cNew = ConvertChar(conversion, c); - - if (cNew != c) - break; - p += cLen; - } - if (p == end) { - return false; - } - - // some character changed after ToLower. Write new string to newString. - newString.resize(n); - - size_t written = (char*)p - beg; - char* writePtr = newString.begin(); - memcpy(writePtr, beg, written); - writePtr += written; - size_t destSpace = n - written; - - // before each iteration (including the first one) variable 'cNew' contains unwritten symbol - while (true) { - size_t cNewLen; +namespace { + enum class ECaseConversion { + ToUpper, + ToLower, + }; + + wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) { + switch (conversion) { + case ECaseConversion::ToUpper: + return ToUpper(ch); + case ECaseConversion::ToLower: + return ToLower(ch); + } + Y_ASSERT(false); // NOTREACHED + return 0; + } + + bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n, + TString& newString) { + const unsigned char* p = (const unsigned char*)beg; + const unsigned char* const end = p + n; + + // first loop searches for the first character, which is changed by ConvertChar + // if there is no changed character, we don't need reallocation/copy + wchar32 cNew = 0; + size_t cLen = 0; + while (p < end) { + wchar32 c; + if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { + ythrow yexception() + << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); + } + cNew = ConvertChar(conversion, c); + + if (cNew != c) + break; + p += cLen; + } + if (p == end) { + return false; + } + + // some character changed after ToLower. Write new string to newString. + newString.resize(n); + + size_t written = (char*)p - beg; + char* writePtr = newString.begin(); + memcpy(writePtr, beg, written); + writePtr += written; + size_t destSpace = n - written; + + // before each iteration (including the first one) variable 'cNew' contains unwritten symbol + while (true) { + size_t cNewLen; Y_ASSERT((writePtr - newString.data()) + destSpace == newString.size()); - if (RECODE_EOOUTPUT == - SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) { + if (RECODE_EOOUTPUT == + SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) { destSpace += newString.size(); newString.resize(newString.size() * 2); writePtr = newString.begin() + (newString.size() - destSpace); - continue; - } - destSpace -= cNewLen; - writePtr += cNewLen; - p += cLen; - if (p == end) { + continue; + } + destSpace -= cNewLen; + writePtr += cNewLen; + p += cLen; + if (p == end) { newString.resize(newString.size() - destSpace); - return true; - } - wchar32 c = 0; - if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { - ythrow yexception() - << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); - } - cNew = ConvertChar(conversion, c); - } - Y_ASSERT(false); - return false; - } -} // namespace - + return true; + } + wchar32 c = 0; + if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { + ythrow yexception() + << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); + } + cNew = ConvertChar(conversion, c); + } + Y_ASSERT(false); + return false; + } +} // namespace + extern const wchar32 BROKEN_RUNE = 0xFFFD; static const char* SkipUTF8Chars(const char* begin, const char* end, size_t numChars) { @@ -130,7 +130,7 @@ EUTF8Detect UTF8Detect(const char* s, size_t len) { } bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString) { - return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString); + return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString); } TString ToLowerUTF8(const TString& s) { @@ -148,23 +148,23 @@ TString ToLowerUTF8(TStringBuf s) { TString ToLowerUTF8(const char* s) { return ToLowerUTF8(TStringBuf(s)); } - -bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) { - return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString); -} - -TString ToUpperUTF8(const TString& s) { - TString newString; + +bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) { + return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString); +} + +TString ToUpperUTF8(const TString& s) { + TString newString; bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString); - return changed ? newString : s; -} - -TString ToUpperUTF8(TStringBuf s) { - TString newString; + return changed ? newString : s; +} + +TString ToUpperUTF8(TStringBuf s) { + TString newString; bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString); return changed ? newString : TString(s.data(), s.size()); -} - -TString ToUpperUTF8(const char* s) { - return ToUpperUTF8(TStringBuf(s)); -} +} + +TString ToUpperUTF8(const char* s) { + return ToUpperUTF8(TStringBuf(s)); +} diff --git a/util/charset/utf8.h b/util/charset/utf8.h index 5250bbeab2..5039b46ae9 100644 --- a/util/charset/utf8.h +++ b/util/charset/utf8.h @@ -374,15 +374,15 @@ bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString); TString ToLowerUTF8(const TString& s); TString ToLowerUTF8(TStringBuf s); TString ToLowerUTF8(const char* s); - + inline TString ToLowerUTF8(const std::string& s) { return ToLowerUTF8(TStringBuf(s)); } -//! returns true, if result is not the same as input, and put it in newString -//! returns false, if result is unmodified -bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString); - -TString ToUpperUTF8(const TString& s); -TString ToUpperUTF8(TStringBuf s); -TString ToUpperUTF8(const char* s); +//! returns true, if result is not the same as input, and put it in newString +//! returns false, if result is unmodified +bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString); + +TString ToUpperUTF8(const TString& s); +TString ToUpperUTF8(TStringBuf s); +TString ToUpperUTF8(const char* s); diff --git a/util/charset/utf8_ut.cpp b/util/charset/utf8_ut.cpp index 8cbb844dc7..9e68881cca 100644 --- a/util/charset/utf8_ut.cpp +++ b/util/charset/utf8_ut.cpp @@ -52,46 +52,46 @@ Y_UNIT_TEST_SUITE(TUtfUtilTest) { } } - Y_UNIT_TEST(TestToUpperUtfString) { - UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8("xyz XYZ привет!"), "XYZ XYZ ПРИВЕТ!"); - + Y_UNIT_TEST(TestToUpperUtfString) { + UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8("xyz XYZ привет!"), "XYZ XYZ ПРИВЕТ!"); + UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(TStringBuf("XYZ")), "XYZ"); - - { - TString s = "ПРИВЕТ!"; - TString q = "привет!"; - TString tmp; + + { + TString s = "ПРИВЕТ!"; + TString q = "привет!"; + TString tmp; UNIT_ASSERT(ToUpperUTF8Impl(s.data(), s.size(), tmp) == false); UNIT_ASSERT(ToUpperUTF8Impl(q.data(), q.size(), tmp) == true); - } - - { - const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3 - const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1 - TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird}; - const int N = Y_ARRAY_SIZE(chars); - //try all combinations of these letters. - int numberOfVariants = 1; - for (int len = 0; len <= 4; ++len) { - for (int i = 0; i < numberOfVariants; ++i) { - TString s; - int k = i; - for (int j = 0; j < len; ++j) { - //Treat 'i' like number in base-N system with digits from 'chars'-array - s += chars[k % N]; - k /= N; - } - - TUtf16String tmp = UTF8ToWide(s); - tmp.to_upper(); - - UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(s), WideToUTF8(tmp)); - } - numberOfVariants *= N; - } - } - } - + } + + { + const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3 + const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1 + TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird}; + const int N = Y_ARRAY_SIZE(chars); + //try all combinations of these letters. + int numberOfVariants = 1; + for (int len = 0; len <= 4; ++len) { + for (int i = 0; i < numberOfVariants; ++i) { + TString s; + int k = i; + for (int j = 0; j < len; ++j) { + //Treat 'i' like number in base-N system with digits from 'chars'-array + s += chars[k % N]; + k /= N; + } + + TUtf16String tmp = UTF8ToWide(s); + tmp.to_upper(); + + UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(s), WideToUTF8(tmp)); + } + numberOfVariants *= N; + } + } + } + Y_UNIT_TEST(TestUTF8ToWide) { TFileInput in(ArcadiaSourceRoot() + TStringBuf("/util/charset/ut/utf8/test1.txt")); |