diff options
author | art-snake <art-snake@yandex-team.ru> | 2022-02-10 16:50:34 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:50:34 +0300 |
commit | 1700010e2088971894d12a7a16d6004866f986fd (patch) | |
tree | ac3b38289119375037d595858db9751013220a3f /util/charset/utf8.cpp | |
parent | 785bc0acdf3b0c63f971ee17e845945d7381dcb7 (diff) | |
download | ydb-1700010e2088971894d12a7a16d6004866f986fd.tar.gz |
Restoring authorship annotation for <art-snake@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'util/charset/utf8.cpp')
-rw-r--r-- | util/charset/utf8.cpp | 188 |
1 files changed, 94 insertions, 94 deletions
diff --git a/util/charset/utf8.cpp b/util/charset/utf8.cpp index efe3a52f61..21ed1adcc6 100644 --- a/util/charset/utf8.cpp +++ b/util/charset/utf8.cpp @@ -1,87 +1,87 @@ #include "unidata.h" #include "utf8.h" -namespace { - enum class ECaseConversion { - ToUpper, - ToLower, - }; - - wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) { - switch (conversion) { - case ECaseConversion::ToUpper: - return ToUpper(ch); - case ECaseConversion::ToLower: - return ToLower(ch); - } - Y_ASSERT(false); // NOTREACHED - return 0; - } - - bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n, - TString& newString) { - const unsigned char* p = (const unsigned char*)beg; - const unsigned char* const end = p + n; - - // first loop searches for the first character, which is changed by ConvertChar - // if there is no changed character, we don't need reallocation/copy - wchar32 cNew = 0; - size_t cLen = 0; - while (p < end) { - wchar32 c; - if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { - ythrow yexception() - << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); - } - cNew = ConvertChar(conversion, c); - - if (cNew != c) - break; - p += cLen; - } - if (p == end) { - return false; - } - - // some character changed after ToLower. Write new string to newString. - newString.resize(n); - - size_t written = (char*)p - beg; - char* writePtr = newString.begin(); - memcpy(writePtr, beg, written); - writePtr += written; - size_t destSpace = n - written; - - // before each iteration (including the first one) variable 'cNew' contains unwritten symbol - while (true) { - size_t cNewLen; +namespace { + enum class ECaseConversion { + ToUpper, + ToLower, + }; + + wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) { + switch (conversion) { + case ECaseConversion::ToUpper: + return ToUpper(ch); + case ECaseConversion::ToLower: + return ToLower(ch); + } + Y_ASSERT(false); // NOTREACHED + return 0; + } + + bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n, + TString& newString) { + const unsigned char* p = (const unsigned char*)beg; + const unsigned char* const end = p + n; + + // first loop searches for the first character, which is changed by ConvertChar + // if there is no changed character, we don't need reallocation/copy + wchar32 cNew = 0; + size_t cLen = 0; + while (p < end) { + wchar32 c; + if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { + ythrow yexception() + << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); + } + cNew = ConvertChar(conversion, c); + + if (cNew != c) + break; + p += cLen; + } + if (p == end) { + return false; + } + + // some character changed after ToLower. Write new string to newString. + newString.resize(n); + + size_t written = (char*)p - beg; + char* writePtr = newString.begin(); + memcpy(writePtr, beg, written); + writePtr += written; + size_t destSpace = n - written; + + // before each iteration (including the first one) variable 'cNew' contains unwritten symbol + while (true) { + size_t cNewLen; Y_ASSERT((writePtr - newString.data()) + destSpace == newString.size()); - if (RECODE_EOOUTPUT == - SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) { + if (RECODE_EOOUTPUT == + SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) { destSpace += newString.size(); newString.resize(newString.size() * 2); writePtr = newString.begin() + (newString.size() - destSpace); - continue; - } - destSpace -= cNewLen; - writePtr += cNewLen; - p += cLen; - if (p == end) { + continue; + } + destSpace -= cNewLen; + writePtr += cNewLen; + p += cLen; + if (p == end) { newString.resize(newString.size() - destSpace); - return true; - } - wchar32 c = 0; - if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { - ythrow yexception() - << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); - } - cNew = ConvertChar(conversion, c); - } - Y_ASSERT(false); - return false; - } -} // namespace - + return true; + } + wchar32 c = 0; + if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) { + ythrow yexception() + << "failed to decode UTF-8 string at pos " << ((const char*)p - beg); + } + cNew = ConvertChar(conversion, c); + } + Y_ASSERT(false); + return false; + } +} // namespace + extern const wchar32 BROKEN_RUNE = 0xFFFD; static const char* SkipUTF8Chars(const char* begin, const char* end, size_t numChars) { @@ -130,7 +130,7 @@ EUTF8Detect UTF8Detect(const char* s, size_t len) { } bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString) { - return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString); + return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString); } TString ToLowerUTF8(const TString& s) { @@ -148,23 +148,23 @@ TString ToLowerUTF8(TStringBuf s) { TString ToLowerUTF8(const char* s) { return ToLowerUTF8(TStringBuf(s)); } - -bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) { - return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString); -} - -TString ToUpperUTF8(const TString& s) { - TString newString; + +bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) { + return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString); +} + +TString ToUpperUTF8(const TString& s) { + TString newString; bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString); - return changed ? newString : s; -} - -TString ToUpperUTF8(TStringBuf s) { - TString newString; + return changed ? newString : s; +} + +TString ToUpperUTF8(TStringBuf s) { + TString newString; bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString); return changed ? newString : TString(s.data(), s.size()); -} - -TString ToUpperUTF8(const char* s) { - return ToUpperUTF8(TStringBuf(s)); -} +} + +TString ToUpperUTF8(const char* s) { + return ToUpperUTF8(TStringBuf(s)); +} |