aboutsummaryrefslogtreecommitdiffstats
path: root/util/charset/utf8.cpp
diff options
context:
space:
mode:
authorart-snake <art-snake@yandex-team.ru>2022-02-10 16:50:34 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:50:34 +0300
commit1700010e2088971894d12a7a16d6004866f986fd (patch)
treeac3b38289119375037d595858db9751013220a3f /util/charset/utf8.cpp
parent785bc0acdf3b0c63f971ee17e845945d7381dcb7 (diff)
downloadydb-1700010e2088971894d12a7a16d6004866f986fd.tar.gz
Restoring authorship annotation for <art-snake@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'util/charset/utf8.cpp')
-rw-r--r--util/charset/utf8.cpp188
1 files changed, 94 insertions, 94 deletions
diff --git a/util/charset/utf8.cpp b/util/charset/utf8.cpp
index efe3a52f61..21ed1adcc6 100644
--- a/util/charset/utf8.cpp
+++ b/util/charset/utf8.cpp
@@ -1,87 +1,87 @@
#include "unidata.h"
#include "utf8.h"
-namespace {
- enum class ECaseConversion {
- ToUpper,
- ToLower,
- };
-
- wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) {
- switch (conversion) {
- case ECaseConversion::ToUpper:
- return ToUpper(ch);
- case ECaseConversion::ToLower:
- return ToLower(ch);
- }
- Y_ASSERT(false); // NOTREACHED
- return 0;
- }
-
- bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n,
- TString& newString) {
- const unsigned char* p = (const unsigned char*)beg;
- const unsigned char* const end = p + n;
-
- // first loop searches for the first character, which is changed by ConvertChar
- // if there is no changed character, we don't need reallocation/copy
- wchar32 cNew = 0;
- size_t cLen = 0;
- while (p < end) {
- wchar32 c;
- if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
- ythrow yexception()
- << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
- }
- cNew = ConvertChar(conversion, c);
-
- if (cNew != c)
- break;
- p += cLen;
- }
- if (p == end) {
- return false;
- }
-
- // some character changed after ToLower. Write new string to newString.
- newString.resize(n);
-
- size_t written = (char*)p - beg;
- char* writePtr = newString.begin();
- memcpy(writePtr, beg, written);
- writePtr += written;
- size_t destSpace = n - written;
-
- // before each iteration (including the first one) variable 'cNew' contains unwritten symbol
- while (true) {
- size_t cNewLen;
+namespace {
+ enum class ECaseConversion {
+ ToUpper,
+ ToLower,
+ };
+
+ wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) {
+ switch (conversion) {
+ case ECaseConversion::ToUpper:
+ return ToUpper(ch);
+ case ECaseConversion::ToLower:
+ return ToLower(ch);
+ }
+ Y_ASSERT(false); // NOTREACHED
+ return 0;
+ }
+
+ bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n,
+ TString& newString) {
+ const unsigned char* p = (const unsigned char*)beg;
+ const unsigned char* const end = p + n;
+
+ // first loop searches for the first character, which is changed by ConvertChar
+ // if there is no changed character, we don't need reallocation/copy
+ wchar32 cNew = 0;
+ size_t cLen = 0;
+ while (p < end) {
+ wchar32 c;
+ if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
+ ythrow yexception()
+ << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
+ }
+ cNew = ConvertChar(conversion, c);
+
+ if (cNew != c)
+ break;
+ p += cLen;
+ }
+ if (p == end) {
+ return false;
+ }
+
+ // some character changed after ToLower. Write new string to newString.
+ newString.resize(n);
+
+ size_t written = (char*)p - beg;
+ char* writePtr = newString.begin();
+ memcpy(writePtr, beg, written);
+ writePtr += written;
+ size_t destSpace = n - written;
+
+ // before each iteration (including the first one) variable 'cNew' contains unwritten symbol
+ while (true) {
+ size_t cNewLen;
Y_ASSERT((writePtr - newString.data()) + destSpace == newString.size());
- if (RECODE_EOOUTPUT ==
- SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) {
+ if (RECODE_EOOUTPUT ==
+ SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) {
destSpace += newString.size();
newString.resize(newString.size() * 2);
writePtr = newString.begin() + (newString.size() - destSpace);
- continue;
- }
- destSpace -= cNewLen;
- writePtr += cNewLen;
- p += cLen;
- if (p == end) {
+ continue;
+ }
+ destSpace -= cNewLen;
+ writePtr += cNewLen;
+ p += cLen;
+ if (p == end) {
newString.resize(newString.size() - destSpace);
- return true;
- }
- wchar32 c = 0;
- if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
- ythrow yexception()
- << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
- }
- cNew = ConvertChar(conversion, c);
- }
- Y_ASSERT(false);
- return false;
- }
-} // namespace
-
+ return true;
+ }
+ wchar32 c = 0;
+ if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
+ ythrow yexception()
+ << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
+ }
+ cNew = ConvertChar(conversion, c);
+ }
+ Y_ASSERT(false);
+ return false;
+ }
+} // namespace
+
extern const wchar32 BROKEN_RUNE = 0xFFFD;
static const char* SkipUTF8Chars(const char* begin, const char* end, size_t numChars) {
@@ -130,7 +130,7 @@ EUTF8Detect UTF8Detect(const char* s, size_t len) {
}
bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString) {
- return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString);
+ return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString);
}
TString ToLowerUTF8(const TString& s) {
@@ -148,23 +148,23 @@ TString ToLowerUTF8(TStringBuf s) {
TString ToLowerUTF8(const char* s) {
return ToLowerUTF8(TStringBuf(s));
}
-
-bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) {
- return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString);
-}
-
-TString ToUpperUTF8(const TString& s) {
- TString newString;
+
+bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) {
+ return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString);
+}
+
+TString ToUpperUTF8(const TString& s) {
+ TString newString;
bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
- return changed ? newString : s;
-}
-
-TString ToUpperUTF8(TStringBuf s) {
- TString newString;
+ return changed ? newString : s;
+}
+
+TString ToUpperUTF8(TStringBuf s) {
+ TString newString;
bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
return changed ? newString : TString(s.data(), s.size());
-}
-
-TString ToUpperUTF8(const char* s) {
- return ToUpperUTF8(TStringBuf(s));
-}
+}
+
+TString ToUpperUTF8(const char* s) {
+ return ToUpperUTF8(TStringBuf(s));
+}