aboutsummaryrefslogtreecommitdiffstats
path: root/util/charset
diff options
context:
space:
mode:
authorart-snake <art-snake@yandex-team.ru>2022-02-10 16:50:35 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:50:35 +0300
commitdc5517df41fe6319ff249956fe5650e4bbc3e660 (patch)
tree5d5cb817648f650d76cf1076100726fd9b8448e8 /util/charset
parent1700010e2088971894d12a7a16d6004866f986fd (diff)
downloadydb-dc5517df41fe6319ff249956fe5650e4bbc3e660.tar.gz
Restoring authorship annotation for <art-snake@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'util/charset')
-rw-r--r--util/charset/utf8.cpp188
-rw-r--r--util/charset/utf8.h16
-rw-r--r--util/charset/utf8_ut.cpp74
3 files changed, 139 insertions, 139 deletions
diff --git a/util/charset/utf8.cpp b/util/charset/utf8.cpp
index 21ed1adcc6..efe3a52f61 100644
--- a/util/charset/utf8.cpp
+++ b/util/charset/utf8.cpp
@@ -1,87 +1,87 @@
#include "unidata.h"
#include "utf8.h"
-namespace {
- enum class ECaseConversion {
- ToUpper,
- ToLower,
- };
-
- wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) {
- switch (conversion) {
- case ECaseConversion::ToUpper:
- return ToUpper(ch);
- case ECaseConversion::ToLower:
- return ToLower(ch);
- }
- Y_ASSERT(false); // NOTREACHED
- return 0;
- }
-
- bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n,
- TString& newString) {
- const unsigned char* p = (const unsigned char*)beg;
- const unsigned char* const end = p + n;
-
- // first loop searches for the first character, which is changed by ConvertChar
- // if there is no changed character, we don't need reallocation/copy
- wchar32 cNew = 0;
- size_t cLen = 0;
- while (p < end) {
- wchar32 c;
- if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
- ythrow yexception()
- << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
- }
- cNew = ConvertChar(conversion, c);
-
- if (cNew != c)
- break;
- p += cLen;
- }
- if (p == end) {
- return false;
- }
-
- // some character changed after ToLower. Write new string to newString.
- newString.resize(n);
-
- size_t written = (char*)p - beg;
- char* writePtr = newString.begin();
- memcpy(writePtr, beg, written);
- writePtr += written;
- size_t destSpace = n - written;
-
- // before each iteration (including the first one) variable 'cNew' contains unwritten symbol
- while (true) {
- size_t cNewLen;
+namespace {
+ enum class ECaseConversion {
+ ToUpper,
+ ToLower,
+ };
+
+ wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) {
+ switch (conversion) {
+ case ECaseConversion::ToUpper:
+ return ToUpper(ch);
+ case ECaseConversion::ToLower:
+ return ToLower(ch);
+ }
+ Y_ASSERT(false); // NOTREACHED
+ return 0;
+ }
+
+ bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n,
+ TString& newString) {
+ const unsigned char* p = (const unsigned char*)beg;
+ const unsigned char* const end = p + n;
+
+ // first loop searches for the first character, which is changed by ConvertChar
+ // if there is no changed character, we don't need reallocation/copy
+ wchar32 cNew = 0;
+ size_t cLen = 0;
+ while (p < end) {
+ wchar32 c;
+ if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
+ ythrow yexception()
+ << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
+ }
+ cNew = ConvertChar(conversion, c);
+
+ if (cNew != c)
+ break;
+ p += cLen;
+ }
+ if (p == end) {
+ return false;
+ }
+
+ // some character changed after ToLower. Write new string to newString.
+ newString.resize(n);
+
+ size_t written = (char*)p - beg;
+ char* writePtr = newString.begin();
+ memcpy(writePtr, beg, written);
+ writePtr += written;
+ size_t destSpace = n - written;
+
+ // before each iteration (including the first one) variable 'cNew' contains unwritten symbol
+ while (true) {
+ size_t cNewLen;
Y_ASSERT((writePtr - newString.data()) + destSpace == newString.size());
- if (RECODE_EOOUTPUT ==
- SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) {
+ if (RECODE_EOOUTPUT ==
+ SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) {
destSpace += newString.size();
newString.resize(newString.size() * 2);
writePtr = newString.begin() + (newString.size() - destSpace);
- continue;
- }
- destSpace -= cNewLen;
- writePtr += cNewLen;
- p += cLen;
- if (p == end) {
+ continue;
+ }
+ destSpace -= cNewLen;
+ writePtr += cNewLen;
+ p += cLen;
+ if (p == end) {
newString.resize(newString.size() - destSpace);
- return true;
- }
- wchar32 c = 0;
- if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
- ythrow yexception()
- << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
- }
- cNew = ConvertChar(conversion, c);
- }
- Y_ASSERT(false);
- return false;
- }
-} // namespace
-
+ return true;
+ }
+ wchar32 c = 0;
+ if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
+ ythrow yexception()
+ << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
+ }
+ cNew = ConvertChar(conversion, c);
+ }
+ Y_ASSERT(false);
+ return false;
+ }
+} // namespace
+
extern const wchar32 BROKEN_RUNE = 0xFFFD;
static const char* SkipUTF8Chars(const char* begin, const char* end, size_t numChars) {
@@ -130,7 +130,7 @@ EUTF8Detect UTF8Detect(const char* s, size_t len) {
}
bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString) {
- return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString);
+ return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString);
}
TString ToLowerUTF8(const TString& s) {
@@ -148,23 +148,23 @@ TString ToLowerUTF8(TStringBuf s) {
TString ToLowerUTF8(const char* s) {
return ToLowerUTF8(TStringBuf(s));
}
-
-bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) {
- return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString);
-}
-
-TString ToUpperUTF8(const TString& s) {
- TString newString;
+
+bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) {
+ return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString);
+}
+
+TString ToUpperUTF8(const TString& s) {
+ TString newString;
bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
- return changed ? newString : s;
-}
-
-TString ToUpperUTF8(TStringBuf s) {
- TString newString;
+ return changed ? newString : s;
+}
+
+TString ToUpperUTF8(TStringBuf s) {
+ TString newString;
bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
return changed ? newString : TString(s.data(), s.size());
-}
-
-TString ToUpperUTF8(const char* s) {
- return ToUpperUTF8(TStringBuf(s));
-}
+}
+
+TString ToUpperUTF8(const char* s) {
+ return ToUpperUTF8(TStringBuf(s));
+}
diff --git a/util/charset/utf8.h b/util/charset/utf8.h
index 5250bbeab2..5039b46ae9 100644
--- a/util/charset/utf8.h
+++ b/util/charset/utf8.h
@@ -374,15 +374,15 @@ bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString);
TString ToLowerUTF8(const TString& s);
TString ToLowerUTF8(TStringBuf s);
TString ToLowerUTF8(const char* s);
-
+
inline TString ToLowerUTF8(const std::string& s) {
return ToLowerUTF8(TStringBuf(s));
}
-//! returns true, if result is not the same as input, and put it in newString
-//! returns false, if result is unmodified
-bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString);
-
-TString ToUpperUTF8(const TString& s);
-TString ToUpperUTF8(TStringBuf s);
-TString ToUpperUTF8(const char* s);
+//! returns true, if result is not the same as input, and put it in newString
+//! returns false, if result is unmodified
+bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString);
+
+TString ToUpperUTF8(const TString& s);
+TString ToUpperUTF8(TStringBuf s);
+TString ToUpperUTF8(const char* s);
diff --git a/util/charset/utf8_ut.cpp b/util/charset/utf8_ut.cpp
index 8cbb844dc7..9e68881cca 100644
--- a/util/charset/utf8_ut.cpp
+++ b/util/charset/utf8_ut.cpp
@@ -52,46 +52,46 @@ Y_UNIT_TEST_SUITE(TUtfUtilTest) {
}
}
- Y_UNIT_TEST(TestToUpperUtfString) {
- UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8("xyz XYZ привет!"), "XYZ XYZ ПРИВЕТ!");
-
+ Y_UNIT_TEST(TestToUpperUtfString) {
+ UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8("xyz XYZ привет!"), "XYZ XYZ ПРИВЕТ!");
+
UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(TStringBuf("XYZ")), "XYZ");
-
- {
- TString s = "ПРИВЕТ!";
- TString q = "привет!";
- TString tmp;
+
+ {
+ TString s = "ПРИВЕТ!";
+ TString q = "привет!";
+ TString tmp;
UNIT_ASSERT(ToUpperUTF8Impl(s.data(), s.size(), tmp) == false);
UNIT_ASSERT(ToUpperUTF8Impl(q.data(), q.size(), tmp) == true);
- }
-
- {
- const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3
- const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1
- TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird};
- const int N = Y_ARRAY_SIZE(chars);
- //try all combinations of these letters.
- int numberOfVariants = 1;
- for (int len = 0; len <= 4; ++len) {
- for (int i = 0; i < numberOfVariants; ++i) {
- TString s;
- int k = i;
- for (int j = 0; j < len; ++j) {
- //Treat 'i' like number in base-N system with digits from 'chars'-array
- s += chars[k % N];
- k /= N;
- }
-
- TUtf16String tmp = UTF8ToWide(s);
- tmp.to_upper();
-
- UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(s), WideToUTF8(tmp));
- }
- numberOfVariants *= N;
- }
- }
- }
-
+ }
+
+ {
+ const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3
+ const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1
+ TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird};
+ const int N = Y_ARRAY_SIZE(chars);
+ //try all combinations of these letters.
+ int numberOfVariants = 1;
+ for (int len = 0; len <= 4; ++len) {
+ for (int i = 0; i < numberOfVariants; ++i) {
+ TString s;
+ int k = i;
+ for (int j = 0; j < len; ++j) {
+ //Treat 'i' like number in base-N system with digits from 'chars'-array
+ s += chars[k % N];
+ k /= N;
+ }
+
+ TUtf16String tmp = UTF8ToWide(s);
+ tmp.to_upper();
+
+ UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(s), WideToUTF8(tmp));
+ }
+ numberOfVariants *= N;
+ }
+ }
+ }
+
Y_UNIT_TEST(TestUTF8ToWide) {
TFileInput in(ArcadiaSourceRoot() + TStringBuf("/util/charset/ut/utf8/test1.txt"));