about | summary | refs | log | tree | commit | diff | stats
path: root/util/charset
diff options
context:
space:
mode:
author: dmasloff <dmasloff@yandex-team.com> 2024-08-17 23:33:42 +0300
committer: dmasloff <dmasloff@yandex-team.com> 2024-08-17 23:43:45 +0300
commit: 69340f4614e853b9319df4b454ab7497711ee3cd (patch)
tree: 9902a3e2f58fe0bd9a157e7b51ad1cc52efa5744 /util/charset
parent: a905b53ec410defd5d2c40031ef8b34bb50a29f8 (diff)
download: ydb-69340f4614e853b9319df4b454ab7497711ee3cd.tar.gz
Set SpacesInLineCommentPrefix to 1 in /util
Set SpacesInLineCommentPrefix to 1 in /util 3853f9ec5143722c1bebd8dc0ffc9b61a6c17657
Diffstat (limited to 'util/charset')
-rw-r--r-- util/charset/utf8.h 52
-rw-r--r-- util/charset/utf8_ut.cpp 12
-rw-r--r-- util/charset/wide.h 6
-rw-r--r-- util/charset/wide_sse41.cpp 24
-rw-r--r-- util/charset/wide_ut.cpp 20
5 files changed, 57 insertions, 57 deletions
diff --git a/util/charset/utf8.h b/util/charset/utf8.h
index d0c45e9d06..b105d8db9d 100644
--- a/util/charset/utf8.h
+++ b/util/charset/utf8.h
@@ -16,23 +16,23 @@ inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
}
inline size_t UTF8RuneLen(const unsigned char lead_byte) {
- //b0XXXXXXX
+ // b0XXXXXXX
if ((lead_byte & 0x80) == 0x00) {
return 1;
}
- //b110XXXXX
+ // b110XXXXX
if ((lead_byte & 0xe0) == 0xc0) {
return 2;
}
- //b1110XXXX
+ // b1110XXXX
if ((lead_byte & 0xf0) == 0xe0) {
return 3;
}
- //b11110XXX
+ // b11110XXX
if ((lead_byte & 0xf8) == 0xf0) {
return 4;
}
- //b10XXXXXX
+ // b10XXXXXX
return 0;
}
@@ -73,7 +73,7 @@ inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const uns
Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
switch (UTF8RuneLen(*p)) {
case 0:
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+ return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
case 1:
n = 1;
@@ -194,27 +194,27 @@ inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const uns
size_t _len = UTF8RuneLen(*s);
if (s + _len > end)
- return RECODE_EOINPUT; //[EOINPUT]
+ return RECODE_EOINPUT; // [EOINPUT]
if (_len == 0)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
- _rune = *s++; //[00000000 0XXXXXXX]
+ return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
+ _rune = *s++; // [00000000 0XXXXXXX]
if (_len > 1) {
_rune &= UTF8LeadByteMask(_len);
unsigned char ch = *s++;
if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
- PutUTF8SixBits(_rune, ch); //[00000XXX XXYYYYYY]
+ return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in second byte
+ PutUTF8SixBits(_rune, ch); // [00000XXX XXYYYYYY]
if (_len > 2) {
ch = *s++;
if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
- PutUTF8SixBits(_rune, ch); //[XXXXYYYY YYZZZZZZ]
+ return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in third byte
+ PutUTF8SixBits(_rune, ch); // [XXXXYYYY YYZZZZZZ]
if (_len > 3) {
ch = *s;
if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
- PutUTF8SixBits(_rune, ch); //[XXXYY YYYYZZZZ ZZQQQQQQ]
+ return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in fourth byte
+ PutUTF8SixBits(_rune, ch); // [XXXYY YYYYZZZZ ZZQQQQQQ]
if (!IsValidUTF8Rune<4, strictMode>(_rune))
return RECODE_BROKENSYMBOL;
} else {
@@ -241,10 +241,10 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne
switch (UTF8RuneLen(*p)) {
case 0:
rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+ return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
case 1:
- rune = *p; //[00000000 0XXXXXXX]
+ rune = *p; // [00000000 0XXXXXXX]
++p;
return RECODE_OK;
@@ -255,8 +255,8 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne
rune = BROKEN_RUNE;
return RECODE_BROKENSYMBOL;
} else {
- PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
- PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY]
+ PutUTF8LeadBits(rune, *p++, 2); // [00000000 000XXXXX]
+ PutUTF8SixBits(rune, *p++); // [00000XXX XXYYYYYY]
if (!IsValidUTF8Rune<2, strictMode>(rune)) {
p -= 2;
rune = BROKEN_RUNE;
@@ -271,9 +271,9 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne
rune = BROKEN_RUNE;
return RECODE_BROKENSYMBOL;
} else {
- PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
- PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY]
- PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ]
+ PutUTF8LeadBits(rune, *p++, 3); // [00000000 0000XXXX]
+ PutUTF8SixBits(rune, *p++); // [000000XX XXYYYYYY]
+ PutUTF8SixBits(rune, *p++); // [XXXXYYYY YYZZZZZZ]
// check for overlong encoding and surrogates
if (!IsValidUTF8Rune<3, strictMode>(rune)) {
p -= 3;
@@ -289,10 +289,10 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne
rune = BROKEN_RUNE;
return RECODE_BROKENSYMBOL;
} else {
- PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX]
- PutUTF8SixBits(rune, *p++); //[00000000 0000000X XXYYYYYY]
- PutUTF8SixBits(rune, *p++); //[00000000 0XXXYYYY YYZZZZZZ]
- PutUTF8SixBits(rune, *p++); //[000XXXYY YYYYZZZZ ZZQQQQQQ]
+ PutUTF8LeadBits(rune, *p++, 4); // [00000000 00000000 00000XXX]
+ PutUTF8SixBits(rune, *p++); // [00000000 0000000X XXYYYYYY]
+ PutUTF8SixBits(rune, *p++); // [00000000 0XXXYYYY YYZZZZZZ]
+ PutUTF8SixBits(rune, *p++); // [000XXXYY YYYYZZZZ ZZQQQQQQ]
if (!IsValidUTF8Rune<4, strictMode>(rune)) {
p -= 4;
rune = BROKEN_RUNE;
diff --git a/util/charset/utf8_ut.cpp b/util/charset/utf8_ut.cpp
index 9e68881cca..00981f8060 100644
--- a/util/charset/utf8_ut.cpp
+++ b/util/charset/utf8_ut.cpp
@@ -27,17 +27,17 @@ Y_UNIT_TEST_SUITE(TUtfUtilTest) {
{
const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(tolower_utf8(weird)) is 3
- const char* turkI = "İ"; //strlen("İ") == 2, strlen(tolower_utf8("İ") == 1
+ const char* turkI = "İ"; // strlen("İ") == 2, strlen(tolower_utf8("İ") == 1
TStringBuf chars[] = {"f", "F", "Б", "б", weird, turkI};
const int N = Y_ARRAY_SIZE(chars);
- //try all combinations of these letters.
+ // try all combinations of these letters.
int numberOfVariants = 1;
for (int len = 0; len <= 4; ++len) {
for (int i = 0; i < numberOfVariants; ++i) {
TString s;
int k = i;
for (int j = 0; j < len; ++j) {
- //Treat 'i' like number in base-N system with digits from 'chars'-array
+ // Treat 'i' like number in base-N system with digits from 'chars'-array
s += chars[k % N];
k /= N;
}
@@ -67,17 +67,17 @@ Y_UNIT_TEST_SUITE(TUtfUtilTest) {
{
const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3
- const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1
+ const char* turkI = "İ"; // strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1
TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird};
const int N = Y_ARRAY_SIZE(chars);
- //try all combinations of these letters.
+ // try all combinations of these letters.
int numberOfVariants = 1;
for (int len = 0; len <= 4; ++len) {
for (int i = 0; i < numberOfVariants; ++i) {
TString s;
int k = i;
for (int j = 0; j < len; ++j) {
- //Treat 'i' like number in base-N system with digits from 'chars'-array
+ // Treat 'i' like number in base-N system with digits from 'chars'-array
s += chars[k % N];
k /= N;
}
diff --git a/util/charset/wide.h b/util/charset/wide.h
index 5a81f8aa47..06f48d60da 100644
--- a/util/charset/wide.h
+++ b/util/charset/wide.h
@@ -341,7 +341,7 @@ inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size
const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
const unsigned char* last = cur + len;
TCharType* p = dest;
-#ifdef _sse_ //can't check for sse4, as we build most of arcadia without sse4 support even on platforms that support it
+#ifdef _sse_ // can't check for sse4, as we build most of arcadia without sse4 support even on platforms that support it
if (cur + 16 <= last && NX86::CachedHaveSSE41()) {
::NDetail::UTF8ToWideImplSSE41(cur, last, p);
}
@@ -606,7 +606,7 @@ namespace NDetail {
#ifdef _sse2_
inline bool DoIsStringASCIISSE(const unsigned char* first, const unsigned char* last) {
- //scalar version for short strings
+ // scalar version for short strings
if (first + 8 > last) {
return ::NDetail::DoIsStringASCIISlow(first, last);
}
@@ -637,7 +637,7 @@ namespace NDetail {
return ::NDetail::DoIsStringASCIISlow(first, last);
}
-#endif //_sse2_
+#endif // _sse2_
}
diff --git a/util/charset/wide_sse41.cpp b/util/charset/wide_sse41.cpp
index d1f2a74851..0d86cb95f8 100644
--- a/util/charset/wide_sse41.cpp
+++ b/util/charset/wide_sse41.cpp
@@ -18,17 +18,17 @@ namespace NDetail {
#include <emmintrin.h>
#include <smmintrin.h>
-//processes to the first error, or until less then 16 bytes left
-//most code taken from https://woboq.com/blog/utf-8-processing-using-simd.html
+// processes to the first error, or until less then 16 bytes left
+// most code taken from https://woboq.com/blog/utf-8-processing-using-simd.html
-//return dstAdvance 0 in case of problems
+// return dstAdvance 0 in case of problems
static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned char*& cur, __m128i& utf16Low, __m128i& utf16High) {
unsigned char curAligned[16];
memcpy(curAligned, cur, sizeof(__m128i));
__m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i*>(curAligned));
- //only ascii characters - simple copy
+ // only ascii characters - simple copy
if (!_mm_movemask_epi8(chunk)) {
utf16Low = _mm_unpacklo_epi8(chunk, _mm_setzero_si128());
utf16High = _mm_unpackhi_epi8(chunk, _mm_setzero_si128());
@@ -50,9 +50,9 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch
__m128i chunkLow, chunkHigh;
if (Y_LIKELY(!_mm_movemask_epi8(cond3))) {
- //main case: no bloks of size 3 or 4
+ // main case: no bloks of size 3 or 4
- //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
+ // rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
__m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
__m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
@@ -68,7 +68,7 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch
__m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
__m128i needNoContinuationMask = _mm_cmpeq_epi8(continuation1, _mm_set1_epi8(0));
__m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);
- //each symbol should be exactly one of ascii, continuation or begin
+ // each symbol should be exactly one of ascii, continuation or begin
__m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
if (_mm_movemask_epi8(okMask) != 0xFFFF) {
@@ -114,7 +114,7 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch
return 0;
}
- //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
+ // rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
__m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
__m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
@@ -132,7 +132,7 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch
__m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
__m128i needNoContinuationMask = _mm_cmpeq_epi8(continuationsRunelen, _mm_set1_epi8(0));
__m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);
- //each symbol should be exactly one of ascii, continuation or begin
+ // each symbol should be exactly one of ascii, continuation or begin
__m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
if (_mm_movemask_epi8(okMask) != 0xFFFF) {
@@ -209,7 +209,7 @@ namespace NDetail {
memcpy(dest, destAligned, sizeof(__m128i) * 2);
dest += dstAdvance;
}
- //The rest will be handled sequencially.
+ // The rest will be handled sequencially.
// Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence
}
@@ -225,7 +225,7 @@ namespace NDetail {
break;
}
- //NOTE: we only work in case without surrogat pairs, so we can make simple copying with zeroes in 2 high bytes
+ // NOTE: we only work in case without surrogat pairs, so we can make simple copying with zeroes in 2 high bytes
__m128i utf32_lowlow = _mm_unpacklo_epi16(utf16Low, _mm_set1_epi8(0));
__m128i utf32_lowhigh = _mm_unpackhi_epi16(utf16Low, _mm_set1_epi8(0));
__m128i utf32_highlow = _mm_unpacklo_epi16(utf16High, _mm_set1_epi8(0));
@@ -239,7 +239,7 @@ namespace NDetail {
memcpy(dest, destAligned, sizeof(__m128i) * 4);
dest += dstAdvance;
}
- //The rest will be handled sequencially.
+ // The rest will be handled sequencially.
// Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence
}
}
diff --git a/util/charset/wide_ut.cpp b/util/charset/wide_ut.cpp
index b33dd0c0de..dec843717e 100644
--- a/util/charset/wide_ut.cpp
+++ b/util/charset/wide_ut.cpp
@@ -111,16 +111,16 @@ namespace {
}
//! use this function to dump UTF8 text into a file in case of any changes
- // void DumpUTF8Text() {
- // TString s = WideToUTF8(UnicodeText);
- // std::ofstream f("utf8.txt");
- // f << std::hex;
- // for (int i = 0; i < (int)s.size(); ++i) {
- // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", ";
- // if ((i + 1) % 16 == 0)
- // f << std::endl;
- // }
- // }
+ // void DumpUTF8Text() {
+ // TString s = WideToUTF8(UnicodeText);
+ // std::ofstream f("utf8.txt");
+ // f << std::hex;
+ // for (int i = 0; i < (int)s.size(); ++i) {
+ // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", ";
+ // if ((i + 1) % 16 == 0)
+ // f << std::endl;
+ // }
+ // }
template <StrictUTF8 strictMode = StrictUTF8::No>
void CheckRecodeOK(wchar32 expected, unsigned char* first, size_t n) {