diff options
author | dmasloff <dmasloff@yandex-team.com> | 2024-08-17 23:33:42 +0300 |
---|---|---|
committer | dmasloff <dmasloff@yandex-team.com> | 2024-08-17 23:43:45 +0300 |
commit | 69340f4614e853b9319df4b454ab7497711ee3cd (patch) | |
tree | 9902a3e2f58fe0bd9a157e7b51ad1cc52efa5744 /util/charset | |
parent | a905b53ec410defd5d2c40031ef8b34bb50a29f8 (diff) | |
download | ydb-69340f4614e853b9319df4b454ab7497711ee3cd.tar.gz |
Set SpacesInLineCommentPrefix to 1 in /util
Set SpacesInLineCommentPrefix to 1 in /util
3853f9ec5143722c1bebd8dc0ffc9b61a6c17657
Diffstat (limited to 'util/charset')
-rw-r--r-- | util/charset/utf8.h | 52 | ||||
-rw-r--r-- | util/charset/utf8_ut.cpp | 12 | ||||
-rw-r--r-- | util/charset/wide.h | 6 | ||||
-rw-r--r-- | util/charset/wide_sse41.cpp | 24 | ||||
-rw-r--r-- | util/charset/wide_ut.cpp | 20 |
5 files changed, 57 insertions, 57 deletions
diff --git a/util/charset/utf8.h b/util/charset/utf8.h index d0c45e9d06..b105d8db9d 100644 --- a/util/charset/utf8.h +++ b/util/charset/utf8.h @@ -16,23 +16,23 @@ inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) { } inline size_t UTF8RuneLen(const unsigned char lead_byte) { - //b0XXXXXXX + // b0XXXXXXX if ((lead_byte & 0x80) == 0x00) { return 1; } - //b110XXXXX + // b110XXXXX if ((lead_byte & 0xe0) == 0xc0) { return 2; } - //b1110XXXX + // b1110XXXX if ((lead_byte & 0xf0) == 0xe0) { return 3; } - //b11110XXX + // b11110XXX if ((lead_byte & 0xf8) == 0xf0) { return 4; } - //b10XXXXXX + // b10XXXXXX return 0; } @@ -73,7 +73,7 @@ inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const uns Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions) switch (UTF8RuneLen(*p)) { case 0: - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte + return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte case 1: n = 1; @@ -194,27 +194,27 @@ inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const uns size_t _len = UTF8RuneLen(*s); if (s + _len > end) - return RECODE_EOINPUT; //[EOINPUT] + return RECODE_EOINPUT; // [EOINPUT] if (_len == 0) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte - _rune = *s++; //[00000000 0XXXXXXX] + return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte + _rune = *s++; // [00000000 0XXXXXXX] if (_len > 1) { _rune &= UTF8LeadByteMask(_len); unsigned char ch = *s++; if (!IsUTF8ContinuationByte(ch)) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte - PutUTF8SixBits(_rune, ch); //[00000XXX XXYYYYYY] + return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in second byte + PutUTF8SixBits(_rune, ch); // [00000XXX XXYYYYYY] if (_len > 2) { ch = *s++; if (!IsUTF8ContinuationByte(ch)) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte - PutUTF8SixBits(_rune, ch); //[XXXXYYYY YYZZZZZZ] + return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in third byte + PutUTF8SixBits(_rune, ch); // [XXXXYYYY YYZZZZZZ] if (_len > 3) { ch = *s; if (!IsUTF8ContinuationByte(ch)) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte - PutUTF8SixBits(_rune, ch); //[XXXYY YYYYZZZZ ZZQQQQQQ] + return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in fourth byte + PutUTF8SixBits(_rune, ch); // [XXXYY YYYYZZZZ ZZQQQQQQ] if (!IsValidUTF8Rune<4, strictMode>(_rune)) return RECODE_BROKENSYMBOL; } else { @@ -241,10 +241,10 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne switch (UTF8RuneLen(*p)) { case 0: rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte + return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte case 1: - rune = *p; //[00000000 0XXXXXXX] + rune = *p; // [00000000 0XXXXXXX] ++p; return RECODE_OK; @@ -255,8 +255,8 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne rune = BROKEN_RUNE; return RECODE_BROKENSYMBOL; } else { - PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX] - PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY] + PutUTF8LeadBits(rune, *p++, 2); // [00000000 000XXXXX] + PutUTF8SixBits(rune, *p++); // [00000XXX XXYYYYYY] if (!IsValidUTF8Rune<2, strictMode>(rune)) { p -= 2; rune = BROKEN_RUNE; @@ -271,9 +271,9 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne rune = BROKEN_RUNE; return RECODE_BROKENSYMBOL; } else { - PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX] - PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY] - PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ] + PutUTF8LeadBits(rune, *p++, 3); // [00000000 0000XXXX] + PutUTF8SixBits(rune, *p++); // [000000XX XXYYYYYY] + PutUTF8SixBits(rune, *p++); // [XXXXYYYY YYZZZZZZ] // check for overlong encoding and surrogates if (!IsValidUTF8Rune<3, strictMode>(rune)) { p -= 3; @@ -289,10 +289,10 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne rune = BROKEN_RUNE; return RECODE_BROKENSYMBOL; } else { - PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX] - PutUTF8SixBits(rune, *p++); //[00000000 0000000X XXYYYYYY] - PutUTF8SixBits(rune, *p++); //[00000000 0XXXYYYY YYZZZZZZ] - PutUTF8SixBits(rune, *p++); //[000XXXYY YYYYZZZZ ZZQQQQQQ] + PutUTF8LeadBits(rune, *p++, 4); // [00000000 00000000 00000XXX] + PutUTF8SixBits(rune, *p++); // [00000000 0000000X XXYYYYYY] + PutUTF8SixBits(rune, *p++); // [00000000 0XXXYYYY YYZZZZZZ] + PutUTF8SixBits(rune, *p++); // [000XXXYY YYYYZZZZ ZZQQQQQQ] if (!IsValidUTF8Rune<4, strictMode>(rune)) { p -= 4; rune = BROKEN_RUNE; diff --git a/util/charset/utf8_ut.cpp b/util/charset/utf8_ut.cpp index 9e68881cca..00981f8060 100644 --- a/util/charset/utf8_ut.cpp +++ b/util/charset/utf8_ut.cpp @@ -27,17 +27,17 @@ Y_UNIT_TEST_SUITE(TUtfUtilTest) { { const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(tolower_utf8(weird)) is 3 - const char* turkI = "İ"; //strlen("İ") == 2, strlen(tolower_utf8("İ") == 1 + const char* turkI = "İ"; // strlen("İ") == 2, strlen(tolower_utf8("İ") == 1 TStringBuf chars[] = {"f", "F", "Б", "б", weird, turkI}; const int N = Y_ARRAY_SIZE(chars); - //try all combinations of these letters. + // try all combinations of these letters. int numberOfVariants = 1; for (int len = 0; len <= 4; ++len) { for (int i = 0; i < numberOfVariants; ++i) { TString s; int k = i; for (int j = 0; j < len; ++j) { - //Treat 'i' like number in base-N system with digits from 'chars'-array + // Treat 'i' like number in base-N system with digits from 'chars'-array s += chars[k % N]; k /= N; } @@ -67,17 +67,17 @@ Y_UNIT_TEST_SUITE(TUtfUtilTest) { { const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3 - const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1 + const char* turkI = "İ"; // strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1 TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird}; const int N = Y_ARRAY_SIZE(chars); - //try all combinations of these letters. + // try all combinations of these letters. int numberOfVariants = 1; for (int len = 0; len <= 4; ++len) { for (int i = 0; i < numberOfVariants; ++i) { TString s; int k = i; for (int j = 0; j < len; ++j) { - //Treat 'i' like number in base-N system with digits from 'chars'-array + // Treat 'i' like number in base-N system with digits from 'chars'-array s += chars[k % N]; k /= N; } diff --git a/util/charset/wide.h b/util/charset/wide.h index 5a81f8aa47..06f48d60da 100644 --- a/util/charset/wide.h +++ b/util/charset/wide.h @@ -341,7 +341,7 @@ inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size const unsigned char* cur = reinterpret_cast<const unsigned char*>(text); const unsigned char* last = cur + len; TCharType* p = dest; -#ifdef _sse_ //can't check for sse4, as we build most of arcadia without sse4 support even on platforms that support it +#ifdef _sse_ // can't check for sse4, as we build most of arcadia without sse4 support even on platforms that support it if (cur + 16 <= last && NX86::CachedHaveSSE41()) { ::NDetail::UTF8ToWideImplSSE41(cur, last, p); } @@ -606,7 +606,7 @@ namespace NDetail { #ifdef _sse2_ inline bool DoIsStringASCIISSE(const unsigned char* first, const unsigned char* last) { - //scalar version for short strings + // scalar version for short strings if (first + 8 > last) { return ::NDetail::DoIsStringASCIISlow(first, last); } @@ -637,7 +637,7 @@ namespace NDetail { return ::NDetail::DoIsStringASCIISlow(first, last); } -#endif //_sse2_ +#endif // _sse2_ } diff --git a/util/charset/wide_sse41.cpp b/util/charset/wide_sse41.cpp index d1f2a74851..0d86cb95f8 100644 --- a/util/charset/wide_sse41.cpp +++ b/util/charset/wide_sse41.cpp @@ -18,17 +18,17 @@ namespace NDetail { #include <emmintrin.h> #include <smmintrin.h> -//processes to the first error, or until less then 16 bytes left -//most code taken from https://woboq.com/blog/utf-8-processing-using-simd.html +// processes to the first error, or until less then 16 bytes left +// most code taken from https://woboq.com/blog/utf-8-processing-using-simd.html -//return dstAdvance 0 in case of problems +// return dstAdvance 0 in case of problems static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned char*& cur, __m128i& utf16Low, __m128i& utf16High) { unsigned char curAligned[16]; memcpy(curAligned, cur, sizeof(__m128i)); __m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i*>(curAligned)); - //only ascii characters - simple copy + // only ascii characters - simple copy if (!_mm_movemask_epi8(chunk)) { utf16Low = _mm_unpacklo_epi8(chunk, _mm_setzero_si128()); utf16High = _mm_unpackhi_epi8(chunk, _mm_setzero_si128()); @@ -50,9 +50,9 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch __m128i chunkLow, chunkHigh; if (Y_LIKELY(!_mm_movemask_epi8(cond3))) { - //main case: no bloks of size 3 or 4 + // main case: no bloks of size 3 or 4 - //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.) + // rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.) __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7)); __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1)); @@ -68,7 +68,7 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0)); __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuation1, _mm_set1_epi8(0)); __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask); - //each symbol should be exactly one of ascii, continuation or begin + // each symbol should be exactly one of ascii, continuation or begin __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask); if (_mm_movemask_epi8(okMask) != 0xFFFF) { @@ -114,7 +114,7 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch return 0; } - //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.) + // rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.) __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7)); __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1)); @@ -132,7 +132,7 @@ static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned ch __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0)); __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuationsRunelen, _mm_set1_epi8(0)); __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask); - //each symbol should be exactly one of ascii, continuation or begin + // each symbol should be exactly one of ascii, continuation or begin __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask); if (_mm_movemask_epi8(okMask) != 0xFFFF) { @@ -209,7 +209,7 @@ namespace NDetail { memcpy(dest, destAligned, sizeof(__m128i) * 2); dest += dstAdvance; } - //The rest will be handled sequencially. + // The rest will be handled sequencially. // Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence } @@ -225,7 +225,7 @@ namespace NDetail { break; } - //NOTE: we only work in case without surrogat pairs, so we can make simple copying with zeroes in 2 high bytes + // NOTE: we only work in case without surrogat pairs, so we can make simple copying with zeroes in 2 high bytes __m128i utf32_lowlow = _mm_unpacklo_epi16(utf16Low, _mm_set1_epi8(0)); __m128i utf32_lowhigh = _mm_unpackhi_epi16(utf16Low, _mm_set1_epi8(0)); __m128i utf32_highlow = _mm_unpacklo_epi16(utf16High, _mm_set1_epi8(0)); @@ -239,7 +239,7 @@ namespace NDetail { memcpy(dest, destAligned, sizeof(__m128i) * 4); dest += dstAdvance; } - //The rest will be handled sequencially. + // The rest will be handled sequencially. // Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence } } diff --git a/util/charset/wide_ut.cpp b/util/charset/wide_ut.cpp index b33dd0c0de..dec843717e 100644 --- a/util/charset/wide_ut.cpp +++ b/util/charset/wide_ut.cpp @@ -111,16 +111,16 @@ namespace { } //! use this function to dump UTF8 text into a file in case of any changes - // void DumpUTF8Text() { - // TString s = WideToUTF8(UnicodeText); - // std::ofstream f("utf8.txt"); - // f << std::hex; - // for (int i = 0; i < (int)s.size(); ++i) { - // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", "; - // if ((i + 1) % 16 == 0) - // f << std::endl; - // } - // } + // void DumpUTF8Text() { + // TString s = WideToUTF8(UnicodeText); + // std::ofstream f("utf8.txt"); + // f << std::hex; + // for (int i = 0; i < (int)s.size(); ++i) { + // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", "; + // if ((i + 1) % 16 == 0) + // f << std::endl; + // } + // } template <StrictUTF8 strictMode = StrictUTF8::No> void CheckRecodeOK(wchar32 expected, unsigned char* first, size_t n) { |