diff options
author | alzobnin <alzobnin@yandex-team.ru> | 2022-02-10 16:46:50 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:50 +0300 |
commit | 5085152b94bf621933243a498def7f37d2e76b58 (patch) | |
tree | 49e222ea1c5804306084bb3ae065bb702625360f /util/charset/utf8.h | |
parent | c9317148cc3e9f1b0bc0ce95172f47e099f2c554 (diff) | |
download | ydb-5085152b94bf621933243a498def7f37d2e76b58.tar.gz |
Restoring authorship annotation for <alzobnin@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'util/charset/utf8.h')
-rw-r--r-- | util/charset/utf8.h | 656 |
1 files changed, 328 insertions, 328 deletions
diff --git a/util/charset/utf8.h b/util/charset/utf8.h index 871dcb8ecd..5039b46ae9 100644 --- a/util/charset/utf8.h +++ b/util/charset/utf8.h @@ -1,21 +1,21 @@ -#pragma once - -#include "recode_result.h" - -#include <util/generic/strbuf.h> +#pragma once + +#include "recode_result.h" + +#include <util/generic/strbuf.h> #include <util/generic/string.h> -#include <util/generic/yexception.h> -#include <util/system/defaults.h> -#include <util/system/yassert.h> - -extern const wchar32 BROKEN_RUNE; - -inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) { +#include <util/generic/yexception.h> +#include <util/system/defaults.h> +#include <util/system/yassert.h> + +extern const wchar32 BROKEN_RUNE; + +inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) { // Y_ASSERT (utf8_rune_len <= 4); - return "\0\0\037\017\007"[utf8_rune_len]; -} - -inline size_t UTF8RuneLen(const unsigned char lead_byte) { + return "\0\0\037\017\007"[utf8_rune_len]; +} + +inline size_t UTF8RuneLen(const unsigned char lead_byte) { //b0XXXXXXX if ((lead_byte & 0x80) == 0x00) { return 1; @@ -34,343 +34,343 @@ inline size_t UTF8RuneLen(const unsigned char lead_byte) { } //b10XXXXXX return 0; -} - -inline size_t UTF8RuneLenByUCS(wchar32 rune) { - if (rune < 0x80) - return 1U; - else if (rune < 0x800) - return 2U; - else if (rune < 0x10000) - return 3U; - else if (rune < 0x200000) - return 4U; - else if (rune < 0x4000000) - return 5U; - else - return 6U; -} - -inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) { - rune = c; - rune &= UTF8LeadByteMask(len); -} - -inline void PutUTF8SixBits(wchar32& rune, unsigned char c) { - rune <<= 6; - rune |= c & 0x3F; -} - -inline bool IsUTF8ContinuationByte(unsigned char c) { - return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80); -} - -//! returns length of the current UTF8 character -//! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence -//! @param p pointer to the current character -//! @param e end of the character sequence -inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) { +} + +inline size_t UTF8RuneLenByUCS(wchar32 rune) { + if (rune < 0x80) + return 1U; + else if (rune < 0x800) + return 2U; + else if (rune < 0x10000) + return 3U; + else if (rune < 0x200000) + return 4U; + else if (rune < 0x4000000) + return 5U; + else + return 6U; +} + +inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) { + rune = c; + rune &= UTF8LeadByteMask(len); +} + +inline void PutUTF8SixBits(wchar32& rune, unsigned char c) { + rune <<= 6; + rune |= c & 0x3F; +} + +inline bool IsUTF8ContinuationByte(unsigned char c) { + return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80); +} + +//! returns length of the current UTF8 character +//! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence +//! @param p pointer to the current character +//! @param e end of the character sequence +inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) { Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions) - switch (UTF8RuneLen(*p)) { - case 0: - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte - - case 1: - n = 1; - return RECODE_OK; - - case 2: - if (p + 2 > e) { - return RECODE_EOINPUT; - } else if (!IsUTF8ContinuationByte(p[1])) { - return RECODE_BROKENSYMBOL; - } else { - n = 2; - return RECODE_OK; - } - case 3: - if (p + 3 > e) { - return RECODE_EOINPUT; + switch (UTF8RuneLen(*p)) { + case 0: + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte + + case 1: + n = 1; + return RECODE_OK; + + case 2: + if (p + 2 > e) { + return RECODE_EOINPUT; + } else if (!IsUTF8ContinuationByte(p[1])) { + return RECODE_BROKENSYMBOL; + } else { + n = 2; + return RECODE_OK; + } + case 3: + if (p + 3 > e) { + return RECODE_EOINPUT; } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) { - return RECODE_BROKENSYMBOL; - } else { - n = 3; - return RECODE_OK; - } - default: // actually 4 - if (p + 4 > e) { - return RECODE_EOINPUT; + return RECODE_BROKENSYMBOL; + } else { + n = 3; + return RECODE_OK; + } + default: // actually 4 + if (p + 4 > e) { + return RECODE_EOINPUT; } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) { - return RECODE_BROKENSYMBOL; - } else { - n = 4; - return RECODE_OK; - } - } -} - -//! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong -//! @param text UTF8 encoded text -//! @param len the length of the text in bytes -//! @param number number of encoded symbols in the text -inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) { - const unsigned char* cur = reinterpret_cast<const unsigned char*>(text); - const unsigned char* const last = cur + len; - number = 0; - size_t runeLen; - bool res = true; - while (cur != last) { - if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only - res = false; - break; - } - cur += runeLen; + return RECODE_BROKENSYMBOL; + } else { + n = 4; + return RECODE_OK; + } + } +} + +//! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong +//! @param text UTF8 encoded text +//! @param len the length of the text in bytes +//! @param number number of encoded symbols in the text +inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) { + const unsigned char* cur = reinterpret_cast<const unsigned char*>(text); + const unsigned char* const last = cur + len; + number = 0; + size_t runeLen; + bool res = true; + while (cur != last) { + if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only + res = false; + break; + } + cur += runeLen; Y_ASSERT(cur <= last); - ++number; - } - return res; -} - -inline size_t GetNumberOfUTF8Chars(TStringBuf text) { - size_t number; + ++number; + } + return res; +} + +inline size_t GetNumberOfUTF8Chars(TStringBuf text) { + size_t number; if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) { ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote(); - } - return number; -} - -//! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding -//! @param rune value of the current character -//! @param rune_len length of the UTF8 bytes sequence that has been read -//! @param s pointer to the current character -//! @param end the end of the character sequence -inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) { - rune = BROKEN_RUNE; - rune_len = 0; - wchar32 _rune; - - size_t _len = UTF8RuneLen(*s); - if (s + _len > end) - return RECODE_EOINPUT; //[EOINPUT] - if (_len == 0) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte - _rune = *s++; //[00000000 0XXXXXXX] - - if (_len > 1) { - _rune &= UTF8LeadByteMask(_len); - unsigned char ch = *s++; - if (!IsUTF8ContinuationByte(ch)) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte + } + return number; +} + +//! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding +//! @param rune value of the current character +//! @param rune_len length of the UTF8 bytes sequence that has been read +//! @param s pointer to the current character +//! @param end the end of the character sequence +inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) { + rune = BROKEN_RUNE; + rune_len = 0; + wchar32 _rune; + + size_t _len = UTF8RuneLen(*s); + if (s + _len > end) + return RECODE_EOINPUT; //[EOINPUT] + if (_len == 0) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte + _rune = *s++; //[00000000 0XXXXXXX] + + if (_len > 1) { + _rune &= UTF8LeadByteMask(_len); + unsigned char ch = *s++; + if (!IsUTF8ContinuationByte(ch)) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte PutUTF8SixBits(_rune, ch); //[00000XXX XXYYYYYY] - if (_len > 2) { - ch = *s++; - if (!IsUTF8ContinuationByte(ch)) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte + if (_len > 2) { + ch = *s++; + if (!IsUTF8ContinuationByte(ch)) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte PutUTF8SixBits(_rune, ch); //[XXXXYYYY YYZZZZZZ] - if (_len > 3) { - ch = *s; - if (!IsUTF8ContinuationByte(ch)) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte + if (_len > 3) { + ch = *s; + if (!IsUTF8ContinuationByte(ch)) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte PutUTF8SixBits(_rune, ch); //[XXXYY YYYYZZZZ ZZQQQQQQ] if (_rune > 0x10FFFF) // it is not a valid Unicode code point - return RECODE_BROKENSYMBOL; - if (_rune < 0x10000) // check for overlong encoding - return RECODE_BROKENSYMBOL; - } else { - if (_rune < 0x800) // check for overlong encoding - return RECODE_BROKENSYMBOL; - } - } else { - if (_rune < 0x80) // check for overlong encoding - return RECODE_BROKENSYMBOL; - } - } - rune_len = _len; - rune = _rune; - return RECODE_OK; -} - -//! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character -//! @param c value of the current character -//! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence -//! @param e the end of the character sequence + return RECODE_BROKENSYMBOL; + if (_rune < 0x10000) // check for overlong encoding + return RECODE_BROKENSYMBOL; + } else { + if (_rune < 0x800) // check for overlong encoding + return RECODE_BROKENSYMBOL; + } + } else { + if (_rune < 0x80) // check for overlong encoding + return RECODE_BROKENSYMBOL; + } + } + rune_len = _len; + rune = _rune; + return RECODE_OK; +} + +//! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character +//! @param c value of the current character +//! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence +//! @param e the end of the character sequence Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept { Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions) - switch (UTF8RuneLen(*p)) { - case 0: - rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte - - case 1: - rune = *p; //[00000000 0XXXXXXX] - ++p; - return RECODE_OK; - - case 2: - if (p + 2 > e) { - return RECODE_EOINPUT; - } else if (!IsUTF8ContinuationByte(p[1])) { - rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; - } else { - PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX] - PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY] + switch (UTF8RuneLen(*p)) { + case 0: + rune = BROKEN_RUNE; + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte + + case 1: + rune = *p; //[00000000 0XXXXXXX] + ++p; + return RECODE_OK; + + case 2: + if (p + 2 > e) { + return RECODE_EOINPUT; + } else if (!IsUTF8ContinuationByte(p[1])) { + rune = BROKEN_RUNE; + return RECODE_BROKENSYMBOL; + } else { + PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX] + PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY] if (Y_UNLIKELY(rune < 0x80)) { // overlong encoding - p -= 2; - rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; - } - return RECODE_OK; - } - case 3: - if (p + 3 > e) { - return RECODE_EOINPUT; + p -= 2; + rune = BROKEN_RUNE; + return RECODE_BROKENSYMBOL; + } + return RECODE_OK; + } + case 3: + if (p + 3 > e) { + return RECODE_EOINPUT; } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) { - rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; - } else { - PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX] - PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY] - PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ] - if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding - p -= 3; - rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; - } - return RECODE_OK; - } - case 4: - if (p + 4 > e) { - return RECODE_EOINPUT; + rune = BROKEN_RUNE; + return RECODE_BROKENSYMBOL; + } else { + PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX] + PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY] + PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ] + if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding + p -= 3; + rune = BROKEN_RUNE; + return RECODE_BROKENSYMBOL; + } + return RECODE_OK; + } + case 4: + if (p + 4 > e) { + return RECODE_EOINPUT; } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) { - rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; - } else { + rune = BROKEN_RUNE; + return RECODE_BROKENSYMBOL; + } else { PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX] PutUTF8SixBits(rune, *p++); //[00000000 0000000X XXYYYYYY] PutUTF8SixBits(rune, *p++); //[00000000 0XXXYYYY YYZZZZZZ] PutUTF8SixBits(rune, *p++); //[000XXXYY YYYYZZZZ ZZQQQQQQ] - if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point - p -= 4; - rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; - } - return RECODE_OK; - } - default: // >4 - rune = BROKEN_RUNE; - return RECODE_BROKENSYMBOL; - } -} - -//! writes one unicode symbol into a character sequence encoded UTF8 -//! checks for end of the buffer and returns the result of encoding -//! @param rune value of the current character -//! @param rune_len length of the UTF8 byte sequence that has been written -//! @param s pointer to the output buffer -//! @param tail available size of the buffer -inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) { - rune_len = 0; - if (rune < 0x80) { - if (tail <= 0) - return RECODE_EOOUTPUT; - *s = static_cast<unsigned char>(rune); - rune_len = 1; - return RECODE_OK; - } - if (rune < 0x800) { - if (tail <= 1) - return RECODE_EOOUTPUT; - *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6)); - *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); - rune_len = 2; - return RECODE_OK; - } - if (rune < 0x10000) { - if (tail <= 2) - return RECODE_EOOUTPUT; - *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12)); - *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); - *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); - rune_len = 3; - return RECODE_OK; - } - /*if (rune < 0x200000)*/ { - if (tail <= 3) - return RECODE_EOOUTPUT; - *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07)); - *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F)); - *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); - *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); - rune_len = 4; - return RECODE_OK; - } -} - -inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) { - return SafeWriteUTF8Char(rune, rune_len, s, end - s); -} - -//! writes one unicode symbol into a character sequence encoded UTF8 -//! @attention this function works as @c SafeWriteUTF8Char it does not check -//! the size of the output buffer, it supposes that buffer is long enough -//! @param rune value of the current character -//! @param rune_len length of the UTF8 byte sequence that has been written -//! @param s pointer to the output buffer -inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) { - if (rune < 0x80) { - *s = static_cast<unsigned char>(rune); - rune_len = 1; - return; - } - if (rune < 0x800) { - *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6)); - *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); - rune_len = 2; - return; - } - if (rune < 0x10000) { - *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12)); - *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); - *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); - rune_len = 3; - return; - } - /*if (rune < 0x200000)*/ { - *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07)); - *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F)); - *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); - *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); - rune_len = 4; - } -} - + if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point + p -= 4; + rune = BROKEN_RUNE; + return RECODE_BROKENSYMBOL; + } + return RECODE_OK; + } + default: // >4 + rune = BROKEN_RUNE; + return RECODE_BROKENSYMBOL; + } +} + +//! writes one unicode symbol into a character sequence encoded UTF8 +//! checks for end of the buffer and returns the result of encoding +//! @param rune value of the current character +//! @param rune_len length of the UTF8 byte sequence that has been written +//! @param s pointer to the output buffer +//! @param tail available size of the buffer +inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) { + rune_len = 0; + if (rune < 0x80) { + if (tail <= 0) + return RECODE_EOOUTPUT; + *s = static_cast<unsigned char>(rune); + rune_len = 1; + return RECODE_OK; + } + if (rune < 0x800) { + if (tail <= 1) + return RECODE_EOOUTPUT; + *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6)); + *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); + rune_len = 2; + return RECODE_OK; + } + if (rune < 0x10000) { + if (tail <= 2) + return RECODE_EOOUTPUT; + *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12)); + *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); + *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); + rune_len = 3; + return RECODE_OK; + } + /*if (rune < 0x200000)*/ { + if (tail <= 3) + return RECODE_EOOUTPUT; + *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07)); + *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F)); + *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); + *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); + rune_len = 4; + return RECODE_OK; + } +} + +inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) { + return SafeWriteUTF8Char(rune, rune_len, s, end - s); +} + +//! writes one unicode symbol into a character sequence encoded UTF8 +//! @attention this function works as @c SafeWriteUTF8Char it does not check +//! the size of the output buffer, it supposes that buffer is long enough +//! @param rune value of the current character +//! @param rune_len length of the UTF8 byte sequence that has been written +//! @param s pointer to the output buffer +inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) { + if (rune < 0x80) { + *s = static_cast<unsigned char>(rune); + rune_len = 1; + return; + } + if (rune < 0x800) { + *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6)); + *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); + rune_len = 2; + return; + } + if (rune < 0x10000) { + *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12)); + *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); + *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); + rune_len = 3; + return; + } + /*if (rune < 0x200000)*/ { + *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07)); + *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F)); + *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); + *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); + rune_len = 4; + } +} + TStringBuf SubstrUTF8(const TStringBuf str, size_t pos, size_t len); - -enum EUTF8Detect { - NotUTF8, - UTF8, - ASCII -}; - -EUTF8Detect UTF8Detect(const char* s, size_t len); - + +enum EUTF8Detect { + NotUTF8, + UTF8, + ASCII +}; + +EUTF8Detect UTF8Detect(const char* s, size_t len); + inline EUTF8Detect UTF8Detect(const TStringBuf input) { return UTF8Detect(input.data(), input.size()); -} - -inline bool IsUtf(const char* input, size_t len) { - return UTF8Detect(input, len) != NotUTF8; -} - +} + +inline bool IsUtf(const char* input, size_t len) { + return UTF8Detect(input, len) != NotUTF8; +} + inline bool IsUtf(const TStringBuf input) { return IsUtf(input.data(), input.size()); -} - -//! returns true, if result is not the same as input, and put it in newString -//! returns false, if result is unmodified +} + +//! returns true, if result is not the same as input, and put it in newString +//! returns false, if result is unmodified bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString); - + TString ToLowerUTF8(const TString& s); TString ToLowerUTF8(TStringBuf s); TString ToLowerUTF8(const char* s); |