aboutsummaryrefslogtreecommitdiffstats
path: root/util/charset/utf8.h
diff options
context:
space:
mode:
authoralzobnin <alzobnin@yandex-team.ru>2022-02-10 16:46:50 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:46:50 +0300
commit5085152b94bf621933243a498def7f37d2e76b58 (patch)
tree49e222ea1c5804306084bb3ae065bb702625360f /util/charset/utf8.h
parentc9317148cc3e9f1b0bc0ce95172f47e099f2c554 (diff)
downloadydb-5085152b94bf621933243a498def7f37d2e76b58.tar.gz
Restoring authorship annotation for <alzobnin@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'util/charset/utf8.h')
-rw-r--r--util/charset/utf8.h656
1 files changed, 328 insertions, 328 deletions
diff --git a/util/charset/utf8.h b/util/charset/utf8.h
index 871dcb8ecd..5039b46ae9 100644
--- a/util/charset/utf8.h
+++ b/util/charset/utf8.h
@@ -1,21 +1,21 @@
-#pragma once
-
-#include "recode_result.h"
-
-#include <util/generic/strbuf.h>
+#pragma once
+
+#include "recode_result.h"
+
+#include <util/generic/strbuf.h>
#include <util/generic/string.h>
-#include <util/generic/yexception.h>
-#include <util/system/defaults.h>
-#include <util/system/yassert.h>
-
-extern const wchar32 BROKEN_RUNE;
-
-inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
+#include <util/generic/yexception.h>
+#include <util/system/defaults.h>
+#include <util/system/yassert.h>
+
+extern const wchar32 BROKEN_RUNE;
+
+inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
// Y_ASSERT (utf8_rune_len <= 4);
- return "\0\0\037\017\007"[utf8_rune_len];
-}
-
-inline size_t UTF8RuneLen(const unsigned char lead_byte) {
+ return "\0\0\037\017\007"[utf8_rune_len];
+}
+
+inline size_t UTF8RuneLen(const unsigned char lead_byte) {
//b0XXXXXXX
if ((lead_byte & 0x80) == 0x00) {
return 1;
@@ -34,343 +34,343 @@ inline size_t UTF8RuneLen(const unsigned char lead_byte) {
}
//b10XXXXXX
return 0;
-}
-
-inline size_t UTF8RuneLenByUCS(wchar32 rune) {
- if (rune < 0x80)
- return 1U;
- else if (rune < 0x800)
- return 2U;
- else if (rune < 0x10000)
- return 3U;
- else if (rune < 0x200000)
- return 4U;
- else if (rune < 0x4000000)
- return 5U;
- else
- return 6U;
-}
-
-inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
- rune = c;
- rune &= UTF8LeadByteMask(len);
-}
-
-inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
- rune <<= 6;
- rune |= c & 0x3F;
-}
-
-inline bool IsUTF8ContinuationByte(unsigned char c) {
- return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80);
-}
-
-//! returns length of the current UTF8 character
-//! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence
-//! @param p pointer to the current character
-//! @param e end of the character sequence
-inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
+}
+
+inline size_t UTF8RuneLenByUCS(wchar32 rune) {
+ if (rune < 0x80)
+ return 1U;
+ else if (rune < 0x800)
+ return 2U;
+ else if (rune < 0x10000)
+ return 3U;
+ else if (rune < 0x200000)
+ return 4U;
+ else if (rune < 0x4000000)
+ return 5U;
+ else
+ return 6U;
+}
+
+inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
+ rune = c;
+ rune &= UTF8LeadByteMask(len);
+}
+
+inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
+ rune <<= 6;
+ rune |= c & 0x3F;
+}
+
+inline bool IsUTF8ContinuationByte(unsigned char c) {
+ return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80);
+}
+
+//! returns length of the current UTF8 character
+//! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence
+//! @param p pointer to the current character
+//! @param e end of the character sequence
+inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
- switch (UTF8RuneLen(*p)) {
- case 0:
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
-
- case 1:
- n = 1;
- return RECODE_OK;
-
- case 2:
- if (p + 2 > e) {
- return RECODE_EOINPUT;
- } else if (!IsUTF8ContinuationByte(p[1])) {
- return RECODE_BROKENSYMBOL;
- } else {
- n = 2;
- return RECODE_OK;
- }
- case 3:
- if (p + 3 > e) {
- return RECODE_EOINPUT;
+ switch (UTF8RuneLen(*p)) {
+ case 0:
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+
+ case 1:
+ n = 1;
+ return RECODE_OK;
+
+ case 2:
+ if (p + 2 > e) {
+ return RECODE_EOINPUT;
+ } else if (!IsUTF8ContinuationByte(p[1])) {
+ return RECODE_BROKENSYMBOL;
+ } else {
+ n = 2;
+ return RECODE_OK;
+ }
+ case 3:
+ if (p + 3 > e) {
+ return RECODE_EOINPUT;
} else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
- return RECODE_BROKENSYMBOL;
- } else {
- n = 3;
- return RECODE_OK;
- }
- default: // actually 4
- if (p + 4 > e) {
- return RECODE_EOINPUT;
+ return RECODE_BROKENSYMBOL;
+ } else {
+ n = 3;
+ return RECODE_OK;
+ }
+ default: // actually 4
+ if (p + 4 > e) {
+ return RECODE_EOINPUT;
} else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
- return RECODE_BROKENSYMBOL;
- } else {
- n = 4;
- return RECODE_OK;
- }
- }
-}
-
-//! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
-//! @param text UTF8 encoded text
-//! @param len the length of the text in bytes
-//! @param number number of encoded symbols in the text
-inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
- const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
- const unsigned char* const last = cur + len;
- number = 0;
- size_t runeLen;
- bool res = true;
- while (cur != last) {
- if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
- res = false;
- break;
- }
- cur += runeLen;
+ return RECODE_BROKENSYMBOL;
+ } else {
+ n = 4;
+ return RECODE_OK;
+ }
+ }
+}
+
+//! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
+//! @param text UTF8 encoded text
+//! @param len the length of the text in bytes
+//! @param number number of encoded symbols in the text
+inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
+ const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
+ const unsigned char* const last = cur + len;
+ number = 0;
+ size_t runeLen;
+ bool res = true;
+ while (cur != last) {
+ if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
+ res = false;
+ break;
+ }
+ cur += runeLen;
Y_ASSERT(cur <= last);
- ++number;
- }
- return res;
-}
-
-inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
- size_t number;
+ ++number;
+ }
+ return res;
+}
+
+inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
+ size_t number;
if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) {
ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote();
- }
- return number;
-}
-
-//! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
-//! @param rune value of the current character
-//! @param rune_len length of the UTF8 bytes sequence that has been read
-//! @param s pointer to the current character
-//! @param end the end of the character sequence
-inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
- rune = BROKEN_RUNE;
- rune_len = 0;
- wchar32 _rune;
-
- size_t _len = UTF8RuneLen(*s);
- if (s + _len > end)
- return RECODE_EOINPUT; //[EOINPUT]
- if (_len == 0)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
- _rune = *s++; //[00000000 0XXXXXXX]
-
- if (_len > 1) {
- _rune &= UTF8LeadByteMask(_len);
- unsigned char ch = *s++;
- if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
+ }
+ return number;
+}
+
+//! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
+//! @param rune value of the current character
+//! @param rune_len length of the UTF8 bytes sequence that has been read
+//! @param s pointer to the current character
+//! @param end the end of the character sequence
+inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
+ rune = BROKEN_RUNE;
+ rune_len = 0;
+ wchar32 _rune;
+
+ size_t _len = UTF8RuneLen(*s);
+ if (s + _len > end)
+ return RECODE_EOINPUT; //[EOINPUT]
+ if (_len == 0)
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+ _rune = *s++; //[00000000 0XXXXXXX]
+
+ if (_len > 1) {
+ _rune &= UTF8LeadByteMask(_len);
+ unsigned char ch = *s++;
+ if (!IsUTF8ContinuationByte(ch))
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
PutUTF8SixBits(_rune, ch); //[00000XXX XXYYYYYY]
- if (_len > 2) {
- ch = *s++;
- if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
+ if (_len > 2) {
+ ch = *s++;
+ if (!IsUTF8ContinuationByte(ch))
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
PutUTF8SixBits(_rune, ch); //[XXXXYYYY YYZZZZZZ]
- if (_len > 3) {
- ch = *s;
- if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
+ if (_len > 3) {
+ ch = *s;
+ if (!IsUTF8ContinuationByte(ch))
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
PutUTF8SixBits(_rune, ch); //[XXXYY YYYYZZZZ ZZQQQQQQ]
if (_rune > 0x10FFFF) // it is not a valid Unicode code point
- return RECODE_BROKENSYMBOL;
- if (_rune < 0x10000) // check for overlong encoding
- return RECODE_BROKENSYMBOL;
- } else {
- if (_rune < 0x800) // check for overlong encoding
- return RECODE_BROKENSYMBOL;
- }
- } else {
- if (_rune < 0x80) // check for overlong encoding
- return RECODE_BROKENSYMBOL;
- }
- }
- rune_len = _len;
- rune = _rune;
- return RECODE_OK;
-}
-
-//! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
-//! @param c value of the current character
-//! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
-//! @param e the end of the character sequence
+ return RECODE_BROKENSYMBOL;
+ if (_rune < 0x10000) // check for overlong encoding
+ return RECODE_BROKENSYMBOL;
+ } else {
+ if (_rune < 0x800) // check for overlong encoding
+ return RECODE_BROKENSYMBOL;
+ }
+ } else {
+ if (_rune < 0x80) // check for overlong encoding
+ return RECODE_BROKENSYMBOL;
+ }
+ }
+ rune_len = _len;
+ rune = _rune;
+ return RECODE_OK;
+}
+
+//! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
+//! @param c value of the current character
+//! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
+//! @param e the end of the character sequence
Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
- switch (UTF8RuneLen(*p)) {
- case 0:
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
-
- case 1:
- rune = *p; //[00000000 0XXXXXXX]
- ++p;
- return RECODE_OK;
-
- case 2:
- if (p + 2 > e) {
- return RECODE_EOINPUT;
- } else if (!IsUTF8ContinuationByte(p[1])) {
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- } else {
- PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
- PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY]
+ switch (UTF8RuneLen(*p)) {
+ case 0:
+ rune = BROKEN_RUNE;
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+
+ case 1:
+ rune = *p; //[00000000 0XXXXXXX]
+ ++p;
+ return RECODE_OK;
+
+ case 2:
+ if (p + 2 > e) {
+ return RECODE_EOINPUT;
+ } else if (!IsUTF8ContinuationByte(p[1])) {
+ rune = BROKEN_RUNE;
+ return RECODE_BROKENSYMBOL;
+ } else {
+ PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
+ PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY]
if (Y_UNLIKELY(rune < 0x80)) { // overlong encoding
- p -= 2;
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- }
- return RECODE_OK;
- }
- case 3:
- if (p + 3 > e) {
- return RECODE_EOINPUT;
+ p -= 2;
+ rune = BROKEN_RUNE;
+ return RECODE_BROKENSYMBOL;
+ }
+ return RECODE_OK;
+ }
+ case 3:
+ if (p + 3 > e) {
+ return RECODE_EOINPUT;
} else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- } else {
- PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
- PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY]
- PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ]
- if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding
- p -= 3;
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- }
- return RECODE_OK;
- }
- case 4:
- if (p + 4 > e) {
- return RECODE_EOINPUT;
+ rune = BROKEN_RUNE;
+ return RECODE_BROKENSYMBOL;
+ } else {
+ PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
+ PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY]
+ PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ]
+ if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding
+ p -= 3;
+ rune = BROKEN_RUNE;
+ return RECODE_BROKENSYMBOL;
+ }
+ return RECODE_OK;
+ }
+ case 4:
+ if (p + 4 > e) {
+ return RECODE_EOINPUT;
} else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- } else {
+ rune = BROKEN_RUNE;
+ return RECODE_BROKENSYMBOL;
+ } else {
PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX]
PutUTF8SixBits(rune, *p++); //[00000000 0000000X XXYYYYYY]
PutUTF8SixBits(rune, *p++); //[00000000 0XXXYYYY YYZZZZZZ]
PutUTF8SixBits(rune, *p++); //[000XXXYY YYYYZZZZ ZZQQQQQQ]
- if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point
- p -= 4;
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- }
- return RECODE_OK;
- }
- default: // >4
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- }
-}
-
-//! writes one unicode symbol into a character sequence encoded UTF8
-//! checks for end of the buffer and returns the result of encoding
-//! @param rune value of the current character
-//! @param rune_len length of the UTF8 byte sequence that has been written
-//! @param s pointer to the output buffer
-//! @param tail available size of the buffer
-inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
- rune_len = 0;
- if (rune < 0x80) {
- if (tail <= 0)
- return RECODE_EOOUTPUT;
- *s = static_cast<unsigned char>(rune);
- rune_len = 1;
- return RECODE_OK;
- }
- if (rune < 0x800) {
- if (tail <= 1)
- return RECODE_EOOUTPUT;
- *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 2;
- return RECODE_OK;
- }
- if (rune < 0x10000) {
- if (tail <= 2)
- return RECODE_EOOUTPUT;
- *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 3;
- return RECODE_OK;
- }
- /*if (rune < 0x200000)*/ {
- if (tail <= 3)
- return RECODE_EOOUTPUT;
- *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 4;
- return RECODE_OK;
- }
-}
-
-inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
- return SafeWriteUTF8Char(rune, rune_len, s, end - s);
-}
-
-//! writes one unicode symbol into a character sequence encoded UTF8
-//! @attention this function works as @c SafeWriteUTF8Char it does not check
-//! the size of the output buffer, it supposes that buffer is long enough
-//! @param rune value of the current character
-//! @param rune_len length of the UTF8 byte sequence that has been written
-//! @param s pointer to the output buffer
-inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
- if (rune < 0x80) {
- *s = static_cast<unsigned char>(rune);
- rune_len = 1;
- return;
- }
- if (rune < 0x800) {
- *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 2;
- return;
- }
- if (rune < 0x10000) {
- *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 3;
- return;
- }
- /*if (rune < 0x200000)*/ {
- *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 4;
- }
-}
-
+ if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point
+ p -= 4;
+ rune = BROKEN_RUNE;
+ return RECODE_BROKENSYMBOL;
+ }
+ return RECODE_OK;
+ }
+ default: // >4
+ rune = BROKEN_RUNE;
+ return RECODE_BROKENSYMBOL;
+ }
+}
+
+//! writes one unicode symbol into a character sequence encoded UTF8
+//! checks for end of the buffer and returns the result of encoding
+//! @param rune value of the current character
+//! @param rune_len length of the UTF8 byte sequence that has been written
+//! @param s pointer to the output buffer
+//! @param tail available size of the buffer
+inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
+ rune_len = 0;
+ if (rune < 0x80) {
+ if (tail <= 0)
+ return RECODE_EOOUTPUT;
+ *s = static_cast<unsigned char>(rune);
+ rune_len = 1;
+ return RECODE_OK;
+ }
+ if (rune < 0x800) {
+ if (tail <= 1)
+ return RECODE_EOOUTPUT;
+ *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
+ *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+ rune_len = 2;
+ return RECODE_OK;
+ }
+ if (rune < 0x10000) {
+ if (tail <= 2)
+ return RECODE_EOOUTPUT;
+ *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
+ *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
+ *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+ rune_len = 3;
+ return RECODE_OK;
+ }
+ /*if (rune < 0x200000)*/ {
+ if (tail <= 3)
+ return RECODE_EOOUTPUT;
+ *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
+ *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
+ *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
+ *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+ rune_len = 4;
+ return RECODE_OK;
+ }
+}
+
+inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
+ return SafeWriteUTF8Char(rune, rune_len, s, end - s);
+}
+
+//! writes one unicode symbol into a character sequence encoded UTF8
+//! @attention this function works as @c SafeWriteUTF8Char it does not check
+//! the size of the output buffer, it supposes that buffer is long enough
+//! @param rune value of the current character
+//! @param rune_len length of the UTF8 byte sequence that has been written
+//! @param s pointer to the output buffer
+inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
+ if (rune < 0x80) {
+ *s = static_cast<unsigned char>(rune);
+ rune_len = 1;
+ return;
+ }
+ if (rune < 0x800) {
+ *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
+ *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+ rune_len = 2;
+ return;
+ }
+ if (rune < 0x10000) {
+ *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
+ *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
+ *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+ rune_len = 3;
+ return;
+ }
+ /*if (rune < 0x200000)*/ {
+ *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
+ *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
+ *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
+ *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+ rune_len = 4;
+ }
+}
+
TStringBuf SubstrUTF8(const TStringBuf str, size_t pos, size_t len);
-
-enum EUTF8Detect {
- NotUTF8,
- UTF8,
- ASCII
-};
-
-EUTF8Detect UTF8Detect(const char* s, size_t len);
-
+
+enum EUTF8Detect {
+ NotUTF8,
+ UTF8,
+ ASCII
+};
+
+EUTF8Detect UTF8Detect(const char* s, size_t len);
+
inline EUTF8Detect UTF8Detect(const TStringBuf input) {
return UTF8Detect(input.data(), input.size());
-}
-
-inline bool IsUtf(const char* input, size_t len) {
- return UTF8Detect(input, len) != NotUTF8;
-}
-
+}
+
+inline bool IsUtf(const char* input, size_t len) {
+ return UTF8Detect(input, len) != NotUTF8;
+}
+
inline bool IsUtf(const TStringBuf input) {
return IsUtf(input.data(), input.size());
-}
-
-//! returns true, if result is not the same as input, and put it in newString
-//! returns false, if result is unmodified
+}
+
+//! returns true, if result is not the same as input, and put it in newString
+//! returns false, if result is unmodified
bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString);
-
+
TString ToLowerUTF8(const TString& s);
TString ToLowerUTF8(TStringBuf s);
TString ToLowerUTF8(const char* s);