Restoring authorship annotation for <alzobnin@yandex-team.ru>. Commit 2 of 2.

author: alzobnin <alzobnin@yandex-team.ru> 2022-02-10 16:46:50 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:46:50 +0300
commit: 5085152b94bf621933243a498def7f37d2e76b58 (patch)
tree: 49e222ea1c5804306084bb3ae065bb702625360f /util/charset/utf8.h
parent: c9317148cc3e9f1b0bc0ce95172f47e099f2c554 (diff)
download: ydb-5085152b94bf621933243a498def7f37d2e76b58.tar.gz
1 files changed, 328 insertions, 328 deletions
diff --git a/util/charset/utf8.h b/util/charset/utf8.h
index 871dcb8ecd..5039b46ae9 100644
--- a/util/charset/utf8.h
+++ b/util/charset/utf8.h
@@ -1,21 +1,21 @@
-#pragma once 
- 
-#include "recode_result.h" 
- 
-#include <util/generic/strbuf.h> 
+#pragma once
+
+#include "recode_result.h"
+
+#include <util/generic/strbuf.h>
 #include <util/generic/string.h>
-#include <util/generic/yexception.h> 
-#include <util/system/defaults.h> 
-#include <util/system/yassert.h> 
- 
-extern const wchar32 BROKEN_RUNE; 
- 
-inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) { 
+#include <util/generic/yexception.h>
+#include <util/system/defaults.h>
+#include <util/system/yassert.h>
+
+extern const wchar32 BROKEN_RUNE;
+
+inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
     // Y_ASSERT (utf8_rune_len <= 4);
-    return "\0\0\037\017\007"[utf8_rune_len]; 
-} 
- 
-inline size_t UTF8RuneLen(const unsigned char lead_byte) { 
+    return "\0\0\037\017\007"[utf8_rune_len];
+}
+
+inline size_t UTF8RuneLen(const unsigned char lead_byte) {
     //b0XXXXXXX
     if ((lead_byte & 0x80) == 0x00) {
         return 1;
@@ -34,343 +34,343 @@ inline size_t UTF8RuneLen(const unsigned char lead_byte) {
     }
     //b10XXXXXX
     return 0;
-} 
- 
-inline size_t UTF8RuneLenByUCS(wchar32 rune) { 
-    if (rune < 0x80) 
-        return 1U; 
-    else if (rune < 0x800) 
-        return 2U; 
-    else if (rune < 0x10000) 
-        return 3U; 
-    else if (rune < 0x200000) 
-        return 4U; 
-    else if (rune < 0x4000000) 
-        return 5U; 
-    else 
-        return 6U; 
-} 
- 
-inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) { 
-    rune = c; 
-    rune &= UTF8LeadByteMask(len); 
-} 
- 
-inline void PutUTF8SixBits(wchar32& rune, unsigned char c) { 
-    rune <<= 6; 
-    rune |= c & 0x3F; 
-} 
- 
-inline bool IsUTF8ContinuationByte(unsigned char c) { 
-    return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80); 
-} 
- 
-//! returns length of the current UTF8 character 
-//! @param n    length of the current character, it is assigned in case of valid UTF8 byte sequence 
-//! @param p    pointer to the current character 
-//! @param e    end of the character sequence 
-inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) { 
+}
+
+inline size_t UTF8RuneLenByUCS(wchar32 rune) {
+    if (rune < 0x80)
+        return 1U;
+    else if (rune < 0x800)
+        return 2U;
+    else if (rune < 0x10000)
+        return 3U;
+    else if (rune < 0x200000)
+        return 4U;
+    else if (rune < 0x4000000)
+        return 5U;
+    else
+        return 6U;
+}
+
+inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
+    rune = c;
+    rune &= UTF8LeadByteMask(len);
+}
+
+inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
+    rune <<= 6;
+    rune |= c & 0x3F;
+}
+
+inline bool IsUTF8ContinuationByte(unsigned char c) {
+    return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80);
+}
+
+//! returns length of the current UTF8 character
+//! @param n    length of the current character, it is assigned in case of valid UTF8 byte sequence
+//! @param p    pointer to the current character
+//! @param e    end of the character sequence
+inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
     Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
-    switch (UTF8RuneLen(*p)) { 
-        case 0: 
-            return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte 
- 
-        case 1: 
-            n = 1; 
-            return RECODE_OK; 
- 
-        case 2: 
-            if (p + 2 > e) { 
-                return RECODE_EOINPUT; 
-            } else if (!IsUTF8ContinuationByte(p[1])) { 
-                return RECODE_BROKENSYMBOL; 
-            } else { 
-                n = 2; 
-                return RECODE_OK; 
-            } 
-        case 3: 
-            if (p + 3 > e) { 
-                return RECODE_EOINPUT; 
+    switch (UTF8RuneLen(*p)) {
+        case 0:
+            return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+
+        case 1:
+            n = 1;
+            return RECODE_OK;
+
+        case 2:
+            if (p + 2 > e) {
+                return RECODE_EOINPUT;
+            } else if (!IsUTF8ContinuationByte(p[1])) {
+                return RECODE_BROKENSYMBOL;
+            } else {
+                n = 2;
+                return RECODE_OK;
+            }
+        case 3:
+            if (p + 3 > e) {
+                return RECODE_EOINPUT;
             } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
-                return RECODE_BROKENSYMBOL; 
-            } else { 
-                n = 3; 
-                return RECODE_OK; 
-            } 
-        default: // actually 4 
-            if (p + 4 > e) { 
-                return RECODE_EOINPUT; 
+                return RECODE_BROKENSYMBOL;
+            } else {
+                n = 3;
+                return RECODE_OK;
+            }
+        default: // actually 4
+            if (p + 4 > e) {
+                return RECODE_EOINPUT;
             } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
-                return RECODE_BROKENSYMBOL; 
-            } else { 
-                n = 4; 
-                return RECODE_OK; 
-            } 
-    } 
-} 
- 
-//! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong 
-//! @param text     UTF8 encoded text 
-//! @param len      the length of the text in bytes 
-//! @param number   number of encoded symbols in the text 
-inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) { 
-    const unsigned char* cur = reinterpret_cast<const unsigned char*>(text); 
-    const unsigned char* const last = cur + len; 
-    number = 0; 
-    size_t runeLen; 
-    bool res = true; 
-    while (cur != last) { 
-        if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only 
-            res = false; 
-            break; 
-        } 
-        cur += runeLen; 
+                return RECODE_BROKENSYMBOL;
+            } else {
+                n = 4;
+                return RECODE_OK;
+            }
+    }
+}
+
+//! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
+//! @param text     UTF8 encoded text
+//! @param len      the length of the text in bytes
+//! @param number   number of encoded symbols in the text
+inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
+    const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
+    const unsigned char* const last = cur + len;
+    number = 0;
+    size_t runeLen;
+    bool res = true;
+    while (cur != last) {
+        if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
+            res = false;
+            break;
+        }
+        cur += runeLen;
         Y_ASSERT(cur <= last);
-        ++number; 
-    } 
-    return res; 
-} 
- 
-inline size_t GetNumberOfUTF8Chars(TStringBuf text) { 
-    size_t number; 
+        ++number;
+    }
+    return res;
+}
+
+inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
+    size_t number;
     if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) {
         ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote();
-    } 
-    return number; 
-} 
- 
-//! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding 
-//! @param rune      value of the current character 
-//! @param rune_len  length of the UTF8 bytes sequence that has been read 
-//! @param s         pointer to the current character 
-//! @param end       the end of the character sequence 
-inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) { 
-    rune = BROKEN_RUNE; 
-    rune_len = 0; 
-    wchar32 _rune; 
- 
-    size_t _len = UTF8RuneLen(*s); 
-    if (s + _len > end) 
-        return RECODE_EOINPUT; //[EOINPUT] 
-    if (_len == 0) 
-        return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte 
-    _rune = *s++;                   //[00000000 0XXXXXXX] 
- 
-    if (_len > 1) { 
-        _rune &= UTF8LeadByteMask(_len); 
-        unsigned char ch = *s++; 
-        if (!IsUTF8ContinuationByte(ch)) 
-            return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte 
+    }
+    return number;
+}
+
+//! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
+//! @param rune      value of the current character
+//! @param rune_len  length of the UTF8 bytes sequence that has been read
+//! @param s         pointer to the current character
+//! @param end       the end of the character sequence
+inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
+    rune = BROKEN_RUNE;
+    rune_len = 0;
+    wchar32 _rune;
+
+    size_t _len = UTF8RuneLen(*s);
+    if (s + _len > end)
+        return RECODE_EOINPUT; //[EOINPUT]
+    if (_len == 0)
+        return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+    _rune = *s++;                   //[00000000 0XXXXXXX]
+
+    if (_len > 1) {
+        _rune &= UTF8LeadByteMask(_len);
+        unsigned char ch = *s++;
+        if (!IsUTF8ContinuationByte(ch))
+            return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
         PutUTF8SixBits(_rune, ch);      //[00000XXX XXYYYYYY]
-        if (_len > 2) { 
-            ch = *s++; 
-            if (!IsUTF8ContinuationByte(ch)) 
-                return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte 
+        if (_len > 2) {
+            ch = *s++;
+            if (!IsUTF8ContinuationByte(ch))
+                return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
             PutUTF8SixBits(_rune, ch);      //[XXXXYYYY YYZZZZZZ]
-            if (_len > 3) { 
-                ch = *s; 
-                if (!IsUTF8ContinuationByte(ch)) 
-                    return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte 
+            if (_len > 3) {
+                ch = *s;
+                if (!IsUTF8ContinuationByte(ch))
+                    return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
                 PutUTF8SixBits(_rune, ch);      //[XXXYY YYYYZZZZ ZZQQQQQQ]
                 if (_rune > 0x10FFFF)           // it is not a valid Unicode code point
-                    return RECODE_BROKENSYMBOL; 
-                if (_rune < 0x10000) // check for overlong encoding 
-                    return RECODE_BROKENSYMBOL; 
-            } else { 
-                if (_rune < 0x800) // check for overlong encoding 
-                    return RECODE_BROKENSYMBOL; 
-            } 
-        } else { 
-            if (_rune < 0x80) // check for overlong encoding 
-                return RECODE_BROKENSYMBOL; 
-        } 
-    } 
-    rune_len = _len; 
-    rune = _rune; 
-    return RECODE_OK; 
-} 
- 
-//! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character 
-//! @param c    value of the current character 
-//! @param p    pointer to the current character, it will be changed in case of valid UTF8 byte sequence 
-//! @param e    the end of the character sequence 
+                    return RECODE_BROKENSYMBOL;
+                if (_rune < 0x10000) // check for overlong encoding
+                    return RECODE_BROKENSYMBOL;
+            } else {
+                if (_rune < 0x800) // check for overlong encoding
+                    return RECODE_BROKENSYMBOL;
+            }
+        } else {
+            if (_rune < 0x80) // check for overlong encoding
+                return RECODE_BROKENSYMBOL;
+        }
+    }
+    rune_len = _len;
+    rune = _rune;
+    return RECODE_OK;
+}
+
+//! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
+//! @param c    value of the current character
+//! @param p    pointer to the current character, it will be changed in case of valid UTF8 byte sequence
+//! @param e    the end of the character sequence
 Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
     Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
-    switch (UTF8RuneLen(*p)) { 
-        case 0: 
-            rune = BROKEN_RUNE; 
-            return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte 
- 
-        case 1: 
-            rune = *p; //[00000000 0XXXXXXX] 
-            ++p; 
-            return RECODE_OK; 
- 
-        case 2: 
-            if (p + 2 > e) { 
-                return RECODE_EOINPUT; 
-            } else if (!IsUTF8ContinuationByte(p[1])) { 
-                rune = BROKEN_RUNE; 
-                return RECODE_BROKENSYMBOL; 
-            } else { 
-                PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX] 
-                PutUTF8SixBits(rune, *p++);     //[00000XXX XXYYYYYY] 
+    switch (UTF8RuneLen(*p)) {
+        case 0:
+            rune = BROKEN_RUNE;
+            return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+
+        case 1:
+            rune = *p; //[00000000 0XXXXXXX]
+            ++p;
+            return RECODE_OK;
+
+        case 2:
+            if (p + 2 > e) {
+                return RECODE_EOINPUT;
+            } else if (!IsUTF8ContinuationByte(p[1])) {
+                rune = BROKEN_RUNE;
+                return RECODE_BROKENSYMBOL;
+            } else {
+                PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
+                PutUTF8SixBits(rune, *p++);     //[00000XXX XXYYYYYY]
                 if (Y_UNLIKELY(rune < 0x80)) {  // overlong encoding
-                    p -= 2; 
-                    rune = BROKEN_RUNE; 
-                    return RECODE_BROKENSYMBOL; 
-                } 
-                return RECODE_OK; 
-            } 
-        case 3: 
-            if (p + 3 > e) { 
-                return RECODE_EOINPUT; 
+                    p -= 2;
+                    rune = BROKEN_RUNE;
+                    return RECODE_BROKENSYMBOL;
+                }
+                return RECODE_OK;
+            }
+        case 3:
+            if (p + 3 > e) {
+                return RECODE_EOINPUT;
             } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
-                rune = BROKEN_RUNE; 
-                return RECODE_BROKENSYMBOL; 
-            } else { 
-                PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX] 
-                PutUTF8SixBits(rune, *p++);     //[000000XX XXYYYYYY] 
-                PutUTF8SixBits(rune, *p++);     //[XXXXYYYY YYZZZZZZ] 
-                if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding 
-                    p -= 3; 
-                    rune = BROKEN_RUNE; 
-                    return RECODE_BROKENSYMBOL; 
-                } 
-                return RECODE_OK; 
-            } 
-        case 4: 
-            if (p + 4 > e) { 
-                return RECODE_EOINPUT; 
+                rune = BROKEN_RUNE;
+                return RECODE_BROKENSYMBOL;
+            } else {
+                PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
+                PutUTF8SixBits(rune, *p++);     //[000000XX XXYYYYYY]
+                PutUTF8SixBits(rune, *p++);     //[XXXXYYYY YYZZZZZZ]
+                if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding
+                    p -= 3;
+                    rune = BROKEN_RUNE;
+                    return RECODE_BROKENSYMBOL;
+                }
+                return RECODE_OK;
+            }
+        case 4:
+            if (p + 4 > e) {
+                return RECODE_EOINPUT;
             } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
-                rune = BROKEN_RUNE; 
-                return RECODE_BROKENSYMBOL; 
-            } else { 
+                rune = BROKEN_RUNE;
+                return RECODE_BROKENSYMBOL;
+            } else {
                 PutUTF8LeadBits(rune, *p++, 4);                      //[00000000 00000000 00000XXX]
                 PutUTF8SixBits(rune, *p++);                          //[00000000 0000000X XXYYYYYY]
                 PutUTF8SixBits(rune, *p++);                          //[00000000 0XXXYYYY YYZZZZZZ]
                 PutUTF8SixBits(rune, *p++);                          //[000XXXYY YYYYZZZZ ZZQQQQQQ]
-                if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point 
-                    p -= 4; 
-                    rune = BROKEN_RUNE; 
-                    return RECODE_BROKENSYMBOL; 
-                } 
-                return RECODE_OK; 
-            } 
-        default: // >4 
-            rune = BROKEN_RUNE; 
-            return RECODE_BROKENSYMBOL; 
-    } 
-} 
- 
-//! writes one unicode symbol into a character sequence encoded UTF8 
-//! checks for end of the buffer and returns the result of encoding 
-//! @param rune      value of the current character 
-//! @param rune_len  length of the UTF8 byte sequence that has been written 
-//! @param s         pointer to the output buffer 
-//! @param tail      available size of the buffer 
-inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) { 
-    rune_len = 0; 
-    if (rune < 0x80) { 
-        if (tail <= 0) 
-            return RECODE_EOOUTPUT; 
-        *s = static_cast<unsigned char>(rune); 
-        rune_len = 1; 
-        return RECODE_OK; 
-    } 
-    if (rune < 0x800) { 
-        if (tail <= 1) 
-            return RECODE_EOOUTPUT; 
-        *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6)); 
-        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); 
-        rune_len = 2; 
-        return RECODE_OK; 
-    } 
-    if (rune < 0x10000) { 
-        if (tail <= 2) 
-            return RECODE_EOOUTPUT; 
-        *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12)); 
-        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); 
-        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); 
-        rune_len = 3; 
-        return RECODE_OK; 
-    } 
-    /*if (rune < 0x200000)*/ { 
-        if (tail <= 3) 
-            return RECODE_EOOUTPUT; 
-        *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07)); 
-        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F)); 
-        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); 
-        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); 
-        rune_len = 4; 
-        return RECODE_OK; 
-    } 
-} 
- 
-inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) { 
-    return SafeWriteUTF8Char(rune, rune_len, s, end - s); 
-} 
- 
-//! writes one unicode symbol into a character sequence encoded UTF8 
-//! @attention       this function works as @c SafeWriteUTF8Char it does not check 
-//!                  the size of the output buffer, it supposes that buffer is long enough 
-//! @param rune      value of the current character 
-//! @param rune_len  length of the UTF8 byte sequence that has been written 
-//! @param s         pointer to the output buffer 
-inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) { 
-    if (rune < 0x80) { 
-        *s = static_cast<unsigned char>(rune); 
-        rune_len = 1; 
-        return; 
-    } 
-    if (rune < 0x800) { 
-        *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6)); 
-        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); 
-        rune_len = 2; 
-        return; 
-    } 
-    if (rune < 0x10000) { 
-        *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12)); 
-        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); 
-        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); 
-        rune_len = 3; 
-        return; 
-    } 
-    /*if (rune < 0x200000)*/ { 
-        *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07)); 
-        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F)); 
-        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F)); 
-        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F)); 
-        rune_len = 4; 
-    } 
-} 
- 
+                if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point
+                    p -= 4;
+                    rune = BROKEN_RUNE;
+                    return RECODE_BROKENSYMBOL;
+                }
+                return RECODE_OK;
+            }
+        default: // >4
+            rune = BROKEN_RUNE;
+            return RECODE_BROKENSYMBOL;
+    }
+}
+
+//! writes one unicode symbol into a character sequence encoded UTF8
+//! checks for end of the buffer and returns the result of encoding
+//! @param rune      value of the current character
+//! @param rune_len  length of the UTF8 byte sequence that has been written
+//! @param s         pointer to the output buffer
+//! @param tail      available size of the buffer
+inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
+    rune_len = 0;
+    if (rune < 0x80) {
+        if (tail <= 0)
+            return RECODE_EOOUTPUT;
+        *s = static_cast<unsigned char>(rune);
+        rune_len = 1;
+        return RECODE_OK;
+    }
+    if (rune < 0x800) {
+        if (tail <= 1)
+            return RECODE_EOOUTPUT;
+        *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
+        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+        rune_len = 2;
+        return RECODE_OK;
+    }
+    if (rune < 0x10000) {
+        if (tail <= 2)
+            return RECODE_EOOUTPUT;
+        *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
+        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
+        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+        rune_len = 3;
+        return RECODE_OK;
+    }
+    /*if (rune < 0x200000)*/ {
+        if (tail <= 3)
+            return RECODE_EOOUTPUT;
+        *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
+        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
+        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
+        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+        rune_len = 4;
+        return RECODE_OK;
+    }
+}
+
+inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
+    return SafeWriteUTF8Char(rune, rune_len, s, end - s);
+}
+
+//! writes one unicode symbol into a character sequence encoded UTF8
+//! @attention       this function works as @c SafeWriteUTF8Char it does not check
+//!                  the size of the output buffer, it supposes that buffer is long enough
+//! @param rune      value of the current character
+//! @param rune_len  length of the UTF8 byte sequence that has been written
+//! @param s         pointer to the output buffer
+inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
+    if (rune < 0x80) {
+        *s = static_cast<unsigned char>(rune);
+        rune_len = 1;
+        return;
+    }
+    if (rune < 0x800) {
+        *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
+        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+        rune_len = 2;
+        return;
+    }
+    if (rune < 0x10000) {
+        *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
+        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
+        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+        rune_len = 3;
+        return;
+    }
+    /*if (rune < 0x200000)*/ {
+        *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
+        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
+        *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
+        *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
+        rune_len = 4;
+    }
+}
+
 TStringBuf SubstrUTF8(const TStringBuf str, size_t pos, size_t len);
- 
-enum EUTF8Detect { 
-    NotUTF8, 
-    UTF8, 
-    ASCII 
-}; 
- 
-EUTF8Detect UTF8Detect(const char* s, size_t len); 
- 
+
+enum EUTF8Detect {
+    NotUTF8,
+    UTF8,
+    ASCII
+};
+
+EUTF8Detect UTF8Detect(const char* s, size_t len);
+
 inline EUTF8Detect UTF8Detect(const TStringBuf input) {
     return UTF8Detect(input.data(), input.size());
-} 
- 
-inline bool IsUtf(const char* input, size_t len) { 
-    return UTF8Detect(input, len) != NotUTF8; 
-} 
- 
+}
+
+inline bool IsUtf(const char* input, size_t len) {
+    return UTF8Detect(input, len) != NotUTF8;
+}
+
 inline bool IsUtf(const TStringBuf input) {
     return IsUtf(input.data(), input.size());
-} 
- 
-//! returns true, if result is not the same as input, and put it in newString 
-//! returns false, if result is unmodified 
+}
+
+//! returns true, if result is not the same as input, and put it in newString
+//! returns false, if result is unmodified
 bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString);
- 
+
 TString ToLowerUTF8(const TString& s);
 TString ToLowerUTF8(TStringBuf s);
 TString ToLowerUTF8(const char* s);
author	alzobnin <alzobnin@yandex-team.ru>	2022-02-10 16:46:50 +0300
committer	Daniil Cherednik <dcherednik@yandex-team.ru>	2022-02-10 16:46:50 +0300
commit	5085152b94bf621933243a498def7f37d2e76b58 (patch)
tree	49e222ea1c5804306084bb3ae065bb702625360f /util/charset/utf8.h
parent	c9317148cc3e9f1b0bc0ce95172f47e099f2c554 (diff)
download	ydb-5085152b94bf621933243a498def7f37d2e76b58.tar.gz