aboutsummaryrefslogtreecommitdiffstats
path: root/util/charset/utf8.h
diff options
context:
space:
mode:
author: Alexander Smirnov <alex@ydb.tech>, 2024-11-20 11:14:58 +0000
committer: Alexander Smirnov <alex@ydb.tech>, 2024-11-20 11:14:58 +0000
commit: 31773f157bf8164364649b5f470f52dece0a4317 (patch)
tree: 33d0f7eef45303ab68cf08ab381ce5e5e36c5240 /util/charset/utf8.h
parent: 2c7938962d8689e175574fc1e817c05049f27905 (diff)
parent: eff600952d5dfe17942f38f510a8ac2b203bb3a5 (diff)
download: ydb-31773f157bf8164364649b5f470f52dece0a4317.tar.gz
Merge branch 'rightlib' into mergelibs-241120-1113
Diffstat (limited to 'util/charset/utf8.h')
-rw-r--r-- util/charset/utf8.h | 57
1 files changed, 35 insertions, 22 deletions
diff --git a/util/charset/utf8.h b/util/charset/utf8.h
index b105d8db9d..c1ffdd072f 100644
--- a/util/charset/utf8.h
+++ b/util/charset/utf8.h
@@ -37,18 +37,19 @@ inline size_t UTF8RuneLen(const unsigned char lead_byte) {
}
inline size_t UTF8RuneLenByUCS(wchar32 rune) {
- if (rune < 0x80)
+ if (rune < 0x80) {
return 1U;
- else if (rune < 0x800)
+ } else if (rune < 0x800) {
return 2U;
- else if (rune < 0x10000)
+ } else if (rune < 0x10000) {
return 3U;
- else if (rune < 0x200000)
+ } else if (rune < 0x200000) {
return 4U;
- else if (rune < 0x4000000)
+ } else if (rune < 0x4000000) {
return 5U;
- else
+ } else {
return 6U;
+ }
}
inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
@@ -193,37 +194,45 @@ inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const uns
wchar32 _rune;
size_t _len = UTF8RuneLen(*s);
- if (s + _len > end)
+ if (s + _len > end) {
return RECODE_EOINPUT; // [EOINPUT]
- if (_len == 0)
+ }
+ if (_len == 0) {
return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
- _rune = *s++; // [00000000 0XXXXXXX]
+ }
+ _rune = *s++; // [00000000 0XXXXXXX]
if (_len > 1) {
_rune &= UTF8LeadByteMask(_len);
unsigned char ch = *s++;
- if (!IsUTF8ContinuationByte(ch))
+ if (!IsUTF8ContinuationByte(ch)) {
return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in second byte
- PutUTF8SixBits(_rune, ch); // [00000XXX XXYYYYYY]
+ }
+ PutUTF8SixBits(_rune, ch); // [00000XXX XXYYYYYY]
if (_len > 2) {
ch = *s++;
- if (!IsUTF8ContinuationByte(ch))
+ if (!IsUTF8ContinuationByte(ch)) {
return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in third byte
- PutUTF8SixBits(_rune, ch); // [XXXXYYYY YYZZZZZZ]
+ }
+ PutUTF8SixBits(_rune, ch); // [XXXXYYYY YYZZZZZZ]
if (_len > 3) {
ch = *s;
- if (!IsUTF8ContinuationByte(ch))
+ if (!IsUTF8ContinuationByte(ch)) {
return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in fourth byte
- PutUTF8SixBits(_rune, ch); // [XXXYY YYYYZZZZ ZZQQQQQQ]
- if (!IsValidUTF8Rune<4, strictMode>(_rune))
+ }
+ PutUTF8SixBits(_rune, ch); // [XXXYY YYYYZZZZ ZZQQQQQQ]
+ if (!IsValidUTF8Rune<4, strictMode>(_rune)) {
return RECODE_BROKENSYMBOL;
+ }
} else {
- if (!IsValidUTF8Rune<3, strictMode>(_rune))
+ if (!IsValidUTF8Rune<3, strictMode>(_rune)) {
return RECODE_BROKENSYMBOL;
+ }
}
} else {
- if (!IsValidUTF8Rune<2, strictMode>(_rune))
+ if (!IsValidUTF8Rune<2, strictMode>(_rune)) {
return RECODE_BROKENSYMBOL;
+ }
}
}
rune_len = _len;
@@ -315,23 +324,26 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne
inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
rune_len = 0;
if (rune < 0x80) {
- if (tail <= 0)
+ if (tail <= 0) {
return RECODE_EOOUTPUT;
+ }
*s = static_cast<unsigned char>(rune);
rune_len = 1;
return RECODE_OK;
}
if (rune < 0x800) {
- if (tail <= 1)
+ if (tail <= 1) {
return RECODE_EOOUTPUT;
+ }
*s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
*s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
rune_len = 2;
return RECODE_OK;
}
if (rune < 0x10000) {
- if (tail <= 2)
+ if (tail <= 2) {
return RECODE_EOOUTPUT;
+ }
*s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
*s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
@@ -339,8 +351,9 @@ inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned
return RECODE_OK;
}
/*if (rune < 0x200000)*/ {
- if (tail <= 3)
+ if (tail <= 3) {
return RECODE_EOOUTPUT;
+ }
*s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));