charset: do not allow surrogate pairs in UTF-8

By [RFC3629 section 3](https://datatracker.ietf.org/doc/html/rfc3629#section-3): ``` The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF, which are reserved for use with the UTF-16 encoding form (as surrogate pairs) and do not directly represent characters. ``` The current implementation of `ReadUTF8CharAndAdvance` accepts UTF-8-encoded surrogate code points such as 0xED 0xA0 0xBD or 0xED 0xB3 0x9A, leaving them in strings that cannot be processed afterwards by external programs like `iconv`. This patch adds a `StrictUTF8` template flag (`strictMode`) that disables this leniency. The flag is not enabled by default, because Arcadia already has hundreds of tests whose inputs contain such surrogates; these tests break in strict mode, and there is a chance that production might be affected too. The SSE4 implementation doesn't perform any validation at all, so it is left unchanged.
author: dpotapov <dpotapov@yandex-team.com> 2023-01-16 21:39:14 +0300
committer: dpotapov <dpotapov@yandex-team.com> 2023-01-16 21:39:14 +0300
commit: 328635a6bd949596c49a33c9c2b67d00cc2704db (patch)
tree: 84104ccf9cd6c8cf47e1ac329076bf47dfabb052 /util
parent: bfa024664d4edef47218bc0af66af681cfad9a88 (diff)
download: ydb-328635a6bd949596c49a33c9c2b67d00cc2704db.tar.gz
2 files changed, 114 insertions, 14 deletions
diff --git a/util/charset/utf8.h b/util/charset/utf8.h
index 5039b46ae9..76c1f94078 100644
--- a/util/charset/utf8.h
+++ b/util/charset/utf8.h
@@ -139,11 +139,54 @@ inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
     return number;
 }
 
+enum class StrictUTF8 {
+    Yes,
+    No
+};
+
+template <size_t runeLen, StrictUTF8 strictMode>
+inline bool IsValidUTF8Rune(wchar32 rune);
+
+template <>
+inline bool IsValidUTF8Rune<2, StrictUTF8::Yes>(wchar32 rune) {
+    // check for overlong encoding
+    return rune >= 0x80;
+}
+
+template <>
+inline bool IsValidUTF8Rune<2, StrictUTF8::No>(wchar32 rune) {
+    return IsValidUTF8Rune<2, StrictUTF8::Yes>(rune);
+}
+
+template <>
+inline bool IsValidUTF8Rune<3, StrictUTF8::Yes>(wchar32 rune) {
+    // surrogates are forbidden by RFC3629 section 3
+    return rune >= 0x800 && (rune < 0xD800 || rune > 0xDFFF);
+}
+
+template <>
+inline bool IsValidUTF8Rune<3, StrictUTF8::No>(wchar32 rune) {
+    // check for overlong encoding
+    return rune >= 0x800;
+}
+
+template <>
+inline bool IsValidUTF8Rune<4, StrictUTF8::Yes>(wchar32 rune) {
+    // check if this is a valid symbol without overlong encoding
+    return rune <= 0x10FFFF && rune >= 0x10000;
+}
+
+template <>
+inline bool IsValidUTF8Rune<4, StrictUTF8::No>(wchar32 rune) {
+    return IsValidUTF8Rune<4, StrictUTF8::Yes>(rune);
+}
+
 //! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
 //! @param rune      value of the current character
 //! @param rune_len  length of the UTF8 bytes sequence that has been read
 //! @param s         pointer to the current character
 //! @param end       the end of the character sequence
+template <StrictUTF8 strictMode = StrictUTF8::No>
 inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
     rune = BROKEN_RUNE;
     rune_len = 0;
@@ -172,16 +215,14 @@ inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const uns
                 if (!IsUTF8ContinuationByte(ch))
                     return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
                 PutUTF8SixBits(_rune, ch);      //[XXXYY YYYYZZZZ ZZQQQQQQ]
-                if (_rune > 0x10FFFF)           // it is not a valid Unicode code point
-                    return RECODE_BROKENSYMBOL;
-                if (_rune < 0x10000) // check for overlong encoding
+                if (!IsValidUTF8Rune<4, strictMode>(_rune))
                     return RECODE_BROKENSYMBOL;
             } else {
-                if (_rune < 0x800) // check for overlong encoding
+                if (!IsValidUTF8Rune<3, strictMode>(_rune))
                     return RECODE_BROKENSYMBOL;
             }
         } else {
-            if (_rune < 0x80) // check for overlong encoding
+            if (!IsValidUTF8Rune<2, strictMode>(_rune))
                 return RECODE_BROKENSYMBOL;
         }
     }
@@ -194,6 +235,7 @@ inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const uns
 //! @param c    value of the current character
 //! @param p    pointer to the current character, it will be changed in case of valid UTF8 byte sequence
 //! @param e    the end of the character sequence
+template <StrictUTF8 strictMode = StrictUTF8::No>
 Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
     Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
     switch (UTF8RuneLen(*p)) {
@@ -215,7 +257,7 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne
             } else {
                 PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
                 PutUTF8SixBits(rune, *p++);     //[00000XXX XXYYYYYY]
-                if (Y_UNLIKELY(rune < 0x80)) {  // overlong encoding
+                if (!IsValidUTF8Rune<2, strictMode>(rune)) {
                     p -= 2;
                     rune = BROKEN_RUNE;
                     return RECODE_BROKENSYMBOL;
@@ -232,7 +274,8 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne
                 PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
                 PutUTF8SixBits(rune, *p++);     //[000000XX XXYYYYYY]
                 PutUTF8SixBits(rune, *p++);     //[XXXXYYYY YYZZZZZZ]
-                if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding
+                // check for overlong encoding and surrogates
+                if (!IsValidUTF8Rune<3, strictMode>(rune)) {
                     p -= 3;
                     rune = BROKEN_RUNE;
                     return RECODE_BROKENSYMBOL;
@@ -246,11 +289,11 @@ Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigne
                 rune = BROKEN_RUNE;
                 return RECODE_BROKENSYMBOL;
             } else {
-                PutUTF8LeadBits(rune, *p++, 4);                      //[00000000 00000000 00000XXX]
-                PutUTF8SixBits(rune, *p++);                          //[00000000 0000000X XXYYYYYY]
-                PutUTF8SixBits(rune, *p++);                          //[00000000 0XXXYYYY YYZZZZZZ]
-                PutUTF8SixBits(rune, *p++);                          //[000XXXYY YYYYZZZZ ZZQQQQQQ]
-                if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point
+                PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX]
+                PutUTF8SixBits(rune, *p++);     //[00000000 0000000X XXYYYYYY]
+                PutUTF8SixBits(rune, *p++);     //[00000000 0XXXYYYY YYZZZZZZ]
+                PutUTF8SixBits(rune, *p++);     //[000XXXYY YYYYZZZZ ZZQQQQQQ]
+                if (!IsValidUTF8Rune<4, strictMode>(rune)) {
                     p -= 4;
                     rune = BROKEN_RUNE;
                     return RECODE_BROKENSYMBOL;
diff --git a/util/charset/wide_ut.cpp b/util/charset/wide_ut.cpp
index d8f3233e73..b33dd0c0de 100644
--- a/util/charset/wide_ut.cpp
+++ b/util/charset/wide_ut.cpp
@@ -122,21 +122,23 @@ namespace {
     //        }
     //    }
 
+    template <StrictUTF8 strictMode = StrictUTF8::No>
     void CheckRecodeOK(wchar32 expected, unsigned char* first, size_t n) {
         wchar32 w = 0;
         const unsigned char* p = first;
 
-        RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, first + n);
+        RECODE_RESULT r = ReadUTF8CharAndAdvance<strictMode>(w, p, first + n);
         UNIT_ASSERT(w == expected);
         UNIT_ASSERT(size_t(p - first) == n);
         UNIT_ASSERT(r == RECODE_OK);
     }
 
+    template <StrictUTF8 strictMode = StrictUTF8::No>
     void CheckBrokenSymbol(unsigned char* first, unsigned char* last) {
         wchar32 w = 0;
         const unsigned char* p = first;
 
-        RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, last);
+        RECODE_RESULT r = ReadUTF8CharAndAdvance<strictMode>(w, p, last);
         UNIT_ASSERT(w == BROKEN_RUNE);
         UNIT_ASSERT(p - first == 0);
         UNIT_ASSERT(r == RECODE_BROKENSYMBOL);
@@ -299,6 +301,61 @@ void TConversionTest::TestReadUTF8Char() {
         CheckEndOfInput(first, 1);
     }
 
+    // leading byte of 3-byte symbol before surrogates: 1110 0001 - 1110 1100
+    for (c = 0xE1; c <= 0xEC; ++c) {
+        u = c;
+        CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
+
+        u |= 0x808000;
+        // w: 0001 0000  0000 0000 - 1100 0000  0000 0000
+        e = c & LEAD_BITS_MASK_3_BYTES;
+        e <<= 12;
+        CheckRecodeOK<StrictUTF8::Yes>(e, first, 3);
+
+        CheckEndOfInput(first, 2);
+        CheckEndOfInput(first, 1);
+    }
+
+    // rest of allowed characters before surrogate block
+    {
+        u = 0xED;
+        CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
+
+        u |= 0xBF9F00;
+        e = 0xD7FF;
+        CheckRecodeOK<StrictUTF8::Yes>(e, first, 3);
+
+        CheckEndOfInput(first, 2);
+        CheckEndOfInput(first, 1);
+    }
+
+    // rfc3629 section 4 forbids characters 0xD800 - 0xDFFF
+    {
+        u = 0xED;
+        CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
+
+        u |= 0x80A000;
+        CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
+
+        CheckEndOfInput(first, 2);
+        CheckEndOfInput(first, 1);
+    }
+
+    // leading byte of 3-byte symbol after surrogates: 1110 1110 - 1110 1111
+    for (c = 0xEE; c <= 0xEF; ++c) {
+        u = c;
+        CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
+
+        u |= 0x808000;
+        // w: 1110 0000  0000 0000 - 1111 0000  0000 0000
+        e = c & LEAD_BITS_MASK_3_BYTES;
+        e <<= 12;
+        CheckRecodeOK<StrictUTF8::Yes>(e, first, 3);
+
+        CheckEndOfInput(first, 2);
+        CheckEndOfInput(first, 1);
+    }
+
     // possible overlong encoding with leading byte 1111 0000
     {
         u = c = 0xF0;
author	dpotapov <dpotapov@yandex-team.com>	2023-01-16 21:39:14 +0300
committer	dpotapov <dpotapov@yandex-team.com>	2023-01-16 21:39:14 +0300
commit	328635a6bd949596c49a33c9c2b67d00cc2704db (patch)
tree	84104ccf9cd6c8cf47e1ac329076bf47dfabb052 /util
parent	bfa024664d4edef47218bc0af66af681cfad9a88 (diff)
download	ydb-328635a6bd949596c49a33c9c2b67d00cc2704db.tar.gz