diff options
author | dpotapov <dpotapov@yandex-team.com> | 2023-01-16 21:39:14 +0300 |
---|---|---|
committer | dpotapov <dpotapov@yandex-team.com> | 2023-01-16 21:39:14 +0300 |
commit | 328635a6bd949596c49a33c9c2b67d00cc2704db (patch) | |
tree | 84104ccf9cd6c8cf47e1ac329076bf47dfabb052 /util/charset/wide_ut.cpp | |
parent | bfa024664d4edef47218bc0af66af681cfad9a88 (diff) | |
download | ydb-328635a6bd949596c49a33c9c2b67d00cc2704db.tar.gz |
charset: do not allow surrogate pairs in UTF-8
By [RFC3629 section 3](https://datatracker.ietf.org/doc/html/rfc3629#section-3):
```
The definition of UTF-8 prohibits encoding character numbers between
U+D800 and U+DFFF, which are reserved for use with the UTF-16
encoding form (as surrogate pairs) and do not directly represent characters.
```
The current implementation of `ReadUTF8CharAndAdvance` accepts `UTF-8`-encoded surrogate code points
such as 0xED 0xA0 0xBD or 0xED 0xB3 0x9A, leaving them in strings
that then cannot be processed by external programs like `iconv`.
This patch adds a `strict` template flag that disables this leniency.
This flag is not enabled by default, because Arcadia already has hundreds of
tests with inputs containing such surrogate pairs; these tests break in strict mode,
and there is a chance that prod might be affected too.
The SSE4 implementation doesn't perform any validation at all, so it is left unchanged.
Diffstat (limited to 'util/charset/wide_ut.cpp')
-rw-r--r-- | util/charset/wide_ut.cpp | 61 |
1 files changed, 59 insertions, 2 deletions
diff --git a/util/charset/wide_ut.cpp b/util/charset/wide_ut.cpp index d8f3233e73..b33dd0c0de 100644 --- a/util/charset/wide_ut.cpp +++ b/util/charset/wide_ut.cpp @@ -122,21 +122,23 @@ namespace { // } // } + template <StrictUTF8 strictMode = StrictUTF8::No> void CheckRecodeOK(wchar32 expected, unsigned char* first, size_t n) { wchar32 w = 0; const unsigned char* p = first; - RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, first + n); + RECODE_RESULT r = ReadUTF8CharAndAdvance<strictMode>(w, p, first + n); UNIT_ASSERT(w == expected); UNIT_ASSERT(size_t(p - first) == n); UNIT_ASSERT(r == RECODE_OK); } + template <StrictUTF8 strictMode = StrictUTF8::No> void CheckBrokenSymbol(unsigned char* first, unsigned char* last) { wchar32 w = 0; const unsigned char* p = first; - RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, last); + RECODE_RESULT r = ReadUTF8CharAndAdvance<strictMode>(w, p, last); UNIT_ASSERT(w == BROKEN_RUNE); UNIT_ASSERT(p - first == 0); UNIT_ASSERT(r == RECODE_BROKENSYMBOL); @@ -299,6 +301,61 @@ void TConversionTest::TestReadUTF8Char() { CheckEndOfInput(first, 1); } + // leading byte of 3-byte symbol before surrogates: 1110 0001 - 1110 1100 + for (c = 0xE1; c <= 0xEC; ++c) { + u = c; + CheckBrokenSymbol<StrictUTF8::Yes>(first, last); + + u |= 0x808000; + // w: 0000 0000 0000 0000 - 0000 0111 1100 0000 + e = c & LEAD_BITS_MASK_3_BYTES; + e <<= 12; + CheckRecodeOK<StrictUTF8::Yes>(e, first, 3); + + CheckEndOfInput(first, 2); + CheckEndOfInput(first, 1); + } + + // rest of allowed characters before surrogate block + { + u = 0xED; + CheckBrokenSymbol<StrictUTF8::Yes>(first, last); + + u |= 0xBF9F00; + e = 0xD7FF; + CheckRecodeOK<StrictUTF8::Yes>(e, first, 3); + + CheckEndOfInput(first, 2); + CheckEndOfInput(first, 1); + } + + // rfc3629 section 4 forbids characters 0xD800 - 0xDFFF + { + u = 0xED; + CheckBrokenSymbol<StrictUTF8::Yes>(first, last); + + u |= 0x80A000; + CheckBrokenSymbol<StrictUTF8::Yes>(first, last); + + CheckEndOfInput(first, 2); + 
CheckEndOfInput(first, 1); + } + + // leading byte of 3-byte symbol after surrogates: 1110 1110 - 1110 1111 + for (c = 0xEE; c <= 0xEF; ++c) { + u = c; + CheckBrokenSymbol<StrictUTF8::Yes>(first, last); + + u |= 0x808000; + // w: 0000 0000 0000 0000 - 0000 0111 1100 0000 + e = c & LEAD_BITS_MASK_3_BYTES; + e <<= 12; + CheckRecodeOK<StrictUTF8::Yes>(e, first, 3); + + CheckEndOfInput(first, 2); + CheckEndOfInput(first, 1); + } + // possible overlong encoding with leading byte 1111 0000 { u = c = 0xF0; |