diff options
| author | vvvv <[email protected]> | 2025-10-24 14:59:50 +0300 |
|---|---|---|
| committer | vvvv <[email protected]> | 2025-10-24 15:29:24 +0300 |
| commit | 5b0d18921f2a509d8363c40a5ca208dfed026287 (patch) | |
| tree | d1369c696d3a9e9a65b68d9208e198269a48cfbc /yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c | |
| parent | e7fbdb6e81ae4a296e710b133de7a2a04b31bbc4 (diff) | |
YQL-20567 upgrade PG up to 16.10 & fix instructions
init
commit_hash:81aba13295273281d19d2d332a48ff1c44977447
Diffstat (limited to 'yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c')
| -rw-r--r-- | yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c | 106 |
1 files changed, 99 insertions, 7 deletions
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c b/yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c index fbac11deb4d..82ea3a4e834 100644 --- a/yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c +++ b/yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c @@ -12,11 +12,32 @@ */ #include "c.h" +#include <limits.h> + #include "mb/pg_wchar.h" #include "utils/ascii.h" /* + * In today's multibyte encodings other than UTF8, this two-byte sequence + * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0. + * + * For historical reasons, several verifychar implementations opt to reject + * this pair specifically. Byte pair range constraints, in encoding + * originator documentation, always excluded this pair. No core conversion + * could translate it. However, longstanding verifychar implementations + * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate + * pairs not valid per encoding originator documentation. To avoid tightening + * core or non-core conversions in a security patch, we sought this one pair. + * + * PQescapeString() historically used spaces for BYTE1; many other values + * could suffice for BYTE1. + */ +#define NONUTF8_INVALID_BYTE0 (0x8d) +#define NONUTF8_INVALID_BYTE1 (' ') + + +/* * Operations on multi-byte encodings are driven by a table of helper * functions. * @@ -1526,6 +1547,11 @@ pg_big5_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -1575,6 +1601,11 @@ pg_gbk_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -1624,6 +1655,11 @@ pg_uhc_verifychar(const unsigned char *s, int len) if (len < l) return -1; + if (l == 2 && + s[0] == NONUTF8_INVALID_BYTE0 && + s[1] == NONUTF8_INVALID_BYTE1) + return -1; + while (--l > 0) { if (*++s == '\0') @@ -2069,6 +2105,19 @@ pg_utf8_islegal(const unsigned char *source, int length) /* + * Fills the provided buffer with two bytes such that: + * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0 + */ +void +pg_encoding_set_invalid(int encoding, char *dst) +{ + Assert(pg_encoding_max_length(encoding) > 1); + + dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0); + dst[1] = NONUTF8_INVALID_BYTE1; +} + +/* *------------------------------------------------------------------- * encoding info table * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h) @@ -2122,10 +2171,27 @@ const pg_wchar_tbl pg_wchar_table[] = { /* * Returns the byte length of a multibyte character. * - * Caution: when dealing with text that is not certainly valid in the - * specified encoding, the result may exceed the actual remaining - * string length. Callers that are not prepared to deal with that - * should use pg_encoding_mblen_bounded() instead. + * Choose "mblen" functions based on the input string characteristics. + * pg_encoding_mblen() can be used when ANY of these conditions are met: + * + * - The input string is zero-terminated + * + * - The input string is known to be valid in the encoding (e.g., string + * converted from database encoding) + * + * - The encoding is not GB18030 (e.g., when only database encodings are + * passed to 'encoding' parameter) + * + * encoding==GB18030 requires examining up to two bytes to determine character + * length. Therefore, callers satisfying none of those conditions must use + * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be + * guaranteed to be within allocation bounds. + * + * When dealing with text that is not certainly valid in the specified + * encoding, the result may exceed the actual remaining string length. + * Callers that are not prepared to deal with that should use Min(remaining, + * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and + * pg_encoding_mblen_bounded() are interchangeable. */ int pg_encoding_mblen(int encoding, const char *mbstr) @@ -2136,8 +2202,28 @@ pg_encoding_mblen(int encoding, const char *mbstr) } /* - * Returns the byte length of a multibyte character; but not more than - * the distance to end of string. + * Returns the byte length of a multibyte character (possibly not + * zero-terminated), or INT_MAX if too few bytes remain to determine a length. + */ +int +pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, + size_t remaining) +{ + /* + * Define zero remaining as too few, even for single-byte encodings. + * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read + * zero; others read one. + */ + if (remaining < 1 || + (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2)) + return INT_MAX; + return pg_encoding_mblen(encoding, mbstr); +} + +/* + * Returns the byte length of a multibyte character; but not more than the + * distance to the terminating zero byte. For input that might lack a + * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()). */ int pg_encoding_mblen_bounded(int encoding, const char *mbstr) @@ -2190,5 +2276,11 @@ pg_encoding_max_length(int encoding) { Assert(PG_VALID_ENCODING(encoding)); - return pg_wchar_table[encoding].maxmblen; + /* + * Check for the encoding despite the assert, due to some mingw versions + * otherwise issuing bogus warnings. + */ + return PG_VALID_ENCODING(encoding) ? + pg_wchar_table[encoding].maxmblen : + pg_wchar_table[PG_SQL_ASCII].maxmblen; } |
