YQL-20567 upgrade PG up to 16.10 & fix instructions

init commit_hash:81aba13295273281d19d2d332a48ff1c44977447
author: vvvv <[email protected]> 2025-10-24 14:59:50 +0300
committer: vvvv <[email protected]> 2025-10-24 15:29:24 +0300
commit: 5b0d18921f2a509d8363c40a5ca208dfed026287 (patch)
tree: d1369c696d3a9e9a65b68d9208e198269a48cfbc /yql/essentials/parser/pg_wrapper/postgresql/src/common
parent: e7fbdb6e81ae4a296e710b133de7a2a04b31bbc4 (diff)
4 files changed, 113 insertions, 12 deletions
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/common/jsonapi.c b/yql/essentials/parser/pg_wrapper/postgresql/src/common/jsonapi.c
index 168001b0f3b..35d0e53b398 100644
--- a/yql/essentials/parser/pg_wrapper/postgresql/src/common/jsonapi.c
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/common/jsonapi.c
@@ -721,8 +721,11 @@ json_lex_string(JsonLexContext *lex)
 	} while (0)
 #define FAIL_AT_CHAR_END(code) \
 	do { \
-		lex->token_terminator = \
-			s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
+		ptrdiff_t	remaining = end - s; \
+		int			charlen; \
+		charlen = pg_encoding_mblen_or_incomplete(lex->input_encoding, \
+												  s, remaining); \
+		lex->token_terminator = (charlen <= remaining) ? s + charlen : end; \
 		return code; \
 	} while (0)
 
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/common/saslprep.c b/yql/essentials/parser/pg_wrapper/postgresql/src/common/saslprep.c
index 3cf498866a5..e7e909a0c87 100644
--- a/yql/essentials/parser/pg_wrapper/postgresql/src/common/saslprep.c
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/common/saslprep.c
@@ -21,8 +21,13 @@
  */
 #ifndef FRONTEND
 #include "postgres.h"
+#include "utils/memutils.h"
 #else
 #include "postgres_fe.h"
+
+/* It's possible we could use a different value for this in frontend code */
+#define MaxAllocSize	((Size) 0x3fffffff) /* 1 gigabyte - 1 */
+
 #endif
 
 #include "common/saslprep.h"
@@ -1077,6 +1082,8 @@ pg_saslprep(const char *input, char **output)
 	input_size = pg_utf8_string_len(input);
 	if (input_size < 0)
 		return SASLPREP_INVALID_UTF8;
+	if (input_size >= MaxAllocSize / sizeof(pg_wchar))
+		goto oom;
 
 	input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
 	if (!input_chars)
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/common/scram-common.c b/yql/essentials/parser/pg_wrapper/postgresql/src/common/scram-common.c
index 6448564a08c..38c97ef0b4c 100644
--- a/yql/essentials/parser/pg_wrapper/postgresql/src/common/scram-common.c
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/common/scram-common.c
@@ -74,7 +74,7 @@ scram_SaltedPassword(const char *password,
 	memcpy(result, Ui_prev, key_length);
 
 	/* Subsequent iterations */
-	for (i = 2; i <= iterations; i++)
+	for (i = 1; i < iterations; i++)
 	{
 #ifndef FRONTEND
 		/*
@@ -200,8 +200,7 @@ scram_ServerKey(const uint8 *salted_password,
  *
  * The password should already have been processed with SASLprep, if necessary!
  *
- * If iterations is 0, default number of iterations is used.  The result is
- * palloc'd or malloc'd, so caller is responsible for freeing it.
+ * The result is palloc'd or malloc'd, so caller is responsible for freeing it.
  *
  * On error, returns NULL and sets *errstr to point to a message about the
  * error details.
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c b/yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c
index fbac11deb4d..82ea3a4e834 100644
--- a/yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/common/wchar.c
@@ -12,11 +12,32 @@
  */
 #include "c.h"
 
+#include <limits.h>
+
 #include "mb/pg_wchar.h"
 #include "utils/ascii.h"
 
 
 /*
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
+ *
+ * For historical reasons, several verifychar implementations opt to reject
+ * this pair specifically.  Byte pair range constraints, in encoding
+ * originator documentation, always excluded this pair.  No core conversion
+ * could translate it.  However, longstanding verifychar implementations
+ * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
+ * pairs not valid per encoding originator documentation.  To avoid tightening
+ * core or non-core conversions in a security patch, we sought this one pair.
+ *
+ * PQescapeString() historically used spaces for BYTE1; many other values
+ * could suffice for BYTE1.
+ */
+#define NONUTF8_INVALID_BYTE0 (0x8d)
+#define NONUTF8_INVALID_BYTE1 (' ')
+
+
+/*
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
  *
@@ -1526,6 +1547,11 @@ pg_big5_verifychar(const unsigned char *s, int len)
 	if (len < l)
 		return -1;
 
+	if (l == 2 &&
+		s[0] == NONUTF8_INVALID_BYTE0 &&
+		s[1] == NONUTF8_INVALID_BYTE1)
+		return -1;
+
 	while (--l > 0)
 	{
 		if (*++s == '\0')
@@ -1575,6 +1601,11 @@ pg_gbk_verifychar(const unsigned char *s, int len)
 	if (len < l)
 		return -1;
 
+	if (l == 2 &&
+		s[0] == NONUTF8_INVALID_BYTE0 &&
+		s[1] == NONUTF8_INVALID_BYTE1)
+		return -1;
+
 	while (--l > 0)
 	{
 		if (*++s == '\0')
@@ -1624,6 +1655,11 @@ pg_uhc_verifychar(const unsigned char *s, int len)
 	if (len < l)
 		return -1;
 
+	if (l == 2 &&
+		s[0] == NONUTF8_INVALID_BYTE0 &&
+		s[1] == NONUTF8_INVALID_BYTE1)
+		return -1;
+
 	while (--l > 0)
 	{
 		if (*++s == '\0')
@@ -2069,6 +2105,19 @@ pg_utf8_islegal(const unsigned char *source, int length)
 
 
 /*
+ * Fills the provided buffer with two bytes such that:
+ *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
+ */
+void
+pg_encoding_set_invalid(int encoding, char *dst)
+{
+	Assert(pg_encoding_max_length(encoding) > 1);
+	/* Exactly dst[0] and dst[1] are written; only the first byte varies. */
+	dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
+	dst[1] = NONUTF8_INVALID_BYTE1;
+}
+
+/*
  *-------------------------------------------------------------------
  * encoding info table
  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
@@ -2122,10 +2171,27 @@ const pg_wchar_tbl pg_wchar_table[] = {
 /*
  * Returns the byte length of a multibyte character.
  *
- * Caution: when dealing with text that is not certainly valid in the
- * specified encoding, the result may exceed the actual remaining
- * string length.  Callers that are not prepared to deal with that
- * should use pg_encoding_mblen_bounded() instead.
+ * Choose "mblen" functions based on the input string characteristics.
+ * pg_encoding_mblen() can be used when ANY of these conditions are met:
+ *
+ * - The input string is zero-terminated
+ *
+ * - The input string is known to be valid in the encoding (e.g., string
+ *   converted from database encoding)
+ *
+ * - The encoding is not GB18030 (e.g., when only database encodings are
+ *   passed to 'encoding' parameter)
+ *
+ * encoding==GB18030 requires examining up to two bytes to determine character
+ * length.  Therefore, callers satisfying none of those conditions must use
+ * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
+ * guaranteed to be within allocation bounds.
+ *
+ * When dealing with text that is not certainly valid in the specified
+ * encoding, the result may exceed the actual remaining string length.
+ * Callers that are not prepared to deal with that should use Min(remaining,
+ * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
+ * pg_encoding_mblen_bounded() are interchangeable.
  */
 int
 pg_encoding_mblen(int encoding, const char *mbstr)
@@ -2136,8 +2202,28 @@ pg_encoding_mblen(int encoding, const char *mbstr)
 }
 
 /*
- * Returns the byte length of a multibyte character; but not more than
- * the distance to end of string.
+ * Returns the byte length of a multibyte character (possibly not
+ * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
+ */
+int
+pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
+								size_t remaining)
+{
+	/*
+	 * Treat zero remaining bytes as too few, even for single-byte encodings.
+	 * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
+	 * zero; others read one.  INT_MAX means the length cannot be determined.
+	 */
+	if (remaining < 1 ||
+		(encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
+		return INT_MAX;
+	return pg_encoding_mblen(encoding, mbstr);
+}
+
+/*
+ * Returns the byte length of a multibyte character; but not more than the
+ * distance to the terminating zero byte.  For input that might lack a
+ * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
  */
 int
 pg_encoding_mblen_bounded(int encoding, const char *mbstr)
@@ -2190,5 +2276,11 @@ pg_encoding_max_length(int encoding)
 {
 	Assert(PG_VALID_ENCODING(encoding));
 
-	return pg_wchar_table[encoding].maxmblen;
+	/*
+	 * Check for the encoding despite the assert, due to some mingw versions
+	 * otherwise issuing bogus warnings.
+	 */
+	return PG_VALID_ENCODING(encoding) ?
+		pg_wchar_table[encoding].maxmblen :
+		pg_wchar_table[PG_SQL_ASCII].maxmblen;
 }
author	vvvv <[email protected]>	2025-10-24 14:59:50 +0300
committer	vvvv <[email protected]>	2025-10-24 15:29:24 +0300
commit	5b0d18921f2a509d8363c40a5ca208dfed026287 (patch)
tree	d1369c696d3a9e9a65b68d9208e198269a48cfbc /yql/essentials/parser/pg_wrapper/postgresql/src/common
parent	e7fbdb6e81ae4a296e710b133de7a2a04b31bbc4 (diff)