With GB18030, prevent SIGSEGV from reading past end of allocation.

With GB18030 as source encoding, applications could crash the server via SQL functions convert() or convert_from(). Applications themselves could crash after passing unterminated GB18030 input to libpq functions PQescapeLiteral(), PQescapeIdentifier(), PQescapeStringConn(), or PQescapeString(). Extension code could crash by passing unterminated GB18030 input to jsonapi.h functions. All those functions have been intended to handle untrusted, unterminated input safely.

A crash required allocating the input such that the last byte of the allocation was the last byte of a virtual memory page. Some malloc() implementations take measures against that, making the SIGSEGV hard to reach. Back-patch to v13 (all supported versions).

Author: Noah Misch <[email protected]>
Author: Andres Freund <[email protected]>
Reviewed-by: Masahiko Sawada <[email protected]>
Backpatch-through: 13
Security: CVE-2025-4207
author: Noah Misch 2025-05-05 11:52:04 +0000
committer: Noah Misch 2025-05-05 11:52:04 +0000
commit: 627acc3caa74caa736b2c5587e944d2ea510ea67 (patch)
tree: 7aed8aec94c841de34fad669bd8b4376ff15c1a3 /src/test
parent: 5be213caaa1a9a65dfdbbf400b6a53b5e743b8d1 (diff)
3 files changed, 110 insertions, 6 deletions
diff --git a/src/test/modules/test_escape/test_escape.c b/src/test/modules/test_escape/test_escape.c
index ffd9d7166fa..59430ed46c4 100644
--- a/src/test/modules/test_escape/test_escape.c
+++ b/src/test/modules/test_escape/test_escape.c
@@ -12,6 +12,7 @@
 #include <string.h>
 #include <stdio.h>
 
+#include "common/jsonapi.h"
 #include "fe_utils/psqlscan.h"
 #include "fe_utils/string_utils.h"
 #include "getopt_long.h"
@@ -164,6 +165,88 @@ encoding_conflicts_ascii(int encoding)
 }
 
 
+/*
+ * Confirm escaping doesn't read past the end of an allocation.  Consider the
+ * result of malloc(4096), in the absence of freelist entries satisfying the
+ * allocation.  On OpenBSD, reading one byte past the end of that object
+ * yields SIGSEGV.
+ *
+ * Run this test before the program's other tests, so freelists are minimal.
+ * len=4096 didn't SIGSEGV, likely due to free() calls in libpq.  len=8192
+ * did.  Use 128 KiB, to somewhat insulate the outcome from distant new free()
+ * calls and libc changes.
+ */
+static void
+test_gb18030_page_multiple(pe_test_config *tc)
+{
+	PQExpBuffer testname;
+	size_t		input_len = 0x20000;
+	char	   *input;
+
+	/* prepare input */
+	input = pg_malloc(input_len);
+	memset(input, '-', input_len - 1);
+	input[input_len - 1] = 0xfe;
+
+	/* name to describe the test */
+	testname = createPQExpBuffer();
+	appendPQExpBuffer(testname, ">repeat(%c, %zu)", input[0], input_len - 1);
+	escapify(testname, input + input_len - 1, 1);
+	appendPQExpBuffer(testname, "< - GB18030 - PQescapeLiteral");
+
+	/* test itself */
+	PQsetClientEncoding(tc->conn, "GB18030");
+	report_result(tc, PQescapeLiteral(tc->conn, input, input_len) == NULL,
+				  testname->data, "",
+				  "input validity vs escape success", "ok");
+
+	destroyPQExpBuffer(testname);
+	pg_free(input);
+}
+
+/*
+ * Confirm json parsing doesn't read past the end of an allocation.  This
+ * exercises wchar.c infrastructure like the true "escape" tests do, but this
+ * isn't an "escape" test.
+ */
+static void
+test_gb18030_json(pe_test_config *tc)
+{
+	PQExpBuffer raw_buf;
+	PQExpBuffer testname;
+	const char	input[] = "{\"\\u\xFE";
+	size_t		input_len = sizeof(input) - 1;
+	JsonLexContext *lex;
+	JsonSemAction sem = {0};	/* no callbacks */
+	JsonParseErrorType json_error;
+
+	/* prepare input like test_one_vector_escape() does */
+	raw_buf = createPQExpBuffer();
+	appendBinaryPQExpBuffer(raw_buf, input, input_len);
+	appendPQExpBufferStr(raw_buf, NEVER_ACCESS_STR);
+	VALGRIND_MAKE_MEM_NOACCESS(&raw_buf->data[input_len],
+							   raw_buf->len - input_len);
+
+	/* name to describe the test */
+	testname = createPQExpBuffer();
+	appendPQExpBuffer(testname, ">");
+	escapify(testname, input, input_len);
+	appendPQExpBuffer(testname, "< - GB18030 - pg_parse_json");
+
+	/* test itself */
+	lex = makeJsonLexContextCstringLen(NULL, raw_buf->data, input_len,
+									   PG_GB18030, false);
+	json_error = pg_parse_json(lex, &sem);
+	report_result(tc, json_error == JSON_UNICODE_ESCAPE_FORMAT,
+				  testname->data, "",
+				  "diagnosed", json_errdetail(json_error, lex));
+
+	freeJsonLexContext(lex);
+	destroyPQExpBuffer(testname);
+	destroyPQExpBuffer(raw_buf);
+}
+
+
 static bool
 escape_literal(PGconn *conn, PQExpBuffer target,
 			   const char *unescaped, size_t unescaped_len,
@@ -451,8 +534,18 @@ static pe_test_vector pe_test_vectors[] =
 	 * Testcases that are not null terminated for the specified input length.
 	 * That's interesting to verify that escape functions don't read beyond
 	 * the intended input length.
+	 *
+	 * One interesting special case is GB18030, which has the odd behaviour
+	 * of needing to read beyond the first byte to determine the length of a
+	 * multi-byte character.
 	 */
 	TV_LEN("gbk", "\x80", 1),
+	TV_LEN("GB18030", "\x80", 1),
+	TV_LEN("GB18030", "\x80\0", 2),
+	TV_LEN("GB18030", "\x80\x30", 2),
+	TV_LEN("GB18030", "\x80\x30\0", 3),
+	TV_LEN("GB18030", "\x80\x30\x30", 3),
+	TV_LEN("GB18030", "\x80\x30\x30\0", 4),
 	TV_LEN("UTF-8", "\xC3\xb6  ", 1),
 	TV_LEN("UTF-8", "\xC3\xb6  ", 2),
 };
@@ -861,6 +954,9 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
+	test_gb18030_page_multiple(&tc);
+	test_gb18030_json(&tc);
+
 	for (int i = 0; i < lengthof(pe_test_vectors); i++)
 	{
 		test_one_vector(&tc, &pe_test_vectors[i]);
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index d785f92561e..7dd1ef6161f 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -508,10 +508,13 @@ insert into gb18030_inputs  values
   ('\x666f6f84309c38',	'valid, translates to UTF-8 by mapping function'),
   ('\x666f6f84309c',	'incomplete char '),
   ('\x666f6f84309c0a',	'incomplete char, followed by newline '),
+  ('\x666f6f84',		'incomplete char at end'),
   ('\x666f6f84309c3800', 'invalid, NUL byte'),
   ('\x666f6f84309c0038', 'invalid, NUL byte');
--- Test GB18030 verification
-select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs;
+-- Test GB18030 verification.  Round-trip through text so the backing of the
+-- bytea values is palloc, not shared_buffers.  This lets Valgrind detect
+-- reads past the end.
+select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs;
                   description                   |      inbytes       |      result      |   errorat    |                               error                               
 ------------------------------------------------+--------------------+------------------+--------------+-------------------------------------------------------------------
  valid, pure ASCII                              | \x666f6f           | \x666f6f         |              | 
@@ -520,9 +523,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from g
  valid, translates to UTF-8 by mapping function | \x666f6f84309c38   | \x666f6f84309c38 |              | 
  incomplete char                                | \x666f6f84309c     | \x666f6f         | \x84309c     | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
  incomplete char, followed by newline           | \x666f6f84309c0a   | \x666f6f         | \x84309c0a   | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
+ incomplete char at end                         | \x666f6f84         | \x666f6f         | \x84         | invalid byte sequence for encoding "GB18030": 0x84
  invalid, NUL byte                              | \x666f6f84309c3800 | \x666f6f84309c38 | \x00         | invalid byte sequence for encoding "GB18030": 0x00
  invalid, NUL byte                              | \x666f6f84309c0038 | \x666f6f         | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
-(8 rows)
+(9 rows)
 
 -- Test conversions from GB18030
 select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;
@@ -534,9 +538,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18
  valid, translates to UTF-8 by mapping function | \x666f6f84309c38   | \x666f6fefa8aa |              | 
  incomplete char                                | \x666f6f84309c     | \x666f6f       | \x84309c     | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
  incomplete char, followed by newline           | \x666f6f84309c0a   | \x666f6f       | \x84309c0a   | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
+ incomplete char at end                         | \x666f6f84         | \x666f6f       | \x84         | invalid byte sequence for encoding "GB18030": 0x84
  invalid, NUL byte                              | \x666f6f84309c3800 | \x666f6fefa8aa | \x00         | invalid byte sequence for encoding "GB18030": 0x00
  invalid, NUL byte                              | \x666f6f84309c0038 | \x666f6f       | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
-(8 rows)
+(9 rows)
 
 --
 -- ISO-8859-5
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index b567a1a5721..a80d62367a2 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -300,11 +300,14 @@ insert into gb18030_inputs  values
   ('\x666f6f84309c38',	'valid, translates to UTF-8 by mapping function'),
   ('\x666f6f84309c',	'incomplete char '),
   ('\x666f6f84309c0a',	'incomplete char, followed by newline '),
+  ('\x666f6f84',		'incomplete char at end'),
   ('\x666f6f84309c3800', 'invalid, NUL byte'),
   ('\x666f6f84309c0038', 'invalid, NUL byte');
 
--- Test GB18030 verification
-select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs;
+-- Test GB18030 verification.  Round-trip through text so the backing of the
+-- bytea values is palloc, not shared_buffers.  This lets Valgrind detect
+-- reads past the end.
+select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs;
 -- Test conversions from GB18030
 select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;
author	Noah Misch	2025-05-05 11:52:04 +0000
committer	Noah Misch	2025-05-05 11:52:04 +0000
commit	627acc3caa74caa736b2c5587e944d2ea510ea67 (patch)
tree	7aed8aec94c841de34fad669bd8b4376ff15c1a3 /src/test
parent	5be213caaa1a9a65dfdbbf400b6a53b5e743b8d1 (diff)