Skip to content

Commit 627acc3

Browse files
committed
With GB18030, prevent SIGSEGV from reading past end of allocation.
With GB18030 as source encoding, applications could crash the server via SQL functions convert() or convert_from(). Applications themselves could crash after passing unterminated GB18030 input to libpq functions PQescapeLiteral(), PQescapeIdentifier(), PQescapeStringConn(), or PQescapeString(). Extension code could crash by passing unterminated GB18030 input to jsonapi.h functions. All those functions have been intended to handle untrusted, unterminated input safely.

A crash required allocating the input such that the last byte of the allocation was the last byte of a virtual memory page. Some malloc() implementations take measures against that, making the SIGSEGV hard to reach. Back-patch to v13 (all supported versions).

Author: Noah Misch <[email protected]>
Author: Andres Freund <[email protected]>
Reviewed-by: Masahiko Sawada <[email protected]>
Backpatch-through: 13
Security: CVE-2025-4207
1 parent 5be213c commit 627acc3

File tree

9 files changed

+185
-30
lines changed

9 files changed

+185
-30
lines changed

src/backend/utils/mb/mbutils.c

+13-5
Original file line numberDiff line numberDiff line change
@@ -1087,7 +1087,7 @@ pg_mbcliplen(const char *mbstr, int len, int limit)
10871087
}
10881088

10891089
/*
1090-
* pg_mbcliplen with specified encoding
1090+
* pg_mbcliplen with specified encoding; string must be valid in encoding
10911091
*/
10921092
int
10931093
pg_encoding_mbcliplen(int encoding, const char *mbstr,
@@ -1692,12 +1692,12 @@ check_encoding_conversion_args(int src_encoding,
16921692
* report_invalid_encoding: complain about invalid multibyte character
16931693
*
16941694
* note: len is remaining length of string, not length of character;
1695-
* len must be greater than zero, as we always examine the first byte.
1695+
* len must be greater than zero (or we'd neglect initializing "buf").
16961696
*/
16971697
void
16981698
report_invalid_encoding(int encoding, const char *mbstr, int len)
16991699
{
1700-
int l = pg_encoding_mblen(encoding, mbstr);
1700+
int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
17011701
char buf[8 * 5 + 1];
17021702
char *p = buf;
17031703
int j,
@@ -1724,18 +1724,26 @@ report_invalid_encoding(int encoding, const char *mbstr, int len)
17241724
* report_untranslatable_char: complain about untranslatable character
17251725
*
17261726
* note: len is remaining length of string, not length of character;
1727-
* len must be greater than zero, as we always examine the first byte.
1727+
* len must be greater than zero (or we'd neglect initializing "buf").
17281728
*/
17291729
void
17301730
report_untranslatable_char(int src_encoding, int dest_encoding,
17311731
const char *mbstr, int len)
17321732
{
1733-
int l = pg_encoding_mblen(src_encoding, mbstr);
1733+
int l;
17341734
char buf[8 * 5 + 1];
17351735
char *p = buf;
17361736
int j,
17371737
jlimit;
17381738

1739+
/*
1740+
* We probably could use plain pg_encoding_mblen(), because
1741+
* gb18030_to_utf8() verifies before it converts. All conversions should.
1742+
* For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs. Even
1743+
* so, be defensive, since a buggy conversion might pass invalid data.
1744+
* This is not a performance-critical path.
1745+
*/
1746+
l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
17391747
jlimit = Min(l, len);
17401748
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
17411749

src/common/jsonapi.c

+5-2
Original file line numberDiff line numberDiff line change
@@ -1982,8 +1982,11 @@ json_lex_string(JsonLexContext *lex)
19821982
} while (0)
19831983
#define FAIL_AT_CHAR_END(code) \
19841984
do { \
1985-
const char *term = s + pg_encoding_mblen(lex->input_encoding, s); \
1986-
lex->token_terminator = (term <= end) ? term : end; \
1985+
ptrdiff_t remaining = end - s; \
1986+
int charlen; \
1987+
charlen = pg_encoding_mblen_or_incomplete(lex->input_encoding, \
1988+
s, remaining); \
1989+
lex->token_terminator = (charlen <= remaining) ? s + charlen : end; \
19871990
return code; \
19881991
} while (0)
19891992

src/common/wchar.c

+45-6
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
*/
1313
#include "c.h"
1414

15+
#include <limits.h>
16+
1517
#include "mb/pg_wchar.h"
1618
#include "utils/ascii.h"
1719

@@ -2107,10 +2109,27 @@ const pg_wchar_tbl pg_wchar_table[] = {
21072109
/*
21082110
* Returns the byte length of a multibyte character.
21092111
*
2110-
* Caution: when dealing with text that is not certainly valid in the
2111-
* specified encoding, the result may exceed the actual remaining
2112-
* string length. Callers that are not prepared to deal with that
2113-
* should use pg_encoding_mblen_bounded() instead.
2112+
* Choose "mblen" functions based on the input string characteristics.
2113+
* pg_encoding_mblen() can be used when ANY of these conditions are met:
2114+
*
2115+
* - The input string is zero-terminated
2116+
*
2117+
* - The input string is known to be valid in the encoding (e.g., string
2118+
* converted from database encoding)
2119+
*
2120+
* - The encoding is not GB18030 (e.g., when only database encodings are
2121+
* passed to 'encoding' parameter)
2122+
*
2123+
* encoding==GB18030 requires examining up to two bytes to determine character
2124+
* length. Therefore, callers satisfying none of those conditions must use
2125+
* pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
2126+
* guaranteed to be within allocation bounds.
2127+
*
2128+
* When dealing with text that is not certainly valid in the specified
2129+
* encoding, the result may exceed the actual remaining string length.
2130+
* Callers that are not prepared to deal with that should use Min(remaining,
2131+
* pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
2132+
* pg_encoding_mblen_bounded() are interchangeable.
21142133
*/
21152134
int
21162135
pg_encoding_mblen(int encoding, const char *mbstr)
@@ -2121,8 +2140,28 @@ pg_encoding_mblen(int encoding, const char *mbstr)
21212140
}
21222141

21232142
/*
2124-
* Returns the byte length of a multibyte character; but not more than
2125-
* the distance to end of string.
2143+
* Returns the byte length of a multibyte character (possibly not
2144+
* zero-terminated), or INT_MAX if too few bytes remain to determine a length.
2145+
*/
2146+
int
2147+
pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
2148+
size_t remaining)
2149+
{
2150+
/*
2151+
* Define zero remaining as too few, even for single-byte encodings.
2152+
* pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
2153+
* zero; others read one.
2154+
*/
2155+
if (remaining < 1 ||
2156+
(encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
2157+
return INT_MAX;
2158+
return pg_encoding_mblen(encoding, mbstr);
2159+
}
2160+
2161+
/*
2162+
* Returns the byte length of a multibyte character; but not more than the
2163+
* distance to the terminating zero byte. For input that might lack a
2164+
* terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
21262165
*/
21272166
int
21282167
pg_encoding_mblen_bounded(int encoding, const char *mbstr)

src/include/mb/pg_wchar.h

+2
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,8 @@ extern int pg_valid_server_encoding_id(int encoding);
664664
*/
665665
extern void pg_encoding_set_invalid(int encoding, char *dst);
666666
extern int pg_encoding_mblen(int encoding, const char *mbstr);
667+
extern int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
668+
size_t remaining);
667669
extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr);
668670
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
669671
extern int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len);

src/interfaces/libpq/fe-exec.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -4101,7 +4101,8 @@ PQescapeStringInternal(PGconn *conn,
41014101
}
41024102

41034103
/* Slow path for possible multibyte characters */
4104-
charlen = pg_encoding_mblen(encoding, source);
4104+
charlen = pg_encoding_mblen_or_incomplete(encoding,
4105+
source, remaining);
41054106

41064107
if (remaining < charlen ||
41074108
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
@@ -4245,7 +4246,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
42454246
int charlen;
42464247

42474248
/* Slow path for possible multibyte characters */
4248-
charlen = pg_encoding_mblen(conn->client_encoding, s);
4249+
charlen = pg_encoding_mblen_or_incomplete(conn->client_encoding,
4250+
s, remaining);
42494251

42504252
if (charlen > remaining)
42514253
{

src/interfaces/libpq/fe-misc.c

+6-9
Original file line numberDiff line numberDiff line change
@@ -1221,13 +1221,9 @@ PQgetCurrentTimeUSec(void)
12211221
*/
12221222

12231223
/*
1224-
* Returns the byte length of the character beginning at s, using the
1225-
* specified encoding.
1226-
*
1227-
* Caution: when dealing with text that is not certainly valid in the
1228-
* specified encoding, the result may exceed the actual remaining
1229-
* string length. Callers that are not prepared to deal with that
1230-
* should use PQmblenBounded() instead.
1224+
* Like pg_encoding_mblen(). Use this in callers that want the
1225+
* dynamically-linked libpq's stance on encodings, even if that means
1226+
* different behavior in different startups of the executable.
12311227
*/
12321228
int
12331229
PQmblen(const char *s, int encoding)
@@ -1236,8 +1232,9 @@ PQmblen(const char *s, int encoding)
12361232
}
12371233

12381234
/*
1239-
* Returns the byte length of the character beginning at s, using the
1240-
* specified encoding; but not more than the distance to end of string.
1235+
* Like pg_encoding_mblen_bounded(). Use this in callers that want the
1236+
* dynamically-linked libpq's stance on encodings, even if that means
1237+
* different behavior in different startups of the executable.
12411238
*/
12421239
int
12431240
PQmblenBounded(const char *s, int encoding)

src/test/modules/test_escape/test_escape.c

+96
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <string.h>
1313
#include <stdio.h>
1414

15+
#include "common/jsonapi.h"
1516
#include "fe_utils/psqlscan.h"
1617
#include "fe_utils/string_utils.h"
1718
#include "getopt_long.h"
@@ -164,6 +165,88 @@ encoding_conflicts_ascii(int encoding)
164165
}
165166

166167

168+
/*
169+
* Confirm escaping doesn't read past the end of an allocation. Consider the
170+
* result of malloc(4096), in the absence of freelist entries satisfying the
171+
* allocation. On OpenBSD, reading one byte past the end of that object
172+
* yields SIGSEGV.
173+
*
174+
* Run this test before the program's other tests, so freelists are minimal.
175+
* len=4096 didn't SIGSEGV, likely due to free() calls in libpq. len=8192
176+
* did. Use 128 KiB, to somewhat insulate the outcome from distant new free()
177+
* calls and libc changes.
178+
*/
179+
static void
180+
test_gb18030_page_multiple(pe_test_config *tc)
181+
{
182+
PQExpBuffer testname;
183+
size_t input_len = 0x20000;
184+
char *input;
185+
186+
/* prepare input */
187+
input = pg_malloc(input_len);
188+
memset(input, '-', input_len - 1);
189+
input[input_len - 1] = 0xfe;
190+
191+
/* name to describe the test */
192+
testname = createPQExpBuffer();
193+
appendPQExpBuffer(testname, ">repeat(%c, %zu)", input[0], input_len - 1);
194+
escapify(testname, input + input_len - 1, 1);
195+
appendPQExpBuffer(testname, "< - GB18030 - PQescapeLiteral");
196+
197+
/* test itself */
198+
PQsetClientEncoding(tc->conn, "GB18030");
199+
report_result(tc, PQescapeLiteral(tc->conn, input, input_len) == NULL,
200+
testname->data, "",
201+
"input validity vs escape success", "ok");
202+
203+
destroyPQExpBuffer(testname);
204+
pg_free(input);
205+
}
206+
207+
/*
208+
* Confirm json parsing doesn't read past the end of an allocation. This
209+
* exercises wchar.c infrastructure like the true "escape" tests do, but this
210+
* isn't an "escape" test.
211+
*/
212+
static void
213+
test_gb18030_json(pe_test_config *tc)
214+
{
215+
PQExpBuffer raw_buf;
216+
PQExpBuffer testname;
217+
const char input[] = "{\"\\u\xFE";
218+
size_t input_len = sizeof(input) - 1;
219+
JsonLexContext *lex;
220+
JsonSemAction sem = {0}; /* no callbacks */
221+
JsonParseErrorType json_error;
222+
223+
/* prepare input like test_one_vector_escape() does */
224+
raw_buf = createPQExpBuffer();
225+
appendBinaryPQExpBuffer(raw_buf, input, input_len);
226+
appendPQExpBufferStr(raw_buf, NEVER_ACCESS_STR);
227+
VALGRIND_MAKE_MEM_NOACCESS(&raw_buf->data[input_len],
228+
raw_buf->len - input_len);
229+
230+
/* name to describe the test */
231+
testname = createPQExpBuffer();
232+
appendPQExpBuffer(testname, ">");
233+
escapify(testname, input, input_len);
234+
appendPQExpBuffer(testname, "< - GB18030 - pg_parse_json");
235+
236+
/* test itself */
237+
lex = makeJsonLexContextCstringLen(NULL, raw_buf->data, input_len,
238+
PG_GB18030, false);
239+
json_error = pg_parse_json(lex, &sem);
240+
report_result(tc, json_error == JSON_UNICODE_ESCAPE_FORMAT,
241+
testname->data, "",
242+
"diagnosed", json_errdetail(json_error, lex));
243+
244+
freeJsonLexContext(lex);
245+
destroyPQExpBuffer(testname);
246+
destroyPQExpBuffer(raw_buf);
247+
}
248+
249+
167250
static bool
168251
escape_literal(PGconn *conn, PQExpBuffer target,
169252
const char *unescaped, size_t unescaped_len,
@@ -451,8 +534,18 @@ static pe_test_vector pe_test_vectors[] =
451534
* Testcases that are not null terminated for the specified input length.
452535
* That's interesting to verify that escape functions don't read beyond
453536
* the intended input length.
537+
*
538+
* One interesting special case is GB18030, which has the odd behaviour of
539+
* needing to read beyond the first byte to determine the length of a
540+
* multi-byte character.
454541
*/
455542
TV_LEN("gbk", "\x80", 1),
543+
TV_LEN("GB18030", "\x80", 1),
544+
TV_LEN("GB18030", "\x80\0", 2),
545+
TV_LEN("GB18030", "\x80\x30", 2),
546+
TV_LEN("GB18030", "\x80\x30\0", 3),
547+
TV_LEN("GB18030", "\x80\x30\x30", 3),
548+
TV_LEN("GB18030", "\x80\x30\x30\0", 4),
456549
TV_LEN("UTF-8", "\xC3\xb6 ", 1),
457550
TV_LEN("UTF-8", "\xC3\xb6 ", 2),
458551
};
@@ -861,6 +954,9 @@ main(int argc, char *argv[])
861954
exit(1);
862955
}
863956

957+
test_gb18030_page_multiple(&tc);
958+
test_gb18030_json(&tc);
959+
864960
for (int i = 0; i < lengthof(pe_test_vectors); i++)
865961
{
866962
test_one_vector(&tc, &pe_test_vectors[i]);

src/test/regress/expected/conversion.out

+9-4
Original file line numberDiff line numberDiff line change
@@ -508,10 +508,13 @@ insert into gb18030_inputs values
508508
('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'),
509509
('\x666f6f84309c', 'incomplete char '),
510510
('\x666f6f84309c0a', 'incomplete char, followed by newline '),
511+
('\x666f6f84', 'incomplete char at end'),
511512
('\x666f6f84309c3800', 'invalid, NUL byte'),
512513
('\x666f6f84309c0038', 'invalid, NUL byte');
513-
-- Test GB18030 verification
514-
select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs;
514+
-- Test GB18030 verification. Round-trip through text so the backing of the
515+
-- bytea values is palloc, not shared_buffers. This lets Valgrind detect
516+
-- reads past the end.
517+
select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs;
515518
description | inbytes | result | errorat | error
516519
------------------------------------------------+--------------------+------------------+--------------+-------------------------------------------------------------------
517520
valid, pure ASCII | \x666f6f | \x666f6f | |
@@ -520,9 +523,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from g
520523
valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6f84309c38 | |
521524
incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
522525
incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
526+
incomplete char at end | \x666f6f84 | \x666f6f | \x84 | invalid byte sequence for encoding "GB18030": 0x84
523527
invalid, NUL byte | \x666f6f84309c3800 | \x666f6f84309c38 | \x00 | invalid byte sequence for encoding "GB18030": 0x00
524528
invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
525-
(8 rows)
529+
(9 rows)
526530

527531
-- Test conversions from GB18030
528532
select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;
@@ -534,9 +538,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18
534538
valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6fefa8aa | |
535539
incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
536540
incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
541+
incomplete char at end | \x666f6f84 | \x666f6f | \x84 | invalid byte sequence for encoding "GB18030": 0x84
537542
invalid, NUL byte | \x666f6f84309c3800 | \x666f6fefa8aa | \x00 | invalid byte sequence for encoding "GB18030": 0x00
538543
invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
539-
(8 rows)
544+
(9 rows)
540545

541546
--
542547
-- ISO-8859-5

src/test/regress/sql/conversion.sql

+5-2
Original file line numberDiff line numberDiff line change
@@ -300,11 +300,14 @@ insert into gb18030_inputs values
300300
('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'),
301301
('\x666f6f84309c', 'incomplete char '),
302302
('\x666f6f84309c0a', 'incomplete char, followed by newline '),
303+
('\x666f6f84', 'incomplete char at end'),
303304
('\x666f6f84309c3800', 'invalid, NUL byte'),
304305
('\x666f6f84309c0038', 'invalid, NUL byte');
305306

306-
-- Test GB18030 verification
307-
select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs;
307+
-- Test GB18030 verification. Round-trip through text so the backing of the
308+
-- bytea values is palloc, not shared_buffers. This lets Valgrind detect
309+
-- reads past the end.
310+
select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs;
308311
-- Test conversions from GB18030
309312
select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;
310313

0 commit comments

Comments
 (0)