Skip to content

Commit a618682

Browse files
committed
For UTF-7, flag unnecessary extra trailing byte in Base64 section as error
This bug was found when I was fuzzing a patch related to mb_strpos. In some cases, the legacy text conversion code for UTF-7 (and UTF7-IMAP) would correctly recognize an error for a Base64-encoded section which was not correctly padded with zero bits, but the new (and faster) text conversion code would not. Specifically, if the input string ended abruptly after the 4th or 7th byte of a Base64-encoded section, the new conversion code would confirm that the trailing padding bits from the previous byte (3rd or 6th) were zeroes, but would not check whether the 4th or 7th byte itself encoded any non-zero bits. The legacy conversion code did perform this check and would treat the input string as invalid. Actually, even if the 4th or 7th byte does encode only (padding) zero bits, this is still a problem, because there is no reason to have a 4th (or 7th) byte in that case. The UTF-7 string should have ended on the previous byte instead. Apply the same fix for both UTF-7 and UTF7-IMAP.
1 parent aef9660 commit a618682

File tree

4 files changed

+34
-4
lines changed

4 files changed

+34
-4
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf7.c

+8-2
Original file line numberDiff line numberDiff line change
@@ -530,9 +530,12 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
530530
}
531531

532532
unsigned char n4 = decode_base64(*p++);
533-
if (is_base64_end(n4) || p == e) {
533+
if (is_base64_end(n4)) {
534534
out = handle_base64_end(n4, &p, out, &base64, n3 & 0x3, &surrogate1);
535535
continue;
536+
} else if (p == e) {
537+
out = handle_base64_end(n4, &p, out, &base64, true, &surrogate1);
538+
continue;
536539
}
537540
unsigned char n5 = decode_base64(*p++);
538541
if (is_base64_end(n5) || p == e) {
@@ -552,9 +555,12 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
552555
}
553556

554557
unsigned char n7 = decode_base64(*p++);
555-
if (is_base64_end(n7) || p == e) {
558+
if (is_base64_end(n7)) {
556559
out = handle_base64_end(n7, &p, out, &base64, n6 & 0xF, &surrogate1);
557560
continue;
561+
} else if (p == e) {
562+
out = handle_base64_end(n7, &p, out, &base64, true, &surrogate1);
563+
continue;
558564
}
559565
unsigned char n8 = decode_base64(*p++);
560566
if (is_base64_end(n8)) {

ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c

+8-2
Original file line numberDiff line numberDiff line change
@@ -558,9 +558,12 @@ static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t
558558
}
559559

560560
unsigned char n4 = decode_base64(*p++);
561-
if (is_base64_end(n4) || p == e) {
561+
if (is_base64_end(n4)) {
562562
out = handle_base64_end(n4, out, &base64, n3 & 0x3, &surrogate1);
563563
continue;
564+
} else if (p == e) {
565+
out = handle_base64_end(n4, out, &base64, true, &surrogate1);
566+
continue;
564567
}
565568
unsigned char n5 = decode_base64(*p++);
566569
if (is_base64_end(n5) || p == e) {
@@ -580,9 +583,12 @@ static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t
580583
}
581584

582585
unsigned char n7 = decode_base64(*p++);
583-
if (is_base64_end(n7) || p == e) {
586+
if (is_base64_end(n7)) {
584587
out = handle_base64_end(n7, out, &base64, n6 & 0xF, &surrogate1);
585588
continue;
589+
} else if (p == e) {
590+
out = handle_base64_end(n7, out, &base64, true, &surrogate1);
591+
continue;
586592
}
587593
unsigned char n8 = decode_base64(*p++);
588594
if (is_base64_end(n8)) {

ext/mbstring/tests/utf7imap_encoding.phpt

+8
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,14 @@ convertInvalidString("\x80", "%", "UTF7-IMAP", "UTF-8");
221221
convertInvalidString("abc&", "abc%", "UTF7-IMAP", "UTF-8"); // The & starts a Base-64 coded section, which is OK... but there's no data in it
222222
convertInvalidString("&**-", "%*-", "UTF7-IMAP", "UTF-8"); // When we hit the first bad byte in a Base-64 coded section, it drops us back into the default mode, so the following characters are literal
223223

224+
// Try strings where Base64 has an extra trailing byte which is not needed
225+
convertInvalidString('&RR8I', "\xE4\x94\x9F%", 'UTF7-IMAP', 'UTF-8');
226+
convertInvalidString('&RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF7-IMAP', 'UTF-8');
227+
228+
// It is useless for a Base64 section to only contain a single 'A'
229+
// (which decodes to only zero bits)
230+
convertInvalidString("&A", "\x00\x00\x00%", 'UTF7-IMAP', 'UTF-32BE');
231+
224232
echo "Done!\n";
225233
?>
226234
--EXPECT--

ext/mbstring/tests/utf_encodings.phpt

+10
Original file line numberDiff line numberDiff line change
@@ -1063,6 +1063,16 @@ testInvalidString('+' . rawEncode("\xD8\x01"), "\x00\x00\x00%", 'UTF-7', 'UTF-32
10631063
testInvalidString('+' . rawEncode("\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
10641064
testInvalidString('+l', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
10651065

1066+
// Base64 section should not have 4 ASCII characters; the first 3 can encode one
1067+
// UTF-16 character, so there is no need for the 4th
1068+
testInvalidString('+RR8I', "\xE4\x94\x9F%", 'UTF-7', 'UTF-8');
1069+
// Likewise with 7 characters
1070+
testInvalidString('+RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF-7', 'UTF-8');
1071+
1072+
// Similarly, it is useless for a Base64 section to only contain a single 'A'
1073+
// (which decodes to only zero bits)
1074+
testInvalidString("+A", "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1075+
10661076
// And then, messed up Base64 encoding
10671077

10681078
// Bad padding on + section (not zeroes)

0 commit comments

Comments
 (0)