For UTF-7, flag unnecessary extra trailing byte in Base64 section as error

alexdowad · alexdowad · commit a618682373fa · 2022-11-21T14:49:01.000+02:00
This bug was found when I was fuzzing a patch related to mb_strpos.
In some cases, the legacy text conversion code for UTF-7 (and
UTF7-IMAP) would correctly recognize an error for a Base64-encoded
section which was not correctly padded with zero bits, but the new
(and faster) text conversion code would not.

Specifically, if the input string ended abruptly after the 4th or 7th
byte of a Base64-encoded section, the new conversion code would
confirm that the trailing padding bits from the previous byte (3rd or
6th) were zeroes, but would not check whether the 4th or 7th byte
itself encoded any non-zero bits. The legacy conversion code did
perform this check and would treat the input string as invalid.

Actually, even if the 4th or 7th byte does encode only (padding) zero
bits, this is still a problem, because there is no reason to have a
4th (or 7th) byte in that case. The UTF-7 string should have ended
on the previous byte instead.

Apply the same fix for both UTF-7 and UTF7-IMAP.
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c
@@ -530,9 +530,12 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 			}
 
 			unsigned char n4 = decode_base64(*p++);
-			if (is_base64_end(n4) || p == e) {
+			if (is_base64_end(n4)) {
 				out = handle_base64_end(n4, &p, out, &base64, n3 & 0x3, &surrogate1);
 				continue;
+			} else if (p == e) {
+				out = handle_base64_end(n4, &p, out, &base64, true, &surrogate1);
+				continue;
 			}
 			unsigned char n5 = decode_base64(*p++);
 			if (is_base64_end(n5) || p == e) {
@@ -552,9 +555,12 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 			}
 
 			unsigned char n7 = decode_base64(*p++);
-			if (is_base64_end(n7) || p == e) {
+			if (is_base64_end(n7)) {
 				out = handle_base64_end(n7, &p, out, &base64, n6 & 0xF, &surrogate1);
 				continue;
+			} else if (p == e) {
+				out = handle_base64_end(n7, &p, out, &base64, true, &surrogate1);
+				continue;
 			}
 			unsigned char n8 = decode_base64(*p++);
 			if (is_base64_end(n8)) {
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c
@@ -558,9 +558,12 @@ static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t
 			}
 
 			unsigned char n4 = decode_base64(*p++);
-			if (is_base64_end(n4) || p == e) {
+			if (is_base64_end(n4)) {
 				out = handle_base64_end(n4, out, &base64, n3 & 0x3, &surrogate1);
 				continue;
+			} else if (p == e) {
+				out = handle_base64_end(n4, out, &base64, true, &surrogate1);
+				continue;
 			}
 			unsigned char n5 = decode_base64(*p++);
 			if (is_base64_end(n5) || p == e) {
@@ -580,9 +583,12 @@ static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t
 			}
 
 			unsigned char n7 = decode_base64(*p++);
-			if (is_base64_end(n7) || p == e) {
+			if (is_base64_end(n7)) {
 				out = handle_base64_end(n7, out, &base64, n6 & 0xF, &surrogate1);
 				continue;
+			} else if (p == e) {
+				out = handle_base64_end(n7, out, &base64, true, &surrogate1);
+				continue;
 			}
 			unsigned char n8 = decode_base64(*p++);
 			if (is_base64_end(n8)) {
diff --git a/ext/mbstring/tests/utf7imap_encoding.phpt b/ext/mbstring/tests/utf7imap_encoding.phpt
@@ -221,6 +221,14 @@ convertInvalidString("\x80", "%", "UTF7-IMAP", "UTF-8");
 convertInvalidString("abc&", "abc%", "UTF7-IMAP", "UTF-8"); // The & starts a Base-64 coded section, which is OK... but there's no data in it
 convertInvalidString("&**-", "%*-", "UTF7-IMAP", "UTF-8"); // When we hit the first bad byte in a Base-64 coded section, it drops us back into the default mode, so the following characters are literal
 
+// Try strings where Base64 has an extra trailing byte which is not needed
+convertInvalidString('&RR8I', "\xE4\x94\x9F%", 'UTF7-IMAP', 'UTF-8');
+convertInvalidString('&RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF7-IMAP', 'UTF-8');
+
+// It is useless for a Base64 section to only contain a single 'A'
+// (which decodes to only zero bits)
+convertInvalidString("&A", "\x00\x00\x00%", 'UTF7-IMAP', 'UTF-32BE');
+
 echo "Done!\n";
 ?>
 --EXPECT--
diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt
@@ -1063,6 +1063,16 @@ testInvalidString('+' . rawEncode("\xD8\x01"), "\x00\x00\x00%", 'UTF-7', 'UTF-32
 testInvalidString('+' . rawEncode("\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
 testInvalidString('+l', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
 
+// Base64 section should not have 4 ASCII characters; the first 3 can encode one
+// UTF-16 character, so there is no need for the 4th
+testInvalidString('+RR8I', "\xE4\x94\x9F%", 'UTF-7', 'UTF-8');
+// Likewise with 7 characters
+testInvalidString('+RR8IAAA', "\xE4\x94\x9F\xE0\xA0\x80%", 'UTF-7', 'UTF-8');
+
+// Similarly, it is useless for a Base64 section to only contain a single 'A'
+// (which decodes to only zero bits)
+testInvalidString("+A", "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
+
 // And then, messed up Base64 encoding
 
 // Bad padding on + section (not zeroes)