Skip to content

Commit c4fb049

Browse files
committed
For UTF-7, emit error marker if Base64 section ends abruptly after first half of surrogate pair
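This (rare) situation was already handled correctly when the unpaired first half of a surrogate pair was the 1st or 2nd of each group of 3 UTF-16 code units in a Base64-encoded section of a UTF-7 string. However, it was not handled correctly when it was the 3rd, 6th, 9th, etc. code unit of such a section, i.e. when the section ends exactly on an 8-character Base64 group boundary (8 Base64 characters encode exactly 3 UTF-16 code units).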
1 parent 5f2587e commit c4fb049

File tree

2 files changed

+14
-2
lines changed

2 files changed

+14
-2
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf7.c

+11 -2
Original file line number | Diff line number | Diff line change
@@ -537,8 +537,10 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
537537
if (p == e) {
538538
/* It is an error if trailing padding bits are not zeroes or if we were
539539
* expecting the 2nd part of a surrogate pair when Base64 section ends */
540-
if ((n3 & 0x3) || surrogate1)
540+
if ((n3 & 0x3) || surrogate1) {
541541
*out++ = MBFL_BAD_INPUT;
542+
surrogate1 = 0;
543+
}
542544
break;
543545
}
544546

@@ -562,8 +564,10 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
562564
}
563565
out = handle_utf16_cp((n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4), out, &surrogate1);
564566
if (p == e) {
565-
if ((n6 & 0xF) || surrogate1)
567+
if ((n6 & 0xF) || surrogate1) {
566568
*out++ = MBFL_BAD_INPUT;
569+
surrogate1 = 0;
570+
}
567571
break;
568572
}
569573

@@ -603,6 +607,11 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
603607
}
604608
}
605609

610+
if (p == e && surrogate1) {
611+
ZEND_ASSERT(out < limit);
612+
*out++ = MBFL_BAD_INPUT;
613+
}
614+
606615
*state = (surrogate1 << 1) | base64;
607616
*in_len = e - p;
608617
*in = p;

ext/mbstring/tests/utf_encodings.phpt

+3
Original file line number | Diff line number | Diff line change
@@ -1074,6 +1074,9 @@ testInvalidString('+' . rawEncode("\x00.\x00.\xD8\x01\xD9\x02") . '-', "\x00\x00
10741074
// First half of surrogate pair appearing at end of string
10751075
testInvalidString('+' . rawEncode("\xD8\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
10761076
testInvalidString('+' . rawEncode("\xD8\x01"), "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');
1077+
testInvalidString("+999999uJ", "\xEF\x9F\x9F\xE7\xB7\xB7%", 'UTF-7', 'UTF-8');
1078+
testInvalidString("+999euJ", "\xEF\x9F\x9F\xE5\xBA\xB8%", "UTF-7", "UTF-8");
1079+
testInvalidString("+euJ", "\xE7\xAB\xA2%", "UTF-7", "UTF-8");
10771080

10781081
// Truncated string
10791082
testInvalidString('+' . rawEncode("\x01") . '-', "\x00\x00\x00%", 'UTF-7', 'UTF-32BE');

0 commit comments

Comments
 (0)