Skip to content

Commit d5d9900

Browse files
committed
When fuzzing mbstring encoding conversion code, compare output with different intermediate buffer sizes
Currently, php-fuzz-mbstring only confirms that no crashes (including ASAN violations) occur when converting text from one encoding to another. Try performing each conversion operation with two different sizes for the intermediate buffer which is used to pass data from the decoder to the encoder. If the encoding conversion code is correct, the size of that intermediate buffer shouldn't matter; we should always get exactly the same results. This is a much stricter test, which is more likely to catch bugs.
1 parent 50a2de7 commit d5d9900

File tree

1 file changed

+55
-3
lines changed

1 file changed

+55
-3
lines changed

sapi/fuzzer/fuzzer-mbstring.c

+55-3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,34 @@
2020
#include "fuzzer-sapi.h"
2121
#include "ext/mbstring/mbstring.h"
2222

23+
zend_string* convert_encoding(const uint8_t *Data, size_t Size, const mbfl_encoding *FromEncoding, const mbfl_encoding *ToEncoding, size_t BufSize, unsigned int *NumErrors)
24+
{
25+
uint32_t *wchar_buf = ecalloc(BufSize, sizeof(uint32_t));
26+
unsigned int state = 0;
27+
28+
mb_convert_buf buf;
29+
mb_convert_buf_init(&buf, Size, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
30+
31+
while (Size) {
32+
size_t out_len = FromEncoding->to_wchar((unsigned char**)&Data, &Size, wchar_buf, BufSize, &state);
33+
ZEND_ASSERT(out_len <= BufSize);
34+
ToEncoding->from_wchar(wchar_buf, out_len, &buf, !Size);
35+
}
36+
37+
*NumErrors = buf.errors;
38+
zend_string *result = mb_convert_buf_result(&buf, ToEncoding);
39+
efree(wchar_buf);
40+
return result;
41+
}
42+
43+
void assert_zend_string_eql(zend_string *str1, zend_string *str2)
44+
{
45+
ZEND_ASSERT(ZSTR_LEN(str1) == ZSTR_LEN(str2));
46+
for (int i = 0; i < ZSTR_LEN(str1); i++) {
47+
ZEND_ASSERT(ZSTR_VAL(str1)[i] == ZSTR_VAL(str2)[i]);
48+
}
49+
}
50+
2351
int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
2452
const uint8_t *Comma1 = memchr(Data, ',', Size);
2553
if (!Comma1) {
@@ -45,14 +73,38 @@ int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
4573
const mbfl_encoding *ToEncoding = mbfl_name2encoding(ToEncodingName);
4674
const mbfl_encoding *FromEncoding = mbfl_name2encoding(FromEncodingName);
4775

48-
if (!ToEncoding || !FromEncoding || fuzzer_request_startup() == FAILURE) {
76+
if (!ToEncoding || !FromEncoding || Size < 2 || fuzzer_request_startup() == FAILURE) {
4977
efree(ToEncodingName);
5078
efree(FromEncodingName);
5179
return 0;
5280
}
5381

54-
zend_string *Result = php_mb_convert_encoding_ex((char *) Data, Size, ToEncoding, FromEncoding);
55-
zend_string_release(Result);
82+
/* Rather than converting an entire (possibly very long) string at once, mbstring converts
83+
* strings 'chunk by chunk'; the decoder will run until it fills up its output buffer with
84+
* wchars, then the encoder will process those wchars, then the decoder runs again until it
85+
* again fills up its output buffer, and so on
86+
*
87+
* The most error-prone part of the decoder/encoder code is where we exit a decoder/encoder
88+
* function and save its state to allow later resumption
89+
* To stress-test that aspect of the decoders/encoders, try performing an encoding conversion
90+
* operation with different, random buffer sizes
91+
* If the code is correct, the result should always be the same either way */
92+
size_t bufsize1 = *Data++;
93+
size_t bufsize2 = *Data++;
94+
bufsize1 = MAX(bufsize1, MBSTRING_MIN_WCHAR_BUFSIZE);
95+
bufsize2 = MAX(bufsize2, MBSTRING_MIN_WCHAR_BUFSIZE);
96+
Size -= 2;
97+
98+
unsigned int errors1 = 0, errors2 = 0;
99+
100+
zend_string *Result1 = convert_encoding(Data, Size, FromEncoding, ToEncoding, bufsize1, &errors1);
101+
zend_string *Result2 = convert_encoding(Data, Size, FromEncoding, ToEncoding, bufsize2, &errors2);
102+
103+
assert_zend_string_eql(Result1, Result2);
104+
ZEND_ASSERT(errors1 == errors2);
105+
106+
zend_string_release(Result1);
107+
zend_string_release(Result2);
56108
efree(ToEncodingName);
57109
efree(FromEncodingName);
58110

0 commit comments

Comments
 (0)