Skip to content

Commit 76a92c2

Browse files
committedJul 18, 2022
mb_decode_numericentity decodes valid entities which are truncated at end of string
Since mb_decode_numericentity does not require all HTML entities to end with ';', but allows them to be terminated by ANY non-digit character, it doesn't make sense that valid entities which butt up against the end of the input string are not converted. As it turned out, supporting this case also made it possible to simplify the code nicely.
1 parent 5d6bd55 commit 76a92c2

File tree

3 files changed

+85
-76
lines changed

3 files changed

+85
-76
lines changed
 

‎ext/mbstring/mbstring.c

+32-57
Original file line numberDiff line numberDiff line change
@@ -3319,6 +3319,11 @@ static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, in
33193319
return false;
33203320
}
33213321

3322+
#define DEC_ENTITY_MINLEN 3 /* For "&#" and 1 decimal digit */
3323+
#define HEX_ENTITY_MINLEN 4 /* For "&#x" and 1 hexadecimal digit */
3324+
#define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
3325+
#define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
3326+
33223327
static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, int mapsize)
33233328
{
33243329
uint32_t wchar_buf[128], converted_buf[128];
@@ -3389,28 +3394,20 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
33893394
uint32_t w = *++p2;
33903395
while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
33913396
w = *++p2;
3392-
if (p2 == wchar_buf + out_len) {
3393-
/* We hit the end of the buffer while still reading digits */
3394-
if ((p2 - p) <= 11 && in_len) {
3395-
/* The number of digits was legal (1-8 hex digits)
3396-
* We need to process this identity again on the next iteration of the main loop */
3397-
memmove(wchar_buf, p, (p2 - p) * 4);
3398-
wchar_buf_offset = p2 - p;
3399-
goto process_converted_wchars;
3400-
} else {
3401-
/* Invalid entity */
3402-
memcpy(converted, p, (p2 - p) * 4);
3403-
converted += p2 - p;
3404-
goto process_converted_wchars_no_offset;
3405-
}
3406-
} else if ((p2 - p) == 3 || (p2 - p) > 11) {
3407-
/* Invalid entity */
3397+
if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
3398+
/* We hit the end of the buffer while reading digits, and
3399+
* more wchars are still coming in the next buffer
3400+
* Reprocess this identity on next iteration */
3401+
memmove(wchar_buf, p, (p2 - p) * 4);
3402+
wchar_buf_offset = p2 - p;
3403+
goto process_converted_wchars;
3404+
} else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
3405+
/* Invalid entity (too long or "&#x" only) */
34083406
memcpy(converted, p, (p2 - p) * 4);
34093407
converted += p2 - p;
34103408
} else {
34113409
/* Valid hexadecimal entity */
3412-
uint32_t value = 0;
3413-
uint32_t *p3 = p + 3;
3410+
uint32_t value = 0, *p3 = p + 3;
34143411
while (p3 < p2) {
34153412
w = *p3++;
34163413
if (w <= '9') {
@@ -3421,48 +3418,33 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34213418
value = (value * 16) + 10 + (w - 'A');
34223419
}
34233420
}
3424-
uint32_t original;
3425-
if (html_numeric_entity_deconvert(value, convmap, mapsize, &original)) {
3426-
*converted++ = original;
3421+
if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3422+
converted++;
34273423
if (*p2 == ';')
34283424
p2++;
34293425
} else {
34303426
memcpy(converted, p, (p2 - p) * 4);
34313427
converted += p2 - p;
34323428
}
34333429
}
3434-
} else if (p2 == wchar_buf + out_len && in_len) {
3435-
wchar_buf[0] = '&';
3436-
wchar_buf[1] = '#';
3437-
wchar_buf_offset = 2;
3438-
goto process_converted_wchars;
34393430
} else {
34403431
/* Possible decimal entity */
34413432
uint32_t w = *p2;
34423433
while (w >= '0' && w <= '9')
34433434
w = *++p2;
3444-
if (p2 == wchar_buf + out_len) {
3445-
if ((p2 - p) <= 12 && in_len) {
3446-
/* The number of digits was legal (1-10 decimal digits)
3447-
* We need to process this identity again on next iteration of main loop */
3448-
memmove(wchar_buf, p, (p2 - p) * 4);
3449-
wchar_buf_offset = p2 - p;
3450-
goto process_converted_wchars;
3451-
} else {
3452-
/* Invalid entity */
3453-
memcpy(converted, p, (p2 - p) * 4);
3454-
converted += p2 - p;
3455-
wchar_buf_offset = 0;
3456-
goto process_converted_wchars_no_offset;
3457-
}
3458-
} else if ((p2 - p) == 2 || (p2 - p) > 12) {
3459-
/* Invalid entity */
3435+
if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
3436+
/* The number of digits was legal (no more than 10 decimal digits)
3437+
* Reprocess this identity on next iteration of main loop */
3438+
memmove(wchar_buf, p, (p2 - p) * 4);
3439+
wchar_buf_offset = p2 - p;
3440+
goto process_converted_wchars;
3441+
} else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
3442+
/* Invalid entity (too long or "&#" only) */
34603443
memcpy(converted, p, (p2 - p) * 4);
34613444
converted += p2 - p;
34623445
} else {
34633446
/* Valid decimal entity */
3464-
uint32_t value = 0;
3465-
uint32_t *p3 = p + 2;
3447+
uint32_t value = 0, *p3 = p + 2;
34663448
while (p3 < p2) {
34673449
/* If unsigned integer overflow would occur in the below
34683450
* multiplication by 10, this entity is no good
@@ -3474,9 +3456,8 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34743456
}
34753457
value = (value * 10) + (*p3++ - '0');
34763458
}
3477-
uint32_t original;
3478-
if (html_numeric_entity_deconvert(value, convmap, mapsize, &original)) {
3479-
*converted++ = original;
3459+
if (html_numeric_entity_deconvert(value, convmap, mapsize, converted)) {
3460+
converted++;
34803461
if (*p2 == ';')
34813462
p2++;
34823463
} else {
@@ -3485,16 +3466,11 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34853466
}
34863467
}
34873468
}
3488-
} else if (p2 == wchar_buf + out_len) {
3469+
} else if ((p2 == wchar_buf + out_len) && in_len) {
34893470
/* Corner case: & at end of buffer */
3490-
if (in_len) {
3491-
wchar_buf[0] = '&';
3492-
wchar_buf_offset = 1;
3493-
goto process_converted_wchars;
3494-
} else {
3495-
*converted++ = '&';
3496-
goto process_converted_wchars_no_offset;
3497-
}
3471+
wchar_buf[0] = '&';
3472+
wchar_buf_offset = 1;
3473+
goto process_converted_wchars;
34983474
} else {
34993475
*converted++ = '&';
35003476
}
@@ -3515,7 +3491,6 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
35153491
if (p < wchar_buf + out_len)
35163492
goto found_ampersand;
35173493

3518-
process_converted_wchars_no_offset:
35193494
/* We do not have any wchars remaining at the end of this buffer which
35203495
* we need to reprocess on the next call */
35213496
wchar_buf_offset = 0;

‎ext/mbstring/tests/bug40685.phpt

+2-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ string(1) "&"
1919
string(3) "&&&"
2020
string(2) "&#"
2121
string(3) "&#x"
22-
string(4) "&#61"
23-
string(5) "&#x3d"
22+
string(1) "="
23+
string(1) "="
2424
string(1) "="
2525
string(1) "="

‎ext/mbstring/tests/mb_decode_numericentity.phpt

+51-17
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,19 @@ echo "2: " . mb_decode_numericentity($str2, $convmap, "UTF-8") . "\n";
4141
echo "3: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n";
4242

4343
// Numeric entities which are truncated at end of string
44-
// We do NOT decode such entities; they can be terminated by any non-digit character, but not by the end of the string
45-
echo "4: " . mb_decode_numericentity('&#1000000000', $convmap), "\n";
46-
echo "5: " . mb_decode_numericentity('&#9000000000', $convmap), "\n";
47-
echo "6: " . mb_decode_numericentity('&#10000000000', $convmap), "\n";
48-
echo "7: " . mb_decode_numericentity('&#100000000000', $convmap), "\n";
49-
echo "8: " . mb_decode_numericentity('&#000000000000', $convmap), "\n";
50-
echo "9: " . mb_decode_numericentity('&#00000000000', $convmap), "\n";
51-
echo "10: " . mb_decode_numericentity('&#0000000000', $convmap), "\n";
52-
echo "11: " . mb_decode_numericentity('&#000000000', $convmap), "\n";
44+
echo "4: " . mb_decode_numericentity('&#1000000000', $convmap), "\n"; // Entity is too big
45+
echo "5: " . mb_decode_numericentity('&#9000000000', $convmap), "\n"; // Entity is too big
46+
echo "6: " . mb_decode_numericentity('&#10000000000', $convmap), "\n"; // Too many digits
47+
echo "7: " . mb_decode_numericentity('&#100000000000', $convmap), "\n"; // Too many digits
48+
echo "8: " . mb_decode_numericentity('&#000000000000', $convmap), "\n"; // Too many digits
49+
echo "9: " . mb_decode_numericentity('&#00000000000', $convmap), "\n"; // Too many digits
50+
echo "10: " . bin2hex(mb_decode_numericentity('&#0000000000', $convmap)), "\n"; // OK
51+
echo "11: " . bin2hex(mb_decode_numericentity('&#000000000', $convmap)), "\n"; // OK
5352
// Try with hex, not just decimal entities
54-
echo "11b: " . mb_decode_numericentity('&#x0000000', $convmap), "\n";
55-
echo "11c: " . mb_decode_numericentity('&#x00000000', $convmap), "\n";
56-
echo "11d: " . mb_decode_numericentity('&#x10000', $convmap), "\n";
53+
echo "11b: " . bin2hex(mb_decode_numericentity('&#x0000000', $convmap)), "\n"; // OK
54+
echo "11c: " . bin2hex(mb_decode_numericentity('&#x00000000', $convmap)), "\n"; // OK
55+
echo "11d: " . bin2hex(mb_decode_numericentity('&#x10000', $convmap)), "\n"; // OK
56+
echo "11e: " . mb_decode_numericentity('&#x000000000', $convmap), "\n"; // Too many digits
5757

5858
// Large decimal entity, converting from non-ASCII input encoding
5959
echo "12: " . bin2hex(mb_decode_numericentity(mb_convert_encoding('&#12345678;', 'UCS-4', 'ASCII'), [0, 0x7FFFFFFF, 0, 0x7FFFFFFF], 'UCS-4')), "\n";
@@ -100,6 +100,8 @@ test("Successive &", "&&#65,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
100100
test("Successive &#", "&#&#x32;", "&#2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
101101
test("Successive &#x", "&#x&#x32;", "&#x2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
102102

103+
test("&#x only", "&#x;", "&#x;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
104+
103105
// The starting & of an entity can terminate a preceding entity
104106
test("Successive &#65", "&#65&#65;", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
105107
test("Successive hex entities", "&#x32&#x32;", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
@@ -131,6 +133,36 @@ test("Regression test (truncation of successive & with JIS encoding)", "&&&", "&
131133
// Previously, signed arithmetic was used on convmap entries
132134
test("Regression test (convmap entries are now treated as unsigned)", "&#7,", "?,", [0x22FFFF11, 0xBF111189, 0x67726511, 0x1161E719], "ASCII");
133135

136+
// Try with '&', '&#', or '&#' at the end of a buffer of wchars, with more input
137+
// still left to process in the next buffer
138+
// (mb_decode_numericentity splits its input into 'chunks' and processes it one
139+
// chunk at a time)
140+
$convmap = [0, 0xFFFF, 0, 0xFFFF];
141+
for ($i = 0; $i < 256; $i++) {
142+
$padding = str_repeat("a", $i);
143+
// First try invalid decimal/hex entities
144+
if (mb_decode_numericentity($padding . "&#ZZZ", $convmap, 'UTF-8') !== $padding . "&#ZZZ")
145+
die("&#ZZZ is broken when it spans two buffers!");
146+
if (mb_decode_numericentity($padding . "&#xZZZ", $convmap, 'UTF-8') !== $padding . "&#xZZZ")
147+
die("&#xZZZ is broken when it spans two buffers!");
148+
// Now try valid decimal/hex entities
149+
if (mb_decode_numericentity($padding . "&#65", $convmap, 'UTF-8') !== $padding . "A")
150+
die("&#65 is broken when it spans two buffers!");
151+
if (mb_decode_numericentity($padding . "&#x41", $convmap, 'UTF-8') !== $padding . "A")
152+
die("&#x41 is broken when it spans two buffers!");
153+
}
154+
155+
// Try huge entities, big enough to fill an entire buffer
156+
for ($i = 12; $i < 256; $i++) {
157+
$str = "&#" . str_repeat("0", $i) . "65";
158+
if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str)
159+
die("Decimal entity with huge number of digits broken");
160+
161+
$str = "&#x" . str_repeat("0", $i) . "41";
162+
if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str)
163+
die("Hexadecimal entity with huge number of digits broken");
164+
}
165+
134166
?>
135167
--EXPECT--
136168
1: ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
@@ -142,11 +174,12 @@ test("Regression test (convmap entries are now treated as unsigned)", "&#7,", "?
142174
7: &#100000000000
143175
8: &#000000000000
144176
9: &#00000000000
145-
10: &#0000000000
146-
11: &#000000000
147-
11b: &#x0000000
148-
11c: &#x00000000
149-
11d: &#x10000
177+
10: 00
178+
11: 00
179+
11b: 00
180+
11c: 00
181+
11d: f0908080
182+
11e: &#x000000000
150183
12: 00bc614e
151184
13: f&ouml;o
152185
14: mb_decode_numericentity(): Argument #2 ($map) must have a multiple of 4 elements
@@ -164,6 +197,7 @@ Single &: string(1) "&" => string(1) "&" (Good)
164197
Successive &: string(6) "&&#65," => string(3) "&A," (Good)
165198
Successive &#: string(8) "&#&#x32;" => string(3) "&#2" (Good)
166199
Successive &#x: string(9) "&#x&#x32;" => string(4) "&#x2" (Good)
200+
&#x only: string(4) "&#x;" => string(4) "&#x;" (Good)
167201
Successive &#65: string(9) "&#65&#65;" => string(2) "AA" (Good)
168202
Successive hex entities: string(11) "&#x32&#x32;" => string(2) "22" (Good)
169203
Starting entity immediately after decimal entity which is too long: string(18) "&#10000000000&#65;" => string(14) "&#10000000000A" (Good)

0 commit comments

Comments
 (0)
Please sign in to comment.