Skip to content

Commit 5d6bd55

Browse files
committedJul 18, 2022
mb_decode_numericentity converts entities which immediately follow a valid/invalid entity
Thanks to Kamil Tieleka for suggesting that some of the behaviors of the legacy implementation which the new mb_decode_numericentity implementation took care to maintain were actually bugs and should be fixed. Thanks also to Trevor Rowbotham for providing a link to the HTML specification, showing how HTML numeric entities should be interpreted. mb_decode_numericentity now processes numeric entities in the following situations where the old implementation would not: - &<ENTITY> (for example, &&#65;) - &#<ENTITY> - &#x<ENTITY> - <VALID BUT UNTERMINATED DECIMAL ENTITY><ENTITY> (for example, &#65&#65;) - <VALID BUT UNTERMINATED HEX ENTITY><ENTITY> - <INVALID AND UNTERMINATED DECIMAL ENTITY><ENTITY> (it does not matter why the first entity is invalid; the value could be too big, it could have too many digits, or it could not match the 'convmap' parameter) - <INVALID AND UNTERMINATED HEX ENTITY><ENTITY> This is consistent with the way that web browsers process HTML entities.
1 parent 30bfeef commit 5d6bd55

File tree

2 files changed

+30
-63
lines changed

2 files changed

+30
-63
lines changed
 

‎ext/mbstring/mbstring.c

+8-37
Original file line numberDiff line numberDiff line change
@@ -3407,11 +3407,6 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34073407
/* Invalid entity */
34083408
memcpy(converted, p, (p2 - p) * 4);
34093409
converted += p2 - p;
3410-
if ((p2 - p) == 3) {
3411-
/* For compatibility with legacy behavior; if &#x is followed
3412-
* by another entity, we don't convert the 2nd entity */
3413-
*converted++ = *p2++;
3414-
}
34153410
} else {
34163411
/* Valid hexadecimal entity */
34173412
uint32_t value = 0;
@@ -3429,16 +3424,12 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34293424
uint32_t original;
34303425
if (html_numeric_entity_deconvert(value, convmap, mapsize, &original)) {
34313426
*converted++ = original;
3432-
if (*p2 != ';')
3433-
*converted++ = *p2;
3427+
if (*p2 == ';')
3428+
p2++;
34343429
} else {
3435-
memcpy(converted, p, (p2 - p + 1) * 4);
3436-
converted += p2 - p + 1;
3430+
memcpy(converted, p, (p2 - p) * 4);
3431+
converted += p2 - p;
34373432
}
3438-
/* For compatibility with legacy behavior:
3439-
* We don't consider & as starting a new entity if it
3440-
* terminates a preceding entity */
3441-
p2++;
34423433
}
34433434
} else if (p2 == wchar_buf + out_len && in_len) {
34443435
wchar_buf[0] = '&';
@@ -3468,11 +3459,6 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34683459
/* Invalid entity */
34693460
memcpy(converted, p, (p2 - p) * 4);
34703461
converted += p2 - p;
3471-
if ((p2 - p) == 2) {
3472-
/* For compatibility with legacy behavior; if &# is followed
3473-
* by another entity, we don't convert the 2nd entity */
3474-
*converted++ = *p2++;
3475-
}
34763462
} else {
34773463
/* Valid decimal entity */
34783464
uint32_t value = 0;
@@ -3491,16 +3477,12 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34913477
uint32_t original;
34923478
if (html_numeric_entity_deconvert(value, convmap, mapsize, &original)) {
34933479
*converted++ = original;
3494-
if (*p2 != ';')
3495-
*converted++ = *p2;
3480+
if (*p2 == ';')
3481+
p2++;
34963482
} else {
3497-
memcpy(converted, p, (p2 - p + 1) * 4);
3498-
converted += p2 - p + 1;
3483+
memcpy(converted, p, (p2 - p) * 4);
3484+
converted += p2 - p;
34993485
}
3500-
/* For compatibility with legacy behavior:
3501-
* We don't consider & as starting a new entity if it
3502-
* terminates a preceding entity */
3503-
p2++;
35043486
}
35053487
}
35063488
} else if (p2 == wchar_buf + out_len) {
@@ -3515,17 +3497,6 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
35153497
}
35163498
} else {
35173499
*converted++ = '&';
3518-
if (*p2 == '&') {
3519-
/* Two successive ampersands convert to a single one,
3520-
* and we do NOT allow the second one to start an entity
3521-
* Duplicating the legacy behavior of mb_decode_numericentity here */
3522-
*converted++ = '&';
3523-
p2++;
3524-
if (p2 == wchar_buf + out_len) {
3525-
/* Corner case: two &'s at end of buffer */
3526-
goto process_converted_wchars_no_offset;
3527-
}
3528-
}
35293500
}
35303501
decimal_entity_too_big:
35313502

‎ext/mbstring/tests/mb_decode_numericentity.phpt

+22-26
Original file line numberDiff line numberDiff line change
@@ -93,33 +93,28 @@ test("More than 8 digits for hex entity", "&#x000000141;", "&#x000000141;", [0,
9393

9494
test("Single &", "&", "&", [0, 0xFFFF, 0, 0xFFFF], "ASCII");
9595

96-
// We don't allow an entity to come right after a preceding ampersand
97-
// (This is for compatibility with the legacy behavior of mb_decode_numericentity)
98-
test("Successive &", "&&#2,", "&&#2,", [0xffe9ade7, 0x6237b6ff, 0xaa597469, 0x612800], 'ASCII');
96+
// An entity can come right after a preceding ampersand
97+
test("Successive &", "&&#65,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
9998

100-
// We don't allow an entity to come right after a preceding &#
101-
// (Also for compatibility with the legacy behavior of mb_decode_numericentity)
102-
test("Successive &#", "&#&#x32;", "&#&#x32;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
103-
test("Successive &#x", "&#x&#x32;", "&#x&#x32;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
99+
// An entity can come right after a preceding &#
100+
test("Successive &#", "&#&#x32;", "&#2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
101+
test("Successive &#x", "&#x&#x32;", "&#x2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
104102

105-
// Don't allow the starting & of an entity to terminate a preceding entity
106-
// (Also for compatibility with the legacy behavior of mb_decode_numericentity)
107-
test("Successive &#65", "&#65&#65;", "A&#65;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
103+
// The starting & of an entity can terminate a preceding entity
104+
test("Successive &#65", "&#65&#65;", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
105+
test("Successive hex entities", "&#x32&#x32;", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
108106

109-
// An entity CAN come right after an entity which is invalid because of being too long
107+
// An entity can come right after an entity which is invalid because of being too long
110108
test("Starting entity immediately after decimal entity which is too long", "&#10000000000&#65;", "&#10000000000A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
111109
test("Starting entity immediately after hex entity which is too long", "&#x111111111&#65;", "&#x111111111A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
112110

113-
// The second entity is not accepted here, because it terminates a preceding entity
114-
// (To test entities which are as large as possible without being too large, we need to use UCS-4;
115-
// any other encoding would not allow codepoints that large)
116111
$ucs4_test1 = mb_convert_encoding("&#1000000000&#65;", 'UCS-4BE', 'ASCII');
117-
testNonAscii("Starting entity immediately after valid decimal entity which is just within maximum length", $ucs4_test1, "\x3B\x9A\xCA\x00\x00\x00\x00&\x00\x00\x00#\x00\x00\x006\x00\x00\x005\x00\x00\x00;", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
112+
testNonAscii("Starting entity immediately after valid decimal entity which is just within maximum length", $ucs4_test1, "\x3B\x9A\xCA\x00\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
118113
$ucs4_test2 = mb_convert_encoding("&#x11111111&#65;", 'UCS-4BE', 'ASCII');
119-
testNonAscii("Starting entity immediately after valid hex entity which is just within maximum length", $ucs4_test2, "\x11\x11\x11\x11\x00\x00\x00&\x00\x00\x00#\x00\x00\x006\x00\x00\x005\x00\x00\x00;", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
114+
testNonAscii("Starting entity immediately after valid hex entity which is just within maximum length", $ucs4_test2, "\x11\x11\x11\x11\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
120115

121-
test("Starting entity immediately after invalid decimal entity", "&#0&#65;", "&#0&#65;", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
122-
test("Starting entity immediately after invalid hex entity", "&#x0&#65;", "&#x0&#65;", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
116+
test("Starting entity immediately after invalid decimal entity", "&#0&#65;", "&#0A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
117+
test("Starting entity immediately after invalid hex entity", "&#x0&#65;", "&#x0A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
123118

124119
test("Starting entity immediately after too-big decimal entity", "&#7001492542&#65;", "&#7001492542A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'ASCII');
125120

@@ -166,16 +161,17 @@ More than 10 digits for decimal entity: string(14) "&#00000000165;" => string(14
166161
8 digits for hex entity: string(12) "&#x00000041;" => string(1) "A" (Good)
167162
More than 8 digits for hex entity: string(13) "&#x000000141;" => string(13) "&#x000000141;" (Good)
168163
Single &: string(1) "&" => string(1) "&" (Good)
169-
Successive &: string(5) "&&#2," => string(5) "&&#2," (Good)
170-
Successive &#: string(8) "&#&#x32;" => string(8) "&#&#x32;" (Good)
171-
Successive &#x: string(9) "&#x&#x32;" => string(9) "&#x&#x32;" (Good)
172-
Successive &#65: string(9) "&#65&#65;" => string(6) "A&#65;" (Good)
164+
Successive &: string(6) "&&#65," => string(3) "&A," (Good)
165+
Successive &#: string(8) "&#&#x32;" => string(3) "&#2" (Good)
166+
Successive &#x: string(9) "&#x&#x32;" => string(4) "&#x2" (Good)
167+
Successive &#65: string(9) "&#65&#65;" => string(2) "AA" (Good)
168+
Successive hex entities: string(11) "&#x32&#x32;" => string(2) "22" (Good)
173169
Starting entity immediately after decimal entity which is too long: string(18) "&#10000000000&#65;" => string(14) "&#10000000000A" (Good)
174170
Starting entity immediately after hex entity which is too long: string(17) "&#x111111111&#65;" => string(13) "&#x111111111A" (Good)
175-
Starting entity immediately after valid decimal entity which is just within maximum length: 000000260000002300000031000000300000003000000030000000300000003000000030000000300000003000000030000000260000002300000036000000350000003b => 3b9aca00000000260000002300000036000000350000003b (Good)
176-
Starting entity immediately after valid hex entity which is just within maximum length: 0000002600000023000000780000003100000031000000310000003100000031000000310000003100000031000000260000002300000036000000350000003b => 11111111000000260000002300000036000000350000003b (Good)
177-
Starting entity immediately after invalid decimal entity: string(8) "&#0&#65;" => string(8) "&#0&#65;" (Good)
178-
Starting entity immediately after invalid hex entity: string(9) "&#x0&#65;" => string(9) "&#x0&#65;" (Good)
171+
Starting entity immediately after valid decimal entity which is just within maximum length: 000000260000002300000031000000300000003000000030000000300000003000000030000000300000003000000030000000260000002300000036000000350000003b => 3b9aca0000000041 (Good)
172+
Starting entity immediately after valid hex entity which is just within maximum length: 0000002600000023000000780000003100000031000000310000003100000031000000310000003100000031000000260000002300000036000000350000003b => 1111111100000041 (Good)
173+
Starting entity immediately after invalid decimal entity: string(8) "&#0&#65;" => string(4) "&#0A" (Good)
174+
Starting entity immediately after invalid hex entity: string(9) "&#x0&#65;" => string(5) "&#x0A" (Good)
179175
Starting entity immediately after too-big decimal entity: string(17) "&#7001492542&#65;" => string(13) "&#7001492542A" (Good)
180176
Regression test (entity which decodes to 0xFFFFFFFF): string(5) "&#xe;" => string(1) "?" (Good)
181177
Regression test (truncation of successive & with JIS encoding): string(3) "&&&" => string(3) "&&&" (Good)

0 commit comments

Comments
 (0)