Skip to content

Commit 3cf4327

Browse files
committed Jul 18, 2022
Fix new conversion filter for CP50220 (multi-codepoint kana at end of buffer)
If two codepoints that needed to be collapsed into a single kuten code were split across a buffer boundary, with one at the end of one buffer and the other at the beginning of the next, they were not converted correctly. The trailing codepoint is now cached in the conversion state and reprocessed together with the start of the next buffer. This was discovered while fuzzing the new implementation of mb_decode_numericentity.
1 parent 7559bf7 commit 3cf4327

File tree

2 files changed

+27
-3
lines changed

2 files changed

+27
-3
lines changed
 

‎ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c

+19 -3
Original file line number | Diff line number | Diff line change
@@ -847,11 +847,27 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
847847
MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
848848

849849
bool consumed = false;
850+
uint32_t w;
850851

851-
while (len--) {
852-
uint32_t w = *in++;
852+
if (buf->state & 0xFFFF00) {
853+
/* Reprocess cached codepoint */
854+
w = buf->state >> 8;
855+
buf->state &= 0xFF;
856+
goto reprocess_codepoint;
857+
}
853858

854-
w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
859+
while (len--) {
860+
w = *in++;
861+
reprocess_codepoint:
862+
863+
if (w >= 0xFF61 && w <= 0xFF9F && !len && !end) {
864+
/* This codepoint may need to combine with the next one,
865+
* but the 'next one' will come in a separate buffer */
866+
buf->state |= w << 8;
867+
break;
868+
} else {
869+
w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
870+
}
855871

856872
if (consumed) {
857873
/* Two successive codepoints were converted into one */

‎ext/mbstring/tests/cp5022x_encoding.phpt

+8
Original file line number | Diff line number | Diff line change
@@ -382,6 +382,14 @@ $converted = mb_convert_encoding("\xff\xff\x00&", 'CP50220', 'UTF-16BE');
382382
if ($converted !== '?&')
383383
die("Bad handling of erroneous codepoint followed by good one (got " . bin2hex($converted) . ")");
384384

385+
// In CP50220, two codepoints can be collapsed into a single kuten code in some cases
386+
// This should work even on a boundary between separately processed buffers
387+
$shouldCollapse = "\xFF\x76\xFF\x9E";
388+
$expected = "\x1B\$B%,\x1B(B";
389+
for ($i = 0; $i < 256; $i++) {
390+
convertValidString(str_repeat("\x00a", $i) . $shouldCollapse, str_repeat('a', $i) . $expected, 'UTF-16BE', 'CP50220', false);
391+
}
392+
385393
?>
386394
--EXPECT--
387395
ASCII support OK

0 commit comments

Comments
 (0)
Please sign in to comment.