Skip to content

Commit ffbddc4

Browse files
committed Jan 4, 2023
Optimize conversion of GB18030 to Unicode
As with CP936, iterating over the PUA table and looking for matches in it was a significant bottleneck for GB18030 decoding (though not as severe a bottleneck as for CP936, since more is involved in GB18030 decoding than in CP936 decoding).

Here are some benchmark results after optimizing out that bottleneck:

GB18030, medium - to UTF-16BE - faster by 60.71% (0.0007 vs 0.0017)
GB18030, medium - to UTF-8 - faster by 59.88% (0.0007 vs 0.0017)
GB18030, long - to UTF-8 - faster by 44.91% (0.0669 vs 0.1214)
GB18030, long - to UTF-16BE - faster by 43.05% (0.0672 vs 0.1181)
GB18030, short - to UTF-8 - faster by 27.22% (0.0003 vs 0.0004)
GB18030, short - to UTF-16BE - faster by 26.98% (0.0003 vs 0.0004)

(The 'short' test strings had 0-5 codepoints each, 'medium' ~100 codepoints, and 'long' ~10,000 codepoints. For each benchmark, the test harness cycled through all the test strings 40,000 times.)
1 parent 703725e commit ffbddc4

File tree

1 file changed

+45
-17
lines changed

1 file changed

+45
-17
lines changed
 

‎ext/mbstring/libmbfl/filters/mbfilter_gb18030.c

+45-17
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,22 @@ int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
388388
return 0;
389389
}
390390

391+
static const unsigned short gb18030_pua_tbl3[] = {
392+
/* First slot is for two-byte GB18030 code 0xFE50.
 * mb_gb18030_to_wchar indexes this table with (w - 0x5DD0), where
 * w = (lead - 0x81)*192 + trail - 0x40 and 0x5DD0 <= w <= 0x5E20.
 * A zero entry means this table supplies no value for that slot; the
 * decoder then falls through to its subsequent checks. */
393+
0x0000,0xE816,0xE817,0xE818,0x0000,0x0000,0x0000,0x0000,
394+
0x0000,0xE81E,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
395+
0x0000,0xE826,0x0000,0x0000,0x0000,0x0000,0xE82B,0xE82C,
396+
0x0000,0x0000,0x0000,0x0000,0xE831,0xE832,0x0000,0x0000,
397+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE83B,0x0000,
398+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0xE843,0x0000,
399+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
400+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
401+
0xE854,0xE855,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
402+
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
403+
/* 0xFEA0: last slot (index 0x50, i.e. w == 0x5E20) */
404+
0xE864
405+
};
406+
391407
static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
392408
{
393409
unsigned char *p = *in, *e = p + *in_len;
@@ -398,9 +414,14 @@ static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
398414

399415
if (c < 0x80) {
400416
*out++ = c;
401-
} else if (c > 0x80 && c < 0xFF && p < e) {
417+
} else if (c == 0x80 || c == 0xFF) {
418+
*out++ = MBFL_BAD_INPUT;
419+
} else {
420+
if (p == e) {
421+
*out++ = MBFL_BAD_INPUT;
422+
break;
423+
}
402424
unsigned char c2 = *p++;
403-
unsigned int s = (c << 8) | c2;
404425

405426
if (((c >= 0x81 && c <= 0x84) || (c >= 0x90 && c <= 0xE3)) && c2 >= 0x30 && c2 <= 0x39) {
406427
if (p >= e) {
@@ -437,32 +458,39 @@ static size_t mb_gb18030_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
437458
} else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F) {
438459
/* UDA part 3: U+E4C6-U+E765 */
439460
*out++ = 96*(c - 0xA1) + c2 - (c2 >= 0x80 ? 0x41 : 0x40) + 0xE4C6;
440-
} else {
441-
if ((s >= 0xA2AB && s <= 0xA9FE) || (s >= 0xD7FA && s <= 0xD7FE) || (s >= 0xFE50 && s <= 0xFEA0)) {
442-
for (int i = 0; i < mbfl_gb18030_pua_tbl_max; i++) {
443-
if (s >= mbfl_gb18030_pua_tbl[i][2] && s <= mbfl_gb18030_pua_tbl[i][2] + mbfl_gb18030_pua_tbl[i][1] - mbfl_gb18030_pua_tbl[i][0]) {
444-
*out++ = s - mbfl_gb18030_pua_tbl[i][2] + mbfl_gb18030_pua_tbl[i][0];
445-
goto next_iteration;
461+
} else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF) {
462+
unsigned int w = (c - 0x81)*192 + c2 - 0x40;
463+
464+
if (w >= 0x192B) {
465+
if (w <= 0x1EBE) {
466+
if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55) && w != 0x1E7F) {
467+
*out++ = cp936_pua_tbl1[w - 0x192B];
468+
continue;
469+
}
470+
} else if (w >= 0x413A) {
471+
if (w <= 0x413E) {
472+
*out++ = cp936_pua_tbl2[w - 0x413A];
473+
continue;
474+
} else if (w >= 0x5DD0 && w <= 0x5E20) {
475+
unsigned int c = gb18030_pua_tbl3[w - 0x5DD0];
476+
if (c) {
477+
*out++ = c;
478+
continue;
479+
}
446480
}
447481
}
448482
}
449483

450-
if ((c >= 0xA1 && c <= 0xA9 && c2 >= 0xA1 && c2 <= 0xFE) ||
451-
(c >= 0xB0 && c <= 0xf7 && c2 >= 0xa1 && c2 <= 0xfe) ||
452-
(c >= 0x81 && c <= 0xa0 && c2 >= 0x40 && c2 <= 0xfe && c2 != 0x7f) ||
453-
(c >= 0xAA && c <= 0xfe && c2 >= 0x40 && c2 <= 0xa0 && c2 != 0x7f) ||
454-
(c >= 0xA8 && c <= 0xa9 && c2 >= 0x40 && c2 <= 0xa0 && c2 != 0x7F)) {
455-
unsigned int w = (c - 0x81)*192 + c2 - 0x40;
484+
if ((c >= 0x81 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0)) {
456485
ZEND_ASSERT(w < cp936_ucs_table_size);
457486
*out++ = cp936_ucs_table[w];
458487
} else {
459488
*out++ = MBFL_BAD_INPUT;
460489
}
490+
} else {
491+
*out++ = MBFL_BAD_INPUT;
461492
}
462-
} else {
463-
*out++ = MBFL_BAD_INPUT;
464493
}
465-
next_iteration: ;
466494
}
467495

468496
*in_len = e - p;

0 commit comments

Comments
 (0)
Please sign in to comment.