Skip to content

Commit 9ac49c0

Browse files
committed
New implementation of mb_convert_kana
mb_convert_kana now uses the new text encoding conversion filters. Microbenchmarking shows speed gains of 50%-150% across various text encodings and input string lengths. The behavior is the same as the old mb_convert_kana except for one fix: if the 'zero codepoint' U+0000 appeared in the input, the old implementation would sometimes drop it instead of passing it through to the output. The new implementation always passes it through.
1 parent 840423d commit 9ac49c0

10 files changed

+308
-421
lines changed

ext/mbstring/config.m4

-1
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,6 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [
118118
libmbfl/filters/mbfilter_sjis_mobile.c
119119
libmbfl/filters/mbfilter_sjis_mac.c
120120
libmbfl/filters/mbfilter_sjis_2004.c
121-
libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c
122121
libmbfl/filters/mbfilter_ucs2.c
123122
libmbfl/filters/mbfilter_ucs4.c
124123
libmbfl/filters/mbfilter_uhc.c

ext/mbstring/config.w32

+1-2
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,7 @@ if (PHP_MBSTRING != "no") {
2828
mbfilter_utf8_mobile.c mbfilter_uuencode.c \
2929
mbfilter_cp5022x.c mbfilter_sjis_mobile.c \
3030
mbfilter_sjis_mac.c \
31-
mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c \
32-
mbfilter_tl_jisx0201_jisx0208.c", "mbstring");
31+
mbfilter_iso2022jp_mobile.c mbfilter_singlebyte.c", "mbstring");
3332

3433
ADD_SOURCES("ext/mbstring/libmbfl/mbfl", "mbfilter.c mbfilter_8bit.c \
3534
mbfilter_pass.c mbfilter_wchar.c mbfl_convert.c mbfl_encoding.c \

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c

+7-4
Original file line number | Diff line number | Diff line change
@@ -25,11 +25,11 @@
2525
#include "mbfilter.h"
2626
#include "mbfilter_cp5022x.h"
2727
#include "mbfilter_jis.h"
28-
#include "mbfilter_tl_jisx0201_jisx0208.h"
2928

3029
#include "unicode_table_cp932_ext.h"
3130
#include "unicode_table_jis.h"
3231
#include "cp932_table.h"
32+
#include "translit_kana_jisx0201_jisx0208.h"
3333

3434
static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
3535
static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter);
@@ -40,6 +40,9 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
4040
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
4141
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
4242

43+
/* See mbstring.c */
44+
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
45+
4346
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
4447
* This was just CP50220, but the implementation was less strict regarding
4548
* invalid characters; it would silently pass some through
@@ -336,7 +339,7 @@ static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
336339
bool consumed = false;
337340

338341
if (filter->cache) {
339-
int s = mbfl_convert_kana(filter->cache, c, &consumed, NULL, mode);
342+
int s = mb_convert_kana_codepoint(filter->cache, c, &consumed, NULL, mode);
340343
filter->cache = consumed ? 0 : c;
341344
/* Terrible hack to get CP50220 to emit error markers in the proper
342345
* position, not reordering them with subsequent characters */
@@ -359,7 +362,7 @@ static int mbfl_filt_conv_wchar_cp50220_flush(mbfl_convert_filter *filter)
359362
int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;
360363

361364
if (filter->cache) {
362-
int s = mbfl_convert_kana(filter->cache, 0, NULL, NULL, mode);
365+
int s = mb_convert_kana_codepoint(filter->cache, 0, NULL, NULL, mode);
363366
mbfl_filt_conv_wchar_cp50221(s, filter);
364367
filter->cache = 0;
365368
}
@@ -866,7 +869,7 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
866869
buf->state |= w << 8;
867870
break;
868871
} else {
869-
w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
872+
w = mb_convert_kana_codepoint(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
870873
}
871874

872875
if (consumed) {

ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.c

-252
This file was deleted.

ext/mbstring/libmbfl/filters/mbfilter_tl_jisx0201_jisx0208.h

-56
This file was deleted.

0 commit comments

Comments
 (0)