Skip to content

Commit 161ee22

Browse files
committed
Implement fast text conversion interface for ISO-2022-JP-KDDI
There was one bug in the previous implementation: when it saw a sequence of codepoints which looked like they might need to be emitted as a special KDDI emoji, it would completely forget whether it was in ASCII mode, JIS X 0208 mode, or something else, so it could not reliably emit the correct escape sequence to switch to the right mode. Further, if the input ended with a codepoint which looked like it could be part of a special KDDI emoji, the legacy code did not emit an escape sequence to switch back to ASCII mode at the end of the string. This meant that the emitted ISO-2022-JP-KDDI strings could not always be safely concatenated.
1 parent 461b09b commit 161ee22

File tree

1 file changed

+285
-2
lines changed

1 file changed

+285
-2
lines changed

ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c

+285-2
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,16 @@
3434
#include "unicode_table_cp932_ext.h"
3535
#include "unicode_table_jis.h"
3636
#include "cp932_table.h"
37+
#include "emoji2uni.h"
38+
39+
static size_t mb_iso2022jp_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
40+
static void mb_wchar_to_iso2022jp_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
3741

3842
static int mbfl_filt_conv_2022jp_mobile_wchar_flush(mbfl_convert_filter *filter);
3943
static int mbfl_filt_conv_wchar_2022jp_mobile_flush(mbfl_convert_filter *filter);
4044

45+
extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n);
46+
4147
static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};
4248

4349
const mbfl_encoding mbfl_encoding_2022jp_kddi = {
@@ -49,8 +55,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
4955
MBFL_ENCTYPE_GL_UNSAFE,
5056
&vtbl_2022jp_kddi_wchar,
5157
&vtbl_wchar_2022jp_kddi,
52-
NULL,
53-
NULL
58+
mb_iso2022jp_kddi_to_wchar,
59+
mb_wchar_to_iso2022jp_kddi
5460
};
5561

5662
const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {
@@ -115,6 +121,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_2022jp_kddi = {
115121
s1 = ((c1) << 8) | (c2); \
116122
s2 = 1
117123

124+
#define ASCII 0 /* Converter state: ASCII selected (the decoder sets it on ESC ( B or ESC ( J) */
118125
#define JISX0201_KANA 0x20
119126
#define JISX0208_KANJI 0x80
120127

@@ -363,3 +370,279 @@ static int mbfl_filt_conv_wchar_2022jp_mobile_flush(mbfl_convert_filter *filter)
363370

364371
return 0;
365372
}
373+
374+
static size_t mb_iso2022jp_kddi_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
375+
{
376+
unsigned char *p = *in, *e = p + *in_len;
377+
uint32_t *out = buf, *limit = buf + bufsize - 1;
378+
379+
while (p < e && out < limit) {
380+
unsigned char c = *p++;
381+
382+
if (c == 0x1B) {
383+
if ((e - p) < 2) {
384+
p = e;
385+
*out++ = MBFL_BAD_INPUT;
386+
break;
387+
}
388+
unsigned char c2 = *p++;
389+
unsigned char c3 = *p++;
390+
391+
if (c2 == '$') {
392+
if (c3 == '@' || c3 == 'B') {
393+
*state = JISX0208_KANJI;
394+
} else if (c3 == '(') {
395+
if (p == e) {
396+
*out++ = MBFL_BAD_INPUT;
397+
break;
398+
}
399+
unsigned char c4 = *p++;
400+
401+
if (c4 == '@' || c4 == 'B') {
402+
*state = JISX0208_KANJI;
403+
} else {
404+
*out++ = MBFL_BAD_INPUT;
405+
}
406+
} else {
407+
*out++ = MBFL_BAD_INPUT;
408+
}
409+
} else if (c2 == '(') {
410+
if (c3 == 'B' || c3 == 'J') {
411+
*state = ASCII;
412+
} else if (c3 == 'I') {
413+
*state = JISX0201_KANA;
414+
} else {
415+
*out++ = MBFL_BAD_INPUT;
416+
}
417+
} else {
418+
p--;
419+
*out++ = MBFL_BAD_INPUT;
420+
}
421+
} else if (*state == JISX0201_KANA && c >= 0x21 && c <= 0x5F) {
422+
*out++ = 0xFF40 + c;
423+
} else if (*state == JISX0208_KANJI && c >= 0x21 && c <= 0x7F) {
424+
if (p == e) {
425+
*out++ = MBFL_BAD_INPUT;
426+
break;
427+
}
428+
unsigned char c2 = *p++;
429+
430+
if (c2 >= 0x21 && c2 <= 0x7E) {
431+
unsigned int s = ((c - 0x21) * 94) + c2 - 0x21;
432+
uint32_t w = 0;
433+
434+
if (s <= 137) {
435+
if (s == 31) {
436+
w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */
437+
} else if (s == 32) {
438+
w = 0xFF5E; /* FULLWIDTH TILDE */
439+
} else if (s == 33) {
440+
w = 0x2225; /* PARALLEL TO */
441+
} else if (s == 60) {
442+
w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */
443+
} else if (s == 80) {
444+
w = 0xFFE0; /* FULLWIDTH CENT SIGN */
445+
} else if (s == 81) {
446+
w = 0xFFE1; /* FULLWIDTH POUND SIGN */
447+
} else if (s == 137) {
448+
w = 0xFFE2; /* FULLWIDTH NOT SIGN */
449+
}
450+
}
451+
452+
if (s >= (84 * 94) && s < (91 * 94)) {
453+
int snd = 0;
454+
s += 22 * 94;
455+
w = mbfilter_sjis_emoji_kddi2unicode(s, &snd);
456+
if (w && snd) {
457+
*out++ = snd;
458+
}
459+
}
460+
461+
if (!w) {
462+
if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {
463+
w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
464+
} else if (s < jisx0208_ucs_table_size) {
465+
w = jisx0208_ucs_table[s];
466+
}
467+
}
468+
469+
*out++ = w ? w : MBFL_BAD_INPUT;
470+
} else {
471+
*out++ = MBFL_BAD_INPUT;
472+
}
473+
} else if (c <= 0x7F) {
474+
*out++ = c;
475+
} else if (c >= 0xA1 && c <= 0xDF) {
476+
*out++ = 0xFEC0 + c;
477+
} else {
478+
*out++ = MBFL_BAD_INPUT;
479+
}
480+
}
481+
482+
*in_len = e - p;
483+
*in = p;
484+
return out - buf;
485+
}
486+
487+
/* The Unicode Regional Indicator symbols occupy U+1F1E6..U+1F1FF, one for each
 * of the letters A-Z. A country flag emoji is written as two of them, spelling
 * out the two-letter code for that country.
 *
 * NFLAGS maps an uppercase ASCII letter to its Regional Indicator codepoint
 * ('A' is 0x41, and 0x1F1A5 + 0x41 == 0x1F1E6). */
#define NFLAGS(c) (0x1F1A5+((unsigned int)(c)))

/* Two-letter country codes of the flags which have a KDDI emoji. The entries
 * are exactly two characters wide and are NOT NUL-terminated. */
static const char nflags_s[10][2] = {
	{'C', 'N'}, {'D', 'E'}, {'E', 'S'}, {'F', 'R'}, {'G', 'B'},
	{'I', 'T'}, {'J', 'P'}, {'K', 'R'}, {'R', 'U'}, {'U', 'S'}
};

/* KDDI emoji code for each flag, in the same order as nflags_s */
static const int nflags_code_kddi[10] = {
	0x2549, /* CN */
	0x2546, /* DE */
	0x24C0, /* ES */
	0x2545, /* FR */
	0x2548, /* GB */
	0x2547, /* IT */
	0x2750, /* JP */
	0x254A, /* KR */
	0x24C1, /* RU */
	0x27F7  /* US */
};
500+
501+
/* Convert a KDDI emoji code, as stored in the emoji tables, to the two-byte JIS
 * code which is written to the output: split it into a 94-cell row and cell,
 * offset both bytes by 0x21, then subtract 0x1600 from the result (lowering the
 * row byte by 22; the decoder does the reverse with `s += 22 * 94`). */
static inline unsigned int kddi_emoji_code_to_jis(unsigned int s1)
{
	return (((s1 / 94) + 0x21) << 8) + ((s1 % 94) + 0x21) - 0x1600;
}

/*
 * Fast conversion interface: encode Unicode codepoints as ISO-2022-JP-KDDI.
 *
 * in, len  Codepoints to encode.
 * buf      Output buffer. buf->state records which character set is currently
 *          selected in the output (ASCII, JISX0201_KANA or JISX0208_KANJI), so
 *          an escape sequence is only written when the character set changes.
 * end      True if this is the last chunk of input; if so, and the output is
 *          not in ASCII mode, a final 'ESC ( B' is appended.
 *
 * Codepoints which cannot be encoded are passed to MB_CONVERT_ERROR.
 */
static void mb_wchar_to_iso2022jp_kddi(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;
		unsigned int s = 0;

		/* 1. Ordinary Unicode -> JIS conversion tables */
		if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
			s = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
		} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
			s = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
		} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
			s = ucs_i_jis_table[w - ucs_i_jis_table_min];
		} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
			s = ucs_r_jis_table[w - ucs_r_jis_table_min];
		}

		/* 2. Fixed mappings for codepoints which the tables did not convert */
		if (!s) {
			if (w == 0xA5) { /* YEN SIGN */
				s = 0x216F; /* FULLWIDTH YEN SIGN */
			} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
				s = 0x2140;
			} else if (w == 0x2225) { /* PARALLEL TO */
				s = 0x2142;
			} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
				s = 0x215D;
			} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
				s = 0x2171;
			} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
				s = 0x2172;
			} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
				s = 0x224C;
			}
		}

		/* 3. Emoji which are written as two codepoints. The following codepoint
		 * is only peeked at, and is consumed only if the pair really is an emoji.
		 * Nothing is peeked when `w` is the last codepoint of this chunk. */
		if ((w == '#' || (w >= '0' && w <= '9')) && len) {
			if (*in == 0x20E3) { /* COMBINING ENCLOSING KEYCAP */
				unsigned int s1;
				if (w == '#') {
					s1 = 0x25BC;
				} else if (w == '0') {
					s1 = 0x2830;
				} else { /* '1'-'9' */
					s1 = 0x27A6 + (w - '1');
				}
				s = kddi_emoji_code_to_jis(s1);
				in++; len--;
			}
		} else if (w >= NFLAGS('C') && w <= NFLAGS('U') && len) { /* C for CN, U for US */
			uint32_t w2 = *in;

			if (w2 >= NFLAGS('B') && w2 <= NFLAGS('U')) { /* B for GB, U for RU */
				for (int i = 0; i < 10; i++) {
					if (w == NFLAGS(nflags_s[i][0]) && w2 == NFLAGS(nflags_s[i][1])) {
						s = kddi_emoji_code_to_jis(nflags_code_kddi[i]);
						in++; len--;
						break;
					}
				}
			}
		}

		/* 4. Emoji which are written as one codepoint */
		if (w == 0xA9) { /* Copyright sign */
			s = kddi_emoji_code_to_jis(0x27DC);
		} else if (w == 0xAE) { /* Registered sign */
			s = kddi_emoji_code_to_jis(0x27DD);
		} else if (w >= mb_tbl_uni_kddi2code2_min && w <= mb_tbl_uni_kddi2code2_max) {
			int i = mbfl_bisec_srch2(w, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len);
			if (i >= 0) {
				s = kddi_emoji_code_to_jis(mb_tbl_uni_kddi2code2_value[i]);
			}
		} else if (w >= mb_tbl_uni_kddi2code3_min && w <= mb_tbl_uni_kddi2code3_max) {
			int i = mbfl_bisec_srch2(w - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len);
			if (i >= 0) {
				s = kddi_emoji_code_to_jis(mb_tbl_uni_kddi2code3_value[i]);
			}
		} else if (w >= mb_tbl_uni_kddi2code5_min && w <= mb_tbl_uni_kddi2code5_max) {
			int i = mbfl_bisec_srch2(w - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len);
			if (i >= 0) {
				s = kddi_emoji_code_to_jis(mb_tbl_uni_kddi2code5_val[i]);
			}
		}

		/* 5. Last resort: reverse lookup in the CP932 extension table. A result
		 * of 0xA1A1 or greater from the earlier steps is discarded first.
		 * U+0000 is never looked up, so that it always encodes as a zero byte. */
		if (!s || s >= 0xA1A1) {
			s = 0;
			if (w) {
				for (int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
					if (w == cp932ext1_ucs_table[i]) {
						s = (((i / 94) + 0x2D) << 8) + (i % 94) + 0x21;
						break;
					}
				}
			}
		}

		/* 6. Write the output, switching character set first if necessary.
		 * Every MB_CONVERT_BUF_ENSURE reserves the bytes which are about to be
		 * written, plus one byte for each codepoint which remains. */
		if ((!s && w) || s > 0x7E7E) {
			/* No mapping was found, or the code does not fit in two 7-bit bytes */
			MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_iso2022jp_kddi);
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
		} else if (s <= 0x7F) {
			if (buf->state != ASCII) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
				buf->state = ASCII;
			}
			out = mb_convert_buf_add(out, s);
		} else if (s >= 0xA1 && s <= 0xDF) {
			if (buf->state != JISX0201_KANA) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4);
				out = mb_convert_buf_add3(out, 0x1B, '(', 'I');
				buf->state = JISX0201_KANA;
			}
			/* Kana are written as 7-bit bytes */
			out = mb_convert_buf_add(out, s & 0x7F);
		} else {
			if (buf->state != JISX0208_KANJI) {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 5);
				out = mb_convert_buf_add3(out, 0x1B, '$', 'B');
				buf->state = JISX0208_KANJI;
			} else {
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2);
			}
			out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF);
		}
	}

	/* Finish in ASCII mode, so that the output can be safely concatenated with
	 * other strings */
	if (end && buf->state != ASCII) {
		MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
		out = mb_convert_buf_add3(out, 0x1B, '(', 'B');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}

0 commit comments

Comments
 (0)