Skip to content

Commit 78ee184

Browse files
committed
Move kana conversion function to mbfilter_cp5022x.c
...to avoid a dependency of libmbfl on mbstring. Thanks to Nikita Popov for pointing this issue out.
1 parent e1351eb commit 78ee184

File tree

2 files changed

+195
-195
lines changed

2 files changed

+195
-195
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c

+192-3
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
4040
static void mb_wchar_to_cp50221(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
4141
static void mb_wchar_to_cp50222(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
4242

43-
/* See mbstring.c */
44-
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
45-
4643
/* Previously, a dubious 'encoding' called 'cp50220raw' was supported
4744
* This was just CP50220, but the implementation was less strict regarding
4845
* invalid characters; it would silently pass some through
@@ -333,6 +330,198 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
333330
return 0;
334331
}
335332

333+
/* Apply various transforms to input codepoint, such as converting halfwidth katakana
334+
* to fullwidth katakana. `mode` is a bitfield which controls which transforms are
335+
* actually performed. The bit values are defined in translit_kana_jisx0201_jisx0208.h.
336+
* `mode` must not call for transforms which are inverses (i.e. which would cancel
337+
* each other out).
338+
*
339+
* In some cases, successive input codepoints may be merged into one output codepoint.
340+
* (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
341+
* and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
342+
* will not be modified. If there is no following codepoint, `next` should be zero.
343+
*
344+
* Conversely, in some cases, one input codepoint may convert to two output codepoints.
345+
* If so, the second output codepoint will be stored in `*second`; otherwise, `*second` will not be modified.
346+
*
347+
* Return the resulting codepoint. If none of the requested transforms apply, return
348+
* the input codepoint unchanged.
349+
*/
350+
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, unsigned int mode)
351+
{
352+
if ((mode & MBFL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7D && c != '"' && c != '\'' && c != '\\') {
353+
return c + 0xFEE0;
354+
}
355+
if ((mode & MBFL_HAN2ZEN_ALPHA) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
356+
return c + 0xFEE0;
357+
}
358+
if ((mode & MBFL_HAN2ZEN_NUMERIC) && c >= '0' && c <= '9') {
359+
return c + 0xFEE0;
360+
}
361+
if ((mode & MBFL_HAN2ZEN_SPACE) && c == ' ') {
362+
return 0x3000;
363+
}
364+
365+
if (mode & (MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_HIRAGANA)) {
366+
/* Convert Hankaku kana to Zenkaku kana
367+
* Either all Hankaku kana (codepoints U+FF61..U+FF9F) will be converted
368+
* to Zenkaku katakana, or to Zenkaku hiragana */
369+
if ((mode & MBFL_HAN2ZEN_KATAKANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
370+
if (c >= 0xFF61 && c <= 0xFF9F) {
371+
int n = c - 0xFF60;
372+
373+
if (next >= 0xFF61 && next <= 0xFF9F) {
374+
if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
375+
*consumed = true;
376+
return 0x3001 + hankana2zenkana_table[n];
377+
}
378+
if (next == 0xFF9E && n == 19) {
379+
*consumed = true;
380+
return 0x30F4;
381+
}
382+
if (next == 0xFF9F && n >= 42 && n <= 46) {
383+
*consumed = true;
384+
return 0x3002 + hankana2zenkana_table[n];
385+
}
386+
}
387+
388+
return 0x3000 + hankana2zenkana_table[n];
389+
}
390+
}
391+
if ((mode & MBFL_HAN2ZEN_HIRAGANA) && (mode & MBFL_HAN2ZEN_GLUE)) {
392+
if (c >= 0xFF61 && c <= 0xFF9F) {
393+
int n = c - 0xFF60;
394+
395+
if (next >= 0xFF61 && next <= 0xFF9F) {
396+
if (next == 0xFF9E && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
397+
*consumed = true;
398+
return 0x3001 + hankana2zenhira_table[n];
399+
}
400+
if (next == 0xFF9F && n >= 42 && n <= 46) {
401+
*consumed = true;
402+
return 0x3002 + hankana2zenhira_table[n];
403+
}
404+
}
405+
406+
return 0x3000 + hankana2zenhira_table[n];
407+
}
408+
}
409+
if ((mode & MBFL_HAN2ZEN_KATAKANA) && c >= 0xFF61 && c <= 0xFF9F) {
410+
return 0x3000 + hankana2zenkana_table[c - 0xFF60];
411+
}
412+
if ((mode & MBFL_HAN2ZEN_HIRAGANA) && c >= 0xFF61 && c <= 0xFF9F) {
413+
return 0x3000 + hankana2zenhira_table[c - 0xFF60];
414+
}
415+
}
416+
417+
if (mode & MBFL_HAN2ZEN_SPECIAL) { /* special ascii to symbol */
418+
if (c == '\\' || c == 0xA5) { /* YEN SIGN */
419+
return 0xFFE5; /* FULLWIDTH YEN SIGN */
420+
}
421+
if (c == 0x7E || c == 0x203E) {
422+
return 0xFFE3; /* FULLWIDTH MACRON */
423+
}
424+
if (c == '\'') {
425+
return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
426+
}
427+
if (c == '"') {
428+
return 0x201D; /* RIGHT DOUBLE QUOTATION MARK */
429+
}
430+
}
431+
432+
if (mode & (MBFL_ZEN2HAN_ALL | MBFL_ZEN2HAN_ALPHA | MBFL_ZEN2HAN_NUMERIC | MBFL_ZEN2HAN_SPACE)) {
433+
/* Zenkaku to Hankaku */
434+
if ((mode & MBFL_ZEN2HAN_ALL) && c >= 0xFF01 && c <= 0xFF5D && c != 0xFF02 && c != 0xFF07 && c != 0xFF3C) {
435+
/* all fullwidth ASCII variants except " ' \ ~ */
436+
return c - 0xFEE0;
437+
}
438+
if ((mode & MBFL_ZEN2HAN_ALPHA) && ((c >= 0xFF21 && c <= 0xFF3A) || (c >= 0xFF41 && c <= 0xFF5A))) {
439+
return c - 0xFEE0;
440+
}
441+
if ((mode & MBFL_ZEN2HAN_NUMERIC) && (c >= 0xFF10 && c <= 0xFF19)) {
442+
return c - 0xFEE0;
443+
}
444+
if ((mode & MBFL_ZEN2HAN_SPACE) && (c == 0x3000)) {
445+
return ' ';
446+
}
447+
if ((mode & MBFL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
448+
return '-';
449+
}
450+
}
451+
452+
if (mode & (MBFL_ZEN2HAN_KATAKANA | MBFL_ZEN2HAN_HIRAGANA)) {
453+
/* Zenkaku kana to hankaku kana */
454+
if ((mode & MBFL_ZEN2HAN_KATAKANA) && c >= 0x30A1 && c <= 0x30F4) {
455+
/* Zenkaku katakana to hankaku kana */
456+
int n = c - 0x30A1;
457+
if (zenkana2hankana_table[n][1]) {
458+
*second = 0xFF00 + zenkana2hankana_table[n][1];
459+
}
460+
return 0xFF00 + zenkana2hankana_table[n][0];
461+
}
462+
if ((mode & MBFL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
463+
/* Zenkaku hiragana to hankaku kana */
464+
int n = c - 0x3041;
465+
if (zenkana2hankana_table[n][1]) {
466+
*second = 0xFF00 + zenkana2hankana_table[n][1];
467+
}
468+
return 0xFF00 + zenkana2hankana_table[n][0];
469+
}
470+
if (c == 0x3001) {
471+
return 0xFF64; /* HALFWIDTH IDEOGRAPHIC COMMA */
472+
}
473+
if (c == 0x3002) {
474+
return 0xFF61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
475+
}
476+
if (c == 0x300C) {
477+
return 0xFF62; /* HALFWIDTH LEFT CORNER BRACKET */
478+
}
479+
if (c == 0x300D) {
480+
return 0xFF63; /* HALFWIDTH RIGHT CORNER BRACKET */
481+
}
482+
if (c == 0x309B) {
483+
return 0xFF9E; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
484+
}
485+
if (c == 0x309C) {
486+
return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
487+
}
488+
if (c == 0x30FC) {
489+
return 0xFF70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
490+
}
491+
if (c == 0x30FB) {
492+
return 0xFF65; /* HALFWIDTH KATAKANA MIDDLE DOT */
493+
}
494+
}
495+
496+
if (mode & (MBFL_ZENKAKU_HIRA2KATA | MBFL_ZENKAKU_KATA2HIRA)) {
497+
if ((mode & MBFL_ZENKAKU_HIRA2KATA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309D || c == 0x309E)) {
498+
/* Zenkaku hiragana to Zenkaku katakana */
499+
return c + 0x60;
500+
}
501+
if ((mode & MBFL_ZENKAKU_KATA2HIRA) && ((c >= 0x30A1 && c <= 0x30F3) || c == 0x30FD || c == 0x30FE)) {
502+
/* Zenkaku katakana to Zenkaku hiragana */
503+
return c - 0x60;
504+
}
505+
}
506+
507+
if (mode & MBFL_ZEN2HAN_SPECIAL) { /* special symbol to ascii */
508+
if (c == 0xFFE5 || c == 0xFF3C) { /* FULLWIDTH YEN SIGN/FULLWIDTH REVERSE SOLIDUS */
509+
return '\\';
510+
}
511+
if (c == 0xFFE3 || c == 0x203E) { /* FULLWIDTH MACRON/OVERLINE */
512+
return '~';
513+
}
514+
if (c == 0x2018 || c == 0x2019) { /* LEFT/RIGHT SINGLE QUOTATION MARK*/
515+
return '\'';
516+
}
517+
if (c == 0x201C || c == 0x201D) { /* LEFT/RIGHT DOUBLE QUOTATION MARK */
518+
return '"';
519+
}
520+
}
521+
522+
return c;
523+
}
524+
336525
static int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter)
337526
{
338527
int mode = MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE;

0 commit comments

Comments
 (0)