Skip to content

Commit b1954f5

Browse files
committed
Use fast text conversion filters to implement mb_convert_variables
1 parent adfdfb2 commit b1954f5

File tree

2 files changed

+79
-63
lines changed

2 files changed

+79
-63
lines changed

ext/mbstring/mbstring.c

+15 -36
Original file line number | Diff line number | Diff line change
@@ -3180,7 +3180,7 @@ PHP_FUNCTION(mb_convert_kana)
31803180
RETVAL_STR(jp_kana_convert(str, enc, opt));
31813181
}
31823182

3183-
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
3183+
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, bool *recursion_error) /* {{{ */
31843184
{
31853185
mbfl_string string;
31863186
HashTable *ht;
@@ -3196,7 +3196,7 @@ static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zv
31963196
} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
31973197
if (Z_REFCOUNTED_P(var)) {
31983198
if (Z_IS_RECURSIVE_P(var)) {
3199-
*recursion_error = 1;
3199+
*recursion_error = true;
32003200
return 0;
32013201
}
32023202
Z_PROTECT_RECURSION_P(var);
@@ -3226,43 +3226,37 @@ static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zv
32263226
return 0;
32273227
} /* }}} */
32283228

3229-
static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var) /* {{{ */
3229+
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
32303230
{
3231-
mbfl_string string, result, *ret;
32323231
HashTable *ht;
32333232
zval *entry, *orig_var;
32343233

32353234
orig_var = var;
32363235
ZVAL_DEREF(var);
3236+
32373237
if (Z_TYPE_P(var) == IS_STRING) {
3238-
string.val = (unsigned char *)Z_STRVAL_P(var);
3239-
string.len = Z_STRLEN_P(var);
3240-
ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
3241-
if (ret != NULL) {
3242-
zval_ptr_dtor(orig_var);
3243-
// TODO: avoid reallocation ???
3244-
ZVAL_STRINGL(orig_var, (char *)ret->val, ret->len);
3245-
efree(ret->val);
3246-
}
3238+
zend_string *ret = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
3239+
zval_ptr_dtor(orig_var);
3240+
ZVAL_STR(orig_var, ret);
32473241
} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
32483242
if (Z_TYPE_P(var) == IS_ARRAY) {
32493243
SEPARATE_ARRAY(var);
32503244
}
32513245
if (Z_REFCOUNTED_P(var)) {
32523246
if (Z_IS_RECURSIVE_P(var)) {
3253-
return 1;
3247+
return true;
32543248
}
32553249
Z_PROTECT_RECURSION_P(var);
32563250
}
32573251

32583252
ht = HASH_OF(var);
32593253
if (ht != NULL) {
32603254
ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
3261-
if (mb_recursive_convert_variable(convd, entry)) {
3255+
if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
32623256
if (Z_REFCOUNTED_P(var)) {
32633257
Z_UNPROTECT_RECURSION_P(var);
32643258
}
3265-
return 1;
3259+
return true;
32663260
}
32673261
} ZEND_HASH_FOREACH_END();
32683262
}
@@ -3271,8 +3265,9 @@ static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var
32713265
Z_UNPROTECT_RECURSION_P(var);
32723266
}
32733267
}
3274-
return 0;
3275-
} /* }}} */
3268+
3269+
return false;
3270+
}
32763271

32773272
/* {{{ Converts the string resource in variables to desired encoding */
32783273
PHP_FUNCTION(mb_convert_variables)
@@ -3281,14 +3276,12 @@ PHP_FUNCTION(mb_convert_variables)
32813276
zend_string *to_enc_str;
32823277
zend_string *from_enc_str;
32833278
HashTable *from_enc_ht;
3284-
mbfl_string string, result;
32853279
const mbfl_encoding *from_encoding, *to_encoding;
32863280
mbfl_encoding_detector *identd;
3287-
mbfl_buffer_converter *convd;
32883281
int n, argc;
32893282
size_t elistsz;
32903283
const mbfl_encoding **elist;
3291-
int recursion_error = 0;
3284+
bool recursion_error = false;
32923285

32933286
ZEND_PARSE_PARAMETERS_START(3, -1)
32943287
Z_PARAM_STR(to_enc_str)
@@ -3302,10 +3295,7 @@ PHP_FUNCTION(mb_convert_variables)
33023295
RETURN_THROWS();
33033296
}
33043297

3305-
/* initialize string */
33063298
from_encoding = MBSTRG(current_internal_encoding);
3307-
mbfl_string_init_set(&string, from_encoding);
3308-
mbfl_string_init(&result);
33093299

33103300
/* pre-conversion encoding */
33113301
if (from_enc_ht) {
@@ -3356,29 +3346,18 @@ PHP_FUNCTION(mb_convert_variables)
33563346

33573347
efree(ZEND_VOIDP(elist));
33583348

3359-
convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0);
3360-
/* If this assertion fails this means some memory allocation failure which is a bug */
3361-
ZEND_ASSERT(convd != NULL);
3362-
3363-
mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
3364-
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
3365-
33663349
/* convert */
33673350
n = 0;
33683351
while (n < argc) {
33693352
zval *zv = &args[n];
3370-
33713353
ZVAL_DEREF(zv);
3372-
recursion_error = mb_recursive_convert_variable(convd, zv);
3354+
recursion_error = mb_recursive_convert_variable(zv, from_encoding, to_encoding);
33733355
if (recursion_error) {
33743356
break;
33753357
}
33763358
n++;
33773359
}
33783360

3379-
MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
3380-
mbfl_buffer_converter_delete(convd);
3381-
33823361
if (recursion_error) {
33833362
php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
33843363
RETURN_FALSE;

ext/mbstring/tests/mb_convert_variables.phpt

+64 -27
Original file line number | Diff line number | Diff line change
@@ -17,19 +17,19 @@ $sjis = base64_decode('k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==');
1717
// JIS string (BASE64 encoded)
1818
$jis = base64_decode('GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==');
1919
// EUC-JP string
20-
$euc_jp = '日本語テキストです。0123456789。';
20+
$euc_jp = mb_convert_encoding("日本語テキストです。0123456789。", 'EUC-JP', 'UTF-8');
2121

2222
// Test for single scalar
2323
echo "== SCALAR TEST ==\n";
2424
$s = $sjis;
2525
$encoding = mb_convert_variables('EUC-JP', 'SJIS', $s);
2626
print("$encoding\n"); // SJIS
27-
print("$s\n"); // Converted to EUC-JP
27+
echo bin2hex($s), "\n"; // Converted to EUC-JP
2828

2929
$s = $jis;
3030
$encoding = mb_convert_variables('EUC-JP', 'JIS', $s);
3131
print("$encoding\n"); // JIS
32-
print("$s\n"); // Converted to EUC-JP
32+
echo bin2hex($s), "\n"; // Converted to EUC-JP
3333

3434
$s = $euc_jp;
3535
$encoding = mb_convert_variables('SJIS', 'EUC-JP', $s);
@@ -47,9 +47,7 @@ $s2 = $euc_jp;
4747
$s3 = $euc_jp;
4848
$encoding = mb_convert_variables('EUC-JP', 'auto', $s1, $s2, $s3);
4949
print("$encoding\n"); // EUC-JP
50-
print("$s1$s2$s3\n"); // Converted to EUC-JP
51-
52-
50+
echo bin2hex("$s1$s2$s3"), "\n"; // Converted to EUC-JP
5351

5452
// Note: Mixing encoding in array/object is not supported?
5553
// Test for array
@@ -58,15 +56,13 @@ $a = array($s3, $s2, $s1);
5856
$aa = $a;
5957
$encoding = mb_convert_variables('EUC-JP', 'auto', $aa);
6058
print("$encoding\n"); // EUC-JP
61-
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP
59+
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP
6260

6361
$a = array($s1, $s2, $s3);
6462
$aa = $a;
6563
$encoding = mb_convert_variables('EUC-JP', 'auto', $aa);
6664
print("$encoding\n"); // EUC-JP
67-
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP
68-
69-
65+
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP
7066

7167
// Test for object
7268
echo "== OBJECT TEST ==\n";
@@ -102,19 +98,17 @@ class bar
10298
}
10399
}
104100

105-
106101
$o = new foo;
107102
$oo = $o;
108103
$encoding = mb_convert_variables('EUC-JP', 'auto', $oo);
109104
print("$encoding\n"); // EUC-JP
110-
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP
105+
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP
111106

112107
$o = new bar;
113108
$oo = $o;
114109
$encoding = mb_convert_variables('EUC-JP', 'auto', $oo);
115110
print("$encoding\n"); // EUC-JP
116-
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP
117-
111+
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP
118112

119113
// Test for scalar, array and object
120114
echo "== SCALAR, ARRAY AND OBJECT TEST ==\n";
@@ -127,36 +121,79 @@ $oo = $o;
127121

128122
$encoding = mb_convert_variables('EUC-JP', 'auto', $s1, $s2, $s3, $aa, $oo);
129123
print("$encoding\n"); // EUC-JP
130-
print("$s1$s2$s3\n"); // Converted to EUC-JP
131-
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP
132-
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP
124+
echo bin2hex("$s1$s2$s3"), "\n"; // Converted to EUC-JP
125+
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP
126+
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP
127+
128+
echo "== DEEPLY NESTED OBJECT/ARRAY TEST ==\n";
129+
130+
class Nested
131+
{
132+
public $inner;
133+
134+
function __construct($value)
135+
{
136+
$this->inner = $value;
137+
}
138+
}
139+
140+
$deeplyNested = array(new Nested(array(new Nested(array(new Nested("BLAH"))))));
141+
142+
$encoding = mb_convert_variables('UTF-16LE', 'UTF-8', $deeplyNested);
143+
echo $encoding, "\n";
144+
echo bin2hex($deeplyNested[0]->inner[0]->inner[0]->inner), "\n";
145+
146+
echo "== INVALID STRING ENCODING TEST ==\n";
147+
// Make sure both that the correct invalid encoding marker is used,
148+
// and that the count of illegal characters is incremented
149+
150+
$illegalCount = mb_get_info('illegal_chars');
151+
$nested = array(new Nested("\xFF"));
152+
mb_substitute_character(0x25);
153+
mb_convert_variables('UTF-16LE', 'UTF-8', $nested);
154+
echo bin2hex($nested[0]->inner), "\n";
155+
echo "# of illegal characters detected: ", mb_get_info('illegal_chars') - $illegalCount, "\n";
133156

157+
$illegalCount = mb_get_info('illegal_chars');
158+
$nested = array(new Nested("\xFF"));
159+
mb_substitute_character(0x26);
160+
mb_convert_variables('UTF-16LE', 'UTF-8', $nested);
161+
echo bin2hex($nested[0]->inner), "\n";
162+
echo "# of illegal characters detected: ", mb_get_info('illegal_chars') - $illegalCount, "\n";
134163

135164
?>
136165
--EXPECT--
137166
== SCALAR TEST ==
138167
SJIS
139-
日本語テキストです。0123456789。
168+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
140169
JIS
141-
日本語テキストです。0123456789。
170+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
142171
EUC-JP
143172
k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==
144173
EUC-JP
145174
GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
146175
EUC-JP
147-
日本語テキストです。0123456789。日本語テキストです。0123456789。日本語テキストです。0123456789。
176+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
148177
== ARRAY TEST ==
149178
EUC-JP
150-
日本語テキストです。0123456789。日本語テキストです。0123456789。日本語テキストです。0123456789。
179+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
151180
EUC-JP
152-
日本語テキストです。0123456789。日本語テキストです。0123456789。日本語テキストです。0123456789。
181+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
153182
== OBJECT TEST ==
154183
EUC-JP
155-
日本語テキストです。0123456789。日本語テキストです。0123456789。日本語テキストです。0123456789。
184+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
156185
EUC-JP
157-
日本語テキストです。0123456789。日本語テキストです。0123456789。日本語テキストです。0123456789。
186+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
158187
== SCALAR, ARRAY AND OBJECT TEST ==
159188
EUC-JP
160-
日本語テキストです。0123456789。日本語テキストです。0123456789。日本語テキストです。0123456789。
161-
日本語テキストです。0123456789。日本語テキストです。0123456789。日本語テキストです。0123456789。
162-
日本語テキストです。0123456789。日本語テキストです。0123456789。日本語テキストです。0123456789。
189+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
190+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
191+
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
192+
== DEEPLY NESTED OBJECT/ARRAY TEST ==
193+
UTF-8
194+
42004c0041004800
195+
== INVALID STRING ENCODING TEST ==
196+
2500
197+
# of illegal characters detected: 1
198+
2600
199+
# of illegal characters detected: 1

0 commit comments

Comments
 (0)