Skip to content

Commit 3a9f5c6

Browse files
committed
[RFC] Implement mb_str_pad()
Closes GH-10203.
1 parent d9e2da3 commit 3a9f5c6

File tree

6 files changed

+283
-1
lines changed

6 files changed

+283
-1
lines changed

NEWS

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ PHP NEWS
22
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
33
?? ??? ????, PHP 8.3.0alpha3
44

5+
- MBString:
6+
. Implement mb_str_pad() RFC. (nielsdos)
57

68
22 Jun 2023, PHP 8.3.0alpha2
79

UPGRADING

+4
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,10 @@ PHP 8.3 UPGRADE NOTES
208208
the given $depth and $options.
209209
RFC: https://wiki.php.net/rfc/json_validate
210210

211+
- MBString:
212+
. Added mb_str_pad(), which is the mbstring equivalent of str_pad().
213+
RFC: https://wiki.php.net/rfc/mb_str_pad
214+
211215
- Posix:
212216
. Added posix_sysconf call to get runtime information.
213217
. Added posix_pathconf call to get configuration value from a directory/file.

ext/mbstring/mbstring.c

+126
Original file line numberDiff line numberDiff line change
@@ -5522,6 +5522,132 @@ PHP_FUNCTION(mb_chr)
55225522
}
55235523
/* }}} */
55245524

5525+
PHP_FUNCTION(mb_str_pad)
5526+
{
5527+
zend_string *input, *encoding_str = NULL, *pad = NULL;
5528+
zend_long pad_to_length;
5529+
zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5530+
5531+
ZEND_PARSE_PARAMETERS_START(2, 5)
5532+
Z_PARAM_STR(input)
5533+
Z_PARAM_LONG(pad_to_length)
5534+
Z_PARAM_OPTIONAL
5535+
Z_PARAM_STR(pad)
5536+
Z_PARAM_LONG(pad_type_val)
5537+
Z_PARAM_STR_OR_NULL(encoding_str)
5538+
ZEND_PARSE_PARAMETERS_END();
5539+
5540+
const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5541+
if (!encoding) {
5542+
RETURN_THROWS();
5543+
}
5544+
5545+
size_t input_length = mb_get_strlen(input, encoding);
5546+
5547+
/* If resulting string turns out to be shorter than input string,
5548+
we simply copy the input and return. */
5549+
if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5550+
RETURN_STR_COPY(input);
5551+
}
5552+
5553+
if (ZSTR_LEN(pad) == 0) {
5554+
zend_argument_value_error(3, "must be a non-empty string");
5555+
RETURN_THROWS();
5556+
}
5557+
5558+
if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5559+
zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5560+
RETURN_THROWS();
5561+
}
5562+
5563+
size_t pad_length = mb_get_strlen(pad, encoding);
5564+
5565+
size_t num_mb_pad_chars = pad_to_length - input_length;
5566+
5567+
/* We need to figure out the left/right padding lengths. */
5568+
size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5569+
switch (pad_type_val) {
5570+
case PHP_STR_PAD_RIGHT:
5571+
right_pad = num_mb_pad_chars;
5572+
break;
5573+
5574+
case PHP_STR_PAD_LEFT:
5575+
left_pad = num_mb_pad_chars;
5576+
break;
5577+
5578+
case PHP_STR_PAD_BOTH:
5579+
left_pad = num_mb_pad_chars / 2;
5580+
right_pad = num_mb_pad_chars - left_pad;
5581+
break;
5582+
}
5583+
5584+
/* How many full block copies need to happen, and how many characters are then left over? */
5585+
size_t full_left_pad_copies = left_pad / pad_length;
5586+
size_t full_right_pad_copies = right_pad / pad_length;
5587+
size_t remaining_left_pad_chars = left_pad % pad_length;
5588+
size_t remaining_right_pad_chars = right_pad % pad_length;
5589+
5590+
if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5591+
goto overflow_no_release;
5592+
}
5593+
5594+
/* Compute the number of bytes required for the padding */
5595+
size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5596+
size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5597+
5598+
/* No special fast-path handling necessary for zero-length pads because these functions will not
5599+
* allocate memory in case a zero-length pad is required. */
5600+
zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5601+
zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5602+
5603+
if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5604+
|| full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5605+
goto overflow;
5606+
}
5607+
5608+
size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5609+
size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5610+
5611+
if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5612+
|| ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5613+
goto overflow;
5614+
}
5615+
5616+
zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5617+
char *buffer = ZSTR_VAL(result);
5618+
5619+
/* First we pad the left. */
5620+
for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5621+
memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5622+
}
5623+
memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5624+
buffer += ZSTR_LEN(remaining_left_pad_str);
5625+
5626+
/* Then we copy the input string. */
5627+
memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5628+
buffer += ZSTR_LEN(input);
5629+
5630+
/* Finally, we pad on the right. */
5631+
for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5632+
memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5633+
}
5634+
memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5635+
5636+
ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5637+
5638+
zend_string_release_ex(remaining_left_pad_str, false);
5639+
zend_string_release_ex(remaining_right_pad_str, false);
5640+
5641+
RETURN_NEW_STR(result);
5642+
5643+
overflow:
5644+
zend_string_release_ex(remaining_left_pad_str, false);
5645+
zend_string_release_ex(remaining_right_pad_str, false);
5646+
overflow_no_release:
5647+
zend_throw_error(NULL, "String size overflow");
5648+
RETURN_THROWS();
5649+
}
5650+
55255651
/* {{{ */
55265652
PHP_FUNCTION(mb_scrub)
55275653
{

ext/mbstring/mbstring.stub.php

+2
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ function mb_ord(string $string, ?string $encoding = null): int|false {}
183183

184184
function mb_chr(int $codepoint, ?string $encoding = null): string|false {}
185185

186+
/**
 * Multibyte counterpart of str_pad(): pads $string with $pad_string until it is
 * $length characters long, counting characters in $encoding instead of bytes.
 * $string is returned unchanged when $length is negative or not greater than
 * its current character count. A ValueError is thrown when $pad_string is
 * empty, when $pad_type is not STR_PAD_LEFT, STR_PAD_RIGHT or STR_PAD_BOTH, or
 * when $encoding is not a valid encoding.
 */
function mb_str_pad(string $string, int $length, string $pad_string = " ", int $pad_type = STR_PAD_RIGHT, ?string $encoding = null): string {}
187+
186188
#ifdef HAVE_MBREGEX
187189
/** @refcount 1 */
188190
function mb_regex_encoding(?string $encoding = null): string|bool {}

ext/mbstring/mbstring_arginfo.h

+11-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ext/mbstring/tests/mb_str_pad.phpt

+138
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
--TEST--
2+
mb_str_pad()
3+
--EXTENSIONS--
4+
mbstring
5+
--FILE--
6+
<?php

// Pad types in the order most sections below exercise them.
$pad_types = [STR_PAD_RIGHT, STR_PAD_LEFT, STR_PAD_BOTH];

echo "--- Error conditions ---\n";
// Each entry is a full argument list that is expected to raise a ValueError.
$invalid_calls = [
    ['▶▶', 6, '', STR_PAD_RIGHT],
    ['▶▶', 6, '', STR_PAD_LEFT],
    ['▶▶', 6, '', STR_PAD_BOTH],
    ['▶▶', 6, ' ', 123456],
    ['▶▶', 6, ' ', STR_PAD_BOTH, 'unexisting'],
];
foreach ($invalid_calls as $arguments) {
    try {
        var_dump(mb_str_pad(...$arguments));
    } catch (ValueError $error) {
        var_dump($error->getMessage());
    }
}

echo "--- Simple ASCII strings ---\n";
// This section goes BOTH, LEFT, RIGHT, with both words for each pad type.
foreach ([STR_PAD_BOTH, STR_PAD_LEFT, STR_PAD_RIGHT] as $pad_type) {
    foreach (['Hello' => 7, 'World' => 10] as $word => $target_length) {
        var_dump(mb_str_pad($word, $target_length, '+-', $pad_type));
    }
}

// Target lengths equal to, shorter than, zero and negative relative to the input.
$boundary_lengths = [2, 1, 0, -1];

echo "--- Edge cases pad length ---\n";
foreach ($boundary_lengths as $target_length) {
    var_dump(mb_str_pad('▶▶', $target_length, ' ', STR_PAD_BOTH));
}

echo "--- Empty input string ---\n";
foreach ($boundary_lengths as $target_length) {
    var_dump(mb_str_pad('', $target_length, ' ', STR_PAD_BOTH));
}

echo "--- No default argument ---\n";
// The pad string is skipped via a named argument so its default is used.
foreach ($pad_types as $pad_type) {
    var_dump(mb_str_pad('▶▶', 6, pad_type: $pad_type));
}

echo "--- UTF-8 emojis ---\n";
foreach (range(6, 1) as $target_length) {
    foreach ($pad_types as $pad_type) {
        var_dump(mb_str_pad('▶▶', $target_length, '❤❓❇', $pad_type));
    }
}

echo "--- UTF-8, 32, 7 test ---\n";

// Taken from mb_substr.phpt
$sample_utf8 = "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь";
$subjects = ['UTF-8' => $sample_utf8];
foreach (['UTF-32', 'UTF-7'] as $target_encoding) {
    $subjects[$target_encoding] = mb_convert_encoding($sample_utf8, $target_encoding, 'UTF-8');
}

foreach ($subjects as $encoding => $subject) {
    $pad_str = mb_convert_encoding('▶▶', $encoding, 'UTF-8');
    foreach ($pad_types as $pad_type) {
        // Convert back to UTF-8 so the dumped output is comparable across encodings.
        $padded = mb_str_pad($subject, 44, $pad_str, $pad_type, $encoding);
        var_dump(mb_convert_encoding($padded, 'UTF-8', $encoding));
    }
}
?>
82+
--EXPECT--
83+
--- Error conditions ---
84+
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
85+
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
86+
string(66) "mb_str_pad(): Argument #3 ($pad_string) must be a non-empty string"
87+
string(90) "mb_str_pad(): Argument #4 ($pad_type) must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH"
88+
string(82) "mb_str_pad(): Argument #5 ($encoding) must be a valid encoding, "unexisting" given"
89+
--- Simple ASCII strings ---
90+
string(7) "+Hello+"
91+
string(10) "+-World+-+"
92+
string(7) "+-Hello"
93+
string(10) "+-+-+World"
94+
string(7) "Hello+-"
95+
string(10) "World+-+-+"
96+
--- Edge cases pad length ---
97+
string(6) "▶▶"
98+
string(6) "▶▶"
99+
string(6) "▶▶"
100+
string(6) "▶▶"
101+
--- Empty input string ---
102+
string(2) "  "
103+
string(1) " "
104+
string(0) ""
105+
string(0) ""
106+
--- No default argument ---
107+
string(10) "▶▶    "
108+
string(10) "    ▶▶"
109+
string(10) "  ▶▶  "
110+
--- UTF-8 emojis ---
111+
string(18) "▶▶❤❓❇❤"
112+
string(18) "❤❓❇❤▶▶"
113+
string(18) "❤❓▶▶❤❓"
114+
string(15) "▶▶❤❓❇"
115+
string(15) "❤❓❇▶▶"
116+
string(15) "❤▶▶❤❓"
117+
string(12) "▶▶❤❓"
118+
string(12) "❤❓▶▶"
119+
string(12) "❤▶▶❤"
120+
string(9) "▶▶❤"
121+
string(9) "❤▶▶"
122+
string(9) "▶▶❤"
123+
string(6) "▶▶"
124+
string(6) "▶▶"
125+
string(6) "▶▶"
126+
string(6) "▶▶"
127+
string(6) "▶▶"
128+
string(6) "▶▶"
129+
--- UTF-8, 32, 7 test ---
130+
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
131+
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
132+
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"
133+
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
134+
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
135+
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"
136+
string(92) "Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶▶"
137+
string(92) "▶▶▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь"
138+
string(92) "▶Σὲ γνωρίζω ἀπὸ τὴν κόψη Зарегистрируйтесь▶▶"

0 commit comments

Comments
 (0)