Skip to content

Commit 7b23470

Browse files
authored
ext/pcre: Add "/r" modifier (#13583)
Adds support for the "caseless restricted" matching introduced in PCRE2 10.43, exposed through the "r" modifier. This corresponds to `PCRE2_EXTRA_CASELESS_RESTRICT` in PCRE2. It is an "extra" option, which means it cannot be passed as a pcre2_compile() function parameter; instead, it is passed in a pcre2_set_compile_extra_options() call. Previously, these extra options were set in php_pcre_init_pcre2(), but after this change, it is possible to customize the options by adding bits to `eoptions` in pcre_get_compiled_regex_cache_ex(). The tests for this change are ported from the upstream test suite[^1]. [^1]: PCRE2Project/pcre2@c13d54f6581#diff-8c8312e4eb2d35bb16485404b7b5cc0eaef0bca1aa95ff5febf6a1890048305c
1 parent 353d4ce commit 7b23470

File tree

4 files changed

+112
-0
lines changed

4 files changed

+112
-0
lines changed

UPGRADING

+4
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,10 @@ PHP 8.4 UPGRADE NOTES
210210
As a consequence, LoongArch JIT support has been added, spaces
211211
are now allowed between braces in Perl-compatible items, and
212212
variable-length lookbehind assertions are now supported.
213+
. Added support for the "r" (PCRE2_EXTRA_CASELESS_RESTRICT) modifier, as well
214+
as the (?r) mode modifier. When enabled along with the case-insensitive
215+
modifier ("i"), ASCII and non-ASCII characters are no longer treated as
216+
case-equivalent (for example, "k" no longer matches U+212A KELVIN SIGN).
213217

214218
- PDO:
215219
. Added support for driver-specific subclasses.

UPGRADING.INTERNALS

+3
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,9 @@ PHP 8.4 INTERNALS UPGRADE NOTES
185185
When flags should be ignored, pass 0 to the flags argument.
186186
- php_pcre_match_impl() and pcre_get_compiled_regex_cache_ex() now use
187187
proper boolean argument types instead of integer types.
188+
- pcre_get_compiled_regex_cache_ex() now provides an option to collect extra
189+
options (from modifiers used in the expression, for example), and calls
190+
pcre2_set_compile_extra_options() with those options.
188191

189192
========================
190193
4. OpCode changes

ext/pcre/php_pcre.c

+4
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo
592592
#else
593593
uint32_t coptions = 0;
594594
#endif
595+
uint32_t eoptions = PHP_PCRE_DEFAULT_EXTRA_COPTIONS;
595596
PCRE2_UCHAR error[128];
596597
PCRE2_SIZE erroffset;
597598
int errnumber;
@@ -722,6 +723,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo
722723
/* PCRE specific options */
723724
case 'A': coptions |= PCRE2_ANCHORED; break;
724725
case 'D': coptions |= PCRE2_DOLLAR_ENDONLY;break;
726+
case 'r': eoptions |= PCRE2_EXTRA_CASELESS_RESTRICT; break;
725727
case 'S': /* Pass. */ break;
726728
case 'X': /* Pass. */ break;
727729
case 'U': coptions |= PCRE2_UNGREEDY; break;
@@ -776,6 +778,8 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo
776778
}
777779
pcre2_set_character_tables(cctx, tables);
778780

781+
pcre2_set_compile_extra_options(cctx, eoptions);
782+
779783
/* Compile pattern and display a warning if compilation failed. */
780784
re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx);
781785

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
--TEST--
2+
testing /r modifier in preg_* functions
3+
--FILE--
4+
<?php
5+
echo "SK substitute matching" . PHP_EOL;
6+
var_dump(preg_match('/AskZ/iur', 'AskZ')); // match
7+
var_dump(preg_match('/AskZ/iur', 'aSKz')); // match
8+
var_dump(preg_match('/AskZ/iur', "A\u{17f}kZ")); // no match
9+
var_dump(preg_match('/AskZ/iur', "As\u{212a}Z")); // no match
10+
var_dump(preg_match('/AskZ/iu', 'AskZ')); // match
11+
var_dump(preg_match('/AskZ/iu', 'aSKz')); // match
12+
var_dump(preg_match('/AskZ/iu', "A\u{17f}kZ")); // match
13+
var_dump(preg_match('/AskZ/iu', "As\u{212a}Z")); // match
14+
15+
echo "K substitute matching" . PHP_EOL;
16+
var_dump(preg_match('/k/iu', "\u{212A}"));
17+
var_dump(preg_match('/k/iur', "\u{212A}"));
18+
19+
echo "non-ASCII in expressions" . PHP_EOL;
20+
var_dump(preg_match('/A\x{17f}\x{212a}Z/iu', 'AskZ')); // match
21+
var_dump(preg_match('/A\x{17f}\x{212a}Z/iur', 'AskZ')); // no match
22+
23+
echo "Character sets" . PHP_EOL;
24+
var_dump(preg_match('/[AskZ]+/iur', 'AskZ')); // match
25+
var_dump(preg_match('/[AskZ]+/iur', 'aSKz')); // match
26+
var_dump(preg_match('/[AskZ]+/iur', "A\u{17f}kZ")); // match
27+
var_dump(preg_match('/[AskZ]+/iur', "As\u{212a}Z")); // match
28+
var_dump(preg_match('/[AskZ]+/iu', 'AskZ')); // match
29+
var_dump(preg_match('/[AskZ]+/iu', 'aSKz')); // match
30+
var_dump(preg_match('/[AskZ]+/iu', "A\u{17f}kZ")); // match
31+
var_dump(preg_match('/[AskZ]+/iu', "As\u{212a}Z")); // match
32+
33+
echo "non-ASCII in character sets" . PHP_EOL;
34+
var_dump(preg_match('/[\x{17f}\x{212a}]+/iur', 'AskZ')); // no match
35+
var_dump(preg_match('/[\x{17f}\x{212a}]+/iu', 'AskZ')); // match
36+
37+
echo "Meta characters and negate character sets". PHP_EOL;
38+
var_dump(preg_match('/[^s]+/iur', "A\u{17f}Z")); // match
39+
var_dump(preg_match('/[^s]+/iu', "A\u{17f}Z")); // match
40+
var_dump(preg_match('/[^s]+/iu', "A\u{17f}Z")); // match
41+
var_dump(preg_match('/[^k]+/iur', "A\u{212a}Z")); // match
42+
var_dump(preg_match('/[^k]+/iu', "A\u{212a}Z")); // match
43+
var_dump(preg_match('/[^sk]+/iur', "A\u{17f}\u{212a}Z")); // match
44+
var_dump(preg_match('/[^sk]+/iu', "A\u{17f}\u{212a}Z")); // match
45+
var_dump(preg_match('/[^\x{17f}]+/iur', "AsSZ")); // match
46+
var_dump(preg_match('/[^\x{17f}]+/iu', "AsSZ")); // match
47+
48+
echo "Modifier used within the expression" . PHP_EOL;
49+
var_dump(preg_match('/s(?r)s(?-r)s(?r:s)s/iu', "\u{17f}S\u{17f}S\u{17f}")); // match
50+
var_dump(preg_match('/s(?r)s(?-r)s(?r:s)s/iu', "\u{17f}\u{17f}\u{17f}S\u{17f}")); // no match
51+
var_dump(preg_match('/s(?r)s(?-r)s(?r:s)s/iu', "\u{17f}S\u{17f}\u{17f}\u{17f}")); // no match
52+
var_dump(preg_match('/k(?^i)k/iur', "K\u{212a}")); // match
53+
var_dump(preg_match('/k(?^i)k/iur', "\u{212a}\u{212a}")); // no match
54+
55+
echo "Done";
56+
?>
57+
--EXPECT--
58+
SK substitute matching
59+
int(1)
60+
int(1)
61+
int(0)
62+
int(0)
63+
int(1)
64+
int(1)
65+
int(1)
66+
int(1)
67+
K substitute matching
68+
int(1)
69+
int(0)
70+
non-ASCII in expressions
71+
int(1)
72+
int(0)
73+
Character sets
74+
int(1)
75+
int(1)
76+
int(1)
77+
int(1)
78+
int(1)
79+
int(1)
80+
int(1)
81+
int(1)
82+
non-ASCII in character sets
83+
int(0)
84+
int(1)
85+
Meta characters and negate character sets
86+
int(1)
87+
int(1)
88+
int(1)
89+
int(1)
90+
int(1)
91+
int(1)
92+
int(1)
93+
int(1)
94+
int(1)
95+
Modifier used within the expression
96+
int(1)
97+
int(0)
98+
int(0)
99+
int(1)
100+
int(0)
101+
Done

0 commit comments

Comments
 (0)