Skip to content

Commit a903586

Browse files
committed
Implement conditional casing for Greek letter sigma when title-casing text
1 parent 290efe8 commit a903586

File tree

3 files changed

+68
-2
lines changed

3 files changed

+68
-2
lines changed

ext/mbstring/php_unicode.c

+23-1
Original file line numberDiff line numberDiff line change
@@ -427,12 +427,34 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
427427
*p++ = w;
428428
continue;
429429
}
430-
uint32_t w2 = title_mode ? php_unicode_tolower_raw(w, src_encoding) : php_unicode_totitle_raw(w, src_encoding);
430+
uint32_t w2;
431+
if (title_mode) {
432+
if (w == 0x3A3) {
433+
int j = i - 1;
434+
while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
435+
j--;
436+
}
437+
if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
438+
j = i + 1;
439+
while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
440+
j++;
441+
}
442+
if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
443+
*p++ = 0x3C2;
444+
goto set_title_mode;
445+
}
446+
}
447+
}
448+
w2 = php_unicode_tolower_raw(w, src_encoding);
449+
} else {
450+
w2 = php_unicode_totitle_raw(w, src_encoding);
451+
}
431452
if (UNEXPECTED(w2 > 0xFFFFFF)) {
432453
p = emit_special_casing_sequence(w2, p);
433454
} else {
434455
*p++ = w2;
435456
}
457+
set_title_mode:
436458
if (!php_unicode_is_case_ignorable(w)) {
437459
title_mode = php_unicode_is_cased(w);
438460
}

ext/mbstring/tests/casemapping.phpt

+1-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ dd
115115
dd
116116
69
117117
69
118-
Καλησπερα Σασ
118+
Καλησπερα Σας
119119
Καλησπερα Σασ
120120
καλησπερα σας
121121
καλησπερα σασ

ext/mbstring/tests/mb_convert_case_various_mode.phpt

+44
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,33 @@ try {
2121
echo $e->getMessage() . \PHP_EOL;
2222
}
2323

24+
echo "\n-- Greek letter sigma --\n";
25+
var_dump(mb_convert_case("Σ", MB_CASE_TITLE, 'UTF-8'));
26+
var_dump(mb_convert_case("aΣ", MB_CASE_TITLE, 'UTF-8'));
27+
var_dump(mb_convert_case("aΣb", MB_CASE_TITLE, 'UTF-8'));
28+
var_dump(mb_convert_case("aΣ b", MB_CASE_TITLE, 'UTF-8'));
29+
var_dump(mb_convert_case(" ΣΣΣΣ ", MB_CASE_TITLE, 'UTF-8'));
30+
31+
// Apostrophe, full stop, colon, etc. are "case-ignorable"
32+
// When checking whether capital sigma is at the end of a word or not, we skip over
33+
// any number of case-ignorable characters, both when scanning back and when scanning forward
34+
var_dump(mb_convert_case("'Σ", MB_CASE_TITLE, 'UTF-8'));
35+
var_dump(mb_convert_case("ab'Σ", MB_CASE_TITLE, 'UTF-8'));
36+
var_dump(mb_convert_case("Σ'", MB_CASE_TITLE, 'UTF-8'));
37+
var_dump(mb_convert_case("Σ'a", MB_CASE_TITLE, 'UTF-8'));
38+
var_dump(mb_convert_case("a'Σ'a", MB_CASE_TITLE, 'UTF-8'));
39+
40+
// We scan back by at least 63 characters when necessary,
41+
// but there is no guarantee that we will scan back further than that
42+
var_dump(mb_convert_case('a' . str_repeat('.', 63) . "Σ", MB_CASE_TITLE, 'UTF-8'));
43+
var_dump(mb_convert_case('a' . str_repeat('.', 64) . "Σ", MB_CASE_TITLE, 'UTF-8')); // Context-sensitive casing doesn't work here!
44+
45+
// When scanning forward to confirm if capital sigma is at the end of a word or not,
46+
// there is no limit as to how far we will scan
47+
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . ' abc', MB_CASE_TITLE, 'UTF-8'));
48+
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . 'a abc', MB_CASE_TITLE, 'UTF-8'));
49+
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 256) . ' abc', MB_CASE_TITLE, 'UTF-8'));
50+
2451
/* Regression test for new implementation;
2552
* When converting a codepoint, if we overwrite it with the converted version before
2653
* checking whether we should shift in/out of 'title mode', then the conversion will be incorrect */
@@ -38,5 +65,22 @@ string(13) "foo bar spaß"
3865
string(13) "Foo Bar Spaß"
3966
string(13) "foo bar spaß"
4067
mb_convert_case(): Argument #2 ($mode) must be one of the MB_CASE_* constants
68+
69+
-- Greek letter sigma --
70+
string(2) "Σ"
71+
string(3) "Aς"
72+
string(4) "Aσb"
73+
string(5) "Aς B"
74+
string(10) " Σσσς "
75+
string(3) "'Σ"
76+
string(5) "Ab'ς"
77+
string(3) "Σ'"
78+
string(4) "Σ'a"
79+
string(6) "A'σ'a"
80+
string(66) "A...............................................................ς"
81+
string(67) "A................................................................σ"
82+
string(73) "Abcς................................................................ Abc"
83+
string(74) "Abcσ................................................................a Abc"
84+
string(265) "Abcς................................................................................................................................................................................................................................................................ Abc"
4185
string(12) "02bc004e012d"
4286
string(8) "0149012d"

0 commit comments

Comments
 (0)