Skip to content

Commit b3766b1

Browse files
committed
Test requiring files with non-BMP characters (encoded as surrogate pairs).
1 parent 02512a6 commit b3766b1

File tree

1 file changed

+20
-3
lines changed

1 file changed

+20
-3
lines changed

t/comp/utf.t

+20-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!./perl -w
22

3-
print "1..76\n";
3+
print "1..100\n";
44
my $test = 0;
55

66
my %templates = (
@@ -13,7 +13,23 @@ sub bytes_to_utf {
1313
my ($enc, $content, $do_bom) = @_;
1414
my $template = $templates{$enc};
1515
die "Unsupported encoding $enc" unless $template;
16-
return pack "$template*", ($do_bom ? 0xFEFF : ()), unpack "U*", $content;
16+
my @chars = unpack "U*", $content;
17+
if ($enc ne 'utf8') {
18+
# Make surrogate pairs
19+
my @remember_that_utf_16_is_variable_length;
20+
foreach my $ord (@chars) {
21+
if ($ord < 0x10000) {
22+
push @remember_that_utf_16_is_variable_length,
23+
$ord;
24+
} else {
25+
$ord -= 0x10000;
26+
push @remember_that_utf_16_is_variable_length,
27+
(0xD800 | ($ord >> 10)), (0xDC00 | ($ord & 0x3FF));
28+
}
29+
}
30+
@chars = @remember_that_utf_16_is_variable_length;
31+
}
32+
return pack "$template*", ($do_bom ? 0xFEFF : ()), @chars;
1733
}
1834

1935
sub test {
@@ -45,8 +61,9 @@ for my $bom (0, 1) {
4561
# right now, as here we're testing the input filter itself.
4662

4763
for my $expect ("N", "\xFF", "\x{100}", "\x{010a}", "\x{0a23}",
64+
"\x{10000}", "\x{64321}", "\x{10FFFD}",
4865
) {
49-
# A space so that the UTF-16 heuristc triggers - " '" gives two
66+
# A space so that the UTF-16 heuristic triggers - " '" gives two
5067
# characters of ASCII.
5168
my $write = " '$expect'";
5269
my $name = 'chrs ' . join ', ', map {ord $_} split '', $expect;

0 commit comments

Comments
 (0)