Skip to content

Commit ba77e4c

Browse files
committed
S_utf16_textfilter() needs to avoid splitting UTF-16 surrogate pairs.
Easier said than done.
1 parent b3766b1 commit ba77e4c

File tree

2 files changed

+38
-2
lines changed

2 files changed

+38
-2
lines changed

t/comp/utf.t

+18-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!./perl -w
22

3-
print "1..100\n";
3+
print "1..3980\n";
44
my $test = 0;
55

66
my %templates = (
@@ -62,13 +62,30 @@ for my $bom (0, 1) {
6262

6363
for my $expect ("N", "\xFF", "\x{100}", "\x{010a}", "\x{0a23}",
6464
"\x{10000}", "\x{64321}", "\x{10FFFD}",
65+
"\x{1000a}", # 0xD800 0xDC0A
66+
"\x{12800}", # 0xD80A 0xDC00
6567
) {
6668
# A space so that the UTF-16 heuristic triggers - " '" gives two
6769
# characters of ASCII.
6870
my $write = " '$expect'";
6971
my $name = 'chrs ' . join ', ', map {ord $_} split '', $expect;
7072
test($enc, $write, $expect, $bom, $nl, $name);
7173
}
74+
75+
# This is designed to try to trip over the end of the buffer,
76+
# with similar results to U-1000A and U-12800 above.
77+
for my $pad (2 .. 162) {
78+
for my $chr ("\x{10000}", "\x{1000a}", "\x{12800}") {
79+
my $padding = ' ' x $pad;
80+
# Need 4 octets that were from 2 ASCII characters to trigger
81+
# the heuristic that detects UTF-16 without a BOM. For
82+
# UTF-16BE, one space and the newline will do, as the
83+
# newline's high octet comes first. But for UTF-16LE, a
84+
# newline is "\n\0", so it doesn't trigger it.
85+
test($enc, " \n$padding'$chr'", $chr, $bom, $nl,
86+
sprintf "'\\x{%x}' with $pad spaces before it", ord $chr);
87+
}
88+
}
7289
}
7390
}
7491
}

toke.c

+20-1
Original file line numberDiff line numberDiff line change
@@ -12822,13 +12822,32 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen)
1282212822
sv_chop(utf8_buffer, nl);
1282312823
break;
1282412824
}
12825+
1282512826
/* OK, not a complete line there, so need to read some more UTF-16.
1282612827
Read an extra octect if the buffer currently has an odd number. */
12828+
while (1) {
12829+
if (status <= 0)
12830+
break;
12831+
if (SvCUR(utf16_buffer) >= 2) {
12832+
/* Location of the high octet of the last complete code point.
12833+
Gosh, UTF-16 is a pain. All the benefits of variable length,
12834+
*coupled* with all the benefits of partial reads and
12835+
endianness. */
12836+
const U8 *const last_hi = (U8*)SvPVX(utf16_buffer)
12837+
+ ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2));
12838+
12839+
if (*last_hi < 0xd8 || *last_hi > 0xdb) {
12840+
break;
12841+
}
12842+
12843+
/* We have the first half of a surrogate. Read more. */
12844+
DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi));
12845+
}
1282712846

12828-
while(SvCUR(utf16_buffer) < 2 && status > 0) {
1282912847
status = FILTER_READ(idx + 1, utf16_buffer,
1283012848
160 + (SvCUR(utf16_buffer) & 1));
1283112849
DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer)));
12850+
DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);});
1283212851
if (status < 0) {
1283312852
/* Error */
1283412853
IoPAGE(filter) = status;

0 commit comments

Comments
 (0)