S_utf16_textfilter() needs to avoid splitting UTF-16 surrogate pairs.

nwc10 · nwc10 · commit ba77e4cc9d1c · 2009-10-22T19:39:30.000+01:00
Easier said than done.
diff --git a/t/comp/utf.t b/t/comp/utf.t
@@ -1,6 +1,6 @@
 #!./perl -w
 
-print "1..100\n";
+print "1..3980\n";
 my $test = 0;
 
 my %templates = (
@@ -62,13 +62,30 @@ for my $bom (0, 1) {
 
 	    for my $expect ("N", "\xFF", "\x{100}", "\x{010a}", "\x{0a23}",
 			    "\x{10000}", "\x{64321}", "\x{10FFFD}",
+			    "\x{1000a}", # 0xD800 0xDC0A
+			    "\x{12800}", # 0xD80A 0xDC00
 			   ) {
 		# A space so that the UTF-16 heuristic triggers - " '" gives two
 		# characters of ASCII.
 		my $write = " '$expect'";
 		my $name = 'chrs ' . join ', ', map {ord $_} split '', $expect;
 		test($enc, $write, $expect, $bom, $nl, $name);
 	    }
+
+	    # This is designed to try to trip over the end of the buffer,
+	    # with similar results to U-1000A and U-12800 above.
+	    for my $pad (2 .. 162) {
+		for my $chr ("\x{10000}", "\x{1000a}", "\x{12800}") {
+		    my $padding = ' ' x $pad;
+		    # Need 4 octets that were from 2 ASCII characters to trigger
+		    # the heuristic that detects UTF-16 without a BOM. For
+		    # UTF-16BE, one space and the newline will do, as the
+		    # newline's high octet comes first. But for UTF-16LE, a
+		    # newline is "\n\0", so it doesn't trigger it.
+		    test($enc, "  \n$padding'$chr'", $chr, $bom, $nl,
+			 sprintf "'\\x{%x}' with $pad spaces before it", ord $chr);
+		}
+	    }
 	}
     }
 }
diff --git a/toke.c b/toke.c
@@ -12822,13 +12822,32 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen)
 	    sv_chop(utf8_buffer, nl);
 	    break;
 	}
+
 	/* OK, not a complete line there, so need to read some more UTF-16.
 	   Read an extra octect if the buffer currently has an odd number. */
+	while (1) {
+	    if (status <= 0)
+		break;
+	    if (SvCUR(utf16_buffer) >= 2) {
+		/* Location of the high octet of the last complete code point.
+		   Gosh, UTF-16 is a pain. All the benefits of variable length,
+		   *coupled* with all the benefits of partial reads and
+		   endianness.  */
+		const U8 *const last_hi = (U8*)SvPVX(utf16_buffer)
+		    + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2));
+
+		if (*last_hi < 0xd8 || *last_hi > 0xdb) {
+		    break;
+		}
+
+		/* We have the first half of a surrogate. Read more.  */
+		DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi));
+	    }
 
-	while(SvCUR(utf16_buffer) < 2 && status > 0) {
 	    status = FILTER_READ(idx + 1, utf16_buffer,
 				 160 + (SvCUR(utf16_buffer) & 1));
 	    DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer)));
+	    DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);});
 	    if (status < 0) {
 		/* Error */
 		IoPAGE(filter) = status;