Skip to content

Commit 6f0251e

Browse files
authored
fix(parse/js): lex surrogate codepoints in string literal escapes (#10112)
1 parent c548d11 commit 6f0251e

5 files changed

Lines changed: 844 additions & 18 deletions

File tree

.changeset/smooth-pugs-brake.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@biomejs/biome": patch
3+
---
4+
5+
Fixed [#10110](https://github.com/biomejs/biome/issues/10110): Biome's parser now accepts surrogate code points in JavaScript string `\u{...}` escapes.

crates/biome_js_parser/src/lexer/mod.rs

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -557,8 +557,13 @@ impl<'src> JsLexer<'src> {
557557
}
558558
}
559559

560-
// Read a `\u{000...}` escape sequence, this expects the cur char to be the `{`
561-
fn read_codepoint_escape_char(&mut self) -> Result<char, ()> {
560+
/// Read a `\u{000...}` escape sequence and return the code point value.
561+
/// This expects the current char to be the `{`.
562+
///
563+
/// This is intended for use in string literal escapes.
564+
///
565+
/// This intentionally doesn't return `char`: JS strings allow surrogate code points in Unicode escapes, which are not valid Unicode scalar values, so `char::from_u32` would return `None` for them.
566+
fn read_codepoint_escape(&mut self) -> Result<u32, ()> {
562567
let start = self.position + 1;
563568
self.read_hexnumber();
564569

@@ -600,20 +605,7 @@ impl<'src> JsLexer<'src> {
600605
};
601606

602607
match u32::from_str_radix(digits_str, 16) {
603-
Ok(digits) if digits <= 0x10_FFFF => {
604-
let res = std::char::from_u32(digits);
605-
if let Some(chr) = res {
606-
Ok(chr)
607-
} else {
608-
let err = ParseDiagnostic::new(
609-
"invalid codepoint for unicode escape",
610-
start..self.position,
611-
);
612-
self.push_diagnostic(err);
613-
Err(())
614-
}
615-
}
616-
608+
Ok(digits) if digits <= 0x10_FFFF => Ok(digits),
617609
_ => {
618610
let err = ParseDiagnostic::new(
619611
"out of bounds codepoint for unicode codepoint escape sequence",
@@ -626,6 +618,25 @@ impl<'src> JsLexer<'src> {
626618
}
627619
}
628620

621+
/// Read a `\u{000...}` escape sequence and convert it to a valid Unicode scalar value.
622+
/// This expects the current char to be the `{`.
623+
///
624+
/// This is intended for use in identifier escapes, so surrogate code points are rejected with an "invalid codepoint" diagnostic instead of being accepted or paired up, since they are not valid characters in JS identifiers.
625+
fn read_codepoint_escape_char(&mut self) -> Result<char, ()> {
626+
debug_assert!(self.current_byte() == Some(b'{'));
627+
let start = self.position + 1;
628+
629+
self.read_codepoint_escape().and_then(|codepoint| {
630+
std::char::from_u32(codepoint).ok_or_else(|| {
631+
let err = ParseDiagnostic::new(
632+
"invalid codepoint for unicode escape",
633+
start..self.position,
634+
);
635+
self.push_diagnostic(err);
636+
})
637+
})
638+
}
639+
629640
/// Reads a `\u0000` escape sequence.
630641
///
631642
/// This expects the current char to be the `u`. Afterwards, the current
@@ -745,8 +756,9 @@ impl<'src> JsLexer<'src> {
745756
true
746757
}
747758
b'u' if self.peek_byte() == Some(b'{') => {
748-
self.advance(1); // eats '{'
749-
self.read_codepoint_escape_char().is_ok()
759+
self.advance(1); // eats 'u'
760+
761+
self.read_codepoint_escape().is_ok()
750762
}
751763
b'u' => self.read_unicode_escape().is_ok(),
752764
b'x' => self.validate_hex_escape(),

crates/biome_js_parser/src/lexer/tests.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
use super::{JsLexContext, JsLexer, JsReLexContext, TextRange, TextSize};
55
use crate::span::Span;
6+
use crate::{JsFileSource, JsParserOptions, parse};
67
use biome_js_syntax::JsSyntaxKind::{self, EOF, ERROR_TOKEN};
78
use biome_js_syntax::JsSyntaxKind::{JS_NUMBER_LITERAL, NEWLINE, WHITESPACE};
89
use biome_js_syntax::T;
@@ -337,6 +338,48 @@ fn string_unicode_escape_surrogates() {
337338
}
338339
}
339340

341+
#[test]
342+
fn string_unicode_codepoint_escape_surrogates() {
343+
assert_lex! {
344+
r#""\u{daff}\u{dfff}""#,
345+
JS_STRING_LITERAL:18
346+
}
347+
348+
assert_lex! {
349+
r#""\u{D83D}\u{DE0A}""#,
350+
JS_STRING_LITERAL:18
351+
}
352+
}
353+
354+
#[test]
355+
fn parser_accepts_unicode_codepoint_escape_surrogates_in_strings() {
356+
let parsed = parse(
357+
"const a = \"\\u{daff}\\u{dfff}\";\nconst b = \"\\u{D83D}\\u{DE0A}\";\n",
358+
JsFileSource::js_module(),
359+
JsParserOptions::default(),
360+
);
361+
362+
assert!(
363+
!parsed.has_errors(),
364+
"expected no parse errors, found: {:#?}",
365+
parsed.diagnostics()
366+
);
367+
}
368+
369+
#[test]
370+
fn identifier_unicode_codepoint_escape_surrogates() {
371+
let parsed = parse(
372+
"const \\u{D83D} = 1;\nconst \\u{DC00} = 2;\n",
373+
JsFileSource::js_module(),
374+
JsParserOptions::default(),
375+
);
376+
377+
assert!(
378+
parsed.has_errors(),
379+
"expected parse errors for surrogate escapes in identifiers, found none"
380+
);
381+
}
382+
340383
#[test]
341384
fn string_unicode_escape_valid_resolving_to_endquote() {
342385
assert_lex! {
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
const highMin = "\u{d800}";
2+
const highMid = "\u{d8ff}";
3+
const highMax = "\u{DBFF}";
4+
5+
const lowMin = "\u{dc00}";
6+
const lowMid = "\u{DdEf}";
7+
const lowMax = "\u{DFFF}";
8+
9+
const adjacentHighLow = "\u{d83d}\u{de0a}";
10+
const adjacentLowHigh = "\u{de0a}\u{d83d}";
11+
const repeatedHigh = "\u{d800}\u{d801}\u{d802}";
12+
const repeatedLow = "\u{dc00}\u{dc01}\u{dc02}";
13+
14+
const wrapped = "start:\u{dabc}:end";
15+
const mixedCase = "\u{DaFf}\u{dFfF}";
16+
const interleaved = "A\u{d912}B\u{dd34}C";
17+
const escapedQuote = "\u{d834}\"\u{dd1e}";
18+
19+
const templateA = `\u{d800}`;
20+
const templateB = `left \u{dbff} right`;
21+
const templateC = `\u{dc00}\u{DFFF}`;
22+
const templateD = `\u{d83d} smile? \u{de42}`;

0 commit comments

Comments
 (0)