fix(parse/js): lex surrogate codepoints in string literal escapes (#10112)

dyc3 · web-flow · commit 6f0251ea12cd · 2026-04-25T13:55:53.000-04:00
diff --git a/.changeset/smooth-pugs-brake.md b/.changeset/smooth-pugs-brake.md
@@ -0,0 +1,5 @@
+---
+"@biomejs/biome": patch
+---
+
+Fixed [#10110](https://github.com/biomejs/biome/issues/10110): Biome's parser now accepts surrogate code points in JavaScript string `\u{...}` escapes.
diff --git a/crates/biome_js_parser/src/lexer/mod.rs b/crates/biome_js_parser/src/lexer/mod.rs
@@ -557,8 +557,13 @@ impl<'src> JsLexer<'src> {
         }
     }
 
-    // Read a `\u{000...}` escape sequence, this expects the cur char to be the `{`
-    fn read_codepoint_escape_char(&mut self) -> Result<char, ()> {
+    /// Read a `\u{000...}` escape sequence and return the code point value.
+    /// This expects the current char to be the `{`.
+    ///
+    /// This is intended for use in string literal escapes.
+    ///
+    /// This intentionally returns a `u32` rather than a `char`: JS strings allow surrogate code points in unicode escapes, and those are not valid Unicode scalar values, so `char::from_u32` would return `None` for them.
+    fn read_codepoint_escape(&mut self) -> Result<u32, ()> {
         let start = self.position + 1;
         self.read_hexnumber();
 
@@ -600,20 +605,7 @@ impl<'src> JsLexer<'src> {
         };
 
         match u32::from_str_radix(digits_str, 16) {
-            Ok(digits) if digits <= 0x10_FFFF => {
-                let res = std::char::from_u32(digits);
-                if let Some(chr) = res {
-                    Ok(chr)
-                } else {
-                    let err = ParseDiagnostic::new(
-                        "invalid codepoint for unicode escape",
-                        start..self.position,
-                    );
-                    self.push_diagnostic(err);
-                    Err(())
-                }
-            }
-
+            Ok(digits) if digits <= 0x10_FFFF => Ok(digits),
             _ => {
                 let err = ParseDiagnostic::new(
                     "out of bounds codepoint for unicode codepoint escape sequence",
@@ -626,6 +618,25 @@ impl<'src> JsLexer<'src> {
         }
     }
 
+    /// Reads a `\u{000...}` escape sequence and converts it to a valid Unicode scalar value.
+    /// This expects the current char to be the `{`.
+    ///
+    /// This is intended for use in identifier escapes, so it will not attempt to match surrogate pairs, since those are not valid characters in JS identifiers. A code point that is not a valid `char` pushes an "invalid codepoint for unicode escape" diagnostic and yields `Err(())`.
+    fn read_codepoint_escape_char(&mut self) -> Result<char, ()> {
+        debug_assert!(self.current_byte() == Some(b'{'));
+        let start = self.position + 1; // first byte after the `{`
+        // An `Err` from the raw code point read is passed through unchanged by `and_then`.
+        self.read_codepoint_escape().and_then(|codepoint| {
+            std::char::from_u32(codepoint).ok_or_else(|| {
+                let err = ParseDiagnostic::new(
+                    "invalid codepoint for unicode escape",
+                    start..self.position,
+                );
+                self.push_diagnostic(err);
+            })
+        })
+    }
+
     /// Reads a `\u0000` escape sequence.
     ///
     /// This expects the current char to be the `u`. Afterwards, the current
@@ -745,8 +756,9 @@ impl<'src> JsLexer<'src> {
                     true
                 }
                 b'u' if self.peek_byte() == Some(b'{') => {
-                    self.advance(1); // eats '{'
-                    self.read_codepoint_escape_char().is_ok()
+                    self.advance(1); // eats 'u'
+
+                    self.read_codepoint_escape().is_ok()
                 }
                 b'u' => self.read_unicode_escape().is_ok(),
                 b'x' => self.validate_hex_escape(),
diff --git a/crates/biome_js_parser/src/lexer/tests.rs b/crates/biome_js_parser/src/lexer/tests.rs
@@ -3,6 +3,7 @@
 
 use super::{JsLexContext, JsLexer, JsReLexContext, TextRange, TextSize};
 use crate::span::Span;
+use crate::{JsFileSource, JsParserOptions, parse};
 use biome_js_syntax::JsSyntaxKind::{self, EOF, ERROR_TOKEN};
 use biome_js_syntax::JsSyntaxKind::{JS_NUMBER_LITERAL, NEWLINE, WHITESPACE};
 use biome_js_syntax::T;
@@ -337,6 +338,48 @@ fn string_unicode_escape_surrogates() {
     }
 }
 
+#[test]
+fn string_unicode_codepoint_escape_surrogates() {
+    assert_lex! {
+        r#""\u{daff}\u{dfff}""#,
+        JS_STRING_LITERAL:18 // the full 18-byte input: 2 quotes + two 8-byte `\u{....}` escapes
+    }
+    // Same shape with upper-case hex digits; D83D/DE0A is the surrogate pair encoding U+1F60A.
+    assert_lex! {
+        r#""\u{D83D}\u{DE0A}""#,
+        JS_STRING_LITERAL:18
+    }
+}
+
+#[test]
+fn parser_accepts_unicode_codepoint_escape_surrogates_in_strings() {
+    let parsed = parse(
+        "const a = \"\\u{daff}\\u{dfff}\";\nconst b = \"\\u{D83D}\\u{DE0A}\";\n",
+        JsFileSource::js_module(),
+        JsParserOptions::default(),
+    );
+    // Surrogate code points written as `\u{...}` inside string literals must parse without diagnostics.
+    assert!(
+        !parsed.has_errors(),
+        "expected no parse errors, found: {:#?}",
+        parsed.diagnostics()
+    );
+}
+
+#[test]
+fn identifier_unicode_codepoint_escape_surrogates() {
+    let parsed = parse(
+        "const \\u{D83D} = 1;\nconst \\u{DC00} = 2;\n", // a high (D83D) and a low (DC00) surrogate as identifier escapes
+        JsFileSource::js_module(),
+        JsParserOptions::default(),
+    );
+    // NOTE(review): both declarations are parsed in one run, so this only proves at least one is rejected.
+    assert!(
+        parsed.has_errors(),
+        "expected parse errors for surrogate escapes in identifiers, found none"
+    );
+}
+
 #[test]
 fn string_unicode_escape_valid_resolving_to_endquote() {
     assert_lex! {
diff --git a/crates/biome_js_parser/tests/js_test_suite/ok/unicode_codepoint_escape_surrogates.js b/crates/biome_js_parser/tests/js_test_suite/ok/unicode_codepoint_escape_surrogates.js
@@ -0,0 +1,22 @@
+const highMin = "\u{d800}";
+const highMid = "\u{d8ff}";
+const highMax = "\u{DBFF}";
+
+const lowMin = "\u{dc00}";
+const lowMid = "\u{DdEf}";
+const lowMax = "\u{DFFF}";
+
+const adjacentHighLow = "\u{d83d}\u{de0a}";
+const adjacentLowHigh = "\u{de0a}\u{d83d}";
+const repeatedHigh = "\u{d800}\u{d801}\u{d802}";
+const repeatedLow = "\u{dc00}\u{dc01}\u{dc02}";
+
+const wrapped = "start:\u{dabc}:end";
+const mixedCase = "\u{DaFf}\u{dFfF}";
+const interleaved = "A\u{d912}B\u{dd34}C";
+const escapedQuote = "\u{d834}\"\u{dd1e}";
+
+const templateA = `\u{d800}`;
+const templateB = `left \u{dbff} right`;
+const templateC = `\u{dc00}\u{DFFF}`;
+const templateD = `\u{d83d} smile? \u{de42}`;
diff --git a/crates/biome_js_parser/tests/js_test_suite/ok/unicode_codepoint_escape_surrogates.js.snap b/crates/biome_js_parser/tests/js_test_suite/ok/unicode_codepoint_escape_surrogates.js.snap

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@biomejs/biome": patch
 +---
++
 +Fixed [#10110](https://github.com/biomejs/biome/issues/10110): Biome's parser now accepts surrogate code points in JavaScript string `\u{...}` escapes.