summary | refs | log | tree | commit | diff
path: root/ext/json/parser/parser.rl
diff options
context:
space:
mode:
Diffstat (limited to 'ext/json/parser/parser.rl')
-rw-r--r-- ext/json/parser/parser.rl 32
1 files changed, 22 insertions, 10 deletions
diff --git a/ext/json/parser/parser.rl b/ext/json/parser/parser.rl
index 873c1b3007..959b6e7384 100644
--- a/ext/json/parser/parser.rl
+++ b/ext/json/parser/parser.rl
@@ -20,26 +20,28 @@ static const signed char digit_values[256] = {
-1, -1, -1, -1, -1, -1, -1
};
-static UTF32 unescape_unicode(const unsigned char *p)
+static uint32_t unescape_unicode(const unsigned char *p)
{
+ const uint32_t replacement_char = 0xFFFD;
+
signed char b;
- UTF32 result = 0;
+ uint32_t result = 0;
b = digit_values[p[0]];
- if (b < 0) return UNI_REPLACEMENT_CHAR;
+ if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b;
b = digit_values[p[1]];
- if (b < 0) return UNI_REPLACEMENT_CHAR;
+ if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b;
b = digit_values[p[2]];
- if (b < 0) return UNI_REPLACEMENT_CHAR;
+ if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b;
b = digit_values[p[3]];
- if (b < 0) return UNI_REPLACEMENT_CHAR;
+ if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b;
return result;
}
-static int convert_UTF32_to_UTF8(char *buf, UTF32 ch)
+static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
{
int len = 1;
if (ch <= 0x7F) {
@@ -493,9 +495,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
"incomplete unicode character escape sequence at '%s'", p
);
} else {
- UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+ uint32_t ch = unescape_unicode((unsigned char *) ++pe);
pe += 3;
- if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+ /* To handle values above U+FFFF, we take a sequence of
+ * \uXXXX escapes in the U+D800..U+DBFF then
+ * U+DC00..U+DFFF ranges, take the low 10 bits from each
+ * to make a 20-bit number, then add 0x10000 to get the
+ * final codepoint.
+ *
+ * See Unicode 15: §3.8 "Surrogates", §5.3 "Handling
+ * Surrogate Pairs in UTF-16", and §23.6 "Surrogates
+ * Area".
+ */
+ if ((ch & 0xFC00) == 0xD800) {
pe++;
if (pe > stringEnd - 6) {
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
@@ -507,7 +519,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
);
}
if (pe[0] == '\\' && pe[1] == 'u') {
- UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+ uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
| (sur & 0x3FF));
pe += 5;