diff options
author | Luke T. Shumaker <[email protected]> | 2024-02-22 20:51:28 -0700 |
---|---|---|
committer | Hiroshi SHIBATA <[email protected]> | 2024-10-08 14:10:05 +0900 |
commit | 74d459fd52ef85f92f7c20819afcc4ffcf11714d (patch) | |
tree | c967d95e7b5f20bc32956087368571e831f7ded6 /ext/json | |
parent | 6e47968929f2ee77376d28a6561266d8f8e3a4f7 (diff) |
[ruby/json] Adjust to the CVTUTF code being gone
I, Luke T. Shumaker, am the sole author of the added code.
I did not reference CVTUTF when writing it. I did reference the
Unicode standard (15.0.0), the Wikipedia article on UTF-8, and the
Wikipedia article on UTF-16. When I saw some tests fail, I did
reference the old deleted code (but a JSON-specific part, inherently
not as based on CVTUTF) to determine that script_safe should also
escape U+2028 and U+2029.
I targeted simplicity and clarity when writing the code--it can likely
be optimized. In my mind, the obvious next optimization is to have it
combine contiguous non-escaped characters into just one call to
fbuffer_append(), instead of calling fbuffer_append() for each
character.
Regarding the use of the "modern" types `uint32_t`, `uint16_t`, and
`bool`:
- ruby.h is guaranteed to give us uint32_t and uint16_t.
- Since Ruby 3.0.0, ruby.h is guaranteed to give us bool... but we
support down to Ruby 2.3. But, ruby.h is guaranteed to give us
HAVE_STDBOOL_H for the C99 stdbool.h; so use that to include
stdbool.h if we can, and if not then fall back to a copy of the
same bool definition that Ruby 3.0.5 uses with C89.
https://github.com/ruby/json/commit/c96351f874
Diffstat (limited to 'ext/json')
-rw-r--r-- | ext/json/generator/generator.c | 152 | ||||
-rw-r--r-- | ext/json/generator/generator.h | 13 | ||||
-rw-r--r-- | ext/json/parser/parser.c | 182 | ||||
-rw-r--r-- | ext/json/parser/parser.h | 4 | ||||
-rw-r--r-- | ext/json/parser/parser.rl | 32 |
5 files changed, 239 insertions, 144 deletions
diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c index d3f6516511..4d853ec343 100644 --- a/ext/json/generator/generator.c +++ b/ext/json/generator/generator.c @@ -18,50 +18,119 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before, i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth, i_buffer_initial_length, i_dup, i_script_safe, i_escape_slash, i_strict; -/* Escapes the UTF16 character and stores the result in the buffer buf. */ -static void unicode_escape(char *buf, UTF16 character) +/* Converts in_string to a JSON string (without the wrapping '"' + * characters) in FBuffer out_buffer. + * + * Character are JSON-escaped according to: + * + * - Always: ASCII control characters (0x00-0x1F), dquote, and + * backslash. + * + * - If out_ascii_only: non-ASCII characters (>0x7F) + * + * - If out_script_safe: forwardslash, line separator (U+2028), and + * paragraph separator (U+2029) + * + * Everything else (should be UTF-8) is just passed through and + * appended to the result. + */ +static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe) { - const char *digits = "0123456789abcdef"; + const char *hexdig = "0123456789abcdef"; + char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; - buf[2] = digits[character >> 12]; - buf[3] = digits[(character >> 8) & 0xf]; - buf[4] = digits[(character >> 4) & 0xf]; - buf[5] = digits[character & 0xf]; -} + const char *in_utf8_str = RSTRING_PTR(in_string); + unsigned long in_utf8_len = RSTRING_LEN(in_string); + bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string); -/* Escapes the UTF16 character and stores the result in the buffer buf, then - * the buffer buf is appended to the FBuffer buffer. 
*/ -static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 - character) -{ - unicode_escape(buf, character); - fbuffer_append(buffer, buf, 6); -} + unsigned long pos; -/* Converts string to a JSON string in FBuffer buffer, where all but the ASCII - * and control characters are JSON escaped. */ -static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe) -{ - const UTF8 *source = (UTF8 *) RSTRING_PTR(string); - const UTF8 *sourceEnd = source + RSTRING_LEN(string); - char buf[6] = { '\\', 'u' }; + for (pos = 0; pos < in_utf8_len;) { + uint32_t ch; + unsigned long ch_len; + bool should_escape; - RB_GC_GUARD(string); -} + /* UTF-8 decoding */ + if (in_is_ascii_only) { + ch = in_utf8_str[pos]; + ch_len = 1; + } else { + short i; + if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */ + else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */ + else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */ + else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */ + else + rb_raise(rb_path2class("JSON::GeneratorError"), + "source sequence is illegal/malformed utf-8"); + if ((pos+ch_len) > in_utf8_len) + rb_raise(rb_path2class("JSON::GeneratorError"), + "partial character in source, but hit end"); + for (i = 1; i < ch_len; i++) { + if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */ + rb_raise(rb_path2class("JSON::GeneratorError"), + "source sequence is illegal/malformed utf-8"); + ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F); + } + if (ch > 0x10FFFF) + rb_raise(rb_path2class("JSON::GeneratorError"), + "source sequence is illegal/malformed utf-8"); + } -/* Converts string to a JSON string in FBuffer buffer, where only the - * characters required by the JSON standard 
are JSON escaped. The remaining - * characters (should be UTF8) are just passed through and appended to the - * result. */ -static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe) -{ - const char *ptr = RSTRING_PTR(string), *p; - unsigned long len = RSTRING_LEN(string), start = 0, end = 0; - const char *escape = NULL; - int escape_len; - unsigned char c; - char buf[6] = { '\\', 'u' }; - int ascii_only = rb_enc_str_asciionly_p(string); + /* JSON policy */ + should_escape = + (ch < 0x20) || + (ch == '"') || + (ch == '\\') || + (out_ascii_only && (ch > 0x7F)) || + (out_script_safe && (ch == '/')) || + (out_script_safe && (ch == 0x2028)) || + (out_script_safe && (ch == 0x2029)); + + /* JSON encoding */ + if (should_escape) { + switch (ch) { + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; + case '/': fbuffer_append(out_buffer, "\\/", 2); break; + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; + default: + if (ch <= 0xFFFF) { + scratch[2] = hexdig[ch >> 12]; + scratch[3] = hexdig[(ch >> 8) & 0xf]; + scratch[4] = hexdig[(ch >> 4) & 0xf]; + scratch[5] = hexdig[ch & 0xf]; + fbuffer_append(out_buffer, scratch, 6); + } else { + uint16_t hi, lo; + ch -= 0x10000; + hi = 0xD800 + (uint16_t)(ch >> 10); + lo = 0xDC00 + (uint16_t)(ch & 0x3FF); + + scratch[2] = hexdig[hi >> 12]; + scratch[3] = hexdig[(hi >> 8) & 0xf]; + scratch[4] = hexdig[(hi >> 4) & 0xf]; + scratch[5] = hexdig[hi & 0xf]; + + scratch[8] = hexdig[lo >> 12]; + scratch[9] = hexdig[(lo >> 8) & 0xf]; + scratch[10] = hexdig[(lo >> 4) & 0xf]; + scratch[11] = hexdig[lo & 0xf]; + + fbuffer_append(out_buffer, scratch, 12); + } + } + } else { + fbuffer_append(out_buffer, &in_utf8_str[pos], 
ch_len); + } + + pos += ch_len; + } + RB_GC_GUARD(in_string); } static char *fstrndup(const char *ptr, unsigned long len) { @@ -698,12 +767,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S if (!enc_utf8_compatible_p(rb_enc_get(obj))) { obj = rb_str_export_to_enc(obj, rb_utf8_encoding()); } - - if (state->ascii_only) { - convert_UTF8_to_JSON_ASCII(buffer, obj, state->script_safe); - } else { - convert_UTF8_to_JSON(buffer, obj, state->script_safe); - } + convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe); fbuffer_append_char(buffer, '"'); } diff --git a/ext/json/generator/generator.h b/ext/json/generator/generator.h index 95ce2479c6..16aae7bc62 100644 --- a/ext/json/generator/generator.h +++ b/ext/json/generator/generator.h @@ -6,6 +6,14 @@ #include "ruby.h" +#ifdef HAVE_STDBOOL_H +#include <stdbool.h> +#else +/* This is the fallback definition from Ruby 3.0.5. */ +typedef unsigned char _Bool +#define bool _Bool +#endif + #ifdef HAVE_RUBY_RE_H #include "ruby/re.h" #else @@ -22,10 +30,7 @@ #define option_given_p(opts, key) RTEST(rb_funcall(opts, i_key_p, 1, key)) -static void unicode_escape(char *buf, UTF16 character); -static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character); -static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe); -static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe); +static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe); static char *fstrndup(const char *ptr, unsigned long len); /* ruby api and some helpers */ diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c index 128f683e0f..b7428891a5 100644 --- a/ext/json/parser/parser.c +++ b/ext/json/parser/parser.c @@ -22,26 +22,28 @@ static const signed char digit_values[256] = { -1, -1, -1, -1, -1, -1, -1 }; -static UTF32 unescape_unicode(const unsigned char *p) +static uint32_t 
unescape_unicode(const unsigned char *p) { + const uint32_t replacement_char = 0xFFFD; + signed char b; - UTF32 result = 0; + uint32_t result = 0; b = digit_values[p[0]]; - if (b < 0) return UNI_REPLACEMENT_CHAR; + if (b < 0) return replacement_char; result = (result << 4) | (unsigned char)b; b = digit_values[p[1]]; - if (b < 0) return UNI_REPLACEMENT_CHAR; + if (b < 0) return replacement_char; result = (result << 4) | (unsigned char)b; b = digit_values[p[2]]; - if (b < 0) return UNI_REPLACEMENT_CHAR; + if (b < 0) return replacement_char; result = (result << 4) | (unsigned char)b; b = digit_values[p[3]]; - if (b < 0) return UNI_REPLACEMENT_CHAR; + if (b < 0) return replacement_char; result = (result << 4) | (unsigned char)b; return result; } -static int convert_UTF32_to_UTF8(char *buf, UTF32 ch) +static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) { int len = 1; if (ch <= 0x7F) { @@ -77,11 +79,11 @@ static ID i_json_creatable_p, i_json_create, i_create_id, i_create_additions, i_leftshift, i_new, i_try_convert, i_freeze, i_uminus; -#line 125 "parser.rl" +#line 105 "parser.rl" -#line 107 "parser.c" +#line 87 "parser.c" enum {JSON_object_start = 1}; enum {JSON_object_first_final = 27}; enum {JSON_object_error = 0}; @@ -89,7 +91,7 @@ enum {JSON_object_error = 0}; enum {JSON_object_en_main = 1}; -#line 167 "parser.rl" +#line 147 "parser.rl" static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting) @@ -105,14 +107,14 @@ static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *resu *result = NIL_P(object_class) ? 
rb_hash_new() : rb_class_new_instance(0, 0, object_class); -#line 131 "parser.c" +#line 111 "parser.c" { cs = JSON_object_start; } -#line 182 "parser.rl" +#line 162 "parser.rl" -#line 138 "parser.c" +#line 118 "parser.c" { if ( p == pe ) goto _test_eof; @@ -140,7 +142,7 @@ case 2: goto st2; goto st0; tr2: -#line 149 "parser.rl" +#line 129 "parser.rl" { char *np; json->parsing_name = 1; @@ -153,7 +155,7 @@ st3: if ( ++p == pe ) goto _test_eof3; case 3: -#line 179 "parser.c" +#line 159 "parser.c" switch( (*p) ) { case 13: goto st3; case 32: goto st3; @@ -220,7 +222,7 @@ case 8: goto st8; goto st0; tr11: -#line 133 "parser.rl" +#line 113 "parser.rl" { VALUE v = Qnil; char *np = JSON_parse_value(json, p, pe, &v, current_nesting); @@ -241,7 +243,7 @@ st9: if ( ++p == pe ) goto _test_eof9; case 9: -#line 267 "parser.c" +#line 247 "parser.c" switch( (*p) ) { case 13: goto st9; case 32: goto st9; @@ -330,14 +332,14 @@ case 18: goto st9; goto st18; tr4: -#line 157 "parser.rl" +#line 137 "parser.rl" { p--; {p++; cs = 27; goto _out;} } goto st27; st27: if ( ++p == pe ) goto _test_eof27; case 27: -#line 363 "parser.c" +#line 343 "parser.c" goto st0; st19: if ( ++p == pe ) @@ -435,7 +437,7 @@ case 26: _out: {} } -#line 183 "parser.rl" +#line 163 "parser.rl" if (cs >= JSON_object_first_final) { if (json->create_additions) { @@ -460,7 +462,7 @@ case 26: -#line 486 "parser.c" +#line 466 "parser.c" enum {JSON_value_start = 1}; enum {JSON_value_first_final = 29}; enum {JSON_value_error = 0}; @@ -468,7 +470,7 @@ enum {JSON_value_error = 0}; enum {JSON_value_en_main = 1}; -#line 283 "parser.rl" +#line 263 "parser.rl" static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting) @@ -476,14 +478,14 @@ static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *resul int cs = EVIL; -#line 502 "parser.c" +#line 482 "parser.c" { cs = JSON_value_start; } -#line 290 "parser.rl" +#line 270 "parser.rl" -#line 509 "parser.c" +#line 489 
"parser.c" { if ( p == pe ) goto _test_eof; @@ -517,14 +519,14 @@ st0: cs = 0; goto _out; tr2: -#line 235 "parser.rl" +#line 215 "parser.rl" { char *np = JSON_parse_string(json, p, pe, result); if (np == NULL) { p--; {p++; cs = 29; goto _out;} } else {p = (( np))-1;} } goto st29; tr3: -#line 240 "parser.rl" +#line 220 "parser.rl" { char *np; if(pe > p + 8 && !strncmp(MinusInfinity, p, 9)) { @@ -544,7 +546,7 @@ tr3: } goto st29; tr7: -#line 258 "parser.rl" +#line 238 "parser.rl" { char *np; np = JSON_parse_array(json, p, pe, result, current_nesting + 1); @@ -552,7 +554,7 @@ tr7: } goto st29; tr11: -#line 264 "parser.rl" +#line 244 "parser.rl" { char *np; np = JSON_parse_object(json, p, pe, result, current_nesting + 1); @@ -560,7 +562,7 @@ tr11: } goto st29; tr25: -#line 228 "parser.rl" +#line 208 "parser.rl" { if (json->allow_nan) { *result = CInfinity; @@ -570,7 +572,7 @@ tr25: } goto st29; tr27: -#line 221 "parser.rl" +#line 201 "parser.rl" { if (json->allow_nan) { *result = CNaN; @@ -580,19 +582,19 @@ tr27: } goto st29; tr31: -#line 215 "parser.rl" +#line 195 "parser.rl" { *result = Qfalse; } goto st29; tr34: -#line 212 "parser.rl" +#line 192 "parser.rl" { *result = Qnil; } goto st29; tr37: -#line 218 "parser.rl" +#line 198 "parser.rl" { *result = Qtrue; } @@ -601,9 +603,9 @@ st29: if ( ++p == pe ) goto _test_eof29; case 29: -#line 270 "parser.rl" +#line 250 "parser.rl" { p--; {p++; cs = 29; goto _out;} } -#line 629 "parser.c" +#line 609 "parser.c" switch( (*p) ) { case 13: goto st29; case 32: goto st29; @@ -844,7 +846,7 @@ case 28: _out: {} } -#line 291 "parser.rl" +#line 271 "parser.rl" if (json->freeze) { OBJ_FREEZE(*result); @@ -858,7 +860,7 @@ case 28: } -#line 884 "parser.c" +#line 864 "parser.c" enum {JSON_integer_start = 1}; enum {JSON_integer_first_final = 3}; enum {JSON_integer_error = 0}; @@ -866,7 +868,7 @@ enum {JSON_integer_error = 0}; enum {JSON_integer_en_main = 1}; -#line 311 "parser.rl" +#line 291 "parser.rl" static char 
*JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result) @@ -874,15 +876,15 @@ static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *res int cs = EVIL; -#line 900 "parser.c" +#line 880 "parser.c" { cs = JSON_integer_start; } -#line 318 "parser.rl" +#line 298 "parser.rl" json->memo = p; -#line 908 "parser.c" +#line 888 "parser.c" { if ( p == pe ) goto _test_eof; @@ -916,14 +918,14 @@ case 3: goto st0; goto tr4; tr4: -#line 308 "parser.rl" +#line 288 "parser.rl" { p--; {p++; cs = 4; goto _out;} } goto st4; st4: if ( ++p == pe ) goto _test_eof4; case 4: -#line 949 "parser.c" +#line 929 "parser.c" goto st0; st5: if ( ++p == pe ) @@ -942,7 +944,7 @@ case 5: _out: {} } -#line 320 "parser.rl" +#line 300 "parser.rl" if (cs >= JSON_integer_first_final) { long len = p - json->memo; @@ -957,7 +959,7 @@ case 5: } -#line 983 "parser.c" +#line 963 "parser.c" enum {JSON_float_start = 1}; enum {JSON_float_first_final = 8}; enum {JSON_float_error = 0}; @@ -965,7 +967,7 @@ enum {JSON_float_error = 0}; enum {JSON_float_en_main = 1}; -#line 345 "parser.rl" +#line 325 "parser.rl" static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *result) @@ -973,15 +975,15 @@ static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *resul int cs = EVIL; -#line 999 "parser.c" +#line 979 "parser.c" { cs = JSON_float_start; } -#line 352 "parser.rl" +#line 332 "parser.rl" json->memo = p; -#line 1007 "parser.c" +#line 987 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1039,14 +1041,14 @@ case 8: goto st0; goto tr9; tr9: -#line 339 "parser.rl" +#line 319 "parser.rl" { p--; {p++; cs = 9; goto _out;} } goto st9; st9: if ( ++p == pe ) goto _test_eof9; case 9: -#line 1072 "parser.c" +#line 1052 "parser.c" goto st0; st5: if ( ++p == pe ) @@ -1107,7 +1109,7 @@ case 7: _out: {} } -#line 354 "parser.rl" +#line 334 "parser.rl" if (cs >= JSON_float_first_final) { VALUE mod = Qnil; @@ -1158,7 +1160,7 @@ case 7: -#line 1184 "parser.c" 
+#line 1164 "parser.c" enum {JSON_array_start = 1}; enum {JSON_array_first_final = 17}; enum {JSON_array_error = 0}; @@ -1166,7 +1168,7 @@ enum {JSON_array_error = 0}; enum {JSON_array_en_main = 1}; -#line 432 "parser.rl" +#line 412 "parser.rl" static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting) @@ -1180,14 +1182,14 @@ static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *resul *result = NIL_P(array_class) ? rb_ary_new() : rb_class_new_instance(0, 0, array_class); -#line 1206 "parser.c" +#line 1186 "parser.c" { cs = JSON_array_start; } -#line 445 "parser.rl" +#line 425 "parser.rl" -#line 1213 "parser.c" +#line 1193 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1226,7 +1228,7 @@ case 2: goto st2; goto st0; tr2: -#line 409 "parser.rl" +#line 389 "parser.rl" { VALUE v = Qnil; char *np = JSON_parse_value(json, p, pe, &v, current_nesting); @@ -1246,7 +1248,7 @@ st3: if ( ++p == pe ) goto _test_eof3; case 3: -#line 1272 "parser.c" +#line 1252 "parser.c" switch( (*p) ) { case 13: goto st3; case 32: goto st3; @@ -1346,14 +1348,14 @@ case 12: goto st3; goto st12; tr4: -#line 424 "parser.rl" +#line 404 "parser.rl" { p--; {p++; cs = 17; goto _out;} } goto st17; st17: if ( ++p == pe ) goto _test_eof17; case 17: -#line 1379 "parser.c" +#line 1359 "parser.c" goto st0; st13: if ( ++p == pe ) @@ -1409,7 +1411,7 @@ case 16: _out: {} } -#line 446 "parser.rl" +#line 426 "parser.rl" if(cs >= JSON_array_first_final) { return p + 1; @@ -1482,9 +1484,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int "incomplete unicode character escape sequence at '%s'", p ); } else { - UTF32 ch = unescape_unicode((unsigned char *) ++pe); + uint32_t ch = unescape_unicode((unsigned char *) ++pe); pe += 3; - if (UNI_SUR_HIGH_START == (ch & 0xFC00)) { + /* To handle values above U+FFFF, we take a sequence of + * \uXXXX escapes in the U+D800..U+DBFF then + * U+DC00..U+DFFF ranges, take the low 10 
bits from each + * to make a 20-bit number, then add 0x10000 to get the + * final codepoint. + * + * See Unicode 15: §3.8 "Surrogates", §5.3 "Handling + * Surrogate Pairs in UTF-16", and §23.6 "Surrogates + * Area". + */ + if ((ch & 0xFC00) == 0xD800) { pe++; if (pe > stringEnd - 6) { if (bufferSize > MAX_STACK_BUFFER_SIZE) { @@ -1496,7 +1508,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int ); } if (pe[0] == '\\' && pe[1] == 'u') { - UTF32 sur = unescape_unicode((unsigned char *) pe + 2); + uint32_t sur = unescape_unicode((unsigned char *) pe + 2); ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF)); pe += 5; @@ -1566,7 +1578,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int } -#line 1592 "parser.c" +#line 1582 "parser.c" enum {JSON_string_start = 1}; enum {JSON_string_first_final = 8}; enum {JSON_string_error = 0}; @@ -1574,7 +1586,7 @@ enum {JSON_string_error = 0}; enum {JSON_string_en_main = 1}; -#line 620 "parser.rl" +#line 610 "parser.rl" static int @@ -1595,15 +1607,15 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu VALUE match_string; -#line 1621 "parser.c" +#line 1611 "parser.c" { cs = JSON_string_start; } -#line 640 "parser.rl" +#line 630 "parser.rl" json->memo = p; -#line 1629 "parser.c" +#line 1619 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1628,7 +1640,7 @@ case 2: goto st0; goto st2; tr2: -#line 607 "parser.rl" +#line 597 "parser.rl" { *result = json_string_unescape(json->memo + 1, p, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names); if (NIL_P(*result)) { @@ -1638,14 +1650,14 @@ tr2: {p = (( p + 1))-1;} } } -#line 617 "parser.rl" +#line 607 "parser.rl" { p--; {p++; cs = 8; goto _out;} } goto st8; st8: if ( ++p == pe ) goto _test_eof8; case 8: -#line 1671 "parser.c" +#line 1661 "parser.c" goto st0; st3: if ( ++p == pe ) @@ -1721,7 +1733,7 @@ case 7: _out: {} } -#line 642 "parser.rl" 
+#line 632 "parser.rl" if (json->create_additions && RTEST(match_string = json->match_string)) { VALUE klass; @@ -1755,6 +1767,7 @@ case 7: static VALUE convert_encoding(VALUE source) { +#ifdef HAVE_RUBY_ENCODING_H rb_encoding *enc = rb_enc_get(source); if (enc == rb_ascii8bit_encoding()) { if (OBJ_FROZEN(source)) { @@ -1764,7 +1777,8 @@ static VALUE convert_encoding(VALUE source) } else { source = rb_str_conv_enc(source, rb_enc_get(source), rb_utf8_encoding()); } - return source; +#endif + return source; } /* @@ -1892,7 +1906,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self) } -#line 1920 "parser.c" +#line 1910 "parser.c" enum {JSON_start = 1}; enum {JSON_first_final = 10}; enum {JSON_error = 0}; @@ -1900,7 +1914,7 @@ enum {JSON_error = 0}; enum {JSON_en_main = 1}; -#line 828 "parser.rl" +#line 818 "parser.rl" /* @@ -1918,16 +1932,16 @@ static VALUE cParser_parse(VALUE self) GET_PARSER; -#line 1946 "parser.c" +#line 1936 "parser.c" { cs = JSON_start; } -#line 845 "parser.rl" +#line 835 "parser.rl" p = json->source; pe = p + json->len; -#line 1955 "parser.c" +#line 1945 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1961,7 +1975,7 @@ st0: cs = 0; goto _out; tr2: -#line 820 "parser.rl" +#line 810 "parser.rl" { char *np = JSON_parse_value(json, p, pe, &result, 0); if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;} @@ -1971,7 +1985,7 @@ st10: if ( ++p == pe ) goto _test_eof10; case 10: -#line 1999 "parser.c" +#line 1989 "parser.c" switch( (*p) ) { case 13: goto st10; case 32: goto st10; @@ -2060,7 +2074,7 @@ case 9: _out: {} } -#line 848 "parser.rl" +#line 838 "parser.rl" if (cs >= JSON_first_final && p == pe) { return result; diff --git a/ext/json/parser/parser.h b/ext/json/parser/parser.h index d80f1b7303..f6974461ae 100644 --- a/ext/json/parser/parser.h +++ b/ext/json/parser/parser.h @@ -48,8 +48,8 @@ typedef struct JSON_ParserStruct { #define MinusInfinity "-Infinity" #define EVIL 0x666 -static UTF32 
unescape_unicode(const unsigned char *p); -static int convert_UTF32_to_UTF8(char *buf, UTF32 ch); +static uint32_t unescape_unicode(const unsigned char *p); +static int convert_UTF32_to_UTF8(char *buf, uint32_t ch); static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting); static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting); static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result); diff --git a/ext/json/parser/parser.rl b/ext/json/parser/parser.rl index 873c1b3007..959b6e7384 100644 --- a/ext/json/parser/parser.rl +++ b/ext/json/parser/parser.rl @@ -20,26 +20,28 @@ static const signed char digit_values[256] = { -1, -1, -1, -1, -1, -1, -1 }; -static UTF32 unescape_unicode(const unsigned char *p) +static uint32_t unescape_unicode(const unsigned char *p) { + const uint32_t replacement_char = 0xFFFD; + signed char b; - UTF32 result = 0; + uint32_t result = 0; b = digit_values[p[0]]; - if (b < 0) return UNI_REPLACEMENT_CHAR; + if (b < 0) return replacement_char; result = (result << 4) | (unsigned char)b; b = digit_values[p[1]]; - if (b < 0) return UNI_REPLACEMENT_CHAR; + if (b < 0) return replacement_char; result = (result << 4) | (unsigned char)b; b = digit_values[p[2]]; - if (b < 0) return UNI_REPLACEMENT_CHAR; + if (b < 0) return replacement_char; result = (result << 4) | (unsigned char)b; b = digit_values[p[3]]; - if (b < 0) return UNI_REPLACEMENT_CHAR; + if (b < 0) return replacement_char; result = (result << 4) | (unsigned char)b; return result; } -static int convert_UTF32_to_UTF8(char *buf, UTF32 ch) +static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) { int len = 1; if (ch <= 0x7F) { @@ -493,9 +495,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int "incomplete unicode character escape sequence at '%s'", p ); } else { - UTF32 ch = unescape_unicode((unsigned char *) ++pe); + uint32_t ch = 
unescape_unicode((unsigned char *) ++pe); pe += 3; - if (UNI_SUR_HIGH_START == (ch & 0xFC00)) { + /* To handle values above U+FFFF, we take a sequence of + * \uXXXX escapes in the U+D800..U+DBFF then + * U+DC00..U+DFFF ranges, take the low 10 bits from each + * to make a 20-bit number, then add 0x10000 to get the + * final codepoint. + * + * See Unicode 15: §3.8 "Surrogates", §5.3 "Handling + * Surrogate Pairs in UTF-16", and §23.6 "Surrogates + * Area". + */ + if ((ch & 0xFC00) == 0xD800) { pe++; if (pe > stringEnd - 6) { if (bufferSize > MAX_STACK_BUFFER_SIZE) { @@ -507,7 +519,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int ); } if (pe[0] == '\\' && pe[1] == 'u') { - UTF32 sur = unescape_unicode((unsigned char *) pe + 2); + uint32_t sur = unescape_unicode((unsigned char *) pe + 2); ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF)); pe += 5; |