[ruby/json] Adjust to the CVTUTF code being gone

I, Luke T. Shumaker, am the sole author of the added code.

I did not reference CVTUTF when writing it. I did reference the Unicode standard (15.0.0), the Wikipedia article on UTF-8, and the Wikipedia article on UTF-16. When I saw some tests fail, I did reference the old deleted code (but a JSON-specific part, inherently not as based on CVTUTF) to determine that script_safe should also escape U+2028 and U+2029.

I targeted simplicity and clarity when writing the code--it can likely be optimized. In my mind, the obvious next optimization is to have it combine contiguous non-escaped characters into just one call to fbuffer_append(), instead of calling fbuffer_append() for each character.

Regarding the use of the "modern" types `uint32_t`, `uint16_t`, and `bool`:

 - ruby.h is guaranteed to give us uint32_t and uint16_t.

 - Since Ruby 3.0.0, ruby.h is guaranteed to give us bool... but we support down to Ruby 2.3. But, ruby.h is guaranteed to give us HAVE_STDBOOL_H for the C99 stdbool.h; so use that to include stdbool.h if we can, and if not then fall back to a copy of the same bool definition that Ruby 3.0.5 uses with C89.

https://github.com/ruby/json/commit/c96351f874
author: Luke T. Shumaker <[email protected]> 2024-02-22 20:51:28 -0700
committer: Hiroshi SHIBATA <[email protected]> 2024-10-08 14:10:05 +0900
commit: 74d459fd52ef85f92f7c20819afcc4ffcf11714d (patch)
tree: c967d95e7b5f20bc32956087368571e831f7ded6 /ext/json
parent: 6e47968929f2ee77376d28a6561266d8f8e3a4f7 (diff)
5 files changed, 239 insertions, 144 deletions
diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c
index d3f6516511..4d853ec343 100644
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@@ -18,50 +18,119 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before,
           i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth,
           i_buffer_initial_length, i_dup, i_script_safe, i_escape_slash, i_strict;
 
-/* Escapes the UTF16 character and stores the result in the buffer buf. */
-static void unicode_escape(char *buf, UTF16 character)
+/* Converts in_string to a JSON string (without the wrapping '"'
+ * characters) in FBuffer out_buffer.
+ *
+ * Characters are JSON-escaped according to:
+ *
+ * - Always: ASCII control characters (0x00-0x1F), dquote, and
+ *   backslash.
+ *
+ * - If out_ascii_only: non-ASCII characters (>0x7F)
+ *
+ * - If out_script_safe: forwardslash, line separator (U+2028), and
+ *   paragraph separator (U+2029)
+ *
+ * Everything else (should be UTF-8) is just passed through and
+ * appended to the result.
+ */
+static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
 {
-    const char *digits = "0123456789abcdef";
+    const char *hexdig = "0123456789abcdef";
+    char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
 
-    buf[2] = digits[character >> 12];
-    buf[3] = digits[(character >> 8) & 0xf];
-    buf[4] = digits[(character >> 4) & 0xf];
-    buf[5] = digits[character & 0xf];
-}
+    const char *in_utf8_str = RSTRING_PTR(in_string);
+    unsigned long in_utf8_len = RSTRING_LEN(in_string);
+    bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);
 
-/* Escapes the UTF16 character and stores the result in the buffer buf, then
- * the buffer buf is appended to the FBuffer buffer. */
-static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16
-        character)
-{
-    unicode_escape(buf, character);
-    fbuffer_append(buffer, buf, 6);
-}
+    unsigned long pos;
 
-/* Converts string to a JSON string in FBuffer buffer, where all but the ASCII
- * and control characters are JSON escaped. */
-static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe)
-{
-    const UTF8 *source = (UTF8 *) RSTRING_PTR(string);
-    const UTF8 *sourceEnd = source + RSTRING_LEN(string);
-    char buf[6] = { '\\', 'u' };
+    for (pos =  0; pos < in_utf8_len;) {
+        uint32_t ch;
+        unsigned long ch_len;
+        bool should_escape;
 
-    RB_GC_GUARD(string);
-}
+        /* UTF-8 decoding */
+        if (in_is_ascii_only) {
+            ch = in_utf8_str[pos];
+            ch_len = 1;
+        } else {
+            short i;
+            if      ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos];        } /* leading 1 bit is   0b0     */
+            else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110   */
+            else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110  */
+            else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
+            else
+                rb_raise(rb_path2class("JSON::GeneratorError"),
+                         "source sequence is illegal/malformed utf-8");
+            if ((pos+ch_len) > in_utf8_len)
+                rb_raise(rb_path2class("JSON::GeneratorError"),
+                         "partial character in source, but hit end");
+            for (i = 1; i < ch_len; i++) {
+                if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
+                    rb_raise(rb_path2class("JSON::GeneratorError"),
+                             "source sequence is illegal/malformed utf-8");
+                ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
+            }
+            if (ch > 0x10FFFF)
+                rb_raise(rb_path2class("JSON::GeneratorError"),
+                         "source sequence is illegal/malformed utf-8");
+        }
 
-/* Converts string to a JSON string in FBuffer buffer, where only the
- * characters required by the JSON standard are JSON escaped. The remaining
- * characters (should be UTF8) are just passed through and appended to the
- * result. */
-static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe)
-{
-    const char *ptr = RSTRING_PTR(string), *p;
-    unsigned long len = RSTRING_LEN(string), start = 0, end = 0;
-    const char *escape = NULL;
-    int escape_len;
-    unsigned char c;
-    char buf[6] = { '\\', 'u' };
-    int ascii_only = rb_enc_str_asciionly_p(string);
+        /* JSON policy */
+        should_escape =
+            (ch < 0x20) ||
+            (ch == '"') ||
+            (ch == '\\') ||
+            (out_ascii_only && (ch > 0x7F)) ||
+            (out_script_safe && (ch == '/')) ||
+            (out_script_safe && (ch == 0x2028)) ||
+            (out_script_safe && (ch == 0x2029));
+
+        /* JSON encoding */
+        if (should_escape) {
+            switch (ch) {
+                case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
+                case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
+                case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
+                case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
+                case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
+                case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
+                case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
+                case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
+                default:
+                    if (ch <= 0xFFFF) {
+                        scratch[2] = hexdig[ch >> 12];
+                        scratch[3] = hexdig[(ch >> 8) & 0xf];
+                        scratch[4] = hexdig[(ch >> 4) & 0xf];
+                        scratch[5] = hexdig[ch & 0xf];
+                        fbuffer_append(out_buffer, scratch, 6);
+                    } else {
+			uint16_t hi, lo;
+                        ch -= 0x10000;
+                        hi = 0xD800 + (uint16_t)(ch >> 10);
+                        lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
+
+                        scratch[2] = hexdig[hi >> 12];
+                        scratch[3] = hexdig[(hi >> 8) & 0xf];
+                        scratch[4] = hexdig[(hi >> 4) & 0xf];
+                        scratch[5] = hexdig[hi & 0xf];
+
+                        scratch[8] = hexdig[lo >> 12];
+                        scratch[9] = hexdig[(lo >> 8) & 0xf];
+                        scratch[10] = hexdig[(lo >> 4) & 0xf];
+                        scratch[11] = hexdig[lo & 0xf];
+
+                        fbuffer_append(out_buffer, scratch, 12);
+                    }
+            }
+        } else {
+            fbuffer_append(out_buffer, &in_utf8_str[pos], ch_len);
+        }
+
+        pos += ch_len;
+    }
+    RB_GC_GUARD(in_string);
 }
 
 static char *fstrndup(const char *ptr, unsigned long len) {
@@ -698,12 +767,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
     if (!enc_utf8_compatible_p(rb_enc_get(obj))) {
         obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
     }
-
-    if (state->ascii_only) {
-        convert_UTF8_to_JSON_ASCII(buffer, obj, state->script_safe);
-    } else {
-        convert_UTF8_to_JSON(buffer, obj, state->script_safe);
-    }
+    convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);
     fbuffer_append_char(buffer, '"');
 }
 
diff --git a/ext/json/generator/generator.h b/ext/json/generator/generator.h
index 95ce2479c6..16aae7bc62 100644
--- a/ext/json/generator/generator.h
+++ b/ext/json/generator/generator.h
@@ -6,6 +6,14 @@
 
 #include "ruby.h"
 
+#ifdef HAVE_STDBOOL_H
+#include <stdbool.h>
+#else
+/* No C99 <stdbool.h>: use the fallback bool definition from Ruby 3.0.5. */
+typedef unsigned char _Bool;
+#define bool _Bool
+#endif
+
 #ifdef HAVE_RUBY_RE_H
 #include "ruby/re.h"
 #else
@@ -22,10 +30,7 @@
 
 #define option_given_p(opts, key) RTEST(rb_funcall(opts, i_key_p, 1, key))
 
-static void unicode_escape(char *buf, UTF16 character);
-static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
-static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);
-static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe);
+static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe);
 static char *fstrndup(const char *ptr, unsigned long len);
 
 /* ruby api and some helpers */
diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c
index 128f683e0f..b7428891a5 100644
--- a/ext/json/parser/parser.c
+++ b/ext/json/parser/parser.c
@@ -22,26 +22,28 @@ static const signed char digit_values[256] = {
     -1, -1, -1, -1, -1, -1, -1
 };
 
-static UTF32 unescape_unicode(const unsigned char *p)
+static uint32_t unescape_unicode(const unsigned char *p)
 {
+    const uint32_t replacement_char = 0xFFFD;
+
     signed char b;
-    UTF32 result = 0;
+    uint32_t result = 0;
     b = digit_values[p[0]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[1]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[2]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[3]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     return result;
 }
 
-static int convert_UTF32_to_UTF8(char *buf, UTF32 ch)
+static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
 {
     int len = 1;
     if (ch <= 0x7F) {
@@ -77,11 +79,11 @@ static ID i_json_creatable_p, i_json_create, i_create_id, i_create_additions,
           i_leftshift, i_new, i_try_convert, i_freeze, i_uminus;
 
 
-#line 125 "parser.rl"
+#line 105 "parser.rl"
 
 
 
-#line 107 "parser.c"
+#line 87 "parser.c"
 enum {JSON_object_start = 1};
 enum {JSON_object_first_final = 27};
 enum {JSON_object_error = 0};
@@ -89,7 +91,7 @@ enum {JSON_object_error = 0};
 enum {JSON_object_en_main = 1};
 
 
-#line 167 "parser.rl"
+#line 147 "parser.rl"
 
 
 static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting)
@@ -105,14 +107,14 @@ static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *resu
     *result = NIL_P(object_class) ? rb_hash_new() : rb_class_new_instance(0, 0, object_class);
 
 
-#line 131 "parser.c"
+#line 111 "parser.c"
 	{
 	cs = JSON_object_start;
 	}
 
-#line 182 "parser.rl"
+#line 162 "parser.rl"
 
-#line 138 "parser.c"
+#line 118 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -140,7 +142,7 @@ case 2:
 		goto st2;
 	goto st0;
 tr2:
-#line 149 "parser.rl"
+#line 129 "parser.rl"
 	{
         char *np;
         json->parsing_name = 1;
@@ -153,7 +155,7 @@ st3:
 	if ( ++p == pe )
 		goto _test_eof3;
 case 3:
-#line 179 "parser.c"
+#line 159 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st3;
 		case 32: goto st3;
@@ -220,7 +222,7 @@ case 8:
 		goto st8;
 	goto st0;
 tr11:
-#line 133 "parser.rl"
+#line 113 "parser.rl"
 	{
         VALUE v = Qnil;
         char *np = JSON_parse_value(json, p, pe, &v, current_nesting);
@@ -241,7 +243,7 @@ st9:
 	if ( ++p == pe )
 		goto _test_eof9;
 case 9:
-#line 267 "parser.c"
+#line 247 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st9;
 		case 32: goto st9;
@@ -330,14 +332,14 @@ case 18:
 		goto st9;
 	goto st18;
 tr4:
-#line 157 "parser.rl"
+#line 137 "parser.rl"
 	{ p--; {p++; cs = 27; goto _out;} }
 	goto st27;
 st27:
 	if ( ++p == pe )
 		goto _test_eof27;
 case 27:
-#line 363 "parser.c"
+#line 343 "parser.c"
 	goto st0;
 st19:
 	if ( ++p == pe )
@@ -435,7 +437,7 @@ case 26:
 	_out: {}
 	}
 
-#line 183 "parser.rl"
+#line 163 "parser.rl"
 
     if (cs >= JSON_object_first_final) {
         if (json->create_additions) {
@@ -460,7 +462,7 @@ case 26:
 
 
 
-#line 486 "parser.c"
+#line 466 "parser.c"
 enum {JSON_value_start = 1};
 enum {JSON_value_first_final = 29};
 enum {JSON_value_error = 0};
@@ -468,7 +470,7 @@ enum {JSON_value_error = 0};
 enum {JSON_value_en_main = 1};
 
 
-#line 283 "parser.rl"
+#line 263 "parser.rl"
 
 
 static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting)
@@ -476,14 +478,14 @@ static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *resul
     int cs = EVIL;
 
 
-#line 502 "parser.c"
+#line 482 "parser.c"
 	{
 	cs = JSON_value_start;
 	}
 
-#line 290 "parser.rl"
+#line 270 "parser.rl"
 
-#line 509 "parser.c"
+#line 489 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -517,14 +519,14 @@ st0:
 cs = 0;
 	goto _out;
 tr2:
-#line 235 "parser.rl"
+#line 215 "parser.rl"
 	{
         char *np = JSON_parse_string(json, p, pe, result);
         if (np == NULL) { p--; {p++; cs = 29; goto _out;} } else {p = (( np))-1;}
     }
 	goto st29;
 tr3:
-#line 240 "parser.rl"
+#line 220 "parser.rl"
 	{
         char *np;
         if(pe > p + 8 && !strncmp(MinusInfinity, p, 9)) {
@@ -544,7 +546,7 @@ tr3:
     }
 	goto st29;
 tr7:
-#line 258 "parser.rl"
+#line 238 "parser.rl"
 	{
         char *np;
         np = JSON_parse_array(json, p, pe, result, current_nesting + 1);
@@ -552,7 +554,7 @@ tr7:
     }
 	goto st29;
 tr11:
-#line 264 "parser.rl"
+#line 244 "parser.rl"
 	{
         char *np;
         np =  JSON_parse_object(json, p, pe, result, current_nesting + 1);
@@ -560,7 +562,7 @@ tr11:
     }
 	goto st29;
 tr25:
-#line 228 "parser.rl"
+#line 208 "parser.rl"
 	{
         if (json->allow_nan) {
             *result = CInfinity;
@@ -570,7 +572,7 @@ tr25:
     }
 	goto st29;
 tr27:
-#line 221 "parser.rl"
+#line 201 "parser.rl"
 	{
         if (json->allow_nan) {
             *result = CNaN;
@@ -580,19 +582,19 @@ tr27:
     }
 	goto st29;
 tr31:
-#line 215 "parser.rl"
+#line 195 "parser.rl"
 	{
         *result = Qfalse;
     }
 	goto st29;
 tr34:
-#line 212 "parser.rl"
+#line 192 "parser.rl"
 	{
         *result = Qnil;
     }
 	goto st29;
 tr37:
-#line 218 "parser.rl"
+#line 198 "parser.rl"
 	{
         *result = Qtrue;
     }
@@ -601,9 +603,9 @@ st29:
 	if ( ++p == pe )
 		goto _test_eof29;
 case 29:
-#line 270 "parser.rl"
+#line 250 "parser.rl"
 	{ p--; {p++; cs = 29; goto _out;} }
-#line 629 "parser.c"
+#line 609 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st29;
 		case 32: goto st29;
@@ -844,7 +846,7 @@ case 28:
 	_out: {}
 	}
 
-#line 291 "parser.rl"
+#line 271 "parser.rl"
 
     if (json->freeze) {
         OBJ_FREEZE(*result);
@@ -858,7 +860,7 @@ case 28:
 }
 
 
-#line 884 "parser.c"
+#line 864 "parser.c"
 enum {JSON_integer_start = 1};
 enum {JSON_integer_first_final = 3};
 enum {JSON_integer_error = 0};
@@ -866,7 +868,7 @@ enum {JSON_integer_error = 0};
 enum {JSON_integer_en_main = 1};
 
 
-#line 311 "parser.rl"
+#line 291 "parser.rl"
 
 
 static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result)
@@ -874,15 +876,15 @@ static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *res
     int cs = EVIL;
 
 
-#line 900 "parser.c"
+#line 880 "parser.c"
 	{
 	cs = JSON_integer_start;
 	}
 
-#line 318 "parser.rl"
+#line 298 "parser.rl"
     json->memo = p;
 
-#line 908 "parser.c"
+#line 888 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -916,14 +918,14 @@ case 3:
 		goto st0;
 	goto tr4;
 tr4:
-#line 308 "parser.rl"
+#line 288 "parser.rl"
 	{ p--; {p++; cs = 4; goto _out;} }
 	goto st4;
 st4:
 	if ( ++p == pe )
 		goto _test_eof4;
 case 4:
-#line 949 "parser.c"
+#line 929 "parser.c"
 	goto st0;
 st5:
 	if ( ++p == pe )
@@ -942,7 +944,7 @@ case 5:
 	_out: {}
 	}
 
-#line 320 "parser.rl"
+#line 300 "parser.rl"
 
     if (cs >= JSON_integer_first_final) {
         long len = p - json->memo;
@@ -957,7 +959,7 @@ case 5:
 }
 
 
-#line 983 "parser.c"
+#line 963 "parser.c"
 enum {JSON_float_start = 1};
 enum {JSON_float_first_final = 8};
 enum {JSON_float_error = 0};
@@ -965,7 +967,7 @@ enum {JSON_float_error = 0};
 enum {JSON_float_en_main = 1};
 
 
-#line 345 "parser.rl"
+#line 325 "parser.rl"
 
 
 static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *result)
@@ -973,15 +975,15 @@ static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *resul
     int cs = EVIL;
 
 
-#line 999 "parser.c"
+#line 979 "parser.c"
 	{
 	cs = JSON_float_start;
 	}
 
-#line 352 "parser.rl"
+#line 332 "parser.rl"
     json->memo = p;
 
-#line 1007 "parser.c"
+#line 987 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -1039,14 +1041,14 @@ case 8:
 		goto st0;
 	goto tr9;
 tr9:
-#line 339 "parser.rl"
+#line 319 "parser.rl"
 	{ p--; {p++; cs = 9; goto _out;} }
 	goto st9;
 st9:
 	if ( ++p == pe )
 		goto _test_eof9;
 case 9:
-#line 1072 "parser.c"
+#line 1052 "parser.c"
 	goto st0;
 st5:
 	if ( ++p == pe )
@@ -1107,7 +1109,7 @@ case 7:
 	_out: {}
 	}
 
-#line 354 "parser.rl"
+#line 334 "parser.rl"
 
     if (cs >= JSON_float_first_final) {
         VALUE mod = Qnil;
@@ -1158,7 +1160,7 @@ case 7:
 
 
 
-#line 1184 "parser.c"
+#line 1164 "parser.c"
 enum {JSON_array_start = 1};
 enum {JSON_array_first_final = 17};
 enum {JSON_array_error = 0};
@@ -1166,7 +1168,7 @@ enum {JSON_array_error = 0};
 enum {JSON_array_en_main = 1};
 
 
-#line 432 "parser.rl"
+#line 412 "parser.rl"
 
 
 static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting)
@@ -1180,14 +1182,14 @@ static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *resul
     *result = NIL_P(array_class) ? rb_ary_new() : rb_class_new_instance(0, 0, array_class);
 
 
-#line 1206 "parser.c"
+#line 1186 "parser.c"
 	{
 	cs = JSON_array_start;
 	}
 
-#line 445 "parser.rl"
+#line 425 "parser.rl"
 
-#line 1213 "parser.c"
+#line 1193 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -1226,7 +1228,7 @@ case 2:
 		goto st2;
 	goto st0;
 tr2:
-#line 409 "parser.rl"
+#line 389 "parser.rl"
 	{
         VALUE v = Qnil;
         char *np = JSON_parse_value(json, p, pe, &v, current_nesting);
@@ -1246,7 +1248,7 @@ st3:
 	if ( ++p == pe )
 		goto _test_eof3;
 case 3:
-#line 1272 "parser.c"
+#line 1252 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st3;
 		case 32: goto st3;
@@ -1346,14 +1348,14 @@ case 12:
 		goto st3;
 	goto st12;
 tr4:
-#line 424 "parser.rl"
+#line 404 "parser.rl"
 	{ p--; {p++; cs = 17; goto _out;} }
 	goto st17;
 st17:
 	if ( ++p == pe )
 		goto _test_eof17;
 case 17:
-#line 1379 "parser.c"
+#line 1359 "parser.c"
 	goto st0;
 st13:
 	if ( ++p == pe )
@@ -1409,7 +1411,7 @@ case 16:
 	_out: {}
 	}
 
-#line 446 "parser.rl"
+#line 426 "parser.rl"
 
     if(cs >= JSON_array_first_final) {
         return p + 1;
@@ -1482,9 +1484,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
                         "incomplete unicode character escape sequence at '%s'", p
                       );
                     } else {
-                        UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+                        uint32_t ch = unescape_unicode((unsigned char *) ++pe);
                         pe += 3;
-                        if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+                        /* To handle values above U+FFFF, we take a sequence of
+                         * \uXXXX escapes in the U+D800..U+DBFF then
+                         * U+DC00..U+DFFF ranges, take the low 10 bits from each
+                         * to make a 20-bit number, then add 0x10000 to get the
+                         * final codepoint.
+                         *
+                         * See Unicode 15: §3.8 "Surrogates", §5.3 "Handling
+                         * Surrogate Pairs in UTF-16", and §23.6 "Surrogates
+                         * Area".
+                         */
+                        if ((ch & 0xFC00) == 0xD800) {
                             pe++;
                             if (pe > stringEnd - 6) {
                               if (bufferSize > MAX_STACK_BUFFER_SIZE) {
@@ -1496,7 +1508,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
                                 );
                             }
                             if (pe[0] == '\\' && pe[1] == 'u') {
-                                UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+                                uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
                                 ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
                                         | (sur & 0x3FF));
                                 pe += 5;
@@ -1566,7 +1578,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
 }
 
 
-#line 1592 "parser.c"
+#line 1582 "parser.c"
 enum {JSON_string_start = 1};
 enum {JSON_string_first_final = 8};
 enum {JSON_string_error = 0};
@@ -1574,7 +1586,7 @@ enum {JSON_string_error = 0};
 enum {JSON_string_en_main = 1};
 
 
-#line 620 "parser.rl"
+#line 610 "parser.rl"
 
 
 static int
@@ -1595,15 +1607,15 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
     VALUE match_string;
 
 
-#line 1621 "parser.c"
+#line 1611 "parser.c"
 	{
 	cs = JSON_string_start;
 	}
 
-#line 640 "parser.rl"
+#line 630 "parser.rl"
     json->memo = p;
 
-#line 1629 "parser.c"
+#line 1619 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -1628,7 +1640,7 @@ case 2:
 		goto st0;
 	goto st2;
 tr2:
-#line 607 "parser.rl"
+#line 597 "parser.rl"
 	{
         *result = json_string_unescape(json->memo + 1, p, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names);
         if (NIL_P(*result)) {
@@ -1638,14 +1650,14 @@ tr2:
             {p = (( p + 1))-1;}
         }
     }
-#line 617 "parser.rl"
+#line 607 "parser.rl"
 	{ p--; {p++; cs = 8; goto _out;} }
 	goto st8;
 st8:
 	if ( ++p == pe )
 		goto _test_eof8;
 case 8:
-#line 1671 "parser.c"
+#line 1661 "parser.c"
 	goto st0;
 st3:
 	if ( ++p == pe )
@@ -1721,7 +1733,7 @@ case 7:
 	_out: {}
 	}
 
-#line 642 "parser.rl"
+#line 632 "parser.rl"
 
     if (json->create_additions && RTEST(match_string = json->match_string)) {
           VALUE klass;
@@ -1755,6 +1767,7 @@ case 7:
 
 static VALUE convert_encoding(VALUE source)
 {
+#ifdef HAVE_RUBY_ENCODING_H
   rb_encoding *enc = rb_enc_get(source);
   if (enc == rb_ascii8bit_encoding()) {
     if (OBJ_FROZEN(source)) {
@@ -1764,7 +1777,8 @@ static VALUE convert_encoding(VALUE source)
   } else {
     source = rb_str_conv_enc(source, rb_enc_get(source), rb_utf8_encoding());
   }
-  return source;
+#endif
+    return source;
 }
 
 /*
@@ -1892,7 +1906,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
 }
 
 
-#line 1920 "parser.c"
+#line 1910 "parser.c"
 enum {JSON_start = 1};
 enum {JSON_first_final = 10};
 enum {JSON_error = 0};
@@ -1900,7 +1914,7 @@ enum {JSON_error = 0};
 enum {JSON_en_main = 1};
 
 
-#line 828 "parser.rl"
+#line 818 "parser.rl"
 
 
 /*
@@ -1918,16 +1932,16 @@ static VALUE cParser_parse(VALUE self)
   GET_PARSER;
 
 
-#line 1946 "parser.c"
+#line 1936 "parser.c"
 	{
 	cs = JSON_start;
 	}
 
-#line 845 "parser.rl"
+#line 835 "parser.rl"
   p = json->source;
   pe = p + json->len;
 
-#line 1955 "parser.c"
+#line 1945 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -1961,7 +1975,7 @@ st0:
 cs = 0;
 	goto _out;
 tr2:
-#line 820 "parser.rl"
+#line 810 "parser.rl"
 	{
         char *np = JSON_parse_value(json, p, pe, &result, 0);
         if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
@@ -1971,7 +1985,7 @@ st10:
 	if ( ++p == pe )
 		goto _test_eof10;
 case 10:
-#line 1999 "parser.c"
+#line 1989 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st10;
 		case 32: goto st10;
@@ -2060,7 +2074,7 @@ case 9:
 	_out: {}
 	}
 
-#line 848 "parser.rl"
+#line 838 "parser.rl"
 
   if (cs >= JSON_first_final && p == pe) {
     return result;
diff --git a/ext/json/parser/parser.h b/ext/json/parser/parser.h
index d80f1b7303..f6974461ae 100644
--- a/ext/json/parser/parser.h
+++ b/ext/json/parser/parser.h
@@ -48,8 +48,8 @@ typedef struct JSON_ParserStruct {
 #define MinusInfinity "-Infinity"
 #define EVIL 0x666
 
-static UTF32 unescape_unicode(const unsigned char *p);
-static int convert_UTF32_to_UTF8(char *buf, UTF32 ch);
+static uint32_t unescape_unicode(const unsigned char *p);
+static int convert_UTF32_to_UTF8(char *buf, uint32_t ch);
 static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting);
 static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting);
 static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result);
diff --git a/ext/json/parser/parser.rl b/ext/json/parser/parser.rl
index 873c1b3007..959b6e7384 100644
--- a/ext/json/parser/parser.rl
+++ b/ext/json/parser/parser.rl
@@ -20,26 +20,28 @@ static const signed char digit_values[256] = {
     -1, -1, -1, -1, -1, -1, -1
 };
 
-static UTF32 unescape_unicode(const unsigned char *p)
+static uint32_t unescape_unicode(const unsigned char *p)
 {
+    const uint32_t replacement_char = 0xFFFD;
+
     signed char b;
-    UTF32 result = 0;
+    uint32_t result = 0;
     b = digit_values[p[0]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[1]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[2]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[3]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     return result;
 }
 
-static int convert_UTF32_to_UTF8(char *buf, UTF32 ch)
+static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
 {
     int len = 1;
     if (ch <= 0x7F) {
@@ -493,9 +495,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
                         "incomplete unicode character escape sequence at '%s'", p
                       );
                     } else {
-                        UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+                        uint32_t ch = unescape_unicode((unsigned char *) ++pe);
                         pe += 3;
-                        if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+                        /* To handle values above U+FFFF, we take a sequence of
+                         * \uXXXX escapes in the U+D800..U+DBFF then
+                         * U+DC00..U+DFFF ranges, take the low 10 bits from each
+                         * to make a 20-bit number, then add 0x10000 to get the
+                         * final codepoint.
+                         *
+                         * See Unicode 15: §3.8 "Surrogates", §5.3 "Handling
+                         * Surrogate Pairs in UTF-16", and §23.6 "Surrogates
+                         * Area".
+                         */
+                        if ((ch & 0xFC00) == 0xD800) {
                             pe++;
                             if (pe > stringEnd - 6) {
                               if (bufferSize > MAX_STACK_BUFFER_SIZE) {
@@ -507,7 +519,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
                                 );
                             }
                             if (pe[0] == '\\' && pe[1] == 'u') {
-                                UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+                                uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
                                 ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
                                         | (sur & 0x3FF));
                                 pe += 5;
author	Luke T. Shumaker <[email protected]>	2024-02-22 20:51:28 -0700
committer	Hiroshi SHIBATA <[email protected]>	2024-10-08 14:10:05 +0900
commit	74d459fd52ef85f92f7c20819afcc4ffcf11714d (patch)
tree	c967d95e7b5f20bc32956087368571e831f7ded6 /ext/json
parent	6e47968929f2ee77376d28a6561266d8f8e3a4f7 (diff)