summaryrefslogtreecommitdiff
path: root/ext/json/generator/generator.c
diff options
context:
space:
mode:
author: Luke T. Shumaker <[email protected]> 2024-02-22 20:51:28 -0700
committer: Hiroshi SHIBATA <[email protected]> 2024-10-08 14:10:05 +0900
commit: 74d459fd52ef85f92f7c20819afcc4ffcf11714d (patch)
tree: c967d95e7b5f20bc32956087368571e831f7ded6 /ext/json/generator/generator.c
parent: 6e47968929f2ee77376d28a6561266d8f8e3a4f7 (diff)
[ruby/json] Adjust to the CVTUTF code being gone
I, Luke T. Shumaker, am the sole author of the added code. I did not reference CVTUTF when writing it. I did reference the Unicode standard (15.0.0), the Wikipedia article on UTF-8, and the Wikipedia article on UTF-16. When I saw some tests fail, I did reference the old deleted code (but a JSON-specific part, inherently not as based on CVTUTF) to determine that script_safe should also escape U+2028 and U+2029. I targeted simplicity and clarity when writing the code--it can likely be optimized. In my mind, the obvious next optimization is to have it combine contiguous non-escaped characters into just one call to fbuffer_append(), instead of calling fbuffer_append() for each character. Regarding the use of the "modern" types `uint32_t`, `uint16_t`, and `bool`: - ruby.h is guaranteed to give us uint32_t and uint16_t. - Since Ruby 3.0.0, ruby.h is guaranteed to give us bool... but we support down to Ruby 2.3. But, ruby.h is guaranteed to give us HAVE_STDBOOL_H for the C99 stdbool.h; so use that to include stdbool.h if we can, and if not then fall back to a copy of the same bool definition that Ruby 3.0.5 uses with C89. https://github.com/ruby/json/commit/c96351f874
Diffstat (limited to 'ext/json/generator/generator.c')
-rw-r--r-- ext/json/generator/generator.c 152
1 files changed, 108 insertions, 44 deletions
diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c
index d3f6516511..4d853ec343 100644
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@@ -18,50 +18,119 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before,
i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth,
i_buffer_initial_length, i_dup, i_script_safe, i_escape_slash, i_strict;
-/* Escapes the UTF16 character and stores the result in the buffer buf. */
-static void unicode_escape(char *buf, UTF16 character)
+/* Converts in_string to a JSON string (without the wrapping '"'
+ * characters) in FBuffer out_buffer.
+ *
+ * Characters are JSON-escaped according to:
+ *
+ * - Always: ASCII control characters (0x00-0x1F), dquote, and
+ * backslash.
+ *
+ * - If out_ascii_only: non-ASCII characters (>0x7F)
+ *
+ * - If out_script_safe: forwardslash, line separator (U+2028), and
+ * paragraph separator (U+2029)
+ *
+ * Everything else (should be UTF-8) is just passed through and
+ * appended to the result.
+ */
+static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
{
- const char *digits = "0123456789abcdef";
+ const char *hexdig = "0123456789abcdef";
+ char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
- buf[2] = digits[character >> 12];
- buf[3] = digits[(character >> 8) & 0xf];
- buf[4] = digits[(character >> 4) & 0xf];
- buf[5] = digits[character & 0xf];
-}
+ const char *in_utf8_str = RSTRING_PTR(in_string);
+ unsigned long in_utf8_len = RSTRING_LEN(in_string);
+ bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);
-/* Escapes the UTF16 character and stores the result in the buffer buf, then
- * the buffer buf is appended to the FBuffer buffer. */
-static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16
- character)
-{
- unicode_escape(buf, character);
- fbuffer_append(buffer, buf, 6);
-}
+ unsigned long pos;
-/* Converts string to a JSON string in FBuffer buffer, where all but the ASCII
- * and control characters are JSON escaped. */
-static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe)
-{
- const UTF8 *source = (UTF8 *) RSTRING_PTR(string);
- const UTF8 *sourceEnd = source + RSTRING_LEN(string);
- char buf[6] = { '\\', 'u' };
+ for (pos = 0; pos < in_utf8_len;) {
+ uint32_t ch;
+ unsigned long ch_len;
+ bool should_escape;
- RB_GC_GUARD(string);
-}
+ /* UTF-8 decoding */
+ if (in_is_ascii_only) {
+ ch = in_utf8_str[pos];
+ ch_len = 1;
+ } else {
+ short i;
+ if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
+ else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
+ else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
+ else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
+ else
+ rb_raise(rb_path2class("JSON::GeneratorError"),
+ "source sequence is illegal/malformed utf-8");
+ if ((pos+ch_len) > in_utf8_len)
+ rb_raise(rb_path2class("JSON::GeneratorError"),
+ "partial character in source, but hit end");
+ for (i = 1; i < ch_len; i++) {
+ if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
+ rb_raise(rb_path2class("JSON::GeneratorError"),
+ "source sequence is illegal/malformed utf-8");
+ ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
+ }
+ if (ch > 0x10FFFF)
+ rb_raise(rb_path2class("JSON::GeneratorError"),
+ "source sequence is illegal/malformed utf-8");
+ }
-/* Converts string to a JSON string in FBuffer buffer, where only the
- * characters required by the JSON standard are JSON escaped. The remaining
- * characters (should be UTF8) are just passed through and appended to the
- * result. */
-static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe)
-{
- const char *ptr = RSTRING_PTR(string), *p;
- unsigned long len = RSTRING_LEN(string), start = 0, end = 0;
- const char *escape = NULL;
- int escape_len;
- unsigned char c;
- char buf[6] = { '\\', 'u' };
- int ascii_only = rb_enc_str_asciionly_p(string);
+ /* JSON policy */
+ should_escape =
+ (ch < 0x20) ||
+ (ch == '"') ||
+ (ch == '\\') ||
+ (out_ascii_only && (ch > 0x7F)) ||
+ (out_script_safe && (ch == '/')) ||
+ (out_script_safe && (ch == 0x2028)) ||
+ (out_script_safe && (ch == 0x2029));
+
+ /* JSON encoding */
+ if (should_escape) {
+ switch (ch) {
+ case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
+ case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
+ case '/': fbuffer_append(out_buffer, "\\/", 2); break;
+ case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
+ case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
+ case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
+ case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
+ case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
+ default:
+ if (ch <= 0xFFFF) {
+ scratch[2] = hexdig[ch >> 12];
+ scratch[3] = hexdig[(ch >> 8) & 0xf];
+ scratch[4] = hexdig[(ch >> 4) & 0xf];
+ scratch[5] = hexdig[ch & 0xf];
+ fbuffer_append(out_buffer, scratch, 6);
+ } else {
+ uint16_t hi, lo;
+ ch -= 0x10000;
+ hi = 0xD800 + (uint16_t)(ch >> 10);
+ lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
+
+ scratch[2] = hexdig[hi >> 12];
+ scratch[3] = hexdig[(hi >> 8) & 0xf];
+ scratch[4] = hexdig[(hi >> 4) & 0xf];
+ scratch[5] = hexdig[hi & 0xf];
+
+ scratch[8] = hexdig[lo >> 12];
+ scratch[9] = hexdig[(lo >> 8) & 0xf];
+ scratch[10] = hexdig[(lo >> 4) & 0xf];
+ scratch[11] = hexdig[lo & 0xf];
+
+ fbuffer_append(out_buffer, scratch, 12);
+ }
+ }
+ } else {
+ fbuffer_append(out_buffer, &in_utf8_str[pos], ch_len);
+ }
+
+ pos += ch_len;
+ }
+ RB_GC_GUARD(in_string);
}
static char *fstrndup(const char *ptr, unsigned long len) {
@@ -698,12 +767,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
if (!enc_utf8_compatible_p(rb_enc_get(obj))) {
obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
}
-
- if (state->ascii_only) {
- convert_UTF8_to_JSON_ASCII(buffer, obj, state->script_safe);
- } else {
- convert_UTF8_to_JSON(buffer, obj, state->script_safe);
- }
+ convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);
fbuffer_append_char(buffer, '"');
}