summaryrefslogtreecommitdiff
path: root/ext/json/generator/generator.c
diff options
context:
space:
mode:
authorJean Boussier <[email protected]>2025-01-03 19:41:08 +0100
committerHiroshi SHIBATA <[email protected]>2025-01-07 13:21:46 +0900
commitf756950d82dfd5049448a69f4fe4ddad5a7f7ff6 (patch)
tree1af940a8533b1f002e72785ea0a5bb6c47584e7e /ext/json/generator/generator.c
parentb176d4f52e4af67654814dab3e9c5f4bf9170e54 (diff)
Improve lookup tables for string escaping.
Introduce a simplified table for the most common case, which is `script_safe: false, ascii_only: false`. On the `script_safe` table, now only `0xE2` does a multi-byte check. Merge back `convert_ASCII_to_JSON`, as it no longer help much with the simplified escape table. ``` == Encoding mixed utf8 (5003001 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 38.000 i/100ms Calculating ------------------------------------- after 398.220 (± 3.0%) i/s (2.51 ms/i) - 2.014k in 5.061659s Comparison: before: 381.8 i/s after: 398.2 i/s - same-ish: difference falls within error == Encoding mostly utf8 (5001001 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 39.000 i/100ms Calculating ------------------------------------- after 393.337 (± 2.5%) i/s (2.54 ms/i) - 1.989k in 5.059397s Comparison: before: 304.3 i/s after: 393.3 i/s - 1.29x faster == Encoding twitter.json (466906 bytes) ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 244.000 i/100ms Calculating ------------------------------------- after 2.436k (± 0.9%) i/s (410.43 μs/i) - 12.200k in 5.007702s Comparison: before: 2125.9 i/s after: 2436.5 i/s - 1.15x faster ```
Diffstat (limited to 'ext/json/generator/generator.c')
-rw-r--r--ext/json/generator/generator.c187
1 files changed, 79 insertions, 108 deletions
diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c
index d5c8bfd42a..a76cf7d864 100644
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@@ -96,6 +96,73 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
raise_generator_error_str(invalid_object, str);
}
+// 0 - single byte char that don't need to be escaped.
+// (x | 8) - char that needs to be escaped.
+static const unsigned char CHAR_LENGTH_MASK = 7;
+
+static const unsigned char escape_table[256] = {
+ // ASCII Control Characters
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ // ASCII Characters
+ 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const unsigned char ascii_only_escape_table[256] = {
+ // ASCII Control Characters
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ // ASCII Characters
+ 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ // Continuation byte
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ // First byte of a 2-byte code point
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ // First byte of a 3-byte code point
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ //First byte of a 4+ byte code point
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
+};
+
+static const unsigned char script_safe_escape_table[256] = {
+ // ASCII Control Characters
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ // ASCII Characters
+ 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"' and '/'
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ // Continuation byte
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ // First byte of a 2-byte code point
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ // First byte of a 3-byte code point
+ 3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029
+ //First byte of a 4+ byte code point
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
+};
+
/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
@@ -106,13 +173,13 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
*
* - If out_ascii_only: non-ASCII characters (>0x7F)
*
- * - If out_script_safe: forwardslash, line separator (U+2028), and
+ * - If script_safe: forwardslash (/), line separator (U+2028), and
* paragraph separator (U+2029)
*
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
-static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
+static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -131,7 +198,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
- case 1: {
+ case 9: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -153,9 +220,9 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
}
break;
}
- case 3: {
+ case 11: {
unsigned char b2 = ptr[pos + 1];
- if (RB_UNLIKELY(out_script_safe && ch == 0xE2 && b2 == 0x80)) {
+ if (RB_UNLIKELY(b2 == 0x80)) {
unsigned char b3 = ptr[pos + 2];
if (b3 == 0xA8) {
FLUSH_POS(3);
@@ -167,6 +234,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
break;
}
}
+ ch_len = 3;
// fallthrough
}
default:
@@ -186,104 +254,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
RB_GC_GUARD(str);
}
-static const char escape_table[256] = {
- // ASCII Control Characters
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- // ASCII Characters
- 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- // Continuation byte
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- // First byte of a 2-byte code point
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- // First byte of a 4-byte code point
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- //First byte of a 4+byte code point
- 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
-};
-
-static const char script_safe_escape_table[256] = {
- // ASCII Control Characters
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- // ASCII Characters
- 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- // Continuation byte
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- // First byte of a 2-byte code point
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- // First byte of a 4-byte code point
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- //First byte of a 4+byte code point
- 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
-};
-
-static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
-{
- const char *hexdig = "0123456789abcdef";
- char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
-
- const char *ptr = RSTRING_PTR(str);
- unsigned long len = RSTRING_LEN(str);
-
- unsigned long beg = 0, pos;
-
- for (pos = 0; pos < len;) {
- unsigned char ch = ptr[pos];
- /* JSON encoding */
- if (escape_table[ch]) {
- if (pos > beg) {
- fbuffer_append(out_buffer, &ptr[beg], pos - beg);
- }
-
- beg = pos + 1;
- switch (ch) {
- case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
- case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
- case '/': fbuffer_append(out_buffer, "\\/", 2); break;
- case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
- case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
- case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
- case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
- case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
- default:
- scratch[2] = '0';
- scratch[3] = '0';
- scratch[4] = hexdig[(ch >> 4) & 0xf];
- scratch[5] = hexdig[ch & 0xf];
- fbuffer_append(out_buffer, scratch, 6);
- }
- }
-
- pos++;
- }
-
- if (beg < len) {
- fbuffer_append(out_buffer, &ptr[beg], len - beg);
- }
-
- RB_GC_GUARD(str);
-}
-
-static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
+static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -301,7 +272,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
- case 1: {
+ case 9: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
@@ -325,6 +296,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
}
default: {
uint32_t wchar = 0;
+ ch_len = ch_len & CHAR_LENGTH_MASK;
+
switch(ch_len) {
case 2:
wchar = ptr[pos] & 0x1F;
@@ -935,13 +908,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
switch(rb_enc_str_coderange(obj)) {
case ENC_CODERANGE_7BIT:
- convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
- break;
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
- convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
+ convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
} else {
- convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
+ convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
}
break;
default: