From 259090c18bce2853c7cc536a98b8975f3586f13c Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 31 Jan 2025 22:39:40 -0600 Subject: [PATCH 01/13] Introduce ARM Neon SIMD. --- ext/json/ext/generator/extconf.rb | 19 ++ ext/json/ext/generator/generator.c | 287 ++++++++++++++++++++++++++++- ext/json/ext/generator/simd.h | 48 +++++ test/json/json_generator_test.rb | 48 ++++- 4 files changed, 399 insertions(+), 3 deletions(-) create mode 100644 ext/json/ext/generator/simd.h diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 078068cf6..109a73a99 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -6,5 +6,24 @@ else append_cflags("-std=c99") $defs << "-DJSON_GENERATOR" + + if enable_config('generator-use-simd', default=true) + if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ + # Try to compile a small program using NEON instructions + if have_header('arm_neon.h') + have_type('uint8x16_t', headers=['arm_neon.h']) && try_compile(<<~'SRC') + #include + int main() { + uint8x16_t test = vdupq_n_u8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + end + end + end + + create_header + create_makefile 'json/ext/generator' end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 1bd6af6ed..f8744666c 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -4,6 +4,8 @@ #include #include +#include "simd.h" + /* ruby api and some helpers */ typedef struct JSON_Generator_StateStruct { @@ -166,14 +168,36 @@ static const unsigned char script_safe_escape_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9, }; +#ifdef ENABLE_SIMD + +struct _simd_state { +#ifdef HAVE_SIMD_NEON + struct { + uint8x16x4_t escape_table[4]; + uint8x16x4_t script_safe_escape_table[4]; + } neon; +#endif /* HAVE_SIMD_NEON */ +}; + +static struct _simd_state simd_state; + +#endif /* ENABLE_SIMD */ typedef struct _search_state { const char *ptr; const char *end; const char *cursor; FBuffer *buffer; + +#ifdef ENABLE_SIMD + const char *returned_from; + unsigned char maybe_matches[16]; + unsigned long current_match_index; +#endif /* ENABLE_SIMD */ } search_state; +unsigned char (*search_escape_impl)(search_state *, const unsigned char escape_table[256]); + static inline void search_flush(search_state *search) { fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); @@ -208,6 +232,227 @@ static inline unsigned char search_escape(search_state *search, const unsigned c return 0; } +#ifdef ENABLE_SIMD +#ifdef HAVE_SIMD_NEON + +static inline unsigned char search_update_matches_neon_lut(search_state *search, uint8x16x4_t *tables) { + while (search->ptr + 16 < search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + + uint8x16_t tmp1 = vqtbl4q_u8(tables[0], chunk); + uint8x16_t tmp2 = vqtbl4q_u8(tables[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + + uint8x16_t result = vorrq_u8(tmp1, tmp2); + + // The top 128 bytes of the escape_table are all 0. + // TODO is this a safe to do? + if (tables == simd_state.neon.script_safe_escape_table) { + uint8x16_t tmp3 = vqtbl4q_u8(tables[2], veorq_u8(chunk, vdupq_n_u8(0x80))); + uint8x16_t tmp4 = vqtbl4q_u8(tables[3], veorq_u8(chunk, vdupq_n_u8(0xc0))); + result = vorrq_u8(result, vorrq_u8(tmp3, tmp4)); + } + + if (vmaxvq_u8(result) == 0) { + search->ptr += 16; + continue; + } + + vst1q_u8(search->maybe_matches, result); + return 1; + } + + return 0; +} + +static unsigned char search_update_matches_neon_rules(search_state *search, const unsigned char escape_table[256]) { + const uint8x16_t lower_bound = vdupq_n_u8(' '); + const uint8x16_t backslash = vdupq_n_u8('\\'); + const uint8x16_t dblquote = vdupq_n_u8('\"'); + + if (escape_table == script_safe_escape_table) { + /* + * This works almost exactly the same as what is described above. The difference in this case comes after we know + * there is a byte to be escaped. In the previous case, all bytes were handled the same way. In this case, however, + * some bytes need to be handled differently. + * + * Since we know each byte in chunk can only match a single case, we logical AND each of the has_backslash, + * has_dblquote, and has_forward_slash with a different bit (0x1, 0x2 and 0x4 respectively) and combine + * the results with a logical OR. + * + * Now we loop over the result vector and switch on the particular pattern we just created. If we find a + * case we don't know, we simply lookup the byte in the script_safe_escape_table to determine the correct + * action. + */ + const uint8x16_t upper_bound = vdupq_n_u8('~'); + const uint8x16_t forward_slash = vdupq_n_u8('/'); + + while (search->ptr+16 < search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); + + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); + + uint8x16_t needs_escape = vorrq_u8(too_low, too_high); + uint8x16_t has_escaped_char = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote)); + needs_escape = vorrq_u8(needs_escape, has_escaped_char); + + if (vmaxvq_u8(needs_escape) == 0) { + search->ptr += 16; + continue; + } + + for(int i=0; i<16; i++) { + unsigned char ch = *(search->ptr+i); + search->maybe_matches[i] = escape_table[ch]; + } + + return 1; + } + } else { + /* + * The code below implements an SIMD-based algorithm to determine if N bytes at a time + * need to be escaped. + * + * Assume the ptr = "Te\sting!" (the double quotes are included in the string) + * + * The explanination will be limited to the first 8 bytes of the string for simplicity. However + * the vector insructions may work on larger vectors. + * + * First, we load three constants 'lower_bound', 'backslash' and 'dblquote" in vector registers. + * + * lower_bound: [20 20 20 20 20 20 20 20] + * backslash: [5C 5C 5C 5C 5C 5C 5C 5C] + * dblquote: [22 22 22 22 22 22 22 22] + * + * Next we load the first chunk of the ptr: + * [22 54 65 5C 73 74 69 6E] (" T e \ s t i n) + * + * First we check if any byte in chunk is less than 32 (0x20). This returns the following vector + * as no bytes are less than 32 (0x20): + * [0 0 0 0 0 0 0 0] + * + * Next, we check if any byte in chunk is equal to a backslash: + * [0 0 0 FF 0 0 0 0] + * + * Finally we check if any byte in chunk is equal to a double quote: + * [FF 0 0 0 0 0 0 0] + * + * Now we have three vectors where each byte indicates if the corresponding byte in chunk + * needs to be escaped. We combine these vectors with a series of logical OR instructions. + * This is the needs_escape vector and it is equal to: + * [FF 0 0 FF 0 0 0 0] + * + * For ARM Neon specifically, we check if the maximum number in the vector is 0. The maximum of + * the needs_escape vector is FF. Therefore, we know there is at least one byte that needs to be + * escaped. + * + * If the maximum of the needs_escape vector is 0, none of the bytes need to be escaped and + * we advance pos by the width of the vector. + * + * To determine how to escape characters, we look at each value in the needs_escape vector and take + * the appropriate action. + */ + while (search->ptr+16 < search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + + if (vmaxvq_u8(needs_escape) == 0) { + search->ptr += 16; + continue; + } + + for(int i=0; i<16; i++) { + unsigned char ch = *(search->ptr+i); + search->maybe_matches[i] = escape_table[ch]; + } + + return 1; + } + } + return 0; +} + +// TODO This can likely be made generic if we know the stride width of the vector. +static inline unsigned char search_return_next_match_neon(search_state *search) { + for(; search->current_match_index < 16 && search->ptr < search->end; ) { + unsigned char ch_len = search->maybe_matches[search->current_match_index]; + + if (RB_UNLIKELY(ch_len)) { + if (ch_len & ESCAPE_MASK) { + if (RB_UNLIKELY(ch_len == 11)) { + const unsigned char *uptr = (const unsigned char *)search->ptr; + if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) { + search->ptr += 3; + search->current_match_index += 3; + continue; + } + } + search->returned_from = search->ptr; + search_flush(search); + return ch_len & CHAR_LENGTH_MASK; + } else { + search->ptr += ch_len; + search->current_match_index += ch_len; + } + } else { + search->ptr++; + search->current_match_index++; + } + } + return 0; +} + +// TODO This can likely be made generic if we know the stride width of the vector and make the SIMD kernel a function pointer and which lookup tables to use. +static inline unsigned char search_escape_neon(search_state *search, const unsigned char escape_table[256]) +{ + if (RB_UNLIKELY(search->returned_from != NULL)) { + search->current_match_index += (search->ptr - search->returned_from); + search->returned_from = NULL; + unsigned char ch_len = search_return_next_match_neon(search); + if (RB_UNLIKELY(ch_len)) { + return ch_len; + } + } + + uint8x16x4_t *tables; + if (escape_table == script_safe_escape_table) { + tables = simd_state.neon.script_safe_escape_table; + } else { + tables = simd_state.neon.escape_table; + } + + while (search->ptr + 16 < search->end) { + if (!search_update_matches_neon_lut(search, tables)) { + break; + } + + // if (!search_update_matches_neon_rules(search, escape_table)) { + // break; + // } + + search->current_match_index=0; + unsigned char ch_len = search_return_next_match_neon(search); + if (RB_UNLIKELY(ch_len)) { + return ch_len; + } + } + + if (search->ptr < search->end) { + return search_escape(search, escape_table); + } + + search_flush(search); + return 0; +} +#endif /* HAVE_SIMD_NEON */ +#endif /* ENABLE_SIMD */ + static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_len) { const unsigned char ch = (unsigned char)*search->ptr; switch (ch_len) { @@ -263,7 +508,7 @@ static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_ static inline void convert_UTF8_to_JSON(search_state *search, const unsigned char escape_table[256]) { unsigned char ch_len; - while ((ch_len = search_escape(search, escape_table))) { + while ((ch_len = search_escape_impl(search, escape_table))) { fast_escape_UTF8_char(search, ch_len); } } @@ -929,6 +1174,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat search.cursor = search.ptr; search.end = search.ptr + len; +#ifdef ENABLE_SIMD + search.current_match_index = 0; + search.returned_from = NULL; +#endif /* ENABLE_SIMD */ + switch(rb_enc_str_coderange(obj)) { case ENC_CODERANGE_7BIT: case ENC_CODERANGE_VALID: @@ -1088,6 +1338,25 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) return Qundef; } +/* SIMD Utilities (if enabled) */ +#ifdef ENABLE_SIMD + +#ifdef HAVE_SIMD_NEON +static void initialize_simd_neon(void) { + simd_state.neon.escape_table[0] = load_uint8x16_4(escape_table, 0); + simd_state.neon.escape_table[1] = load_uint8x16_4(escape_table, 64); + simd_state.neon.escape_table[2] = load_uint8x16_4(escape_table, 128); + simd_state.neon.escape_table[3] = load_uint8x16_4(escape_table, 192); + + simd_state.neon.script_safe_escape_table[0] = load_uint8x16_4(script_safe_escape_table, 0); + simd_state.neon.script_safe_escape_table[1] = load_uint8x16_4(script_safe_escape_table, 64); + simd_state.neon.script_safe_escape_table[2] = load_uint8x16_4(script_safe_escape_table, 128); + simd_state.neon.script_safe_escape_table[3] = load_uint8x16_4(script_safe_escape_table, 192); +} +#endif /* HAVE_NEON_SIMD */ + +#endif + static VALUE cState_partial_generate(VALUE self, VALUE obj, generator_func func, VALUE io) { GET_STATE(self); @@ -1744,4 +2013,20 @@ void Init_generator(void) binary_encindex = rb_ascii8bit_encindex(); rb_require("json/ext/generator/state"); + + + switch(find_simd_implementation()) { +#ifdef ENABLE_SIMD +#ifdef HAVE_SIMD_NEON + case SIMD_NEON: + /* Initialize ARM Neon SIMD Implementation. */ + initialize_simd_neon(); + search_escape_impl = search_escape_neon; + break; +#endif /* HAVE_SIMD_NEON */ +#endif /* ENABLE_SIMD */ + default: + search_escape_impl = search_escape; + break; + } } diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h new file mode 100644 index 000000000..ba6e57b4b --- /dev/null +++ b/ext/json/ext/generator/simd.h @@ -0,0 +1,48 @@ +#include "extconf.h" + +typedef enum { + SIMD_NONE, + SIMD_NEON, +} SIMD_Implementation; + +#ifdef ENABLE_SIMD + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) +#include + +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 +SIMD_Implementation find_simd_implementation() { + return SIMD_NEON; +} + +#define HAVE_SIMD_NEON 1 + +uint8x16x4_t load_uint8x16_4(const unsigned char *table, int offset) { + uint8x16x4_t tab; + for(int i=0; i<4; i++) { + tab.val[i] = vld1q_u8(table+offset+(i*16)); + } + return tab; +} + +void print_uint8x16(char *msg, uint8x16_t vec) { + printf("%s\n[ ", msg); + uint8_t store[16] = {0}; + vst1q_u8(store, vec); + for(int i=0; i<16; i++) { + printf("%3d ", store[i]); + } + printf("]\n"); +} + +#endif /* ARM Neon Support.*/ + +/* Other SIMD implementation checks here. */ + +#endif /* ENABLE_SIMD */ + +#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED +SIMD_Implementation find_simd_implementation(void) { + return SIMD_NONE; +} +#endif \ No newline at end of file diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index d97f0505f..f4621fa2b 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -427,18 +427,34 @@ def test_backslash json = '["\\\\.(?i:gif|jpe?g|png)$"]' assert_equal json, generate(data) # - data = [ '\\"' ] - json = '["\\\\\""]' + data = [ '\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$' ] + json = '["\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$"]' + assert_equal json, generate(data) + # + data = [ '\\"\\"\\"\\"\\"\\"\\"\\"\\"\\"\\"' ] + json = '["\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\""]' assert_equal json, generate(data) # data = [ '/' ] json = '["/"]' assert_equal json, generate(data) # + data = [ '////////////////////////////////////////////////////////////////////////////////////' ] + json = '["////////////////////////////////////////////////////////////////////////////////////"]' + assert_equal json, generate(data) + # data = [ '/' ] json = '["\/"]' assert_equal json, generate(data, :script_safe => true) # + data = [ '///////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # + data = [ '///////////////////////////////////////////////////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # data = [ "\u2028\u2029" ] json = '["\u2028\u2029"]' assert_equal json, generate(data, :script_safe => true) @@ -455,6 +471,10 @@ def test_backslash json = '["\""]' assert_equal json, generate(data) # + data = ['"""""""""""""""""""""""""'] + json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]' + assert_equal json, generate(data) + # data = ["'"] json = '["\\\'"]' assert_equal '["\'"]', generate(data) @@ -462,6 +482,30 @@ def test_backslash data = ["倩", "瀨"] json = '["倩","瀨"]' assert_equal json, generate(data, script_safe: true) + # + data = '["This is a "test" of the emergency broadcast system."]' + json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\"" + assert_equal json, generate(data) + # + data = '\tThis is a test of the emergency broadcast system.' + json = "\"\\\\tThis is a test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This\tis a test of the emergency broadcast system.' + json = "\"This\\\\tis a test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This is\ta test of the emergency broadcast system.' + json = "\"This is\\\\ta test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This is a test of the emergency broadcast\tsystem.' + json = "\"This is a test of the emergency broadcast\\\\tsystem.\"" + assert_equal json, generate(data) + # + data = 'This is a test of the emergency broadcast\tsystem.\n' + json = "\"This is a test of the emergency broadcast\\\\tsystem.\\\\n\"" + assert_equal json, generate(data) end def test_string_subclass From 9ad196e996249fe4fb04e625059e54251646e64d Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Tue, 4 Feb 2025 20:23:08 -0600 Subject: [PATCH 02/13] Use the 'rules' implementation instead of the lookup table implementation. Also store the potential matches directly rather than looking up values in the escape table. --- ext/json/ext/generator/generator.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index f8744666c..a6a2e5dfd 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -303,7 +303,7 @@ static unsigned char search_update_matches_neon_rules(search_state *search, cons search->ptr += 16; continue; } - + for(int i=0; i<16; i++) { unsigned char ch = *(search->ptr+i); search->maybe_matches[i] = escape_table[ch]; @@ -367,10 +367,8 @@ static unsigned char search_update_matches_neon_rules(search_state *search, cons continue; } - for(int i=0; i<16; i++) { - unsigned char ch = *(search->ptr+i); - search->maybe_matches[i] = escape_table[ch]; - } + uint8x16_t maybe_matches = vandq_u8(needs_escape, vdupq_n_u8(0x9)); + vst1q_u8(search->maybe_matches, maybe_matches); return 1; } @@ -428,14 +426,14 @@ static inline unsigned char search_escape_neon(search_state *search, const unsig } while (search->ptr + 16 < search->end) { - if (!search_update_matches_neon_lut(search, tables)) { - break; - } - - // if (!search_update_matches_neon_rules(search, escape_table)) { + // if (!search_update_matches_neon_lut(search, tables)) { // break; // } + if (!search_update_matches_neon_rules(search, escape_table)) { + break; + } + search->current_match_index=0; unsigned char ch_len = search_return_next_match_neon(search); if (RB_UNLIKELY(ch_len)) { From d8a2e56f391c17e7a5f20b5aa34bcb599acc6243 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Tue, 4 Feb 2025 21:11:26 -0600 Subject: [PATCH 03/13] Refactoring and simplifications. --- ext/json/ext/generator/generator.c | 100 ++++++++++++++--------------- 1 file changed, 47 insertions(+), 53 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 75222a9fa..de9619397 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -253,12 +253,42 @@ static struct _simd_state simd_state; #ifdef ENABLE_SIMD #ifdef HAVE_SIMD_NEON -static inline unsigned char search_update_matches_neon_lut(search_state *search, uint8x16x4_t *tables) { +// TODO This can likely be made generic if we know the stride width of the vector. +static inline unsigned char search_escape_basic_neon_next_match(search_state *search) { + for(; search->current_match_index < 16 && search->ptr < search->end; ) { + unsigned char ch_len = search->maybe_matches[search->current_match_index]; + + if (RB_UNLIKELY(ch_len)) { + if (ch_len & ESCAPE_MASK) { + if (RB_UNLIKELY(ch_len == 11)) { + const unsigned char *uptr = (const unsigned char *)search->ptr; + if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) { + search->ptr += 3; + search->current_match_index += 3; + continue; + } + } + search->returned_from = search->ptr; + search_flush(search); + return ch_len & CHAR_LENGTH_MASK; + } else { + search->ptr += ch_len; + search->current_match_index += ch_len; + } + } else { + search->ptr++; + search->current_match_index++; + } + } + return 0; +} + +static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { while (search->ptr + 16 < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t tmp1 = vqtbl4q_u8(tables[0], chunk); - uint8x16_t tmp2 = vqtbl4q_u8(tables[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table[0], chunk); + uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table[1], veorq_u8(chunk, vdupq_n_u8(0x40))); uint8x16_t result = vorrq_u8(tmp1, tmp2); @@ -268,13 +298,15 @@ static inline unsigned char search_update_matches_neon_lut(search_state *search, } vst1q_u8(search->maybe_matches, result); - return 1; + + search->current_match_index=0; + return search_escape_basic_neon_next_match(search); } return 0; } -static unsigned char search_update_matches_neon_rules(search_state *search) { +static unsigned char search_escape_basic_neon_advance_rules(search_state *search) { const uint8x16_t lower_bound = vdupq_n_u8(' '); const uint8x16_t backslash = vdupq_n_u8('\\'); const uint8x16_t dblquote = vdupq_n_u8('\"'); @@ -337,72 +369,34 @@ static unsigned char search_update_matches_neon_rules(search_state *search) { uint8x16_t maybe_matches = vandq_u8(needs_escape, vdupq_n_u8(0x9)); vst1q_u8(search->maybe_matches, maybe_matches); - return 1; + search->current_match_index=0; + return search_escape_basic_neon_next_match(search); } return 0; } -// TODO This can likely be made generic if we know the stride width of the vector. -static inline unsigned char search_return_next_match_neon(search_state *search) { - for(; search->current_match_index < 16 && search->ptr < search->end; ) { - unsigned char ch_len = search->maybe_matches[search->current_match_index]; - - if (RB_UNLIKELY(ch_len)) { - if (ch_len & ESCAPE_MASK) { - if (RB_UNLIKELY(ch_len == 11)) { - const unsigned char *uptr = (const unsigned char *)search->ptr; - if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) { - search->ptr += 3; - search->current_match_index += 3; - continue; - } - } - search->returned_from = search->ptr; - search_flush(search); - return ch_len & CHAR_LENGTH_MASK; - } else { - search->ptr += ch_len; - search->current_match_index += ch_len; - } - } else { - search->ptr++; - search->current_match_index++; - } - } - return 0; -} - // TODO This can likely be made generic if we know the stride width of the vector and make the SIMD kernel a function pointer and which lookup tables to use. static inline unsigned char search_escape_basic_neon(search_state *search) { if (RB_UNLIKELY(search->returned_from != NULL)) { search->current_match_index += (search->ptr - search->returned_from); search->returned_from = NULL; - unsigned char ch_len = search_return_next_match_neon(search); + unsigned char ch_len = search_escape_basic_neon_next_match(search); if (RB_UNLIKELY(ch_len)) { return ch_len; } } - // uint8x16x4_t *tables = simd_state.neon.escape_table; - - while (search->ptr + 16 < search->end) { - // if (!search_update_matches_neon_lut(search, tables)) { - // break; - // } - - if (!search_update_matches_neon_rules(search)) { - break; - } - - search->current_match_index=0; - unsigned char ch_len = search_return_next_match_neon(search); - if (RB_UNLIKELY(ch_len)) { - return ch_len; - } + unsigned char ch_len; + if ((ch_len = search_escape_basic_neon_advance_lut(search)) != 0) { + return ch_len; } + // if ((ch_len = search_escape_basic_neon_advance_rules(search)) != 0) { + // return ch_len; + // } + if (search->ptr < search->end) { return search_escape_basic(search); } From 89ba0be1038fe3c0f8c20c0f0c53e0a597db31c7 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 5 Feb 2025 20:46:43 -0600 Subject: [PATCH 04/13] Load the SIMD lookup table explicitly without loops. --- ext/json/ext/generator/generator.c | 8 ++++---- ext/json/ext/generator/simd.h | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index de9619397..a33fa0335 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -1325,10 +1325,10 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) #ifdef HAVE_SIMD_NEON static void initialize_simd_neon(void) { - simd_state.neon.escape_table[0] = load_uint8x16_4(escape_table_basic, 0); - simd_state.neon.escape_table[1] = load_uint8x16_4(escape_table_basic, 64); - simd_state.neon.escape_table[2] = load_uint8x16_4(escape_table_basic, 128); - simd_state.neon.escape_table[3] = load_uint8x16_4(escape_table_basic, 192); + simd_state.neon.escape_table[0] = load_uint8x16_4(escape_table_basic); + simd_state.neon.escape_table[1] = load_uint8x16_4(escape_table_basic+64); + simd_state.neon.escape_table[2] = load_uint8x16_4(escape_table_basic+128); + simd_state.neon.escape_table[3] = load_uint8x16_4(escape_table_basic+192); } #endif /* HAVE_NEON_SIMD */ diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index ba6e57b4b..11332ee15 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -17,11 +17,12 @@ SIMD_Implementation find_simd_implementation() { #define HAVE_SIMD_NEON 1 -uint8x16x4_t load_uint8x16_4(const unsigned char *table, int offset) { +uint8x16x4_t load_uint8x16_4(const unsigned char *table) { uint8x16x4_t tab; - for(int i=0; i<4; i++) { - tab.val[i] = vld1q_u8(table+offset+(i*16)); - } + tab.val[0] = vld1q_u8(table); + tab.val[1] = vld1q_u8(table+16); + tab.val[2] = vld1q_u8(table+32); + tab.val[3] = vld1q_u8(table+48); return tab; } From a23b84e1da0277ca5cb6819853cf37ea66a5cc8f Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Wed, 5 Feb 2025 21:05:20 -0600 Subject: [PATCH 05/13] Use only 2 64-byte lookup tables for the neon escape_table_basic as we only need 128 bytes for the lookup table as the top 128 bytes are all zeros. --- ext/json/ext/generator/generator.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index a33fa0335..0525975b4 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -241,7 +241,7 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) struct _simd_state { #ifdef HAVE_SIMD_NEON struct { - uint8x16x4_t escape_table[4]; + uint8x16x4_t escape_table_basic[2]; } neon; #endif /* HAVE_SIMD_NEON */ }; @@ -287,8 +287,8 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s while (search->ptr + 16 < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table[0], chunk); - uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); + uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); uint8x16_t result = vorrq_u8(tmp1, tmp2); @@ -1325,10 +1325,8 @@ static VALUE generate_json_rescue(VALUE d, VALUE exc) #ifdef HAVE_SIMD_NEON static void initialize_simd_neon(void) { - simd_state.neon.escape_table[0] = load_uint8x16_4(escape_table_basic); - simd_state.neon.escape_table[1] = load_uint8x16_4(escape_table_basic+64); - simd_state.neon.escape_table[2] = load_uint8x16_4(escape_table_basic+128); - simd_state.neon.escape_table[3] = load_uint8x16_4(escape_table_basic+192); + simd_state.neon.escape_table_basic[0] = load_uint8x16_4(escape_table_basic); + simd_state.neon.escape_table_basic[1] = load_uint8x16_4(escape_table_basic+64); } #endif /* HAVE_NEON_SIMD */ From 5506091c7c961eebfa08879af1bc23c28061b213 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 9 Feb 2025 20:45:03 -0600 Subject: [PATCH 06/13] Simplifications. --- ext/json/ext/generator/generator.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 0525975b4..d70654c2b 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -259,22 +259,9 @@ static inline unsigned char search_escape_basic_neon_next_match(search_state *se unsigned char ch_len = search->maybe_matches[search->current_match_index]; if (RB_UNLIKELY(ch_len)) { - if (ch_len & ESCAPE_MASK) { - if (RB_UNLIKELY(ch_len == 11)) { - const unsigned char *uptr = (const unsigned char *)search->ptr; - if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) { - search->ptr += 3; - search->current_match_index += 3; - continue; - } - } - search->returned_from = search->ptr; - search_flush(search); - return ch_len & CHAR_LENGTH_MASK; - } else { - search->ptr += ch_len; - search->current_match_index += ch_len; - } + search->returned_from = search->ptr; + search_flush(search); + return 1; } else { search->ptr++; search->current_match_index++; From 3ae56773b3068ee1709aab08071aba1a05bda72d Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 9 Feb 2025 21:08:33 -0600 Subject: [PATCH 07/13] A few more cleanups. --- ext/json/ext/generator/generator.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d70654c2b..a91b2f69d 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -369,19 +369,17 @@ static inline unsigned char search_escape_basic_neon(search_state *search) if (RB_UNLIKELY(search->returned_from != NULL)) { search->current_match_index += (search->ptr - search->returned_from); search->returned_from = NULL; - unsigned char ch_len = search_escape_basic_neon_next_match(search); - if (RB_UNLIKELY(ch_len)) { - return ch_len; + if (RB_UNLIKELY(search_escape_basic_neon_next_match(search))) { + return 1; } } - unsigned char ch_len; - if ((ch_len = search_escape_basic_neon_advance_lut(search)) != 0) { - return ch_len; + if (search_escape_basic_neon_advance_lut(search)) { + return 1; } - // if ((ch_len = search_escape_basic_neon_advance_rules(search)) != 0) { - // return ch_len; + // if (search_escape_basic_neon_advance_rules(search)) { + // return 1; // } if (search->ptr < search->end) { From 332107dbf40634faa0bc5aeeda3375f3f17dbd97 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 23 Mar 2025 21:38:59 -0400 Subject: [PATCH 08/13] Use SIMD for fewer than 16 characters (but at least 8) remaining. --- ext/json/ext/generator/generator.c | 108 ++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 17 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index a91b2f69d..6b0488667 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -120,8 +120,10 @@ typedef struct _search_state { static inline void search_flush(search_state *search) { - fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); - search->cursor = search->ptr; + if (search->cursor < search->ptr) { + fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); + search->cursor = search->ptr; + } } static const unsigned char escape_table_basic[256] = { @@ -270,14 +272,19 @@ static inline unsigned char search_escape_basic_neon_next_match(search_state *se return 0; } -static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { - while (search->ptr + 16 < search->end) { - uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); +static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { + uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); + uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); - uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); - uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); + uint8x16_t result = vorrq_u8(tmp1, tmp2); + return result; +} - uint8x16_t result = vorrq_u8(tmp1, tmp2); + +static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { + while (search->ptr + 16 < search->end) { + uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); + uint8x16_t result = neon_lut_update(chunk); if (vmaxvq_u8(result) == 0) { search->ptr += 16; @@ -290,14 +297,52 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s return search_escape_basic_neon_next_match(search); } + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= 8) { + // Flush the buffer so everything up until the last 'remaining' characters are unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, 16); + + char *s = (buf->ptr + buf->len); + + memset(s, 'X', 16); + + // Optimistically copy the remaining characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + memcpy(s, search->ptr, remaining); + + uint8x16_t chunk = vld1q_u8((const unsigned char *) s); + uint8x16_t result = neon_lut_update(chunk); + if (vmaxvq_u8(result) == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. + buf->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + } + return 0; } -static unsigned char search_escape_basic_neon_advance_rules(search_state *search) { +static inline uint8x16_t neon_rules_update(uint8x16_t chunk) { const uint8x16_t lower_bound = vdupq_n_u8(' '); const uint8x16_t backslash = vdupq_n_u8('\\'); const uint8x16_t dblquote = vdupq_n_u8('\"'); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + + return needs_escape; +} + +static unsigned char search_escape_basic_neon_advance_rules(search_state *search) { /* * The code below implements an SIMD-based algorithm to determine if N bytes at a time * need to be escaped. @@ -343,10 +388,7 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search */ while (search->ptr+16 < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); - uint8x16_t too_low = vcltq_u8(chunk, lower_bound); - uint8x16_t has_backslash = vceqq_u8(chunk, backslash); - uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); - uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + uint8x16_t needs_escape = neon_rules_update(chunk); if (vmaxvq_u8(needs_escape) == 0) { search->ptr += 16; @@ -360,6 +402,35 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search return search_escape_basic_neon_next_match(search); } + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= 8) { + // Flush the buffer so everything up until the last 'remaining' characters are unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, 16); + + char *s = (buf->ptr + buf->len); + + memset(s, 'X', 16); + + // Optimistically copy the remaining characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + memcpy(s, search->ptr, remaining); + + uint8x16_t chunk = vld1q_u8((const unsigned char *) s); + uint8x16_t result = neon_rules_update(chunk); + if (vmaxvq_u8(result) == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. + buf->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + } + return 0; } @@ -374,14 +445,17 @@ static inline unsigned char search_escape_basic_neon(search_state *search) } } - if (search_escape_basic_neon_advance_lut(search)) { - return 1; - } + // TODO Pick an implementation or make them configurable. Right now it looks like the "rules" based approach + // might be a bit faster. - // if (search_escape_basic_neon_advance_rules(search)) { + // if (search_escape_basic_neon_advance_lut(search)) { // return 1; // } + if (search_escape_basic_neon_advance_rules(search)) { + return 1; + } + if (search->ptr < search->end) { return search_escape_basic(search); } From a47ffa02cf53d9b2b08b60a381d1cfda24ad13fa Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 4 Apr 2025 22:05:33 -0500 Subject: [PATCH 09/13] Add x86-64 SSE2 support with runtime detection. --- ext/json/ext/generator/extconf.rb | 12 +++ ext/json/ext/generator/generator.c | 135 ++++++++++++++++++++++++++--- ext/json/ext/generator/simd.h | 44 +++++++++- 3 files changed, 178 insertions(+), 13 deletions(-) diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 109a73a99..65f87434b 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -21,6 +21,18 @@ $defs.push("-DENABLE_SIMD") end end + + if have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-msse2') + #include + int main() { + __m128i test = _mm_set1_epi8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + end + + have_header('cpuid.h') end create_header diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 6b0488667..d2a69c23a 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -115,6 +115,7 @@ typedef struct _search_state { const char *returned_from; unsigned char maybe_matches[16]; unsigned long current_match_index; + unsigned long maybe_match_length; #endif /* ENABLE_SIMD */ } search_state; @@ -240,24 +241,22 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) #ifdef ENABLE_SIMD -struct _simd_state { #ifdef HAVE_SIMD_NEON +struct _simd_state { + struct { uint8x16x4_t escape_table_basic[2]; } neon; -#endif /* HAVE_SIMD_NEON */ }; static struct _simd_state simd_state; - +#endif /* HAVE_SIMD_NEON */ #endif /* ENABLE_SIMD */ #ifdef ENABLE_SIMD -#ifdef HAVE_SIMD_NEON - // TODO This can likely be made generic if we know the stride width of the vector. -static inline unsigned char search_escape_basic_neon_next_match(search_state *search) { - for(; search->current_match_index < 16 && search->ptr < search->end; ) { +static inline unsigned char search_escape_basic_simd_next_match(search_state *search) { + for(; search->current_match_index < search->maybe_match_length && search->ptr < search->end; ) { unsigned char ch_len = search->maybe_matches[search->current_match_index]; if (RB_UNLIKELY(ch_len)) { @@ -272,6 +271,8 @@ static inline unsigned char search_escape_basic_neon_next_match(search_state *se return 0; } +#ifdef HAVE_SIMD_NEON + static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); @@ -293,8 +294,9 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s vst1q_u8(search->maybe_matches, result); - search->current_match_index=0; - return search_escape_basic_neon_next_match(search); + search->current_match_index = 0; + search->maybe_match_length = sizeof(uint8x16_t); + return search_escape_basic_simd_next_match(search); } // There are fewer than 16 bytes left. @@ -398,8 +400,9 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search uint8x16_t maybe_matches = vandq_u8(needs_escape, vdupq_n_u8(0x9)); vst1q_u8(search->maybe_matches, maybe_matches); - search->current_match_index=0; - return search_escape_basic_neon_next_match(search); + search->current_match_index = 0; + search->maybe_match_length = sizeof(uint8x16_t); + return search_escape_basic_simd_next_match(search); } // There are fewer than 16 bytes left. @@ -440,7 +443,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) if (RB_UNLIKELY(search->returned_from != NULL)) { search->current_match_index += (search->ptr - search->returned_from); search->returned_from = NULL; - if (RB_UNLIKELY(search_escape_basic_neon_next_match(search))) { + if (RB_UNLIKELY(search_escape_basic_simd_next_match(search))) { return 1; } } @@ -464,6 +467,109 @@ static inline unsigned char search_escape_basic_neon(search_state *search) return 0; } #endif /* HAVE_SIMD_NEON */ + +#ifdef HAVE_SIMD_SSE2 + +#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) + +#ifdef __GNUC__ +#pragma GCC push_options +#pragma GCC target ("sse2") +#endif /* __GNUC__ */ + +#ifdef __clang__ +__attribute__((target("sse2"))) +#endif /* __clang__ */ +static unsigned char search_escape_basic_sse2(search_state *search) { + if (RB_UNLIKELY(search->returned_from != NULL)) { + search->current_match_index += (search->ptr - search->returned_from); + search->returned_from = NULL; + if (RB_UNLIKELY(search_escape_basic_simd_next_match(search))) { + return 1; + } + } + + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + + while (search->ptr+sizeof(__m128i) < search->end) { + __m128i chunk = _mm_loadu_si128((__m128i const*)search->ptr); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + search->ptr += sizeof(__m128i); + continue; + } + + __m128i nines = _mm_set1_epi8(9); + __m128i maybe_matches = _mm_and_si128(needs_escape, nines); + + _mm_storeu_si128((__m128i *)search->maybe_matches, maybe_matches); + + search->current_match_index = 0; + search->maybe_match_length = sizeof(__m128i); + return search_escape_basic_simd_next_match(search); + } + + + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= 8) { + // Flush the buffer so everything up until the last 'remaining' characters are unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, 16); + + char *s = (buf->ptr + buf->len); + + memset(s, 'X', 16); + + // Optimistically copy the remaining characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + memcpy(s, search->ptr, remaining); + + __m128i chunk = _mm_loadu_si128((__m128i const *) s); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. + buf->len += remaining; + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + } + + if (search->ptr < search->end) { + return search_escape_basic(search); + } + + search_flush(search); + return 0; +} + +#ifdef __GNUC__ +#pragma GCC reset_options +#endif /* __GNUC__ */ + +#endif /* HAVE_SIMD_SSE2 */ + #endif /* ENABLE_SIMD */ static const unsigned char script_safe_escape_table[256] = { @@ -2058,6 +2164,11 @@ void Init_generator(void) search_escape_basic_impl = search_escape_basic_neon; break; #endif /* HAVE_SIMD_NEON */ +#ifdef HAVE_SIMD_SSE2 + case SIMD_SSE2: + search_escape_basic_impl = search_escape_basic_sse2; + break; +#endif /* HAVE_SIMD_SSE2 */ #endif /* ENABLE_SIMD */ default: search_escape_basic_impl = search_escape_basic; diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 11332ee15..73100723d 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -3,6 +3,7 @@ typedef enum { SIMD_NONE, SIMD_NEON, + SIMD_SSE2 } SIMD_Implementation; #ifdef ENABLE_SIMD @@ -38,7 +39,48 @@ void print_uint8x16(char *msg, uint8x16_t vec) { #endif /* ARM Neon Support.*/ -/* Other SIMD implementation checks here. */ +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) + +#ifdef HAVE_X86INTRIN_H +#include + +#define HAVE_SIMD_SSE2 1 + +void print_m128i(const char *prefix, __m128i vec) { + uint8_t r[16]; + _mm_storeu_si128((__m128i *) r, vec); + + printf("%s = [ ", prefix); + for(int i=0; i<16; i++) { + printf("%02x ", r[i]); + } + printf("]\n"); +} + +#ifdef HAVE_CPUID_H +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 + +#include +#endif /* HAVE_CPUID_H */ + +SIMD_Implementation find_simd_implementation(void) { + +#if defined(__GNUC__ ) || defined(__clang__) +#ifdef __GNUC__ + __builtin_cpu_init(); +#endif /* __GNUC__ */ + + // TODO Revisit. I think the SSE version now only uses SSE2 instructions. + if (__builtin_cpu_supports("sse2")) { + return SIMD_SSE2; + } +#endif /* __GNUC__ || __clang__*/ + + return SIMD_NONE; +} + +#endif /* HAVE_X86INTRIN_H */ +#endif /* X86_64 Support */ #endif /* ENABLE_SIMD */ From b2cab3380fac7cee92f37b7785490d4eab5b1ddb Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 5 Apr 2025 10:59:36 -0500 Subject: [PATCH 10/13] Simplified the SSE2 implementation. --- ext/json/ext/generator/generator.c | 53 ++++++++++++++++++------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d2a69c23a..9a0473aaf 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -470,16 +470,40 @@ static inline unsigned char search_escape_basic_neon(search_state *search) #ifdef HAVE_SIMD_SSE2 -#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) -#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) -#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) -#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) +// #define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +// #define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +// #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +// #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) #ifdef __GNUC__ #pragma GCC push_options #pragma GCC target ("sse2") #endif /* __GNUC__ */ +#ifdef __clang__ +__attribute__((target("sse2"))) +#endif /* __clang__ */ +static inline __m128i sse2_update(__m128i chunk) { + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + const __m128i high_bit = _mm_set1_epi8(0x80); + + // __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + + // This is a signed comparison. We need special handling for bytes > 127. + __m128i too_low = _mm_cmplt_epi8(chunk, lower_bound); + + // Determine which bytes have the high bit set and remove them from 'too_low'. + __m128i high_bit_set = _mm_cmpeq_epi8(_mm_and_si128(chunk, high_bit), high_bit); + too_low = _mm_andnot_si128(high_bit_set, too_low); + + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + return needs_escape; +} + #ifdef __clang__ __attribute__((target("sse2"))) #endif /* __clang__ */ @@ -492,16 +516,9 @@ static unsigned char search_escape_basic_sse2(search_state *search) { } } - const __m128i lower_bound = _mm_set1_epi8(' '); - const __m128i backslash = _mm_set1_epi8('\\'); - const __m128i dblquote = _mm_set1_epi8('\"'); - while (search->ptr+sizeof(__m128i) < search->end) { __m128i chunk = _mm_loadu_si128((__m128i const*)search->ptr); - __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); - __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); - __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); - __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + __m128i needs_escape = sse2_update(chunk); int needs_escape_mask = _mm_movemask_epi8(needs_escape); @@ -510,17 +527,14 @@ static unsigned char search_escape_basic_sse2(search_state *search) { continue; } - __m128i nines = _mm_set1_epi8(9); - __m128i maybe_matches = _mm_and_si128(needs_escape, nines); - - _mm_storeu_si128((__m128i *)search->maybe_matches, maybe_matches); + // It doesn't matter what the value of each byte in 'maybe_matches' as long as a match is non-zero. + _mm_storeu_si128((__m128i *)search->maybe_matches, needs_escape); search->current_match_index = 0; search->maybe_match_length = sizeof(__m128i); return search_escape_basic_simd_next_match(search); } - // There are fewer than 16 bytes left. unsigned long remaining = (search->end - search->ptr); if (remaining >= 8) { @@ -539,10 +553,7 @@ static unsigned char search_escape_basic_sse2(search_state *search) { memcpy(s, search->ptr, remaining); __m128i chunk = _mm_loadu_si128((__m128i const *) s); - __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); - __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); - __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); - __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + __m128i needs_escape = sse2_update(chunk); int needs_escape_mask = _mm_movemask_epi8(needs_escape); From 5cd7b5e8034e6dab1381f41c56326b600ecbc1bb Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 5 Apr 2025 11:09:05 -0500 Subject: [PATCH 11/13] A small simplification to the ARM Neon implementation. --- ext/json/ext/generator/generator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 9a0473aaf..c6fd527e4 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -397,8 +397,8 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search continue; } - uint8x16_t maybe_matches = vandq_u8(needs_escape, vdupq_n_u8(0x9)); - vst1q_u8(search->maybe_matches, maybe_matches); + // It doesn't matter what the value of each byte in 'maybe_matches' as long as a match is non-zero. + vst1q_u8(search->maybe_matches, needs_escape); search->current_match_index = 0; search->maybe_match_length = sizeof(uint8x16_t); From 1d00db9967c66b7730c4a3a87bbadd5a271d1690 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 5 Apr 2025 11:20:59 -0500 Subject: [PATCH 12/13] More cleanups. --- ext/json/ext/generator/generator.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index c6fd527e4..d40af4cc9 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -254,7 +254,7 @@ static struct _simd_state simd_state; #endif /* ENABLE_SIMD */ #ifdef ENABLE_SIMD -// TODO This can likely be made generic if we know the stride width of the vector. + static inline unsigned char search_escape_basic_simd_next_match(search_state *search) { for(; search->current_match_index < search->maybe_match_length && search->ptr < search->end; ) { unsigned char ch_len = search->maybe_matches[search->current_match_index]; @@ -283,12 +283,12 @@ static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { - while (search->ptr + 16 < search->end) { + while (search->ptr+sizeof(uint8x16_t) < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t result = neon_lut_update(chunk); if (vmaxvq_u8(result) == 0) { - search->ptr += 16; + search->ptr += sizeof(uint8x16_t); continue; } @@ -306,11 +306,11 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s search_flush(search); FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, 16); + fbuffer_inc_capa(buf, sizeof(uint8x16_t)); char *s = (buf->ptr + buf->len); - memset(s, 'X', 16); + memset(s, 'X', sizeof(uint8x16_t)); // Optimistically copy the remaining characters to the output FBuffer. If there are no characters // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. @@ -388,16 +388,16 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search * To determine how to escape characters, we look at each value in the needs_escape vector and take * the appropriate action. */ - while (search->ptr+16 < search->end) { + while (search->ptr+sizeof(uint8x16_t) < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); uint8x16_t needs_escape = neon_rules_update(chunk); if (vmaxvq_u8(needs_escape) == 0) { - search->ptr += 16; + search->ptr += sizeof(uint8x16_t); continue; } - // It doesn't matter what the value of each byte in 'maybe_matches' as long as a match is non-zero. + // It doesn't matter the value of each byte in 'maybe_matches' as long as a match is non-zero. vst1q_u8(search->maybe_matches, needs_escape); search->current_match_index = 0; @@ -412,11 +412,11 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search search_flush(search); FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, 16); + fbuffer_inc_capa(buf, sizeof(uint8x16_t)); char *s = (buf->ptr + buf->len); - memset(s, 'X', 16); + memset(s, 'X', sizeof(uint8x16_t)); // Optimistically copy the remaining characters to the output FBuffer. If there are no characters // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. @@ -437,7 +437,6 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search return 0; } -// TODO This can likely be made generic if we know the stride width of the vector and make the SIMD kernel a function pointer and which lookup tables to use. static inline unsigned char search_escape_basic_neon(search_state *search) { if (RB_UNLIKELY(search->returned_from != NULL)) { @@ -527,7 +526,7 @@ static unsigned char search_escape_basic_sse2(search_state *search) { continue; } - // It doesn't matter what the value of each byte in 'maybe_matches' as long as a match is non-zero. + // It doesn't matter the value of each byte in 'maybe_matches' as long as a match is non-zero. _mm_storeu_si128((__m128i *)search->maybe_matches, needs_escape); search->current_match_index = 0; @@ -542,11 +541,11 @@ static unsigned char search_escape_basic_sse2(search_state *search) { search_flush(search); FBuffer *buf = search->buffer; - fbuffer_inc_capa(buf, 16); + fbuffer_inc_capa(buf, sizeof(__m128i)); char *s = (buf->ptr + buf->len); - memset(s, 'X', 16); + memset(s, 'X', sizeof(__m128i)); // Optimistically copy the remaining characters to the output FBuffer. If there are no characters // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. From 475925429b22a27d2f7adf2b62fc92fba5cb7f6b Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sun, 6 Apr 2025 11:43:54 -0500 Subject: [PATCH 13/13] Neon: Use a mask to locate the characters that need to be escaped instead of iterating through the chunk one byte/result at a time. --- ext/json/ext/generator/generator.c | 74 ++++++++++++++++++++++-------- ext/json/ext/generator/simd.h | 26 +++++++++++ 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index d40af4cc9..972595353 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -114,6 +114,13 @@ typedef struct _search_state { #ifdef ENABLE_SIMD const char *returned_from; unsigned char maybe_matches[16]; + +#ifdef HAVE_SIMD_NEON + uint64_t matches_mask; + const char *chunk_base; + uint8_t has_matches; +#endif /* HAVE_SIMD_NEON */ + unsigned long current_match_index; unsigned long maybe_match_length; #endif /* ENABLE_SIMD */ @@ -273,15 +280,40 @@ static inline unsigned char search_escape_basic_simd_next_match(search_state *se #ifdef HAVE_SIMD_NEON +static inline unsigned char neon_mask_next_match(search_state *search) { + uint64_t mask = search->matches_mask; + if (mask > 0) { + uint32_t index = trailing_zeros(mask) >> 2; + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. + // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; + } + return 0; +} + +// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon +static inline uint64_t neon_match_mask(uint8x16_t matches) { + const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4); + const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0); + return mask & 0x8888888888888888ull; +} + static inline uint8x16_t neon_lut_update(uint8x16_t chunk) { uint8x16_t tmp1 = vqtbl4q_u8(simd_state.neon.escape_table_basic[0], chunk); uint8x16_t tmp2 = vqtbl4q_u8(simd_state.neon.escape_table_basic[1], veorq_u8(chunk, vdupq_n_u8(0x40))); - uint8x16_t result = vorrq_u8(tmp1, tmp2); return result; } - static inline unsigned char search_escape_basic_neon_advance_lut(search_state *search) { while (search->ptr+sizeof(uint8x16_t) < search->end) { uint8x16_t chunk = vld1q_u8((const unsigned char *)search->ptr); @@ -292,11 +324,10 @@ static inline unsigned char search_escape_basic_neon_advance_lut(search_state *s continue; } - vst1q_u8(search->maybe_matches, result); - - search->current_match_index = 0; - search->maybe_match_length = sizeof(uint8x16_t); - return search_escape_basic_simd_next_match(search); + search->matches_mask = neon_match_mask(vceqq_u8(result, vdupq_n_u8(9))); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_mask_next_match(search); } // There are fewer than 16 bytes left. @@ -396,13 +427,11 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search search->ptr += sizeof(uint8x16_t); continue; } - - // It doesn't matter the value of each byte in 'maybe_matches' as long as a match is non-zero. - vst1q_u8(search->maybe_matches, needs_escape); - search->current_match_index = 0; - search->maybe_match_length = sizeof(uint8x16_t); - return search_escape_basic_simd_next_match(search); + search->matches_mask = neon_match_mask(needs_escape); + search->has_matches = 1; + search->chunk_base = search->ptr; + return neon_mask_next_match(search); } // There are fewer than 16 bytes left. @@ -439,11 +468,17 @@ static unsigned char search_escape_basic_neon_advance_rules(search_state *search static inline unsigned char search_escape_basic_neon(search_state *search) { - if (RB_UNLIKELY(search->returned_from != NULL)) { - search->current_match_index += (search->ptr - search->returned_from); - search->returned_from = NULL; - if (RB_UNLIKELY(search_escape_basic_simd_next_match(search))) { - return 1; + if (RB_UNLIKELY(search->has_matches)) { + // There are more matches if search->matches_mask > 0. + if (search->matches_mask > 0) { + if (RB_LIKELY(neon_mask_next_match(search))) { + return 1; + } + } else { + // neon_mask_next_match will only advance search->ptr up to the last matching character. + // Skip over any characters in the last chunk that occur after the last match. + search->has_matches = 0; + search->ptr = search->chunk_base+sizeof(uint8x16_t); } } @@ -1331,7 +1366,8 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat #ifdef ENABLE_SIMD search.current_match_index = 0; - search.returned_from = NULL; + search.matches_mask = 0; + search.has_matches = 0; #endif /* ENABLE_SIMD */ switch(rb_enc_str_coderange(obj)) { diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 73100723d..15fcb1ede 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -8,6 +8,32 @@ typedef enum { #ifdef ENABLE_SIMD +#ifdef __clang__ + #if __has_builtin(__builtin_ctzll) + #define HAVE_BUILTIN_CTZLL 1 + #else + #define HAVE_BUILTIN_CTZLL 0 + #endif +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define HAVE_BUILTIN_CTZLL 1 +#else + #define HAVE_BUILTIN_CTZLL 0 +#endif + +static inline uint32_t trailing_zeros(uint64_t input) { +#if HAVE_BUILTIN_CTZLL + return __builtin_ctzll(input); +#else + uint32_t trailing_zeros = 0; + uint64_t temp = input; + while ((temp & 1) == 0 && temp > 0) { + trailing_zeros++; + temp >>= 1; + } + return trailing_zeros; +#endif +} + #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) #include