Skip to content

Implement JSON string escaping using SIMD (ARM + X86) #769

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
256 changes: 247 additions & 9 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,222 @@ static void convert_UTF8_to_ASCII_only_JSON(search_state *search, const unsigned
}
}

/* Converts input string (in ptr and len) to a JSON string, (without the wrapping
* '"' characters) in FBuffer buffer. ASCII control characters (0x00-0x1F),
* dquote, and backslash are escaped, but no other characters.
*
* This implementation is not suited for ascii_only and script_safe mode.
*/
#include "./simd.h"

/* Number of bytes covered by one Vector8, i.e. examined per vector check. */
#define SIMD_BATCH_SIZE sizeof(Vector8)

/* Inputs, and leftover tails, shorter than this many bytes skip the vector
 * check and are escaped byte by byte instead. */
#define SIMD_MINIMAL_SIZE 8

/* Loads one Vector8 worth of bytes starting at ptr and reports whether any
 * of them has to be JSON-escaped: a dquote, a backslash, or a byte in the
 * ASCII control range (<= 0x1F).
 *
 * The caller must make sizeof(Vector8) bytes readable at ptr.
 */
static inline bool needs_json_escaping(const char* ptr) {
    Vector8 block;

    vector8_load(&block, (const uint8 *)ptr);

    if (vector8_has(block, '"')) {
        return true;
    }
    if (vector8_has(block, '\\')) {
        return true;
    }
    /* Remaining candidates: ASCII control characters (0x00-0x1F). */
    return vector8_has_le(block, 0x1F);
}


/* Per-byte classification table, indexed by the unsigned byte value.
 *
 *   0    ASCII byte that is copied through unchanged
 *   1    byte that is handled on its own: ASCII control characters, '"',
 *        '\\', UTF-8 continuation bytes (0x80-0xBF) and 0xFE/0xFF
 *   2-6  lead byte of a multi-byte sequence; the value is the length of
 *        that sequence in bytes
 */
static const char universal_escape_table[256] = {
// 0x00-0x1F: ASCII control characters
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// 0x20-0x7F: printable ASCII characters (and DEL)
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// 0x80-0xBF: continuation bytes
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// 0xC0-0xDF: first byte of a 2-byte code point
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// 0xE0-0xEF: first byte of a 3-byte code point
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
// 0xF0-0xFF: first byte of a 4-, 5- or 6-byte sequence; 0xFE and 0xFF are 1
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};

/* Appends the JSON-escaped form of the `len` bytes at `ptr` (without the
 * wrapping '"' characters) to `out_buffer`, working one byte at a time.
 *
 * Escaped: ASCII control characters (0x00-0x1F), dquote and backslash.
 * '\b', '\f', '\n', '\r' and '\t' use their two-character short forms; the
 * remaining control characters are written as \u00XX. '/' is not escaped
 * (its entry in universal_escape_table is 0).
 *
 * Every other byte, including every byte >= 0x80, is copied through
 * unchanged. Non-ASCII bytes are deliberately never looked up in
 * universal_escape_table: that table marks UTF-8 continuation bytes with 1,
 * and this function is also handed fixed-size slices of a longer string
 * which may begin in the middle of a multi-byte sequence. Treating such a
 * byte as "needs escaping" would emit \u00XX in place of part of a valid
 * character and corrupt the output.
 */
static void convert_UTF8_to_JSON_wo_simd(FBuffer *out_buffer, const char *ptr, unsigned long len)
{
    static const char hexdig[] = "0123456789abcdef";
    /* Template for \u00XX; only the last two hex digits vary. */
    char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };

    /* [beg, pos) is the pending run of bytes that need no escaping. */
    unsigned long beg = 0, pos = 0;

    while (pos < len) {
        unsigned char ch = (unsigned char)ptr[pos];

        /* Only ASCII bytes can require escaping; for them the table holds
         * either 0 (copy through) or 1 (escape). */
        if (!RB_UNLIKELY(ch < 0x80 && universal_escape_table[ch])) {
            pos++;
            continue;
        }

        /* Flush the pending clean run, then consume the byte being escaped. */
        if (pos > beg) {
            fbuffer_append(out_buffer, &ptr[beg], pos - beg);
        }
        pos++;
        beg = pos;

        switch (ch) {
            case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
            case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
            case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
            case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
            case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
            case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
            case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
            default:
                /* Any other control character: \u00XX. */
                scratch[4] = hexdig[(ch >> 4) & 0xf];
                scratch[5] = hexdig[ch & 0xf];
                fbuffer_append(out_buffer, scratch, 6);
                break;
        }
    }

    /* Trailing run of bytes that needed no escaping. */
    if (beg < len) {
        fbuffer_append(out_buffer, &ptr[beg], len - beg);
    }
}

/* Appends the JSON-escaped form of `str` (`len` bytes, without the wrapping
 * '"' characters) to `buffer`, checking SIMD_BATCH_SIZE bytes at a time.
 *
 * Blocks in which needs_json_escaping() finds nothing are copied verbatim in
 * as large a run as possible; a block that does need escaping is handed to
 * convert_UTF8_to_JSON_wo_simd(). Inputs shorter than SIMD_MINIMAL_SIZE go
 * straight to the byte-wise routine.
 *
 * All offsets are `unsigned long`, matching `len`, so that very long inputs
 * cannot wrap a narrower index or truncate the tail length.
 *
 * NOTE(review): a block boundary can fall inside a multi-byte UTF-8
 * sequence, so the byte-wise routine may receive a slice that starts with a
 * continuation byte; it has to copy such bytes through unchanged.
 */
static inline void append_escaped_json_string_simd(FBuffer *buffer, const char *str, unsigned long len)
{
    if (len < SIMD_MINIMAL_SIZE) {
        convert_UTF8_to_JSON_wo_simd(buffer, str, len);
        return;
    }

    /* Number of bytes that can be processed in whole vector blocks: `len`
     * rounded down to a multiple of SIMD_BATCH_SIZE, which is assumed to be
     * a power of two.
     */
    const unsigned long vlen = len & ~(unsigned long)(SIMD_BATCH_SIZE - 1);

    unsigned long copypos = 0; /* start of the clean run not yet written */
    unsigned long i = 0;       /* start of the block being examined */

    while (i < vlen) {
        /* Skip forward over blocks that contain nothing to escape. */
        for (; i < vlen; i += SIMD_BATCH_SIZE) {
            if (needs_json_escaping(str + i)) {
                break;
            }
        }

        /* Write out everything that has been vector-searched so far. */
        if (copypos < i) {
            fbuffer_append(buffer, str + copypos, i - copypos);
            copypos = i;
        }

        if (i < vlen) {
            /* The current block needs escaping: process it byte by byte,
             * then resume the vector search after it.
             */
            convert_UTF8_to_JSON_wo_simd(buffer, str + i, SIMD_BATCH_SIZE);
            i += SIMD_BATCH_SIZE;
            copypos = i;
        }
    }

    /* Nothing left over after the whole blocks? */
    if (i == len) {
        return;
    }

    const char *tail = str + i;
    const unsigned long tail_len = len - i; /* always < SIMD_BATCH_SIZE */

    if (tail_len >= SIMD_MINIMAL_SIZE) {
        /* Check the tail with one vector operation even though fewer than
         * SIMD_BATCH_SIZE input bytes remain.
         *
         * Reading SIMD_BATCH_SIZE bytes from the source could run past its
         * end, so the tail is copied into the free space of the output
         * buffer and padded with 'X' bytes, which never need escaping.
         *
         * If the check finds nothing, the bytes already sit where they
         * belong and only the buffer length is advanced. Otherwise the
         * length is left alone, so the byte-wise routine below overwrites
         * the staged copy with the escaped form of the source tail.
         *
         * NOTE(review): assumes fbuffer_inc_capa() guarantees room for at
         * least SIMD_BATCH_SIZE more bytes at buffer->ptr + buffer->len.
         */
        fbuffer_inc_capa(buffer, SIMD_BATCH_SIZE);

        memset(buffer->ptr + buffer->len, 'X', SIMD_BATCH_SIZE);
        memcpy(buffer->ptr + buffer->len, tail, tail_len);

        if (!needs_json_escaping(buffer->ptr + buffer->len)) {
            buffer->len += tail_len;
            return;
        }
    }

    convert_UTF8_to_JSON_wo_simd(buffer, tail, tail_len);
}
#undef SIMD_BATCH_SIZE




/* Appends the JSON-escaped contents of the Ruby string `str` (without the
 * wrapping '"' characters) to `buffer`.
 *
 * Only used when neither `ascii_only` nor `script_safe` is enabled: ASCII
 * control characters (0x00-0x1F), dquote and backslash are escaped, and
 * everything else (expected to be UTF-8) is appended as is.
 *
 * An empty string appends nothing.
 */
static inline void append_escaped_json_string(FBuffer *buffer, VALUE str) {
    const char *bytes = RSTRING_PTR(str);
    unsigned long nbytes = RSTRING_LEN(str);

    if (nbytes != 0) {
        append_escaped_json_string_simd(buffer, bytes, nbytes);

        /* Keep `str` alive while its raw pointer is in use above. */
        RB_GC_GUARD(str);
    }
}

/*
* Document-module: JSON::Ext::Generator
*
Expand Down Expand Up @@ -966,22 +1182,44 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat

fbuffer_append_char(buffer, '"');

long len;
search_state search;
search.buffer = buffer;
RSTRING_GETMEM(obj, search.ptr, len);
search.cursor = search.ptr;
search.end = search.ptr + len;

switch(rb_enc_str_coderange(obj)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
long len;
search_state search;
search.buffer = buffer;
RSTRING_GETMEM(obj, search.ptr, len);
search.cursor = search.ptr;
search.end = search.ptr + len;

convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
} else if (RB_UNLIKELY(state->script_safe)) {
convert_UTF8_to_script_safe_JSON(&search);
{
long len;
search_state search;
search.buffer = buffer;
RSTRING_GETMEM(obj, search.ptr, len);
search.cursor = search.ptr;
search.end = search.ptr + len;

convert_UTF8_to_script_safe_JSON(&search);
}
} else {
convert_UTF8_to_JSON(&search);
#if 0
{
long len;
search_state search;
search.buffer = buffer;
RSTRING_GETMEM(obj, search.ptr, len);
search.cursor = search.ptr;
search.end = search.ptr + len;

convert_UTF8_to_JSON(&search);
}
#else
append_escaped_json_string(buffer, obj);
#endif
}
break;
default:
Expand Down
8 changes: 8 additions & 0 deletions ext/json/ext/generator/simd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#include <stdint.h>

/* Short aliases for the fixed-width integer types. */
typedef uint8_t uint8;
typedef uint32_t uint32;
/* Assert() expands to a no-op expression.
 * NOTE(review): presumably these shims exist because the vendored header
 * included below expects these names — confirm. */
#define Assert(_) (void)0

#include "../vendor/postgres/src/include/port/simd.h"
// https://github.com/postgres/postgres/blob/REL_17_4/src/include/port/simd.h
23 changes: 23 additions & 0 deletions ext/json/ext/vendor/postgres/COPYRIGHT
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
PostgreSQL Database Management System
(formerly known as Postgres, then as Postgres95)

Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

Portions Copyright (c) 1994, The Regents of the University of California

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose, without fee, and without a written agreement
is hereby granted, provided that the above copyright notice and this
paragraph and the following two paragraphs appear in all copies.

IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
3 changes: 3 additions & 0 deletions ext/json/ext/vendor/postgres/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
The contents of this directory are extracted from https://github.com/postgres/postgres

They are licensed under the provisions in the COPYRIGHT file.
Loading
Loading