From e3d79a4f1400a2a1eaaafe723d96b7657ae06574 Mon Sep 17 00:00:00 2001
From: eno
Date: Sun, 16 Mar 2025 22:03:07 +0100
Subject: [PATCH] Implement JSON string escaping using SIMD (ARM + X86)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It integrates a `simd.h` shim extracted from PostgreSQL
([src](https://github.com/postgres/postgres/blob/REL_17_4/src/include/port/simd.h)).
PostgreSQL is licensed under an MIT/BSD-style license
([link](https://github.com/postgres/postgres/blob/REL_17_4/COPYRIGHT)).
This shim is available for ARM and x86, and also comes with a pure C
implementation.

As a result I see a 55% speedup on Apple Silicon M1 for a set of string
benchmarks:

```
== Encoding strings (2524333 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin24]
Warming up --------------------------------------
        json (local)    62.000 i/100ms
Calculating -------------------------------------
        json (local)    662.248 (± 4.5%) i/s (1.51 ms/i) - 3.348k in 5.065760s
Normalize to 393501 byte

== Encoding strings (2524333 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin24]
Warming up --------------------------------------
       json (2.10.2)    43.000 i/100ms
Calculating -------------------------------------
       json (2.10.2)    425.189 (± 6.4%) i/s (2.35 ms/i) - 2.150k in 5.079077s
```

These benchmarks are run via a script
([link](https://gist.github.com/radiospiel/04019402726a28b31616df3d0c17bd1c))
which is based on the gem's `benchmark/encoder.rb` file. There are probably
better ways to run benchmarks :) My version allows combining multiple test
cases into a single one.

The `dumps` benchmark, which covers the JSON files in `benchmark/data/*.json`
(with the exception of `canada.json`), reported a speedup of ~13% on Apple M1.
---
 ext/json/ext/generator/generator.c              | 256 ++++++++++-
 ext/json/ext/generator/simd.h                   |   8 +
 ext/json/ext/vendor/postgres/COPYRIGHT          |  23 +
 ext/json/ext/vendor/postgres/README.md          |   3 +
 .../vendor/postgres/src/include/port/simd.h     | 422 ++++++++++++++++++
 5 files changed, 703 insertions(+), 9 deletions(-)
 create mode 100644 ext/json/ext/generator/simd.h
 create mode 100644 ext/json/ext/vendor/postgres/COPYRIGHT
 create mode 100644 ext/json/ext/vendor/postgres/README.md
 create mode 100644 ext/json/ext/vendor/postgres/src/include/port/simd.h

diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
index 08338853..ea1de418 100644
--- a/ext/json/ext/generator/generator.c
+++ b/ext/json/ext/generator/generator.c
@@ -414,6 +414,222 @@ static void convert_UTF8_to_ASCII_only_JSON(search_state *search, const unsigned
     }
 }
 
+/* Converts the input string (given as ptr and len) to a JSON string (without
+ * the wrapping '"' characters) in FBuffer buffer. ASCII control characters
+ * (0x00-0x1F), dquote, and backslash are escaped, but no other characters.
+ *
+ * This implementation is not suited for ascii_only and script_safe modes.
+ */
+#include "./simd.h"
+
+#define SIMD_BATCH_SIZE sizeof(Vector8)
+
+#define SIMD_MINIMAL_SIZE 8
+
+static inline bool needs_json_escaping(const char* ptr) {
+    Vector8 chunk;
+
+    vector8_load(&chunk, (const uint8 *)ptr);
+
+    /* Check for ASCII control characters (0x00-0x1F), dquote, and backslash. */
+    return vector8_has(chunk, '"') || vector8_has(chunk, '\\') || vector8_has_le(chunk, 0x1F);
+}
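For reference, the vector predicate above computes the same answer as the
following scalar loop over one chunk (a standalone sketch; the name
`needs_json_escaping_scalar` is illustrative and not part of the patch):

```c
#include <stdbool.h>
#include <stddef.h>

/* Scalar equivalent of needs_json_escaping() over n bytes: true iff any
 * byte must be escaped in plain (non-ascii_only, non-script_safe) mode. */
static bool needs_json_escaping_scalar(const char *ptr, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        unsigned char ch = (unsigned char)ptr[i];
        if (ch == '"' || ch == '\\' || ch <= 0x1F)
            return true;
    }
    return false;
}
```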
+
+
+static const char universal_escape_table[256] = {
+    // ASCII Control Characters
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    // ASCII Characters
+    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    // Continuation byte
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    // First byte of a 2-byte code point
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    // First byte of a 3-byte code point
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    // First byte of a 4+ byte code point
+    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
+};
+
+/* Converts the input string (given as ptr and len) to a JSON string
+ * (without the wrapping '"' characters) in FBuffer out_buffer.
+ *
+ * The following characters are JSON-escaped: ASCII control
+ * characters (0x00-0x1F), dquote, and backslash.
+ *
+ * Everything else (should be UTF-8) is just passed through and
+ * appended to the result.
+ */
+static void convert_UTF8_to_JSON_wo_simd(FBuffer *out_buffer, const char *ptr, unsigned long len)
+{
+    const char *hexdig = "0123456789abcdef";
+    char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
+
+    unsigned long beg = 0, pos = 0;
+
+#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
+
+    while (pos < len) {
+        unsigned char ch = ptr[pos];
+        unsigned char ch_len = universal_escape_table[ch];
+        /* JSON encoding */
+
+        if (RB_UNLIKELY(ch_len)) {
+            switch (ch_len) {
+                case 1: {
+                    FLUSH_POS(1);
+                    switch (ch) {
+                        case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
+                        case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
+                        case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
+                        case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
+                        case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
+                        case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
+                        case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
+                        case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
+                        default: {
+                            scratch[2] = '0';
+                            scratch[3] = '0';
+                            scratch[4] = hexdig[(ch >> 4) & 0xf];
+                            scratch[5] = hexdig[ch & 0xf];
+                            fbuffer_append(out_buffer, scratch, 6);
+                            break;
+                        }
+                    }
+                    break;
+                }
+                default:
+                    pos += ch_len;
+                    break;
+            }
+        } else {
+            pos++;
+        }
+    }
+#undef FLUSH_POS
+
+    if (beg < len) {
+        fbuffer_append(out_buffer, &ptr[beg], len - beg);
+    }
+}
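As a worked example of the `scratch` path above: a control character with no
short escape, e.g. 0x01, is emitted as the six characters `\u0001`. That
encoding step in isolation (a standalone sketch, names illustrative):

```c
#include <stdio.h>

int main(void)
{
    const char *hexdig = "0123456789abcdef";
    unsigned char ch = 0x01;              /* a control char without a short escape */
    char out[6] = { '\\', 'u', '0', '0', 0, 0 };

    out[4] = hexdig[(ch >> 4) & 0xf];     /* high nibble -> '0' */
    out[5] = hexdig[ch & 0xf];            /* low nibble  -> '1' */
    fwrite(out, 1, sizeof(out), stdout);  /* prints \u0001 */
    putchar('\n');
    return 0;
}
```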
+
+static inline void append_escaped_json_string_simd(FBuffer *buffer, const char *str, unsigned long len)
+{
+    if (len < SIMD_MINIMAL_SIZE) {
+        convert_UTF8_to_JSON_wo_simd(buffer, str, len);
+        return;
+    }
+
+    /* How many bytes can be processed using SIMD? Round 'len' down
+     * to the previous multiple of sizeof(Vector8), assuming that's a
+     * power of 2.
+     */
+    unsigned long vlen = len & (long) (~(SIMD_BATCH_SIZE - 1));
+
+    unsigned long copypos = 0, i = 0;
+    while (i < vlen) {
+        /*
+         * To speed this up, search sizeof(Vector8) bytes at once for
+         * special characters that we need to escape. When we find one, we
+         * fall out of this first loop, copy the parts we have vector
+         * searched so far, and then process the vector containing the
+         * special character byte-by-byte. Once we're done with that, we
+         * come back and try vector searching again. The tail end of the
+         * string is handled after this loop.
+         */
+        for (; i < vlen; i += SIMD_BATCH_SIZE) {
+            if (needs_json_escaping(str + i)) {
+                break;
+            }
+        }
+
+        /*
+         * Write to the destination up to the point that we have vector
+         * searched so far.
+         */
+        if (copypos < i) {
+            fbuffer_append(buffer, str + copypos, i - copypos);
+            copypos = i;
+        }
+
+        if (i < vlen) {
+            /* The current block needs escaping, so let's escape it. */
+            convert_UTF8_to_JSON_wo_simd(buffer, str + i, SIMD_BATCH_SIZE);
+            i += SIMD_BATCH_SIZE;
+            copypos = i;
+        }
+    }
+
+    /* Any bytes left over that didn't fit into a multiple of SIMD_BATCH_SIZE?
+     * If at least SIMD_MINIMAL_SIZE of them remain, we still check with SIMD
+     * whether they need escaping.
+     */
+    if (i == len) {
+        return;
+    }
+
+    const char* s = str + i;
+    unsigned cnt = (unsigned)(len - i);
+
+    if (cnt >= SIMD_MINIMAL_SIZE) {
+        /* Convert using SIMD, even though we have fewer than SIMD_BATCH_SIZE
+         * characters.
+         *
+         * We cannot read SIMD_BATCH_SIZE bytes from the source, but we need
+         * that many. So we copy the remaining input chars into the target
+         * buffer and fill up with 'X' bytes, which don't need escaping.
+         *
+         * If `needs_json_escaping` returns false, we already have the right
+         * bytes in the target. Otherwise we escape from the source `s` via
+         * `convert_UTF8_to_JSON_wo_simd` below.
+         */
+        fbuffer_inc_capa(buffer, SIMD_BATCH_SIZE);
+
+        memset(buffer->ptr + buffer->len, 'X', SIMD_BATCH_SIZE);
+        memcpy(buffer->ptr + buffer->len, s, cnt);
+
+        if (!needs_json_escaping(buffer->ptr + buffer->len)) {
+            buffer->len += cnt;
+            return;
+        }
+    }
+
+    convert_UTF8_to_JSON_wo_simd(buffer, s, cnt);
+}
+#undef SIMD_BATCH_SIZE
+
+/* Converts str to a JSON string (without the wrapping '"'
+ * characters) in FBuffer buffer.
+ *
+ * This function is only called with `ascii_only` and `script_safe` disabled.
+ * We escape ASCII control characters (0x00-0x1F), dquote, and backslash.
+ *
+ * Everything else (should be UTF-8) is just passed through and
+ * appended to the result.
+ */
+static inline void append_escaped_json_string(FBuffer *buffer, VALUE str) {
+    const char *ptr = RSTRING_PTR(str);
+    unsigned long len = RSTRING_LEN(str);
+
+    if (!len) {
+        return;
+    }
+
+    append_escaped_json_string_simd(buffer, ptr, len);
+
+    RB_GC_GUARD(str);
+}
+
 /*
  * Document-module: JSON::Ext::Generator
  *
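The tail handling above is the subtle part: the last `cnt < SIMD_BATCH_SIZE`
bytes cannot be vector-loaded from the source without reading out of bounds,
so the patch stages them in the output buffer padded with `'X'`. The same
idea in a self-contained scalar sketch (assuming a 16-byte batch; the name
`tail_needs_json_escaping` is illustrative):

```c
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Pad the last cnt (< 16) bytes with 'X' -- a byte that never needs JSON
 * escaping -- so a fixed-width 16-byte scan stays within bounds. */
static bool tail_needs_json_escaping(const char *s, size_t cnt)
{
    unsigned char pad[16];
    memset(pad, 'X', sizeof(pad));
    memcpy(pad, s, cnt);

    for (size_t i = 0; i < sizeof(pad); i++) {
        if (pad[i] == '"' || pad[i] == '\\' || pad[i] <= 0x1F)
            return true;
    }
    return false;
}
```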
@@ -966,22 +1182,44 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
 
     fbuffer_append_char(buffer, '"');
 
-    long len;
-    search_state search;
-    search.buffer = buffer;
-    RSTRING_GETMEM(obj, search.ptr, len);
-    search.cursor = search.ptr;
-    search.end = search.ptr + len;
-
     switch(rb_enc_str_coderange(obj)) {
         case ENC_CODERANGE_7BIT:
         case ENC_CODERANGE_VALID:
             if (RB_UNLIKELY(state->ascii_only)) {
+                long len;
+                search_state search;
+                search.buffer = buffer;
+                RSTRING_GETMEM(obj, search.ptr, len);
+                search.cursor = search.ptr;
+                search.end = search.ptr + len;
+
                 convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
             } else if (RB_UNLIKELY(state->script_safe)) {
-                convert_UTF8_to_script_safe_JSON(&search);
+                long len;
+                search_state search;
+                search.buffer = buffer;
+                RSTRING_GETMEM(obj, search.ptr, len);
+                search.cursor = search.ptr;
+                search.end = search.ptr + len;
+
+                convert_UTF8_to_script_safe_JSON(&search);
             } else {
-                convert_UTF8_to_JSON(&search);
+                append_escaped_json_string(buffer, obj);
             }
             break;
         default:
diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h
new file mode 100644
index 00000000..eddf3035
--- /dev/null
+++ b/ext/json/ext/generator/simd.h
@@ -0,0 +1,8 @@
+#include <stdint.h>
+
+typedef uint8_t uint8;
+typedef uint32_t uint32;
+#define Assert(_) (void)0
+
+#include "../vendor/postgres/src/include/port/simd.h"
+// https://github.com/postgres/postgres/blob/REL_17_4/src/include/port/simd.h
\ No newline at end of file
diff --git a/ext/json/ext/vendor/postgres/COPYRIGHT b/ext/json/ext/vendor/postgres/COPYRIGHT
new file mode 100644
index 00000000..be2d694b
--- /dev/null
+++ b/ext/json/ext/vendor/postgres/COPYRIGHT
@@ -0,0 +1,23 @@
+PostgreSQL Database Management System
+(formerly known as Postgres, then as Postgres95)
+
+Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+
+Portions Copyright (c) 1994, The Regents of the University of California
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose, without fee, and without a written agreement
+is hereby granted, provided that the above copyright notice and this
+paragraph and the following two paragraphs appear in all copies.
+
+IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
+DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
+ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
+PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
diff --git a/ext/json/ext/vendor/postgres/README.md b/ext/json/ext/vendor/postgres/README.md
new file mode 100644
index 00000000..55291e45
--- /dev/null
+++ b/ext/json/ext/vendor/postgres/README.md
@@ -0,0 +1,3 @@
+The contents of this directory are extracted from https://github.com/postgres/postgres
+
+They are licensed under the provisions in the COPYRIGHT file.
\ No newline at end of file
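To illustrate how the vendored header below is consumed through this shim,
here is a minimal standalone usage sketch (the include path and `main` are
illustrative; this compiles on x86-64 or aarch64 with NEON, where the header
selects a real vector type):

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef uint8_t uint8;
typedef uint32_t uint32;
#define Assert(_) (void)0

#include "vendor/postgres/src/include/port/simd.h"  /* adjust path as needed */

int main(void)
{
    char buf[sizeof(Vector8)];
    memset(buf, 'a', sizeof(buf));
    buf[sizeof(buf) - 1] = '"';       /* plant one byte that needs escaping */

    Vector8 chunk;
    vector8_load(&chunk, (const uint8 *)buf);

    /* Same predicate as needs_json_escaping() in the generator. */
    bool esc = vector8_has(chunk, '"')
            || vector8_has(chunk, '\\')
            || vector8_has_le(chunk, 0x1F);
    return esc ? 0 : 1;               /* exits 0: the quote is found */
}
```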
diff --git a/ext/json/ext/vendor/postgres/src/include/port/simd.h b/ext/json/ext/vendor/postgres/src/include/port/simd.h
new file mode 100644
index 00000000..597496f2
--- /dev/null
+++ b/ext/json/ext/vendor/postgres/src/include/port/simd.h
@@ -0,0 +1,422 @@
+/*-------------------------------------------------------------------------
+ *
+ * simd.h
+ *    Support for platform-specific vector operations.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/simd.h
+ *
+ * NOTES
+ * - VectorN in this file refers to a register where the element operands
+ * are N bits wide. The vector width is platform-specific, so users that care
+ * about that will need to inspect "sizeof(VectorN)".
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SIMD_H
+#define SIMD_H
+
+#if (defined(__x86_64__) || defined(_M_AMD64))
+/*
+ * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
+ * that compilers targeting this architecture understand SSE2 intrinsics.
+ *
+ * We use emmintrin.h rather than the comprehensive header immintrin.h in
+ * order to exclude extensions beyond SSE2. This is because MSVC, at least,
+ * will allow the use of intrinsics that haven't been enabled at compile
+ * time.
+ */
+#include <emmintrin.h>
+#define USE_SSE2
+typedef __m128i Vector8;
+typedef __m128i Vector32;
+
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+/*
+ * We use the Neon instructions if the compiler provides access to them (as
+ * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
+ * technically optional for aarch64, it appears that all available 64-bit
+ * hardware does have it. Neon exists in some 32-bit hardware too, but we
+ * could not realistically use it there without a run-time check, which seems
+ * not worth the trouble for now.
+ */
+#include <arm_neon.h>
+#define USE_NEON
+typedef uint8x16_t Vector8;
+typedef uint32x4_t Vector32;
+
+#else
+/*
+ * If no SIMD instructions are available, we can in some cases emulate vector
+ * operations using bitwise operations on unsigned integers. Note that many
+ * of the functions in this file presently do not have non-SIMD
+ * implementations. In particular, none of the functions involving Vector32
+ * are implemented without SIMD since it's likely not worthwhile to represent
+ * two 32-bit integers using a uint64.
+ */
+#define USE_NO_SIMD
+typedef uint64 Vector8;
+#endif
+
+/* load/store operations */
+static inline void vector8_load(Vector8 *v, const uint8 *s);
+#ifndef USE_NO_SIMD
+static inline void vector32_load(Vector32 *v, const uint32 *s);
+#endif
+
+/* assignment operations */
+static inline Vector8 vector8_broadcast(const uint8 c);
+#ifndef USE_NO_SIMD
+static inline Vector32 vector32_broadcast(const uint32 c);
+#endif
+
+/* element-wise comparisons to a scalar */
+static inline bool vector8_has(const Vector8 v, const uint8 c);
+static inline bool vector8_has_zero(const Vector8 v);
+static inline bool vector8_has_le(const Vector8 v, const uint8 c);
+static inline bool vector8_is_highbit_set(const Vector8 v);
+#ifndef USE_NO_SIMD
+static inline bool vector32_is_highbit_set(const Vector32 v);
+static inline uint32 vector8_highbit_mask(const Vector8 v);
+#endif
+
+/* arithmetic operations */
+static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
+#ifndef USE_NO_SIMD
+static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
+static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
+#endif
+
+/*
+ * comparisons between vectors
+ *
+ * Note: These return a vector rather than boolean, which is why we don't
+ * have non-SIMD implementations.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
+static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
+#endif
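The pure-C fallback mentioned above leans on classic byte-wise bit tricks
over a `uint64`. For example, the zero-byte detection that `vector8_has`
relies on in that path (XOR with a broadcast value, then look for a zero
byte) can be sketched standalone like this (a reference illustration, not
part of the header):

```c
#include <stdbool.h>
#include <stdint.h>

/* True iff any of the eight bytes packed into v is zero (the classic
 * "determine if a word has a zero byte" bit hack; no false positives). */
static bool u64_has_zero_byte(uint64_t v)
{
    return ((v - UINT64_C(0x0101010101010101)) & ~v
            & UINT64_C(0x8080808080808080)) != 0;
}

/* True iff any byte equals c: XOR maps matching bytes to zero. */
static bool u64_has_byte(uint64_t v, unsigned char c)
{
    uint64_t broadcast = UINT64_C(0x0101010101010101) * c;
    return u64_has_zero_byte(v ^ broadcast);
}
```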
+ */ +#ifndef USE_NO_SIMD +static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2); +static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2); +static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2); +#endif + +/* + * Load a chunk of memory into the given vector. + */ +static inline void +vector8_load(Vector8 *v, const uint8 *s) +{ +#if defined(USE_SSE2) + *v = _mm_loadu_si128((const __m128i *) s); +#elif defined(USE_NEON) + *v = vld1q_u8(s); +#else + memcpy(v, s, sizeof(Vector8)); +#endif +} + +#ifndef USE_NO_SIMD +static inline void +vector32_load(Vector32 *v, const uint32 *s) +{ +#ifdef USE_SSE2 + *v = _mm_loadu_si128((const __m128i *) s); +#elif defined(USE_NEON) + *v = vld1q_u32(s); +#endif +} +#endif /* ! USE_NO_SIMD */ + +/* + * Create a vector with all elements set to the same value. + */ +static inline Vector8 +vector8_broadcast(const uint8 c) +{ +#if defined(USE_SSE2) + return _mm_set1_epi8(c); +#elif defined(USE_NEON) + return vdupq_n_u8(c); +#else + return ~UINT64CONST(0) / 0xFF * c; +#endif +} + +#ifndef USE_NO_SIMD +static inline Vector32 +vector32_broadcast(const uint32 c) +{ +#ifdef USE_SSE2 + return _mm_set1_epi32(c); +#elif defined(USE_NEON) + return vdupq_n_u32(c); +#endif +} +#endif /* ! USE_NO_SIMD */ + +/* + * Return true if any elements in the vector are equal to the given scalar. + */ +static inline bool +vector8_has(const Vector8 v, const uint8 c) +{ + bool result; + + /* pre-compute the result for assert checking */ +#ifdef USE_ASSERT_CHECKING + bool assert_result = false; + + for (Size i = 0; i < sizeof(Vector8); i++) + { + if (((const uint8 *) &v)[i] == c) + { + assert_result = true; + break; + } + } +#endif /* USE_ASSERT_CHECKING */ + +#if defined(USE_NO_SIMD) + /* any bytes in v equal to c will evaluate to zero via XOR */ + result = vector8_has_zero(v ^ vector8_broadcast(c)); +#else + result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c))); +#endif + + Assert(assert_result == result); + return result; +} + +/* + * Convenience function equivalent to vector8_has(v, 0) + */ +static inline bool +vector8_has_zero(const Vector8 v) +{ +#if defined(USE_NO_SIMD) + /* + * We cannot call vector8_has() here, because that would lead to a + * circular definition. + */ + return vector8_has_le(v, 0); +#else + return vector8_has(v, 0); +#endif +} + +/* + * Return true if any elements in the vector are less than or equal to the + * given scalar. + */ +static inline bool +vector8_has_le(const Vector8 v, const uint8 c) +{ + bool result = false; + + /* pre-compute the result for assert checking */ +#ifdef USE_ASSERT_CHECKING + bool assert_result = false; + + for (Size i = 0; i < sizeof(Vector8); i++) + { + if (((const uint8 *) &v)[i] <= c) + { + assert_result = true; + break; + } + } +#endif /* USE_ASSERT_CHECKING */ + +#if defined(USE_NO_SIMD) + + /* + * To find bytes <= c, we can use bitwise operations to find bytes < c+1, + * but it only works if c+1 <= 128 and if the highest bit in v is not set. + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord + */ + if ((int64) v >= 0 && c < 0x80) + result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80); + else + { + /* one byte at a time */ + for (Size i = 0; i < sizeof(Vector8); i++) + { + if (((const uint8 *) &v)[i] <= c) + { + result = true; + break; + } + } + } +#else + + /* + * Use saturating subtraction to find bytes <= c, which will present as + * NUL bytes. 
+
+/*
+ * Return true if the high bit of any element is set
+ */
+static inline bool
+vector8_is_highbit_set(const Vector8 v)
+{
+#ifdef USE_SSE2
+    return _mm_movemask_epi8(v) != 0;
+#elif defined(USE_NEON)
+    return vmaxvq_u8(v) > 0x7F;
+#else
+    return v & vector8_broadcast(0x80);
+#endif
+}
+
+/*
+ * Exactly like vector8_is_highbit_set except for the input type, so it
+ * looks at each byte separately.
+ *
+ * XXX x86 uses the same underlying type for 8-bit, 16-bit, and 32-bit
+ * integer elements, but Arm does not, hence the need for a separate
+ * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
+ * check each 32-bit element, but that would require an additional mask
+ * operation on x86.
+ */
+#ifndef USE_NO_SIMD
+static inline bool
+vector32_is_highbit_set(const Vector32 v)
+{
+#if defined(USE_NEON)
+    return vector8_is_highbit_set((Vector8) v);
+#else
+    return vector8_is_highbit_set(v);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return a bitmask formed from the high-bit of each element.
+ */
+#ifndef USE_NO_SIMD
+static inline uint32
+vector8_highbit_mask(const Vector8 v)
+{
+#ifdef USE_SSE2
+    return (uint32) _mm_movemask_epi8(v);
+#elif defined(USE_NEON)
+    /*
+     * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
+     * returns a uint64, making it inconvenient to combine mask values from
+     * multiple vectors.
+     */
+    static const uint8 mask[16] = {
+        1 << 0, 1 << 1, 1 << 2, 1 << 3,
+        1 << 4, 1 << 5, 1 << 6, 1 << 7,
+        1 << 0, 1 << 1, 1 << 2, 1 << 3,
+        1 << 4, 1 << 5, 1 << 6, 1 << 7,
+    };
+
+    uint8x16_t masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
+    uint8x16_t maskedhi = vextq_u8(masked, masked, 8);
+
+    return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return the bitwise OR of the inputs
+ */
+static inline Vector8
+vector8_or(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_or_si128(v1, v2);
+#elif defined(USE_NEON)
+    return vorrq_u8(v1, v2);
+#else
+    return v1 | v2;
+#endif
+}
+
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_or(const Vector32 v1, const Vector32 v2)
+{
+#ifdef USE_SSE2
+    return _mm_or_si128(v1, v2);
+#elif defined(USE_NEON)
+    return vorrq_u32(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return the result of subtracting the respective elements of the input
+ * vectors using saturation (i.e., if the operation would yield a value less
+ * than zero, zero is returned instead). For more information on saturation
+ * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_ssub(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_subs_epu8(v1, v2);
+#elif defined(USE_NEON)
+    return vqsubq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return a vector with all bits set in each lane where the corresponding
+ * lanes in the inputs are equal.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_eq(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_cmpeq_epi8(v1, v2);
+#elif defined(USE_NEON)
+    return vceqq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
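The NEON branch of `vector8_highbit_mask` above emulates what x86 gets from
a single `pmovmskb` instruction. Its intended result, expressed as a scalar
reference (illustrative, not part of the header):

```c
#include <stdint.h>
#include <stddef.h>

/* Collect the high bit of each of n (<= 32) bytes into one bitmask, with
 * bit i set iff byte i has its high bit set -- the contract of
 * vector8_highbit_mask(). */
static uint32_t highbit_mask_scalar(const uint8_t *v, size_t n)
{
    uint32_t mask = 0;
    for (size_t i = 0; i < n; i++)
        mask |= (uint32_t)(v[i] >> 7) << i;
    return mask;
}
```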
+
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_eq(const Vector32 v1, const Vector32 v2)
+{
+#ifdef USE_SSE2
+    return _mm_cmpeq_epi32(v1, v2);
+#elif defined(USE_NEON)
+    return vceqq_u32(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Given two vectors, return a vector with the minimum element of each.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_min(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_min_epu8(v1, v2);
+#elif defined(USE_NEON)
+    return vminq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+#endif /* SIMD_H */
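One portability note: on platforms where this header falls back to
`USE_NO_SIMD`, it also references `uint64`, `int64`, `Size`, and
`UINT64CONST`, which the `simd.h` shim earlier in this patch does not
define. A fuller shim covering that path might look like this (a sketch
under that assumption, not part of the patch):

```c
/* ext/json/ext/generator/simd.h -- extended sketch covering the pure-C path */
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

typedef uint8_t  uint8;
typedef uint32_t uint32;
typedef uint64_t uint64;           /* used by the USE_NO_SIMD fallback */
typedef int64_t  int64;            /* used by vector8_has_le's bit hack */
typedef size_t   Size;             /* used by the assert-checking loops */
#define UINT64CONST(x) UINT64_C(x)
#define Assert(_) (void)0

#include "../vendor/postgres/src/include/port/simd.h"
```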