From e3d79a4f1400a2a1eaaafe723d96b7657ae06574 Mon Sep 17 00:00:00 2001
From: eno
Date: Sun, 16 Mar 2025 22:03:07 +0100
Subject: [PATCH] Implement JSON string escaping using SIMD (ARM + X86)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It integrates a `simd.h` shim extracted from PostgreSQL
([src](https://github.com/postgres/postgres/blob/REL_17_4/src/include/port/simd.h)).
PostgreSQL is licensed under an MIT/BSD-style license
([link](https://github.com/postgres/postgres/blob/REL_17_4/COPYRIGHT)).
This shim is available for ARM and x86, and also comes with a pure C
implementation.

As a result I see a 55% speedup on Apple Silicon M1 for a set of string
benchmarks:

```
== Encoding strings (2524333 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin24]
Warming up --------------------------------------
        json (local)    62.000 i/100ms
Calculating -------------------------------------
        json (local)    662.248 (± 4.5%) i/s (1.51 ms/i) - 3.348k in 5.065760s
Normalize to 393501 byte

== Encoding strings (2524333 bytes)
ruby 3.4.1 (2024-12-25 revision 48d4efcb85) +YJIT +PRISM [arm64-darwin24]
Warming up --------------------------------------
       json (2.10.2)    43.000 i/100ms
Calculating -------------------------------------
       json (2.10.2)    425.189 (± 6.4%) i/s (2.35 ms/i) - 2.150k in 5.079077s
```

These benchmarks are run via a script
([link](https://gist.github.com/radiospiel/04019402726a28b31616df3d0c17bd1c))
which is based on the gem's `benchmark/encoder.rb` file. There are probably
better ways to run benchmarks :) My version allows combining multiple test
cases into a single one.

The `dumps` benchmark, which covers the JSON files in `benchmark/data/*.json`
(with the exception of `canada.json`), reported a speedup of ~13% on Apple M1.
---
 ext/json/ext/generator/generator.c              | 256 ++++++++++-
 ext/json/ext/generator/simd.h                   |   8 +
 ext/json/ext/vendor/postgres/COPYRIGHT          |  23 +
 ext/json/ext/vendor/postgres/README.md          |   3 +
 .../vendor/postgres/src/include/port/simd.h     | 422 ++++++++++++++++++
 5 files changed, 703 insertions(+), 9 deletions(-)
 create mode 100644 ext/json/ext/generator/simd.h
 create mode 100644 ext/json/ext/vendor/postgres/COPYRIGHT
 create mode 100644 ext/json/ext/vendor/postgres/README.md
 create mode 100644 ext/json/ext/vendor/postgres/src/include/port/simd.h

diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
index 08338853..ea1de418 100644
--- a/ext/json/ext/generator/generator.c
+++ b/ext/json/ext/generator/generator.c
@@ -414,6 +414,222 @@ static void convert_UTF8_to_ASCII_only_JSON(search_state *search, const unsigned
     }
 }
 
+/* Converts the input string (given as ptr and len) to a JSON string (without
+ * the wrapping '"' characters) in FBuffer buffer. ASCII control characters
+ * (0x00-0x1F), dquote, and backslash are escaped, but no other characters.
+ *
+ * This implementation is not suited for ascii_only and script_safe modes.
+ */
+#include "./simd.h"
+
+#define SIMD_BATCH_SIZE sizeof(Vector8)
+
+#define SIMD_MINIMAL_SIZE 8
+
+static inline bool needs_json_escaping(const char* ptr) {
+    Vector8 chunk;
+
+    vector8_load(&chunk, (const uint8 *)ptr);
+
+    /* Check for ASCII control characters (0x00-0x1F), dquote, and backslash. */
+    return vector8_has(chunk, '"') || vector8_has(chunk, '\\') || vector8_has_le(chunk, 0x1F);
+}
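For reference, the vector predicate above computes the same answer as the
following scalar loop over one chunk (a standalone sketch; the name
`needs_json_escaping_scalar` is illustrative and not part of the patch):

```c
#include <stdbool.h>
#include <stddef.h>

/* Scalar equivalent of needs_json_escaping() over n bytes: true iff any
 * byte must be escaped in plain (non-ascii_only, non-script_safe) mode. */
static bool needs_json_escaping_scalar(const char *ptr, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        unsigned char ch = (unsigned char)ptr[i];
        if (ch == '"' || ch == '\\' || ch <= 0x1F)
            return true;
    }
    return false;
}
```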
+
+
+static const char universal_escape_table[256] = {
+    // ASCII Control Characters
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    // ASCII Characters
+    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    // Continuation byte
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    // First byte of a 2-byte code point
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    // First byte of a 3-byte code point
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    // First byte of a 4+ byte code point
+    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
+};
+
+/* Converts the input string (given as ptr and len) to a JSON string
+ * (without the wrapping '"' characters) in FBuffer out_buffer.
+ *
+ * The following characters are JSON-escaped: ASCII control
+ * characters (0x00-0x1F), dquote, and backslash.
+ *
+ * Everything else (should be UTF-8) is just passed through and
+ * appended to the result.
+ */
+static void convert_UTF8_to_JSON_wo_simd(FBuffer *out_buffer, const char *ptr, unsigned long len)
+{
+    const char *hexdig = "0123456789abcdef";
+    char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
+
+    unsigned long beg = 0, pos = 0;
+
+#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
+
+    while (pos < len) {
+        unsigned char ch = ptr[pos];
+        unsigned char ch_len = universal_escape_table[ch];
+        /* JSON encoding */
+
+        if (RB_UNLIKELY(ch_len)) {
+            switch (ch_len) {
+                case 1: {
+                    FLUSH_POS(1);
+                    switch (ch) {
+                        case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
+                        case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
+                        case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
+                        case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
+                        case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
+                        case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
+                        case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
+                        case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
+                        default: {
+                            scratch[2] = '0';
+                            scratch[3] = '0';
+                            scratch[4] = hexdig[(ch >> 4) & 0xf];
+                            scratch[5] = hexdig[ch & 0xf];
+                            fbuffer_append(out_buffer, scratch, 6);
+                            break;
+                        }
+                    }
+                    break;
+                }
+                default:
+                    pos += ch_len;
+                    break;
+            }
+        } else {
+            pos++;
+        }
+    }
+#undef FLUSH_POS
+
+    if (beg < len) {
+        fbuffer_append(out_buffer, &ptr[beg], len - beg);
+    }
+}
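As a worked example of the `scratch` path above: a control character with no
short escape, e.g. 0x01, is emitted as the six characters `\u0001`. That
encoding step in isolation (a standalone sketch, names illustrative):

```c
#include <stdio.h>

int main(void)
{
    const char *hexdig = "0123456789abcdef";
    unsigned char ch = 0x01;              /* a control char without a short escape */
    char out[6] = { '\\', 'u', '0', '0', 0, 0 };

    out[4] = hexdig[(ch >> 4) & 0xf];     /* high nibble -> '0' */
    out[5] = hexdig[ch & 0xf];            /* low nibble  -> '1' */
    fwrite(out, 1, sizeof(out), stdout);  /* prints \u0001 */
    putchar('\n');
    return 0;
}
```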
+
+static inline void append_escaped_json_string_simd(FBuffer *buffer, const char *str, unsigned long len)
+{
+    if (len < SIMD_MINIMAL_SIZE) {
+        convert_UTF8_to_JSON_wo_simd(buffer, str, len);
+        return;
+    }
+
+    /* How many bytes can be processed using SIMD? Round 'len' down
+     * to the previous multiple of sizeof(Vector8), assuming that's a
+     * power of 2.
+     */
+    unsigned long vlen = len & (long) (~(SIMD_BATCH_SIZE - 1));
+
+    unsigned long copypos = 0, i = 0;
+    while (i < vlen) {
+        /*
+         * To speed this up, search sizeof(Vector8) bytes at once for
+         * special characters that we need to escape. When we find one, we
+         * fall out of this first loop, copy the parts we have vector
+         * searched so far, and then process the vector containing the
+         * special character byte-by-byte. Once we're done with that, we
+         * come back and try vector searching again. The tail end of the
+         * string is handled after this loop.
+         */
+        for (; i < vlen; i += SIMD_BATCH_SIZE) {
+            if (needs_json_escaping(str + i)) {
+                break;
+            }
+        }
+
+        /*
+         * Write to the destination up to the point that we have vector
+         * searched so far.
+         */
+        if (copypos < i) {
+            fbuffer_append(buffer, str + copypos, i - copypos);
+            copypos = i;
+        }
+
+        if (i < vlen) {
+            /* The current block needs escaping, so let's escape it. */
+            convert_UTF8_to_JSON_wo_simd(buffer, str + i, SIMD_BATCH_SIZE);
+            i += SIMD_BATCH_SIZE;
+            copypos = i;
+        }
+    }
+
+    /* Any bytes left over that didn't fit into a multiple of SIMD_BATCH_SIZE?
+     * If at least SIMD_MINIMAL_SIZE of them remain, we still check with SIMD
+     * whether they need escaping.
+     */
+    if (i == len) {
+        return;
+    }
+
+    const char* s = str + i;
+    unsigned cnt = (unsigned)(len - i);
+
+    if (cnt >= SIMD_MINIMAL_SIZE) {
+        /* Convert using SIMD, even though we have fewer than SIMD_BATCH_SIZE
+         * characters.
+         *
+         * We cannot read SIMD_BATCH_SIZE bytes from the source, but we need
+         * that many. So we copy the remaining input chars into the target
+         * buffer and fill up with 'X' bytes, which don't need escaping.
+         *
+         * If `needs_json_escaping` returns false, we already have the right
+         * bytes in the target. Otherwise we escape from the source `s` via
+         * `convert_UTF8_to_JSON_wo_simd` below.
+         */
+        fbuffer_inc_capa(buffer, SIMD_BATCH_SIZE);
+
+        memset(buffer->ptr + buffer->len, 'X', SIMD_BATCH_SIZE);
+        memcpy(buffer->ptr + buffer->len, s, cnt);
+
+        if (!needs_json_escaping(buffer->ptr + buffer->len)) {
+            buffer->len += cnt;
+            return;
+        }
+    }
+
+    convert_UTF8_to_JSON_wo_simd(buffer, s, cnt);
+}
+#undef SIMD_BATCH_SIZE
+
+/* Converts str to a JSON string (without the wrapping '"'
+ * characters) in FBuffer buffer.
+ *
+ * This function is only called with `ascii_only` and `script_safe` disabled.
+ * We escape ASCII control characters (0x00-0x1F), dquote, and backslash.
+ *
+ * Everything else (should be UTF-8) is just passed through and
+ * appended to the result.
+ */
+static inline void append_escaped_json_string(FBuffer *buffer, VALUE str) {
+    const char *ptr = RSTRING_PTR(str);
+    unsigned long len = RSTRING_LEN(str);
+
+    if (!len) {
+        return;
+    }
+
+    append_escaped_json_string_simd(buffer, ptr, len);
+
+    RB_GC_GUARD(str);
+}
+
 /*
  * Document-module: JSON::Ext::Generator
  *
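The tail handling above is the subtle part: the last `cnt < SIMD_BATCH_SIZE`
bytes cannot be vector-loaded from the source without reading out of bounds,
so the patch stages them in the output buffer padded with `'X'`. The same
idea in a self-contained scalar sketch (assuming a 16-byte batch; the name
`tail_needs_json_escaping` is illustrative):

```c
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Pad the last cnt (< 16) bytes with 'X' -- a byte that never needs JSON
 * escaping -- so a fixed-width 16-byte scan stays within bounds. */
static bool tail_needs_json_escaping(const char *s, size_t cnt)
{
    unsigned char pad[16];
    memset(pad, 'X', sizeof(pad));
    memcpy(pad, s, cnt);

    for (size_t i = 0; i < sizeof(pad); i++) {
        if (pad[i] == '"' || pad[i] == '\\' || pad[i] <= 0x1F)
            return true;
    }
    return false;
}
```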
@@ -966,22 +1182,44 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
 
     fbuffer_append_char(buffer, '"');
 
-    long len;
-    search_state search;
-    search.buffer = buffer;
-    RSTRING_GETMEM(obj, search.ptr, len);
-    search.cursor = search.ptr;
-    search.end = search.ptr + len;
-
     switch(rb_enc_str_coderange(obj)) {
         case ENC_CODERANGE_7BIT:
         case ENC_CODERANGE_VALID:
             if (RB_UNLIKELY(state->ascii_only)) {
+                long len;
+                search_state search;
+                search.buffer = buffer;
+                RSTRING_GETMEM(obj, search.ptr, len);
+                search.cursor = search.ptr;
+                search.end = search.ptr + len;
+
                 convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
             } else if (RB_UNLIKELY(state->script_safe)) {
-                convert_UTF8_to_script_safe_JSON(&search);
+                long len;
+                search_state search;
+                search.buffer = buffer;
+                RSTRING_GETMEM(obj, search.ptr, len);
+                search.cursor = search.ptr;
+                search.end = search.ptr + len;
+
+                convert_UTF8_to_script_safe_JSON(&search);
             } else {
-                convert_UTF8_to_JSON(&search);
+                append_escaped_json_string(buffer, obj);
             }
             break;
         default:
diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h
new file mode 100644
index 00000000..eddf3035
--- /dev/null
+++ b/ext/json/ext/generator/simd.h
@@ -0,0 +1,8 @@
+#include <stdint.h>
+
+typedef uint8_t uint8;
+typedef uint32_t uint32;
+#define Assert(_) (void)0
+
+#include "../vendor/postgres/src/include/port/simd.h"
+// https://github.com/postgres/postgres/blob/REL_17_4/src/include/port/simd.h
\ No newline at end of file
diff --git a/ext/json/ext/vendor/postgres/COPYRIGHT b/ext/json/ext/vendor/postgres/COPYRIGHT
new file mode 100644
index 00000000..be2d694b
--- /dev/null
+++ b/ext/json/ext/vendor/postgres/COPYRIGHT
@@ -0,0 +1,23 @@
+PostgreSQL Database Management System
+(formerly known as Postgres, then as Postgres95)
+
+Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+
+Portions Copyright (c) 1994, The Regents of the University of California
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose, without fee, and without a written agreement
+is hereby granted, provided that the above copyright notice and this
+paragraph and the following two paragraphs appear in all copies.
+
+IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
+DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
+ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
+PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
diff --git a/ext/json/ext/vendor/postgres/README.md b/ext/json/ext/vendor/postgres/README.md
new file mode 100644
index 00000000..55291e45
--- /dev/null
+++ b/ext/json/ext/vendor/postgres/README.md
@@ -0,0 +1,3 @@
+The contents of this directory are extracted from https://github.com/postgres/postgres
+
+They are licensed under the provisions in the COPYRIGHT file.
\ No newline at end of file
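To illustrate how the vendored header below is consumed through this shim,
here is a minimal standalone usage sketch (the include path and `main` are
illustrative; this compiles on x86-64 or aarch64 with NEON, where the header
selects a real vector type):

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef uint8_t uint8;
typedef uint32_t uint32;
#define Assert(_) (void)0

#include "vendor/postgres/src/include/port/simd.h"  /* adjust path as needed */

int main(void)
{
    char buf[sizeof(Vector8)];
    memset(buf, 'a', sizeof(buf));
    buf[sizeof(buf) - 1] = '"';       /* plant one byte that needs escaping */

    Vector8 chunk;
    vector8_load(&chunk, (const uint8 *)buf);

    /* Same predicate as needs_json_escaping() in the generator. */
    bool esc = vector8_has(chunk, '"')
            || vector8_has(chunk, '\\')
            || vector8_has_le(chunk, 0x1F);
    return esc ? 0 : 1;               /* exits 0: the quote is found */
}
```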
diff --git a/ext/json/ext/vendor/postgres/src/include/port/simd.h b/ext/json/ext/vendor/postgres/src/include/port/simd.h
new file mode 100644
index 00000000..597496f2
--- /dev/null
+++ b/ext/json/ext/vendor/postgres/src/include/port/simd.h
@@ -0,0 +1,422 @@
+/*-------------------------------------------------------------------------
+ *
+ * simd.h
+ *    Support for platform-specific vector operations.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/simd.h
+ *
+ * NOTES
+ * - VectorN in this file refers to a register where the element operands
+ * are N bits wide. The vector width is platform-specific, so users that care
+ * about that will need to inspect "sizeof(VectorN)".
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SIMD_H
+#define SIMD_H
+
+#if (defined(__x86_64__) || defined(_M_AMD64))
+/*
+ * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
+ * that compilers targeting this architecture understand SSE2 intrinsics.
+ *
+ * We use emmintrin.h rather than the comprehensive header immintrin.h in
+ * order to exclude extensions beyond SSE2. This is because MSVC, at least,
+ * will allow the use of intrinsics that haven't been enabled at compile
+ * time.
+ */
+#include <emmintrin.h>
+#define USE_SSE2
+typedef __m128i Vector8;
+typedef __m128i Vector32;
+
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+/*
+ * We use the Neon instructions if the compiler provides access to them (as
+ * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
+ * technically optional for aarch64, it appears that all available 64-bit
+ * hardware does have it. Neon exists in some 32-bit hardware too, but we
+ * could not realistically use it there without a run-time check, which seems
+ * not worth the trouble for now.
+ */
+#include <arm_neon.h>
+#define USE_NEON
+typedef uint8x16_t Vector8;
+typedef uint32x4_t Vector32;
+
+#else
+/*
+ * If no SIMD instructions are available, we can in some cases emulate vector
+ * operations using bitwise operations on unsigned integers. Note that many
+ * of the functions in this file presently do not have non-SIMD
+ * implementations. In particular, none of the functions involving Vector32
+ * are implemented without SIMD since it's likely not worthwhile to represent
+ * two 32-bit integers using a uint64.
+ */
+#define USE_NO_SIMD
+typedef uint64 Vector8;
+#endif
+
+/* load/store operations */
+static inline void vector8_load(Vector8 *v, const uint8 *s);
+#ifndef USE_NO_SIMD
+static inline void vector32_load(Vector32 *v, const uint32 *s);
+#endif
+
+/* assignment operations */
+static inline Vector8 vector8_broadcast(const uint8 c);
+#ifndef USE_NO_SIMD
+static inline Vector32 vector32_broadcast(const uint32 c);
+#endif
+
+/* element-wise comparisons to a scalar */
+static inline bool vector8_has(const Vector8 v, const uint8 c);
+static inline bool vector8_has_zero(const Vector8 v);
+static inline bool vector8_has_le(const Vector8 v, const uint8 c);
+static inline bool vector8_is_highbit_set(const Vector8 v);
+#ifndef USE_NO_SIMD
+static inline bool vector32_is_highbit_set(const Vector32 v);
+static inline uint32 vector8_highbit_mask(const Vector8 v);
+#endif
+
+/* arithmetic operations */
+static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
+#ifndef USE_NO_SIMD
+static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
+static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
+#endif
+
+/*
+ * comparisons between vectors
+ *
+ * Note: These return a vector rather than boolean, which is why we don't
+ * have non-SIMD implementations.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
+static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
+#endif
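The pure-C fallback mentioned above leans on classic byte-wise bit tricks
over a `uint64`. For example, the zero-byte detection that `vector8_has`
relies on in that path (XOR with a broadcast value, then look for a zero
byte) can be sketched standalone like this (a reference illustration, not
part of the header):

```c
#include <stdbool.h>
#include <stdint.h>

/* True iff any of the eight bytes packed into v is zero (the classic
 * "determine if a word has a zero byte" bit hack; no false positives). */
static bool u64_has_zero_byte(uint64_t v)
{
    return ((v - UINT64_C(0x0101010101010101)) & ~v
            & UINT64_C(0x8080808080808080)) != 0;
}

/* True iff any byte equals c: XOR maps matching bytes to zero. */
static bool u64_has_byte(uint64_t v, unsigned char c)
{
    uint64_t broadcast = UINT64_C(0x0101010101010101) * c;
    return u64_has_zero_byte(v ^ broadcast);
}
```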
+ */ +#ifndef USE_NO_SIMD +static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2); +static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2); +static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2); +#endif + +/* + * Load a chunk of memory into the given vector. + */ +static inline void +vector8_load(Vector8 *v, const uint8 *s) +{ +#if defined(USE_SSE2) + *v = _mm_loadu_si128((const __m128i *) s); +#elif defined(USE_NEON) + *v = vld1q_u8(s); +#else + memcpy(v, s, sizeof(Vector8)); +#endif +} + +#ifndef USE_NO_SIMD +static inline void +vector32_load(Vector32 *v, const uint32 *s) +{ +#ifdef USE_SSE2 + *v = _mm_loadu_si128((const __m128i *) s); +#elif defined(USE_NEON) + *v = vld1q_u32(s); +#endif +} +#endif /* ! USE_NO_SIMD */ + +/* + * Create a vector with all elements set to the same value. + */ +static inline Vector8 +vector8_broadcast(const uint8 c) +{ +#if defined(USE_SSE2) + return _mm_set1_epi8(c); +#elif defined(USE_NEON) + return vdupq_n_u8(c); +#else + return ~UINT64CONST(0) / 0xFF * c; +#endif +} + +#ifndef USE_NO_SIMD +static inline Vector32 +vector32_broadcast(const uint32 c) +{ +#ifdef USE_SSE2 + return _mm_set1_epi32(c); +#elif defined(USE_NEON) + return vdupq_n_u32(c); +#endif +} +#endif /* ! USE_NO_SIMD */ + +/* + * Return true if any elements in the vector are equal to the given scalar. + */ +static inline bool +vector8_has(const Vector8 v, const uint8 c) +{ + bool result; + + /* pre-compute the result for assert checking */ +#ifdef USE_ASSERT_CHECKING + bool assert_result = false; + + for (Size i = 0; i < sizeof(Vector8); i++) + { + if (((const uint8 *) &v)[i] == c) + { + assert_result = true; + break; + } + } +#endif /* USE_ASSERT_CHECKING */ + +#if defined(USE_NO_SIMD) + /* any bytes in v equal to c will evaluate to zero via XOR */ + result = vector8_has_zero(v ^ vector8_broadcast(c)); +#else + result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c))); +#endif + + Assert(assert_result == result); + return result; +} + +/* + * Convenience function equivalent to vector8_has(v, 0) + */ +static inline bool +vector8_has_zero(const Vector8 v) +{ +#if defined(USE_NO_SIMD) + /* + * We cannot call vector8_has() here, because that would lead to a + * circular definition. + */ + return vector8_has_le(v, 0); +#else + return vector8_has(v, 0); +#endif +} + +/* + * Return true if any elements in the vector are less than or equal to the + * given scalar. + */ +static inline bool +vector8_has_le(const Vector8 v, const uint8 c) +{ + bool result = false; + + /* pre-compute the result for assert checking */ +#ifdef USE_ASSERT_CHECKING + bool assert_result = false; + + for (Size i = 0; i < sizeof(Vector8); i++) + { + if (((const uint8 *) &v)[i] <= c) + { + assert_result = true; + break; + } + } +#endif /* USE_ASSERT_CHECKING */ + +#if defined(USE_NO_SIMD) + + /* + * To find bytes <= c, we can use bitwise operations to find bytes < c+1, + * but it only works if c+1 <= 128 and if the highest bit in v is not set. + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord + */ + if ((int64) v >= 0 && c < 0x80) + result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80); + else + { + /* one byte at a time */ + for (Size i = 0; i < sizeof(Vector8); i++) + { + if (((const uint8 *) &v)[i] <= c) + { + result = true; + break; + } + } + } +#else + + /* + * Use saturating subtraction to find bytes <= c, which will present as + * NUL bytes. 
+
+/*
+ * Return true if the high bit of any element is set
+ */
+static inline bool
+vector8_is_highbit_set(const Vector8 v)
+{
+#ifdef USE_SSE2
+    return _mm_movemask_epi8(v) != 0;
+#elif defined(USE_NEON)
+    return vmaxvq_u8(v) > 0x7F;
+#else
+    return v & vector8_broadcast(0x80);
+#endif
+}
+
+/*
+ * Exactly like vector8_is_highbit_set except for the input type, so it
+ * looks at each byte separately.
+ *
+ * XXX x86 uses the same underlying type for 8-bit, 16-bit, and 32-bit
+ * integer elements, but Arm does not, hence the need for a separate
+ * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
+ * check each 32-bit element, but that would require an additional mask
+ * operation on x86.
+ */
+#ifndef USE_NO_SIMD
+static inline bool
+vector32_is_highbit_set(const Vector32 v)
+{
+#if defined(USE_NEON)
+    return vector8_is_highbit_set((Vector8) v);
+#else
+    return vector8_is_highbit_set(v);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return a bitmask formed from the high-bit of each element.
+ */
+#ifndef USE_NO_SIMD
+static inline uint32
+vector8_highbit_mask(const Vector8 v)
+{
+#ifdef USE_SSE2
+    return (uint32) _mm_movemask_epi8(v);
+#elif defined(USE_NEON)
+    /*
+     * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
+     * returns a uint64, making it inconvenient to combine mask values from
+     * multiple vectors.
+     */
+    static const uint8 mask[16] = {
+        1 << 0, 1 << 1, 1 << 2, 1 << 3,
+        1 << 4, 1 << 5, 1 << 6, 1 << 7,
+        1 << 0, 1 << 1, 1 << 2, 1 << 3,
+        1 << 4, 1 << 5, 1 << 6, 1 << 7,
+    };
+
+    uint8x16_t masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
+    uint8x16_t maskedhi = vextq_u8(masked, masked, 8);
+
+    return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return the bitwise OR of the inputs
+ */
+static inline Vector8
+vector8_or(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_or_si128(v1, v2);
+#elif defined(USE_NEON)
+    return vorrq_u8(v1, v2);
+#else
+    return v1 | v2;
+#endif
+}
+
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_or(const Vector32 v1, const Vector32 v2)
+{
+#ifdef USE_SSE2
+    return _mm_or_si128(v1, v2);
+#elif defined(USE_NEON)
+    return vorrq_u32(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return the result of subtracting the respective elements of the input
+ * vectors using saturation (i.e., if the operation would yield a value less
+ * than zero, zero is returned instead). For more information on saturation
+ * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_ssub(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_subs_epu8(v1, v2);
+#elif defined(USE_NEON)
+    return vqsubq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return a vector with all bits set in each lane where the corresponding
+ * lanes in the inputs are equal.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_eq(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_cmpeq_epi8(v1, v2);
+#elif defined(USE_NEON)
+    return vceqq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
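The NEON branch of `vector8_highbit_mask` above emulates what x86 gets from
a single `pmovmskb` instruction. Its intended result, expressed as a scalar
reference (illustrative, not part of the header):

```c
#include <stdint.h>
#include <stddef.h>

/* Collect the high bit of each of n (<= 32) bytes into one bitmask, with
 * bit i set iff byte i has its high bit set -- the contract of
 * vector8_highbit_mask(). */
static uint32_t highbit_mask_scalar(const uint8_t *v, size_t n)
{
    uint32_t mask = 0;
    for (size_t i = 0; i < n; i++)
        mask |= (uint32_t)(v[i] >> 7) << i;
    return mask;
}
```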
+
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_eq(const Vector32 v1, const Vector32 v2)
+{
+#ifdef USE_SSE2
+    return _mm_cmpeq_epi32(v1, v2);
+#elif defined(USE_NEON)
+    return vceqq_u32(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Given two vectors, return a vector with the minimum element of each.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_min(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_min_epu8(v1, v2);
+#elif defined(USE_NEON)
+    return vminq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+#endif /* SIMD_H */
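One portability note: on platforms where this header falls back to
`USE_NO_SIMD`, it also references `uint64`, `int64`, `Size`, and
`UINT64CONST`, which the `simd.h` shim earlier in this patch does not
define. A fuller shim covering that path might look like this (a sketch
under that assumption, not part of the patch):

```c
/* ext/json/ext/generator/simd.h -- extended sketch covering the pure-C path */
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

typedef uint8_t  uint8;
typedef uint32_t uint32;
typedef uint64_t uint64;           /* used by the USE_NO_SIMD fallback */
typedef int64_t  int64;            /* used by vector8_has_le's bit hack */
typedef size_t   Size;             /* used by the assert-checking loops */
#define UINT64CONST(x) UINT64_C(x)
#define Assert(_) (void)0

#include "../vendor/postgres/src/include/port/simd.h"
```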