Skip to content

Commit 5dd152e

Browse files
authored
perf: improve non-SIMD with wordwise validation (#123)
A first pass at improving non-SIMD performance. On aarch64/M1 we observe a ~5x asymptotic improvement in URI parsing and ~2.5x in header names & values. We also observe a 40% time reduction in the `req/req` bench. I briefly benched it on x86_64, where we see a 20% time reduction in the `req/req` bench for non-SIMD. This needs more testing on x86_64, and we should possibly only enable these fast paths if SIMD is off.
1 parent 45b60fe commit 5dd152e

File tree

1 file changed

+202
-38
lines changed

1 file changed

+202
-38
lines changed

src/lib.rs

Lines changed: 202 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,61 @@ fn is_uri_token(b: u8) -> bool {
9191
URI_MAP[b as usize]
9292
}
9393

94+
// A const alternative to `u64::from_ne_bytes` to avoid bumping MSRV (1.36 => 1.44).
95+
// Creates a u64 whose eight bytes are each equal to `b`,
// e.g. `uniform_block(0x21) == 0x2121_2121_2121_2121`.
// Since every byte is identical, the value is the same regardless of endianness.
96+
const fn uniform_block(b: u8) -> u64 {
97+
// Multiplying by a word made of eight 0x01 bytes copies `b` into every byte lane;
// the product cannot overflow because `b <= 0xFF`.
b as u64 * 0x01_01_01_01_01_01_01_01 // [1_u8; 8]
98+
}
99+
100+
// A byte-wise range-check on an entire word/block,
101+
// checking whether all 8 bytes in the word satisfy
102+
// `33 <= x <= 126 && x != '>' && x != '<'`.
103+
// It is conservative about '?': a block containing '?' is reported as failing at
// that byte, even though the condition above allows '?'.
//
// Returns the index (0..=7, in memory order) of the first flagged byte,
// or 8 when no byte is flagged.
//
// NOTE(review): the wrapping add/sub below operate on the whole u64, so a
// carry/borrow out of one byte lane can also flag the adjacent more-significant
// lane. A flagged byte should therefore be re-checked with `is_uri_token`
// (as the `parse_uri` loop does) rather than rejected outright.
104+
#[inline]
105+
fn validate_uri_block(block: [u8; 8]) -> usize {
106+
// 33 <= x <= 126
107+
const M: u8 = 0x21;
108+
const N: u8 = 0x7E;
109+
const BM: u64 = uniform_block(M);
110+
const BN: u64 = uniform_block(127-N);
111+
// 0x80 in every byte lane: selects the high bit of each lane, which is used as the flag bit
const M128: u64 = uniform_block(128);
112+
113+
let x = u64::from_ne_bytes(block); // Really just a transmute
114+
let lt = x.wrapping_sub(BM) & !x; // high bit set in each lane where x < M
115+
let gt = x.wrapping_add(BN) | x; // high bit set in each lane where x > N
116+
117+
// XOR checks to catch '<' & '>' for correctness
118+
//
119+
// XOR can be thought of as a "distance function"
120+
// (somewhat extrapolating from the identities `xor(x, x) = 0` and `∀ x != y: xor(x, y) != 0`;
121+
// each u8 "xor key" provides a unique total ordering of u8).
122+
// '<' and '>' have a "xor distance" of 2 (`xor('<', '>') = 2`)
123+
// xor(x, '>') <= 2 => {'>', '?', '<'}
124+
// xor(x, '<') <= 2 => {'<', '=', '>'}
125+
//
126+
// We assume P('=') > P('?'),
127+
// given well/commonly-formatted URLs with querystrings contain
128+
// a single '?' but possibly many '='
129+
//
130+
// Thus it's preferable/near-optimal to "xor distance" on '>',
131+
// since we'll slowpath at most one block per URL
132+
//
133+
// Some rust code to sanity check this yourself:
134+
// ```rs
135+
// fn xordist(x: u8, n: u8) -> Vec<(char, u8)> {
136+
//     (0..=255).into_iter().map(|c| (c as char, c ^ x)).filter(|(_c, y)| *y <= n).collect()
137+
// }
138+
// (xordist(b'<', 2), xordist(b'>', 2))
139+
// ```
140+
const B3: u64 = uniform_block(3); // 3 = (max distance 2) + 1, so subtracting it wraps for distances 0..=2
141+
const BGT: u64 = uniform_block(b'>');
142+
143+
// per-lane xor distance to '>' (xor never carries between lanes)
let xgt = x ^ BGT;
144+
// same "less than" trick as `lt`: high bit set in lanes holding '<', '>' or '?'
let ltgtq = xgt.wrapping_sub(B3) & !xgt;
145+
146+
// keep only the per-lane flag bits, then locate the first flagged lane
offsetnz((ltgtq | lt | gt) & M128)
147+
}
148+
94149
static HEADER_NAME_MAP: [bool; 256] = byte_map![
95150
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96151
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -140,6 +195,41 @@ fn is_header_value_token(b: u8) -> bool {
140195
HEADER_VALUE_MAP[b as usize]
141196
}
142197

198+
// A byte-wise range-check on an entire word/block,
199+
// checking whether all 8 bytes in the word satisfy `32 <= x <= 126`.
//
// Returns the index (0..=7, in memory order) of the first flagged byte,
// or 8 when no byte is flagged.
//
// NOTE(review): the wrapping add/sub below operate on the whole u64, so a
// carry/borrow out of one byte lane can also flag the adjacent more-significant
// lane. A flagged byte should therefore be re-checked with `is_header_value_token`
// (as the 'value_line loop does) rather than rejected outright.
200+
#[inline]
201+
fn validate_header_value_block(block: [u8; 8]) -> usize {
202+
// 32 <= x <= 126
203+
const M: u8 = 0x20;
204+
const N: u8 = 0x7E;
205+
const BM: u64 = uniform_block(M);
206+
const BN: u64 = uniform_block(127-N);
207+
// 0x80 in every byte lane: selects the high bit of each lane, which is used as the flag bit
const M128: u64 = uniform_block(128);
208+
209+
let x = u64::from_ne_bytes(block); // Really just a transmute
210+
let lt = x.wrapping_sub(BM) & !x; // high bit set in each lane where x < M
211+
let gt = x.wrapping_add(BN) | x; // high bit set in each lane where x > N
212+
// keep only the per-lane flag bits, then locate the first flagged lane
offsetnz((lt | gt) & M128)
213+
}
214+
215+
#[inline]
216+
/// Check block to find offset of first non-zero byte.
///
/// Bytes are examined in native (memory) order, i.e. the order produced by
/// `u64::to_ne_bytes`. Returns that byte's index (0..=7), or 8 if `block` is zero.
217+
// NOTE: Curiously `block.trailing_zeros() >> 3` appears to be slower, maybe revisit
// (that expression counts from the least-significant byte, so it would only
// match this function's result on little-endian targets)
218+
fn offsetnz(block: u64) -> usize {
219+
// fast path optimistic case (common for long valid sequences)
220+
if block == 0 {
221+
return 8;
222+
}
223+
224+
// perf: rust will unroll this loop
225+
for (i, b) in block.to_ne_bytes().iter().copied().enumerate() {
226+
if b != 0 {
227+
return i;
228+
}
229+
}
230+
// `block != 0` here, so at least one byte is non-zero and the loop above has returned
unreachable!()
231+
}
232+
143233
/// An error in parsing.
144234
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
145235
pub enum Error {
@@ -742,11 +832,18 @@ pub const EMPTY_HEADER: Header<'static> = Header { name: "", value: b"" };
742832
// WARNING: Exported for internal benchmarks, not fit for public consumption
743833
pub fn parse_version(bytes: &mut Bytes) -> Result<u8> {
744834
if let Some(eight) = bytes.peek_n::<[u8; 8]>(8) {
835+
// NOTE: should be const once MSRV >= 1.44
836+
let h10: u64 = u64::from_ne_bytes(*b"HTTP/1.0");
837+
let h11: u64 = u64::from_ne_bytes(*b"HTTP/1.1");
745838
unsafe { bytes.advance(8); }
746-
return match &eight {
747-
b"HTTP/1.0" => Ok(Status::Complete(0)),
748-
b"HTTP/1.1" => Ok(Status::Complete(1)),
749-
_ => Err(Error::Version),
839+
let block = u64::from_ne_bytes(eight);
840+
// NOTE: should be match once h10 & h11 are consts
841+
return if block == h10 {
842+
Ok(Status::Complete(0))
843+
} else if block == h11 {
844+
Ok(Status::Complete(1))
845+
} else {
846+
Err(Error::Version)
750847
}
751848
}
752849

@@ -871,20 +968,29 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
871968

872969
simd::match_uri_vectored(bytes);
873970

971+
let mut b;
874972
loop {
875-
let b = next!(bytes);
876-
if b == b' ' {
877-
return Ok(Status::Complete(unsafe {
878-
// all bytes up till `i` must have been `is_token`.
879-
str::from_utf8_unchecked(bytes.slice_skip(1))
880-
}));
881-
} else if !is_uri_token(b) {
882-
return Err(Error::Token);
973+
if let Some(bytes8) = bytes.peek_n::<[u8; 8]>(8) {
974+
let n = validate_uri_block(bytes8);
975+
unsafe { bytes.advance(n); }
976+
if n == 8 { continue; }
977+
}
978+
b = next!(bytes);
979+
if !is_uri_token(b) {
980+
break;
883981
}
884982
}
983+
984+
if b == b' ' {
985+
return Ok(Status::Complete(unsafe {
986+
// all bytes up till `i` must have been `is_token`.
987+
str::from_utf8_unchecked(bytes.slice_skip(1))
988+
}));
989+
} else {
990+
return Err(Error::Token);
991+
}
885992
}
886993

887-
888994
#[inline]
889995
fn parse_code(bytes: &mut Bytes<'_>) -> Result<u16> {
890996
let hundreds = expect!(bytes.next() == b'0'..=b'9' => Err(Error::Status));
@@ -1066,11 +1172,35 @@ fn parse_headers_iter_uninit<'a, 'b>(
10661172
}
10671173

10681174
// parse header name until colon
1175+
let mut b;
10691176
let header_name: &str = 'name: loop {
1070-
let mut b = next!(bytes);
1177+
'name_inner: loop {
1178+
if let Some(bytes8) = bytes.peek_n::<[u8; 8]>(8) {
1179+
macro_rules! check {
1180+
($bytes:ident, $i:literal) => ({
1181+
b = $bytes[$i];
1182+
if !is_header_name_token(b) {
1183+
unsafe { bytes.advance($i + 1); }
1184+
break 'name_inner;
1185+
}
1186+
});
1187+
}
10711188

1072-
if is_header_name_token(b) {
1073-
continue 'name;
1189+
check!(bytes8, 0);
1190+
check!(bytes8, 1);
1191+
check!(bytes8, 2);
1192+
check!(bytes8, 3);
1193+
check!(bytes8, 4);
1194+
check!(bytes8, 5);
1195+
check!(bytes8, 6);
1196+
check!(bytes8, 7);
1197+
unsafe { bytes.advance(8); }
1198+
} else {
1199+
b = next!(bytes);
1200+
if !is_header_name_token(b) {
1201+
break 'name_inner;
1202+
}
1203+
}
10741204
}
10751205

10761206
count += bytes.pos();
@@ -1135,29 +1265,10 @@ fn parse_headers_iter_uninit<'a, 'b>(
11351265

11361266
'value_line: loop {
11371267
if let Some(bytes8) = bytes.peek_n::<[u8; 8]>(8) {
1138-
macro_rules! check {
1139-
($bytes:ident, $i:literal) => ({
1140-
b = $bytes[$i];
1141-
if !is_header_value_token(b) {
1142-
unsafe { bytes.advance($i + 1); }
1143-
break 'value_line;
1144-
}
1145-
});
1146-
}
1147-
1148-
check!(bytes8, 0);
1149-
check!(bytes8, 1);
1150-
check!(bytes8, 2);
1151-
check!(bytes8, 3);
1152-
check!(bytes8, 4);
1153-
check!(bytes8, 5);
1154-
check!(bytes8, 6);
1155-
check!(bytes8, 7);
1156-
unsafe { bytes.advance(8); }
1157-
1158-
continue 'value_line;
1268+
let n = validate_header_value_block(bytes8);
1269+
unsafe { bytes.advance(n); }
1270+
if n == 8 { continue 'value_line; }
11591271
}
1160-
11611272
b = next!(bytes);
11621273
if !is_header_value_token(b) {
11631274
break 'value_line;
@@ -1292,6 +1403,7 @@ pub fn parse_chunk_size(buf: &[u8])
12921403
#[cfg(test)]
12931404
mod tests {
12941405
use super::{Request, Response, Status, EMPTY_HEADER, parse_chunk_size};
1406+
use super::{offsetnz, validate_header_value_block, validate_uri_block};
12951407

12961408
const NUM_OF_HEADERS: usize = 4;
12971409

@@ -2257,4 +2369,56 @@ mod tests {
22572369
assert_eq!(response.headers[0].name, "Bread");
22582370
assert_eq!(response.headers[0].value, &b"baguette"[..]);
22592371
}
2372+
2373+
#[test]
2374+
fn test_is_header_value_block() {
2375+
// A block is accepted when no byte is flagged, i.e. the returned offset is 8.
let is_header_value_block = |b| validate_header_value_block(b) == 8;
2376+
2377+
// Uniform blocks (all 8 bytes equal) for every possible byte value:
// 0..32 => false
2378+
for b in 0..32_u8 {
2379+
assert_eq!(is_header_value_block([b; 8]), false, "b={}", b);
2380+
}
2381+
// 32..127 => true
2382+
for b in 32..127_u8 {
2383+
assert_eq!(is_header_value_block([b; 8]), true, "b={}", b);
2384+
}
2385+
// 127..=255 => false
2386+
for b in 127..=255_u8 {
2387+
assert_eq!(is_header_value_block([b; 8]), false, "b={}", b);
2388+
}
2389+
2390+
// A few sanity checks on non-uniform bytes for good measure
// (each block contains a '\n' and/or '\r', both below 32)
2391+
assert!(!is_header_value_block(*b"foo.com\n"));
2392+
assert!(!is_header_value_block(*b"o.com\r\nU"));
2393+
}
2394+
2395+
#[test]
2396+
fn test_is_uri_block() {
2397+
// A block is accepted when no byte is flagged, i.e. the returned offset is 8.
let is_uri_block = |b| validate_uri_block(b) == 8;
2398+
2399+
// Uniform blocks (all 8 bytes equal) for every possible byte value:
// 0..33 => false
2400+
for b in 0..33_u8 {
2401+
assert_eq!(is_uri_block([b; 8]), false, "b={}", b);
2402+
}
2403+
// 33..127 => true if b not in { '<', '?', '>' }
// ('?' is included because the block check deliberately flags it along with '<' and '>')
2404+
let falsy = |b| b"<?>".contains(&b);
2405+
for b in 33..127_u8 {
2406+
assert_eq!(is_uri_block([b; 8]), !falsy(b), "b={}", b);
2407+
}
2408+
// 127..=255 => false
2409+
for b in 127..=255_u8 {
2410+
assert_eq!(is_uri_block([b; 8]), false, "b={}", b);
2411+
}
2412+
}
2413+
2414+
#[test]
2414+
fn test_offsetnz() {
2415+
// Start from an all-zero block and set exactly one byte at each position in turn;
// `offsetnz` must report that position.
let seq = [0_u8; 8];
2416+
for i in 0..8 {
2417+
let mut seq = seq.clone();
2418+
seq[i] = 1;
2419+
// build the word with native byte order, matching `to_ne_bytes` inside `offsetnz`
let x = u64::from_ne_bytes(seq);
2420+
assert_eq!(offsetnz(x), i);
2421+
}
2422+
}
22602424
}

0 commit comments

Comments
 (0)