Skip to content

Commit d745bd2

Browse files
authored
perf: fix SIMD-inlining (#131)
Drastically improving throughput on larger inputs (3x+ for large URIs or header-values)
1 parent 5dd152e commit d745bd2

File tree

2 files changed

+23
-22
lines changed

2 files changed

+23
-22
lines changed

src/simd/avx2.rs

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,13 @@ pub enum Scan {
77
TooShort,
88
}
99

10+
#[cfg(target_arch = "x86")]
11+
unsafe fn parse_uri_batch_32(_: &[u8]) -> usize {
12+
unreachable!("AVX2 detection should be disabled for x86");
13+
}
1014

15+
#[cfg(target_arch = "x86_64")]
16+
#[target_feature(enable = "avx2")]
1117
pub unsafe fn parse_uri_batch_32(bytes: &mut Bytes) -> Scan {
1218
while bytes.as_ref().len() >= 32 {
1319
let advance = match_url_char_32_avx(bytes.as_ref());
@@ -20,9 +26,7 @@ pub unsafe fn parse_uri_batch_32(bytes: &mut Bytes) -> Scan {
2026
Scan::TooShort
2127
}
2228

23-
#[cfg(target_arch = "x86_64")]
24-
#[target_feature(enable = "avx2")]
25-
#[inline]
29+
#[inline(always)]
2630
#[allow(non_snake_case, overflowing_literals)]
2731
unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
2832
debug_assert!(buf.len() >= 32);
@@ -59,16 +63,18 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
5963
let bits = _mm256_and_si256(_mm256_shuffle_epi8(ARF, cols), rbms);
6064

6165
let v = _mm256_cmpeq_epi8(bits, _mm256_setzero_si256());
62-
let r = 0xffff_ffff_0000_0000 | _mm256_movemask_epi8(v) as u64;
66+
let r = _mm256_movemask_epi8(v) as u32;
6367

64-
_tzcnt_u64(r) as usize
68+
r.trailing_zeros() as usize
6569
}
6670

6771
#[cfg(target_arch = "x86")]
68-
unsafe fn match_url_char_32_avx(_: &[u8]) -> usize {
72+
unsafe fn match_header_value_batch_32(_: &[u8]) -> usize {
6973
unreachable!("AVX2 detection should be disabled for x86");
7074
}
7175

76+
#[cfg(target_arch = "x86_64")]
77+
#[target_feature(enable = "avx2")]
7278
pub unsafe fn match_header_value_batch_32(bytes: &mut Bytes) -> Scan {
7379
while bytes.as_ref().len() >= 32 {
7480
let advance = match_header_value_char_32_avx(bytes.as_ref());
@@ -81,9 +87,7 @@ pub unsafe fn match_header_value_batch_32(bytes: &mut Bytes) -> Scan {
8187
Scan::TooShort
8288
}
8389

84-
#[cfg(target_arch = "x86_64")]
85-
#[target_feature(enable = "avx2")]
86-
#[inline]
90+
#[inline(always)]
8791
#[allow(non_snake_case)]
8892
unsafe fn match_header_value_char_32_avx(buf: &[u8]) -> usize {
8993
debug_assert!(buf.len() >= 32);
@@ -109,14 +113,9 @@ unsafe fn match_header_value_char_32_avx(buf: &[u8]) -> usize {
109113
let del = _mm256_cmpeq_epi8(dat, DEL);
110114
let bit = _mm256_andnot_si256(del, _mm256_or_si256(low, tab));
111115
let rev = _mm256_cmpeq_epi8(bit, _mm256_setzero_si256());
112-
let res = 0xffff_ffff_0000_0000 | _mm256_movemask_epi8(rev) as u64;
116+
let res = _mm256_movemask_epi8(rev) as u32;
113117

114-
_tzcnt_u64(res) as usize
115-
}
116-
117-
#[cfg(target_arch = "x86")]
118-
unsafe fn match_header_value_char_32_avx(_: &[u8]) -> usize {
119-
unreachable!("AVX2 detection should be disabled for x86");
118+
res.trailing_zeros() as usize
120119
}
121120

122121
#[test]

src/simd/sse42.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use crate::iter::Bytes;
22

3+
#[target_feature(enable = "sse4.2")]
34
pub unsafe fn parse_uri_batch_16(bytes: &mut Bytes) {
45
while bytes.as_ref().len() >= 16 {
56
let advance = match_url_char_16_sse(bytes.as_ref());
@@ -11,7 +12,7 @@ pub unsafe fn parse_uri_batch_16(bytes: &mut Bytes) {
1112
}
1213
}
1314

14-
#[target_feature(enable = "sse4.2")]
15+
#[inline(always)]
1516
#[allow(non_snake_case, overflowing_literals)]
1617
unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
1718
debug_assert!(buf.len() >= 16);
@@ -54,11 +55,12 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
5455
let bits = _mm_and_si128(_mm_shuffle_epi8(ARF, cols), rbms);
5556

5657
let v = _mm_cmpeq_epi8(bits, _mm_setzero_si128());
57-
let r = 0xffff_0000 | _mm_movemask_epi8(v) as u32;
58+
let r = _mm_movemask_epi8(v) as u16;
5859

59-
_tzcnt_u32(r) as usize
60+
r.trailing_zeros() as usize
6061
}
6162

63+
#[target_feature(enable = "sse4.2")]
6264
pub unsafe fn match_header_value_batch_16(bytes: &mut Bytes) {
6365
while bytes.as_ref().len() >= 16 {
6466
let advance = match_header_value_char_16_sse(bytes.as_ref());
@@ -70,7 +72,7 @@ pub unsafe fn match_header_value_batch_16(bytes: &mut Bytes) {
7072
}
7173
}
7274

73-
#[target_feature(enable = "sse4.2")]
75+
#[inline(always)]
7476
#[allow(non_snake_case)]
7577
unsafe fn match_header_value_char_16_sse(buf: &[u8]) -> usize {
7678
debug_assert!(buf.len() >= 16);
@@ -94,9 +96,9 @@ unsafe fn match_header_value_char_16_sse(buf: &[u8]) -> usize {
9496
let del = _mm_cmpeq_epi8(dat, DEL);
9597
let bit = _mm_andnot_si128(del, _mm_or_si128(low, tab));
9698
let rev = _mm_cmpeq_epi8(bit, _mm_setzero_si128());
97-
let res = 0xffff_0000 | _mm_movemask_epi8(rev) as u32;
99+
let res = _mm_movemask_epi8(rev) as u16;
98100

99-
_tzcnt_u32(res) as usize
101+
res.trailing_zeros() as usize
100102
}
101103

102104
#[test]

0 commit comments

Comments
 (0)