Refactor SIMD pre-filter to improve handling of AVX-512 mask types
althonos committed Nov 20, 2023
1 parent e14af3d commit 7c5a73d
Showing 6 changed files with 56 additions and 41 deletions.
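For context: AVX-512 byte comparisons such as _mm512_cmpeq_epi8_mask return a 64-bit mask in a k register (__mmask64) rather than a 512-bit vector, so the previous simd_eq had to widen the mask back into a vector with _mm512_movm_epi8 after every single comparison. This commit moves the boolean operations behind a dedicated mask_t type, so the pre-filter's logic runs directly on k registers and the widening happens once per store, through mask_convert. On every other backend mask_t aliases the vector type and mask_convert is the identity. A minimal standalone sketch of the difference (illustrative code, not from this commit; the helper names are invented; compile with -mavx512bw):

#include <immintrin.h>

/* Old scheme: every compare paid a k-register -> vector conversion. */
static inline __m512i eq_widened(__m512i x, __m512i y) {
    return _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(x, y));
}

/* New scheme: compares stay in __mmask64, combine with k-register
 * instructions, and only the final result is widened for the store. */
static inline void store_both_equal(__m512i a, __m512i b, __m512i c,
                                    char* out) {
    __mmask64 m1 = _mm512_cmpeq_epi8_mask(a, b);   /* simd_eq      */
    __mmask64 m2 = _mm512_cmpeq_epi8_mask(b, c);   /* simd_eq      */
    __mmask64 m  = _kand_mask64(m1, m2);           /* mask_and     */
    _mm512_storeu_si512((void*) out,
                        _mm512_movm_epi8(m));      /* mask_convert */
}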
9 changes: 6 additions & 3 deletions pyrodigal/impl/avx.c
@@ -12,10 +12,13 @@
 #define simd_load(m) _mm256_load_si256((__m256i*) (m))
 #define simd_store(x, m) _mm256_store_si256((__m256i*) (m), x)
 #define simd_set1(x) _mm256_set1_epi8(x)
-#define simd_or(x, y) _mm256_or_si256(x, y)
 #define simd_eq(x, y) _mm256_cmpeq_epi8(x, y)
-#define simd_and(x, y) _mm256_and_si256(x, y)
-#define simd_andnot(x, y) _mm256_andnot_si256(y, x)
+
+#define mask_t __m256i
+#define mask_or(x, y) _mm256_or_si256(x, y)
+#define mask_and(x, y) _mm256_and_si256(x, y)
+#define mask_andnot(x, y) _mm256_andnot_si256(y, x)
+#define mask_convert(x) (x)
 
 #define SIMD_LANES 32
 #define SIMD_MASK 0x1F
11 changes: 7 additions & 4 deletions pyrodigal/impl/avx512.c
@@ -12,10 +12,13 @@
 #define simd_load(m) _mm512_load_si512((__m512i*) (m))
 #define simd_store(x, m) _mm512_store_si512((__m512i*) (m), x)
 #define simd_set1(x) _mm512_set1_epi8(x)
-#define simd_or(x, y) _mm512_or_si512(x, y)
-#define simd_eq(x, y) _mm512_movm_epi8(_mm512_cmpeq_epi8_mask(x, y))
-#define simd_and(x, y) _mm512_and_si512(x, y)
-#define simd_andnot(x, y) _mm512_andnot_si512(y, x)
+#define simd_eq(x, y) _mm512_cmpeq_epi8_mask(x, y)
+
+#define mask_t __mmask64
+#define mask_or(x, y) _kor_mask64(x, y)
+#define mask_and(x, y) _kand_mask64(x, y)
+#define mask_andnot(x, y) _kandn_mask64(y, x)
+#define mask_convert(x) _mm512_movm_epi8(x)
 
 #define SIMD_LANES 64
 #define SIMD_MASK 0x3F
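One subtlety worth noting in these definitions: the x86 andnot intrinsics compute ~a & b, so both the old simd_andnot(x, y) and the new mask_andnot(x, y) swap their operands to get x & ~y, matching NEON's vbicq_u8(x, y) directly. A toy check of the k-register variant (illustrative only, not from the repository; requires AVX512BW):

#include <immintrin.h>
#include <stdio.h>

#define mask_andnot(x, y) _kandn_mask64(y, x)  /* x & ~y */

int main(void) {
    __mmask64 x = 0xFF00FF00FF00FF00ULL;
    __mmask64 y = 0x0F0F0F0F0F0F0F0FULL;
    /* _kandn_mask64(a, b) computes (~a) & b; swapping the arguments
     * yields x & ~y = 0xF000F000F000F000. */
    printf("%016llx\n", (unsigned long long) mask_andnot(x, y));
    return 0;
}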
9 changes: 6 additions & 3 deletions pyrodigal/impl/mmx.c
@@ -12,10 +12,13 @@
 #define simd_load(m) (((__m64*) (m))[0])
 #define simd_store(x, m) (((__m64*) (m))[0] = x)
 #define simd_set1(x) _mm_set1_pi8(x)
-#define simd_or(x, y) _mm_or_si64(x, y)
 #define simd_eq(x, y) _mm_cmpeq_pi8(x, y)
-#define simd_and(x, y) _mm_and_si64(x, y)
-#define simd_andnot(x, y) _mm_andnot_si64(y, x)
+
+#define mask_t __m64
+#define mask_or(x, y) _mm_or_si64(x, y)
+#define mask_and(x, y) _mm_and_si64(x, y)
+#define mask_andnot(x, y) _mm_andnot_si64(y, x)
+#define mask_convert(x) (x)
 
 #define SIMD_LANES 8
 #define SIMD_MASK 0x7
9 changes: 6 additions & 3 deletions pyrodigal/impl/neon.c
@@ -12,10 +12,13 @@
 #define simd_load(m) vld1q_u8((uint8_t*) (m))
 #define simd_store(x, m) vst1q_u8((uint8_t*) (m), x)
 #define simd_set1(x) vdupq_n_u8(x)
-#define simd_or(x, y) vorrq_u8(x, y)
 #define simd_eq(x, y) vceqq_u8(x, y)
-#define simd_and(x, y) vandq_u8(x, y)
-#define simd_andnot(x, y) vbicq_u8(x, y)
+
+#define mask_t uint8x16_t
+#define mask_or(x, y) vorrq_u8(x, y)
+#define mask_and(x, y) vandq_u8(x, y)
+#define mask_andnot(x, y) vbicq_u8(x, y)
+#define mask_convert(x) (x)
 
 #define SIMD_LANES 16
 #define SIMD_MASK 0xF
9 changes: 6 additions & 3 deletions pyrodigal/impl/sse.c
@@ -12,10 +12,13 @@
 #define simd_load(m) _mm_load_si128((__m128i*) (m))
 #define simd_store(x, m) _mm_store_si128((__m128i*) (m), x)
 #define simd_set1(x) _mm_set1_epi8(x)
-#define simd_or(x, y) _mm_or_si128(x, y)
 #define simd_eq(x, y) _mm_cmpeq_epi8(x, y)
-#define simd_and(x, y) _mm_and_si128(x, y)
-#define simd_andnot(x, y) _mm_andnot_si128(y, x)
+
+#define mask_t __m128i
+#define mask_or(x, y) _mm_or_si128(x, y)
+#define mask_and(x, y) _mm_and_si128(x, y)
+#define mask_andnot(x, y) _mm_andnot_si128(y, x)
+#define mask_convert(x) (x)
 
 #define SIMD_LANES 16
 #define SIMD_MASK 0xF
50 changes: 25 additions & 25 deletions pyrodigal/impl/template.h
@@ -4,8 +4,8 @@
 const simd_t ALL_BWD = simd_set1(-1); \
 \
 int j; \
-simd_t x0, x1, x2, x3, x4, x5; \
-simd_t s; \
+mask_t x0, x1, x2, x3, x4, x5; \
+mask_t s; \
 simd_t n1_strands; \
 simd_t n1_types; \
 simd_t n1_frames; \
@@ -21,41 +21,41 @@
 n1_frames = simd_load(&frames[j]); \
 \
 x0 = simd_eq(n1_strands, n2_strands); \
-x0 = simd_andnot(x0, simd_eq(n2_types, ALL_STOPS)); \
-x0 = simd_andnot(x0, simd_eq(n1_types, ALL_STOPS)); \
+x0 = mask_andnot(x0, simd_eq(n2_types, ALL_STOPS)); \
+x0 = mask_andnot(x0, simd_eq(n1_types, ALL_STOPS)); \
 s = x0; \
 \
 x1 = simd_eq(n2_strands, ALL_BWD); \
-x1 = simd_and( x1, simd_eq(n1_strands, ALL_FWD)); \
-x1 = simd_andnot(x1, simd_eq(n1_types, ALL_STOPS)); \
-s = simd_or( x1, s); \
+x1 = mask_and( x1, simd_eq(n1_strands, ALL_FWD)); \
+x1 = mask_andnot(x1, simd_eq(n1_types, ALL_STOPS)); \
+s = mask_or( x1, s); \
 \
 x2 = simd_eq(n1_types, ALL_STOPS); \
-x2 = simd_and(x2, simd_eq(n1_strands, ALL_BWD)); \
-x2 = simd_and(x2, simd_eq(n2_strands, ALL_FWD)); \
-s = simd_or( x2, s); \
+x2 = mask_and(x2, simd_eq(n1_strands, ALL_BWD)); \
+x2 = mask_and(x2, simd_eq(n2_strands, ALL_FWD)); \
+s = mask_or( x2, s); \
 \
 x3 = simd_eq(n2_types, ALL_STOPS); \
-x3 = simd_and( x3, simd_eq(n1_strands, ALL_BWD)); \
-x3 = simd_and( x3, simd_eq(n2_strands, ALL_FWD)); \
-x3 = simd_andnot(x3, simd_eq(n1_types, ALL_STOPS)); \
-s = simd_or( x3, s); \
+x3 = mask_and( x3, simd_eq(n1_strands, ALL_BWD)); \
+x3 = mask_and( x3, simd_eq(n2_strands, ALL_FWD)); \
+x3 = mask_andnot(x3, simd_eq(n1_types, ALL_STOPS)); \
+s = mask_or( x3, s); \
 \
 x4 = simd_eq(n1_strands, n2_strands); \
-x4 = simd_and( x4, simd_eq(n1_strands, ALL_FWD)); \
-x4 = simd_andnot(x4, simd_eq(n1_types, ALL_STOPS)); \
-x4 = simd_and( x4, simd_eq(n2_types, ALL_STOPS)); \
-x4 = simd_andnot(x4, simd_eq(n1_frames, n2_frames)); \
-s = simd_or( x4, s); \
+x4 = mask_and( x4, simd_eq(n1_strands, ALL_FWD)); \
+x4 = mask_andnot(x4, simd_eq(n1_types, ALL_STOPS)); \
+x4 = mask_and( x4, simd_eq(n2_types, ALL_STOPS)); \
+x4 = mask_andnot(x4, simd_eq(n1_frames, n2_frames)); \
+s = mask_or( x4, s); \
 \
 x5 = simd_eq(n1_strands, n2_strands); \
-x5 = simd_and( x5, simd_eq(n1_strands, ALL_BWD)); \
-x5 = simd_and( x5, simd_eq(n1_types, ALL_STOPS)); \
-x5 = simd_andnot(x5, simd_eq(n2_types, ALL_STOPS)); \
-x5 = simd_andnot(x5, simd_eq(n1_frames, n2_frames)); \
-s = simd_or( x5, s); \
+x5 = mask_and( x5, simd_eq(n1_strands, ALL_BWD)); \
+x5 = mask_and( x5, simd_eq(n1_types, ALL_STOPS)); \
+x5 = mask_andnot(x5, simd_eq(n2_types, ALL_STOPS)); \
+x5 = mask_andnot(x5, simd_eq(n1_frames, n2_frames)); \
+s = mask_or( x5, s); \
 \
-simd_store(s, &skip[j]); \
+simd_store(mask_convert(s), &skip[j]); \
 } \
 for (; j < i; j++) \
 skippable_generic_single(strands, types, frames, j, i, skip); \
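Lane for lane, the vectorized body above computes the same skip predicate as the scalar fallback invoked in the tail loop. Transcribed into plain C it reads roughly as below (a hedged transcription of the macro, not code from the repository; the +1/-1 strand encoding is inferred from ALL_FWD/ALL_BWD, and the STOP value is illustrative):

/* Scalar equivalent of one SIMD lane of the pre-filter above.
 * Assumes strands are encoded +1 (forward) / -1 (backward), per
 * ALL_FWD/ALL_BWD = simd_set1(+/-1); STOP's value is illustrative. */
#define STOP 3

static inline int skippable_lane(int s1, int t1, int f1,
                                 int s2, int t2, int f2) {
    int skip = 0;
    /* x0: same strand, neither node a stop */
    skip |= (s1 == s2) && (t2 != STOP) && (t1 != STOP);
    /* x1: n1 forward, n2 backward, n1 not a stop */
    skip |= (s2 == -1) && (s1 == 1) && (t1 != STOP);
    /* x2: n1 a stop on the backward strand, n2 forward */
    skip |= (t1 == STOP) && (s1 == -1) && (s2 == 1);
    /* x3: n2 a stop, n1 backward, n2 forward, n1 not a stop */
    skip |= (t2 == STOP) && (s1 == -1) && (s2 == 1) && (t1 != STOP);
    /* x4: both forward, n1 not a stop, n2 a stop, frames differ */
    skip |= (s1 == s2) && (s1 == 1) && (t1 != STOP) && (t2 == STOP)
         && (f1 != f2);
    /* x5: both backward, n1 a stop, n2 not a stop, frames differ */
    skip |= (s1 == s2) && (s1 == -1) && (t1 == STOP) && (t2 != STOP)
         && (f1 != f2);
    return skip;
}

The SIMD version evaluates this predicate for SIMD_LANES node pairs at once, writing 0xFF or 0x00 per lane into skip[j] through mask_convert.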
