#include <array>
#include <cstdint>
#include <limits>

#include "benchmark/benchmark.h"

struct Entry {
  uint64_t key;
  uint64_t value;
};

struct alignas(256) HashBucket {
  alignas(256) std::array<uint8_t, 15> fingerprints;
  alignas(16) std::array<Entry, 15> entries;
};

static constexpr uint64_t NO_MATCH = std::numeric_limits<uint64_t>::max();

static_assert(sizeof(HashBucket) == 256, "HashBucket should be 256 bytes for this benchmark");

#define BM_ARGS UseRealTime()->Repetitions(10);

template <typename FindFn>
void BM_hash_bucket_get(benchmark::State& state) {
  FindFn find_fn{};
  HashBucket bucket{};
  bucket.fingerprints = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  bucket.entries = {Entry{11, 11}, Entry{12, 12}, Entry{33, 33}, Entry{44, 44}, Entry{55, 55},
                    Entry{66, 66}, Entry{77, 77}, Entry{88, 88}, Entry{99, 99}, Entry{100, 100},
                    Entry{110, 110}, Entry{120, 120}, Entry{130, 130}, Entry{140, 140}, Entry{150, 150}};
  uint64_t key = 66;
  uint8_t fingerprint = 6;

  for (auto _ : state) {
    uint64_t value = find_fn(bucket, key, fingerprint);
    benchmark::DoNotOptimize(value);
  }
}

#if defined(__aarch64__)
#include <arm_neon.h>

struct neon_find {
  uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
    uint8_t* fingerprints = bucket.fingerprints.data();

    // Load the fingerprints into a SIMD register. This loads 16 bytes, so the last byte is struct padding behind the
    // 15 fingerprints; it can only produce a spurious match if the lookup fingerprint equals that padding byte, which
    // does not happen with the non-zero fingerprints used in this benchmark.
    uint8x16_t fp_vector = vld1q_u8(fingerprints);

    // Broadcast the fingerprint to compare against into a SIMD register.
    uint8x16_t lookup_fp = vmovq_n_u8(fingerprint);

    // Compare fingerprints.
    auto matching_fingerprints = reinterpret_cast<__uint128_t>(vceqq_u8(fp_vector, lookup_fp));

    // We could do this with a single movemask on x86, but ARM NEON does not support this. So we split our range into
    // two values that we check one after the other. The extraction here is a no-op, as the __uint128_t result is
    // stored in two 64-bit registers anyway. This is only a logical conversion.
    uint64_t low_matches = *reinterpret_cast<uint64_t*>(&matching_fingerprints);
    uint64_t high_matches = *(reinterpret_cast<uint64_t*>(&matching_fingerprints) + 1);

    // We need to pass an offset here, as the higher matches need to check the second half of the entries array.
    auto find_key_match = [&](uint64_t matches, size_t entry_offset) {
      while (matches != 0) {
        // Each comparison result is a full byte (0xFF on match, 0x00 on mismatch), so dividing the number of
        // trailing zero bits by 8 gives the position of the matching byte.
        uint32_t trailing_zeros = __builtin_ctzl(matches);
        uint16_t match_pos = entry_offset + (trailing_zeros / 8);

        // We give this a likely hint, as we expect the number of fingerprint collisions to be low. So on average, we
        // want this to be the happy path and immediately return if possible.
        if (bucket.entries[match_pos].key == key) [[likely]] {
          return bucket.entries[match_pos].value;
        }

        // Clear the 8 bits of the match we just checked: set those 8 bits, invert them for a clean
        // 11111111...00000000...11111111 mask, and AND it onto the remaining matches.
        matches &= ~(255ul << trailing_zeros);
      }
      return NO_MATCH;
    };

    uint64_t low_match = find_key_match(low_matches, 0);
    if (low_match != NO_MATCH) {
      return low_match;
    }

    return find_key_match(high_matches, 8);
  }
};

BENCHMARK(BM_hash_bucket_get<neon_find>)->BM_ARGS;

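// The comments above note that NEON has no direct movemask. A commonly used workaround is the "shrn trick":
// narrowing each 16-bit pair of comparison bytes by 4 bits yields one nibble per fingerprint in a single 64-bit
// mask, avoiding the low/high split. This variant is not part of the commit; it is only an illustrative sketch,
// and the name example_neon_shrn_find is made up here.
struct example_neon_shrn_find {
  uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
    uint8x16_t fp_vector = vld1q_u8(bucket.fingerprints.data());
    uint8x16_t lookup_fp = vmovq_n_u8(fingerprint);
    uint8x16_t byte_matches = vceqq_u8(fp_vector, lookup_fp);

    // Shift each 16-bit lane right by 4 and narrow to 8 bits: every comparison byte (0x00 or 0xFF) becomes one
    // nibble (0x0 or 0xF), so the whole 16-byte result fits into one 64-bit integer.
    uint8x8_t nibble_matches = vshrn_n_u16(vreinterpretq_u16_u8(byte_matches), 4);
    uint64_t matches = vget_lane_u64(vreinterpret_u64_u8(nibble_matches), 0);

    while (matches != 0) {
      uint32_t trailing_zeros = __builtin_ctzl(matches);
      uint16_t match_pos = trailing_zeros / 4;  // One nibble per fingerprint.
      if (bucket.entries[match_pos].key == key) [[likely]] {
        return bucket.entries[match_pos].value;
      }
      // Clear the nibble we just checked.
      matches &= ~(uint64_t{0xF} << (trailing_zeros & ~3u));
    }
    return NO_MATCH;
  }
};

// Could be registered like the other variants:
// BENCHMARK(BM_hash_bucket_get<example_neon_shrn_find>)->BM_ARGS;
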
#elif defined(__x86_64__)
struct x86_find {
  uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
    // TODO
    uint8_t* fingerprints = bucket.fingerprints.data();
    return NO_MATCH;
  }
};

BENCHMARK(BM_hash_bucket_get<x86_find>)->BM_ARGS;
#endif

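#if defined(__x86_64__)
#include <emmintrin.h>

// The x86 path above is still a TODO in this commit. As a sketch only (an assumption, not the author's
// implementation, and the name example_x86_sse2_find is made up), the single-movemask approach mentioned in the
// NEON comments could look like this with SSE2 intrinsics:
struct example_x86_sse2_find {
  uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
    // Load the 16 bytes starting at the fingerprints (the 16th byte is struct padding).
    __m128i fp_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(bucket.fingerprints.data()));

    // Broadcast the lookup fingerprint and compare all 16 bytes at once.
    __m128i lookup_fp = _mm_set1_epi8(static_cast<char>(fingerprint));
    __m128i matching_fingerprints = _mm_cmpeq_epi8(fp_vector, lookup_fp);

    // movemask collapses the byte-wise comparison into one bit per byte.
    uint32_t matches = static_cast<uint32_t>(_mm_movemask_epi8(matching_fingerprints));
    matches &= 0x7FFF;  // Only 15 fingerprints are stored; ignore the padding byte's bit.

    while (matches != 0) {
      uint32_t match_pos = __builtin_ctz(matches);
      if (bucket.entries[match_pos].key == key) [[likely]] {
        return bucket.entries[match_pos].value;
      }
      matches &= matches - 1;  // Clear the lowest set bit and keep scanning.
    }
    return NO_MATCH;
  }
};

// Could be registered like the other variants:
// BENCHMARK(BM_hash_bucket_get<example_x86_sse2_find>)->BM_ARGS;
#endif
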
struct naive_scalar_find {
  uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) { return NO_MATCH; }
};

struct autovec_scalar_find {
  uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) { return NO_MATCH; }
};

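// The two scalar variants above are left unimplemented in this commit (they always return NO_MATCH). Purely as an
// illustration of what a scalar baseline might look like (an assumption about their eventual shape, not the
// author's code; the name example_scalar_find is made up):
struct example_scalar_find {
  uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
    for (size_t i = 0; i < bucket.fingerprints.size(); ++i) {
      // Check the cheap 1-byte fingerprint first, then confirm with the full key.
      if (bucket.fingerprints[i] == fingerprint && bucket.entries[i].key == key) {
        return bucket.entries[i].value;
      }
    }
    return NO_MATCH;
  }
};

// Could be registered like the other variants:
// BENCHMARK(BM_hash_bucket_get<example_scalar_find>)->BM_ARGS;
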
struct vector_find {
  using vec8x16 = uint8_t __attribute__((vector_size(16)));
  uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
    uint8_t* fingerprints = bucket.fingerprints.data();

    // Load the fingerprints into a SIMD register.
    vec8x16 fp_vector = *reinterpret_cast<vec8x16*>(fingerprints);

    // Broadcast the fingerprint to compare against into a SIMD register. We only use 15 values, so the last lane is 0.
    uint8_t f = fingerprint;
    vec8x16 lookup_fp = {f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, 0};

    // Compare fingerprints.
    auto matching_fingerprints = reinterpret_cast<__uint128_t>(fp_vector == lookup_fp);

    // We could do this with a single movemask on x86, but ARM NEON does not support this. So we split our range into
    // two values that we check one after the other. The extraction here is a no-op, as the __uint128_t result is
    // stored in two 64-bit registers anyway. This is only a logical conversion.
    uint64_t low_matches = *reinterpret_cast<uint64_t*>(&matching_fingerprints);
    uint64_t high_matches = *(reinterpret_cast<uint64_t*>(&matching_fingerprints) + 1);

    // The 16th loaded byte is struct padding behind the 15 fingerprints. A zero padding byte would compare equal to
    // the zero in the last lookup lane and make us index past the entries array, so mask that byte out.
    high_matches &= 0x00FF'FFFF'FFFF'FFFFull;

    // We need to pass an offset here, as the higher matches need to check the second half of the entries array.
    auto find_key_match = [&](uint64_t matches, size_t entry_offset) {
      while (matches != 0) {
        // Each comparison result is a full byte (0xFF on match, 0x00 on mismatch), so dividing the number of
        // trailing zero bits by 8 gives the position of the matching byte.
        uint32_t trailing_zeros = __builtin_ctzl(matches);
        uint16_t match_pos = entry_offset + (trailing_zeros / 8);

        // We give this a likely hint, as we expect the number of fingerprint collisions to be low. So on average, we
        // want this to be the happy path and immediately return if possible.
        if (bucket.entries[match_pos].key == key) [[likely]] {
          return bucket.entries[match_pos].value;
        }

        // Clear the 8 bits of the match we just checked: set those 8 bits, invert them for a clean
        // 11111111...00000000...11111111 mask, and AND it onto the remaining matches.
        matches &= ~(255ul << trailing_zeros);
      }
      return NO_MATCH;
    };

    uint64_t low_match = find_key_match(low_matches, 0);
    if (low_match != NO_MATCH) {
      return low_match;
    }

    return find_key_match(high_matches, 8);
  }
};

BENCHMARK(BM_hash_bucket_get<naive_scalar_find>)->BM_ARGS;
BENCHMARK(BM_hash_bucket_get<autovec_scalar_find>)->BM_ARGS;
BENCHMARK(BM_hash_bucket_get<vector_find>)->BM_ARGS;

BENCHMARK_MAIN();