Skip to content

Commit 797a894

Browse files
committed
Add initial idea for benchmarks
0 parents  commit 797a894

File tree

4 files changed

+372
-0
lines changed

4 files changed

+372
-0
lines changed

.clang-format

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
---
2+
Language: Cpp
3+
# BasedOnStyle: Google
4+
AccessModifierOffset: -1
5+
AlignAfterOpenBracket: Align
6+
AlignConsecutiveMacros: None
7+
AlignConsecutiveAssignments: None
8+
AlignConsecutiveDeclarations: None
9+
AlignEscapedNewlines: Left
10+
AlignOperands: true
11+
AlignTrailingComments: true
12+
AllowAllArgumentsOnNextLine: true
13+
AllowAllConstructorInitializersOnNextLine: true
14+
AllowAllParametersOfDeclarationOnNextLine: true
15+
AllowShortBlocksOnASingleLine: Never
16+
AllowShortCaseLabelsOnASingleLine: false
17+
AllowShortFunctionsOnASingleLine: All
18+
AllowShortLambdasOnASingleLine: All
19+
AllowShortIfStatementsOnASingleLine: WithoutElse
20+
AllowShortLoopsOnASingleLine: true
21+
AlwaysBreakAfterDefinitionReturnType: None
22+
AlwaysBreakAfterReturnType: None
23+
AlwaysBreakBeforeMultilineStrings: true
24+
AlwaysBreakTemplateDeclarations: Yes
25+
BinPackArguments: true
26+
BinPackParameters: true
27+
BraceWrapping:
28+
AfterCaseLabel: false
29+
AfterClass: false
30+
AfterControlStatement: Never
31+
AfterEnum: false
32+
AfterFunction: false
33+
AfterNamespace: false
34+
AfterObjCDeclaration: false
35+
AfterStruct: false
36+
AfterUnion: false
37+
AfterExternBlock: false
38+
BeforeCatch: false
39+
BeforeElse: false
40+
IndentBraces: false
41+
SplitEmptyFunction: false
42+
SplitEmptyRecord: false
43+
SplitEmptyNamespace: false
44+
BreakBeforeBinaryOperators: None
45+
BreakBeforeBraces: Custom
46+
BreakBeforeInheritanceComma: false
47+
BreakInheritanceList: BeforeColon
48+
BreakBeforeTernaryOperators: true
49+
BreakConstructorInitializersBeforeComma: false
50+
BreakConstructorInitializers: BeforeColon
51+
BreakAfterJavaFieldAnnotations: false
52+
BreakStringLiterals: true
53+
ColumnLimit: 120
54+
CommentPragmas: '^ IWYU pragma:'
55+
CompactNamespaces: false
56+
ConstructorInitializerAllOnOneLineOrOnePerLine: true
57+
ConstructorInitializerIndentWidth: 4
58+
ContinuationIndentWidth: 4
59+
Cpp11BracedListStyle: true
60+
DeriveLineEnding: false
61+
UseCRLF: false
62+
DerivePointerAlignment: false
63+
DisableFormat: false
64+
ExperimentalAutoDetectBinPacking: false
65+
FixNamespaceComments: true
66+
ForEachMacros:
67+
- foreach
68+
- Q_FOREACH
69+
- BOOST_FOREACH
70+
IncludeBlocks: Regroup
71+
IncludeCategories:
72+
- Regex: '^<ext/.*\.h>'
73+
Priority: 2
74+
SortPriority: 0
75+
- Regex: '^<.*\.h>'
76+
Priority: 1
77+
SortPriority: 0
78+
- Regex: '^<.*'
79+
Priority: 2
80+
SortPriority: 0
81+
- Regex: '.*'
82+
Priority: 3
83+
SortPriority: 0
84+
IncludeIsMainRegex: '([-_](test|unittest))?$'
85+
IncludeIsMainSourceRegex: ''
86+
IndentCaseLabels: true
87+
IndentGotoLabels: true
88+
IndentPPDirectives: None
89+
IndentWidth: 2
90+
IndentWrappedFunctionNames: false
91+
JavaScriptQuotes: Leave
92+
JavaScriptWrapImports: true
93+
KeepEmptyLinesAtTheStartOfBlocks: false
94+
MacroBlockBegin: ''
95+
MacroBlockEnd: ''
96+
MaxEmptyLinesToKeep: 1
97+
NamespaceIndentation: None
98+
ObjCBinPackProtocolList: Never
99+
ObjCBlockIndentWidth: 2
100+
ObjCSpaceAfterProperty: false
101+
ObjCSpaceBeforeProtocolList: true
102+
PenaltyBreakAssignment: 2
103+
PenaltyBreakBeforeFirstCallParameter: 1
104+
PenaltyBreakComment: 300
105+
PenaltyBreakFirstLessLess: 120
106+
PenaltyBreakString: 1000
107+
PenaltyBreakTemplateDeclaration: 10
108+
PenaltyExcessCharacter: 1000000
109+
PenaltyReturnTypeOnItsOwnLine: 200
110+
PointerAlignment: Left
111+
RawStringFormats:
112+
- Language: Cpp
113+
Delimiters:
114+
- cc
115+
- CC
116+
- cpp
117+
- Cpp
118+
- CPP
119+
- 'c++'
120+
- 'C++'
121+
CanonicalDelimiter: ''
122+
BasedOnStyle: google
123+
- Language: TextProto
124+
Delimiters:
125+
- pb
126+
- PB
127+
- proto
128+
- PROTO
129+
EnclosingFunctions:
130+
- EqualsProto
131+
- EquivToProto
132+
- PARSE_PARTIAL_TEXT_PROTO
133+
- PARSE_TEST_PROTO
134+
- PARSE_TEXT_PROTO
135+
- ParseTextOrDie
136+
- ParseTextProtoOrDie
137+
CanonicalDelimiter: ''
138+
BasedOnStyle: google
139+
ReflowComments: true
140+
SortIncludes: CaseInsensitive
141+
SortUsingDeclarations: true
142+
SpaceAfterCStyleCast: false
143+
SpaceAfterLogicalNot: false
144+
SpaceAfterTemplateKeyword: true
145+
SpaceBeforeAssignmentOperators: true
146+
SpaceBeforeCpp11BracedList: false
147+
SpaceBeforeCtorInitializerColon: true
148+
SpaceBeforeInheritanceColon: true
149+
SpaceBeforeParens: ControlStatements
150+
SpaceBeforeRangeBasedForLoopColon: true
151+
SpaceInEmptyBlock: false
152+
SpaceInEmptyParentheses: false
153+
SpacesBeforeTrailingComments: 2
154+
SpacesInAngles: false
155+
SpacesInConditionalStatement: false
156+
SpacesInContainerLiterals: true
157+
SpacesInCStyleCastParentheses: false
158+
SpacesInParentheses: false
159+
SpacesInSquareBrackets: false
160+
SpaceBeforeSquareBrackets: false
161+
Standard: Auto
162+
StatementMacros:
163+
- Q_UNUSED
164+
- QT_REQUIRE_VERSION
165+
TabWidth: 8
166+
UseTab: Never
167+
...

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
cmake-build-*/
2+
build*/

CMakeLists.txt

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
cmake_minimum_required(VERSION 3.18)
project(autovec_db)

# hash_bucket.cpp uses C++20 features (e.g. the [[likely]] attribute). Require the standard so that configuration
# fails loudly instead of CMake silently falling back to an older standard on compilers without C++20 support.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Interface target carrying the include path and compile options shared by all benchmarks.
add_library(autovec INTERFACE)
target_include_directories(autovec INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})

include(FetchContent)

##################### Google Benchmark ####################
FetchContent_Declare(
        google_benchmark
        GIT_REPOSITORY https://github.com/google/benchmark.git
        GIT_TAG v1.6.1
)
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Suppressing google benchmark's tests" FORCE)
FetchContent_MakeAvailable(google_benchmark)

##################### Compiler Options ####################
# Only Release builds get -O3; any other (or an unset) build type triggers a warning because the measured numbers
# would not be representative.
if (CMAKE_BUILD_TYPE MATCHES Release)
    message(STATUS "BUILDING BENCHMARKS IN RELEASE")
    target_compile_options(autovec INTERFACE -O3)
else()
    message(WARNING "Building benchmarks in ${CMAKE_BUILD_TYPE}. This may impact performance!")
endif()
27+
28+
##################### Our Benchmarks ####################
29+
# Defines an executable target named `bm_name`, built from `<bm_name>.cpp`, and links it against the `autovec`
# interface target and Google Benchmark.
#
# The namespaced `benchmark::benchmark` target is used on purpose: if the target does not exist, CMake reports an
# error at configure time, whereas a bare `benchmark` would be passed to the linker as a plain library name.
function(add_benchmark bm_name)
    add_executable(${bm_name} ${bm_name}.cpp)
    target_link_libraries(${bm_name} PRIVATE autovec benchmark::benchmark)
endfunction()
33+
34+
35+
add_benchmark(hash_bucket)

hash_bucket.cpp

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
#include <array>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>

#include "benchmark/benchmark.h"
6+
7+
// One key/value slot of a hash bucket. Both members are 64-bit unsigned integers, so an Entry occupies 16 bytes.
struct Entry {
  std::uint64_t key;    // Full key; compared after a fingerprint match to confirm the hit.
  std::uint64_t value;  // Payload returned by a successful lookup.
};
11+
12+
struct alignas(256) HashBucket {
13+
alignas(256) std::array<uint8_t, 15> fingerprints;
14+
alignas(16) std::array<Entry, 15> entries;
15+
};
16+
17+
static constexpr uint64_t NO_MATCH = std::numeric_limits<uint64_t>::max();
18+
19+
static_assert(sizeof(HashBucket) == 256, "Hash Bucket should be 256 Byte for this benchmark");
20+
21+
#define BM_ARGS UseRealTime()->Repetitions(10);
22+
23+
template <typename FindFn>
24+
void BM_hash_bucket_get(benchmark::State& state) {
25+
FindFn find_fn{};
26+
HashBucket bucket{};
27+
bucket.fingerprints = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
28+
bucket.entries = {Entry{11, 11}, Entry{12, 12}, Entry{33, 33}, Entry{44, 44}, Entry{55, 55},
29+
Entry{66, 66}, Entry{77, 77}, Entry{88, 88}, Entry{99, 99}, Entry{100, 100},
30+
Entry{110, 110}, Entry{120, 120}, Entry{130, 130}, Entry{140, 140}, Entry{150, 150}};
31+
uint64_t key = 66;
32+
uint8_t fingerprint = 6;
33+
34+
for (auto _ : state) {
35+
uint64_t value = find_fn(bucket, key, fingerprint);
36+
benchmark::DoNotOptimize(value);
37+
}
38+
}
39+
40+
#if defined(__aarch64__)
41+
#include <arm_neon.h>
42+
43+
struct neon_find {
44+
uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
45+
uint8_t* fingerprints = bucket.fingerprints.data();
46+
47+
// Load the fingerprints into a SIMD register.
48+
uint8x16_t fp_vector = vld1q_u8(fingerprints);
49+
50+
// Broadcast the fingerprint to compare against into a SIMD register.
51+
uint8x16_t lookup_fp = vmovq_n_u8(fingerprint);
52+
53+
// Compare fingerprints.
54+
auto matching_fingerprints = reinterpret_cast<__uint128_t>(vceqq_u8(fp_vector, lookup_fp));
55+
56+
// We could do this with a single movemask on x86, but ARM NEON does not support this. So we split our range into
57+
// two values that we check after each other. The extraction here is a no-op, as the __uint128_t result is stored in
58+
// two 64 bit registers anyway. This is only a logical conversion.
59+
uint64_t low_matches = *reinterpret_cast<uint64_t*>(&matching_fingerprints);
60+
uint64_t high_matches = *(reinterpret_cast<uint64_t*>(&matching_fingerprints) + 1);
61+
62+
// We need to pass an offset here, as the higher matches need to check the second half of the entries array.
63+
auto find_key_match = [&](uint64_t matches, size_t entry_offset) {
64+
while (matches != 0) {
65+
// The comparison returns 00000000 for a mismatch, so we need to divide by 8 to get the actual number of 0's.
66+
uint32_t trailing_zeros = __builtin_ctzl(matches);
67+
uint16_t match_pos = entry_offset + (trailing_zeros / 8);
68+
69+
// We give this a likely hint, as we expect the number of fingerprint collisions to be low. So on average, we
70+
// want this to be the happy path and immediately return if possible.
71+
if (bucket.entries[match_pos].key == key) [[likely]] {
72+
return bucket.entries[match_pos].value;
73+
}
74+
75+
// We want to remove all 1's that we just matched. So we set all 8 bits that we just matched and invert the
76+
// number for a clean 11111111...00000000...11111111 mask.
77+
matches &= ~(255ul << trailing_zeros);
78+
}
79+
return NO_MATCH;
80+
};
81+
82+
uint64_t low_match = find_key_match(low_matches, 0);
83+
if (low_match != NO_MATCH) {
84+
return low_match;
85+
}
86+
87+
return find_key_match(high_matches, 8);
88+
}
89+
};
90+
91+
BENCHMARK(BM_hash_bucket_get<neon_find>)->BM_ARGS;
92+
93+
#elif defined(__x86_64__)
94+
struct x86_find {
95+
uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
96+
// TODO
97+
uint8_t* fingerprints = bucket.fingerprints.data();
98+
return 0;
99+
}
100+
};
101+
102+
BENCHMARK(BM_hash_bucket_get<x86Find>)
103+
->BM_ARGS;
104+
#endif
105+
106+
struct naive_scalar_find {
107+
uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) { return NO_MATCH; }
108+
};
109+
110+
struct autovec_scalar_find {
111+
uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) { return NO_MATCH; }
112+
};
113+
114+
struct vector_find {
115+
using vec8x16 = uint8_t __attribute__((vector_size(16)));
116+
uint64_t operator()(HashBucket& bucket, uint64_t key, uint8_t fingerprint) {
117+
uint8_t* fingerprints = bucket.fingerprints.data();
118+
119+
// Load the fingerprints into a SIMD register.
120+
vec8x16 fp_vector = *reinterpret_cast<vec8x16*>(fingerprints);
121+
122+
// Broadcast the fingerprint to compare against into a SIMD register. We only use 15 values, to the last one is 0.
123+
uint8_t f = fingerprint;
124+
vec8x16 lookup_fp = {f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, 0};
125+
126+
// Compare fingerprints.
127+
auto matching_fingerprints = reinterpret_cast<__uint128_t>(fp_vector == lookup_fp);
128+
129+
// We could do this with a single movemask on x86, but ARM NEON does not support this. So we split our range into
130+
// two values that we check after each other. The extraction here is a no-op, as the __uint128_t result is stored
131+
// in two 64 bit registers anyway. This is only a logical conversion.
132+
uint64_t low_matches = *reinterpret_cast<uint64_t*>(&matching_fingerprints);
133+
uint64_t high_matches = *(reinterpret_cast<uint64_t*>(&matching_fingerprints) + 1);
134+
135+
// We need to pass an offset here, as the higher matches need to check the second half of the entries array.
136+
auto find_key_match = [&](uint64_t matches, size_t entry_offset) {
137+
while (matches != 0) {
138+
// The comparison returns 00000000 for a mismatch, so we need to divide by 8 to get the actual number of 0's.
139+
uint32_t trailing_zeros = __builtin_ctzl(matches);
140+
uint16_t match_pos = entry_offset + (trailing_zeros / 8);
141+
142+
// We give this a likely hint, as we expect the number of fingerprint collisions to be low. So on average, we
143+
// want this to be the happy path and immediately return if possible.
144+
if (bucket.entries[match_pos].key == key) [[likely]] {
145+
return bucket.entries[match_pos].value;
146+
}
147+
148+
// We want to remove all 1's that we just matched. So we set all 8 bits that we just matched and invert the
149+
// number for a clean 11111111...00000000...11111111 mask.
150+
matches &= ~(255ul << trailing_zeros);
151+
}
152+
return NO_MATCH;
153+
};
154+
155+
uint64_t low_match = find_key_match(low_matches, 0);
156+
if (low_match != NO_MATCH) {
157+
return low_match;
158+
}
159+
160+
return find_key_match(high_matches, 8);
161+
}
162+
};
163+
164+
// Register the implementations that sit outside the architecture-specific #if block, so they are built on every
// target. Each registration instantiates BM_hash_bucket_get with one find functor and applies the shared BM_ARGS.
BENCHMARK(BM_hash_bucket_get<naive_scalar_find>)->BM_ARGS;
165+
BENCHMARK(BM_hash_bucket_get<autovec_scalar_find>)->BM_ARGS;
166+
BENCHMARK(BM_hash_bucket_get<vector_find>)->BM_ARGS;
167+
168+
// Google Benchmark's entry-point macro; this file defines no other main().
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)