Skip to content

Commit f1c6343

Browse files
committed
various testing changes to confirm bottlenecks
1 parent 7cb8cfb commit f1c6343

File tree

4 files changed

+129
-20
lines changed

4 files changed

+129
-20
lines changed

Makefile

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
paren: main.c
2-
gcc -O3 -march=native \
3-
main.c -c -o paren.o -Wall -Wextra
4-
ld -I /lib64/ld-linux-x86-64.so.2 paren.o -o paren -lc
2+
gcc -O3 -march=native -mtune=native \
3+
main.c -c -o paren.o -Wall -Wextra -ffat-lto-objects -flto
4+
ld -flto -I /lib64/ld-linux-x86-64.so.2 paren.o -o paren -lc
55

66
validate: validate.c
77
gcc -O3 -march=native validate.c -o validate -Wall -Wextra
@@ -15,10 +15,11 @@ tspeed: paren
1515
timeout 15 taskset 1 ./paren | taskset 2 pv -ra > /dev/null
1616

1717
tperfstat: paren
18-
perf stat -e branches -e branch-misses -e cache-misses -e cache-references \
18+
perf stat -e branches -e branch-misses -e cache-misses \
19+
-e cache-references \
1920
-e cycles -e alignment-faults -e major-faults -e minor-faults \
20-
-e dTLB-loads -e dTLB-load-misses -e dTLB-stores -e dTLB-store-misses\
21-
timeout 15 taskset 1 ./paren | taskset 2 pv -q > /dev/null
21+
-e dTLB-loads -e dTLB-load-misses -e dTLB-stores -e dTLB-store-misses \
22+
timeout 150 sh run.sh
2223

2324
tperf: paren
2425
perf record -a sh run.sh
@@ -28,6 +29,7 @@ tvalid: validate paren
2829

2930
clean:
3031
rm -f ./paren
32+
rm -f ./paren.o
3133
rm -f ./validate
3234

3335

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ further will likely receive corrupted output (For example, `./paren | pv |
1818
Alternatively, run using `make`:
1919
- `make tspeed` does a 15 second speed test.
2020
- `make tvalid` does validation testing.
21-
- `make tperf` does performance profiling.
21+
- `make tperf` does a 5-minute performance profile.
2222

2323
### Method
2424
I calculate the next permutation as a 64-bit unsigned integer (least-significant

main.c

+119-12
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,13 @@
5151
#define BUFSIZE ((1 << 10) << 12)
5252
#define DUAL_BUFSIZE (2 * BUFSIZE)
5353
#define BUFALIGN (1 << 22)
54+
#define BUFSPLIT (BUFALIGN >> 1)
5455

5556
// I am consistently surprised at how big it's practical for the cache size to
5657
// be. Doing tperf tests as of v6.0 with 1<<19 (512kiB) gives cache miss rate of
5758
// 0.009% but 1<<20 (1MiB) has only 0.011% cache misses. It's not enough to make
5859
// the smaller cache faster (actually it's slower by ~100MiB/s)
59-
#define CACHESIZE (1l << 20)
60+
#define CACHESIZE (1l << 19)
6061

6162
// how many iterations needed to print output in _N bit batches
6263
#define BATCH_32 (1 + (LSIZE / 4))
@@ -189,21 +190,15 @@ flush_buf(size_t bcnt)
189190

190191
// swap out other buffer
191192
// we do this to be sure the previous pipe is drained
192-
cursor = buf + ((currbuf - buf) ^ BUFSIZE);
193+
cursor = buf + ((currbuf - buf) ^ BUFSPLIT);
193194
currbuf = cursor;
194195
}
195196

196-
/*
197-
* properties:
198-
*
199-
* swapping adjacent (clp, opp) -> (opp, clp) is always valid
200-
*/
201-
202197

203198
/*
204199
* general strategy:
205200
*
206-
* curr is a bitmask of all the close parentheses (clp) with LSb the start of
201+
* curr is a bitmask of all the close parentheses with LSb the start of
207202
* the output string.
208203
*
209204
* We find the rightmost contiguous bit, and reset all but the MSb of the
@@ -212,8 +207,30 @@ flush_buf(size_t bcnt)
212207
*
213208
* This is optimized by adding a 1 that's shifted to start of group, which is
214209
* effectively a swap and clear simultaneously
210+
*
211+
* Here's a simple example of the function in action, remember the least
212+
* significant bit is the rightmost in the output:
213+
*
214+
* curr = 1010111000 = ((()))()()
215+
*
216+
* 1010111000
217+
* first = 3 ^---
218+
*
219+
* 1010111000
220+
* contig = 3 ^^^
221+
*
222+
* 1010111000
223+
* add: + 1000
224+
* = 1011000000 = (((((())()
225+
*
226+
* move contig - 1 = 2 bits back to their original positions
227+
*
228+
* rst: = 0000001010 = ()()((((((
229+
*
230+
* return add + rst
231+
* = 1011001010 = ()()(())()
232+
*
215233
*/
216-
217234
inline static uint64_t
218235
next_paren_bitmask(uint64_t curr)
219236
{
@@ -273,7 +290,7 @@ do_batch(uint64_t paren)
273290
voff = PSIZE;
274291
i = 0;
275292
bcidx = 0;
276-
while(paren != FIN) {
293+
do {
277294
curr = paren >> poff;
278295

279296
if (voff < 32) {
@@ -338,6 +355,93 @@ do_batch(uint64_t paren)
338355
// flush_buf((i) * BATCH_BYTES);
339356
}
340357

358+
static void
359+
flat_store_bytecode(uint64_t paren)
360+
{
361+
/*
362+
* this is a super minimal experiment of how fast I could expect to get
363+
* in a single-threaded context. I run the "same" loop as the full
364+
* output.
365+
*
366+
* This function gets 16GiB/s
367+
*/
368+
369+
int i;
370+
uint64_t bcidx;
371+
__m256i resv, bcv;
372+
373+
const __m256i batch = _mm256_set1_epi64x(0xFF00FF00FF00FF00);
374+
375+
i = 0;
376+
bcidx = 0;
377+
while(true) {
378+
bcv = _mm256_load_si256((__m256i *)&bytecode[bcidx]);
379+
bcidx += BATCH_SIZE;
380+
if (bcidx == BATCH_STORE) {
381+
bcidx = 0;
382+
}
383+
384+
// combine with bytecode
385+
resv = _mm256_sub_epi8(bcv, batch);
386+
387+
_mm256_store_si256((__m256i *) cursor, resv);
388+
cursor += 32;
389+
390+
if (i >= PIPECNT || paren == FIN) {
391+
// flush_buf((i) * BATCH_SIZE);
392+
flush_buf(cursor - currbuf);
393+
i = 0;
394+
}
395+
i += 1;
396+
};
397+
}
398+
399+
static void
400+
flat_store(uint64_t paren)
401+
{
402+
/*
403+
* this is a super minimal experiment of how fast I could expect to get
404+
* in a single-threaded context. This function is just stores
405+
*
406+
* This function gets 26.7 GiB/s
407+
*/
408+
409+
int i;
410+
char *lc;
411+
const __m256i batch = _mm256_set1_epi64x(0xFF00FF00FF00FF00);
412+
413+
i = 0;
414+
lc = currbuf;
415+
while(true) {
416+
_mm256_store_si256((__m256i *) lc, batch);
417+
lc += 32;
418+
419+
if (i >= PIPECNT) {
420+
// flush_buf((i) * BATCH_SIZE);
421+
flush_buf(lc - currbuf);
422+
lc = currbuf;
423+
i = 0;
424+
}
425+
i += 1;
426+
};
427+
}
428+
429+
static void
430+
flat_flush_buf(uint64_t paren)
431+
{
432+
/*
433+
* this is a super minimal experiment of how fast I could expect to get
434+
* in a single-threaded context. This function is just the speed of
435+
* flush_buf
436+
*
437+
* This function gets 72.8 GiB/s
438+
*/
439+
440+
memset(buf, 0xFF, DUAL_BUFSIZE);
441+
while(true) {
442+
flush_buf(CACHESIZE);
443+
};
444+
}
341445

342446

343447
void
@@ -364,7 +468,10 @@ _start(void)
364468
// (removed for batching)
365469
paren = PMASK & 0xAAAAAAAAAAAAAAAA;
366470
gen_bytecode(paren);
367-
do_batch(paren);
471+
// do_batch(paren);
472+
// flat_store_bytecode(paren);
473+
flat_store(paren);
474+
// flat_flush_buf(paren);
368475
close(STDOUT_FILENO);
369476
exit(0);
370477
}

run.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
./paren | pv -q > /dev/null
1+
taskset 1 ./paren | taskset 2 pv -q > /dev/null

0 commit comments

Comments
 (0)