#define BUFSIZE ((1 << 10) << 12)
#define DUAL_BUFSIZE (2 * BUFSIZE)
#define BUFALIGN (1 << 22)
+ #define BUFSPLIT (BUFALIGN >> 1)

// I am consistently surprised at how big it's practical for the cache size to
// be. In tperf tests as of v6.0, 1<<19 (512kiB) gives a cache miss rate of
// 0.009% while 1<<20 (1MiB) gives 0.011%. The difference isn't enough to make
// the smaller cache faster (it's actually slower by ~100MiB/s).
- #define CACHESIZE (1l << 20)
+ #define CACHESIZE (1l << 19)

// how many iterations needed to print output in _N bit batches
#define BATCH_32 (1 + (LSIZE / 4))
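/*
 * [editor's note, not part of the commit] Spelling out the macro
 * arithmetic: BUFSIZE and BUFALIGN both come to 4 MiB, and the new
 * BUFSPLIT names the 2 MiB midpoint. A hypothetical compile-time
 * self-check of those values (C11 _Static_assert, no header needed):
 */
_Static_assert(BUFSIZE == (1 << 22), "BUFSIZE is 4 MiB");
_Static_assert(DUAL_BUFSIZE == (1 << 23), "two buffers, 8 MiB total");
_Static_assert(BUFSPLIT == (1 << 21), "2 MiB, half of BUFALIGN");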
@@ -189,21 +190,15 @@ flush_buf(size_t bcnt)
	// swap out other buffer
	// we do this to be sure the previous pipe is drained
- 	cursor = buf + ((currbuf - buf) ^ BUFSIZE);
+ 	cursor = buf + ((currbuf - buf) ^ BUFSPLIT);
	currbuf = cursor;
}
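/*
 * [editor's sketch, not part of the commit] The XOR above is a branchless
 * ping-pong: assuming currbuf only ever points at offset 0 or BUFSPLIT
 * inside buf, XOR-ing the offset with BUFSPLIT flips it to the other
 * half. A minimal illustration with a hypothetical demo function:
 */
static void
bufswap_demo(void)
{
	long off = 0;      // writing the first half
	off ^= BUFSPLIT;   // now BUFSPLIT: the second half
	off ^= BUFSPLIT;   // back to 0: the first half again
	(void)off;
}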

- /*
-  * properties:
-  *
-  * swapping adjacent (clp, opp) -> (opp, clp) is always valid
-  */
-

/*
 * general strategy:
 *
- * curr is a bitmask of all the close parentheses (clp) with LSb the start of
+ * curr is a bitmask of all the close parentheses with LSb the start of
 * the output string.
 *
 * We find the rightmost contiguous bit, and reset all but the MSb of the
@@ -212,8 +207,30 @@ flush_buf(size_t bcnt)
 *
 * This is optimized by adding a 1 that's shifted to the start of the group,
 * which is effectively a swap and clear simultaneously.
+ *
+ * Here's a simple example of the function in action; remember the least
+ * significant bit is the rightmost in the output:
+ *
+ * curr = 1010111000 = ((()))()()
+ *
+ *             1010111000
+ * first = 3         ^---
+ *
+ *             1010111000
+ * contig = 3      ^^^
+ *
+ *             1010111000
+ * add:      +       1000
+ *           = 1011000000 = (((((())()
+ *
+ * move contig - 1 = 2 bits back to their original positions
+ *
+ * rst:      = 0000001010 = ()()((((((
+ *
+ * return add + rst
+ *           = 1011001010 = ()()(())()
+ *
 */
-
inline static uint64_t
next_paren_bitmask(uint64_t curr)
{
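/*
 * [editor's sketch, not part of the commit] The body of next_paren_bitmask
 * falls outside this hunk, but the comment above maps to code roughly like
 * the following, assuming curr != 0, GCC/Clang builtins, and that "rst"
 * places the moved bits at the low alternating close-paren positions (the
 * name and details here are hypothetical):
 */
static uint64_t
next_paren_bitmask_sketch(uint64_t curr)
{
	// lowest set bit: start of the rightmost contiguous group
	uint64_t first = curr & -curr;
	// the carry clears the whole group and sets the bit just above it:
	// the "swap and clear simultaneously" from the comment
	uint64_t add = curr + first;
	// bits that changed: the old group plus the newly set bit above it
	uint64_t contig = (uint64_t)__builtin_popcountll(curr ^ add) - 1;
	// move contig - 1 bits back to the alternating positions 1, 3, 5, ...
	uint64_t rst = 0xAAAAAAAAAAAAAAAAull & ((1ull << (2 * (contig - 1))) - 1);
	return add + rst;
}
// with curr = 0b1010111000 this yields 0b1011001010, matching the worked
// example above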
@@ -273,7 +290,7 @@ do_batch(uint64_t paren)
	voff = PSIZE;
	i = 0;
	bcidx = 0;
- 	while (paren != FIN) {
+ 	do {
		curr = paren >> poff;

		if (voff < 32) {
@@ -338,6 +355,93 @@ do_batch(uint64_t paren)
	// flush_buf((i) * BATCH_BYTES);
}

+ static void
+ flat_store_bytecode(uint64_t paren)
+ {
+ 	/*
+ 	 * this is a super minimal experiment of how fast I could expect to
+ 	 * get in a single-threaded context. It runs the "same" loop as the
+ 	 * full output path.
+ 	 *
+ 	 * This function gets 16 GiB/s
+ 	 */
+
+ 	int i;
+ 	uint64_t bcidx;
+ 	__m256i resv, bcv;
+
+ 	const __m256i batch = _mm256_set1_epi64x(0xFF00FF00FF00FF00);
+
+ 	i = 0;
+ 	bcidx = 0;
+ 	while (true) {
+ 		bcv = _mm256_load_si256((__m256i *)&bytecode[bcidx]);
+ 		bcidx += BATCH_SIZE;
+ 		if (bcidx == BATCH_STORE) {
+ 			bcidx = 0;
+ 		}
+
+ 		// combine with bytecode
+ 		resv = _mm256_sub_epi8(bcv, batch);
+
+ 		_mm256_store_si256((__m256i *)cursor, resv);
+ 		cursor += 32;
+
+ 		if (i >= PIPECNT || paren == FIN) {
+ 			// flush_buf((i) * BATCH_SIZE);
+ 			flush_buf(cursor - currbuf);
+ 			i = 0;
+ 		}
+ 		i += 1;
+ 	}
+ }
+
+ static void
+ flat_store(uint64_t paren)
+ {
+ 	/*
+ 	 * this is a super minimal experiment of how fast I could expect to
+ 	 * get in a single-threaded context. This function does just the
+ 	 * stores.
+ 	 *
+ 	 * This function gets 26.7 GiB/s
+ 	 */
+
+ 	int i;
+ 	char *lc;
+ 	const __m256i batch = _mm256_set1_epi64x(0xFF00FF00FF00FF00);
+
+ 	i = 0;
+ 	lc = currbuf;
+ 	while (true) {
+ 		_mm256_store_si256((__m256i *)lc, batch);
+ 		lc += 32;
+
+ 		if (i >= PIPECNT) {
+ 			// flush_buf((i) * BATCH_SIZE);
+ 			flush_buf(lc - currbuf);
+ 			// flush_buf swapped currbuf, so restart in the other half
+ 			lc = currbuf;
+ 			i = 0;
+ 		}
+ 		i += 1;
+ 	}
+ }
+
+ static void
+ flat_flush_buf(uint64_t paren)
+ {
+ 	/*
+ 	 * this is a super minimal experiment of how fast I could expect to
+ 	 * get in a single-threaded context. This function is just the speed
+ 	 * of flush_buf.
+ 	 *
+ 	 * This function gets 72.8 GiB/s
+ 	 */
+
+ 	memset(buf, 0xFF, DUAL_BUFSIZE);
+ 	while (true) {
+ 		flush_buf(CACHESIZE);
+ 	}
+ }
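/*
 * [editor's note, not part of the commit] Read together, the three
 * experiments bracket the single-thread budget: flushing alone sustains
 * 72.8 GiB/s and raw stores 26.7 GiB/s, so the full loop's ~16 GiB/s is
 * bounded by the bytecode load + subtract + store work, not by draining
 * the pipe.
 */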


void
@@ -364,7 +468,10 @@ _start(void)
	// (removed for batching)
	paren = PMASK & 0xAAAAAAAAAAAAAAAA;
	gen_bytecode(paren);
- 	do_batch(paren);
+ 	// do_batch(paren);
+ 	// flat_store_bytecode(paren);
+ 	flat_store(paren);
+ 	// flat_flush_buf(paren);
	close(STDOUT_FILENO);
	exit(0);
}