From cbf68a54f1eb11de123352871dbea525e94a9884 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Wed, 28 Sep 2022 16:26:22 -0700 Subject: [PATCH 1/3] bench --- .../BitFaster.Caching.Benchmarks.csproj | 3 +- .../Lfu/SketchReset.cs | 106 ++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj index fb11022f..905dd8e8 100644 --- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj +++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj @@ -2,7 +2,8 @@ Exe - net48;net6.0 + net6.0 + True diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs index 0c774061..15286d44 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs @@ -1,6 +1,11 @@  using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; +using BitFaster.Caching.Lfu; +using System.Collections.Generic; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + namespace BitFaster.Caching.Benchmarks.Lfu { @@ -75,5 +80,106 @@ public int Reset4() return (count0 + count1) + (count2 + count3); } + + [Benchmark()] + public int Reset4NoPopcount() + { + for (int i = 0; i < table.Length; i += 4) + { + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + } + + return 0; + } + + [Benchmark()] + public unsafe int ResetAVXNoPopcount() + { + var resetMaskVector = Vector256.Create(ResetMask); + + fixed (long* tPtr = &table[0]) + { + for (int i = 0; i < table.Length; i += 4) + { + Vector256 t = Avx2.LoadVector256(tPtr + i).AsInt64(); + t = Avx2.ShiftRightLogical(t, 1); + t = Avx2.And(t, resetMaskVector); + Avx2.Store(tPtr + i, t); + } + } + + return 0; + } + + [Benchmark()] + public unsafe int ResetAVXNoPopcountUnroll2() + { + if (table.Length < 16) + { + return ResetAVXNoPopcount(); + } + + var resetMaskVector = Vector256.Create(ResetMask); + + fixed (long* tPtr = &table[0]) + { + for (int i = 0; i < table.Length; i += 8) + { + Vector256 t1 = Avx2.LoadVector256(tPtr + i).AsInt64(); + t1 = Avx2.ShiftRightLogical(t1, 1); + t1 = Avx2.And(t1, resetMaskVector); + Avx2.Store(tPtr + i, t1); + + Vector256 t2 = Avx2.LoadVector256(tPtr + i + 4).AsInt64(); + t2 = Avx2.ShiftRightLogical(t2, 1); + t2 = Avx2.And(t2, resetMaskVector); + Avx2.Store(tPtr + i + 4, t2); + } + } + + return 0; + } + + [Benchmark()] + public unsafe int ResetAVXNoPopcountUnroll4() + { + if (table.Length < 16) + { + return ResetAVXNoPopcount(); + } + + var resetMaskVector = Vector256.Create(ResetMask); + + fixed (long* tPtr = &table[0]) + { + for (int i = 0; i < table.Length; i += 16) + { + Vector256 t1 = Avx2.LoadVector256(tPtr + i).AsInt64(); + t1 = Avx2.ShiftRightLogical(t1, 1); + t1 = Avx2.And(t1, resetMaskVector); + Avx2.Store(tPtr + i, t1); + + Vector256 t2 = Avx2.LoadVector256(tPtr + i + 4).AsInt64(); + t2 = Avx2.ShiftRightLogical(t2, 1); + t2 = Avx2.And(t2, resetMaskVector); + Avx2.Store(tPtr + i + 4, t2); + + Vector256 t3 = Avx2.LoadVector256(tPtr + i + 8).AsInt64(); + t3 = Avx2.ShiftRightLogical(t3, 1); + t3 = Avx2.And(t3, resetMaskVector); + Avx2.Store(tPtr + i + 8, t3); + + Vector256 t4 = Avx2.LoadVector256(tPtr + i + 12).AsInt64(); + t4 = Avx2.ShiftRightLogical(t4, 1); + t4 = Avx2.And(t4, resetMaskVector); + Avx2.Store(tPtr + i + 12, t4); + } + } + + return 0; + } } } From 02e6fb99b16df646cdaf54e785f6bc36536ea6fd Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sat, 8 Oct 2022 19:20:47 -0700 Subject: [PATCH 2/3] align --- .../Lfu/SketchReset.cs | 214 +++++++++++------- BitFaster.Caching.UnitTests/Alignment.cs | 95 ++++++++ .../BitFaster.Caching.UnitTests.csproj | 1 + 3 files changed, 231 insertions(+), 79 deletions(-) create mode 100644 BitFaster.Caching.UnitTests/Alignment.cs diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs index 15286d44..ad8288fe 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs @@ -2,6 +2,7 @@ using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; using BitFaster.Caching.Lfu; +using Microsoft.Diagnostics.Tracing.StackSources; using System.Collections.Generic; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -17,11 +18,11 @@ public class SketchReset long[] table; - [Params(4, 128, 8192, 1048576)] + [Params(8192, 1048576)] public int Size { get; set; } [GlobalSetup] - public void Setup() + public unsafe void Setup() { table = new long[Size]; } @@ -39,63 +40,63 @@ public int Reset1() return count; } - [Benchmark()] - public int Reset2() - { - int count0 = 0; - int count1 = 0; - - for (int i = 0; i < table.Length; i += 2) - { - count0 += BitOps.BitCount(table[i] & OneMask); - count1 += BitOps.BitCount(table[i + 1] & OneMask); - - table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - } - - return count0 + count1; - } - - [Benchmark()] - public int Reset4() - { - int count0 = 0; - int count1 = 0; - int count2 = 0; - int count3 = 0; - - for (int i = 0; i < table.Length; i += 4) - { - count0 += BitOps.BitCount(table[i] & OneMask); - count1 += BitOps.BitCount(table[i + 1] & OneMask); - count2 += BitOps.BitCount(table[i + 2] & OneMask); - count3 += BitOps.BitCount(table[i + 3] & OneMask); - - table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; - table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; - } - - return (count0 + count1) + (count2 + count3); - } - - [Benchmark()] - public int Reset4NoPopcount() - { - for (int i = 0; i < table.Length; i += 4) - { - table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; - table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; - } - - return 0; - } - - [Benchmark()] + //[Benchmark()] + //public int Reset2() + //{ + // int count0 = 0; + // int count1 = 0; + + // for (int i = 0; i < table.Length; i += 2) + // { + // count0 += BitOps.BitCount(table[i] & OneMask); + // count1 += BitOps.BitCount(table[i + 1] & OneMask); + + // table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + // table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + // } + + // return count0 + count1; + //} + + //[Benchmark()] + //public int Reset4() + //{ + // int count0 = 0; + // int count1 = 0; + // int count2 = 0; + // int count3 = 0; + + // for (int i = 0; i < table.Length; i += 4) + // { + // count0 += BitOps.BitCount(table[i] & OneMask); + // count1 += BitOps.BitCount(table[i + 1] & OneMask); + // count2 += BitOps.BitCount(table[i + 2] & OneMask); + // count3 += BitOps.BitCount(table[i + 3] & OneMask); + + // table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + // table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + // table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + // table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + // } + + // return (count0 + count1) + (count2 + count3); + //} + + //[Benchmark()] + //public int Reset4NoPopcount() + //{ + // for (int i = 0; i < table.Length; i += 4) + // { + // table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + // table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + // table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + // table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + // } + + // return 0; + //} + + //[Benchmark()] public unsafe int ResetAVXNoPopcount() { var resetMaskVector = Vector256.Create(ResetMask); @@ -114,8 +115,37 @@ public unsafe int ResetAVXNoPopcount() return 0; } + //[Benchmark()] + //public unsafe int ResetAVXNoPopcountUnroll2() + //{ + // if (table.Length < 16) + // { + // return ResetAVXNoPopcount(); + // } + + // var resetMaskVector = Vector256.Create(ResetMask); + + // fixed (long* tPtr = &table[0]) + // { + // for (int i = 0; i < table.Length; i += 8) + // { + // Vector256 t1 = Avx2.LoadVector256(tPtr + i).AsInt64(); + // t1 = Avx2.ShiftRightLogical(t1, 1); + // t1 = Avx2.And(t1, resetMaskVector); + // Avx2.Store(tPtr + i, t1); + + // Vector256 t2 = Avx2.LoadVector256(tPtr + i + 4).AsInt64(); + // t2 = Avx2.ShiftRightLogical(t2, 1); + // t2 = Avx2.And(t2, resetMaskVector); + // Avx2.Store(tPtr + i + 4, t2); + // } + // } + + // return 0; + //} + [Benchmark()] - public unsafe int ResetAVXNoPopcountUnroll2() + public unsafe int ResetAVXNoPopcountUnroll4() { if (table.Length < 16) { @@ -126,7 +156,7 @@ public unsafe int ResetAVXNoPopcountUnroll2() fixed (long* tPtr = &table[0]) { - for (int i = 0; i < table.Length; i += 8) + for (int i = 0; i < table.Length; i += 16) { Vector256 t1 = Avx2.LoadVector256(tPtr + i).AsInt64(); t1 = Avx2.ShiftRightLogical(t1, 1); @@ -137,6 +167,16 @@ public unsafe int ResetAVXNoPopcountUnroll2() t2 = Avx2.ShiftRightLogical(t2, 1); t2 = Avx2.And(t2, resetMaskVector); Avx2.Store(tPtr + i + 4, t2); + + Vector256 t3 = Avx2.LoadVector256(tPtr + i + 8).AsInt64(); + t3 = Avx2.ShiftRightLogical(t3, 1); + t3 = Avx2.And(t3, resetMaskVector); + Avx2.Store(tPtr + i + 8, t3); + + Vector256 t4 = Avx2.LoadVector256(tPtr + i + 12).AsInt64(); + t4 = Avx2.ShiftRightLogical(t4, 1); + t4 = Avx2.And(t4, resetMaskVector); + Avx2.Store(tPtr + i + 12, t4); } } @@ -144,7 +184,7 @@ public unsafe int ResetAVXNoPopcountUnroll2() } [Benchmark()] - public unsafe int ResetAVXNoPopcountUnroll4() + public unsafe int ResetAVXAlignedNoPopcountUnroll4() { if (table.Length < 16) { @@ -155,31 +195,47 @@ public unsafe int ResetAVXNoPopcountUnroll4() fixed (long* tPtr = &table[0]) { - for (int i = 0; i < table.Length; i += 16) + long* alignedPtr = tPtr; + + while (((ulong)alignedPtr & 31UL) != 0) { - Vector256 t1 = Avx2.LoadVector256(tPtr + i).AsInt64(); - t1 = Avx2.ShiftRightLogical(t1, 1); - t1 = Avx2.And(t1, resetMaskVector); - Avx2.Store(tPtr + i, t1); + *alignedPtr = (*alignedPtr >> 1) & ResetMask; + alignedPtr++; + } - Vector256 t2 = Avx2.LoadVector256(tPtr + i + 4).AsInt64(); - t2 = Avx2.ShiftRightLogical(t2, 1); - t2 = Avx2.And(t2, resetMaskVector); - Avx2.Store(tPtr + i + 4, t2); + int c = table.Length - (int)(alignedPtr - tPtr) -16; - Vector256 t3 = Avx2.LoadVector256(tPtr + i + 8).AsInt64(); - t3 = Avx2.ShiftRightLogical(t3, 1); - t3 = Avx2.And(t3, resetMaskVector); - Avx2.Store(tPtr + i + 8, t3); + for (int i = 0; i < c; i += 16) + { + Vector256 t1 = Avx2.LoadAlignedVector256(alignedPtr + i).AsInt64(); + Vector256 t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4).AsInt64(); + Vector256 t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8).AsInt64(); + Vector256 t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12).AsInt64(); - Vector256 t4 = Avx2.LoadVector256(tPtr + i + 12).AsInt64(); + t1 = Avx2.ShiftRightLogical(t1, 1); + t2 = Avx2.ShiftRightLogical(t2, 1); + t3 = Avx2.ShiftRightLogical(t3, 1); t4 = Avx2.ShiftRightLogical(t4, 1); + + t1 = Avx2.And(t1, resetMaskVector); + t2 = Avx2.And(t2, resetMaskVector); + t3 = Avx2.And(t3, resetMaskVector); t4 = Avx2.And(t4, resetMaskVector); - Avx2.Store(tPtr + i + 12, t4); + + Avx2.StoreAligned(alignedPtr + i, t1); + Avx2.StoreAligned(alignedPtr + i + 4, t2); + Avx2.StoreAligned(alignedPtr + i + 8, t3); + Avx2.StoreAligned(alignedPtr + i + 12, t4); } - } - return 0; + for (int i = c; i < table.Length; i++) + { + *alignedPtr = (*alignedPtr >> 1) & ResetMask; + alignedPtr++; + } + + return 0; + } } } } diff --git a/BitFaster.Caching.UnitTests/Alignment.cs b/BitFaster.Caching.UnitTests/Alignment.cs new file mode 100644 index 00000000..9b3e2608 --- /dev/null +++ b/BitFaster.Caching.UnitTests/Alignment.cs @@ -0,0 +1,95 @@ +using System; +using System.Collections.Generic; +using System.Drawing; +using System.Linq; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; +using System.Text; +using System.Threading.Tasks; +using Xunit; +using FluentAssertions; + +namespace BitFaster.Caching.UnitTests +{ + public class Alignment + { + static long ResetMask = 0x7777777777777777L; + + long[] table; + + private const ulong AlignmentMask = 31UL; + + [Fact] + public void Runner() + { + for (int i = 0; i < 8000; i++) + { + Test(); + } + } + + private unsafe void Test() + { + table = new long[128]; + + for (int i = 0; i < table.Length; i++) + { + table[i] = 15; + } + + + var resetMaskVector = Vector256.Create(ResetMask); + + fixed (long* tPtr = &table[0]) + { + long* alignedPtr = tPtr; + int remainder = 0; + + while (((ulong)alignedPtr & 31UL) != 0) + { + *alignedPtr = (*alignedPtr >> 1) & ResetMask; + alignedPtr++; + remainder = 16; + } + + int c = table.Length - (int)(alignedPtr - tPtr) - remainder; + int i = 0; + + for (; i < c; i += 16) + { + Vector256 t1 = Avx2.LoadAlignedVector256(alignedPtr + i).AsInt64(); + t1 = Avx2.ShiftRightLogical(t1, 1); + t1 = Avx2.And(t1, resetMaskVector); + Avx2.StoreAligned(alignedPtr + i, t1); + + Vector256 t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4).AsInt64(); + t2 = Avx2.ShiftRightLogical(t2, 1); + t2 = Avx2.And(t2, resetMaskVector); + Avx2.StoreAligned(alignedPtr + i + 4, t2); + + Vector256 t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8).AsInt64(); + t3 = Avx2.ShiftRightLogical(t3, 1); + t3 = Avx2.And(t3, resetMaskVector); + Avx2.StoreAligned(alignedPtr + i + 8, t3); + + Vector256 t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12).AsInt64(); + t4 = Avx2.ShiftRightLogical(t4, 1); + t4 = Avx2.And(t4, resetMaskVector); + Avx2.StoreAligned(alignedPtr + i + 12, t4); + } + + int start = (int)(alignedPtr - tPtr) + i; + + for (int j = start; j < table.Length; j++) + { + tPtr[j] = (tPtr[j] >> 1) & ResetMask; + } + + for (int j = 0; j < table.Length; j++) + { + table[j].Should().Be(7); + } + } + } + } +} diff --git a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj index 4ddc7ca1..636ccc83 100644 --- a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj +++ b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj @@ -2,6 +2,7 @@ net6.0 + True From a4e659a8f66d95a39aa0c8eb5c9ec1b77c0466e6 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sat, 8 Oct 2022 19:57:41 -0700 Subject: [PATCH 3/3] cleanup --- .../Lfu/SketchReset.cs | 208 +++++++++--------- BitFaster.Caching.UnitTests/Alignment.cs | 10 +- 2 files changed, 111 insertions(+), 107 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs index ad8288fe..0a768d28 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs @@ -40,72 +40,72 @@ public int Reset1() return count; } - //[Benchmark()] - //public int Reset2() - //{ - // int count0 = 0; - // int count1 = 0; - - // for (int i = 0; i < table.Length; i += 2) - // { - // count0 += BitOps.BitCount(table[i] & OneMask); - // count1 += BitOps.BitCount(table[i + 1] & OneMask); - - // table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - // table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - // } - - // return count0 + count1; - //} - - //[Benchmark()] - //public int Reset4() - //{ - // int count0 = 0; - // int count1 = 0; - // int count2 = 0; - // int count3 = 0; - - // for (int i = 0; i < table.Length; i += 4) - // { - // count0 += BitOps.BitCount(table[i] & OneMask); - // count1 += BitOps.BitCount(table[i + 1] & OneMask); - // count2 += BitOps.BitCount(table[i + 2] & OneMask); - // count3 += BitOps.BitCount(table[i + 3] & OneMask); - - // table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - // table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - // table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; - // table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; - // } - - // return (count0 + count1) + (count2 + count3); - //} - - //[Benchmark()] - //public int Reset4NoPopcount() - //{ - // for (int i = 0; i < table.Length; i += 4) - // { - // table[i] = (long)((ulong)table[i] >> 1) & ResetMask; - // table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; - // table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; - // table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; - // } - - // return 0; - //} - - //[Benchmark()] + [Benchmark()] + public int Reset2() + { + int count0 = 0; + int count1 = 0; + + for (int i = 0; i < table.Length; i += 2) + { + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + } + + return count0 + count1; + } + + [Benchmark()] + public int Reset4() + { + int count0 = 0; + int count1 = 0; + int count2 = 0; + int count3 = 0; + + for (int i = 0; i < table.Length; i += 4) + { + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + count2 += BitOps.BitCount(table[i + 2] & OneMask); + count3 += BitOps.BitCount(table[i + 3] & OneMask); + + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + } + + return (count0 + count1) + (count2 + count3); + } + + [Benchmark()] + public int Reset4NoPopcount() + { + for (int i = 0; i < table.Length; i += 4) + { + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + } + + return 0; + } + + [Benchmark()] public unsafe int ResetAVXNoPopcount() { var resetMaskVector = Vector256.Create(ResetMask); - fixed (long* tPtr = &table[0]) + fixed (long* tPtr = table) { for (int i = 0; i < table.Length; i += 4) { - Vector256 t = Avx2.LoadVector256(tPtr + i).AsInt64(); + Vector256 t = Avx2.LoadVector256(tPtr + i); t = Avx2.ShiftRightLogical(t, 1); t = Avx2.And(t, resetMaskVector); Avx2.Store(tPtr + i, t); @@ -115,34 +115,34 @@ public unsafe int ResetAVXNoPopcount() return 0; } - //[Benchmark()] - //public unsafe int ResetAVXNoPopcountUnroll2() - //{ - // if (table.Length < 16) - // { - // return ResetAVXNoPopcount(); - // } - - // var resetMaskVector = Vector256.Create(ResetMask); - - // fixed (long* tPtr = &table[0]) - // { - // for (int i = 0; i < table.Length; i += 8) - // { - // Vector256 t1 = Avx2.LoadVector256(tPtr + i).AsInt64(); - // t1 = Avx2.ShiftRightLogical(t1, 1); - // t1 = Avx2.And(t1, resetMaskVector); - // Avx2.Store(tPtr + i, t1); - - // Vector256 t2 = Avx2.LoadVector256(tPtr + i + 4).AsInt64(); - // t2 = Avx2.ShiftRightLogical(t2, 1); - // t2 = Avx2.And(t2, resetMaskVector); - // Avx2.Store(tPtr + i + 4, t2); - // } - // } - - // return 0; - //} + [Benchmark()] + public unsafe int ResetAVXNoPopcountUnroll2() + { + if (table.Length < 16) + { + return ResetAVXNoPopcount(); + } + + var resetMaskVector = Vector256.Create(ResetMask); + + fixed (long* tPtr = table) + { + for (int i = 0; i < table.Length; i += 8) + { + Vector256 t1 = Avx2.LoadVector256(tPtr + i); + t1 = Avx2.ShiftRightLogical(t1, 1); + t1 = Avx2.And(t1, resetMaskVector); + Avx2.Store(tPtr + i, t1); + + Vector256 t2 = Avx2.LoadVector256(tPtr + i + 4); + t2 = Avx2.ShiftRightLogical(t2, 1); + t2 = Avx2.And(t2, resetMaskVector); + Avx2.Store(tPtr + i + 4, t2); + } + } + + return 0; + } [Benchmark()] public unsafe int ResetAVXNoPopcountUnroll4() @@ -154,26 +154,26 @@ public unsafe int ResetAVXNoPopcountUnroll4() var resetMaskVector = Vector256.Create(ResetMask); - fixed (long* tPtr = &table[0]) + fixed (long* tPtr = table) { for (int i = 0; i < table.Length; i += 16) { - Vector256 t1 = Avx2.LoadVector256(tPtr + i).AsInt64(); + Vector256 t1 = Avx2.LoadVector256(tPtr + i); t1 = Avx2.ShiftRightLogical(t1, 1); t1 = Avx2.And(t1, resetMaskVector); Avx2.Store(tPtr + i, t1); - Vector256 t2 = Avx2.LoadVector256(tPtr + i + 4).AsInt64(); + Vector256 t2 = Avx2.LoadVector256(tPtr + i + 4); t2 = Avx2.ShiftRightLogical(t2, 1); t2 = Avx2.And(t2, resetMaskVector); Avx2.Store(tPtr + i + 4, t2); - Vector256 t3 = Avx2.LoadVector256(tPtr + i + 8).AsInt64(); + Vector256 t3 = Avx2.LoadVector256(tPtr + i + 8); t3 = Avx2.ShiftRightLogical(t3, 1); t3 = Avx2.And(t3, resetMaskVector); Avx2.Store(tPtr + i + 8, t3); - Vector256 t4 = Avx2.LoadVector256(tPtr + i + 12).AsInt64(); + Vector256 t4 = Avx2.LoadVector256(tPtr + i + 12); t4 = Avx2.ShiftRightLogical(t4, 1); t4 = Avx2.And(t4, resetMaskVector); Avx2.Store(tPtr + i + 12, t4); @@ -193,24 +193,27 @@ public unsafe int ResetAVXAlignedNoPopcountUnroll4() var resetMaskVector = Vector256.Create(ResetMask); - fixed (long* tPtr = &table[0]) + fixed (long* tPtr = table) { long* alignedPtr = tPtr; + int remainder = 0; while (((ulong)alignedPtr & 31UL) != 0) { *alignedPtr = (*alignedPtr >> 1) & ResetMask; alignedPtr++; + remainder = 16; } - int c = table.Length - (int)(alignedPtr - tPtr) -16; + int c = table.Length - (int)(alignedPtr - tPtr) - remainder; + int i = 0; - for (int i = 0; i < c; i += 16) + for(; i < c; i += 16) { - Vector256 t1 = Avx2.LoadAlignedVector256(alignedPtr + i).AsInt64(); - Vector256 t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4).AsInt64(); - Vector256 t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8).AsInt64(); - Vector256 t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12).AsInt64(); + Vector256 t1 = Avx2.LoadAlignedVector256(alignedPtr + i); + Vector256 t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4); + Vector256 t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8); + Vector256 t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12); t1 = Avx2.ShiftRightLogical(t1, 1); t2 = Avx2.ShiftRightLogical(t2, 1); @@ -228,10 +231,11 @@ public unsafe int ResetAVXAlignedNoPopcountUnroll4() Avx2.StoreAligned(alignedPtr + i + 12, t4); } - for (int i = c; i < table.Length; i++) + int start = (int)(alignedPtr - tPtr) + i; + + for (int j = start; j < table.Length; j++) { - *alignedPtr = (*alignedPtr >> 1) & ResetMask; - alignedPtr++; + tPtr[j] = (tPtr[j] >> 1) & ResetMask; } return 0; diff --git a/BitFaster.Caching.UnitTests/Alignment.cs b/BitFaster.Caching.UnitTests/Alignment.cs index 9b3e2608..8b94b45e 100644 --- a/BitFaster.Caching.UnitTests/Alignment.cs +++ b/BitFaster.Caching.UnitTests/Alignment.cs @@ -40,7 +40,7 @@ private unsafe void Test() var resetMaskVector = Vector256.Create(ResetMask); - fixed (long* tPtr = &table[0]) + fixed (long* tPtr = table) { long* alignedPtr = tPtr; int remainder = 0; @@ -57,22 +57,22 @@ private unsafe void Test() for (; i < c; i += 16) { - Vector256 t1 = Avx2.LoadAlignedVector256(alignedPtr + i).AsInt64(); + Vector256 t1 = Avx2.LoadAlignedVector256(alignedPtr + i); t1 = Avx2.ShiftRightLogical(t1, 1); t1 = Avx2.And(t1, resetMaskVector); Avx2.StoreAligned(alignedPtr + i, t1); - Vector256 t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4).AsInt64(); + Vector256 t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4); t2 = Avx2.ShiftRightLogical(t2, 1); t2 = Avx2.And(t2, resetMaskVector); Avx2.StoreAligned(alignedPtr + i + 4, t2); - Vector256 t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8).AsInt64(); + Vector256 t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8); t3 = Avx2.ShiftRightLogical(t3, 1); t3 = Avx2.And(t3, resetMaskVector); Avx2.StoreAligned(alignedPtr + i + 8, t3); - Vector256 t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12).AsInt64(); + Vector256 t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12); t4 = Avx2.ShiftRightLogical(t4, 1); t4 = Avx2.And(t4, resetMaskVector); Avx2.StoreAligned(alignedPtr + i + 12, t4);