diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
index fb11022f..905dd8e8 100644
--- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
+++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
@@ -2,7 +2,8 @@
 
   <PropertyGroup>
     <OutputType>Exe</OutputType>
-    <TargetFrameworks>net48;net6.0</TargetFrameworks>
+    <TargetFrameworks>net6.0</TargetFrameworks>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs
index 0c774061..0a768d28 100644
--- a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs
+++ b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs
@@ -1,6 +1,12 @@
 
 using BenchmarkDotNet.Attributes;
 using BenchmarkDotNet.Jobs;
+using BitFaster.Caching.Lfu;
+using Microsoft.Diagnostics.Tracing.StackSources;
+using System.Collections.Generic;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
 
 namespace BitFaster.Caching.Benchmarks.Lfu
 {
@@ -12,11 +18,11 @@ public class SketchReset
 
         long[] table;
 
-        [Params(4, 128, 8192, 1048576)]
+        [Params(8192, 1048576)]
         public int Size { get; set; }
 
         [GlobalSetup]
-        public void Setup()
+        public unsafe void Setup()
         {
             table = new long[Size];
         }
@@ -75,5 +81,172 @@ public int Reset4()
 
             return (count0 + count1) + (count2 + count3);
         }
+
+        /// <summary>Scalar reset, four elements per iteration, without the popcount; assumes the table length is a multiple of 4.</summary>
+        [Benchmark()]
+        public int Reset4NoPopcount()
+        {
+            for (int i = 0; i < table.Length; i += 4)
+            {
+                table[i] = (long)((ulong)table[i] >> 1) & ResetMask;
+                table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask;
+                table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask;
+                table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask;
+            }
+
+            return 0;
+        }
+
+        /// <summary>AVX2 reset, one 256-bit vector (4 longs) per iteration; assumes the table length is a multiple of 4.</summary>
+        [Benchmark()]
+        public unsafe int ResetAVXNoPopcount()
+        {
+            var resetMaskVector = Vector256.Create(ResetMask);
+
+            fixed (long* tPtr = table)
+            {
+                for (int i = 0; i < table.Length; i += 4)
+                {
+                    Vector256<long> t = Avx2.LoadVector256(tPtr + i);
+                    t = Avx2.ShiftRightLogical(t, 1);
+                    t = Avx2.And(t, resetMaskVector);
+                    Avx2.Store(tPtr + i, t);
+                }
+            }
+
+            return 0;
+        }
+
+        /// <summary>AVX2 reset unrolled twice (8 longs per iteration); tables shorter than 16 use ResetAVXNoPopcount. Assumes the length is a multiple of 8.</summary>
+        [Benchmark()]
+        public unsafe int ResetAVXNoPopcountUnroll2()
+        {
+            if (table.Length < 16)
+            {
+                return ResetAVXNoPopcount();
+            }
+
+            var resetMaskVector = Vector256.Create(ResetMask);
+
+            fixed (long* tPtr = table)
+            {
+                for (int i = 0; i < table.Length; i += 8)
+                {
+                    Vector256<long> t1 = Avx2.LoadVector256(tPtr + i);
+                    t1 = Avx2.ShiftRightLogical(t1, 1);
+                    t1 = Avx2.And(t1, resetMaskVector);
+                    Avx2.Store(tPtr + i, t1);
+
+                    Vector256<long> t2 = Avx2.LoadVector256(tPtr + i + 4);
+                    t2 = Avx2.ShiftRightLogical(t2, 1);
+                    t2 = Avx2.And(t2, resetMaskVector);
+                    Avx2.Store(tPtr + i + 4, t2);
+                }
+            }
+
+            return 0;
+        }
+
+        /// <summary>AVX2 reset unrolled four times (16 longs per iteration); tables shorter than 16 use ResetAVXNoPopcount. Assumes the length is a multiple of 16.</summary>
+        [Benchmark()]
+        public unsafe int ResetAVXNoPopcountUnroll4()
+        {
+            if (table.Length < 16)
+            {
+                return ResetAVXNoPopcount();
+            }
+
+            var resetMaskVector = Vector256.Create(ResetMask);
+
+            fixed (long* tPtr = table)
+            {
+                for (int i = 0; i < table.Length; i += 16)
+                {
+                    Vector256<long> t1 = Avx2.LoadVector256(tPtr + i);
+                    t1 = Avx2.ShiftRightLogical(t1, 1);
+                    t1 = Avx2.And(t1, resetMaskVector);
+                    Avx2.Store(tPtr + i, t1);
+
+                    Vector256<long> t2 = Avx2.LoadVector256(tPtr + i + 4);
+                    t2 = Avx2.ShiftRightLogical(t2, 1);
+                    t2 = Avx2.And(t2, resetMaskVector);
+                    Avx2.Store(tPtr + i + 4, t2);
+
+                    Vector256<long> t3 = Avx2.LoadVector256(tPtr + i + 8);
+                    t3 = Avx2.ShiftRightLogical(t3, 1);
+                    t3 = Avx2.And(t3, resetMaskVector);
+                    Avx2.Store(tPtr + i + 8, t3);
+
+                    Vector256<long> t4 = Avx2.LoadVector256(tPtr + i + 12);
+                    t4 = Avx2.ShiftRightLogical(t4, 1);
+                    t4 = Avx2.And(t4, resetMaskVector);
+                    Avx2.Store(tPtr + i + 12, t4);
+                }
+            }
+
+            return 0;
+        }
+
+        /// <summary>AVX2 reset unrolled four times with aligned loads/stores; the unaligned head and the leftover tail are reset with scalar code.</summary>
+        [Benchmark()]
+        public unsafe int ResetAVXAlignedNoPopcountUnroll4()
+        {
+            if (table.Length < 16)
+            {
+                return ResetAVXNoPopcount();
+            }
+
+            var resetMaskVector = Vector256.Create(ResetMask);
+
+            fixed (long* tPtr = table)
+            {
+                long* alignedPtr = tPtr;
+
+                // Scalar-reset the leading elements until the pointer sits on a 32-byte boundary.
+                while (((ulong)alignedPtr & 31UL) != 0)
+                {
+                    *alignedPtr = (long)((ulong)(*alignedPtr) >> 1) & ResetMask;
+                    alignedPtr++;
+                }
+
+                // i < c guarantees that i + 16 never exceeds the elements left after the
+                // aligned start, so the vector loop cannot run past the end of the array.
+                int c = table.Length - (int)(alignedPtr - tPtr) - 15;
+                int i = 0;
+
+                for (; i < c; i += 16)
+                {
+                    Vector256<long> t1 = Avx2.LoadAlignedVector256(alignedPtr + i);
+                    Vector256<long> t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4);
+                    Vector256<long> t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8);
+                    Vector256<long> t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12);
+
+                    t1 = Avx2.ShiftRightLogical(t1, 1);
+                    t2 = Avx2.ShiftRightLogical(t2, 1);
+                    t3 = Avx2.ShiftRightLogical(t3, 1);
+                    t4 = Avx2.ShiftRightLogical(t4, 1);
+
+                    t1 = Avx2.And(t1, resetMaskVector);
+                    t2 = Avx2.And(t2, resetMaskVector);
+                    t3 = Avx2.And(t3, resetMaskVector);
+                    t4 = Avx2.And(t4, resetMaskVector);
+
+                    Avx2.StoreAligned(alignedPtr + i, t1);
+                    Avx2.StoreAligned(alignedPtr + i + 4, t2);
+                    Avx2.StoreAligned(alignedPtr + i + 8, t3);
+                    Avx2.StoreAligned(alignedPtr + i + 12, t4);
+                }
+
+                // Scalar tail: everything the vector loop did not cover.
+                int start = (int)(alignedPtr - tPtr) + i;
+
+                for (int j = start; j < table.Length; j++)
+                {
+                    tPtr[j] = (long)((ulong)tPtr[j] >> 1) & ResetMask;
+                }
+
+                return 0;
+            }
+        }
     }
 }
diff --git a/BitFaster.Caching.UnitTests/Alignment.cs b/BitFaster.Caching.UnitTests/Alignment.cs
new file mode 100644
index 00000000..8b94b45e
--- /dev/null
+++ b/BitFaster.Caching.UnitTests/Alignment.cs
@@ -0,0 +1,106 @@
+using System;
+using System.Collections.Generic;
+using System.Drawing;
+using System.Linq;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+using System.Text;
+using System.Threading.Tasks;
+using Xunit;
+using FluentAssertions;
+
+namespace BitFaster.Caching.UnitTests
+{
+    /// <summary>Checks that the aligned AVX2 reset loop (scalar head, aligned vector body, scalar tail) resets every element of the table.</summary>
+    public class Alignment
+    {
+        private const long ResetMask = 0x7777777777777777L;
+
+        long[] table;
+
+        // Low five address bits; all zero when a pointer is 32-byte aligned.
+        private const ulong AlignmentMask = 31UL;
+
+        [Fact]
+        public void Runner()
+        {
+            // The test body calls AVX2 intrinsics directly, which throw on hardware without AVX2.
+            if (!Avx2.IsSupported)
+            {
+                return;
+            }
+
+            // Repeated with a fresh array each time; presumably to hit different start alignments.
+            for (int i = 0; i < 8000; i++)
+            {
+                Test();
+            }
+        }
+
+        private unsafe void Test()
+        {
+            table = new long[128];
+
+            for (int i = 0; i < table.Length; i++)
+            {
+                table[i] = 15;
+            }
+
+            var resetMaskVector = Vector256.Create(ResetMask);
+
+            fixed (long* tPtr = table)
+            {
+                long* alignedPtr = tPtr;
+
+                // Scalar-reset the leading elements until the pointer sits on a 32-byte boundary.
+                while (((ulong)alignedPtr & AlignmentMask) != 0)
+                {
+                    *alignedPtr = (long)((ulong)(*alignedPtr) >> 1) & ResetMask;
+                    alignedPtr++;
+                }
+
+                // i < c guarantees that i + 16 never exceeds the elements left after the
+                // aligned start, so the vector loop cannot run past the end of the array.
+                int c = table.Length - (int)(alignedPtr - tPtr) - 15;
+                int i = 0;
+
+                for (; i < c; i += 16)
+                {
+                    Vector256<long> t1 = Avx2.LoadAlignedVector256(alignedPtr + i);
+                    t1 = Avx2.ShiftRightLogical(t1, 1);
+                    t1 = Avx2.And(t1, resetMaskVector);
+                    Avx2.StoreAligned(alignedPtr + i, t1);
+
+                    Vector256<long> t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4);
+                    t2 = Avx2.ShiftRightLogical(t2, 1);
+                    t2 = Avx2.And(t2, resetMaskVector);
+                    Avx2.StoreAligned(alignedPtr + i + 4, t2);
+
+                    Vector256<long> t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8);
+                    t3 = Avx2.ShiftRightLogical(t3, 1);
+                    t3 = Avx2.And(t3, resetMaskVector);
+                    Avx2.StoreAligned(alignedPtr + i + 8, t3);
+
+                    Vector256<long> t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12);
+                    t4 = Avx2.ShiftRightLogical(t4, 1);
+                    t4 = Avx2.And(t4, resetMaskVector);
+                    Avx2.StoreAligned(alignedPtr + i + 12, t4);
+                }
+
+                // Scalar tail: everything the vector loop did not cover.
+                int start = (int)(alignedPtr - tPtr) + i;
+
+                for (int j = start; j < table.Length; j++)
+                {
+                    tPtr[j] = (long)((ulong)tPtr[j] >> 1) & ResetMask;
+                }
+
+                // Every element started at 15, and (15 >> 1) & 0x7 == 7.
+                for (int j = 0; j < table.Length; j++)
+                {
+                    table[j].Should().Be(7);
+                }
+            }
+        }
+    }
+}
diff --git a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj
index 4ddc7ca1..636ccc83 100644
--- a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj
+++ b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj
@@ -2,6 +2,7 @@
 
   <PropertyGroup>
     <TargetFramework>net6.0</TargetFramework>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
   </PropertyGroup>
 
   <ItemGroup>