diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
index fb11022f..905dd8e8 100644
--- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
+++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
@@ -2,7 +2,8 @@
     <OutputType>Exe</OutputType>
-    <TargetFrameworks>net48;net6.0</TargetFrameworks>
+    <TargetFramework>net6.0</TargetFramework>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs
index 0c774061..0a768d28 100644
--- a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs
+++ b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs
@@ -1,6 +1,12 @@
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
+using BitFaster.Caching.Lfu;
+using Microsoft.Diagnostics.Tracing.StackSources;
+using System.Collections.Generic;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
namespace BitFaster.Caching.Benchmarks.Lfu
{
@@ -12,11 +18,11 @@ public class SketchReset
long[] table;
- [Params(4, 128, 8192, 1048576)]
+ [Params(8192, 1048576)]
public int Size { get; set; }
[GlobalSetup]
- public void Setup()
+ public unsafe void Setup()
{
table = new long[Size];
}
@@ -75,5 +81,165 @@ public int Reset4()
return (count0 + count1) + (count2 + count3);
}
+
+ [Benchmark()]
+ public int Reset4NoPopcount()
+ {
+ for (int i = 0; i < table.Length; i += 4)
+ {
+ table[i] = (long)((ulong)table[i] >> 1) & ResetMask;
+ table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask;
+ table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask;
+ table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask;
+ }
+
+ return 0;
+ }
+
+ [Benchmark()]
+ public unsafe int ResetAVXNoPopcount()
+ {
+ var resetMaskVector = Vector256.Create(ResetMask);
+
+ fixed (long* tPtr = table)
+ {
+ for (int i = 0; i < table.Length; i += 4)
+ {
+ Vector256<long> t = Avx2.LoadVector256(tPtr + i);
+ t = Avx2.ShiftRightLogical(t, 1);
+ t = Avx2.And(t, resetMaskVector);
+ Avx2.Store(tPtr + i, t);
+ }
+ }
+
+ return 0;
+ }
+
+ [Benchmark()]
+ public unsafe int ResetAVXNoPopcountUnroll2()
+ {
+ if (table.Length < 16)
+ {
+ return ResetAVXNoPopcount();
+ }
+
+ var resetMaskVector = Vector256.Create(ResetMask);
+
+ fixed (long* tPtr = table)
+ {
+ for (int i = 0; i < table.Length; i += 8)
+ {
+ Vector256<long> t1 = Avx2.LoadVector256(tPtr + i);
+ t1 = Avx2.ShiftRightLogical(t1, 1);
+ t1 = Avx2.And(t1, resetMaskVector);
+ Avx2.Store(tPtr + i, t1);
+
+ Vector256<long> t2 = Avx2.LoadVector256(tPtr + i + 4);
+ t2 = Avx2.ShiftRightLogical(t2, 1);
+ t2 = Avx2.And(t2, resetMaskVector);
+ Avx2.Store(tPtr + i + 4, t2);
+ }
+ }
+
+ return 0;
+ }
+
+ [Benchmark()]
+ public unsafe int ResetAVXNoPopcountUnroll4()
+ {
+ if (table.Length < 16)
+ {
+ return ResetAVXNoPopcount();
+ }
+
+ var resetMaskVector = Vector256.Create(ResetMask);
+
+ fixed (long* tPtr = table)
+ {
+ for (int i = 0; i < table.Length; i += 16)
+ {
+ Vector256<long> t1 = Avx2.LoadVector256(tPtr + i);
+ t1 = Avx2.ShiftRightLogical(t1, 1);
+ t1 = Avx2.And(t1, resetMaskVector);
+ Avx2.Store(tPtr + i, t1);
+
+ Vector256<long> t2 = Avx2.LoadVector256(tPtr + i + 4);
+ t2 = Avx2.ShiftRightLogical(t2, 1);
+ t2 = Avx2.And(t2, resetMaskVector);
+ Avx2.Store(tPtr + i + 4, t2);
+
+ Vector256<long> t3 = Avx2.LoadVector256(tPtr + i + 8);
+ t3 = Avx2.ShiftRightLogical(t3, 1);
+ t3 = Avx2.And(t3, resetMaskVector);
+ Avx2.Store(tPtr + i + 8, t3);
+
+ Vector256<long> t4 = Avx2.LoadVector256(tPtr + i + 12);
+ t4 = Avx2.ShiftRightLogical(t4, 1);
+ t4 = Avx2.And(t4, resetMaskVector);
+ Avx2.Store(tPtr + i + 12, t4);
+ }
+ }
+
+ return 0;
+ }
+
+ [Benchmark()]
+ public unsafe int ResetAVXAlignedNoPopcountUnroll4()
+ {
+ if (table.Length < 16)
+ {
+ return ResetAVXNoPopcount();
+ }
+
+ var resetMaskVector = Vector256.Create(ResetMask);
+
+ fixed (long* tPtr = table)
+ {
+ long* alignedPtr = tPtr;
+ int remainder = 0;
+
+ while (((ulong)alignedPtr & 31UL) != 0)
+ {
+ *alignedPtr = (*alignedPtr >> 1) & ResetMask;
+ alignedPtr++;
+ remainder = 16;
+ }
+
+ int c = table.Length - (int)(alignedPtr - tPtr) - remainder;
+ int i = 0;
+
+ for(; i < c; i += 16)
+ {
+ Vector256<long> t1 = Avx2.LoadAlignedVector256(alignedPtr + i);
+ Vector256<long> t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4);
+ Vector256<long> t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8);
+ Vector256<long> t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12);
+
+ t1 = Avx2.ShiftRightLogical(t1, 1);
+ t2 = Avx2.ShiftRightLogical(t2, 1);
+ t3 = Avx2.ShiftRightLogical(t3, 1);
+ t4 = Avx2.ShiftRightLogical(t4, 1);
+
+ t1 = Avx2.And(t1, resetMaskVector);
+ t2 = Avx2.And(t2, resetMaskVector);
+ t3 = Avx2.And(t3, resetMaskVector);
+ t4 = Avx2.And(t4, resetMaskVector);
+
+ Avx2.StoreAligned(alignedPtr + i, t1);
+ Avx2.StoreAligned(alignedPtr + i + 4, t2);
+ Avx2.StoreAligned(alignedPtr + i + 8, t3);
+ Avx2.StoreAligned(alignedPtr + i + 12, t4);
+ }
+
+ int start = (int)(alignedPtr - tPtr) + i;
+
+ for (int j = start; j < table.Length; j++)
+ {
+ tPtr[j] = (tPtr[j] >> 1) & ResetMask;
+ }
+
+ return 0;
+ }
+ }
}
}
diff --git a/BitFaster.Caching.UnitTests/Alignment.cs b/BitFaster.Caching.UnitTests/Alignment.cs
new file mode 100644
index 00000000..8b94b45e
--- /dev/null
+++ b/BitFaster.Caching.UnitTests/Alignment.cs
@@ -0,0 +1,95 @@
+using System;
+using System.Collections.Generic;
+using System.Drawing;
+using System.Linq;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+using System.Text;
+using System.Threading.Tasks;
+using Xunit;
+using FluentAssertions;
+
+namespace BitFaster.Caching.UnitTests
+{
+ public class Alignment
+ {
+ static long ResetMask = 0x7777777777777777L;
+
+ long[] table;
+
+ private const ulong AlignmentMask = 31UL;
+
+ [Fact]
+ public void Runner()
+ {
+ for (int i = 0; i < 8000; i++)
+ {
+ Test();
+ }
+ }
+
+ private unsafe void Test()
+ {
+ table = new long[128];
+
+ for (int i = 0; i < table.Length; i++)
+ {
+ table[i] = 15;
+ }
+
+
+ var resetMaskVector = Vector256.Create(ResetMask);
+
+ fixed (long* tPtr = table)
+ {
+ long* alignedPtr = tPtr;
+ int remainder = 0;
+
+ while (((ulong)alignedPtr & 31UL) != 0)
+ {
+ *alignedPtr = (*alignedPtr >> 1) & ResetMask;
+ alignedPtr++;
+ remainder = 16;
+ }
+
+ int c = table.Length - (int)(alignedPtr - tPtr) - remainder;
+ int i = 0;
+
+ for (; i < c; i += 16)
+ {
+ Vector256<long> t1 = Avx2.LoadAlignedVector256(alignedPtr + i);
+ t1 = Avx2.ShiftRightLogical(t1, 1);
+ t1 = Avx2.And(t1, resetMaskVector);
+ Avx2.StoreAligned(alignedPtr + i, t1);
+
+ Vector256<long> t2 = Avx2.LoadAlignedVector256(alignedPtr + i + 4);
+ t2 = Avx2.ShiftRightLogical(t2, 1);
+ t2 = Avx2.And(t2, resetMaskVector);
+ Avx2.StoreAligned(alignedPtr + i + 4, t2);
+
+ Vector256<long> t3 = Avx2.LoadAlignedVector256(alignedPtr + i + 8);
+ t3 = Avx2.ShiftRightLogical(t3, 1);
+ t3 = Avx2.And(t3, resetMaskVector);
+ Avx2.StoreAligned(alignedPtr + i + 8, t3);
+
+ Vector256<long> t4 = Avx2.LoadAlignedVector256(alignedPtr + i + 12);
+ t4 = Avx2.ShiftRightLogical(t4, 1);
+ t4 = Avx2.And(t4, resetMaskVector);
+ Avx2.StoreAligned(alignedPtr + i + 12, t4);
+ }
+
+ int start = (int)(alignedPtr - tPtr) + i;
+
+ for (int j = start; j < table.Length; j++)
+ {
+ tPtr[j] = (tPtr[j] >> 1) & ResetMask;
+ }
+
+ for (int j = 0; j < table.Length; j++)
+ {
+ table[j].Should().Be(7);
+ }
+ }
+ }
+ }
+}
diff --git a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj
index 4ddc7ca1..636ccc83 100644
--- a/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj
+++ b/BitFaster.Caching.UnitTests/BitFaster.Caching.UnitTests.csproj
@@ -2,6 +2,7 @@
     <TargetFramework>net6.0</TargetFramework>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>