
Commit dd97e83: cache context

Committed Feb 11, 2025
1 parent: 9828c50

3 files changed: +22 -13 lines


TorchSharp.BitsAndBytes.Benchmark/CudaBenchmark.cs (+6 -6)
@@ -32,34 +32,34 @@ public void Setup()
         (quantizedTensor, absMax, _, _) = BitsAndByteUtils.Quantize4Bit(b, "fp4", blockSize);
     }
 
-    //[Benchmark]
+    [Benchmark]
     public void Quantize4Bit()
     {
         var result = BitsAndByteUtils.Quantize4Bit(a1, quantizedDType, blockSize);
     }
 
-    //[Benchmark]
+    [Benchmark]
     public void Dequantize4Bit()
     {
         var (quantizedTensor, absMax, _, n) = BitsAndByteUtils.Quantize4Bit(a1, quantizedDType, blockSize);
         var result = BitsAndByteUtils.Dequantize4Bit(quantizedTensor, absMax, ScalarType.Float32, quantizedDType, n, a1.shape, blockSize);
     }
 
-    //[Benchmark]
+    [Benchmark]
     public void GEMV_4Bit_FP4()
     {
         using var input = torch.rand(new long[] { 1, dim }, dtype: ScalarType.Float32).cuda();
         using var result = BitsAndByteUtils.Gemv4Bit(input, quantizedTensor, [4*dim, dim], absMax, blockSize, quantizedDType);
     }
 
-    //[Benchmark]
+    [Benchmark]
     public void GEMV_4Bit_NF4()
     {
         using var input = torch.rand(new long[] { 1, dim }, dtype: ScalarType.Float32).cuda();
         using var result = BitsAndByteUtils.Gemv4Bit(input, quantizedTensor, [4 * dim, dim], absMax, blockSize, "nf4");
     }
 
-    //[Benchmark]
+    [Benchmark]
     public void GEMV_FP32()
     {
         using var input = torch.rand([1, dim], dtype: ScalarType.Float32).cuda();
@@ -74,7 +74,7 @@ public void GEMM_INT8()
         using var result = Function.Int8GEMM(input, weight);
     }
 
-    //[Benchmark]
+    [Benchmark]
     public void GEMM_FP32()
     {
         using var input = torch.randint(-128, 127, new long[] { 1, dim }, dtype: ScalarType.Float32).cuda();

TorchSharp.BitsAndBytes.Benchmark/Program.cs (+0 -1)

@@ -1,4 +1,3 @@
 using BenchmarkDotNet.Running;
 using TorchSharp.BitsAndBytes.Benchmark;
-new CudaBenchmark().GEMM_INT8();
 BenchmarkRunner.Run<CudaBenchmark>();
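
A side note on the toggling above: instead of commenting [Benchmark] attributes in and out (or pinning a single case with a manual call, as the removed line did), BenchmarkDotNet can select benchmarks at run time. A minimal sketch, assuming a BenchmarkDotNet version whose Run overload forwards command-line args (0.12 or later):

    using BenchmarkDotNet.Running;
    using TorchSharp.BitsAndBytes.Benchmark;

    // Forwarding args lets one benchmark be selected without editing attributes,
    // e.g.: dotnet run -c Release -- --filter '*GEMM_INT8*'
    BenchmarkRunner.Run<CudaBenchmark>(args: args);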

TorchSharp.BitsAndBytes/Function.cs (+16 -6)
@@ -9,6 +9,7 @@ namespace TorchSharp.BitsAndBytes;
 
 public class Function
 {
+    private static readonly Lazy<Dictionary<int, IntPtr>> _context = new(() => new Dictionary<int, IntPtr>());
     /// <summary>
     /// Integer General Matrix Multiplication (IGEMM) for 8-bit integer data types.
     /// </summary>
@@ -24,7 +25,7 @@ public static Tensor Int8GEMM(
         bool transposeInput = false)
     {
         var sout = BitsAndByteUtils.CheckMatmul(input, weight, transposeWeight, transposeInput);
-        var @out = torch.zeros((long[])[.. sout], dtype: torch.int32, device: input.device);
+        var result = torch.zeros((long[])[.. sout], dtype: torch.int32, device: input.device);
         if (input.shape.Length == 3 && weight.shape.Length == 3)
         {
             if (input.shape[0] == weight.shape[0] && input.shape[2] == weight.shape[1])
@@ -130,16 +131,25 @@ public static Tensor Int8GEMM(
             ldc = m;
         }
 
-        var context = BitsAndBytesCudaNative.get_context();
+        IntPtr context;
+        if (_context.Value.TryGetValue(input.device_index, out var ctx))
+        {
+            context = ctx;
+        }
+        else
+        {
+            context = BitsAndBytesCudaNative.get_context();
+            _context.Value[input.device_index] = context;
+        }
+
         var A = LibTorchNativeMethod.THSStorage_data_ptr(input.Handle);
         var B = LibTorchNativeMethod.THSStorage_data_ptr(weight.Handle);
-        var C = LibTorchNativeMethod.THSStorage_data_ptr(@out.Handle);
-
+        var C = LibTorchNativeMethod.THSStorage_data_ptr(result.Handle);
         BitsAndBytesCudaNative.cigemm(
             context: context,
             transposeA: transposeWeight, // cuBLAS expects column major, but PyTorch is row major
             transposeB: transposeInput, // So we have to transpose A and B
-            m: m,
+            m: m,
             n: n,
             k: k,
             A: B, // out_T = B_T @ A_T
@@ -148,7 +158,7 @@ public static Tensor Int8GEMM(
             lda: lda,
             ldb: ldb,
             ldc: ldc);
+        return result;
 
-        return @out;
     }
 }
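
The Function.cs change is the commit's point: the native context handle is now cached per CUDA device index in a Lazy<Dictionary<int, IntPtr>>, so BitsAndBytesCudaNative.get_context() runs once per device instead of on every Int8GEMM call. A minimal sketch of the same idea under one extra assumption, that Int8GEMM may be called from several threads at once: a plain Dictionary is not thread-safe, so this variant uses ConcurrentDictionary.GetOrAdd (NativeGetContext is a hypothetical stand-in for the P/Invoke call):

    using System;
    using System.Collections.Concurrent;

    static class ContextCache
    {
        // One native context per CUDA device index, created on first use.
        private static readonly ConcurrentDictionary<int, IntPtr> Contexts = new();

        public static IntPtr For(int deviceIndex) =>
            Contexts.GetOrAdd(deviceIndex, _ => NativeGetContext());

        // Hypothetical stand-in for BitsAndBytesCudaNative.get_context().
        private static IntPtr NativeGetContext() => IntPtr.Zero;
    }

One caveat: GetOrAdd may invoke the value factory more than once under contention and discard the extra result, so if the native context must be created and released exactly once per device, storing a Lazy<IntPtr> value (ConcurrentDictionary<int, Lazy<IntPtr>>) is the safer shape.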

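The "out_T = B_T @ A_T" comment in the cigemm call compresses a standard trick: cuBLAS is column-major while PyTorch tensors are row-major, and a row-major m x n buffer reinterpreted as column-major is exactly the n x m transpose. So rather than transposing any data, the call computes out^T = B^T @ A^T by swapping the operand roles, and the row-major view of the column-major result is out = A @ B. A small self-contained check of that identity (illustration only, plain C# arrays):

    using System;
    using System.Linq;

    static class ColumnMajorDemo
    {
        // Row-major C = A (m x k) @ B (k x n).
        static double[] MatMul(double[] a, double[] b, int m, int k, int n)
        {
            var c = new double[m * n];
            for (int i = 0; i < m; i++)
                for (int j = 0; j < n; j++)
                    for (int p = 0; p < k; p++)
                        c[i * n + j] += a[i * k + p] * b[p * n + j];
            return c;
        }

        // Row-major transpose of an (m x n) matrix.
        static double[] Transpose(double[] x, int m, int n)
        {
            var t = new double[m * n];
            for (int i = 0; i < m; i++)
                for (int j = 0; j < n; j++)
                    t[j * m + i] = x[i * n + j];
            return t;
        }

        static void Main()
        {
            int m = 2, k = 3, n = 4;
            var rng = new Random(0);
            double[] a = Enumerable.Range(0, m * k).Select(_ => rng.NextDouble()).ToArray();
            double[] b = Enumerable.Range(0, k * n).Select(_ => rng.NextDouble()).ToArray();

            var c  = MatMul(a, b, m, k, n);                                   // out = A @ B (row-major)
            var cT = MatMul(Transpose(b, k, n), Transpose(a, m, k), n, k, m); // out^T = B^T @ A^T

            // cT, read row-major, is out^T: the column-major buffer of out.
            Console.WriteLine(Transpose(cT, n, m).SequenceEqual(c) ? "identity holds" : "mismatch");
        }
    }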