 #include "quant/qdq_6.cuh"
 #include "quant/qdq_8.cuh"
 
-#define BLOCK_KN_SIZE 128
-#define BLOCK_M_SIZE_MAX 8
-#define MAX_GROUPS_IN_BLOCK (BLOCK_KN_SIZE / 32)
+#define GPTQ_BLOCK_KN_SIZE 128
+#define GPTQ_BLOCK_M_SIZE_MAX 8
+#define GPTQ_MAX_GROUPS_IN_BLOCK (GPTQ_BLOCK_KN_SIZE / 32)
+
+#define EXL2_BLOCK_KN_SIZE 64
+#define EXL2_BLOCK_M_SIZE_MAX 8
+#define EXL2_MAX_GROUPS_IN_BLOCK (EXL2_BLOCK_KN_SIZE / 32)
+
 #define CLEAR_N_SIZE 256
 
 #include "q_gemm_kernel.cuh"
 #include "q_gemm_kernel_gptq.cuh"
 
-#include "compat_gemm.cuh"
-
 void gemm_half_q_half_cuda_part
 (
     const half* a,
@@ -29,20 +32,23 @@ void gemm_half_q_half_cuda_part
     int size_n,
     int size_k,
     int m_count,
-    bool clear
+    bool clear,
+    const half* r_weights,
+    int r_weights_stride,
+    bool mul_r_weights
 )
 {
     if (!b->is_gptq)
     {
         dim3 blockDim, gridDim;
-        blockDim.x = BLOCK_KN_SIZE;
+        blockDim.x = EXL2_BLOCK_KN_SIZE;
         blockDim.y = 1;
         blockDim.z = 1;
-        gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4);
+        gridDim.x = DIVIDE(size_n, EXL2_BLOCK_KN_SIZE * 4);
         gridDim.y = DIVIDE(size_m, m_count);
-        gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE);
+        gridDim.z = DIVIDE(size_k, EXL2_BLOCK_KN_SIZE);
 
-        fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(true, m_count);
+        fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(m_count, r_weights != NULL, mul_r_weights);
 
         kernel<<<gridDim, blockDim>>>
         (
@@ -55,32 +61,35 @@ void gemm_half_q_half_cuda_part
             size_n,
             size_k,
             b->groups,
-            b->groupsize,
+            b->cuda_q_group_map,
             b->cuda_q_perm,
             b->rows_8,
             b->rows_6,
             b->rows_5,
             b->rows_4,
             b->rows_3,
             b->rows_2,
-            clear
+            clear,
+            r_weights,
+            r_weights_stride
         );
     }
     else
     {
         dim3 blockDim, gridDim;
-        blockDim.x = BLOCK_KN_SIZE;
+        blockDim.x = GPTQ_BLOCK_KN_SIZE;
         blockDim.y = 1;
         blockDim.z = 1;
-        gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4);
+        gridDim.x = DIVIDE(size_n, GPTQ_BLOCK_KN_SIZE * 4);
         gridDim.y = DIVIDE(size_m, m_count);
-        gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE);
+        gridDim.z = DIVIDE(size_k, GPTQ_BLOCK_KN_SIZE);
 
-        fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count);
+        fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(m_count, r_weights != NULL, mul_r_weights);
 
-        // DBGX((uint64_t) b->cuda_q_perm);
-        // DBGI(b->rows_4);
-        // DBGI(b->height);
+        // DBGX((uint64_t) r_weights);
+        // if (r_weights)
+        //     print_global_mem(r_weights, 1, 1, 1);
+        // DBGI(r_weights_stride);
 
         kernel<<<gridDim, blockDim>>>
         (
@@ -93,10 +102,12 @@ void gemm_half_q_half_cuda_part
             size_n,
             size_k,
             b->groups,
-            b->groupsize,
+            b->gptq_groupsize,
             b->cuda_q_perm,
             b->rows_4,
-            clear
+            clear,
+            r_weights,
+            r_weights_stride
         );
     }
 }
@@ -112,13 +123,14 @@ void gemm_half_q_half_cuda
     int size_k,
     bool clear,
     half* temp_dq,
-    bool force_cuda
+    bool force_cuda,
+    const half* r_weights,
+    const int r_weights_stride,
+    bool mul_r_weights
 )
 {
     if (size_m > MAX_Q_GEMM_ROWS && !force_cuda)
     {
-        // printf("cublas\n");
-
         // Reconstruct FP16 matrix, then cuBLAS
 
         if (!temp_dq) temp_dq = b->temp_dq;
@@ -139,12 +151,12 @@ void gemm_half_q_half_cuda
         // const float alpha = 1.0f;
         // const float beta = clear ? 0.0f : 1.0f;
         // cublasSgemmEx(cublas_handle,
-        //     CUBLAS_OP_N,
-        //     CUBLAS_OP_N,
-        //     size_n, size_m, size_k,
-        //     &alpha, temp_dq, CUDA_R_16F, size_n,
-        //     a, CUDA_R_16F, size_k,
-        //     &beta, c, CUDA_R_16F, size_n);
+        //               CUBLAS_OP_N,
+        //               CUBLAS_OP_N,
+        //               size_n, size_m, size_k,
+        //               &alpha, temp_dq, CUDA_R_16F, size_n,
+        //               a, CUDA_R_16F, size_k,
+        //               &beta, c, CUDA_R_16F, size_n);
 
         // const float alpha = 1.0f;
         // const float beta = clear ? 0.0f : 1.0f;
@@ -158,24 +170,21 @@ void gemm_half_q_half_cuda
     }
     else
     {
-        // printf("cuda\n");
-
         // Quantized matmul
 
-        // if (clear) clear_tensor_cuda(c, size_m, size_n);
-
-        int max_chunks = size_m / BLOCK_M_SIZE_MAX;
-        int last_chunk = max_chunks * BLOCK_M_SIZE_MAX;
+        int block_m_size_max = b->is_gptq ? GPTQ_BLOCK_M_SIZE_MAX : EXL2_BLOCK_M_SIZE_MAX;
+        int max_chunks = size_m / block_m_size_max;
+        int last_chunk = max_chunks * block_m_size_max;
         int last_chunk_size = size_m - last_chunk;
 
         if (max_chunks)
         {
-            gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, BLOCK_M_SIZE_MAX, clear);
+            gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, block_m_size_max, clear, r_weights, r_weights_stride, mul_r_weights);
         }
 
         if (last_chunk_size)
         {
-            gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear);
+            gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear, r_weights, r_weights_stride, mul_r_weights);
         }
     }
 }
@@ -201,11 +210,10 @@ void clear_tensor_cuda
     int size_n
 )
 {
-    return;
-    dim3 blockDim, gridDim;
-    blockDim.x = CLEAR_N_SIZE;
-    blockDim.y = 1;
-    gridDim.x = DIVIDE(size_n / 8, CLEAR_N_SIZE);
-    gridDim.y = size_m;
-    clear_kernel<<<gridDim, blockDim>>>(c, size_m, size_n);
+    // dim3 blockDim, gridDim;
+    // blockDim.x = CLEAR_N_SIZE;
+    // blockDim.y = 1;
+    // gridDim.x = DIVIDE(size_n / 8, CLEAR_N_SIZE);
+    // gridDim.y = size_m;
+    // clear_kernel<<<gridDim, blockDim>>>(c, size_m, size_n);
 }
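
Note on the chunking above: gemm_half_q_half_cuda now picks the m-block size per format (GPTQ vs. EXL2) and splits size_m into full-size launches plus one remainder launch. Below is a standalone sketch of that arithmetic, not part of the patch; the constant mirrors EXL2_BLOCK_M_SIZE_MAX and the row count is an arbitrary example.

// Sketch only: reproduces the max_chunks / last_chunk arithmetic from
// gemm_half_q_half_cuda for one example value of size_m.
#include <cstdio>

int main()
{
    const int block_m_size_max = 8;  // EXL2_BLOCK_M_SIZE_MAX in the patch
    const int size_m = 21;           // example row count

    int max_chunks = size_m / block_m_size_max;      // full-size launches
    int last_chunk = max_chunks * block_m_size_max;  // rows they cover
    int last_chunk_size = size_m - last_chunk;       // remainder launch

    if (max_chunks)
        printf("bulk: rows [0, %d), m_count = %d\n", last_chunk, block_m_size_max);
    if (last_chunk_size)
        printf("tail: rows [%d, %d), m_count = %d\n", last_chunk, size_m, last_chunk_size);
    return 0;
}

For size_m = 21 this prints a bulk launch over rows [0, 16) with m_count = 8 and a tail launch over rows [16, 21) with m_count = 5, matching the two gemm_half_q_half_cuda_part calls in the patch.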
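The updated pick_gemm_half_q_half_kernel(m_count, r_weights != NULL, mul_r_weights) call suggests the kernels are now also specialized on the two r_weights flags, not just m_count. A minimal sketch of that boolean-template selection pattern follows; all names are hypothetical stand-ins, since the real instantiations live in q_gemm_kernel.cuh, which this diff does not show.

// Sketch: select a compile-time kernel specialization from runtime flags.
#include <cuda_fp16.h>

typedef void (*fp_demo_kernel)(const half*, half*, int);

template <bool use_r_weights, bool mul_r_weights>
__global__ void demo_kernel(const half* a, half* c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    half v = a[i];
    // Dead branches are eliminated per instantiation at compile time,
    // so the no-weights variant pays no cost for the feature.
    if (use_r_weights && mul_r_weights) v = __hmul(v, __float2half(2.0f));
    c[i] = v;
}

fp_demo_kernel pick_demo_kernel(bool use_r_weights, bool mul_r_weights)
{
    if (!use_r_weights) return demo_kernel<false, false>;
    if (!mul_r_weights) return demo_kernel<true, false>;
    return demo_kernel<true, true>;
}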