fix(csrc): Remove strong dependency on specific Torch version. (#166)

lcy-seso · web-flow · commit 692a41cbed18 · 2025-01-18T20:16:10.000+08:00
A wheel built against one Torch version could not be used in a local
environment that has a different Torch version installed. This PR fixes that
by exposing the CUDA kernels through a pybind11 module (`vptq.libvptq`)
instead of registering them with `TORCH_LIBRARY`.
diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt
@@ -23,6 +23,10 @@ set_target_properties(
 # https://github.com/pytorch/pytorch/issues/13541
 target_compile_definitions(${TARGET} PUBLIC _GLIBCXX_USE_CXX11_ABI=0)
 
+# The find_package(Torch) command does not expose PyTorch's Python bindings.
+# However, when using Pybind11, we need to link against these bindings.
+list(APPEND TORCH_LIBRARIES "${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so")
+
 target_compile_options(
   ${TARGET}
   PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:
diff --git a/csrc/common.h b/csrc/common.h
@@ -5,6 +5,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/extension.h>
 
+namespace vptq {
 class OptionalCUDAGuard {
   int set_device_ = -1;
   int current_device_ = -1;
@@ -40,3 +41,4 @@ inline void gpuAssert(cudaError_t code, const char* file, int line) {
     TORCH_CHECK(false, cudaGetErrorString(code));
   }
 }
+}  // namespace vptq
diff --git a/csrc/dequant_impl_packed.cu b/csrc/dequant_impl_packed.cu
@@ -7,6 +7,7 @@
 #include "common.h"
 #include "utils.cuh"
 
+namespace vptq {
 template <typename T>
 struct C10ToNvType {
   typedef __bfloat16 type;
@@ -734,3 +735,4 @@ torch::Tensor launch_gemv_outliers_cuda_packkernel(
   }
   return output;
 }
+}  // namespace vptq
diff --git a/csrc/ops.cc b/csrc/ops.cc
@@ -6,8 +6,8 @@
 #include <c10/cuda/CUDAGuard.h>
 
 #include <torch/extension.h>
-#include <torch/library.h>
 
+namespace vptq {
 #define CHECK_CUDA(x) \
   TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
 #define CHECK_CONTIGUOUS(x) \
@@ -157,41 +157,13 @@ torch::Tensor wqA16Gemm(const torch::Tensor& input,
 
   return output;
 }
+}  // namespace vptq
 
-TORCH_LIBRARY_IMPL(vptq, CUDA, m) {
-  m.impl("dequant", dequant);
-  m.impl("gemm", wqA16Gemm);
-}
+// NOTE: DO NOT change the module name "libvptq" here. It must match the name
+// under which the module is imported in the Python code.
+PYBIND11_MODULE(libvptq, m) {
+  m.doc() = "VPTQ customized kernels.";
 
-TORCH_LIBRARY(vptq, m) {
-  m.def(
-      R"DOC(dequant(Tensor q_indice,
-      Tensor centroids,
-      Tensor? q_indice_residual,
-      Tensor? residual_centroids,
-      Tensor? q_indice_outliers,
-      Tensor? outliers_centroids,
-      Tensor? invperm,
-      Tensor weight_scale,
-      Tensor weight_bias,
-      int groupsize,
-      int in_features,
-      int out_features) -> Tensor
-)DOC");
-  m.def(
-      R"DOC(gemm(Tensor input,
-      Tensor q_indice,
-      Tensor centroids,
-      Tensor? q_indice_residual,
-      Tensor? residual_centroids,
-      Tensor? q_indice_outliers,
-      Tensor? outliers_centroids,
-      Tensor? invperm,
-      Tensor weight_scale,
-      Tensor weight_bias,
-      Tensor? bias,
-      int groupsize,
-      int in_features,
-      int out_features) -> Tensor
-)DOC");
+  m.def("dequant", &vptq::dequant, "vptq customized dequantization kernel.");
+  m.def("gemm", &vptq::wqA16Gemm, "vptq customized dequantized gemv kernel.");
 }
diff --git a/csrc/utils.cuh b/csrc/utils.cuh
@@ -22,6 +22,7 @@ typedef __nv_bfloat162 __bfloat162;
 typedef __nv_bfloat16 __bfloat16;
 #endif
 
+namespace vptq {
 namespace cuda {
 
 constexpr int kBlockSize = 256;
@@ -93,20 +94,8 @@ __device__ __forceinline__ void ldg_vec_x(
   const int2* src = (const int2*)src_u32;
   if constexpr (GROUPSIZE == 2) {
     *dst_u32 = VPTQ_LDG(src_u32);
-    // uint32_t* dec = (uint32_t*)dst;
-    // asm volatile (
-    //       "ld.cg.global.v2.u32 {%0, %1}, [%2];"
-    //       : "=r"(dec[0]), "=r"(dec[1])
-    //       : "l"((const void*)src)
-    //     );
   } else if constexpr (GROUPSIZE == 4) {
     *dst = VPTQ_LDG(src);
-    // uint32_t* dec = (uint32_t*)dst;
-    // asm volatile (
-    //       "ld.cg.global.v2.u32 {%0, %1}, [%2];"
-    //       : "=r"(dec[0]), "=r"(dec[1])
-    //       : "l"((const void*)src)
-    //     );
   } else if constexpr (GROUPSIZE == 6) {
     dst_u32[0] = VPTQ_LDG(src_u32);
     dst_u32[1] = VPTQ_LDG(src_u32 + 1);
@@ -116,12 +105,6 @@ __device__ __forceinline__ void ldg_vec_x(
   } else if constexpr (GROUPSIZE == 16) {
     *(int4*)dst = VPTQ_LDG((const int4*)src);
     *(int4*)(dst + 2) = VPTQ_LDG((const int4*)(src + 2));
-    // asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
-    //              : "=r"(dst_u32[0]), "=r"(dst_u32[1]), "=r"(dst_u32[2]),
-    //              "=r"(dst_u32[3]) : "l"((const void*)src_u32));
-    // asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
-    //              : "=r"(dst_u32[4]), "=r"(dst_u32[5]), "=r"(dst_u32[6]),
-    //              "=r"(dst_u32[7]) : "l"((const void*)(src_u32 + 4)));
   } else if constexpr (GROUPSIZE == 12) {
     if (uint64_t(src) % 16) {
       dst[0] = VPTQ_LDG(src);
@@ -132,38 +115,11 @@ __device__ __forceinline__ void ldg_vec_x(
       *(int4*)dst = VPTQ_LDG((int4*)(src));
       dst[2] = VPTQ_LDG((src + 2));
     }
-    // dst[0] = VPTQ_LDG(src);
-    // dst[1] = VPTQ_LDG((src+1));
-    // dst[2] = VPTQ_LDG((src+2));
-
-    // uint32_t* dec = (uint32_t*)dst;
-    // asm volatile (
-    //         "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
-    //         : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
-    //         : "l"((const void*)src)
-    //       );
-    // asm volatile (
-    //       "ld.cg.global.v2.u32 {%0, %1}, [%2];"
-    //       : "=r"(dec[4]), "=r"(dec[5])
-    //       : "l"((const void*)src)
-    //     );
   } else if constexpr (GROUPSIZE == 24) {
     *((int4*)(dst)) = VPTQ_LDG((const int4*)(src));
     *(((int4*)(dst)) + 1) = VPTQ_LDG(((const int4*)(src)) + 1);
     *(((int4*)(dst)) + 2) = VPTQ_LDG(((const int4*)(src)) + 2);
   } else if constexpr (GROUPSIZE == 32) {
-    // asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
-    //              : "=r"(dst_u32[0]), "=r"(dst_u32[1]), "=r"(dst_u32[2]),
-    //              "=r"(dst_u32[3]) : "l"((const void*)src_u32));
-    // asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
-    //              : "=r"(dst_u32[4]), "=r"(dst_u32[5]), "=r"(dst_u32[6]),
-    //              "=r"(dst_u32[7]) : "l"((const void*)(src_u32 + 4)));
-    // asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
-    //              : "=r"(dst_u32[8]), "=r"(dst_u32[9]), "=r"(dst_u32[10]),
-    //              "=r"(dst_u32[11]) : "l"((const void*)(src_u32 + 8)));
-    // asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
-    //              : "=r"(dst_u32[12]), "=r"(dst_u32[13]), "=r"(dst_u32[14]),
-    //              "=r"(dst_u32[15]) : "l"((const void*)(src_u32 + 12)));
     *((int4*)(dst)) = VPTQ_LDG((const int4*)(src));
     *(((int4*)(dst)) + 1) = VPTQ_LDG(((const int4*)(src)) + 1);
     *(((int4*)(dst)) + 2) = VPTQ_LDG(((const int4*)(src)) + 2);
@@ -203,7 +159,6 @@ template <typename T>
 __forceinline__ T ceil_div(T a, T b) {
   return (a + b - 1) / b;
 }
-
 }  // namespace cuda
 
 template <typename T>
@@ -288,3 +243,4 @@ __device__ __half operator*(const __half& a, const __half& b) {
   return __hmul(a, b);
 }
 #endif
+}  // namespace vptq
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,7 +20,7 @@ classifiers = [
  # `pyproject.toml`'s `dependencies` field.
  # Make sure to keep this field in sync with what is in `requirements.txt`.
 dependencies = [
-    "torch",
+    "torch>=2.3.0",
     "datasets",
     "transformers>=4.45",
     "safetensors",
@@ -29,6 +29,8 @@ dependencies = [
     "gradio",
     "plotly==5.9.0",
     "pynvml",
+    "tqdm",
+    "sentence_transformers",
 ]
 
 [project.urls]
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 cmake
 packaging
 setuptools>=64.0.0
-torch
+torch>=2.3.0
 wheel
 datasets
 transformers>=4.45
@@ -12,3 +12,4 @@ gradio
 plotly==5.9.0
 pynvml
 tqdm
+sentence_transformers
diff --git a/vptq/ops/quant_gemm.py b/vptq/ops/quant_gemm.py
@@ -9,35 +9,29 @@
 ]
 
 import math
-import os
 
 import torch
 from torch.nn import functional as F
 
 from vptq.utils.pack import unpack_index_tensor
 
+__cuda_ops_installed = False
 
-def _load_library(filename: str) -> bool:
-    """Load a shared library from the given filename."""
-    try:
-        libdir = os.path.dirname(os.path.dirname(__file__))
-        torch.ops.load_library(os.path.join(libdir, filename))
-        print(f"Successfully loaded: '{filename}'")
-        return True
-    except Exception as error:
-        print((
-            f"{error}\n"
-            "!!! Warning !!!: CUDA kernels are not found, "
-            "please check CUDA and VPTQ installation."
-        ))
-        print((
-            "!!! Warning !!!: Running on Torch implementations, "
-            "which is extremely slow."
-        ))
-        return False
+try:
+    import vptq.libvptq as vptq_ops
 
-
-__cuda_ops_installed: bool = _load_library("libvptq.so")
+    print("Successfully loaded VPTQ CUDA kernels.")
+    __cuda_ops_installed = True
+except Exception as error:
+    print((
+        f"{error}\n"
+        "!!! Warning !!!: CUDA kernels are not found, "
+        "please check CUDA and VPTQ installation."
+    ))
+    print((
+        "!!! Warning !!!: Running on Torch implementations, "
+        "which is extremely slow."
+    ))
 
 
 def dequant(
@@ -212,7 +206,7 @@ def quant_gemm(
         invert_perm = invert_perm.to(torch.uint16).view(torch.int16)
 
     if (x.numel() // x.shape[-1] < 3) and __cuda_ops_installed:
-        out = torch.ops.vptq.gemm(
+        out = vptq_ops.gemm(
             x,
             indices,
             centroids_,
@@ -231,7 +225,8 @@ def quant_gemm(
         return out
     else:
         if __cuda_ops_installed:
-            weight = torch.ops.vptq.dequant(
+
+            weight = vptq_ops.dequant(
                 indices,
                 centroids_,
                 residual_indices,
diff --git a/vptq/utils/pack.py b/vptq/utils/pack.py
@@ -12,7 +12,6 @@
 import tqdm
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 
-# import time
 import vptq
 
 logging.basicConfig(

Original file line number	Diff line number	Diff line change
`@@ -5,6 +5,7 @@`
`5`	`5`	`#include <ATen/cuda/CUDAContext.h>`
`6`	`6`	`#include <torch/extension.h>`
`7`	`7`
	`8`	`+namespace vptq {`
`8`	`9`	`class OptionalCUDAGuard {`
`9`	`10`	`int set_device_ = -1;`
`10`	`11`	`int current_device_ = -1;`
`@@ -40,3 +41,4 @@ inline void gpuAssert(cudaError_t code, const char* file, int line) {`
`40`	`41`	`TORCH_CHECK(false, cudaGetErrorString(code));`
`41`	`42`	`}`
`42`	`43`	`}`
	`44`	`+} // namespace vptq`
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@`
`7`	`7`	`#include "common.h"`
`8`	`8`	`#include "utils.cuh"`
`9`	`9`
	`10`	`+namespace vptq {`
`10`	`11`	`template <typename T>`
`11`	`12`	`struct C10ToNvType {`
`12`	`13`	`typedef __bfloat16 type;`
`@@ -734,3 +735,4 @@ torch::Tensor launch_gemv_outliers_cuda_packkernel(`
`734`	`735`	`}`
`735`	`736`	`return output;`
`736`	`737`	`}`
	`738`	`+} // namespace vptq`