
Commit f7c4ee9

Delete unnecessary code.

1 parent 1937767 commit f7c4ee9

File tree

7 files changed: +31 -239 lines

csrc/dequant.cu (+2)

@@ -208,7 +208,9 @@ torch::Tensor launch_deqantize_outliers_cuda_packkernel(
     TORCH_CHECK(false, "un-supported base_groupsize:" +
                            std::to_string(base_groupsize));
   }
+
 #undef CASE_DispatchDequantWithOutliers
+
   if (out_ouf_inf) {
     return output;
   } else {

csrc/dequant.h (-20)

This file was deleted.

csrc/ops.cc (+29 -8)

@@ -1,22 +1,43 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-/// register VPTQ APIs bindings in this file. ///
+/// register bindings for VPTQ APIs in this file. ///
 
-#include "dequant.h"
-#include "quant_gemv.h"
+#include <torch/extension.h>
+
+namespace vptq {
+
+torch::Tensor dequant(const torch::Tensor& q_indice,
+                      const torch::Tensor& centroids,
+                      const c10::optional<torch::Tensor>& q_indice_residual,
+                      const c10::optional<torch::Tensor>& residual_centroids,
+                      const c10::optional<torch::Tensor>& q_indice_outliers,
+                      const c10::optional<torch::Tensor>& outliers_centroids,
+                      const c10::optional<torch::Tensor>& invperm,
+                      const torch::Tensor& weight_scale,
+                      const torch::Tensor& weight_bias, int64_t groupsize,
+                      int64_t in_features, int64_t out_features);
+
+torch::Tensor wquant_act16_gemv(
+    const torch::Tensor& input, const torch::Tensor& q_indice,
+    const torch::Tensor& centroids,
+    const c10::optional<torch::Tensor>& q_indice_residual,
+    const c10::optional<torch::Tensor>& residual_centroids,
+    const c10::optional<torch::Tensor>& q_indice_outliers,
+    const c10::optional<torch::Tensor>& outliers_centroids,
+    const c10::optional<torch::Tensor>& invperm,
+    const torch::Tensor& weight_scale, const torch::Tensor& weight_bias,
+    const c10::optional<torch::Tensor>& bias, int64_t in_features,
+    int64_t out_features);
+
+}  // namespace vptq
 
 // NOTE: DO NOT change the module name "libvptq" here. It must match how
 // the module is loaded in the Python codes.
 PYBIND11_MODULE(libvptq, m) {
   m.doc() = "VPTQ customized kernels.";
 
-  // v1 kernels.
   m.def("dequant", &vptq::dequant, "vptq customized dequantization kernel.");
   m.def("quant_gemv", &vptq::wquant_act16_gemv,
         "vptq customized dequantized gemv kernel.");
-
-  // v2 kernels.
-  m.def("quant_gemv_v2", &vptq::quant_gemv_v2,
-        "vptq customized quantized gemm kernel.");
 }
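For context on the NOTE above: the name given to PYBIND11_MODULE must agree with the name the Python side uses when it loads the extension. Below is a minimal sketch of one common way such an extension can be loaded, assuming it has been compiled to a shared object; load_libvptq and the .so path are illustrative assumptions, not VPTQ's actual loading code.

# Minimal sketch, assuming the extension is compiled to a shared object.
import importlib.util

import torch  # import torch first so the extension's torch symbols resolve


def load_libvptq(so_path: str):
    """Import a compiled pybind11 extension directly from its .so file."""
    spec = importlib.util.spec_from_file_location("libvptq", so_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# lib = load_libvptq("/path/to/libvptq.so")  # hypothetical path
# lib.dequant(...)     # binding registered above
# lib.quant_gemv(...)  # binding registered above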

csrc/quant_gemv.cu (-35)

@@ -286,39 +286,4 @@ torch::Tensor wquant_act16_gemv(
   return output;
 }
 
-torch::Tensor quant_gemv_v2(
-    const torch::Tensor& activations, const c10::optional<torch::Tensor>& bias,
-    const torch::Tensor& indices, const torch::Tensor& centroids,
-    const c10::optional<torch::Tensor>& residual_centroids,
-    const torch::Tensor& scale_weights, const torch::Tensor& scale_bias,
-    int64_t in_features, int64_t out_features) {
-  CHECK_INPUT(indices);
-  CHECK_INPUT(centroids);
-  CHECK_INPUT(scale_weights);
-  CHECK_INPUT(scale_bias);
-
-  int64_t ndim = activations.ndimension();
-  TORCH_CHECK(ndim == 3, "activations must be a 3D Tensor, but got: ",
-              activations.sizes());
-
-  const int64_t batch = activations.size(0);
-
-  std::cout << "batch: " << batch << std::endl;
-
-  const int64_t num_codebooks = centroids.size(0);
-  const int64_t num_centroids = centroids.size(1);
-  const int64_t vec_len = centroids.size(2);
-
-  std::cout << "num_codebooks: " << num_codebooks << std::endl
-            << "num_centroids: " << num_centroids << std::endl
-            << "vec_len: " << vec_len << std::endl;
-
-  torch::Tensor output;
-  output = at::empty({in_features, out_features}, centroids.options());
-
-  // auto stream = at::cuda::getCurrentCUDAStream().stream();
-
-  return output;
-}
-
 }  // namespace vptq

csrc/quant_gemv.h (-29)

This file was deleted.

vptq/ops/quant_gemm.py (-69)

@@ -270,72 +270,3 @@ def quant_gemm(
     )
     out = F.linear(x, weight, bias)
     return out
-
-
-def quant_gemv_v2(
-    x: torch.Tensor,
-    bias: Optional[torch.Tensor],
-    indices: torch.Tensor,
-    centroids: torch.Tensor,
-    residual_centroids: Optional[torch.Tensor],
-    scale_weights: Optional[torch.Tensor],
-    scale_bias: Optional[torch.Tensor],
-    vector_len: int,
-    num_codebooks: int,
-    num_centroids: int,
-    num_residual_centroids: int,
-    in_features: int,
-    out_features: int,
-) -> torch.Tensor:
-    """Dequantize the input tensor and perform the GEMV operation.
-
-    Args:
-        x: Tensor[fp16|bf16], has a shape of (batch_size, sequence_length,
-            in_features). NOTE that `batch_size` here represents the number
-            of sequences, not tokens.
-        bias: (optional) Tensor[fp16|bf16], has a shape of (1, out_features).
-        indices: Tensor[int16], the original input tensor is flattened into a
-            vector with a shape of (1, numel). Then, internally, it will be
-            reshaped into a 3-D tensor with a shape of
-            (num_codebooks, num_indices, packed_groupsize).
-            NOTE: If the residual quantization component is enabled,
-            indices for the main quantization component and the residual
-            quantization component are packed together into this single
-            input tensor.
-        centroids: Tensor[fp16|bf16], the original input tensor is flattened
-            into a vector that has a shape of (1, numel), and then reshaped
-            internally into a 3-D tensor with a shape of
-            (num_codebooks, num_centroids, vector_len).
-        residual_centroids: (optional) Tensor[fp16|bf16], has a shape of
-            (num_codebooks, num_residual_centroids, vector_len).
-        scale_weights: (optional) Tensor[fp16|bf16], has a shape of
-            (in_features, 1), the scale factor for the quantized weight.
-        scale_bias: (optional) Tensor[fp16|bf16], has a shape of
-            (in_features, 1), the bias factor for the quantized weight.
-        vector_len: int, the length of the vector in vector quantization.
-        num_codebooks: int, the number of codebooks.
-        num_centroids: int, the number of centroids.
-        num_residual_centroids: int, the number of residual centroids.
-        in_features: int, the number of input features.
-        out_features: int, the number of output features.
-    """
-    centroids_ = centroids.view(num_codebooks, num_centroids, vector_len)
-
-    residual_centroids_ = None
-    if residual_centroids is not None:
-        shape = (num_codebooks, num_residual_centroids, vector_len)
-        residual_centroids_ = residual_centroids.view(shape)
-
-    out = vptq_ops.quant_gemv_v2(
-        x,
-        bias,
-        indices,
-        centroids_,
-        residual_centroids_,
-        scale_weights,
-        scale_bias,
-        in_features,
-        out_features,
-    )
-    return out
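The removed wrapper mostly implemented the reshape convention spelled out in its docstring: flattened codebook buffers are viewed back into 3-D (num_codebooks, num_centroids, vector_len) tensors before the kernel call. A small self-contained sketch of that convention follows; the sizes are illustrative, not VPTQ defaults.

import torch

# Illustrative sizes only; real models choose their own codebook shapes.
num_codebooks, num_centroids, vector_len = 1, 8192, 8

# A codebook stored flat as (1, numel), as the removed docstring describes,
flat = torch.randn(1, num_codebooks * num_centroids * vector_len,
                   dtype=torch.float16)

# is viewed back into (num_codebooks, num_centroids, vector_len) before the
# kernel call; .view() requires the flat buffer to be contiguous.
centroids = flat.view(num_codebooks, num_centroids, vector_len)
assert centroids.shape == (num_codebooks, num_centroids, vector_len)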

vptq/tests/ops/test_quant_gemm.py (-78)

This file was deleted.
