pytorch
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h
Lines changed: 9 additions & 3 deletions b/‎torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h
Lines changed: 9 additions & 3 deletions
diff --git a/‎torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp
Lines changed: 4 additions & 1 deletion b/‎torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp
Lines changed: 4 additions & 1 deletion
diff --git a/‎torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h
Lines changed: 63 additions & 1 deletion b/‎torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h
Lines changed: 63 additions & 1 deletion
diff --git a/‎torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_selector.h
Lines changed: 92 additions & 0 deletions b/‎torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_selector.h
Lines changed: 92 additions & 0 deletions
diff --git a/‎torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
Lines changed: 55 additions & 0 deletions b/‎torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
Lines changed: 55 additions & 0 deletions
diff --git a/‎torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h
Lines changed: 15 additions & 0 deletions b/‎torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h
Lines changed: 15 additions & 0 deletions
@@ -126,7 +126,7 @@ void pack_weights(
       bias);
 }
 
-template <int weight_nbit, int nr, int kr, int sr>
+template <int weight_nbit, int nr_, int kr_, int sr_>
 void pack_weights_with_lut(
     // Output
     void* packed_weights,
@@ -141,10 +141,16 @@ void pack_weights_with_lut(
     // weight_zeros not packed if nullptr
     const int8_t* weight_zeros,
     // bias not packed if nullptr
-    const float* bias) {
+    const float* bias,
+    int nr,
+    int kr,
+    int sr) {
+  (void)nr; // unused
+  (void)kr; // unused
+  (void)sr; // unused
   torchao::kernels::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight::weight_packing::
-          pack_weights_with_lut<weight_nbit, nr, kr, sr>(
+          pack_weights_with_lut<weight_nbit, nr_, kr_, sr_>(
               packed_weights,
               n,
               k,
 
@@ -478,7 +478,10 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_lut(
       lut.data(),
       test_case.weight_scales.data(),
       has_weight_zeros ? test_case.weight_zeros.data() : nullptr,
-      has_bias ? test_case.bias.data() : nullptr);
+      has_bias ? test_case.bias.data() : nullptr,
+      nr,
+      kr,
+      sr);
 
   std::vector<float> output(m * n);
   kernel_1x8x16_f32_neondot<weight_nbit, has_weight_zeros, /*has_lut*/ true>(
 
@@ -87,6 +87,23 @@ struct UKernelConfig {
       int kr,
       int sr);
 
+  // Signature of a routine that writes LUT-based packed weights into the
+  // packed_weights buffer. The LUT entries are int8 values.
+  using pack_weights_with_lut_fn_type = void (*)(
+      // Output
+      void* packed_weights,
+      // Inputs
+      int n,
+      int k,
+      int group_size,
+      const int8_t* weight_qval_idxs,
+      int n_luts,
+      const int8_t* luts,
+      const float* weight_scales,
+      const int8_t* weight_zeros,
+      const float* bias,
+      int nr,
+      int kr,
+      int sr);
+
   // Run matmul kernel
   using kernel_fn_type = void (*)(
       float* output,
@@ -126,6 +143,7 @@ struct UKernelConfig {
   packed_weights_size_fn_type packed_weights_size{nullptr};
   packed_weights_offset_fn_type packed_weights_offset{nullptr};
   pack_weights_fn_type pack_weights{nullptr};
+  pack_weights_with_lut_fn_type pack_weights_with_lut{nullptr}; // LUT packing routine; make() leaves it nullptr, make_with_lut() sets it
 
   // linear_configs must be sorted in ascending m_step
   std::array<linear_config_type, kMaxLinearConfigs> linear_configs;
@@ -144,6 +162,20 @@ struct UKernelConfig {
       pack_weights_fn_type pack_weights,
       std::array<linear_config_type, kMaxLinearConfigs> linear_configs);
 
+  static UKernelConfig make_with_lut( // Factory for configs that pack weights through pack_weights_with_lut
+      size_t preferred_alignment,
+      int n_step,
+      int nr,
+      int kr,
+      int sr,
+      int weight_nbit,
+      bool has_weight_zeros,
+      bool has_bias,
+      packed_weights_size_fn_type packed_weights_with_lut_size, // same function-pointer type as make()'s packed_weights_size
+      packed_weights_offset_fn_type packed_weights_with_lut_offset, // same function-pointer type as make()'s packed_weights_offset
+      pack_weights_with_lut_fn_type pack_weights_with_lut, // takes the place of make()'s pack_weights argument
+      std::array<linear_config_type, kMaxLinearConfigs> linear_configs);
+
   inline void validate() const {
     TORCHAO_CHECK(preferred_alignment >= 1, "preferred_alignment must be >= 1");
     TORCHAO_CHECK(n_step >= 1, "n_step must be >= 1");
@@ -155,7 +187,7 @@ struct UKernelConfig {
         packed_weights_size != nullptr, "packed_weights_size must be set");
     TORCHAO_CHECK(
         packed_weights_offset != nullptr, "packed_weights_offset must be set");
-    TORCHAO_CHECK(pack_weights != nullptr, "pack_weights must be set");
+    TORCHAO_CHECK(pack_weights != nullptr || pack_weights_with_lut != nullptr, "pack_weights or pack_weights_with_lut must be set");
 
     bool linear_configs_set = true; // first linear config must be set
     for (int i = 0; i < linear_configs.size(); i++) {
@@ -232,6 +264,36 @@ inline UKernelConfig UKernelConfig::make(
       packed_weights_size,
       packed_weights_offset,
       pack_weights,
+      /*pack_weights_with_lut*/nullptr,
+      std::move(linear_configs)};
+}
+
+inline UKernelConfig UKernelConfig::make_with_lut( // Builds a UKernelConfig whose weights are packed via pack_weights_with_lut
+    size_t preferred_alignment,
+    int n_step,
+    int nr,
+    int kr,
+    int sr,
+    int weight_nbit,
+    bool has_weight_zeros,
+    bool has_bias,
+    packed_weights_size_fn_type packed_weights_with_lut_size,
+    packed_weights_offset_fn_type packed_weights_with_lut_offset,
+    pack_weights_with_lut_fn_type pack_weights_with_lut,
+    std::array<linear_config_type, kMaxLinearConfigs> linear_configs) {
+  return UKernelConfig{ // positional brace-init; order presumably mirrors UKernelConfig's member order - confirm against the struct
+      preferred_alignment,
+      n_step,
+      nr,
+      kr,
+      sr,
+      weight_nbit,
+      has_weight_zeros,
+      has_bias,
+      packed_weights_with_lut_size, // passed in the position where make() passes packed_weights_size
+      packed_weights_with_lut_offset, // passed in the position where make() passes packed_weights_offset
+      /*pack_weights*/nullptr, // the non-LUT packing routine is left unset
+      /*pack_weights_with_lut*/pack_weights_with_lut,
       std::move(linear_configs)};
 }
 
 
@@ -164,6 +164,70 @@ void register_ukernel_config_universal(
   }
 }
 
+// Registers the LUT-based micro-kernel config for `format` on `uarch` in
+// `table`.
+//
+// Throws std::runtime_error when cpuinfo fails to initialize. check_format is
+// invoked with the linear_8bit_act_xbit_weight_lut type and weight_nbit before
+// any registration happens.
+//
+// The function returns without registering anything when
+//   - TORCHAO_ENABLE_ARM_NEON_DOT is not defined,
+//   - cpuinfo reports no NEON dot-product support,
+//   - format.has_weight_zeros is true, or
+//   - (format.nr, format.kr, format.sr) is not (8, 16, 2).
+template <int weight_nbit>
+void register_ukernel_config_lut(
+    UKernelConfigRegistrationTable& table,
+    PackedWeightsFormat format,
+    cpuinfo_uarch uarch) {
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
+  check_format(
+      format,
+      torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_lut,
+      weight_nbit);
+  constexpr bool has_lut = true;
+  int preferred_alignment = 16;
+
+#if defined(TORCHAO_ENABLE_ARM_NEON_DOT)
+  namespace kernel = torchao::kernels::cpu::aarch64::linear::
+      channelwise_8bit_activation_groupwise_lowbit_weight;
+
+  // The only kernel registered below is kernel_1x8x16_f32_neondot, so skip
+  // CPUs that do NOT report NEON dot-product support. (The check used to be
+  // inverted, which skipped registration on exactly the capable CPUs.)
+  if (!cpuinfo_has_arm_neon_dot()) {
+    return;
+  }
+  // The kernel is instantiated with has_weight_zeros == false only.
+  if (format.has_weight_zeros) {
+    return;
+  }
+  constexpr bool has_weight_zeros = false;
+  if (format.nr == 8 && format.kr == 16 && format.sr == 2) {
+    log_registration(format, "lut: kernel_1x8x16_f32_neondot");
+    constexpr int n_step = 8;
+    constexpr int nr = 8;
+    constexpr int kr = 16;
+    constexpr int sr = 2;
+    constexpr int mr = 1;
+    constexpr int m_step = 1;
+    auto uk = UKernelConfig::make_with_lut(
+        preferred_alignment,
+        n_step,
+        nr,
+        kr,
+        sr,
+        weight_nbit,
+        format.has_weight_zeros,
+        format.has_bias,
+        &kernel::packed_weights_with_lut_size,
+        &kernel::packed_weights_with_lut_offset,
+        &kernel::pack_weights_with_lut<weight_nbit, nr, kr, sr>,
+        /*linear_configs*/ {});
+    // Single linear config: m_step == mr == 1.
+    uk.linear_configs[0] = UKernelConfig::linear_config_type(
+        {m_step,
+         mr,
+         &kernel::packed_activations_size,
+         &kernel::packed_activations_offset,
+         &kernel::pack_activations<mr, kr, sr>,
+         &kernel::kernel_1x8x16_f32_neondot<
+             weight_nbit,
+             has_weight_zeros,
+             has_lut>});
+    table.register_ukernel_config(format, uarch, std::move(uk));
+    return;
+  }
+#endif // TORCHAO_ENABLE_ARM_NEON_DOT
+}
+
 #if defined(TORCHAO_ENABLE_KLEIDI)
 template <typename kernel_struct>
 UKernelConfig::linear_config_type
@@ -285,6 +349,14 @@ void register_ukernel_config(
 #endif // TORCHAO_ENABLE_KLEIDI
       break;
     }
+    case torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_lut: {
+      // LUT kernels static assert on weight_nbit <= 4
+      // This is needed to avoid compilation error
+      if constexpr (weight_nbit <= 4) {
+        register_ukernel_config_lut<weight_nbit>(table, format, uarch);
+      }
+      break;
+    }
     default:
       throw std::runtime_error(
           "No registration available for packed_weights_type=" +
@@ -377,4 +449,24 @@ PackedWeightsFormat select_packed_weights_format(
   throw std::runtime_error("No packed_weights_format was selected");
 }
 
+// Chooses the packed-weights format used by the LUT kernels.
+//
+// With no target given and TORCHAO_ENABLE_ARM_NEON_DOT defined, returns a
+// linear_8bit_act_xbit_weight_lut format with nr=8, kr=16, sr=2 that carries
+// the given weight_nbit, has_weight_zeros and has_bias. In every other case
+// std::runtime_error is thrown.
+template <int weight_nbit>
+PackedWeightsFormat select_packed_weights_with_lut_format(
+    std::optional<std::string> target,
+    bool has_weight_zeros,
+    bool has_bias) {
+  const bool target_requested = target.has_value();
+  if (!target_requested) {
+#if defined(TORCHAO_ENABLE_ARM_NEON_DOT)
+    constexpr int nr = 8;
+    constexpr int kr = 16;
+    constexpr int sr = 2;
+    return PackedWeightsFormat(
+        torchao::ops::PackedWeightsType::linear_8bit_act_xbit_weight_lut,
+        weight_nbit,
+        has_weight_zeros,
+        has_bias,
+        nr,
+        kr,
+        sr);
+#endif // defined(TORCHAO_ENABLE_ARM_NEON_DOT)
+  }
+  throw std::runtime_error("No packed_weights_format was selected");
+}
+
 } // namespace torchao::ops::linear_8bit_act_xbit_weight
@@ -75,6 +75,61 @@ void pack_weights_operator(
   });
 }
 
+// Packs LUT-based weights into `packed_weights` with the routines stored in
+// `uk`.
+//
+// n is processed in tiles of nc = min(n, uk.n_step); the last tile may be
+// smaller. Tiles are dispatched through torchao::parallel_1d. For the tile
+// starting at n_idx:
+//   - the destination is packed_weights + uk.packed_weights_offset(n_idx, ...),
+//   - weight_qval_idxs is advanced by n_idx * k,
+//   - weight_scales (and weight_zeros, if given) by n_idx * k / group_size,
+//   - bias (if given) by n_idx.
+// n_luts and luts are forwarded unchanged to every tile. weight_zeros and bias
+// may be nullptr and are then forwarded as nullptr.
+//
+// Fails a TORCHAO_CHECK when uk.pack_weights_with_lut is not set (validate()
+// also accepts configs that only carry pack_weights). Does nothing when
+// n <= 0.
+void pack_weights_with_lut_operator(
+    const UKernelConfig& uk,
+    // Outputs
+    void* packed_weights,
+    // Inputs
+    int n,
+    int k,
+    int group_size,
+    const int8_t* weight_qval_idxs,
+    int n_luts,
+    const int8_t* luts,
+    const float* weight_scales,
+    const int8_t* weight_zeros,
+    const float* bias) {
+  TORCHAO_CHECK(
+      uk.pack_weights_with_lut != nullptr,
+      "pack_weights_with_lut must be set");
+  if (n <= 0) {
+    // Nothing to pack; this also keeps nc from being 0 in the division below.
+    return;
+  }
+
+  const int nc = std::min(n, uk.n_step);
+  const int num_nc_panels = (n + nc - 1) / nc;
+
+  torchao::parallel_1d(0, num_nc_panels, [&](int64_t idx) {
+    const int n_idx = static_cast<int>(idx) * nc;
+    const int nc_tile_size = std::min(nc, n - n_idx);
+
+    const auto packed_weights_offset = uk.packed_weights_offset(
+        n_idx,
+        k,
+        group_size,
+        uk.weight_nbit,
+        uk.has_weight_zeros,
+        uk.has_bias,
+        uk.nr,
+        uk.kr,
+        uk.sr);
+
+    // Computed in 64 bits so n_idx * k cannot overflow int for large weights.
+    const int64_t weight_qval_idxs_offset = static_cast<int64_t>(n_idx) * k;
+    const int64_t weight_scales_and_zeros_offset =
+        weight_qval_idxs_offset / group_size;
+
+    uk.pack_weights_with_lut(
+        static_cast<char*>(packed_weights) + packed_weights_offset,
+        /*n=*/nc_tile_size,
+        k,
+        group_size,
+        weight_qval_idxs + weight_qval_idxs_offset,
+        n_luts,
+        luts,
+        weight_scales + weight_scales_and_zeros_offset,
+        (weight_zeros == nullptr)
+            ? nullptr
+            : (weight_zeros + weight_scales_and_zeros_offset),
+        (bias == nullptr) ? nullptr : (bias + n_idx),
+        uk.nr,
+        uk.kr,
+        uk.sr);
+  });
+}
+
 LinearTilingParams LinearTilingParams::from_target_tiles_per_thread(
     int m,
     int m_step,
 
@@ -27,6 +27,21 @@ void pack_weights_operator(
     const int8_t* weight_zeros,
     const float* bias);
 
+void pack_weights_with_lut_operator( // Packs LUT-based weights into packed_weights using the routines stored in uk
+    const UKernelConfig& uk,
+    // Outputs
+    void* packed_weights,
+    // Inputs
+    int n,
+    int k,
+    int group_size,
+    const int8_t* weight_qval_idxs,
+    int n_luts,
+    const int8_t* luts,
+    const float* weight_scales,
+    const int8_t* weight_zeros, // may be nullptr
+    const float* bias); // may be nullptr
+
 // Linear functions
 struct LinearTilingParams {
   int mc{0};