diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index aae6bd0..f4a74c6 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -17,7 +17,7 @@ concurrency:
jobs:
build:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container:
image: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
diff --git a/.github/workflows/pre-commit-format.yml b/.github/workflows/pre-commit-format.yml
index c92e6de..bb63b48 100644
--- a/.github/workflows/pre-commit-format.yml
+++ b/.github/workflows/pre-commit-format.yml
@@ -18,7 +18,7 @@ jobs:
# formatting and basic install on cpu-only machine
unit-tests:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml
index 0699ecb..8de8e2b 100644
--- a/.github/workflows/publish-test.yml
+++ b/.github/workflows/publish-test.yml
@@ -16,7 +16,7 @@ permissions:
jobs:
setup-version:
- runs-on: ubuntu-latest
+ runs-on: ubuntu-22.04
steps:
- name: Generate version number
run: |
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 3c02bfe..f11ed2f 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -11,7 +11,7 @@ permissions:
jobs:
release:
name: Create Release
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
outputs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
diff --git a/.gitignore b/.gitignore
index d29bc0e..421638c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode
test_*.py
+*.nsys-*
+*.sh
# Prerequisites
*.d
diff --git a/README.md b/README.md
index 930efe5..bc29eab 100644
--- a/README.md
+++ b/README.md
@@ -36,15 +36,14 @@ Note that: The open-sourced MoE-Infinity has been redesigned for making it Huggi
Single GPU A5000 (24GB Memory), per-token-latency (seconds) for generation with a mixed dataset that includes [LongBench](https://huggingface.co/datasets/THUDM/LongBench), [GSM8K](https://huggingface.co/datasets/openai/gsm8k), [FLAN](https://huggingface.co/datasets/Muennighoff/flan), [BIG-Bench](https://huggingface.co/datasets/bigbench) and [MMLU](https://huggingface.co/datasets/lukaemon/mmlu) datasets.
Lower per-token-latency is preferable.
-| | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite
-| :---: | :---: | :---: | :---: | :---: |
-| MoE-Infinity | *0.130* | *0.119* | *0.735* | *0.155* |
-| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 |
-|DeepSpeed | 4.578 | 8.381 | 2.486 | 0.737 |
-|Mixtral Offloading| X | X | 1.752 | X |
-|Ollama | X | X | 0.903 | 1.250 |
-|vLLM| X | X | 2.137 | 0.493 |
-
+| System | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite-Chat | Qwen3-30B-A3B |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| MoE-Infinity | *0.130* | *0.119* | *0.735* | *0.100* | *0.150* |
+| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 | |
+| DeepSpeed (0.16.2) | 4.578 | 8.381 | 2.486 | 0.737 | 7.857 |
+| Mixtral Offloading | X | X | 1.752 | X | X |
+| Ollama | X | X | 0.903 | 1.250 | |
+| vLLM (v0.8.5) | X | X | 2.137 | 0.149 | 0.205 |
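
A rough way to reproduce a per-token-latency figure like those in the table above is to divide the generation wall-clock time by the number of newly generated tokens. The sketch below uses the standard Hugging Face `transformers` API; the checkpoint name, prompt, and token budget are placeholder assumptions, not the benchmark's actual harness.

```python
# Minimal sketch: measure per-token decode latency for a causal LM.
# Assumes a CUDA GPU is available; the model name below is a placeholder.
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

inputs = tokenizer(
    "Summarize: offloading-based MoE inference.", return_tensors="pt"
).to(model.device)

# Synchronize around generate() so CUDA kernel launches are fully timed.
torch.cuda.synchronize()
start = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=128)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start

new_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
print(f"per-token latency: {elapsed / new_tokens:.3f} s")
```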