Changes from all commits (40 commits)
000f22a - update table format (Feb 27, 2024)
c871b41 - improve table clarity (Feb 27, 2024)
9cd8e99 - init code commit (Feb 28, 2024)
46cf81c - add openai api support (future-xy, Feb 29, 2024)
87c3e28 - add test scripts, update readme, update api (future-xy, Mar 10, 2024)
ba9d66f - Merge branch 'main' into feature/openai_api (Mar 22, 2025)
9045494 - format and change to deepseek in example (Mar 22, 2025)
72c641e - fix format (Mar 22, 2025)
c218025 - remove unused files (Mar 22, 2025)
7b97703 - fix api server token id device (Mar 22, 2025)
9906513 - fix gen broken (Mar 23, 2025)
5c87fe9 - update readme links (Mar 23, 2025)
9257e81 - cancel concurrent job (Mar 23, 2025)
18d08aa - set dense node to device (Mar 23, 2025)
cc25124 - sparse node set cpu (Mar 23, 2025)
9d0b4d8 - Merge branch 'main' into feature/openai_api (drunkcoding, Mar 23, 2025)
de0ebf5 - remove OS def (Mar 23, 2025)
ba35284 - Merge branch 'feature/openai_api' of github.com:TorchMoE/MoE-Infinity… (Mar 23, 2025)
128c30f - use update to date clang-format (Mar 23, 2025)
e5f625f - fix setuptools version (Mar 23, 2025)
48324d8 - fix setuptools version for python 3.8 (Mar 23, 2025)
f73e5b0 - keep single cuda version in publish (Mar 23, 2025)
fe81a87 - add max length in gen openai (Mar 27, 2025)
845e89d - fix cache race condition (Apr 16, 2025)
ef028d8 - all param init at host (Apr 21, 2025)
eb0bb11 - add qwen3 (May 1, 2025)
50c9b65 - Merge branch 'feature/openai_api' into feature/qwen (May 2, 2025)
5c7e368 - ubuntu lts and build (May 10, 2025)
cde7d3b - pre-commit ubuntu version (May 10, 2025)
ea2f3b3 - router weights update overlap (May 11, 2025)
5017bcc - rename deepseek_v2 and reduce torch kernel launch (May 11, 2025)
042b2ee - fix import (May 11, 2025)
8d190e9 - fix build and fix bug (May 12, 2025)
d902eca - fix citation linebreak (May 18, 2025)
1a5e10f - fix typo (Jun 14, 2025)
7916de6 - fix dtype size (Jun 14, 2025)
93bf9ad - remove comments (Jun 14, 2025)
33932d0 - fix example (Jun 14, 2025)
823d393 - pr update init (Jun 15, 2025)
afd0bd1 - remove comment and unify deepseek preroute (Jun 16, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
  build:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
    container:
      image: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit-format.yml
@@ -18,7 +18,7 @@ jobs:

  # formatting and basic install on cpu-only machine
  unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04

    steps:
      - uses: actions/checkout@v4
2 changes: 1 addition & 1 deletion .github/workflows/publish-test.yml
@@ -16,7 +16,7 @@ permissions:

jobs:
  setup-version:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
    steps:
      - name: Generate version number
        run: |
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -11,7 +11,7 @@ permissions:
jobs:
  release:
    name: Create Release
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
    outputs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,8 @@
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode

test_*.py
+*.nsys-*
+*.sh

# Prerequisites
*.d
19 changes: 9 additions & 10 deletions README.md
@@ -36,15 +36,14 @@ Note that: The open-sourced MoE-Infinity has been redesigned for making it Huggi
Single GPU A5000 (24GB memory), per-token latency (seconds) for generation with a mixed dataset drawn from [LongBench](https://huggingface.co/datasets/THUDM/LongBench), [GSM8K](https://huggingface.co/datasets/openai/gsm8k), [FLAN](https://huggingface.co/datasets/Muennighoff/flan), [BIG-Bench](https://huggingface.co/datasets/bigbench) and [MMLU](https://huggingface.co/datasets/lukaemon/mmlu).
Lower per-token latency is preferable.

-| | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite |
-| :---: | :---: | :---: | :---: | :---: |
-| <ins>MoE-Infinity</ins> | <ins>*0.130*</ins> | <ins>*0.119*</ins> | <ins>*0.735*</ins> | <ins>*0.155*</ins> |
-| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 |
-| DeepSpeed | 4.578 | 8.381 | 2.486 | 0.737 |
-| Mixtral Offloading | X | X | 1.752 | X |
-| Ollama | X | X | 0.903 | 1.250 |
-| vLLM | X | X | 2.137 | 0.493 |

+| | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite-Chat | Qwen3-30B-A3B |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| <ins>MoE-Infinity</ins> | <ins>*0.130*</ins> | <ins>*0.119*</ins> | <ins>*0.735*</ins> | <ins>*0.100*</ins> | <ins>*0.150*</ins> |
+| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 | |
+| DeepSpeed (0.16.2) | 4.578 | 8.381 | 2.486 | 0.737 | 7.857 |
+| Mixtral Offloading | X | X | 1.752 | X | X |
+| Ollama | X | X | 0.903 | 1.250 | |
+| vLLM (v0.8.5) | X | X | 2.137 | 0.149 | 0.205 |

<!-- Single GPU A5000, throughput (token/s) for generation with batch size 32.
Higher throughput is preferable.
@@ -207,7 +206,7 @@ If you use MoE-Infinity for your research, please cite our [paper](https://arxiv.
Zhan Lu and
Luo Mai and
Mahesh Marina},
-title = {MoE-Infinity: Efficient MoE Inference on Personal Machines with Sparsity-Aware Expert Cache},
+title = {MoE{-}Infinity: Efficient MoE Inference on Personal Machines with Sparsity-Aware Expert Cache},
archivePrefix= {arXiv},
eprint = {2401.14361},
year = {2024}
71 changes: 69 additions & 2 deletions core/common/pytorch.h
@@ -45,9 +45,76 @@ inline py::list vector_to_list(std::vector<uint32_t>& vec) {
  return list;
}

#define DTYPE_BFLOAT16 0
#define DTYPE_FLOAT32 1
#define DTYPE_FLOAT16 2
#define DTYPE_FP8_E4M3FN 3

inline torch::ScalarType dtype_to_torch(int dtype) {
  auto tensor_dtype = torch::kFloat32;
  switch (dtype) {
    case DTYPE_BFLOAT16:
      tensor_dtype = torch::kBFloat16;
      break;
    case DTYPE_FLOAT16:
      tensor_dtype = torch::kHalf;
      break;
    case DTYPE_FLOAT32:
      tensor_dtype = torch::kFloat32;
      break;
    case DTYPE_FP8_E4M3FN:
      tensor_dtype = torch::kFloat8_e4m3fn;
      break;
    default:
      assert(false);
  }
  return tensor_dtype;
}

inline int torch_dtype_to_int(torch::ScalarType dtype) {
  auto tensor_dtype = DTYPE_FLOAT32;
  switch (dtype) {
    case torch::kBFloat16:
      tensor_dtype = DTYPE_BFLOAT16;
      break;
    case torch::kHalf:
      tensor_dtype = DTYPE_FLOAT16;
      break;
    case torch::kFloat32:
      tensor_dtype = DTYPE_FLOAT32;
      break;
    case torch::kFloat8_e4m3fn:
      tensor_dtype = DTYPE_FP8_E4M3FN;
      break;
    default:
      assert(false);
  }
  return tensor_dtype;
}

Reviewer comment (Collaborator) on torch_dtype_size(): we might not use a tensor item every time, so constructing a tensor just to query its itemsize() might be unnecessarily expensive.

Suggested change:

inline size_t torch_dtype_size(int dtype) {
  switch (dtype) {
    case DTYPE_FLOAT32:
      return 4;
    case DTYPE_FLOAT16:
      return 2;
    case DTYPE_BFLOAT16:
      return 2;
    case DTYPE_FP8_E4M3FN:
      return 1;
    default:
      throw std::invalid_argument("Unknown dtype in torch_dtype_size()");
  }
}

Committed version:

inline size_t torch_dtype_size(int dtype) {
  size_t itemsize = 0;
  switch (dtype) {
    case DTYPE_BFLOAT16:
      itemsize = 2;  // bfloat16 is 2 bytes
      break;
    case DTYPE_FLOAT16:
      itemsize = 2;  // float16 is 2 bytes
      break;
    case DTYPE_FLOAT32:
      itemsize = 4;  // float32 is 4 bytes
      break;
    case DTYPE_FP8_E4M3FN:
      itemsize = 1;  // fp8_e4m3fn is 1 byte
      break;
    default:
      assert(false);  // Invalid dtype
  }
  return itemsize;
}

inline size_t torch_shape_size(const std::vector<int64_t>& shape, int dtype) {
-  auto torch_type = torch::ScalarType(dtype);
-  auto itemsize = torch::empty({1}, torch_type).itemsize();
+  auto itemsize = torch_dtype_size(dtype);
  size_t size = 1;
  for (auto dim : shape) {
    size *= dim;
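A minimal sketch of how the new dtype helpers compose: a tensor's ScalarType is converted to the integer tag and back, and a buffer is sized without materializing a tensor. It assumes libtorch is available and that the header above is reachable as "common/pytorch.h"; the include path and the standalone main() are illustrative guesses, not part of the PR.

#include <torch/torch.h>
#include "common/pytorch.h"  // assumed include path for the header above

int main() {
  auto t = torch::empty({4, 1024}, torch::kBFloat16);

  // ScalarType -> integer tag used at the C++/Python boundary.
  int tag = torch_dtype_to_int(t.scalar_type());    // DTYPE_BFLOAT16

  // Integer tag -> ScalarType, and byte size without constructing a tensor.
  torch::ScalarType st = dtype_to_torch(tag);        // torch::kBFloat16
  size_t bytes = torch_shape_size({4, 1024}, tag);   // 4 * 1024 * 2 bytes

  return (st == t.scalar_type() &&
          bytes == t.numel() * t.element_size()) ? 0 : 1;
}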
63 changes: 63 additions & 0 deletions core/common/sync.h
@@ -0,0 +1,63 @@
#pragma once

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <atomic>
#include <cerrno>
#include <stdexcept>

// Templated Futex class for atomic variable
template <typename T>
class Futex {
 public:
  Futex() { value_.store(0); }
  explicit Futex(T initial_value) : value_(initial_value) {}
  explicit Futex(const Futex<T>& other) : value_(other.value_.get()) {}

  void wait(T expected) {
    while (value_.load() != expected) {
      int ret = syscall(SYS_futex, &value_, FUTEX_WAIT, expected, nullptr,
                        nullptr, 0);
      if (ret == -1 && errno != EAGAIN) {
        throw std::runtime_error("Futex wait failed");
      }
    }
  }

  void wake(int count = 1) {
    int ret =
        syscall(SYS_futex, &value_, FUTEX_WAKE, count, nullptr, nullptr, 0);
    if (ret == -1) {
      throw std::runtime_error("Futex wake failed");
    }
  }

  void set(T new_value) { value_.store(new_value); }

  T get() const { return value_.load(); }

  void set_and_wake(T new_value, int count = 1) {
    value_.store(new_value);
    wake(count);
  }

  void wait_and_set(T expected, T new_value) {
    while (true) {
      T current = value_.load();
      if (current != expected) {
        int ret = syscall(SYS_futex, &value_, FUTEX_WAIT, current, nullptr,
                          nullptr, 0);
        if (ret == -1 && errno != EAGAIN) {
          throw std::runtime_error("Futex wait failed");
        }
      } else if (value_.compare_exchange_strong(current, new_value)) {
        // Successfully set the new value atomically
        break;
      }
    }
  }

 private:
  std::atomic<T> value_;
};
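A minimal usage sketch for the Futex wrapper above: one thread publishes a value with set_and_wake() and another blocks in wait() until it appears. It assumes a Linux build and that the header is reachable as "common/sync.h"; the include path and the int instantiation are guesses, not part of the PR.

#include <cstdio>
#include <thread>
#include "common/sync.h"  // assumed include path for the Futex<T> above

int main() {
  Futex<int> ready(0);  // 0 = not ready, 1 = ready

  std::thread producer([&] {
    // ... prepare work, then publish the flag and wake any waiter ...
    ready.set_and_wake(1);
  });

  ready.wait(1);  // returns once the value becomes 1
  std::printf("signalled\n");

  producer.join();
  return 0;
}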
108 changes: 108 additions & 0 deletions core/memory/caching_allocator.h
@@ -0,0 +1,108 @@
#pragma once

#include <cuda_runtime_api.h>
#include <unordered_map>
#include <vector>
#include <stdexcept>

#include "utils/cuda_utils.h"

// Templated CachingAllocator class
template <typename Allocator>
class CachingAllocator {
 public:
  static CachingAllocator<Allocator>* instance(int idx) {
    static std::array<CachingAllocator<Allocator>*, 8> instances;
    if (instances[idx] == nullptr) {
      instances[idx] = new CachingAllocator<Allocator>();
    }
    return instances[idx];
  }

  void* allocate(const size_t bytes) {
    const auto& it = available_map_.find(bytes);
    if (it == available_map_.end() || it->second.empty()) {
      return allocate_and_cache(bytes);
    }
    void* ptr = it->second.back();
    it->second.pop_back();
    return ptr;
  }

  void free(void* ptr) {
    const auto& it = allocation_map_.find(ptr);
    if (it == allocation_map_.end()) {
      Allocator::deallocate(ptr);
      return;
    }
    const size_t alloc_size = it->second;
    available_map_[alloc_size].push_back(ptr);
  }

  void record_free(void* ptr) {
    const auto& it = allocation_map_.find(ptr);
    if (it != allocation_map_.end()) {
      allocation_map_.erase(it);
    }
  }

  void free_cached() {
    for (const auto& it : available_map_) {
      for (const auto ptr : it.second) {
        Allocator::deallocate(ptr);
        allocation_map_.erase(ptr);
      }
    }
    available_map_.clear();
  }

  ~CachingAllocator() { free_cached(); }

 private:
  void* allocate_and_cache(const size_t bytes) {
    void* ptr = Allocator::allocate(bytes);
    allocation_map_[ptr] = bytes;
    return ptr;
  }

  std::unordered_map<size_t, std::vector<void*>> available_map_;
  std::unordered_map<void*, size_t> allocation_map_;
};

// Example Allocator for CUDA
struct CudaDeviceAllocator {
  static void* allocate(size_t bytes) {
    void* ptr;
    CUDA_CHECK(cudaMalloc(&ptr, bytes));
    return ptr;
  }

  static void deallocate(void* ptr) { CUDA_CHECK(cudaFree(ptr)); }
};

// Example Allocator for Unified Memory
struct CudaUnifiedAllocator {
  static void* allocate(size_t bytes) {
    void* ptr;
    CUDA_CHECK(cudaMallocManaged(&ptr, bytes));
    return ptr;
  }

  static void deallocate(void* ptr) { CUDA_CHECK(cudaFree(ptr)); }
};

// Example Allocator for cudaHostAlloc
struct CudaHostAllocator {
  static void* allocate(size_t bytes) {
    void* ptr;
    CUDA_CHECK(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault));
    return ptr;
  }

  static void deallocate(void* ptr) { CUDA_CHECK(cudaFreeHost(ptr)); }
};

// Template specialization for all types of CachingAllocator
typedef CachingAllocator<CudaDeviceAllocator> CudaDeviceCachingAllocator;
typedef CachingAllocator<CudaUnifiedAllocator> CudaUnifiedCachingAllocator;
typedef CachingAllocator<CudaHostAllocator> CudaHostCachingAllocator;
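A minimal usage sketch for the size-keyed caching allocator above: the first allocation of a given size goes through cudaMalloc, free() parks the block in the cache instead of releasing it, a second allocation of the same size is served from the cache, and free_cached() returns everything to CUDA. It assumes a CUDA-capable build and that the header is reachable as "memory/caching_allocator.h"; the include path and the device index 0 are guesses, not part of the PR.

#include "memory/caching_allocator.h"  // assumed include path for the header above

void demo() {
  auto* alloc = CudaDeviceCachingAllocator::instance(/*idx=*/0);

  void* a = alloc->allocate(1 << 20);  // cache miss: backed by cudaMalloc
  alloc->free(a);                      // parked in the 1 MiB bucket, not cudaFree'd

  void* b = alloc->allocate(1 << 20);  // same size: reused from the cache
  alloc->free(b);

  alloc->free_cached();                // cudaFree all cached blocks
}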
7 changes: 2 additions & 5 deletions core/memory/device_caching_allocator.cpp
@@ -10,6 +10,7 @@
#include "device_caching_allocator.h"
#include <c10/util/Exception.h>
#include <cuda_runtime_api.h>
#include "utils/cuda_utils.h"
#include "utils/logger.h"

namespace c10 {
@@ -24,11 +25,7 @@ inline void* DeviceCachingAllocator::allocate_and_cache(const size_t bytes) {
  auto cuda_err = cudaMalloc(&ptr, bytes);
  if (cuda_err != cudaSuccess) {
    free_cached();
-    cuda_err = cudaMalloc(&ptr, bytes);
-    if (cuda_err != cudaSuccess) {
-      DLOG_ERROR("cudaMalloc failed", bytes, cuda_err);
-      throw std::runtime_error("cudaMalloc failed");
-    }
+    CUDA_CHECK(cudaMalloc(&ptr, bytes));
  }

  allocation_map_[ptr] = bytes;