Changes from all commits (40 commits)
000f22a - update table format (Feb 27, 2024)
c871b41 - improve table clarity (Feb 27, 2024)
9cd8e99 - init code commit (Feb 28, 2024)
46cf81c - add openai api support (future-xy, Feb 29, 2024)
87c3e28 - add test scripts, update readme, update api (future-xy, Mar 10, 2024)
ba9d66f - Merge branch 'main' into feature/openai_api (Mar 22, 2025)
9045494 - format and change to deepseek in example (Mar 22, 2025)
72c641e - fix format (Mar 22, 2025)
c218025 - remove unused files (Mar 22, 2025)
7b97703 - fix api server token id device (Mar 22, 2025)
9906513 - fix gen broken (Mar 23, 2025)
5c87fe9 - update readme links (Mar 23, 2025)
9257e81 - cancel concurrent job (Mar 23, 2025)
18d08aa - set dense node to device (Mar 23, 2025)
cc25124 - sparse node set cpu (Mar 23, 2025)
9d0b4d8 - Merge branch 'main' into feature/openai_api (drunkcoding, Mar 23, 2025)
de0ebf5 - remove OS def (Mar 23, 2025)
ba35284 - Merge branch 'feature/openai_api' of github.com:TorchMoE/MoE-Infinity… (Mar 23, 2025)
128c30f - use update to date clang-format (Mar 23, 2025)
e5f625f - fix setuptools version (Mar 23, 2025)
48324d8 - fix setuptools version for python 3.8 (Mar 23, 2025)
f73e5b0 - keep single cuda version in publish (Mar 23, 2025)
fe81a87 - add max length in gen openai (Mar 27, 2025)
845e89d - fix cache race condition (Apr 16, 2025)
ef028d8 - all param init at host (Apr 21, 2025)
eb0bb11 - add qwen3 (May 1, 2025)
50c9b65 - Merge branch 'feature/openai_api' into feature/qwen (May 2, 2025)
5c7e368 - ubuntu lts and build (May 10, 2025)
cde7d3b - pre-commit ubuntu version (May 10, 2025)
ea2f3b3 - router weights update overlap (May 11, 2025)
5017bcc - rename deepseek_v2 and reduce torch kernel launch (May 11, 2025)
042b2ee - fix import (May 11, 2025)
8d190e9 - fix build and fix bug (May 12, 2025)
d902eca - fix citation linebreak (May 18, 2025)
1a5e10f - fix typo (Jun 14, 2025)
7916de6 - fix dtype size (Jun 14, 2025)
93bf9ad - remove comments (Jun 14, 2025)
33932d0 - fix example (Jun 14, 2025)
823d393 - pr update init (Jun 15, 2025)
afd0bd1 - remove comment and unify deepseek preroute (Jun 16, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yml
@@ -17,7 +17,7 @@ concurrency:

jobs:
  build:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
    container:
      image: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit-format.yml
@@ -18,7 +18,7 @@ jobs:

  # formatting and basic install on cpu-only machine
  unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04

    steps:
      - uses: actions/checkout@v4
2 changes: 1 addition & 1 deletion .github/workflows/publish-test.yml
@@ -16,7 +16,7 @@ permissions:

jobs:
  setup-version:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
    steps:
      - name: Generate version number
        run: |
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -11,7 +11,7 @@ permissions:
jobs:
  release:
    name: Create Release
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
    outputs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,8 @@
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode

test_*.py
+*.nsys-*
+*.sh

# Prerequisites
*.d
19 changes: 9 additions & 10 deletions README.md
@@ -36,15 +36,14 @@ Note that: The open-sourced MoE-Infinity has been redesigned for making it Huggi
Single GPU A5000 (24GB memory), per-token latency (seconds) for generation with a mixed dataset drawn from [LongBench](https://huggingface.co/datasets/THUDM/LongBench), [GSM8K](https://huggingface.co/datasets/openai/gsm8k), [FLAN](https://huggingface.co/datasets/Muennighoff/flan), [BIG-Bench](https://huggingface.co/datasets/bigbench) and [MMLU](https://huggingface.co/datasets/lukaemon/mmlu).
Lower per-token latency is preferable.

-| | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite |
-| :---: | :---: | :---: | :---: | :---: |
-| <ins>MoE-Infinity</ins> | <ins>*0.130*</ins> | <ins>*0.119*</ins> | <ins>*0.735*</ins> | <ins>*0.155*</ins> |
-| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 |
-| DeepSpeed | 4.578 | 8.381 | 2.486 | 0.737 |
-| Mixtral Offloading | X | X | 1.752 | X |
-| Ollama | X | X | 0.903 | 1.250 |
-| vLLM | X | X | 2.137 | 0.493 |

+| | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite-Chat | Qwen3-30B-A3B |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| <ins>MoE-Infinity</ins> | <ins>*0.130*</ins> | <ins>*0.119*</ins> | <ins>*0.735*</ins> | <ins>*0.100*</ins> | <ins>*0.150*</ins> |
+| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 | |
+| DeepSpeed (0.16.2) | 4.578 | 8.381 | 2.486 | 0.737 | 7.857 |
+| Mixtral Offloading | X | X | 1.752 | X | X |
+| Ollama | X | X | 0.903 | 1.250 | |
+| vLLM (v0.8.5) | X | X | 2.137 | 0.149 | 0.205 |

<!-- Single GPU A5000, throughput (token/s) for generation with batch size 32.
Higher throughput is preferable.
@@ -207,7 +206,7 @@ If you use MoE-Infinity for your research, please cite our [paper](https://arxiv.
Zhan Lu and
Luo Mai and
Mahesh Marina},
-title = {MoE-Infinity: Efficient MoE Inference on Personal Machines with Sparsity-Aware Expert Cache},
+title = {MoE{-}Infinity: Efficient MoE Inference on Personal Machines with Sparsity-Aware Expert Cache},
archivePrefix= {arXiv},
eprint = {2401.14361},
year = {2024}
71 changes: 69 additions & 2 deletions core/common/pytorch.h
@@ -45,9 +45,76 @@ inline py::list vector_to_list(std::vector<uint32_t>& vec) {
  return list;
}

#define DTYPE_BFLOAT16 0
#define DTYPE_FLOAT32 1
#define DTYPE_FLOAT16 2
#define DTYPE_FP8_E4M3FN 3

inline torch::ScalarType dtype_to_torch(int dtype) {
  auto tensor_dtype = torch::kFloat32;
  switch (dtype) {
    case DTYPE_BFLOAT16:
      tensor_dtype = torch::kBFloat16;
      break;
    case DTYPE_FLOAT16:
      tensor_dtype = torch::kHalf;
      break;
    case DTYPE_FLOAT32:
      tensor_dtype = torch::kFloat32;
      break;
    case DTYPE_FP8_E4M3FN:
      tensor_dtype = torch::kFloat8_e4m3fn;
      break;
    default:
      assert(false);
  }
  return tensor_dtype;
}

inline int torch_dtype_to_int(torch::ScalarType dtype) {
  auto tensor_dtype = DTYPE_FLOAT32;
  switch (dtype) {
    case torch::kBFloat16:
      tensor_dtype = DTYPE_BFLOAT16;
      break;
    case torch::kHalf:
      tensor_dtype = DTYPE_FLOAT16;
      break;
    case torch::kFloat32:
      tensor_dtype = DTYPE_FLOAT32;
      break;
    case torch::kFloat8_e4m3fn:
      tensor_dtype = DTYPE_FP8_E4M3FN;
      break;
    default:
      assert(false);
  }
  return tensor_dtype;
}

Reviewer comment (Collaborator) on torch_dtype_size(): we might not use a tensor item every time, so constructing a tensor just to query its itemsize() might be unnecessarily expensive.

Suggested change:

inline size_t torch_dtype_size(int dtype) {
  switch (dtype) {
    case DTYPE_FLOAT32:
      return 4;
    case DTYPE_FLOAT16:
      return 2;
    case DTYPE_BFLOAT16:
      return 2;
    case DTYPE_FP8_E4M3FN:
      return 1;
    default:
      throw std::invalid_argument("Unknown dtype in torch_dtype_size()");
  }
}

Committed version:

inline size_t torch_dtype_size(int dtype) {
  size_t itemsize = 0;
  switch (dtype) {
    case DTYPE_BFLOAT16:
      itemsize = 2;  // bfloat16 is 2 bytes
      break;
    case DTYPE_FLOAT16:
      itemsize = 2;  // float16 is 2 bytes
      break;
    case DTYPE_FLOAT32:
      itemsize = 4;  // float32 is 4 bytes
      break;
    case DTYPE_FP8_E4M3FN:
      itemsize = 1;  // fp8_e4m3fn is 1 byte
      break;
    default:
      assert(false);  // Invalid dtype
  }
  return itemsize;
}

inline size_t torch_shape_size(const std::vector<int64_t>& shape, int dtype) {
-  auto torch_type = torch::ScalarType(dtype);
-  auto itemsize = torch::empty({1}, torch_type).itemsize();
+  auto itemsize = torch_dtype_size(dtype);
  size_t size = 1;
  for (auto dim : shape) {
    size *= dim;
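A minimal sketch of how the new dtype helpers compose: a tensor's ScalarType is converted to the integer tag and back, and a buffer is sized without materializing a tensor. It assumes libtorch is available and that the header above is reachable as "common/pytorch.h"; the include path and the standalone main() are illustrative guesses, not part of the PR.

#include <torch/torch.h>
#include "common/pytorch.h"  // assumed include path for the header above

int main() {
  auto t = torch::empty({4, 1024}, torch::kBFloat16);

  // ScalarType -> integer tag used at the C++/Python boundary.
  int tag = torch_dtype_to_int(t.scalar_type());    // DTYPE_BFLOAT16

  // Integer tag -> ScalarType, and byte size without constructing a tensor.
  torch::ScalarType st = dtype_to_torch(tag);        // torch::kBFloat16
  size_t bytes = torch_shape_size({4, 1024}, tag);   // 4 * 1024 * 2 bytes

  return (st == t.scalar_type() &&
          bytes == t.numel() * t.element_size()) ? 0 : 1;
}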
63 changes: 63 additions & 0 deletions core/common/sync.h
@@ -0,0 +1,63 @@
#pragma once

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <atomic>
#include <cerrno>
#include <stdexcept>

// Templated Futex class for atomic variable
template <typename T>
class Futex {
 public:
  Futex() { value_.store(0); }
  explicit Futex(T initial_value) : value_(initial_value) {}
  explicit Futex(const Futex<T>& other) : value_(other.value_.get()) {}

  void wait(T expected) {
    while (value_.load() != expected) {
      int ret = syscall(SYS_futex, &value_, FUTEX_WAIT, expected, nullptr,
                        nullptr, 0);
      if (ret == -1 && errno != EAGAIN) {
        throw std::runtime_error("Futex wait failed");
      }
    }
  }

  void wake(int count = 1) {
    int ret =
        syscall(SYS_futex, &value_, FUTEX_WAKE, count, nullptr, nullptr, 0);
    if (ret == -1) {
      throw std::runtime_error("Futex wake failed");
    }
  }

  void set(T new_value) { value_.store(new_value); }

  T get() const { return value_.load(); }

  void set_and_wake(T new_value, int count = 1) {
    value_.store(new_value);
    wake(count);
  }

  void wait_and_set(T expected, T new_value) {
    while (true) {
      T current = value_.load();
      if (current != expected) {
        int ret = syscall(SYS_futex, &value_, FUTEX_WAIT, current, nullptr,
                          nullptr, 0);
        if (ret == -1 && errno != EAGAIN) {
          throw std::runtime_error("Futex wait failed");
        }
      } else if (value_.compare_exchange_strong(current, new_value)) {
        // Successfully set the new value atomically
        break;
      }
    }
  }

 private:
  std::atomic<T> value_;
};
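A minimal usage sketch for the Futex wrapper above: one thread publishes a value with set_and_wake() and another blocks in wait() until it appears. It assumes a Linux build and that the header is reachable as "common/sync.h"; the include path and the int instantiation are guesses, not part of the PR.

#include <cstdio>
#include <thread>
#include "common/sync.h"  // assumed include path for the Futex<T> above

int main() {
  Futex<int> ready(0);  // 0 = not ready, 1 = ready

  std::thread producer([&] {
    // ... prepare work, then publish the flag and wake any waiter ...
    ready.set_and_wake(1);
  });

  ready.wait(1);  // returns once the value becomes 1
  std::printf("signalled\n");

  producer.join();
  return 0;
}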
108 changes: 108 additions & 0 deletions core/memory/caching_allocator.h
@@ -0,0 +1,108 @@
#pragma once

#include <cuda_runtime_api.h>
#include <unordered_map>
#include <vector>
#include <stdexcept>

#include "utils/cuda_utils.h"

// Templated CachingAllocator class
template <typename Allocator>
class CachingAllocator {
 public:
  static CachingAllocator<Allocator>* instance(int idx) {
    static std::array<CachingAllocator<Allocator>*, 8> instances;
    if (instances[idx] == nullptr) {
      instances[idx] = new CachingAllocator<Allocator>();
    }
    return instances[idx];
  }

  void* allocate(const size_t bytes) {
    const auto& it = available_map_.find(bytes);
    if (it == available_map_.end() || it->second.empty()) {
      return allocate_and_cache(bytes);
    }
    void* ptr = it->second.back();
    it->second.pop_back();
    return ptr;
  }

  void free(void* ptr) {
    const auto& it = allocation_map_.find(ptr);
    if (it == allocation_map_.end()) {
      Allocator::deallocate(ptr);
      return;
    }
    const size_t alloc_size = it->second;
    available_map_[alloc_size].push_back(ptr);
  }

  void record_free(void* ptr) {
    const auto& it = allocation_map_.find(ptr);
    if (it != allocation_map_.end()) {
      allocation_map_.erase(it);
    }
  }

  void free_cached() {
    for (const auto& it : available_map_) {
      for (const auto ptr : it.second) {
        Allocator::deallocate(ptr);
        allocation_map_.erase(ptr);
      }
    }
    available_map_.clear();
  }

  ~CachingAllocator() { free_cached(); }

 private:
  void* allocate_and_cache(const size_t bytes) {
    void* ptr = Allocator::allocate(bytes);
    allocation_map_[ptr] = bytes;
    return ptr;
  }

  std::unordered_map<size_t, std::vector<void*>> available_map_;
  std::unordered_map<void*, size_t> allocation_map_;
};

// Example Allocator for CUDA
struct CudaDeviceAllocator {
  static void* allocate(size_t bytes) {
    void* ptr;
    CUDA_CHECK(cudaMalloc(&ptr, bytes));
    return ptr;
  }

  static void deallocate(void* ptr) { CUDA_CHECK(cudaFree(ptr)); }
};

// Example Allocator for Unified Memory
struct CudaUnifiedAllocator {
  static void* allocate(size_t bytes) {
    void* ptr;
    CUDA_CHECK(cudaMallocManaged(&ptr, bytes));
    return ptr;
  }

  static void deallocate(void* ptr) { CUDA_CHECK(cudaFree(ptr)); }
};

// Example Allocator for cudaHostAlloc
struct CudaHostAllocator {
  static void* allocate(size_t bytes) {
    void* ptr;
    CUDA_CHECK(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault));
    return ptr;
  }

  static void deallocate(void* ptr) { CUDA_CHECK(cudaFreeHost(ptr)); }
};

// Template specialization for all types of CachingAllocator
typedef CachingAllocator<CudaDeviceAllocator> CudaDeviceCachingAllocator;
typedef CachingAllocator<CudaUnifiedAllocator> CudaUnifiedCachingAllocator;
typedef CachingAllocator<CudaHostAllocator> CudaHostCachingAllocator;
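A minimal usage sketch for the size-keyed caching allocator above: the first allocation of a given size goes through cudaMalloc, free() parks the block in the cache instead of releasing it, a second allocation of the same size is served from the cache, and free_cached() returns everything to CUDA. It assumes a CUDA-capable build and that the header is reachable as "memory/caching_allocator.h"; the include path and the device index 0 are guesses, not part of the PR.

#include "memory/caching_allocator.h"  // assumed include path for the header above

void demo() {
  auto* alloc = CudaDeviceCachingAllocator::instance(/*idx=*/0);

  void* a = alloc->allocate(1 << 20);  // cache miss: backed by cudaMalloc
  alloc->free(a);                      // parked in the 1 MiB bucket, not cudaFree'd

  void* b = alloc->allocate(1 << 20);  // same size: reused from the cache
  alloc->free(b);

  alloc->free_cached();                // cudaFree all cached blocks
}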
7 changes: 2 additions & 5 deletions core/memory/device_caching_allocator.cpp
@@ -10,6 +10,7 @@
#include "device_caching_allocator.h"
#include <c10/util/Exception.h>
#include <cuda_runtime_api.h>
#include "utils/cuda_utils.h"
#include "utils/logger.h"

namespace c10 {
@@ -24,11 +25,7 @@ inline void* DeviceCachingAllocator::allocate_and_cache(const size_t bytes) {
  auto cuda_err = cudaMalloc(&ptr, bytes);
  if (cuda_err != cudaSuccess) {
    free_cached();
-    cuda_err = cudaMalloc(&ptr, bytes);
-    if (cuda_err != cudaSuccess) {
-      DLOG_ERROR("cudaMalloc failed", bytes, cuda_err);
-      throw std::runtime_error("cudaMalloc failed");
-    }
+    CUDA_CHECK(cudaMalloc(&ptr, bytes));
  }

  allocation_map_[ptr] = bytes;