-
Notifications
You must be signed in to change notification settings - Fork 20
feat: performance improvement and Qwen3 support #60
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
drunkcoding
wants to merge
40
commits into
main
Choose a base branch
from
feature/qwen
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+1,749
−548
Open
Changes from all commits
Commits
Show all changes
40 commits
Select commit
Hold shift + click to select a range
000f22a
update table format
c871b41
improve table clarity
9cd8e99
init code commit
46cf81c
add openai api support
future-xy 87c3e28
add test scripts, update readme, update api
future-xy ba9d66f
Merge branch 'main' into feature/openai_api
9045494
format and change to deepseek in example
72c641e
fix format
c218025
remove unused files
7b97703
fix api server token id device
9906513
fix gen broken
5c87fe9
update readme links
9257e81
cancel concurrent job
18d08aa
set dense node to device
cc25124
sparse node set cpu
9d0b4d8
Merge branch 'main' into feature/openai_api
drunkcoding de0ebf5
remove OS def
ba35284
Merge branch 'feature/openai_api' of github.com:TorchMoE/MoE-Infinity…
128c30f
use update to date clang-format
e5f625f
fix setuptools version
48324d8
fix setuptools version for python 3.8
f73e5b0
keep single cuda version in publish
fe81a87
add max length in gen openai
845e89d
fix cache race condition
ef028d8
all param init at host
eb0bb11
add qwen3
50c9b65
Merge branch 'feature/openai_api' into feature/qwen
5c7e368
ubuntu lts and build
cde7d3b
pre-commit ubuntu version
ea2f3b3
router weights update overlap
5017bcc
rename deepseek_v2 and reduce torch kernel launch
042b2ee
fix import
8d190e9
fix build and fix bug
d902eca
fix citation linebreak
1a5e10f
fix typo
7916de6
fix dtype size
93bf9ad
remove comments
33932d0
fix example
823d393
pr update init
afd0bd1
remove comment and unify deepseek preroute
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| #pragma once | ||
|
|
||
| #include <linux/futex.h> | ||
| #include <sys/syscall.h> | ||
| #include <unistd.h> | ||
| #include <atomic> | ||
| #include <cerrno> | ||
| #include <stdexcept> | ||
|
|
||
// Futex: a lightweight wait/notify primitive built on the Linux futex(2)
// syscall around a single atomic value.
//
// NOTE(review): futex(2) operates on a 32-bit word; instantiating this with a
// type whose size is not 4 bytes is not supported by the kernel interface --
// T should be a 32-bit integral type (e.g. int). Confirm all instantiations.
template <typename T>
class Futex {
 public:
  Futex() : value_(static_cast<T>(0)) {}
  explicit Futex(T initial_value) : value_(initial_value) {}
  // Snapshot-copy of the other futex's current value.
  // (Fix: std::atomic has no get(); the original called other.value_.get().)
  explicit Futex(const Futex<T>& other) : value_(other.value_.load()) {}

  // Block until the stored value equals `expected`.
  //
  // FUTEX_WAIT only sleeps while *addr == val, so we must pass the *current*
  // (non-matching) value we just observed. The original passed `expected`,
  // which makes the kernel return EAGAIN immediately and turns the loop into
  // a busy spin whenever the value has not yet reached `expected`.
  void wait(T expected) {
    T current;
    while ((current = value_.load()) != expected) {
      long ret = syscall(SYS_futex, &value_, FUTEX_WAIT, current, nullptr,
                         nullptr, 0);
      // EAGAIN: value changed before we slept; EINTR: interrupted by a
      // signal. Both are benign -- re-check the value and retry.
      if (ret == -1 && errno != EAGAIN && errno != EINTR) {
        throw std::runtime_error("Futex wait failed");
      }
    }
  }

  // Wake up to `count` threads blocked in wait()/wait_and_set().
  void wake(int count = 1) {
    long ret =
        syscall(SYS_futex, &value_, FUTEX_WAKE, count, nullptr, nullptr, 0);
    if (ret == -1) {
      throw std::runtime_error("Futex wake failed");
    }
  }

  // Store a new value without waking waiters (pair with wake() if needed).
  void set(T new_value) { value_.store(new_value); }

  // Read the current value.
  T get() const { return value_.load(); }

  // Store `new_value`, then wake up to `count` waiters.
  void set_and_wake(T new_value, int count = 1) {
    value_.store(new_value);
    wake(count);
  }

  // Block until the value equals `expected`, then atomically replace it with
  // `new_value`. The CAS guards against another thread changing the value
  // between our load and our store.
  void wait_and_set(T expected, T new_value) {
    while (true) {
      T current = value_.load();
      if (current != expected) {
        long ret = syscall(SYS_futex, &value_, FUTEX_WAIT, current, nullptr,
                           nullptr, 0);
        if (ret == -1 && errno != EAGAIN && errno != EINTR) {
          throw std::runtime_error("Futex wait failed");
        }
      } else if (value_.compare_exchange_strong(current, new_value)) {
        // Successfully set the new value atomically.
        break;
      }
    }
  }

 private:
  std::atomic<T> value_;
};
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,108 @@ | ||
| #pragma once | ||
|
|
||
| #include <cuda_runtime_api.h> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
| #include <stdexcept> | ||
|
|
||
| #include "utils/cuda_utils.h" | ||
|
|
||
// CachingAllocator: wraps an Allocator policy (static allocate/deallocate)
// and caches freed blocks keyed by size, so repeated same-size allocations
// are served from the cache instead of the underlying allocator.
//
// NOTE(review): this class is not thread-safe; callers must synchronize if
// an instance is shared across threads -- confirm against call sites.
template <typename Allocator>
class CachingAllocator {
 public:
  // Maximum number of per-index singleton instances (e.g. one per device).
  static constexpr int kMaxInstances = 8;

  // Per-index singleton accessor. Instances are created lazily and are
  // intentionally never destroyed (process lifetime).
  // Throws std::out_of_range for idx outside [0, kMaxInstances) -- the
  // original indexed the array unchecked (undefined behavior).
  static CachingAllocator<Allocator>* instance(int idx) {
    // Plain pointer array: the original used std::array but the file never
    // included <array>.
    static CachingAllocator<Allocator>* instances[kMaxInstances] = {};
    if (idx < 0 || idx >= kMaxInstances) {
      throw std::out_of_range("CachingAllocator::instance: index out of range");
    }
    if (instances[idx] == nullptr) {
      instances[idx] = new CachingAllocator<Allocator>();
    }
    return instances[idx];
  }

  // Return a block of exactly `bytes` bytes, reusing a cached block of the
  // same size when one is available.
  void* allocate(const size_t bytes) {
    auto it = available_map_.find(bytes);
    if (it == available_map_.end() || it->second.empty()) {
      return allocate_and_cache(bytes);
    }
    void* ptr = it->second.back();
    it->second.pop_back();
    return ptr;
  }

  // Return `ptr` to the size-keyed cache for later reuse. Pointers that were
  // not allocated through this instance are released to the allocator
  // directly. Note: the size record is kept so a reused block still has a
  // known size on its next free().
  void free(void* ptr) {
    auto it = allocation_map_.find(ptr);
    if (it == allocation_map_.end()) {
      Allocator::deallocate(ptr);
      return;
    }
    available_map_[it->second].push_back(ptr);
  }

  // Forget `ptr` without touching the underlying allocator (e.g. when the
  // memory was released externally).
  void record_free(void* ptr) { allocation_map_.erase(ptr); }

  // Release every cached (currently unused) block back to the allocator.
  void free_cached() {
    for (const auto& entry : available_map_) {
      for (void* ptr : entry.second) {
        Allocator::deallocate(ptr);
        allocation_map_.erase(ptr);
      }
    }
    available_map_.clear();
  }

  ~CachingAllocator() { free_cached(); }

 private:
  // Allocate a fresh block and remember its size for free().
  void* allocate_and_cache(const size_t bytes) {
    void* ptr = Allocator::allocate(bytes);
    allocation_map_[ptr] = bytes;
    return ptr;
  }

  // size -> cached free blocks of that size.
  std::unordered_map<size_t, std::vector<void*>> available_map_;
  // live pointer -> its allocation size.
  std::unordered_map<void*, size_t> allocation_map_;
};
|
|
||
| // Example Allocator for CUDA | ||
| struct CudaDeviceAllocator { | ||
| static void* allocate(size_t bytes) { | ||
| void* ptr; | ||
| CUDA_CHECK(cudaMalloc(&ptr, bytes)); | ||
| return ptr; | ||
| } | ||
|
|
||
| static void deallocate(void* ptr) { CUDA_CHECK(cudaFree(ptr)); } | ||
| }; | ||
|
|
||
| // Example Allocator for Unified Memory | ||
| struct CudaUnifiedAllocator { | ||
| static void* allocate(size_t bytes) { | ||
| void* ptr; | ||
| CUDA_CHECK(cudaMallocManaged(&ptr, bytes)); | ||
| return ptr; | ||
| } | ||
|
|
||
| static void deallocate(void* ptr) { CUDA_CHECK(cudaFree(ptr)); } | ||
| }; | ||
|
|
||
| // Example Allocator for cudaHostAlloc | ||
| struct CudaHostAllocator { | ||
| static void* allocate(size_t bytes) { | ||
| void* ptr; | ||
| CUDA_CHECK(cudaHostAlloc(&ptr, bytes, cudaHostAllocDefault)); | ||
| return ptr; | ||
| } | ||
|
|
||
| static void deallocate(void* ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } | ||
| }; | ||
|
|
||
| // Template specialization for all types of CachingAllocator | ||
| typedef CachingAllocator<CudaDeviceAllocator> CudaDeviceCachingAllocator; | ||
| typedef CachingAllocator<CudaUnifiedAllocator> CudaUnifiedCachingAllocator; | ||
| typedef CachingAllocator<CudaHostAllocator> CudaHostCachingAllocator; | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we might not use a tensor item every time, so constructing a tensor just to query its itemsize() might be unnecessarily expensive.