diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index aae6bd0..f4a74c6 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -17,7 +17,7 @@ concurrency:
jobs:
build:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container:
image: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
diff --git a/.github/workflows/pre-commit-format.yml b/.github/workflows/pre-commit-format.yml
index c92e6de..bb63b48 100644
--- a/.github/workflows/pre-commit-format.yml
+++ b/.github/workflows/pre-commit-format.yml
@@ -18,7 +18,7 @@ jobs:
# formatting and basic install on cpu-only machine
unit-tests:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml
index 0699ecb..8de8e2b 100644
--- a/.github/workflows/publish-test.yml
+++ b/.github/workflows/publish-test.yml
@@ -16,7 +16,7 @@ permissions:
jobs:
setup-version:
- runs-on: ubuntu-latest
+ runs-on: ubuntu-22.04
steps:
- name: Generate version number
run: |
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 3c02bfe..f11ed2f 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -11,7 +11,7 @@ permissions:
jobs:
release:
name: Create Release
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
outputs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
diff --git a/.gitignore b/.gitignore
index d29bc0e..421638c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode
test_*.py
+*.nsys-*
+*.sh
# Prerequisites
*.d
diff --git a/README.md b/README.md
index 930efe5..bc29eab 100644
--- a/README.md
+++ b/README.md
@@ -36,15 +36,14 @@ Note that: The open-sourced MoE-Infinity has been redesigned for making it Huggi
Single GPU A5000 (24GB Memory), per-token-latency (seconds) for generation with a mixed dataset that includes [LongBench](https://huggingface.co/datasets/THUDM/LongBench), [GSM8K](https://huggingface.co/datasets/openai/gsm8k), [FLAN](https://huggingface.co/datasets/Muennighoff/flan), [BIG-Bench](https://huggingface.co/datasets/bigbench) and [MMLU](https://huggingface.co/datasets/lukaemon/mmlu) datasets.
Lower per-token-latency is preferable.
-| | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite
-| :---: | :---: | :---: | :---: | :---: |
-| MoE-Infinity | *0.130* | *0.119* | *0.735* | *0.155* |
-| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 |
-|DeepSpeed | 4.578 | 8.381 | 2.486 | 0.737 |
-|Mixtral Offloading| X | X | 1.752 | X |
-|Ollama | X | X | 0.903 | 1.250 |
-|vLLM| X | X | 2.137 | 0.493 |
-
+| System | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite-Chat | Qwen3-30B-A3B |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| MoE-Infinity | *0.130* | *0.119* | *0.735* | *0.100* | *0.150* |
+| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 | |
+| DeepSpeed (0.16.2) | 4.578 | 8.381 | 2.486 | 0.737 | 7.857 |
+| Mixtral Offloading | X | X | 1.752 | X | X |
+| Ollama | X | X | 0.903 | 1.250 | |
+| vLLM (v0.8.5) | X | X | 2.137 | 0.149 | 0.205 |
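
A rough way to reproduce a per-token-latency figure like those in the table above is to divide the generation wall-clock time by the number of newly generated tokens. The sketch below uses the standard Hugging Face `transformers` API; the checkpoint name, prompt, and token budget are placeholder assumptions, not the benchmark's actual harness.

```python
# Minimal sketch: measure per-token decode latency for a causal LM.
# Assumes a CUDA GPU is available; the model name below is a placeholder.
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

inputs = tokenizer(
    "Summarize: offloading-based MoE inference.", return_tensors="pt"
).to(model.device)

# Synchronize around generate() so CUDA kernel launches are fully timed.
torch.cuda.synchronize()
start = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=128)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start

new_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
print(f"per-token latency: {elapsed / new_tokens:.3f} s")
```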