diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index aae6bd0..f4a74c6 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -17,7 +17,7 @@ concurrency:
 jobs:
   build:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     container:
       image: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
diff --git a/.github/workflows/pre-commit-format.yml b/.github/workflows/pre-commit-format.yml
index c92e6de..bb63b48 100644
--- a/.github/workflows/pre-commit-format.yml
+++ b/.github/workflows/pre-commit-format.yml
@@ -18,7 +18,7 @@ jobs:
   # formatting and basic install on cpu-only machine
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/publish-test.yml b/.github/workflows/publish-test.yml
index 0699ecb..8de8e2b 100644
--- a/.github/workflows/publish-test.yml
+++ b/.github/workflows/publish-test.yml
@@ -16,7 +16,7 @@ permissions:
 jobs:
   setup-version:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - name: Generate version number
         run: |
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 3c02bfe..f11ed2f 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -11,7 +11,7 @@ permissions:
 jobs:
   release:
     name: Create Release
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     outputs:
       upload_url: ${{ steps.create_release.outputs.upload_url }}
     steps:
diff --git a/.gitignore b/.gitignore
index d29bc0e..421638c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@
 # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode
 test_*.py
+*.nsys-*
+*.sh

 # Prerequisites
 *.d
diff --git a/README.md b/README.md
index 930efe5..bc29eab 100644
--- a/README.md
+++ b/README.md
@@ -36,15 +36,14 @@ Note that: The open-sourced MoE-Infinity has been redesigned for making it Huggi
 Single GPU A5000 (24GB Memory), per-token-latency (seconds) for generation with a mixed dataset that includes [LongBench](https://huggingface.co/datasets/THUDM/LongBench), [GSM8K](https://huggingface.co/datasets/openai/gsm8k), [FLAN](https://huggingface.co/datasets/Muennighoff/flan), [BIG-Bench](https://huggingface.co/datasets/bigbench) and [MMLU](https://huggingface.co/datasets/lukaemon/mmlu) datasets. Lower per-token-latency is preferable.

-| | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite
-| :---: | :---: | :---: | :---: | :---: |
-| MoE-Infinity | *0.130* | *0.119* | *0.735* | *0.155* |
-| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 |
-|DeepSpeed | 4.578 | 8.381 | 2.486 | 0.737 |
-|Mixtral Offloading| X | X | 1.752 | X |
-|Ollama | X | X | 0.903 | 1.250 |
-|vLLM| X | X | 2.137 | 0.493 |
-
+| | Switch-large-128 | NLLB-MoE-54B | Mixtral-8x7b | DeepSeek-V2-Lite-Chat | Qwen3-30B-A3B |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| MoE-Infinity | *0.130* | *0.119* | *0.735* | *0.100* | *0.150* |
+| Accelerate | 1.043 | 3.071 | 6.633 | 1.743 | |
+| DeepSpeed (0.16.2) | 4.578 | 8.381 | 2.486 | 0.737 | 7.857 |
+| Mixtral Offloading | X | X | 1.752 | X | X |
+| Ollama | X | X | 0.903 | 1.250 | |
+| vLLM (v0.8.5) | X | X | 2.137 | 0.149 | 0.205 |
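For context on the README hunk above, here is a minimal sketch of how a per-token-latency number of this kind can be measured. It uses the plain `transformers` + Accelerate offloading path (i.e., the `Accelerate` baseline row), not MoE-Infinity's own loader; the checkpoint, prompt, and token budget are illustrative assumptions, not the benchmark's actual harness.

```python
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; any MoE model from the table could be substituted.
checkpoint = "mistralai/Mixtral-8x7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# device_map="auto" lets Accelerate offload layers that exceed GPU memory,
# which is what makes a >24GB model runnable on a single A5000.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, torch_dtype=torch.float16, device_map="auto"
)

prompt = "Question: what is 17 * 24? Answer:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

start = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=False)
elapsed = time.perf_counter() - start

# Per-token latency = wall-clock generation time / number of new tokens.
new_tokens = outputs.shape[-1] - inputs["input_ids"].shape[-1]
print(f"per-token latency: {elapsed / new_tokens:.3f} s")
```

A real benchmark would average over many prompts drawn from the listed datasets and discard warm-up iterations; this sketch only shows where the reported seconds-per-token figure comes from.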