Commit bd6f512

Move from CUDA to Triton for GPTQ-for-LLaMa

1 parent b668b59 commit bd6f512

12 files changed: 159 additions & 55 deletions

.github/workflows/publish-release-gptq.yml

Lines changed: 2 additions & 2 deletions

@@ -7,7 +7,7 @@ on:
 jobs:
   push_to_dockerhub:
     runs-on: ubuntu-latest
-    if: ${{ contains(github.ref, 'gptq-llama-cuda') }}
+    if: ${{ contains(github.ref, 'gptq-llama-triton') }}
     steps:
       - name: Checkout
        uses: actions/checkout@v3
@@ -30,7 +30,7 @@ jobs:
         uses: docker/build-push-action@v4
         with:
           context: .
-          file: Dockerfile.gptq-llama-cuda
+          file: Dockerfile.gptq-llama-triton
           push: ${{ github.event_name != 'pull_request' }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}

.github/workflows/publish-release.yml

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ on:
 jobs:
   push_to_dockerhub:
     runs-on: ubuntu-latest
-    if: ${{ !contains(github.ref, 'gptq-llama-cuda') }}
+    if: ${{ !contains(github.ref, 'gptq-llama-triton') }}
     steps:
       - name: Checkout
        uses: actions/checkout@v3

Dockerfile.gptq-llama-cuda

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
+# NOTE: this Dockerfile is deprecated and has been replaced by Dockerfile.gptq-llama-triton
+
 FROM debian:bullseye-slim as pytorch-install
 
 ARG PYTORCH_VERSION=2.0.0

Dockerfile.gptq-llama-triton

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+FROM debian:bullseye-slim as pytorch-install
+
+ARG PYTORCH_VERSION=2.0.0
+ARG PYTHON_VERSION=3.9
+ARG CUDA_VERSION=11.7.1
+ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_CHANNEL=nvidia
+ARG INSTALL_CHANNEL=pytorch
+# Automatically set by buildx
+ARG TARGETPLATFORM
+
+ENV PATH /opt/conda/bin:$PATH
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    ccache \
+    curl \
+    git && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install conda
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
+    *) MAMBA_ARCH=x86_64 ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+# Install pytorch
+# On arm64 we exit with an error code
+RUN case ${TARGETPLATFORM} in \
+    "linux/arm64") exit 1 ;; \
+    *) /opt/conda/bin/conda update -y conda && \
+    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION torchvision torchaudio "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
+
+# CUDA kernels builder image
+FROM pytorch-install as kernel-builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ninja-build \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.7.1" cuda==11.7.1 && \
+    /opt/conda/bin/conda clean -ya
+
+
+FROM debian:bullseye-slim as base
+
+ENV PATH=/opt/conda/bin:$PATH \
+    CONDA_PREFIX=/opt/conda
+
+LABEL com.nvidia.volumes.needed="nvidia_driver"
+
+# Copy conda with PyTorch installed
+COPY --from=kernel-builder /opt/conda /opt/conda
+
+RUN apt-get update && apt-get install -y build-essential git
+
+WORKDIR /llm-api
+
+COPY ./requirements.txt /llm-api/requirements.txt
+RUN pip3 install --no-cache-dir --upgrade -r requirements.txt
+
+COPY ./app /llm-api/app
+ENV PYTHONPATH "/llm-api"
+
+RUN git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa
+
+RUN cd GPTQ-for-LLaMa && \
+    pip3 install -r requirements.txt && \
+    cd ..
+
+RUN touch GPTQ-for-LLaMa/__init__.py && mv GPTQ-for-LLaMa /llm-api/app/llms/gptq_llama/GPTQforLLaMa
+
+FROM base
+
+CMD ["python3", "./app/main.py"]
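
The last few RUN steps vendor GPTQ-for-LLaMa into the app tree as an importable package, which is what the package-relative imports in app/llms/gptq_llama/gptq_llama.py (further below) rely on. A minimal sketch of that mechanism, with the clone replaced by a placeholder directory:

```python
# Sketch: making a cloned repo importable as a subpackage, mirroring the
# `touch GPTQ-for-LLaMa/__init__.py && mv ...` step above. The directory
# here is a placeholder standing in for the real `git clone`.
import pathlib

pkg = pathlib.Path("app/llms/gptq_llama/GPTQforLLaMa")
pkg.mkdir(parents=True, exist_ok=True)   # stand-in for the cloned repo
(pkg / "__init__.py").touch()            # marks the directory as a package

# With /llm-api on PYTHONPATH, imports such as
#   from .GPTQforLLaMa import quant
# inside app/llms/gptq_llama/ can now resolve against the vendored code.
```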

README.md

Lines changed: 20 additions & 21 deletions

@@ -4,21 +4,20 @@ This application can be used to run LLMs (Large Language Models) in docker conta
 
 The main motivation to start this project, was to be able to use different LLMs running on a local machine or a remote server with [langchain](https://github.com/hwchase17/langchain) using [langchain-llm-api](https://github.com/1b5d/langchain-llm-api)
 
-tested on CPU with the following models :
-
-- Llama 7b
-- Llama 13b
-- Llama 30b
-- Alpaca 7b
-- Alpaca 13b
-- Alpaca 30b
-- Vicuna 13b
-- Koala 7b
-
-tested on GPU with GPTQ-for-LlaMa with
-
-- Koala 7B-4bit-128g
-- wizardLM 7B-4bit-128g
+Tested with the following models:
+
+- Llama 7b - ggml
+- Llama 13b - ggml
+- Llama 30b - ggml
+- Alpaca 7b - ggml
+- Alpaca 13b - ggml
+- Alpaca 30b - ggml
+- Vicuna 13b - ggml
+- Koala 7b - ggml
+- Vicuna GPTQ 7B-4bit-128g
+- Vicuna GPTQ 13B-4bit-128g
+- Koala GPTQ 7B-4bit-128g
+- wizardLM GPTQ 7B-4bit-128g
 
 Contribution for supporting more models is welcomed.

@@ -60,7 +59,6 @@ to configure the application, edit `config.yaml` which is mounted into the docke
 ```
 models_dir: /models # dir inside the container
 model_family: alpaca
-model_name: 7b
 setup_params:
   key: value
 model_params:

@@ -101,7 +99,7 @@ POST /embeddings
 ```
 
 
-## Llama / Alpaca on CPU - using llama.cpp
+## Llama on CPU - using llama.cpp
 
 Llama and models based on it such as Alpaca and Vicuna are intended only for academic research and any commercial use is prohibited. This project doesn't provide any links to download these models.
 

@@ -110,7 +108,6 @@ You can configure the model usage in a local `config.yaml` file, the configs, he
 ```
 models_dir: /models # dir inside the container
 model_family: alpaca
-model_name: 7b
 setup_params:
   repo_id: user/repo_id
   filename: ggml-model-q4_0.bin

@@ -169,20 +166,22 @@ You should see a table showing you the current nvidia driver version and some ot
 +---------------------------------------------------------------------------------------+
 ```
 
-You can also run the Llama model using GPTQ-for-LLaMa 4 bit quantization, you can use a docker image specially built for that purpose `1b5d/llm-api:0.0.3-gptq-llama-cuda` instead of the default image.
+You can also run the Llama model using GPTQ-for-LLaMa 4 bit quantization, using a docker image specially built for that purpose, `1b5d/llm-api:0.0.4-gptq-llama-triton`, instead of the default image.
 
 a separate docker-compose file is also available to run this mode:
 
 ```
-docker compose -f docker-compose.gptq-llama-cuda.yaml up
+docker compose -f docker-compose.gptq-llama-triton.yaml up
 ```
 
 or by directly running the container:
 
 ```
-docker run --gpus all -v $PWD/models/:/models:rw -v $PWD/config.yaml:/llm-api/config.yaml:ro -p 8000:8000 1b5d/llm-api:0.0.3-gptq-llama-cuda
+docker run --gpus all -v $PWD/models/:/models:rw -v $PWD/config.yaml:/llm-api/config.yaml:ro -p 8000:8000 1b5d/llm-api:0.0.4-gptq-llama-triton
 ```
 
+**Note**: the `llm-api:0.0.x-gptq-llama-cuda` image has been deprecated; please switch to the triton image, as it seems more reliable
+
 Example config file:
 
 ```

app/base.py

Lines changed: 7 additions & 0 deletions

@@ -1,6 +1,8 @@
 """
 An interface which defines generic LLM related operations
 """
+import os
+import hashlib
 from abc import ABC, abstractmethod
 from typing import AsyncIterator, Dict, List
 
@@ -11,6 +13,11 @@ class BaseLLM(ABC):
     A base class for LLMs
     """
 
+    def get_model_dir(self, models_dir, model_family, model_name):
+        name_digest = str(int(hashlib.md5(model_name.encode('utf-8')).hexdigest(), 16))[0:12]
+        dir_name = '_'.join([model_family, name_digest])
+        return os.path.join(models_dir, dir_name)
+
     @abstractmethod
     def generate(self, prompt: str, params: Dict[str, str]) -> str:
         """

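The new helper gives each (family, filename) pair its own directory under `models_dir`, so models with the same family but different filenames no longer collide; the `model_name` setting it replaces is removed from the config below. A standalone sketch of the layout it produces, with illustrative inputs:

```python
# Sketch: how BaseLLM.get_model_dir derives the per-model directory.
# The inputs below are illustrative, not taken from a real config.
import hashlib
import os

def get_model_dir(models_dir, model_family, model_name):
    # first 12 decimal digits of the md5 of the model name
    name_digest = str(int(hashlib.md5(model_name.encode("utf-8")).hexdigest(), 16))[0:12]
    return os.path.join(models_dir, "_".join([model_family, name_digest]))

print(get_model_dir("/models", "alpaca", "ggml-model-q4_0.bin"))
# -> /models/alpaca_<12 digits>, stable across runs for the same filename
```
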
app/config.py

Lines changed: 0 additions & 1 deletion

@@ -18,7 +18,6 @@ class Settings(BaseSettings):  # pylint: disable=too-few-public-methods
     """
 
     models_dir: str = "./models"
-    model_name: str = "7b"
     model_family: str
     model_params: Dict[str, Any] = {}
     setup_params: Dict[str, Any] = {}

app/llms/gptq_llama/GPTQforLLaMa

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Subproject commit 5dbcb4a4bcec3cedc75062d5791dcfba651d79af

app/llms/gptq_llama/gptq_llama.py

Lines changed: 32 additions & 18 deletions

@@ -18,8 +18,8 @@
 sys.path.append(os.path.join(os.path.dirname(__file__), "GPTQ-for-LLaMa"))
 
 try:
-    from modelutils import find_layers
-    from quant import make_quant
+    from .GPTQforLLaMa import quant
+    from .GPTQforLLaMa.utils import find_layers
 except ImportError as exp:
     raise ImportError(
         "the GPTQ-for-LLaMa lib is missing, please install it first"
@@ -74,7 +74,11 @@ def _download(self, model_path, model_dir):  # pylint: disable=duplicate-code
         )
 
     def _setup(self):
-        model_dir = os.path.join(settings.models_dir, settings.model_family)
+        model_dir = super().get_model_dir(
+            settings.models_dir,
+            settings.model_family,
+            settings.setup_params['filename']
+        )
         model_path = os.path.join(
             model_dir,
             settings.setup_params["filename"],
@@ -91,7 +95,6 @@ def __init__(self, params: Dict[str, str]) -> None:
         wbits = params.get("wbits", 4)
         cuda_visible_devices = params.get("cuda_visible_devices", "0")
         dev = params.get("device", "cuda:0")
-        st_device = params.get("st_device", -1)
 
         os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
         self.device = torch.device(dev)
@@ -100,20 +103,17 @@ def __init__(self, params: Dict[str, str]) -> None:
             model_path,
             wbits,
             group_size,
-            st_device,
         )
 
         self.model.to(self.device)
         self.tokenizer = AutoTokenizer.from_pretrained(
             settings.setup_params["repo_id"], use_fast=False
         )
 
-    def _load_quant(
-        self, model, checkpoint, wbits, groupsize, device
-    ):  # pylint: disable=too-many-arguments
+    def _load_quant(self, model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
         config = LlamaConfig.from_pretrained(model)
 
-        def noop(*args, **kwargs):  # pylint: disable=unused-argument
+        def noop(*args, **kwargs):
             pass
 
         torch.nn.init.kaiming_uniform_ = noop
@@ -125,21 +125,31 @@ def noop(*args, **kwargs):  # pylint: disable=unused-argument
         torch.set_default_dtype(torch.half)
         model = LlamaForCausalLM(config)
         torch.set_default_dtype(torch.float)
-        model = model.eval()  # pylint: disable=no-member
+        if eval:
+            model = model.eval()
         layers = find_layers(model)
-        for name in ["lm_head"]:
+        for name in ['lm_head']:
             if name in layers:
                 del layers[name]
-        make_quant(model, layers, wbits, groupsize)
+        quant.make_quant_linear(model, layers, wbits, groupsize)
+
+        del layers
 
         logger.info("Loading model ...")
-        print("Loading model ...")
         if checkpoint.endswith(".safetensors"):
-            if device == -1:
-                device = "cpu"
-            model.load_state_dict(safe_load(checkpoint, device))
+            model.load_state_dict(safe_load(checkpoint), strict=False)
         else:
-            model.load_state_dict(torch.load(checkpoint))
+            model.load_state_dict(torch.load(checkpoint), strict=False)
+
+        if eval:
+            quant.make_quant_attn(model)
+            quant.make_quant_norm(model)
+        if fused_mlp:
+            quant.make_fused_mlp(model)
+        if warmup_autotune:
+            quant.autotune_warmup_linear(model, transpose=not (eval))
+        if eval and fused_mlp:
+            quant.autotune_warmup_fused(model)
         model.seqlen = 2048
         logger.info("Done loading model.")
 
@@ -165,7 +175,11 @@ def generate(self, prompt: str, params: Dict[str, str]) -> str:
             top_p=top_p,
             temperature=temperature,
         )
-        return self.tokenizer.decode([el.item() for el in generated_ids[0]])
+        return self.tokenizer.decode(
+            [el.item() for el in generated_ids[:, input_ids.shape[1]:][0]],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
 
     async def agenerate(
         self, prompt: str, params: Dict[str, str]
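
The decode change trims the prompt tokens off the front of `generated_ids` before decoding, so the API now returns only the completion rather than echoing the prompt. A small sketch of that slicing with made-up token ids:

```python
# Sketch: HF `generate()` output starts with the prompt tokens, so slicing
# at input_ids.shape[1] keeps only the newly generated tokens.
# Token ids below are made up for illustration.
import torch

input_ids = torch.tensor([[101, 2009, 2003]])                  # 3 prompt tokens
generated_ids = torch.tensor([[101, 2009, 2003, 7592, 2088]])  # prompt + 2 new tokens

new_tokens = generated_ids[:, input_ids.shape[1]:]
print(new_tokens)  # tensor([[7592, 2088]]) -- only the completion is decoded
```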

app/llms/llama/llama.py

Lines changed: 10 additions & 10 deletions

@@ -24,7 +24,7 @@ class LlamaLLM(BaseLLM):
     Llama LLM implementation
     """
 
-    def _download(self, model_path):
+    def _download(self, model_path, model_dir):
         if os.path.exists(model_path):
             logger.info("found an existing model %s", model_path)
             return
@@ -34,23 +34,23 @@ def _download(self, model_path):
         huggingface_hub.hf_hub_download(
             repo_id=settings.setup_params["repo_id"],
             filename=settings.setup_params["filename"],
-            local_dir=settings.models_dir,
+            local_dir=model_dir,
             local_dir_use_symlinks=False,
             cache_dir=os.path.join(settings.models_dir, ".cache"),
         )
 
-        os.rename(
-            os.path.join(settings.models_dir, settings.setup_params["filename"]),
-            model_path,
-        )
-
     def _setup(self):
-        model_path = os.path.join(
+        model_dir = super().get_model_dir(
             settings.models_dir,
-            f"ggml-{settings.model_family}-{settings.model_name}-q4.bin",
+            settings.model_family,
+            settings.setup_params['filename']
+        )
+        model_path = os.path.join(
+            model_dir,
+            settings.setup_params['filename'],
         )
 
-        self._download(model_path=model_path)
+        self._download(model_path, model_dir)
 
         if settings.setup_params["convert"]:
             tokenizer_model_path = os.path.join(settings.models_dir, "tokenizer.model")
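
Since `hf_hub_download` now writes directly into the hashed per-model directory, the old `os.rename` shuffle is gone. A sketch of the new flow, with placeholder repo and path values:

```python
# Sketch of the new download flow; repo_id, filename and directories are
# placeholders, and the call needs network access to a real repo.
import os
import huggingface_hub

models_dir = "/models"
model_dir = "/models/alpaca_123456789012"   # from BaseLLM.get_model_dir(...)
filename = "ggml-model-q4_0.bin"

huggingface_hub.hf_hub_download(
    repo_id="user/repo_id",
    filename=filename,
    local_dir=model_dir,                    # file lands here directly
    local_dir_use_symlinks=False,           # a real copy, not a cache symlink
    cache_dir=os.path.join(models_dir, ".cache"),
)

model_path = os.path.join(model_dir, filename)  # no os.rename needed
```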
