Commit bd6f512

Move from CUDA to Triton for GPTQ-for-LLaMa

1 parent b668b59 commit bd6f512

12 files changed: 159 additions & 55 deletions

.github/workflows/publish-release-gptq.yml

Lines changed: 2 additions & 2 deletions

@@ -7,7 +7,7 @@ on:
 jobs:
   push_to_dockerhub:
     runs-on: ubuntu-latest
-    if: ${{ contains(github.ref, 'gptq-llama-cuda') }}
+    if: ${{ contains(github.ref, 'gptq-llama-triton') }}
     steps:
       - name: Checkout
        uses: actions/checkout@v3
@@ -30,7 +30,7 @@ jobs:
         uses: docker/build-push-action@v4
         with:
           context: .
-          file: Dockerfile.gptq-llama-cuda
+          file: Dockerfile.gptq-llama-triton
           push: ${{ github.event_name != 'pull_request' }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}

.github/workflows/publish-release.yml

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ on:
 jobs:
   push_to_dockerhub:
     runs-on: ubuntu-latest
-    if: ${{ !contains(github.ref, 'gptq-llama-cuda') }}
+    if: ${{ !contains(github.ref, 'gptq-llama-triton') }}
     steps:
       - name: Checkout
        uses: actions/checkout@v3

Dockerfile.gptq-llama-cuda

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
+# NOTE: this Dockerfile is deprecated and has been replaced by Dockerfile.gptq-llama-triton
+
 FROM debian:bullseye-slim as pytorch-install
 
 ARG PYTORCH_VERSION=2.0.0

Dockerfile.gptq-llama-triton

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+FROM debian:bullseye-slim as pytorch-install
+
+ARG PYTORCH_VERSION=2.0.0
+ARG PYTHON_VERSION=3.9
+ARG CUDA_VERSION=11.7.1
+ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_CHANNEL=nvidia
+ARG INSTALL_CHANNEL=pytorch
+# Automatically set by buildx
+ARG TARGETPLATFORM
+
+ENV PATH /opt/conda/bin:$PATH
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    ccache \
+    curl \
+    git && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install conda
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
+    *) MAMBA_ARCH=x86_64 ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+# Install pytorch
+# On arm64 we exit with an error code
+RUN case ${TARGETPLATFORM} in \
+    "linux/arm64") exit 1 ;; \
+    *) /opt/conda/bin/conda update -y conda && \
+    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION torchvision torchaudio "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
+
+# CUDA kernels builder image
+FROM pytorch-install as kernel-builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ninja-build \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.7.1" cuda==11.7.1 && \
+    /opt/conda/bin/conda clean -ya
+
+
+FROM debian:bullseye-slim as base
+
+ENV PATH=/opt/conda/bin:$PATH \
+    CONDA_PREFIX=/opt/conda
+
+LABEL com.nvidia.volumes.needed="nvidia_driver"
+
+# Copy conda with PyTorch installed
+COPY --from=kernel-builder /opt/conda /opt/conda
+
+RUN apt-get update && apt-get install -y build-essential git
+
+WORKDIR /llm-api
+
+COPY ./requirements.txt /llm-api/requirements.txt
+RUN pip3 install --no-cache-dir --upgrade -r requirements.txt
+
+COPY ./app /llm-api/app
+ENV PYTHONPATH "/llm-api"
+
+RUN git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa
+
+RUN cd GPTQ-for-LLaMa && \
+    pip3 install -r requirements.txt && \
+    cd ..
+
+RUN touch GPTQ-for-LLaMa/__init__.py && mv GPTQ-for-LLaMa /llm-api/app/llms/gptq_llama/GPTQforLLaMa
+
+FROM base
+
+CMD ["python3", "./app/main.py"]
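
The last few RUN steps vendor GPTQ-for-LLaMa into the app tree as an importable package, which is what the package-relative imports in app/llms/gptq_llama/gptq_llama.py (further below) rely on. A minimal sketch of that mechanism, with the clone replaced by a placeholder directory:

```python
# Sketch: making a cloned repo importable as a subpackage, mirroring the
# `touch GPTQ-for-LLaMa/__init__.py && mv ...` step above. The directory
# here is a placeholder standing in for the real `git clone`.
import pathlib

pkg = pathlib.Path("app/llms/gptq_llama/GPTQforLLaMa")
pkg.mkdir(parents=True, exist_ok=True)   # stand-in for the cloned repo
(pkg / "__init__.py").touch()            # marks the directory as a package

# With /llm-api on PYTHONPATH, imports such as
#   from .GPTQforLLaMa import quant
# inside app/llms/gptq_llama/ can now resolve against the vendored code.
```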

README.md

Lines changed: 20 additions & 21 deletions

@@ -4,21 +4,20 @@ This application can be used to run LLMs (Large Language Models) in docker conta
 
 The main motivation to start this project, was to be able to use different LLMs running on a local machine or a remote server with [langchain](https://github.com/hwchase17/langchain) using [langchain-llm-api](https://github.com/1b5d/langchain-llm-api)
 
-tested on CPU with the following models :
-
-- Llama 7b
-- Llama 13b
-- Llama 30b
-- Alpaca 7b
-- Alpaca 13b
-- Alpaca 30b
-- Vicuna 13b
-- Koala 7b
-
-tested on GPU with GPTQ-for-LlaMa with
-
-- Koala 7B-4bit-128g
-- wizardLM 7B-4bit-128g
+Tested with the following models:
+
+- Llama 7b - ggml
+- Llama 13b - ggml
+- Llama 30b - ggml
+- Alpaca 7b - ggml
+- Alpaca 13b - ggml
+- Alpaca 30b - ggml
+- Vicuna 13b - ggml
+- Koala 7b - ggml
+- Vicuna GPTQ 7B-4bit-128g
+- Vicuna GPTQ 13B-4bit-128g
+- Koala GPTQ 7B-4bit-128g
+- wizardLM GPTQ 7B-4bit-128g
 
 Contribution for supporting more models is welcomed.

@@ -60,7 +59,6 @@ to configure the application, edit `config.yaml` which is mounted into the docke
 ```
 models_dir: /models # dir inside the container
 model_family: alpaca
-model_name: 7b
 setup_params:
   key: value
 model_params:

@@ -101,7 +99,7 @@ POST /embeddings
 ```
 
 
-## Llama / Alpaca on CPU - using llama.cpp
+## Llama on CPU - using llama.cpp
 
 Llama and models based on it such as Alpaca and Vicuna are intended only for academic research and any commercial use is prohibited. This project doesn't provide any links to download these models.
 

@@ -110,7 +108,6 @@ You can configure the model usage in a local `config.yaml` file, the configs, he
 ```
 models_dir: /models # dir inside the container
 model_family: alpaca
-model_name: 7b
 setup_params:
   repo_id: user/repo_id
   filename: ggml-model-q4_0.bin

@@ -169,20 +166,22 @@ You should see a table showing you the current nvidia driver version and some ot
 +---------------------------------------------------------------------------------------+
 ```
 
-You can also run the Llama model using GPTQ-for-LLaMa 4 bit quantization, you can use a docker image specially built for that purpose `1b5d/llm-api:0.0.3-gptq-llama-cuda` instead of the default image.
+You can also run the Llama model using GPTQ-for-LLaMa 4 bit quantization, using a docker image specially built for that purpose, `1b5d/llm-api:0.0.4-gptq-llama-triton`, instead of the default image.
 
 a separate docker-compose file is also available to run this mode:
 
 ```
-docker compose -f docker-compose.gptq-llama-cuda.yaml up
+docker compose -f docker-compose.gptq-llama-triton.yaml up
 ```
 
 or by directly running the container:
 
 ```
-docker run --gpus all -v $PWD/models/:/models:rw -v $PWD/config.yaml:/llm-api/config.yaml:ro -p 8000:8000 1b5d/llm-api:0.0.3-gptq-llama-cuda
+docker run --gpus all -v $PWD/models/:/models:rw -v $PWD/config.yaml:/llm-api/config.yaml:ro -p 8000:8000 1b5d/llm-api:0.0.4-gptq-llama-triton
 ```
 
+**Note**: the `llm-api:0.0.x-gptq-llama-cuda` image has been deprecated; please switch to the triton image, as it seems more reliable
+
 Example config file:
 
 ```

app/base.py

Lines changed: 7 additions & 0 deletions

@@ -1,6 +1,8 @@
 """
 An interface which defines generic LLM related operations
 """
+import os
+import hashlib
 from abc import ABC, abstractmethod
 from typing import AsyncIterator, Dict, List
 
@@ -11,6 +13,11 @@ class BaseLLM(ABC):
     A base class for LLMs
     """
 
+    def get_model_dir(self, models_dir, model_family, model_name):
+        name_digest = str(int(hashlib.md5(model_name.encode('utf-8')).hexdigest(), 16))[0:12]
+        dir_name = '_'.join([model_family, name_digest])
+        return os.path.join(models_dir, dir_name)
+
     @abstractmethod
     def generate(self, prompt: str, params: Dict[str, str]) -> str:
         """

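The new helper gives each (family, filename) pair its own directory under `models_dir`, so models with the same family but different filenames no longer collide; the `model_name` setting it replaces is removed from the config below. A standalone sketch of the layout it produces, with illustrative inputs:

```python
# Sketch: how BaseLLM.get_model_dir derives the per-model directory.
# The inputs below are illustrative, not taken from a real config.
import hashlib
import os

def get_model_dir(models_dir, model_family, model_name):
    # first 12 decimal digits of the md5 of the model name
    name_digest = str(int(hashlib.md5(model_name.encode("utf-8")).hexdigest(), 16))[0:12]
    return os.path.join(models_dir, "_".join([model_family, name_digest]))

print(get_model_dir("/models", "alpaca", "ggml-model-q4_0.bin"))
# -> /models/alpaca_<12 digits>, stable across runs for the same filename
```
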
app/config.py

Lines changed: 0 additions & 1 deletion

@@ -18,7 +18,6 @@ class Settings(BaseSettings):  # pylint: disable=too-few-public-methods
     """
 
     models_dir: str = "./models"
-    model_name: str = "7b"
     model_family: str
     model_params: Dict[str, Any] = {}
     setup_params: Dict[str, Any] = {}

app/llms/gptq_llama/GPTQforLLaMa

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Subproject commit 5dbcb4a4bcec3cedc75062d5791dcfba651d79af

app/llms/gptq_llama/gptq_llama.py

Lines changed: 32 additions & 18 deletions

@@ -18,8 +18,8 @@
 sys.path.append(os.path.join(os.path.dirname(__file__), "GPTQ-for-LLaMa"))
 
 try:
-    from modelutils import find_layers
-    from quant import make_quant
+    from .GPTQforLLaMa import quant
+    from .GPTQforLLaMa.utils import find_layers
 except ImportError as exp:
     raise ImportError(
         "the GPTQ-for-LLaMa lib is missing, please install it first"
@@ -74,7 +74,11 @@ def _download(self, model_path, model_dir):  # pylint: disable=duplicate-code
         )
 
     def _setup(self):
-        model_dir = os.path.join(settings.models_dir, settings.model_family)
+        model_dir = super().get_model_dir(
+            settings.models_dir,
+            settings.model_family,
+            settings.setup_params['filename']
+        )
         model_path = os.path.join(
             model_dir,
             settings.setup_params["filename"],
@@ -91,7 +95,6 @@ def __init__(self, params: Dict[str, str]) -> None:
         wbits = params.get("wbits", 4)
         cuda_visible_devices = params.get("cuda_visible_devices", "0")
         dev = params.get("device", "cuda:0")
-        st_device = params.get("st_device", -1)
 
         os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
         self.device = torch.device(dev)
@@ -100,20 +103,17 @@ def __init__(self, params: Dict[str, str]) -> None:
             model_path,
             wbits,
             group_size,
-            st_device,
         )
 
         self.model.to(self.device)
         self.tokenizer = AutoTokenizer.from_pretrained(
             settings.setup_params["repo_id"], use_fast=False
         )
 
-    def _load_quant(
-        self, model, checkpoint, wbits, groupsize, device
-    ):  # pylint: disable=too-many-arguments
+    def _load_quant(self, model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
         config = LlamaConfig.from_pretrained(model)
 
-        def noop(*args, **kwargs):  # pylint: disable=unused-argument
+        def noop(*args, **kwargs):
             pass
 
         torch.nn.init.kaiming_uniform_ = noop
@@ -125,21 +125,31 @@ def noop(*args, **kwargs):  # pylint: disable=unused-argument
         torch.set_default_dtype(torch.half)
         model = LlamaForCausalLM(config)
         torch.set_default_dtype(torch.float)
-        model = model.eval()  # pylint: disable=no-member
+        if eval:
+            model = model.eval()
         layers = find_layers(model)
-        for name in ["lm_head"]:
+        for name in ['lm_head']:
             if name in layers:
                 del layers[name]
-        make_quant(model, layers, wbits, groupsize)
+        quant.make_quant_linear(model, layers, wbits, groupsize)
+
+        del layers
 
         logger.info("Loading model ...")
-        print("Loading model ...")
         if checkpoint.endswith(".safetensors"):
-            if device == -1:
-                device = "cpu"
-            model.load_state_dict(safe_load(checkpoint, device))
+            model.load_state_dict(safe_load(checkpoint), strict=False)
         else:
-            model.load_state_dict(torch.load(checkpoint))
+            model.load_state_dict(torch.load(checkpoint), strict=False)
+
+        if eval:
+            quant.make_quant_attn(model)
+            quant.make_quant_norm(model)
+        if fused_mlp:
+            quant.make_fused_mlp(model)
+        if warmup_autotune:
+            quant.autotune_warmup_linear(model, transpose=not (eval))
+        if eval and fused_mlp:
+            quant.autotune_warmup_fused(model)
         model.seqlen = 2048
         logger.info("Done loading model.")
 
@@ -165,7 +175,11 @@ def generate(self, prompt: str, params: Dict[str, str]) -> str:
             top_p=top_p,
             temperature=temperature,
         )
-        return self.tokenizer.decode([el.item() for el in generated_ids[0]])
+        return self.tokenizer.decode(
+            [el.item() for el in generated_ids[:, input_ids.shape[1]:][0]],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
 
     async def agenerate(
         self, prompt: str, params: Dict[str, str]
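
The decode change trims the prompt tokens off the front of `generated_ids` before decoding, so the API now returns only the completion rather than echoing the prompt. A small sketch of that slicing with made-up token ids:

```python
# Sketch: HF `generate()` output starts with the prompt tokens, so slicing
# at input_ids.shape[1] keeps only the newly generated tokens.
# Token ids below are made up for illustration.
import torch

input_ids = torch.tensor([[101, 2009, 2003]])                  # 3 prompt tokens
generated_ids = torch.tensor([[101, 2009, 2003, 7592, 2088]])  # prompt + 2 new tokens

new_tokens = generated_ids[:, input_ids.shape[1]:]
print(new_tokens)  # tensor([[7592, 2088]]) -- only the completion is decoded
```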

app/llms/llama/llama.py

Lines changed: 10 additions & 10 deletions

@@ -24,7 +24,7 @@ class LlamaLLM(BaseLLM):
     Llama LLM implementation
     """
 
-    def _download(self, model_path):
+    def _download(self, model_path, model_dir):
         if os.path.exists(model_path):
             logger.info("found an existing model %s", model_path)
             return
@@ -34,23 +34,23 @@ def _download(self, model_path):
         huggingface_hub.hf_hub_download(
             repo_id=settings.setup_params["repo_id"],
             filename=settings.setup_params["filename"],
-            local_dir=settings.models_dir,
+            local_dir=model_dir,
             local_dir_use_symlinks=False,
             cache_dir=os.path.join(settings.models_dir, ".cache"),
         )
 
-        os.rename(
-            os.path.join(settings.models_dir, settings.setup_params["filename"]),
-            model_path,
-        )
-
     def _setup(self):
-        model_path = os.path.join(
+        model_dir = super().get_model_dir(
             settings.models_dir,
-            f"ggml-{settings.model_family}-{settings.model_name}-q4.bin",
+            settings.model_family,
+            settings.setup_params['filename']
+        )
+        model_path = os.path.join(
+            model_dir,
+            settings.setup_params['filename'],
         )
 
-        self._download(model_path=model_path)
+        self._download(model_path, model_dir)
 
         if settings.setup_params["convert"]:
             tokenizer_model_path = os.path.join(settings.models_dir, "tokenizer.model")
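
Since `hf_hub_download` now writes directly into the hashed per-model directory, the old `os.rename` shuffle is gone. A sketch of the new flow, with placeholder repo and path values:

```python
# Sketch of the new download flow; repo_id, filename and directories are
# placeholders, and the call needs network access to a real repo.
import os
import huggingface_hub

models_dir = "/models"
model_dir = "/models/alpaca_123456789012"   # from BaseLLM.get_model_dir(...)
filename = "ggml-model-q4_0.bin"

huggingface_hub.hf_hub_download(
    repo_id="user/repo_id",
    filename=filename,
    local_dir=model_dir,                    # file lands here directly
    local_dir_use_symlinks=False,           # a real copy, not a cache symlink
    cache_dir=os.path.join(models_dir, ".cache"),
)

model_path = os.path.join(model_dir, filename)  # no os.rename needed
```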
