diff --git a/README.md b/README.md
index 0944a354754..c9e907af184 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,9 @@

+[](https://hub.docker.com/r/localai/localai)
+[](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
+
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
 > [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
diff --git a/docs/content/_index.en.md b/docs/content/_index.en.md
index 81ebb773163..6242a5255bc 100644
--- a/docs/content/_index.en.md
+++ b/docs/content/_index.en.md
@@ -18,6 +18,9 @@ title = "LocalAI"

+[](https://hub.docker.com/r/localai/localai)
+[](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
+
 > 💡 Get help - [❓FAQ](https://localai.io/faq/) [❓How tos](https://localai.io/howtos/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [💭Discord](https://discord.gg/uJAeKSAGDy)
 >
 > [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
diff --git a/docs/content/advanced/_index.en.md b/docs/content/advanced/_index.en.md
index 608254bc26e..79e36749771 100644
--- a/docs/content/advanced/_index.en.md
+++ b/docs/content/advanced/_index.en.md
@@ -365,6 +365,36 @@ docker run --env REBUILD=true localai
 docker run --env-file .env localai
 ```
+### CLI parameters
+
+You can control LocalAI with command line arguments, for example to specify a bind address or the number of threads.
+
+| Parameter | Environment Variable | Default Value | Description |
+| ------------------------------ | ------------------------------- | -------------------------------------------------- | ------------------------------------------------------------------- |
+| --f16 | $F16 | false | Enable f16 mode |
+| --debug | $DEBUG | false | Enable debug mode |
+| --cors | $CORS | false | Enable CORS support |
+| --cors-allow-origins value | $CORS_ALLOW_ORIGINS | | Specify origins allowed for CORS |
+| --threads value | $THREADS | 4 | Number of threads to use for parallel computation |
+| --models-path value | $MODELS_PATH | ./models | Path to the directory containing models used for inferencing |
+| --preload-models value | $PRELOAD_MODELS | | List of models to preload in JSON format at startup |
+| --preload-models-config value | $PRELOAD_MODELS_CONFIG | | Path to a YAML config file with a list of models to apply at startup |
+| --config-file value | $CONFIG_FILE | | Path to the config file |
+| --address value | $ADDRESS | :8080 | Specify the bind address for the API server |
+| --image-path value | $IMAGE_PATH | | Path to the directory used to store generated images |
+| --context-size value | $CONTEXT_SIZE | 512 | Default context size of the model |
+| --upload-limit value | $UPLOAD_LIMIT | 15 | Default upload limit in megabytes (audio file upload) |
+| --galleries | $GALLERIES | | Allows setting galleries from the command line |
+| --parallel-requests | $PARALLEL_REQUESTS | false | Enable backends to handle multiple requests in parallel, for backends that support it (such as llama.cpp or vllm) |
+| --single-active-backend | $SINGLE_ACTIVE_BACKEND | false | Allow only one backend to be running |
+| --api-keys value | $API_KEY | empty | List of API keys to enable API authentication. When set, all requests must be authenticated with one of these keys |
+| --enable-watchdog-idle | $WATCHDOG_IDLE | false | Enable watchdog for stopping idle backends. This will stop backends that stay idle for too long |
+| --enable-watchdog-busy | $WATCHDOG_BUSY | false | Enable watchdog for stopping busy backends that exceed a defined threshold |
+| --watchdog-busy-timeout value | $WATCHDOG_BUSY_TIMEOUT | 5m | Busy watchdog timeout: backends that stay busy for longer than this are stopped |
+| --watchdog-idle-timeout value | $WATCHDOG_IDLE_TIMEOUT | 15m | Idle watchdog timeout: backends that stay idle for longer than this are stopped |
+| --preload-backend-only | $PRELOAD_BACKEND_ONLY | false | If set, the API is NOT launched and only the preloaded models / backends are started. This is intended for multi-node setups |
+| --external-grpc-backends | $EXTERNAL_GRPC_BACKENDS | none | Comma-separated list of external gRPC backends to use. Format: `name:host:port` or `name:/path/to/file` |
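+
+For example, a minimal sketch (the flags are taken from the table above; the models directory is just an illustration):
+
+```bash
+# bind the API on port 8080, use 8 threads and a custom models directory
+local-ai --address ":8080" --threads 8 --models-path /path/to/models
+
+# the same configuration expressed with the corresponding environment variables
+ADDRESS=":8080" THREADS=8 MODELS_PATH=/path/to/models local-ai
+```
+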
 ### Extra backends
diff --git a/docs/content/getting_started/_index.en.md b/docs/content/getting_started/_index.en.md
index 5e085dfa740..b55438299fe 100644
--- a/docs/content/getting_started/_index.en.md
+++ b/docs/content/getting_started/_index.en.md
@@ -1,4 +1,4 @@
-
+
 +++
 disableToc = false
 title = "Getting started"
@@ -6,7 +6,11 @@ weight = 1
 url = '/basics/getting_started/'
 +++
-`LocalAI` is available as a container image and binary. It can be used with docker, podman, kubernetes and any container engine. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).
+`LocalAI` is available as a container image and binary. It can be used with Docker, Podman, Kubernetes and any other container engine.
+Container images are published to [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest) and [Docker Hub](https://hub.docker.com/r/localai/localai).
+
+[](https://hub.docker.com/r/localai/localai)
+[](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
 See also our [How to]({{%relref "howtos" %}}) section for end-to-end guided examples curated by the community.
@@ -113,6 +117,11 @@ helm show values go-skynet/local-ai > values.yaml
 helm install local-ai go-skynet/local-ai -f values.yaml
 ```
+{{% /tab %}}
+{{% tab name="From binary" %}}
+
+LocalAI binary releases are available on [GitHub](https://github.com/go-skynet/LocalAI/releases).
+
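+A minimal sketch, assuming the release asset for your platform has been downloaded and saved as `local-ai` (the available flags are documented in the [advanced section]({{%relref "advanced" %}})):
+
+```bash
+# make the downloaded release binary executable
+chmod +x local-ai
+# start the API on the default :8080 address, with models stored in a local directory
+./local-ai --models-path ./models
+```
+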
 {{% /tab %}}
 {{% tab name="From source" %}}
@@ -133,37 +142,44 @@ Note: this feature currently is available only on master builds.
 You can run `local-ai` directly with a model name, and it will download the model and start the API with the model loaded.
-#### CPU-only
+> Don't need GPU acceleration? Use the CPU images, which are lighter and do not have Nvidia dependencies.
-> You can use these images which are lighter and do not have Nvidia dependencies
+{{< tabs >}}
+{{% tab name="CPU-only" %}}
 | Model | Docker command |
 | --- | --- |
-| phi2 | ```docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg-core phi-2``` |
-| llava | ```docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg-core llava``` |
-| mistral-openorca | ```docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg-core mistral-openorca``` |
-
-#### GPU (CUDA 11)
+| phi2 | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
+| llava | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava``` |
+| mistral-openorca | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
-For accellerated images with Nvidia and CUDA11, use the following images.
+
+{{% /tab %}}
+{{% tab name="GPU (CUDA 11)" %}}
-> If you do not know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version`
+> To find out which version of CUDA you have available, check with `nvidia-smi` or `nvcc --version`.
 | Model | Docker command |
 | --- | --- |
-| phi-2 | ```docker run -p 8080:8080 --gpus all -ti --rm quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-core phi-2``` |
-| llava | ```docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-core llava``` |
-| mistral-openorca | ```docker run -p 8080:8080 --gpus all -ti --rm quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
+| phi-2 | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
+| llava | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava``` |
+| mistral-openorca | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
-#### GPU (CUDA 12)
+{{% /tab %}}
-> If you do not know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version`
+{{% tab name="GPU (CUDA 12)" %}}
+
+> To find out which version of CUDA you have available, check with `nvidia-smi` or `nvcc --version`.
 | Model | Docker command |
 | --- | --- |
-| phi-2 | ```docker run -p 8080:8080 -ti --gpus all --rm quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-core phi-2``` |
-| llava | ```docker run -p 8080:8080 -ti --gpus all --rm quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-core llava``` |
-| mistral-openorca | ```docker run -p 8080:8080 --gpus all -ti --rm quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
+| phi-2 | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
+| llava | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava``` |
+| mistral-openorca | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
+
+{{% /tab %}}
+
+{{< /tabs >}}
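+
+Whichever image you started, the API is exposed on port 8080 and follows the OpenAI API conventions. As a quick smoke test (a sketch assuming you pulled `phi-2` as in the tables above):
+
+```bash
+# list the models known to the running instance
+curl http://localhost:8080/v1/models
+
+# send a first chat completion request to the model loaded at startup
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "phi-2",
+  "messages": [{"role": "user", "content": "How are you doing?"}]
+}'
+```
+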
 {{% notice note %}}
@@ -182,7 +198,7 @@ local-ai --models github://owner/repo/file.yaml@branch --models github://owner/r
 For example, to start localai with phi-2, it's possible for instance to also use a full config file from gists:
 ```bash
-./local-ai https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml
+docker run -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml
 ```
 The file should be a valid YAML configuration file, for the full syntax see [advanced]({{%relref "advanced" %}}).
@@ -284,208 +300,9 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso
 To see other model configurations, see also the example section [here](https://github.com/mudler/LocalAI/tree/master/examples/configurations).
-
-### From binaries
-
-LocalAI binary releases are available in [Github](https://github.com/go-skynet/LocalAI/releases).
-
-You can control LocalAI with command line arguments, to specify a binding address, or the number of threads.
-
-### CLI parameters
-
-| Parameter | Environmental Variable | Default Variable | Description |
-| ------------------------------ | ------------------------------- | -------------------------------------------------- | ------------------------------------------------------------------- |
-| --f16 | $F16 | false | Enable f16 mode |
-| --debug | $DEBUG | false | Enable debug mode |
-| --cors | $CORS | false | Enable CORS support |
-| --cors-allow-origins value | $CORS_ALLOW_ORIGINS | | Specify origins allowed for CORS |
-| --threads value | $THREADS | 4 | Number of threads to use for parallel computation |
-| --models-path value | $MODELS_PATH | ./models | Path to the directory containing models used for inferencing |
-| --preload-models value | $PRELOAD_MODELS | | List of models to preload in JSON format at startup |
-| --preload-models-config value | $PRELOAD_MODELS_CONFIG | | A config with a list of models to apply at startup. Specify the path to a YAML config file |
-| --config-file value | $CONFIG_FILE | | Path to the config file |
-| --address value | $ADDRESS | :8080 | Specify the bind address for the API server |
-| --image-path value | $IMAGE_PATH | | Path to the directory used to store generated images |
-| --context-size value | $CONTEXT_SIZE | 512 | Default context size of the model |
-| --upload-limit value | $UPLOAD_LIMIT | 15 | Default upload limit in megabytes (audio file upload) |
-| --galleries | $GALLERIES | | Allows to set galleries from command line |
-|--parallel-requests | $PARALLEL_REQUESTS | false | Enable backends to handle multiple requests in parallel. This is for backends that supports multiple requests in parallel, like llama.cpp or vllm |
-| --single-active-backend | $SINGLE_ACTIVE_BACKEND | false | Allow only one backend to be running |
-| --api-keys value | $API_KEY | empty | List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys.
-| --enable-watchdog-idle | $WATCHDOG_IDLE | false | Enable watchdog for stopping idle backends. This will stop the backends if are in idle state for too long. (default: false) [$WATCHDOG_IDLE]
-| --enable-watchdog-busy | $WATCHDOG_BUSY | false | Enable watchdog for stopping busy backends that exceed a defined threshold.|
-| --watchdog-busy-timeout value | $WATCHDOG_BUSY_TIMEOUT | 5m | Watchdog timeout. This will restart the backend if it crashes. |
-| --watchdog-idle-timeout value | $WATCHDOG_IDLE_TIMEOUT | 15m | Watchdog idle timeout. This will restart the backend if it crashes. |
-| --preload-backend-only | $PRELOAD_BACKEND_ONLY | false | If set, the api is NOT launched, and only the preloaded models / backends are started. This is intended for multi-node setups. |
-| --external-grpc-backends | EXTERNAL_GRPC_BACKENDS | none | Comma separated list of external gRPC backends to use. Format: `name:host:port` or `name:/path/to/file` |
-
-### Run LocalAI in Kubernetes
-
-LocalAI can be installed inside Kubernetes with helm.
-
-Requirements:
-- SSD storage class, or disable `mmap` to load the whole model in memory
-
-
-By default, the helm chart will install LocalAI instance using the ggml-gpt4all-j model without persistent storage.
-
-1. Add the helm repo
-   ```bash
-   helm repo add go-skynet https://go-skynet.github.io/helm-charts/
-   ```
-2. Install the helm chart:
-   ```bash
-   helm repo update
-   helm install local-ai go-skynet/local-ai -f values.yaml
-   ```
-> **Note:** For further configuration options, see the [helm chart repository on GitHub](https://github.com/go-skynet/helm-charts).
-### Example values
-Deploy a single LocalAI pod with 6GB of persistent storage serving up a `ggml-gpt4all-j` model with custom prompt.
-```yaml
-### values.yaml
-
-replicaCount: 1
-
-deployment:
-  image: quay.io/go-skynet/local-ai:latest ##(This is for CPU only, to use GPU change it to a image that supports GPU IE "v2.0.0-cublas-cuda12-core")
-  env:
-    threads: 4
-    context_size: 512
-  modelsPath: "/models"
-
-resources:
-  {}
-  # We usually recommend not to specify default resources and to leave this as a conscious
-  # choice for the user. This also increases chances charts run on environments with little
-  # resources, such as Minikube. If you do want to specify resources, uncomment the following
-  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
-  # limits:
-  #   cpu: 100m
-  #   memory: 128Mi
-  # requests:
-  #   cpu: 100m
-  #   memory: 128Mi
-
-# Prompt templates to include
-# Note: the keys of this map will be the names of the prompt template files
-promptTemplates:
-  {}
-  # ggml-gpt4all-j.tmpl: |
-  #   The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
-  #   ### Prompt:
-  #   {{.Input}}
-  #   ### Response:
-
-# Models to download at runtime
-models:
-  # Whether to force download models even if they already exist
-  forceDownload: false
-
-  # The list of URLs to download models from
-  # Note: the name of the file will be the name of the loaded model
-  list:
-    - url: "https://gpt4all.io/models/ggml-gpt4all-j.bin"
-      # basicAuth: base64EncodedCredentials
-
-  # Persistent storage for models and prompt templates.
-  # PVC and HostPath are mutually exclusive. If both are enabled,
-  # PVC configuration takes precedence. If neither are enabled, ephemeral
-  # storage is used.
-  persistence:
-    pvc:
-      enabled: false
-      size: 6Gi
-      accessModes:
-        - ReadWriteOnce
-
-      annotations: {}
-
-      # Optional
-      storageClass: ~
-
-    hostPath:
-      enabled: false
-      path: "/models"
-
-service:
-  type: ClusterIP
-  port: 80
-  annotations: {}
-  # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
-  # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
-
-ingress:
-  enabled: false
-  className: ""
-  annotations:
-    {}
-    # kubernetes.io/ingress.class: nginx
-    # kubernetes.io/tls-acme: "true"
-  hosts:
-    - host: chart-example.local
-      paths:
-        - path: /
-          pathType: ImplementationSpecific
-  tls: []
-  #  - secretName: chart-example-tls
-  #    hosts:
-  #      - chart-example.local
-
-nodeSelector: {}
-
-tolerations: []
-
-affinity: {}
-```
-
-
-
-### Other examples
+### Examples
 ![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)
 To see other examples on how to integrate with other projects for instance for question answering or for using it with chatbot-ui, see: [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/).
-
-### Clients
-
-OpenAI clients are already compatible with LocalAI by overriding the basePath, or the target URL.
-
-## Javascript
-
-
-
-https://github.com/openai/openai-node/
-
-```javascript
-import { Configuration, OpenAIApi } from 'openai';
-
-const configuration = new Configuration({
-  basePath: `http://localhost:8080/v1`
-});
-const openai = new OpenAIApi(configuration);
-```
-
-
-
-## Python
-
-
-
-https://github.com/openai/openai-python
-
-Set the `OPENAI_API_BASE` environment variable, or by code:
-
-```python
-import openai
-
-openai.api_base = "http://localhost:8080/v1"
-
-# create a chat completion
-chat_completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello world"}])
-
-# print the completion
-print(completion.choices[0].message.content)
-```
-
-