diff --git a/docker/run-vllm-qwen.sh b/docker/run-vllm-qwen.sh
new file mode 100755
index 000000000..6d3989053
--- /dev/null
+++ b/docker/run-vllm-qwen.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+export HUGGING_FACE_HUB_TOKEN=hf_XXX-XXX-XXX
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+docker run \
+  --name vllm-qwen-vl \
+  --network vllm-qwen-vl \
+  --gpus all \
+  --runtime=nvidia \
+  --ipc=host \
+  --rm --init \
+  -p 8000:8000 \
+  -v /opt/vllm:/root/.cache/huggingface \
+  vllm/vllm-openai:latest --model Qwen/Qwen2.5-VL-32B-Instruct --served-model-name "Qwen2.5-VL-32B-Instruct" --tensor-parallel-size 4 --max_model_len 32768 --enable-auto-tool-choice --tool-call-parser qwen
\ No newline at end of file
diff --git a/packages/bytebot-llm-proxy/litellm-config.yaml b/packages/bytebot-llm-proxy/litellm-config.yaml
index ff063c345..bc4c4fee5 100644
--- a/packages/bytebot-llm-proxy/litellm-config.yaml
+++ b/packages/bytebot-llm-proxy/litellm-config.yaml
@@ -28,3 +28,12 @@ model_list:
     litellm_params:
       model: gemini/gemini-2.5-flash
       api_key: os.environ/GEMINI_API_KEY
+
+  # Self-hosted vLLM Models
+  - model_name: VM426:Qwen2.5-VL-32B-Instruct
+    litellm_params:
+      model: openai/Qwen2.5-VL-32B-Instruct
+      api_base: https://XXX-XXX-XXX-XXX/v1
+      supports_function_calling: true
+      drop_params: true
+      temperature: 0.1
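
As a quick verification of the change, one possible smoke test is to call the vLLM OpenAI-compatible endpoint directly once the container is up, using the port (8000) and served model name from the diff above. The host "localhost" and the test prompt are assumptions for illustration; adjust the host to wherever the container actually runs, and route through the LiteLLM proxy instead if you want to exercise the new model_list entry end to end.

    # Minimal sketch: check that the vLLM server answers chat completions.
    # Assumes it is reachable at localhost:8000 (from "-p 8000:8000" above).
    curl -s http://localhost:8000/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{
            "model": "Qwen2.5-VL-32B-Instruct",
            "messages": [{"role": "user", "content": "Say hello"}],
            "max_tokens": 32
          }'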