From 015835dba2854572d50e167b7cade05af41ed214 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Mon, 21 Oct 2024 11:47:52 +0200
Subject: [PATCH] models(gallery): add phi-3 vision (#3890)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml        | 13 +++++++++++++
 gallery/phi-3-vision.yaml | 23 +++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 gallery/phi-3-vision.yaml

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 46cdc9544c3..28cd50b650b 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -6225,6 +6225,19 @@
     - filename: Phi-3.5-mini-TitanFusion-0.2.Q4_K_M.gguf
       sha256: 9579305712f2bca246914639c4873acdc1e7bc64ac2c7db0230df4f0ca0ef234
       uri: huggingface://mradermacher/Phi-3.5-mini-TitanFusion-0.2-GGUF/Phi-3.5-mini-TitanFusion-0.2.Q4_K_M.gguf
+- !!merge <<: *phi-3
+  name: "phi-3-vision:vllm"
+  url: "github:mudler/LocalAI/gallery/phi-3-vision.yaml@master"
+  description: |
+    Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures.
+- !!merge <<: *phi-3
+  name: "phi-3.5-vision:vllm"
+  url: "github:mudler/LocalAI/gallery/phi-3-vision.yaml@master"
+  override:
+    parameters:
+      model: microsoft/Phi-3.5-vision-instruct
+  description: |
+    Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures.
 - &hermes-2-pro-mistral
   ### START Hermes
   url: "github:mudler/LocalAI/gallery/hermes-2-pro-mistral.yaml@master"
diff --git a/gallery/phi-3-vision.yaml b/gallery/phi-3-vision.yaml
new file mode 100644
index 00000000000..1a3d03af492
--- /dev/null
+++ b/gallery/phi-3-vision.yaml
@@ -0,0 +1,23 @@
+---
+name: "phi3-vision"
+
+config_file: |
+  name: phi3-vision
+  backend: vllm
+  parameters:
+    model: microsoft/Phi-3-vision-128k-instruct
+    trust_remote_code: true
+    max_model_len: 32768
+  template:
+    chat_message: |-
+      <|{{ .RoleName }}|>
+      {{.Content}}<|end|>
+    chat: >-
+      {{.Input}}
+
+      <|assistant|>
+
+    completion: |
+      {{.Input}}
+  use_tokenizer_template: false
+  image: "<|image_{{ add1 .ID }}|>\n{{.Text}}"
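
For reviewers trying the new entries: the image template in gallery/phi-3-vision.yaml rewrites each attached image into the <|image_N|> placeholder that Phi-3 vision expects (add1 .ID makes the index 1-based), so image parts sent through LocalAI's OpenAI-compatible chat endpoint should reach the vLLM backend in the right format. A minimal client sketch, not part of the patch, assuming a LocalAI instance on localhost:8080 with phi-3.5-vision:vllm already installed from the gallery; the image URL and prompt are placeholders:

# Sketch only: assumes LocalAI is running on http://localhost:8080 and the
# "phi-3.5-vision:vllm" gallery model has been installed; the image URL and
# prompt are placeholders.
from openai import OpenAI

# LocalAI exposes an OpenAI-compatible API, so the stock openai client works.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="phi-3.5-vision:vllm",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this picture?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/sample.jpg"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)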