From 015835dba2854572d50e167b7cade05af41ed214 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Mon, 21 Oct 2024 11:47:52 +0200
Subject: [PATCH] models(gallery): add phi-3 vision (#3890)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml        | 13 +++++++++++++
 gallery/phi-3-vision.yaml | 23 +++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 gallery/phi-3-vision.yaml

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 46cdc9544c3..28cd50b650b 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -6225,6 +6225,19 @@
     - filename: Phi-3.5-mini-TitanFusion-0.2.Q4_K_M.gguf
       sha256: 9579305712f2bca246914639c4873acdc1e7bc64ac2c7db0230df4f0ca0ef234
       uri: huggingface://mradermacher/Phi-3.5-mini-TitanFusion-0.2-GGUF/Phi-3.5-mini-TitanFusion-0.2.Q4_K_M.gguf
+- !!merge <<: *phi-3
+  name: "phi-3-vision:vllm"
+  url: "github:mudler/LocalAI/gallery/phi-3-vision.yaml@master"
+  description: |
+    Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures.
+- !!merge <<: *phi-3
+  name: "phi-3.5-vision:vllm"
+  url: "github:mudler/LocalAI/gallery/phi-3-vision.yaml@master"
+  override:
+    parameters:
+      model: microsoft/Phi-3.5-vision-instruct
+  description: |
+    Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures.
 - &hermes-2-pro-mistral
   ### START Hermes
   url: "github:mudler/LocalAI/gallery/hermes-2-pro-mistral.yaml@master"
diff --git a/gallery/phi-3-vision.yaml b/gallery/phi-3-vision.yaml
new file mode 100644
index 00000000000..1a3d03af492
--- /dev/null
+++ b/gallery/phi-3-vision.yaml
@@ -0,0 +1,23 @@
+---
+name: "phi3-vision"
+
+config_file: |
+  name: phi3-vision
+  backend: vllm
+  parameters:
+    model: microsoft/Phi-3-vision-128k-instruct
+    trust_remote_code: true
+    max_model_len: 32768
+  template:
+    chat_message: |-
+      <|{{ .RoleName }}|>
+      {{.Content}}<|end|>
+    chat: >-
+      {{.Input}}
+
+      <|assistant|>
+
+    completion: |
+      {{.Input}}
+  use_tokenizer_template: false
+  image: "<|image_{{ add1 .ID }}|>\n{{.Text}}"
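
For reviewers trying the new entries: the image template in gallery/phi-3-vision.yaml rewrites each attached image into the <|image_N|> placeholder that Phi-3 vision expects (add1 .ID makes the index 1-based), so image parts sent through LocalAI's OpenAI-compatible chat endpoint should reach the vLLM backend in the right format. A minimal client sketch, not part of the patch, assuming a LocalAI instance on localhost:8080 with phi-3.5-vision:vllm already installed from the gallery; the image URL and prompt are placeholders:

# Sketch only: assumes LocalAI is running on http://localhost:8080 and the
# "phi-3.5-vision:vllm" gallery model has been installed; the image URL and
# prompt are placeholders.
from openai import OpenAI

# LocalAI exposes an OpenAI-compatible API, so the stock openai client works.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="phi-3.5-vision:vllm",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this picture?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/sample.jpg"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)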