Commit 22b6494

[Frontend][last/5] Make pooling entrypoints request schema consensus. (vllm-project#31127)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
1 parent 7c233db commit 22b6494

24 files changed

Lines changed: 658 additions & 612 deletions

.buildkite/test-amd.yaml

Lines changed: 1 addition & 1 deletion
@@ -514,7 +514,7 @@ steps:
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  # for pooling models
- - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ - python3 pooling/embed/vision_embedding_offline.py --seed 0
  # for features demo
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -453,7 +453,7 @@ steps:
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  # for pooling models
- - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ - python3 pooling/embed/vision_embedding_offline.py --seed 0
  # for features demo
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py

.buildkite/test_areas/misc.yaml

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ steps:
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  # for pooling models
- - python3 pooling/pooling/vision_language_pooling.py --seed 0
+ - python3 pooling/embed/vision_embedding_offline.py --seed 0
  # for features demo
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py

docs/features/multimodal_inputs.md

Lines changed: 1 addition & 1 deletion
@@ -510,7 +510,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
  If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.

  For certain models, we provide alternative chat templates inside [examples](../../examples).
- For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
+ For example, VLM2Vec uses [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.

  ### Image Inputs
docs/serving/openai_compatible_server.md

Lines changed: 31 additions & 29 deletions
@@ -311,15 +311,15 @@ and passing a list of `messages` in the request. Refer to the examples below for
  vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
      --trust-remote-code \
      --max-model-len 4096 \
-     --chat-template examples/template_vlm2vec_phi3v.jinja
+     --chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
  ```

  !!! important
      Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--runner pooling`
      to run this model in embedding mode instead of text generation mode.

      The custom chat template is completely different from the original one for this model,
-     and can be found here: [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja)
+     and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)

  Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
@@ -359,14 +359,14 @@ and passing a list of `messages` in the request. Refer to the examples below for
  vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
      --trust-remote-code \
      --max-model-len 8192 \
-     --chat-template examples/template_dse_qwen2_vl.jinja
+     --chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
  ```

  !!! important
      Like with VLM2Vec, we have to explicitly pass `--runner pooling`.

      Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
-     by a custom chat template: [examples/template_dse_qwen2_vl.jinja](../../examples/template_dse_qwen2_vl.jinja)
+     by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja)

  !!! important
      `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
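A hedged sketch of what such a text-query request with a placeholder image might look like. The stand-in bytes, the data-URL scheme, the query text, and the endpoint path are all assumptions; a real client would encode an actual minimal PNG, and the full referenced example is authoritative.

```python
import base64

# NOTE: hypothetical stand-in bytes, used here only to show the payload shape.
# A real client base64-encodes an actual minimal-size PNG image instead.
placeholder_b64 = base64.b64encode(b"\x89PNG\r\n\x1a\nplaceholder").decode("utf-8")

payload = {
    "model": "MrLight/dse-qwen2-2b-mrl-v1",
    "messages": [
        {
            "role": "user",
            "content": [
                # Placeholder image slot required even for text-only queries.
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{placeholder_b64}"},
                },
                {"type": "text", "text": "Query: example text query"},
            ],
        }
    ],
}

# With a server running, the query would be posted roughly like this:
# import requests
# requests.post("http://localhost:8000/v1/embeddings", json=payload)
```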
@@ -532,15 +532,15 @@ The following [sampling parameters](../api/README.md#inference-parameters) are s
  ??? code

      ```python
-     --8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
+     --8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-sampling-params"
      ```

  The following extra parameters are supported:

  ??? code

      ```python
-     --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
+     --8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-extra-params"
      ```

  ### Translations API
@@ -560,13 +560,13 @@ Code example: [examples/online_serving/openai_translation_client.py](../../examp
  The following [sampling parameters](../api/README.md#inference-parameters) are supported.

  ```python
- --8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
+ --8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-sampling-params"
  ```

  The following extra parameters are supported:

  ```python
- --8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
+ --8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-extra-params"
  ```

  ### Realtime API
@@ -954,28 +954,34 @@ You can pass multi-modal inputs to scoring models by passing `content` including

  ```python
  import requests

  response = requests.post(
      "http://localhost:8000/v1/score",
      json={
          "model": "jinaai/jina-reranker-m0",
          "queries": "slm markdown",
-         "documents": {
-             "content": [
-                 {
-                     "type": "image_url",
-                     "image_url": {
-                         "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
-                     },
-                 },
-                 {
-                     "type": "image_url",
-                     "image_url": {
-                         "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-                     },
-                 },
-             ],
-         },
+         "documents": [
+             {
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+                         },
+                     }
+                 ],
+             },
+             {
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+                         },
+                     }
+                 ]
+             },
+         ],
      },
  )
  response.raise_for_status()
@@ -1001,15 +1007,13 @@ The following Score API parameters are supported:

  ```python
  --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
- --8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
  ```

  The following extra parameters are supported:

  ```python
  --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
  --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
- --8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
  ```

  ### Re-rank API
@@ -1092,15 +1096,13 @@ The following Re-rank API parameters are supported:
  ```python
  --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
  --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
- --8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
  ```

  The following extra parameters are supported:

  ```python
  --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
  --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
- --8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params"
  ```

  ## Ray Serve LLM
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+ # ruff: noqa: E501
+ """Example Python client for multimodal classification API using vLLM API server
+
+ NOTE:
+     start a supported multimodal classification model server with `vllm serve`, e.g.
+     vllm serve muziyongshixin/Qwen2.5-VL-7B-for-VideoCls \
+         --runner pooling \
+         --max-model-len 5000 \
+         --limit-mm-per-prompt '{"video": 1}' \
+         --hf-overrides '{"text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}}'
+ """
+
+ import argparse
+ import pprint
+
+ import requests
+
+ from vllm.multimodal.utils import encode_image_url, fetch_image
+
+ input_text = "This product was excellent and exceeded my expectations"
+ image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
+ image_base64 = {"url": encode_image_url(fetch_image(image_url))}
+ video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
+
+
+ def parse_args():
+     parse = argparse.ArgumentParser()
+     parse.add_argument("--host", type=str, default="localhost")
+     parse.add_argument("--port", type=int, default=8000)
+     return parse.parse_args()
+
+
+ def main(args):
+     base_url = f"http://{args.host}:{args.port}"
+     models_url = base_url + "/v1/models"
+     classify_url = base_url + "/classify"
+
+     response = requests.get(models_url)
+     model_name = response.json()["data"][0]["id"]
+
+     print("Text classification output:")
+     messages = [
+         {
+             "role": "assistant",
+             "content": "Please classify this text request.",
+         },
+         {
+             "role": "user",
+             "content": input_text,
+         },
+     ]
+     response = requests.post(
+         classify_url,
+         json={"model": model_name, "messages": messages},
+     )
+     pprint.pprint(response.json())
+
+     print("Image url classification output:")
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "Please classify this image."},
+                 {"type": "image_url", "image_url": {"url": image_url}},
+             ],
+         }
+     ]
+     response = requests.post(
+         classify_url,
+         json={"model": model_name, "messages": messages},
+     )
+     pprint.pprint(response.json())
+
+     print("Image base64 classification output:")
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "Please classify this image."},
+                 {"type": "image_url", "image_url": image_base64},
+             ],
+         }
+     ]
+     response = requests.post(
+         classify_url,
+         json={"model": model_name, "messages": messages},
+     )
+     pprint.pprint(response.json())
+
+     print("Video url classification output:")
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "Please classify this video."},
+                 {"type": "video_url", "video_url": {"url": video_url}},
+             ],
+         }
+     ]
+     response = requests.post(
+         classify_url,
+         json={"model": model_name, "messages": messages},
+     )
+     pprint.pprint(response.json())
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+     main(args)
File renamed without changes.
File renamed without changes.
File renamed without changes.
