Video: support up to 7 reference images with @图N/@imageN prompt placeholders
diff --git a/_public/static/i18n/locales/en.json b/_public/static/i18n/locales/en.json
index f9252c136..999f699e9 100644
--- a/_public/static/i18n/locales/en.json
+++ b/_public/static/i18n/locales/en.json
@@ -503,13 +503,14 @@
"video": {
"pageTitle": "Grok2API - Video Generation",
"title": "Video Generation",
- "subtitle": "Generate short videos with reference images and preset styles.",
+ "subtitle": "Generate short videos with up to 7 reference images, @imageN placeholders, and preset styles.",
"startGenerate": "Generate",
"genSettings": "Generation Settings",
"prompt": "Prompt",
- "promptPlaceholder": "e.g.: neon rain at night on the street, slow motion, film grain",
+ "promptPlaceholder": "e.g.: @image1 neon rainy street at night, @image2 subject looking back and smiling, slow motion, film grain",
+ "promptTip": "Use @image1 to @image7 (or @图1 to @图7) in the prompt to reference images by upload order.",
"referenceImage": "Reference Image",
- "referenceImagePlaceholder": "https://... or data:image/...",
+ "referenceImagePlaceholder": "One https://... or data:image/... per line, up to 7 images",
"aspectRatio": "Aspect Ratio",
"ratio3_2": "3:2 Landscape",
"ratio2_3": "2:3 Portrait",
@@ -538,7 +539,9 @@
"superResolution": "Super Resolution",
"superResolutionInProgress": "Super resolution in progress",
"alreadyGenerating": "Already generating",
- "referenceConflict": "Reference image: choose either URL/Base64 or file upload",
+ "referenceConflict": "Reference images: choose either URL/Base64 list or file upload",
+ "referenceLimit": "A maximum of 7 reference images is supported",
+ "noReferenceSelected": "No reference images selected",
"downloadFailed": "Download failed, please check if the video link is accessible",
"sec6": "6s",
"sec10": "10s",
diff --git a/_public/static/i18n/locales/zh.json b/_public/static/i18n/locales/zh.json
index 409aa3dd2..329ed50ba 100644
--- a/_public/static/i18n/locales/zh.json
+++ b/_public/static/i18n/locales/zh.json
@@ -503,13 +503,14 @@
"video": {
"pageTitle": "Grok2API - Video 视频生成",
"title": "Video 视频生成",
- "subtitle": "生成短视频,支持参考图与多种预设风格。",
+ "subtitle": "生成短视频,支持最多 7 张参考图、@图N 引用与多种预设风格。",
"startGenerate": "开始生成",
"genSettings": "生成设置",
"prompt": "提示词",
- "promptPlaceholder": "例如:街头霓虹雨夜,慢镜头,胶片质感",
+ "promptPlaceholder": "例如:@图1街头霓虹雨夜,@图2人物回头微笑,慢镜头,胶片质感",
+ "promptTip": "多图参考可在提示词中使用 @图1 到 @图7,按参考图顺序对应。",
"referenceImage": "参考图",
- "referenceImagePlaceholder": "https://... 或 data:image/...",
+ "referenceImagePlaceholder": "每行一个 https://... 或 data:image/...,最多 7 张",
"aspectRatio": "画面比例",
"ratio3_2": "3:2 横构图",
"ratio2_3": "2:3 竖构图",
@@ -538,7 +539,9 @@
"superResolution": "超分辨率",
"superResolutionInProgress": "超分辨率中",
"alreadyGenerating": "已在生成中",
- "referenceConflict": "参考图只能选择其一:URL/Base64 或 本地上传",
+ "referenceConflict": "参考图只能选择其一:URL/Base64 列表 或 本地上传",
+ "referenceLimit": "参考图最多支持 7 张",
+ "noReferenceSelected": "未选择参考图",
"downloadFailed": "下载失败,请检查视频链接是否可访问",
"sec6": "6 秒",
"sec10": "10 秒",
diff --git a/app/api/v1/function/video.py b/app/api/v1/function/video.py
index 2706ced98..ccd4cd36c 100644
--- a/app/api/v1/function/video.py
+++ b/app/api/v1/function/video.py
@@ -49,7 +49,7 @@ async def _new_session(
video_length: int,
resolution_name: str,
preset: str,
- image_url: Optional[str],
+ image_urls: Optional[List[str]],
reasoning_effort: Optional[str],
) -> str:
task_id = uuid.uuid4().hex
@@ -62,7 +62,7 @@ async def _new_session(
"video_length": video_length,
"resolution_name": resolution_name,
"preset": preset,
- "image_url": image_url,
+ "image_urls": image_urls or [],
"reasoning_effort": reasoning_effort,
"created_at": now,
}
@@ -123,13 +123,23 @@ def _validate_image_url(image_url: str) -> None:
)
+def _normalize_image_urls(values: Optional[List[str]]) -> List[str]:
+ normalized: List[str] = []
+ if isinstance(values, list):
+ for item in values:
+ value = (item or "").strip()
+ if value:
+ normalized.append(value)
+ return normalized
+
+
class VideoStartRequest(BaseModel):
prompt: str
aspect_ratio: Optional[str] = "3:2"
video_length: Optional[int] = 6
resolution_name: Optional[str] = "480p"
preset: Optional[str] = "normal"
- image_url: Optional[str] = None
+ image_urls: Optional[List[str]] = None
reasoning_effort: Optional[str] = None
@@ -166,8 +176,12 @@ async def function_video_start(data: VideoStartRequest):
detail="preset must be one of ['fun','normal','spicy','custom']",
)
- image_url = (data.image_url or "").strip() or None
- if image_url:
+ image_urls = _normalize_image_urls(data.image_urls)
+ if len(image_urls) > 7:
+ raise HTTPException(
+ status_code=400, detail="image_urls supports at most 7 references"
+ )
+ for image_url in image_urls:
_validate_image_url(image_url)
reasoning_effort = (data.reasoning_effort or "").strip() or None
@@ -185,7 +199,7 @@ async def function_video_start(data: VideoStartRequest):
video_length,
resolution_name,
preset,
- image_url,
+ image_urls,
reasoning_effort,
)
return {"task_id": task_id, "aspect_ratio": aspect_ratio}
@@ -202,7 +216,11 @@ async def function_video_sse(request: Request, task_id: str = Query("")):
video_length = int(session.get("video_length") or 6)
resolution_name = str(session.get("resolution_name") or "480p")
preset = str(session.get("preset") or "normal")
- image_url = session.get("image_url")
+ image_urls = [
+ str(item).strip()
+ for item in (session.get("image_urls") or [])
+ if str(item).strip()
+ ]
reasoning_effort = session.get("reasoning_effort")
async def event_stream():
@@ -218,14 +236,16 @@ async def event_stream():
yield "data: [DONE]\n\n"
return
- if image_url:
+ if image_urls:
+ content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
+ for image_url in image_urls:
+ content.append(
+ {"type": "image_url", "image_url": {"url": image_url}}
+ )
messages: List[Dict[str, Any]] = [
{
"role": "user",
- "content": [
- {"type": "text", "text": prompt},
- {"type": "image_url", "image_url": {"url": image_url}},
- ],
+ "content": content,
}
]
else:
diff --git a/app/api/v1/video.py b/app/api/v1/video.py
index 650655589..61ae50b2f 100644
--- a/app/api/v1/video.py
+++ b/app/api/v1/video.py
@@ -46,8 +46,13 @@ class VideoCreateRequest(BaseModel):
size: Optional[str] = Field("1792x1024", description="Output size")
seconds: Optional[int] = Field(6, description="Video length in seconds")
quality: Optional[str] = Field("standard", description="Quality: standard/high")
- image_reference: Optional[Any] = Field(None, description="Structured image reference")
- input_reference: Optional[Any] = Field(None, description="Multipart input reference file")
+ image_reference: Optional[Any] = Field(
+ None,
+ description="Image references using chat/completions content-block array format: [{type:'image_url', image_url:{url:'...'}}] or an array of plain URL strings",
+ )
+ input_reference: Optional[Any] = Field(
+ None, description="Multipart input reference file"
+ )
class VideoExtendDirectRequest(BaseModel):
@@ -57,7 +62,8 @@ class VideoExtendDirectRequest(BaseModel):
prompt: str = Field(..., description="Prompt text mapped to message/originalPrompt")
reference_id: str = Field(
- ..., description="Reference id mapped to extendPostId/originalPostId/parentPostId"
+ ...,
+ description="Reference id mapped to extendPostId/originalPostId/parentPostId",
)
start_time: float = Field(..., description="Mapped to videoExtensionStartTime")
ratio: str = Field("2:3", description="Mapped to aspectRatio")
@@ -72,7 +78,9 @@ def _raise_validation_error(exc: ValidationError) -> None:
loc = first.get("loc", [])
msg = first.get("msg", "Invalid request")
code = first.get("type", "invalid_value")
- param_parts = [str(x) for x in loc if not (isinstance(x, int) or str(x).isdigit())]
+ param_parts = [
+ str(x) for x in loc if not (isinstance(x, int) or str(x).isdigit())
+ ]
param = ".".join(param_parts) if param_parts else None
raise ValidationException(message=msg, param=param, code=code)
raise ValidationException(message="Invalid request", code="invalid_value")
@@ -165,58 +173,98 @@ def _validate_reference_value(value: str, param: str) -> str:
)
-def _parse_image_reference(value: Any) -> Optional[str]:
+def _parse_image_reference_item(value: Any, idx: int) -> str:
+ """Parse a single image reference item inside an array."""
+ param_prefix = f"image_reference[{idx}]" if idx is not None else "image_reference"
+
+ if isinstance(value, str):
+ stripped = value.strip()
+ if not stripped:
+ raise ValidationException(
+ message=f"{param_prefix} cannot be empty",
+ param=param_prefix,
+ code="invalid_reference",
+ )
+ return _validate_reference_value(stripped, param_prefix)
+
+ if isinstance(value, dict):
+ block_type = value.get("type")
+ if block_type != "image_url":
+ raise ValidationException(
+ message=f'{param_prefix} must have type="image_url"',
+ param=f"{param_prefix}.type",
+ code="invalid_reference",
+ )
+ inner = value.get("image_url")
+ if not isinstance(inner, dict):
+ raise ValidationException(
+ message=f"{param_prefix}.image_url must be an object with a url field",
+ param=f"{param_prefix}.image_url",
+ code="invalid_reference",
+ )
+ url = inner.get("url", "")
+ if not isinstance(url, str) or not url.strip():
+ raise ValidationException(
+ message=f"{param_prefix}.image_url.url cannot be empty",
+ param=f"{param_prefix}.image_url.url",
+ code="invalid_reference",
+ )
+ return _validate_reference_value(url.strip(), f"{param_prefix}.image_url.url")
+
+ raise ValidationException(
+ message=(
+ f"{param_prefix} must be a URL string or "
+ f'{{"type": "image_url", "image_url": {{"url": "..."}}}}'
+ ),
+ param=param_prefix,
+ code="invalid_reference",
+ )
+
+
+def _parse_image_references(value: Any) -> List[str]:
+ """Parse image_reference into a list of validated URL strings.
+
+ Uses the same content-block format as chat/completions.
+ Accepts:
+ - None / "" -> []
+ - ["url", {"type": "image_url", ...}, ...] -> [url, ...]
+ - JSON string of an array (for multipart/form-data)
+ """
if value is None or value == "":
- return None
+ return []
if isinstance(value, str):
stripped = value.strip()
if not stripped:
- return None
- if stripped[0] in {"{", "["}:
+ return []
+ if stripped[0] == "[":
try:
value = orjson.loads(stripped)
except orjson.JSONDecodeError:
- # allow plain url/data-uri in multipart text field as a practical fallback
- return _validate_reference_value(stripped, "image_reference")
+ raise ValidationException(
+ message="image_reference must be a JSON array string",
+ param="image_reference",
+ code="invalid_reference",
+ )
else:
- return _validate_reference_value(stripped, "image_reference")
-
- if not isinstance(value, dict):
- raise ValidationException(
- message=(
- "image_reference must be an object with exactly one of "
- "`image_url` or `file_id`"
- ),
- param="image_reference",
- code="invalid_reference",
- )
-
- image_url = value.get("image_url")
- file_id = value.get("file_id")
- image_url = image_url.strip() if isinstance(image_url, str) else ""
- file_id = file_id.strip() if isinstance(file_id, str) else ""
-
- has_image_url = bool(image_url)
- has_file_id = bool(file_id)
- if has_image_url == has_file_id:
- raise ValidationException(
- message="image_reference requires exactly one of image_url or file_id",
- param="image_reference",
- code="invalid_reference",
- )
+ raise ValidationException(
+ message="image_reference must be an array",
+ param="image_reference",
+ code="invalid_reference",
+ )
- if has_file_id:
- raise ValidationException(
- message=(
- "image_reference.file_id is not supported in current reverse pipeline; "
- "please use image_reference.image_url or multipart input_reference"
- ),
- param="image_reference.file_id",
- code="unsupported_reference",
- )
+ if isinstance(value, list):
+ if not value:
+ return []
+ return [
+ _parse_image_reference_item(item, idx=i) for i, item in enumerate(value)
+ ]
- return _validate_reference_value(image_url, "image_reference.image_url")
+ raise ValidationException(
+ message="image_reference must be an array",
+ param="image_reference",
+ code="invalid_reference",
+ )
async def _upload_to_data_uri(file: UploadFile, param: str) -> str:
@@ -234,9 +282,8 @@ async def _upload_to_data_uri(file: UploadFile, param: str) -> str:
async def _build_references_for_json(payload: BaseModel) -> List[str]:
references: List[str] = []
- parsed_image_ref = _parse_image_reference(getattr(payload, "image_reference", None))
- if parsed_image_ref:
- references.append(parsed_image_ref)
+ parsed_refs = _parse_image_references(getattr(payload, "image_reference", None))
+ references.extend(parsed_refs)
if getattr(payload, "input_reference", None) not in (None, ""):
raise ValidationException(
message="input_reference must be uploaded as multipart/form-data file",
@@ -282,9 +329,8 @@ async def _build_payload_and_references_for_form(
code="invalid_reference",
)
- parsed_image_ref = _parse_image_reference(payload.image_reference)
- if parsed_image_ref:
- references.append(parsed_image_ref)
+ parsed_refs = _parse_image_references(payload.image_reference)
+ references.extend(parsed_refs)
return payload, references
@@ -300,7 +346,7 @@ def _multipart_create_schema(default_seconds: int) -> Dict[str, Any]:
"quality": {"type": "string", "default": "standard"},
"image_reference": {
"type": "string",
- "description": "JSON string for image_reference object",
+ "description": "JSON string for image_reference array",
},
"input_reference": {"type": "string", "format": "binary"},
},
@@ -434,7 +480,9 @@ async def create_video(request: Request):
except ValidationError as exc:
_raise_validation_error(exc)
references = await _build_references_for_json(payload)
- return await _create_video_from_payload(payload, references, require_extension=False)
+ return await _create_video_from_payload(
+ payload, references, require_extension=False
+ )
form = await request.form()
payload, references = await _build_payload_and_references_for_form(
@@ -447,7 +495,9 @@ async def create_video(request: Request):
image_reference=form.get("image_reference"),
input_reference=form.get("input_reference"),
)
- return await _create_video_from_payload(payload, references, require_extension=False)
+ return await _create_video_from_payload(
+ payload, references, require_extension=False
+ )
@router.post(
diff --git a/app/services/grok/services/video.py b/app/services/grok/services/video.py
index e43d27bd8..0fe28b45c 100644
--- a/app/services/grok/services/video.py
+++ b/app/services/grok/services/video.py
@@ -24,7 +24,11 @@
from app.core.logger import logger
from app.services.grok.services.model import ModelService
from app.services.grok.utils.download import DownloadService
-from app.services.grok.utils.process import _is_http2_error, _normalize_line, _with_idle_timeout
+from app.services.grok.utils.process import (
+ _is_http2_error,
+ _normalize_line,
+ _with_idle_timeout,
+)
from app.services.grok.utils.retry import rate_limited
from app.services.grok.utils.stream import wrap_stream_with_usage
from app.services.reverse.app_chat import AppChatReverse
@@ -39,6 +43,7 @@
_VIDEO_SEM_VALUE = 0
_APP_CHAT_MODEL = "grok-3"
_POST_ID_URL_PATTERN = r"/generated/([0-9a-fA-F-]{32,36})/"
+_REFERENCE_PLACEHOLDER_RE = re.compile(r"@(?:(?:图|image|img)\s*(\d+))", re.IGNORECASE)
@dataclass(frozen=True)
@@ -68,6 +73,52 @@ def _pick_str(value: Any) -> str:
return ""
+def _extract_last_user_prompt_and_images(
+ messages: List[Dict[str, Any]],
+) -> Tuple[str, List[str]]:
+ """Use only the last user turn so placeholder indices map to that turn's images."""
+ for msg in reversed(messages or []):
+ role = msg.get("role") or "user"
+ if role != "user":
+ continue
+
+ content = msg.get("content", "")
+ if isinstance(content, str):
+ return content.strip(), []
+ if isinstance(content, dict):
+ content = [content]
+ if not isinstance(content, list):
+ return "", []
+
+ prompt_parts: List[str] = []
+ image_urls: List[str] = []
+ for item in content:
+ if not isinstance(item, dict):
+ continue
+
+ item_type = item.get("type")
+ if item_type == "text":
+ text = item.get("text", "")
+ if isinstance(text, str) and text.strip():
+ prompt_parts.append(text.strip())
+ elif item_type == "image_url":
+ image_data = item.get("image_url", {})
+ url = ""
+ if isinstance(image_data, dict):
+ url = image_data.get("url", "")
+ elif isinstance(image_data, str):
+ url = image_data
+ if isinstance(url, str) and url.strip():
+ image_urls.append(url.strip())
+
+ prompt = "\n".join(prompt_parts).strip()
+ if not prompt and image_urls:
+ prompt = "Refer to the following content:"
+ return prompt, image_urls
+
+ return "", []
+
+
def _extract_post_id_from_video_url(video_url: str) -> Optional[str]:
if not isinstance(video_url, str) or not video_url:
return None
@@ -106,7 +157,9 @@ async def _create_public_video_link(token: str, video_url: str) -> str:
async with _new_session() as session:
response = await MediaPostLinkReverse.request(session, token, video_id)
payload = response.json() if response is not None else {}
- share_link = _pick_str(payload.get("shareLink")) if isinstance(payload, dict) else ""
+ share_link = (
+ _pick_str(payload.get("shareLink")) if isinstance(payload, dict) else ""
+ )
if share_link:
if share_link.endswith(".mp4"):
logger.info(f"Video public link created: {share_link}")
@@ -230,14 +283,20 @@ def _build_round_config(
prompt: str,
aspect_ratio: str,
resolution_name: str,
+ image_references: Optional[List[str]] = None,
) -> Dict[str, Any]:
if not plan.is_extension:
- return _build_base_config(
+ config = _build_base_config(
seed_post_id,
aspect_ratio,
resolution_name,
plan.video_length,
)
+ if image_references:
+ video_config = config["modelMap"]["videoGenModelConfig"]
+ video_config["imageReferences"] = image_references
+ video_config["isReferenceToVideo"] = True
+ return config
if not original_post_id:
raise UpstreamException(
@@ -305,7 +364,9 @@ def _extract_post_id_candidates(resp: Dict[str, Any]) -> List[Tuple[int, str]]:
return candidates
-def _apply_post_id_candidates(result: VideoRoundResult, candidates: List[Tuple[int, str]]):
+def _apply_post_id_candidates(
+ result: VideoRoundResult, candidates: List[Tuple[int, str]]
+):
for rank, value in candidates:
if rank < result.post_id_rank:
result.post_id_rank = rank
@@ -371,7 +432,9 @@ async def _iter_round_events(
rid = _pick_str(model_resp.get("responseId"))
if rid:
result.response_id = rid
- _append_unique_errors(result.stream_errors, model_resp.get("streamErrors"))
+ _append_unique_errors(
+ result.stream_errors, model_resp.get("streamErrors")
+ )
_apply_post_id_candidates(result, _extract_post_id_candidates(resp))
@@ -447,7 +510,9 @@ async def _collect_round_result(
source: str,
) -> VideoRoundResult:
result = VideoRoundResult()
- async for event_type, payload in _iter_round_events(response, model=model, source=source):
+ async for event_type, payload in _iter_round_events(
+ response, model=model, source=source
+ ):
if event_type == "done":
result = payload
return result
@@ -478,7 +543,9 @@ def _ensure_round_result(
final_round: bool,
):
if not result.post_id:
- err_type = "moderated_or_stream_errors" if result.stream_errors else "missing_post_id"
+ err_type = (
+ "moderated_or_stream_errors" if result.stream_errors else "missing_post_id"
+ )
raise UpstreamException(
message=f"Video round {round_index}/{total_rounds} missing post_id",
status_code=502,
@@ -553,6 +620,7 @@ async def _request_round_stream(
token: str,
message: str,
model_config_override: Dict[str, Any],
+ file_attachments: Optional[List[str]] = None,
) -> AsyncGenerator[bytes, None]:
async def _stream():
session = _new_session()
@@ -563,6 +631,7 @@ async def _stream():
token,
message=message,
model=_APP_CHAT_MODEL,
+ file_attachments=file_attachments,
tool_overrides={"videoGen": True},
model_config_override=model_config_override,
)
@@ -650,7 +719,9 @@ def ensure_role(self) -> List[str]:
self.role_sent = True
return [self._sse(role="assistant")]
- def emit_progress(self, *, round_index: int, total_rounds: int, progress: Any) -> List[str]:
+ def emit_progress(
+ self, *, round_index: int, total_rounds: int, progress: Any
+ ) -> List[str]:
if not self.show_think:
return []
@@ -661,7 +732,9 @@ def emit_progress(self, *, round_index: int, total_rounds: int, progress: Any) -
progress_text = _format_progress(progress)
chunks.append(
- self._sse(f"[round={round_index}/{total_rounds}] progress={progress_text}%\n")
+ self._sse(
+ f"[round={round_index}/{total_rounds}] progress={progress_text}%\n"
+ )
)
return chunks
@@ -763,30 +836,66 @@ async def generate(
model_config_override=model_config_override,
)
- async def generate_from_image(
+ async def generate_from_images(
self,
token: str,
prompt: str,
- image_url: str,
+ image_urls: list[str],
+ asset_ids: list[str],
aspect_ratio: str = "3:2",
video_length: int = 6,
resolution: str = "480p",
preset: str = "normal",
) -> AsyncGenerator[bytes, None]:
- """Single-round image-to-video generation stream."""
- post_id = await self.create_image_post(token, image_url)
- model_config_override = _build_base_config(
- post_id,
- aspect_ratio,
- resolution,
- video_length,
+ """Generate video from one or more reference images."""
+ if not image_urls:
+ raise ValidationException("At least one reference image is required")
+ if len(image_urls) != len(asset_ids):
+ raise ValidationException("Reference image metadata mismatch")
+ logger.info(
+ f"Image to video: prompt='{prompt[:50]}...', images={len(image_urls)}"
)
+ post_id = await self.create_post(token, prompt)
+ mode_map = {
+ "fun": "--mode=extremely-crazy",
+ "normal": "--mode=normal",
+ "spicy": "--mode=extremely-spicy-or-crazy",
+ }
+ mode_flag = mode_map.get(preset, "--mode=custom")
+ message = f"{prompt} {mode_flag}"
+ model_config_override = {
+ "modelMap": {
+ "videoGenModelConfig": {
+ "aspectRatio": aspect_ratio,
+ "imageReferences": image_urls,
+ "isReferenceToVideo": True,
+ "parentPostId": post_id,
+ "resolutionName": resolution,
+ "videoLength": video_length,
+ }
+ }
+ }
return await _request_round_stream(
token=token,
- message=_build_message(prompt, preset),
+ message=message,
model_config_override=model_config_override,
+ file_attachments=asset_ids,
)
+ @staticmethod
+ def _replace_reference_placeholders(prompt: str, asset_ids: list[str]) -> str:
+ """Replace @图N / @imageN placeholders with uploaded asset ids."""
+
+ def _replace(match: re.Match[str]) -> str:
+ index = int(match.group(1)) - 1
+ if index < 0 or index >= len(asset_ids):
+ raise ValidationException(
+ f"Reference placeholder {match.group(0)} has no matching uploaded image"
+ )
+ return f"@{asset_ids[index]}"
+
+ return _REFERENCE_PLACEHOLDER_RE.sub(_replace, prompt)
+
@staticmethod
async def completions(
model: str,
@@ -807,313 +916,389 @@ async def completions(
else:
show_think = reasoning_effort != "none"
- from app.services.grok.services.chat import MessageExtractor
from app.services.grok.utils.upload import UploadService
- prompt, _, image_attachments = MessageExtractor.extract(messages)
+ prompt, image_attachments = _extract_last_user_prompt_and_images(messages)
- pool_candidates = ModelService.pool_candidates_for_model(model)
- token_info = token_mgr.get_token_for_video(
- resolution=resolution,
- video_length=video_length,
- pool_candidates=pool_candidates,
- )
+ max_token_retries = max(1, int(get_config("retry.max_retry") or 1))
+ last_error: Exception | None = None
- if not token_info:
- raise AppException(
- message="No available tokens. Please try again later.",
- error_type=ErrorType.RATE_LIMIT.value,
- code="rate_limit_exceeded",
- status_code=429,
+ for attempt in range(max_token_retries):
+ pool_candidates = ModelService.pool_candidates_for_model(model)
+ token_info = token_mgr.get_token_for_video(
+ resolution=resolution,
+ video_length=video_length,
+ pool_candidates=pool_candidates,
)
- token = token_info.token
- if token.startswith("sso="):
- token = token[4:]
+ if not token_info:
+ if last_error:
+ raise last_error
+ raise AppException(
+ message="No available tokens. Please try again later.",
+ error_type=ErrorType.RATE_LIMIT.value,
+ code="rate_limit_exceeded",
+ status_code=429,
+ )
+
+ token = token_info.token
+ if token.startswith("sso="):
+ token = token[4:]
- pool_name = token_mgr.get_pool_name_for_token(token) or BASIC_POOL_NAME
- is_super_pool = pool_name != BASIC_POOL_NAME
+ pool_name = token_mgr.get_pool_name_for_token(token) or BASIC_POOL_NAME
+ is_super_pool = pool_name != BASIC_POOL_NAME
- requested_resolution = resolution
- should_upscale = requested_resolution == "720p" and pool_name == BASIC_POOL_NAME
- generation_resolution = "480p" if should_upscale else requested_resolution
- upscale_timing = _resolve_upscale_timing() if should_upscale else "complete"
+ requested_resolution = resolution
+ should_upscale = (
+ requested_resolution == "720p" and pool_name == BASIC_POOL_NAME
+ )
+ generation_resolution = "480p" if should_upscale else requested_resolution
+ upscale_timing = _resolve_upscale_timing() if should_upscale else "complete"
- target_length = int(video_length or 6)
- round_plan = _build_round_plan(target_length, is_super=is_super_pool)
- total_rounds = len(round_plan)
+ target_length = int(video_length or 6)
+ round_plan = _build_round_plan(target_length, is_super=is_super_pool)
- service = VideoService()
- message = _build_message(prompt, preset)
+ prompt_text = prompt
+ image_urls: List[str] = []
+ asset_ids: List[str] = []
- image_url = None
- if image_attachments:
- upload_service = UploadService()
try:
- if len(image_attachments) > 1:
- logger.info(
- "Video generation supports a single reference image; using the first one."
+ if image_attachments:
+ if len(image_attachments) > 7:
+ raise ValidationException(
+ "Video generation supports at most 7 reference images"
+ )
+ upload_service = UploadService()
+ try:
+ for attach_data in image_attachments:
+ asset_id, file_uri = await upload_service.upload_file(
+ attach_data, token
+ )
+ asset_ids.append(asset_id)
+ image_urls.append(f"https://assets.grok.com/{file_uri}")
+ prompt_text = VideoService._replace_reference_placeholders(
+ prompt_text, asset_ids
+ )
+ logger.info(
+ f"Images uploaded for video: count={len(image_urls)}"
+ )
+ finally:
+ await upload_service.close()
+ elif _REFERENCE_PLACEHOLDER_RE.search(prompt_text):
+ raise ValidationException(
+ "Reference placeholders require uploaded images"
)
- attach_data = image_attachments[0]
- _, file_uri = await upload_service.upload_file(attach_data, token)
- image_url = f"https://assets.grok.com/{file_uri}"
- logger.info(f"Image uploaded for video: {image_url}")
- finally:
- await upload_service.close()
-
- if image_url:
- seed_post_id = await service.create_image_post(token, image_url)
- else:
- seed_post_id = await service.create_post(token, prompt)
- model_info = ModelService.get(model)
- effort = (
- EffortType.HIGH
- if (model_info and model_info.cost.value == "high")
- else EffortType.LOW
- )
+ service = VideoService()
+ message = _build_message(prompt_text, preset)
+ seed_post_id = await service.create_post(token, prompt_text)
- async def _run_round_collect(
- plan: VideoRoundPlan,
- *,
- seed_id: str,
- last_id: str,
- original_id: Optional[str],
- source: str,
- ) -> VideoRoundResult:
- config_override = _build_round_config(
- plan,
- seed_post_id=seed_id,
- last_post_id=last_id,
- original_post_id=original_id,
- prompt=prompt,
- aspect_ratio=aspect_ratio,
- resolution_name=generation_resolution,
- )
- response = await _request_round_stream(
- token=token,
- message=message,
- model_config_override=config_override,
- )
- return await _collect_round_result(response, model=model, source=source)
-
- async def _stream_chain() -> AsyncGenerator[str, None]:
- writer = _VideoChainSSEWriter(model, show_think)
- seed_id = seed_post_id
- last_id = seed_id
- original_id: Optional[str] = seed_id
- final_result: Optional[VideoRoundResult] = None
+ model_info = ModelService.get(model)
+ effort = (
+ EffortType.HIGH
+ if (model_info and model_info.cost.value == "high")
+ else EffortType.LOW
+ )
- try:
- for plan in round_plan:
+ async def _run_round_collect(
+ plan: VideoRoundPlan,
+ *,
+ seed_id: str,
+ last_id: str,
+ original_id: Optional[str],
+ source: str,
+ ) -> VideoRoundResult:
config_override = _build_round_config(
plan,
seed_post_id=seed_id,
last_post_id=last_id,
original_post_id=original_id,
- prompt=prompt,
+ prompt=prompt_text,
aspect_ratio=aspect_ratio,
resolution_name=generation_resolution,
+ image_references=image_urls if plan.round_index == 1 else None,
)
response = await _request_round_stream(
token=token,
message=message,
model_config_override=config_override,
+ file_attachments=asset_ids if plan.round_index == 1 else None,
+ )
+ return await _collect_round_result(
+ response, model=model, source=source
)
- round_result = VideoRoundResult()
- async for event_type, payload in _iter_round_events(
- response,
- model=model,
- source=f"stream-round-{plan.round_index}",
- ):
- if event_type == "progress":
- for chunk in writer.emit_progress(
+ async def _stream_chain() -> AsyncGenerator[str, None]:
+ writer = _VideoChainSSEWriter(model, show_think)
+ seed_id = seed_post_id
+ last_id = seed_id
+ original_id: Optional[str] = seed_id
+ final_result: Optional[VideoRoundResult] = None
+
+ try:
+ for plan in round_plan:
+ config_override = _build_round_config(
+ plan,
+ seed_post_id=seed_id,
+ last_post_id=last_id,
+ original_post_id=original_id,
+ prompt=prompt_text,
+ aspect_ratio=aspect_ratio,
+ resolution_name=generation_resolution,
+ image_references=image_urls
+ if plan.round_index == 1
+ else None,
+ )
+ response = await _request_round_stream(
+ token=token,
+ message=message,
+ model_config_override=config_override,
+ file_attachments=asset_ids
+ if plan.round_index == 1
+ else None,
+ )
+
+ round_result = VideoRoundResult()
+ async for event_type, payload in _iter_round_events(
+ response,
+ model=model,
+ source=f"stream-round-{plan.round_index}",
+ ):
+ if event_type == "progress":
+ for chunk in writer.emit_progress(
+ round_index=plan.round_index,
+ total_rounds=plan.total_rounds,
+ progress=payload,
+ ):
+ yield chunk
+ elif event_type == "done":
+ round_result = payload
+
+ _ensure_round_result(
+ round_result,
round_index=plan.round_index,
total_rounds=plan.total_rounds,
- progress=payload,
+ final_round=(plan.round_index == plan.total_rounds),
+ )
+
+ if (
+ should_upscale
+ and upscale_timing == "single"
+ and round_result.video_url
):
+ for chunk in writer.emit_note(
+ f"[round={plan.round_index}/{plan.total_rounds}] 正在对当前轮结果进行超分辨率\n"
+ ):
+ yield chunk
+ upgraded_url, upscaled = await _upscale_video_url(
+ token, round_result.video_url
+ )
+ if upscaled:
+ round_result.video_url = upgraded_url
+ else:
+ logger.warning(
+ "Video upscale failed in single mode, fallback to 480p result"
+ )
+
+ if plan.round_index == 1 and round_result.post_id:
+ original_id = round_result.post_id
+ if round_result.post_id:
+ last_id = round_result.post_id
+
+ if plan.round_index == plan.total_rounds:
+ final_result = round_result
+
+ if final_result is None:
+ raise UpstreamException(
+ message="Video generation produced no final round",
+ status_code=502,
+ details={"type": "empty_video_stream"},
+ )
+
+ final_video_url = final_result.video_url
+ if should_upscale and upscale_timing == "complete":
+ for chunk in writer.emit_note("正在对视频进行超分辨率\n"):
yield chunk
- elif event_type == "done":
- round_result = payload
-
- _ensure_round_result(
- round_result,
- round_index=plan.round_index,
- total_rounds=plan.total_rounds,
- final_round=(plan.round_index == plan.total_rounds),
- )
+ final_video_url, upscaled = await _upscale_video_url(
+ token, final_video_url
+ )
+ if not upscaled:
+ logger.warning(
+ "Video upscale failed, fallback to 480p result"
+ )
- if should_upscale and upscale_timing == "single" and round_result.video_url:
- for chunk in writer.emit_note(
- f"[round={plan.round_index}/{plan.total_rounds}] 正在对当前轮结果进行超分辨率\n"
- ):
+ if _public_asset_enabled():
+ for chunk in writer.emit_note("正在生成可公开访问链接\n"):
+ yield chunk
+ final_video_url = await _create_public_video_link(
+ token, final_video_url
+ )
+
+ dl_service = DownloadService()
+ try:
+ rendered = await dl_service.render_video(
+ final_video_url,
+ token,
+ final_result.thumbnail_url,
+ )
+ finally:
+ await dl_service.close()
+
+ for chunk in writer.emit_content(rendered):
yield chunk
- upgraded_url, upscaled = await _upscale_video_url(
- token, round_result.video_url
+ for chunk in writer.finish():
+ yield chunk
+ except asyncio.CancelledError:
+ logger.debug(
+ "Video stream chain cancelled by client",
+ extra={"model": model},
+ )
+ raise
+ except UpstreamException as e:
+ if rate_limited(e):
+ await token_mgr.mark_rate_limited(token)
+ raise
+
+ async def _collect_chain() -> Dict[str, Any]:
+ seed_id = seed_post_id
+ last_id = seed_id
+ original_id: Optional[str] = seed_id
+ final_result: Optional[VideoRoundResult] = None
+
+ for plan in round_plan:
+ round_result = await _run_round_collect(
+ plan,
+ seed_id=seed_id,
+ last_id=last_id,
+ original_id=original_id,
+ source=f"collect-round-{plan.round_index}",
+ )
+
+ _ensure_round_result(
+ round_result,
+ round_index=plan.round_index,
+ total_rounds=plan.total_rounds,
+ final_round=(plan.round_index == plan.total_rounds),
+ )
+
+ if (
+ should_upscale
+ and upscale_timing == "single"
+ and round_result.video_url
+ ):
+ upgraded_url, upscaled = await _upscale_video_url(
+ token, round_result.video_url
+ )
+ if upscaled:
+ round_result.video_url = upgraded_url
+ else:
+ logger.warning(
+ "Video upscale failed in single mode, fallback to 480p result"
+ )
+
+ if plan.round_index == 1 and round_result.post_id:
+ original_id = round_result.post_id
+ if round_result.post_id:
+ last_id = round_result.post_id
+
+ if plan.round_index == plan.total_rounds:
+ final_result = round_result
+
+ if final_result is None:
+ raise UpstreamException(
+ message="Video generation produced no final round",
+ status_code=502,
+ details={"type": "empty_video_stream"},
)
- if upscaled:
- round_result.video_url = upgraded_url
- else:
+
+ final_video_url = final_result.video_url
+ if should_upscale and upscale_timing == "complete":
+ final_video_url, upscaled = await _upscale_video_url(
+ token, final_video_url
+ )
+ if not upscaled:
logger.warning(
- "Video upscale failed in single mode, fallback to 480p result"
+ "Video upscale failed, fallback to 480p result"
)
- if plan.round_index == 1 and round_result.post_id:
- original_id = round_result.post_id
- if round_result.post_id:
- last_id = round_result.post_id
+ if _public_asset_enabled():
+ final_video_url = await _create_public_video_link(
+ token, final_video_url
+ )
- if plan.round_index == plan.total_rounds:
- final_result = round_result
+ dl_service = DownloadService()
+ try:
+ content = await dl_service.render_video(
+ final_video_url,
+ token,
+ final_result.thumbnail_url,
+ )
+ finally:
+ await dl_service.close()
+
+ return {
+ "id": final_result.response_id,
+ "object": "chat.completion",
+ "created": int(time.time()),
+ "model": model,
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": content,
+ "refusal": None,
+ },
+ "finish_reason": "stop",
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ "total_tokens": 0,
+ },
+ }
- if final_result is None:
- raise UpstreamException(
- message="Video generation produced no final round",
- status_code=502,
- details={"type": "empty_video_stream"},
+ if is_stream:
+ return wrap_stream_with_usage(
+ _stream_chain(), token_mgr, token, model
)
- final_video_url = final_result.video_url
- if should_upscale and upscale_timing == "complete":
- for chunk in writer.emit_note("正在对视频进行超分辨率\n"):
- yield chunk
- final_video_url, upscaled = await _upscale_video_url(token, final_video_url)
- if not upscaled:
- logger.warning("Video upscale failed, fallback to 480p result")
-
- if _public_asset_enabled():
- for chunk in writer.emit_note("正在生成可公开访问链接\n"):
- yield chunk
- final_video_url = await _create_public_video_link(token, final_video_url)
+ try:
+ result = await _collect_chain()
+ except UpstreamException as e:
+ if rate_limited(e):
+ await token_mgr.mark_rate_limited(token)
+ raise
- dl_service = DownloadService()
try:
- rendered = await dl_service.render_video(
- final_video_url,
- token,
- final_result.thumbnail_url,
+ await token_mgr.consume(token, effort)
+ logger.debug(
+ f"Video completed, recorded usage (effort={effort.value})"
)
- finally:
- await dl_service.close()
+ except Exception as e:
+ logger.warning(f"Failed to record video usage: {e}")
- for chunk in writer.emit_content(rendered):
- yield chunk
- for chunk in writer.finish():
- yield chunk
- except asyncio.CancelledError:
- logger.debug("Video stream chain cancelled by client", extra={"model": model})
- raise
+ return result
except UpstreamException as e:
+ last_error = e
if rate_limited(e):
await token_mgr.mark_rate_limited(token)
- raise
-
- async def _collect_chain() -> Dict[str, Any]:
- seed_id = seed_post_id
- last_id = seed_id
- original_id: Optional[str] = seed_id
- final_result: Optional[VideoRoundResult] = None
-
- for plan in round_plan:
- round_result = await _run_round_collect(
- plan,
- seed_id=seed_id,
- last_id=last_id,
- original_id=original_id,
- source=f"collect-round-{plan.round_index}",
- )
-
- _ensure_round_result(
- round_result,
- round_index=plan.round_index,
- total_rounds=plan.total_rounds,
- final_round=(plan.round_index == plan.total_rounds),
- )
-
- if should_upscale and upscale_timing == "single" and round_result.video_url:
- upgraded_url, upscaled = await _upscale_video_url(
- token, round_result.video_url
+ logger.warning(
+ f"Token {token[:10]}... rate limited (429), "
+ f"trying next token (attempt {attempt + 1}/{max_token_retries})"
)
- if upscaled:
- round_result.video_url = upgraded_url
- else:
- logger.warning(
- "Video upscale failed in single mode, fallback to 480p result"
- )
-
- if plan.round_index == 1 and round_result.post_id:
- original_id = round_result.post_id
- if round_result.post_id:
- last_id = round_result.post_id
-
- if plan.round_index == plan.total_rounds:
- final_result = round_result
-
- if final_result is None:
- raise UpstreamException(
- message="Video generation produced no final round",
- status_code=502,
- details={"type": "empty_video_stream"},
- )
-
- final_video_url = final_result.video_url
- if should_upscale and upscale_timing == "complete":
- final_video_url, upscaled = await _upscale_video_url(token, final_video_url)
- if not upscaled:
- logger.warning("Video upscale failed, fallback to 480p result")
-
- if _public_asset_enabled():
- final_video_url = await _create_public_video_link(token, final_video_url)
-
- dl_service = DownloadService()
- try:
- content = await dl_service.render_video(
- final_video_url,
- token,
- final_result.thumbnail_url,
- )
- finally:
- await dl_service.close()
-
- return {
- "id": final_result.response_id,
- "object": "chat.completion",
- "created": int(time.time()),
- "model": model,
- "choices": [
- {
- "index": 0,
- "message": {
- "role": "assistant",
- "content": content,
- "refusal": None,
- },
- "finish_reason": "stop",
- }
- ],
- "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
- }
-
- if is_stream:
- return wrap_stream_with_usage(_stream_chain(), token_mgr, token, model)
-
- try:
- result = await _collect_chain()
- except UpstreamException as e:
- if rate_limited(e):
- await token_mgr.mark_rate_limited(token)
- raise
-
- try:
- await token_mgr.consume(token, effort)
- logger.debug(
- f"Video completed, recorded usage (effort={effort.value})"
- )
- except Exception as e:
- logger.warning(f"Failed to record video usage: {e}")
+ continue
+ raise
- return result
+ if last_error:
+ raise last_error
+ raise AppException(
+ message="No available tokens. Please try again later.",
+ error_type=ErrorType.RATE_LIMIT.value,
+ code="rate_limit_exceeded",
+ status_code=429,
+ )
class VideoStreamProcessor:
@@ -1165,7 +1350,9 @@ async def close(self):
await self._dl_service.close()
self._dl_service = None
- async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, None]:
+ async def process(
+ self, response: AsyncIterable[bytes]
+ ) -> AsyncGenerator[str, None]:
result = VideoRoundResult()
try:
async for event_type, payload in _iter_round_events(
@@ -1194,14 +1381,18 @@ async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, N
if self.upscale_on_finish:
for chunk in self.writer.emit_note("正在对视频进行超分辨率\n"):
yield chunk
- final_video_url, upscaled = await _upscale_video_url(self.token, final_video_url)
+ final_video_url, upscaled = await _upscale_video_url(
+ self.token, final_video_url
+ )
if not upscaled:
logger.warning("Video upscale failed, fallback to 480p result")
if self.enable_public_asset:
for chunk in self.writer.emit_note("正在生成可公开访问链接\n"):
yield chunk
- final_video_url = await _create_public_video_link(self.token, final_video_url)
+ final_video_url = await _create_public_video_link(
+ self.token, final_video_url
+ )
rendered = await self._get_dl().render_video(
final_video_url,
@@ -1213,7 +1404,9 @@ async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, N
for chunk in self.writer.finish():
yield chunk
except asyncio.CancelledError:
- logger.debug("Video stream cancelled by client", extra={"model": self.model})
+ logger.debug(
+ "Video stream cancelled by client", extra={"model": self.model}
+ )
raise
finally:
await self.close()
@@ -1265,12 +1458,16 @@ async def process(self, response: AsyncIterable[bytes]) -> Dict[str, Any]:
final_video_url = result.video_url
if self.upscale_on_finish:
- final_video_url, upscaled = await _upscale_video_url(self.token, final_video_url)
+ final_video_url, upscaled = await _upscale_video_url(
+ self.token, final_video_url
+ )
if not upscaled:
logger.warning("Video upscale failed, fallback to 480p result")
if self.enable_public_asset:
- final_video_url = await _create_public_video_link(self.token, final_video_url)
+ final_video_url = await _create_public_video_link(
+ self.token, final_video_url
+ )
content = await self._get_dl().render_video(
final_video_url,
@@ -1294,7 +1491,11 @@ async def process(self, response: AsyncIterable[bytes]) -> Dict[str, Any]:
"finish_reason": "stop",
}
],
- "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
+ "usage": {
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ "total_tokens": 0,
+ },
}
finally:
await self.close()
diff --git a/docs/README.en.md b/docs/README.en.md
index ff3b89815..8d240a7ac 100644
--- a/docs/README.en.md
+++ b/docs/README.en.md
@@ -196,7 +196,8 @@ curl http://localhost:8000/v1/chat/completions \
- `grok-imagine-1.0-fast` streaming output in `/chat/completions` only returns the final image, hiding intermediate preview images.
- `grok-imagine-1.0-fast` streaming URL output will retain the original image filename (without appending `-final`).
- `grok-imagine-1.0-edit` requires an image; if multiple are provided, the **last 3** images and last text are used.
-- `grok-imagine-1.0-video` supports text-to-video and image-to-video via `image_url` (**only the first image is used**).
+- `grok-imagine-1.0-video` supports text-to-video and multi-image reference video: pass up to `7` `image_url` blocks and use placeholders like `@图1`, `@图2` in the prompt; the server will replace them with the corresponding `assetId` values.
+- `@图N` placeholders map to `image_url` order; referencing a missing image index returns an error.
- Any other parameters will be discarded and ignored.
@@ -361,7 +362,7 @@ curl http://localhost:8000/v1/videos \
| `size` | string | Frame size (mapped to aspect_ratio) | `1280x720`, `720x1280`, `1792x1024`, `1024x1792`, `1024x1024` |
| `seconds` | integer | Target duration (seconds) | `6` ~ `30` |
| `quality` | string | Video quality (mapped to resolution) | `standard`, `high` |
-| `image_reference` | object/string | Reference image (optional) | `{"image_url":"https://..."}` or Data URI |
+| `image_reference` | array | Reference images (optional) | OpenAI-compatible content block array (`[{"type":"image_url"...}]`) or an array of URL strings; single-image requests should use a one-item array |
| `input_reference` | file | multipart reference image (optional) | `png`, `jpg`, `webp` |
**Notes**:
@@ -369,7 +370,7 @@ curl http://localhost:8000/v1/videos \
- Server-side chain extension now supports 6~30 seconds automatically, so **`/v1/video/extend` is not required**.
- `quality=standard` maps to `480p`; `quality=high` maps to `720p`.
- For basic-pool requests at `720p`, generation falls back to `480p` first, then upscales according to `video.upscale_timing`.
-- If both `image_reference` and `input_reference` are provided, references are processed in order; the video pipeline uses the first image only.
+- `image_reference` now uses array format only and supports up to 7 images; single-image requests should also use a one-item array. If both `image_reference` and `input_reference` are provided, references are processed and merged in order; you can use placeholders like `@图1`, `@图2` in prompts.
diff --git a/readme.md b/readme.md
index 75cae6ba5..b75c2fee9 100644
--- a/readme.md
+++ b/readme.md
@@ -197,7 +197,8 @@ curl http://localhost:8000/v1/chat/completions \
- `grok-imagine-1.0-fast` 流式 URL 出图会保持原始图片名(不追加 `-final` 后缀)。
- 当图片疑似被审查拦截导致无最终图时,若开启 `image.blocked_parallel_enabled`,服务端会按 `image.blocked_parallel_attempts` 自动并行补偿生成,并优先使用不同 token;若仍无满足 `image.final_min_bytes` 的最终图则返回失败。
- `grok-imagine-1.0-edit` 必须提供图片,多图默认取**最后 3 张**与最后一个文本。
-- `grok-imagine-1.0-video` 支持文生视频与图生视频(通过 `image_url` 传参考图,**仅取第 1 张**)。
+- `grok-imagine-1.0-video` 支持文生视频与多图参考视频:可通过多个 `image_url` 传最多 `7` 张参考图,并在文本中使用 `@图1`、`@图2` 这类占位符;服务端会自动替换为对应 `assetId`。
+- `@图N` 与 `image_url` 的顺序一一对应;若引用了不存在的图片序号,会直接报错。
- 除上述外的其他参数将自动丢弃并忽略。
@@ -362,7 +363,7 @@ curl http://localhost:8000/v1/videos \
| `size` | string | 画面比例(会映射到 aspect_ratio) | `1280x720`, `720x1280`, `1792x1024`, `1024x1792`, `1024x1024` |
| `seconds` | integer | 目标时长(秒) | `6` ~ `30` |
| `quality` | string | 视频质量(映射到 resolution) | `standard`, `high` |
-| `image_reference` | object/string | 参考图(可选) | `{"image_url":"https://..."}` 或 Data URI |
+| `image_reference` | array | 参考图(可选) | 兼容 OpenAI content block 数组格式 (`[{"type":"image_url"...}]`) 或纯 URL 字符串数组;单图也请传单元素数组 |
| `input_reference` | file | multipart 参考图(可选) | `png`, `jpg`, `webp` |
**注意事项**:
@@ -370,7 +371,7 @@ curl http://localhost:8000/v1/videos \
- 服务端已支持 6~30 秒自动链式扩展,**无需使用 `/v1/video/extend`**。
- `quality=standard` 对应 `480p`;`quality=high` 对应 `720p`。
- 基础号池请求 `720p` 时会先产出 `480p` 再按 `video.upscale_timing` 执行超分。
-- `image_reference` 与 `input_reference` 同时传入时,会按顺序作为参考图输入;视频链路只使用第 1 张。
+- `image_reference` 统一使用数组格式,最多可传 7 张参考图;单图场景也请传单元素数组。`input_reference` 主要以表单上传参考图;若两者同时传入,会按顺序作为参考图合并输入;可在提示词中使用 `@图1`、`@图2`。