chenyme · chenyme · Mar 23, 2026 · Mar 16, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/_public/static/function/css/video.css b/_public/static/function/css/video.css
@@ -222,9 +222,8 @@ body {
 .ref-name {
   font-size: 11px;
   color: var(--accents-4);
-  overflow: hidden;
-  text-overflow: ellipsis;
-  white-space: nowrap;
+  white-space: pre-wrap; /* keep the "\n" separators video.js joins the reference names with */
+  overflow-wrap: anywhere; /* allow long names/URLs to break instead of overflowing */
   max-width: 100%;
   display: inline-block;
 }
@@ -253,6 +252,18 @@ body {
   resize: vertical;
 }
 
+.prompt-tip { /* hint text shown under the prompt textarea in video.html */
+  margin-top: 8px;
+  font-size: 11px;
+  color: var(--accents-4);
+  line-height: 1.5;
+}
+
+.ref-textarea { /* multi-line reference input (#imageUrlInput in video.html) */
+  min-height: 88px;
+  resize: vertical;
+}
+
 .status-header {
   display: flex;
   align-items: center;

diff --git a/_public/static/function/js/video.js b/_public/static/function/js/video.js
@@ -31,12 +31,13 @@
   let contentBuffer = '';
   let collectingContent = false;
   let startAt = 0;
-  let fileDataUrl = '';
+  let fileDataUrls = []; // data URLs of the uploaded reference images, in selection order
   let elapsedTimer = null;
   let lastProgress = 0;
   let currentPreviewItem = null;
   let previewCount = 0;
   const DEFAULT_REASONING_EFFORT = 'low';
+  const MAX_REFERENCE_IMAGES = 7; // mirrors the limit of 7 enforced in app/api/v1/function/video.py
 
   function toast(message, type) {
     if (typeof showToast === 'function') {
@@ -229,15 +230,45 @@
   }
 
   function clearFileSelection() {
-    fileDataUrl = '';
+    fileDataUrls = []; // forget every previously read data URL
     if (imageFileInput) {
       imageFileInput.value = '';
     }
     if (imageFileName) {
-      imageFileName.textContent = t('common.noFileSelected');
+      imageFileName.textContent = t('video.noReferenceSelected'); // same empty-state key as updateReferenceSummary
     }
   }
 
+  // Show the given reference names one per line, or the localized empty-state text.
+  function updateReferenceSummary(names) {
+    if (!imageFileName) return;
+    const hasNames = Boolean(names && names.length);
+    imageFileName.textContent = hasNames
+      ? names.join('\n')
+      : t('video.noReferenceSelected');
+  }
+
+  // Split textarea text into one trimmed entry per line, skipping blank lines.
+  function parseReferenceUrls(value) {
+    const lines = (value || '').split(/\r?\n/);
+    const trimmed = lines.map(line => line.trim());
+    return trimmed.filter(entry => entry !== '');
+  }
+
+  // Pick uploaded files or typed URLs (never both); toasts and throws on conflict or over the limit.
+  function getReferenceImages() {
+    const typedUrls = imageUrlInput ? parseReferenceUrls(imageUrlInput.value) : [];
+    const hasUploads = fileDataUrls.length > 0;
+    const fail = (messageKey, code) => {
+      toast(t(messageKey), 'error');
+      throw new Error(code);
+    };
+    if (hasUploads && typedUrls.length > 0) fail('video.referenceConflict', 'invalid_reference');
+    const selected = hasUploads ? fileDataUrls.slice() : typedUrls;
+    if (selected.length > MAX_REFERENCE_IMAGES) fail('video.referenceLimit', 'too_many_references');
+    return selected;
+  }
+
   function normalizeAuthHeader(authHeader) {
     if (!authHeader) return '';
     if (authHeader.startsWith('Bearer ')) {
@@ -260,12 +291,7 @@
 
   async function createVideoTask(authHeader) {
     const prompt = promptInput ? promptInput.value.trim() : '';
-    const rawUrl = imageUrlInput ? imageUrlInput.value.trim() : '';
-    if (fileDataUrl && rawUrl) {
-      toast(t('video.referenceConflict'), 'error');
-      throw new Error('invalid_reference');
-    }
-    const imageUrl = fileDataUrl || rawUrl;
+    const imageUrls = getReferenceImages();
     const res = await fetch('/v1/function/video/start', {
       method: 'POST',
       headers: {
@@ -274,7 +300,7 @@
       },
       body: JSON.stringify({
         prompt,
-        image_url: imageUrl || null,
+        image_urls: imageUrls,
         reasoning_effort: DEFAULT_REASONING_EFFORT,
         aspect_ratio: ratioSelect ? ratioSelect.value : '3:2',
         video_length: lengthSelect ? parseInt(lengthSelect.value, 10) : 6,
@@ -604,31 +630,38 @@
 
   if (imageFileInput) {
     imageFileInput.addEventListener('change', () => {
-      const file = imageFileInput.files && imageFileInput.files[0];
-      if (!file) {
+      const files = imageFileInput.files ? Array.from(imageFileInput.files) : []; // copy the FileList into a plain array
+      if (!files.length) {
+        clearFileSelection();
+        return;
+      }
+      if (files.length > MAX_REFERENCE_IMAGES) { // reject the whole selection rather than truncating it
         clearFileSelection();
+        toast(t('video.referenceLimit'), 'error');
         return;
       }
       if (imageUrlInput && imageUrlInput.value.trim()) {
         imageUrlInput.value = '';
       }
-      if (imageFileName) {
-        imageFileName.textContent = file.name;
-      }
-      const reader = new FileReader();
-      reader.onload = () => {
-        if (typeof reader.result === 'string') {
-          fileDataUrl = reader.result;
-        } else {
-          fileDataUrl = '';
-          toast(t('common.fileReadFailed'), 'error');
-        }
-      };
-      reader.onerror = () => {
-        fileDataUrl = '';
+      Promise.all(files.map(file => new Promise((resolve, reject) => { // one FileReader per file; results keep selection order
+        const reader = new FileReader();
+        reader.onload = () => {
+          if (typeof reader.result === 'string') {
+            resolve({ name: file.name, data: reader.result });
+          } else {
+            reject(new Error('read_failed'));
+          }
+        };
+        reader.onerror = () => reject(new Error('read_failed'));
+        reader.readAsDataURL(file);
+      }))).then(items => {
+        fileDataUrls = items.map(item => item.data);
+        updateReferenceSummary(items.map((item, index) => `${index + 1}. ${item.name}`)); // numbered list: "1. name"
+      }).catch(() => { // a single failed read discards the whole selection
+        fileDataUrls = [];
         toast(t('common.fileReadFailed'), 'error');
-      };
-      reader.readAsDataURL(file);
+        updateReferenceSummary([]);
+      });
     });
   }
 
@@ -646,9 +679,18 @@
 
   if (imageUrlInput) {
     imageUrlInput.addEventListener('input', () => {
-      if (imageUrlInput.value.trim() && fileDataUrl) {
+      const urls = parseReferenceUrls(imageUrlInput.value);
+      if (urls.length > MAX_REFERENCE_IMAGES) { // advisory only; NOTE(review): repeats on every input event while over the limit
+        toast(t('video.referenceLimit'), 'error');
+      }
+      if (imageUrlInput.value.trim() && fileDataUrls.length) { // typed URLs replace any uploaded files
         clearFileSelection();
       }
+      if (urls.length) {
+        updateReferenceSummary(urls.map((url, index) => `${index + 1}. ${url}`));
+      } else if (!fileDataUrls.length) { // nothing typed and nothing uploaded: show the empty state
+        updateReferenceSummary([]);
+      }
     });
   }
 

diff --git a/_public/static/function/pages/video.html b/_public/static/function/pages/video.html
@@ -51,15 +51,16 @@ <h2 class="text-2xl font-semibold tracking-tight" data-i18n="video.title">Video
           <div class="settings-grid">
             <div class="settings-block prompt-block">
               <label class="field-label" for="promptInput" data-i18n="video.prompt">提示词</label>
-              <textarea id="promptInput" class="geist-input video-textarea" placeholder="例如：街头霓虹雨夜，慢镜头，胶片质感" data-i18n-placeholder="video.promptPlaceholder"></textarea>
+              <textarea id="promptInput" class="geist-input video-textarea" placeholder="例如：@图1街头霓虹雨夜，@图2人物回头微笑，慢镜头，胶片质感" data-i18n-placeholder="video.promptPlaceholder"></textarea>
+              <div class="prompt-tip" data-i18n="video.promptTip">多图参考可在提示词中使用 @图1 到 @图7，按参考图顺序对应。</div>
             </div>
             <div class="settings-block ref-block">
               <label class="field-label" for="imageUrlInput" data-i18n="video.referenceImage">参考图</label>
               <div class="ref-controls">
-                <input id="imageUrlInput" class="geist-input" placeholder="https://... 或 data:image/..." data-i18n-placeholder="video.referenceImagePlaceholder">
+                <textarea id="imageUrlInput" class="geist-input ref-textarea" placeholder="每行一个 https://... 或 data:image/...，最多 7 张" data-i18n-placeholder="video.referenceImagePlaceholder"></textarea>
               </div>
               <div class="ref-meta">
-                <span id="imageFileName" class="ref-name" data-i18n="common.noFileSelected">未选择文件</span>
+                <span id="imageFileName" class="ref-name" data-i18n="video.noReferenceSelected">未选择参考图</span>
               </div>
             </div>
             <div class="settings-block ratio-block">
@@ -95,7 +96,7 @@ <h2 class="text-2xl font-semibold tracking-tight" data-i18n="video.title">Video
             <div class="settings-block upload-block">
               <label class="field-label">&nbsp;</label>
               <button id="selectImageFileBtn" class="geist-button-outline text-xs px-3" type="button" data-i18n="video.upload">上传</button>
-              <input id="imageFileInput" class="ref-file-input" type="file" accept="image/*">
+              <input id="imageFileInput" class="ref-file-input" type="file" accept="image/*" multiple>
             </div>
             <div class="settings-block clear-block">
               <label class="field-label">&nbsp;</label>

diff --git a/_public/static/i18n/locales/en.json b/_public/static/i18n/locales/en.json
@@ -503,13 +503,14 @@
   "video": {
     "pageTitle": "Grok2API - Video Generation",
     "title": "Video Generation",
-    "subtitle": "Generate short videos with reference images and preset styles.",
+    "subtitle": "Generate short videos with up to 7 reference images, @图N placeholders, and preset styles.",
     "startGenerate": "Generate",
     "genSettings": "Generation Settings",
     "prompt": "Prompt",
-    "promptPlaceholder": "e.g.: neon rain at night on the street, slow motion, film grain",
+    "promptPlaceholder": "e.g.: @图1 neon rainy street at night, @图2 subject looking back and smiling, slow motion, film grain",
+    "promptTip": "Use @图1 to @图7 in the prompt to refer to the reference images by their order in the list.",
     "referenceImage": "Reference Image",
-    "referenceImagePlaceholder": "https://... or data:image/...",
+    "referenceImagePlaceholder": "One https://... or data:image/... per line, up to 7 images",
     "aspectRatio": "Aspect Ratio",
     "ratio3_2": "3:2 Landscape",
     "ratio2_3": "2:3 Portrait",
@@ -538,7 +539,9 @@
     "superResolution": "Super Resolution",
     "superResolutionInProgress": "Super resolution in progress",
     "alreadyGenerating": "Already generating",
-    "referenceConflict": "Reference image: choose either URL/Base64 or file upload",
+    "referenceConflict": "Reference images: choose either URL/Base64 list or file upload",
+    "referenceLimit": "A maximum of 7 reference images is supported",
+    "noReferenceSelected": "No reference images selected",
     "downloadFailed": "Download failed, please check if the video link is accessible",
     "sec6": "6s",
     "sec10": "10s",

diff --git a/_public/static/i18n/locales/zh.json b/_public/static/i18n/locales/zh.json
@@ -503,13 +503,14 @@
   "video": {
     "pageTitle": "Grok2API - Video 视频生成",
     "title": "Video 视频生成",
-    "subtitle": "生成短视频，支持参考图与多种预设风格。",
+    "subtitle": "生成短视频，支持最多 7 张参考图、@图N 引用与多种预设风格。",
     "startGenerate": "开始生成",
     "genSettings": "生成设置",
     "prompt": "提示词",
-    "promptPlaceholder": "例如：街头霓虹雨夜，慢镜头，胶片质感",
+    "promptPlaceholder": "例如：@图1街头霓虹雨夜，@图2人物回头微笑，慢镜头，胶片质感",
+    "promptTip": "多图参考可在提示词中使用 @图1 到 @图7，按参考图顺序对应。",
     "referenceImage": "参考图",
-    "referenceImagePlaceholder": "https://... 或 data:image/...",
+    "referenceImagePlaceholder": "每行一个 https://... 或 data:image/...，最多 7 张",
     "aspectRatio": "画面比例",
     "ratio3_2": "3:2 横构图",
     "ratio2_3": "2:3 竖构图",
@@ -538,7 +539,9 @@
     "superResolution": "超分辨率",
     "superResolutionInProgress": "超分辨率中",
     "alreadyGenerating": "已在生成中",
-    "referenceConflict": "参考图只能选择其一：URL/Base64 或 本地上传",
+    "referenceConflict": "参考图只能选择其一：URL/Base64 列表 或 本地上传",
+    "referenceLimit": "参考图最多支持 7 张",
+    "noReferenceSelected": "未选择参考图",
     "downloadFailed": "下载失败，请检查视频链接是否可访问",
     "sec6": "6 秒",
     "sec10": "10 秒",

diff --git a/app/api/v1/function/video.py b/app/api/v1/function/video.py
@@ -49,7 +49,7 @@ async def _new_session(
     video_length: int,
     resolution_name: str,
     preset: str,
-    image_url: Optional[str],
+    image_urls: Optional[List[str]],
     reasoning_effort: Optional[str],
 ) -> str:
     task_id = uuid.uuid4().hex
@@ -62,7 +62,7 @@ async def _new_session(
             "video_length": video_length,
             "resolution_name": resolution_name,
             "preset": preset,
-            "image_url": image_url,
+            "image_urls": image_urls or [],
             "reasoning_effort": reasoning_effort,
             "created_at": now,
         }
@@ -123,13 +123,23 @@ def _validate_image_url(image_url: str) -> None:
     )
 
 
+def _normalize_image_urls(values: Optional[List[str]]) -> List[str]:
+    """Return the stripped, non-empty entries of ``values``, in order.
+
+    Anything that is not a list (including ``None``) yields an empty list.
+    """
+    if not isinstance(values, list):
+        return []
+    return [text for text in ((entry or "").strip() for entry in values) if text]
+
+
 class VideoStartRequest(BaseModel):
     prompt: str
     aspect_ratio: Optional[str] = "3:2"
     video_length: Optional[int] = 6
     resolution_name: Optional[str] = "480p"
     preset: Optional[str] = "normal"
-    image_url: Optional[str] = None
+    image_urls: Optional[List[str]] = None  # max 7, enforced in function_video_start; NOTE(review): legacy image_url is no longer read - confirm no client still sends it
     reasoning_effort: Optional[str] = None
 
 
@@ -166,8 +176,12 @@ async def function_video_start(data: VideoStartRequest):
             detail="preset must be one of ['fun','normal','spicy','custom']",
         )
 
-    image_url = (data.image_url or "").strip() or None
-    if image_url:
+    image_urls = _normalize_image_urls(data.image_urls)
+    if len(image_urls) > 7:
+        raise HTTPException(
+            status_code=400, detail="image_urls supports at most 7 references"
+        )
+    for image_url in image_urls:
         _validate_image_url(image_url)
 
     reasoning_effort = (data.reasoning_effort or "").strip() or None
@@ -185,7 +199,7 @@ async def function_video_start(data: VideoStartRequest):
         video_length,
         resolution_name,
         preset,
-        image_url,
+        image_urls,
         reasoning_effort,
     )
     return {"task_id": task_id, "aspect_ratio": aspect_ratio}
@@ -202,7 +216,11 @@ async def function_video_sse(request: Request, task_id: str = Query("")):
     video_length = int(session.get("video_length") or 6)
     resolution_name = str(session.get("resolution_name") or "480p")
     preset = str(session.get("preset") or "normal")
-    image_url = session.get("image_url")
+    image_urls = [
+        str(item).strip()
+        for item in (session.get("image_urls") or [])
+        if str(item).strip()
+    ]
     reasoning_effort = session.get("reasoning_effort")
 
     async def event_stream():
@@ -218,14 +236,16 @@ async def event_stream():
                 yield "data: [DONE]\n\n"
                 return
 
-            if image_url:
+            if image_urls:
+                content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
+                for image_url in image_urls:
+                    content.append(
+                        {"type": "image_url", "image_url": {"url": image_url}}
+                    )
                 messages: List[Dict[str, Any]] = [
                     {
                         "role": "user",
-                        "content": [
-                            {"type": "text", "text": prompt},
-                            {"type": "image_url", "image_url": {"url": image_url}},
-                        ],
+                        "content": content,
                     }
                 ]
             else: