Video: support up to 7 reference images with @图N/@imageN prompt placeholders
diff --git a/_public/static/i18n/locales/en.json b/_public/static/i18n/locales/en.json
index f9252c136..999f699e9 100644
--- a/_public/static/i18n/locales/en.json
+++ b/_public/static/i18n/locales/en.json
@@ -503,13 +503,14 @@
"video": {
"pageTitle": "Grok2API - Video Generation",
"title": "Video Generation",
- "subtitle": "Generate short videos with reference images and preset styles.",
+ "subtitle": "Generate short videos with up to 7 reference images, @imageN placeholders, and preset styles.",
"startGenerate": "Generate",
"genSettings": "Generation Settings",
"prompt": "Prompt",
- "promptPlaceholder": "e.g.: neon rain at night on the street, slow motion, film grain",
+ "promptPlaceholder": "e.g.: @image1 neon rainy street at night, @image2 subject looking back and smiling, slow motion, film grain",
+ "promptTip": "Use @image1 to @image7 (or @图1 to @图7) in the prompt to reference images by upload order.",
"referenceImage": "Reference Image",
- "referenceImagePlaceholder": "https://... or data:image/...",
+ "referenceImagePlaceholder": "One https://... or data:image/... per line, up to 7 images",
"aspectRatio": "Aspect Ratio",
"ratio3_2": "3:2 Landscape",
"ratio2_3": "2:3 Portrait",
@@ -538,7 +539,9 @@
"superResolution": "Super Resolution",
"superResolutionInProgress": "Super resolution in progress",
"alreadyGenerating": "Already generating",
- "referenceConflict": "Reference image: choose either URL/Base64 or file upload",
+ "referenceConflict": "Reference images: choose either URL/Base64 list or file upload",
+ "referenceLimit": "A maximum of 7 reference images is supported",
+ "noReferenceSelected": "No reference images selected",
"downloadFailed": "Download failed, please check if the video link is accessible",
"sec6": "6s",
"sec10": "10s",
diff --git a/_public/static/i18n/locales/zh.json b/_public/static/i18n/locales/zh.json
index 409aa3dd2..329ed50ba 100644
--- a/_public/static/i18n/locales/zh.json
+++ b/_public/static/i18n/locales/zh.json
@@ -503,13 +503,14 @@
"video": {
"pageTitle": "Grok2API - Video 视频生成",
"title": "Video 视频生成",
- "subtitle": "生成短视频,支持参考图与多种预设风格。",
+ "subtitle": "生成短视频,支持最多 7 张参考图、@图N 引用与多种预设风格。",
"startGenerate": "开始生成",
"genSettings": "生成设置",
"prompt": "提示词",
- "promptPlaceholder": "例如:街头霓虹雨夜,慢镜头,胶片质感",
+ "promptPlaceholder": "例如:@图1街头霓虹雨夜,@图2人物回头微笑,慢镜头,胶片质感",
+ "promptTip": "多图参考可在提示词中使用 @图1 到 @图7,按参考图顺序对应。",
"referenceImage": "参考图",
- "referenceImagePlaceholder": "https://... 或 data:image/...",
+ "referenceImagePlaceholder": "每行一个 https://... 或 data:image/...,最多 7 张",
"aspectRatio": "画面比例",
"ratio3_2": "3:2 横构图",
"ratio2_3": "2:3 竖构图",
@@ -538,7 +539,9 @@
"superResolution": "超分辨率",
"superResolutionInProgress": "超分辨率中",
"alreadyGenerating": "已在生成中",
- "referenceConflict": "参考图只能选择其一:URL/Base64 或 本地上传",
+ "referenceConflict": "参考图只能选择其一:URL/Base64 列表 或 本地上传",
+ "referenceLimit": "参考图最多支持 7 张",
+ "noReferenceSelected": "未选择参考图",
"downloadFailed": "下载失败,请检查视频链接是否可访问",
"sec6": "6 秒",
"sec10": "10 秒",
diff --git a/app/api/v1/function/video.py b/app/api/v1/function/video.py
index 2706ced98..ccd4cd36c 100644
--- a/app/api/v1/function/video.py
+++ b/app/api/v1/function/video.py
@@ -49,7 +49,7 @@ async def _new_session(
video_length: int,
resolution_name: str,
preset: str,
- image_url: Optional[str],
+ image_urls: Optional[List[str]],
reasoning_effort: Optional[str],
) -> str:
task_id = uuid.uuid4().hex
@@ -62,7 +62,7 @@ async def _new_session(
"video_length": video_length,
"resolution_name": resolution_name,
"preset": preset,
- "image_url": image_url,
+ "image_urls": image_urls or [],
"reasoning_effort": reasoning_effort,
"created_at": now,
}
@@ -123,13 +123,23 @@ def _validate_image_url(image_url: str) -> None:
)
+def _normalize_image_urls(values: Optional[List[str]]) -> List[str]:
+ normalized: List[str] = []
+ if isinstance(values, list):
+ for item in values:
+ value = (item or "").strip()
+ if value:
+ normalized.append(value)
+ return normalized
+
+
class VideoStartRequest(BaseModel):
prompt: str
aspect_ratio: Optional[str] = "3:2"
video_length: Optional[int] = 6
resolution_name: Optional[str] = "480p"
preset: Optional[str] = "normal"
- image_url: Optional[str] = None
+ image_urls: Optional[List[str]] = None
reasoning_effort: Optional[str] = None
@@ -166,8 +176,12 @@ async def function_video_start(data: VideoStartRequest):
detail="preset must be one of ['fun','normal','spicy','custom']",
)
- image_url = (data.image_url or "").strip() or None
- if image_url:
+ image_urls = _normalize_image_urls(data.image_urls)
+ if len(image_urls) > 7:
+ raise HTTPException(
+ status_code=400, detail="image_urls supports at most 7 references"
+ )
+ for image_url in image_urls:
_validate_image_url(image_url)
reasoning_effort = (data.reasoning_effort or "").strip() or None
@@ -185,7 +199,7 @@ async def function_video_start(data: VideoStartRequest):
video_length,
resolution_name,
preset,
- image_url,
+ image_urls,
reasoning_effort,
)
return {"task_id": task_id, "aspect_ratio": aspect_ratio}
@@ -202,7 +216,11 @@ async def function_video_sse(request: Request, task_id: str = Query("")):
video_length = int(session.get("video_length") or 6)
resolution_name = str(session.get("resolution_name") or "480p")
preset = str(session.get("preset") or "normal")
- image_url = session.get("image_url")
+ image_urls = [
+ str(item).strip()
+ for item in (session.get("image_urls") or [])
+ if str(item).strip()
+ ]
reasoning_effort = session.get("reasoning_effort")
async def event_stream():
@@ -218,14 +236,16 @@ async def event_stream():
yield "data: [DONE]\n\n"
return
- if image_url:
+ if image_urls:
+ content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
+ for image_url in image_urls:
+ content.append(
+ {"type": "image_url", "image_url": {"url": image_url}}
+ )
messages: List[Dict[str, Any]] = [
{
"role": "user",
- "content": [
- {"type": "text", "text": prompt},
- {"type": "image_url", "image_url": {"url": image_url}},
- ],
+ "content": content,
}
]
else:
diff --git a/app/api/v1/video.py b/app/api/v1/video.py
index 650655589..61ae50b2f 100644
--- a/app/api/v1/video.py
+++ b/app/api/v1/video.py
@@ -46,8 +46,13 @@ class VideoCreateRequest(BaseModel):
size: Optional[str] = Field("1792x1024", description="Output size")
seconds: Optional[int] = Field(6, description="Video length in seconds")
quality: Optional[str] = Field("standard", description="Quality: standard/high")
- image_reference: Optional[Any] = Field(None, description="Structured image reference")
- input_reference: Optional[Any] = Field(None, description="Multipart input reference file")
+ image_reference: Optional[Any] = Field(
+ None,
+ description="Image references using chat/completions content-block array format: [{type:'image_url', image_url:{url:'...'}}] or an array of plain URL strings",
+ )
+ input_reference: Optional[Any] = Field(
+ None, description="Multipart input reference file"
+ )
class VideoExtendDirectRequest(BaseModel):
@@ -57,7 +62,8 @@ class VideoExtendDirectRequest(BaseModel):
prompt: str = Field(..., description="Prompt text mapped to message/originalPrompt")
reference_id: str = Field(
- ..., description="Reference id mapped to extendPostId/originalPostId/parentPostId"
+ ...,
+ description="Reference id mapped to extendPostId/originalPostId/parentPostId",
)
start_time: float = Field(..., description="Mapped to videoExtensionStartTime")
ratio: str = Field("2:3", description="Mapped to aspectRatio")
@@ -72,7 +78,9 @@ def _raise_validation_error(exc: ValidationError) -> None:
loc = first.get("loc", [])
msg = first.get("msg", "Invalid request")
code = first.get("type", "invalid_value")
- param_parts = [str(x) for x in loc if not (isinstance(x, int) or str(x).isdigit())]
+ param_parts = [
+ str(x) for x in loc if not (isinstance(x, int) or str(x).isdigit())
+ ]
param = ".".join(param_parts) if param_parts else None
raise ValidationException(message=msg, param=param, code=code)
raise ValidationException(message="Invalid request", code="invalid_value")
@@ -165,58 +173,98 @@ def _validate_reference_value(value: str, param: str) -> str:
)
-def _parse_image_reference(value: Any) -> Optional[str]:
+def _parse_image_reference_item(value: Any, idx: int) -> str:
+ """Parse a single image reference item inside an array."""
+ param_prefix = f"image_reference[{idx}]" if idx is not None else "image_reference"
+
+ if isinstance(value, str):
+ stripped = value.strip()
+ if not stripped:
+ raise ValidationException(
+ message=f"{param_prefix} cannot be empty",
+ param=param_prefix,
+ code="invalid_reference",
+ )
+ return _validate_reference_value(stripped, param_prefix)
+
+ if isinstance(value, dict):
+ block_type = value.get("type")
+ if block_type != "image_url":
+ raise ValidationException(
+ message=f'{param_prefix} must have type="image_url"',
+ param=f"{param_prefix}.type",
+ code="invalid_reference",
+ )
+ inner = value.get("image_url")
+ if not isinstance(inner, dict):
+ raise ValidationException(
+ message=f"{param_prefix}.image_url must be an object with a url field",
+ param=f"{param_prefix}.image_url",
+ code="invalid_reference",
+ )
+ url = inner.get("url", "")
+ if not isinstance(url, str) or not url.strip():
+ raise ValidationException(
+ message=f"{param_prefix}.image_url.url cannot be empty",
+ param=f"{param_prefix}.image_url.url",
+ code="invalid_reference",
+ )
+ return _validate_reference_value(url.strip(), f"{param_prefix}.image_url.url")
+
+ raise ValidationException(
+ message=(
+ f"{param_prefix} must be a URL string or "
+ f'{{"type": "image_url", "image_url": {{"url": "..."}}}}'
+ ),
+ param=param_prefix,
+ code="invalid_reference",
+ )
+
+
+def _parse_image_references(value: Any) -> List[str]:
+ """Parse image_reference into a list of validated URL strings.
+
+ Uses the same content-block format as chat/completions.
+ Accepts:
+ - None / "" -> []
+ - ["url", {"type": "image_url", ...}, ...] -> [url, ...]
+ - JSON string of an array (for multipart/form-data)
+ """
if value is None or value == "":
- return None
+ return []
if isinstance(value, str):
stripped = value.strip()
if not stripped:
- return None
- if stripped[0] in {"{", "["}:
+ return []
+ if stripped[0] == "[":
try:
value = orjson.loads(stripped)
except orjson.JSONDecodeError:
- # allow plain url/data-uri in multipart text field as a practical fallback
- return _validate_reference_value(stripped, "image_reference")
+ raise ValidationException(
+ message="image_reference must be a JSON array string",
+ param="image_reference",
+ code="invalid_reference",
+ )
else:
- return _validate_reference_value(stripped, "image_reference")
-
- if not isinstance(value, dict):
- raise ValidationException(
- message=(
- "image_reference must be an object with exactly one of "
- "`image_url` or `file_id`"
- ),
- param="image_reference",
- code="invalid_reference",
- )
-
- image_url = value.get("image_url")
- file_id = value.get("file_id")
- image_url = image_url.strip() if isinstance(image_url, str) else ""
- file_id = file_id.strip() if isinstance(file_id, str) else ""
-
- has_image_url = bool(image_url)
- has_file_id = bool(file_id)
- if has_image_url == has_file_id:
- raise ValidationException(
- message="image_reference requires exactly one of image_url or file_id",
- param="image_reference",
- code="invalid_reference",
- )
+ raise ValidationException(
+ message="image_reference must be an array",
+ param="image_reference",
+ code="invalid_reference",
+ )
- if has_file_id:
- raise ValidationException(
- message=(
- "image_reference.file_id is not supported in current reverse pipeline; "
- "please use image_reference.image_url or multipart input_reference"
- ),
- param="image_reference.file_id",
- code="unsupported_reference",
- )
+ if isinstance(value, list):
+ if not value:
+ return []
+ return [
+ _parse_image_reference_item(item, idx=i) for i, item in enumerate(value)
+ ]
- return _validate_reference_value(image_url, "image_reference.image_url")
+ raise ValidationException(
+ message="image_reference must be an array",
+ param="image_reference",
+ code="invalid_reference",
+ )
async def _upload_to_data_uri(file: UploadFile, param: str) -> str:
@@ -234,9 +282,8 @@ async def _upload_to_data_uri(file: UploadFile, param: str) -> str:
async def _build_references_for_json(payload: BaseModel) -> List[str]:
references: List[str] = []
- parsed_image_ref = _parse_image_reference(getattr(payload, "image_reference", None))
- if parsed_image_ref:
- references.append(parsed_image_ref)
+ parsed_refs = _parse_image_references(getattr(payload, "image_reference", None))
+ references.extend(parsed_refs)
if getattr(payload, "input_reference", None) not in (None, ""):
raise ValidationException(
message="input_reference must be uploaded as multipart/form-data file",
@@ -282,9 +329,8 @@ async def _build_payload_and_references_for_form(
code="invalid_reference",
)
- parsed_image_ref = _parse_image_reference(payload.image_reference)
- if parsed_image_ref:
- references.append(parsed_image_ref)
+ parsed_refs = _parse_image_references(payload.image_reference)
+ references.extend(parsed_refs)
return payload, references
@@ -300,7 +346,7 @@ def _multipart_create_schema(default_seconds: int) -> Dict[str, Any]:
"quality": {"type": "string", "default": "standard"},
"image_reference": {
"type": "string",
- "description": "JSON string for image_reference object",
+ "description": "JSON string for image_reference array",
},
"input_reference": {"type": "string", "format": "binary"},
},
@@ -434,7 +480,9 @@ async def create_video(request: Request):
except ValidationError as exc:
_raise_validation_error(exc)
references = await _build_references_for_json(payload)
- return await _create_video_from_payload(payload, references, require_extension=False)
+ return await _create_video_from_payload(
+ payload, references, require_extension=False
+ )
form = await request.form()
payload, references = await _build_payload_and_references_for_form(
@@ -447,7 +495,9 @@ async def create_video(request: Request):
image_reference=form.get("image_reference"),
input_reference=form.get("input_reference"),
)
- return await _create_video_from_payload(payload, references, require_extension=False)
+ return await _create_video_from_payload(
+ payload, references, require_extension=False
+ )
@router.post(
diff --git a/app/services/grok/services/video.py b/app/services/grok/services/video.py
index e43d27bd8..0fe28b45c 100644
--- a/app/services/grok/services/video.py
+++ b/app/services/grok/services/video.py
@@ -24,7 +24,11 @@
from app.core.logger import logger
from app.services.grok.services.model import ModelService
from app.services.grok.utils.download import DownloadService
-from app.services.grok.utils.process import _is_http2_error, _normalize_line, _with_idle_timeout
+from app.services.grok.utils.process import (
+ _is_http2_error,
+ _normalize_line,
+ _with_idle_timeout,
+)
from app.services.grok.utils.retry import rate_limited
from app.services.grok.utils.stream import wrap_stream_with_usage
from app.services.reverse.app_chat import AppChatReverse
@@ -39,6 +43,7 @@
_VIDEO_SEM_VALUE = 0
_APP_CHAT_MODEL = "grok-3"
_POST_ID_URL_PATTERN = r"/generated/([0-9a-fA-F-]{32,36})/"
+_REFERENCE_PLACEHOLDER_RE = re.compile(r"@(?:(?:图|image|img)\s*(\d+))", re.IGNORECASE)
@dataclass(frozen=True)
@@ -68,6 +73,52 @@ def _pick_str(value: Any) -> str:
return ""
+def _extract_last_user_prompt_and_images(
+ messages: List[Dict[str, Any]],
+) -> Tuple[str, List[str]]:
+ """Use only the last user turn so placeholder indices map to that turn's images."""
+ for msg in reversed(messages or []):
+ role = msg.get("role") or "user"
+ if role != "user":
+ continue
+
+ content = msg.get("content", "")
+ if isinstance(content, str):
+ return content.strip(), []
+ if isinstance(content, dict):
+ content = [content]
+ if not isinstance(content, list):
+ return "", []
+
+ prompt_parts: List[str] = []
+ image_urls: List[str] = []
+ for item in content:
+ if not isinstance(item, dict):
+ continue
+
+ item_type = item.get("type")
+ if item_type == "text":
+ text = item.get("text", "")
+ if isinstance(text, str) and text.strip():
+ prompt_parts.append(text.strip())
+ elif item_type == "image_url":
+ image_data = item.get("image_url", {})
+ url = ""
+ if isinstance(image_data, dict):
+ url = image_data.get("url", "")
+ elif isinstance(image_data, str):
+ url = image_data
+ if isinstance(url, str) and url.strip():
+ image_urls.append(url.strip())
+
+ prompt = "\n".join(prompt_parts).strip()
+ if not prompt and image_urls:
+ prompt = "Refer to the following content:"
+ return prompt, image_urls
+
+ return "", []
+
+
def _extract_post_id_from_video_url(video_url: str) -> Optional[str]:
if not isinstance(video_url, str) or not video_url:
return None
@@ -106,7 +157,9 @@ async def _create_public_video_link(token: str, video_url: str) -> str:
async with _new_session() as session:
response = await MediaPostLinkReverse.request(session, token, video_id)
payload = response.json() if response is not None else {}
- share_link = _pick_str(payload.get("shareLink")) if isinstance(payload, dict) else ""
+ share_link = (
+ _pick_str(payload.get("shareLink")) if isinstance(payload, dict) else ""
+ )
if share_link:
if share_link.endswith(".mp4"):
logger.info(f"Video public link created: {share_link}")
@@ -230,14 +283,20 @@ def _build_round_config(
prompt: str,
aspect_ratio: str,
resolution_name: str,
+ image_references: Optional[List[str]] = None,
) -> Dict[str, Any]:
if not plan.is_extension:
- return _build_base_config(
+ config = _build_base_config(
seed_post_id,
aspect_ratio,
resolution_name,
plan.video_length,
)
+ if image_references:
+ video_config = config["modelMap"]["videoGenModelConfig"]
+ video_config["imageReferences"] = image_references
+ video_config["isReferenceToVideo"] = True
+ return config
if not original_post_id:
raise UpstreamException(
@@ -305,7 +364,9 @@ def _extract_post_id_candidates(resp: Dict[str, Any]) -> List[Tuple[int, str]]:
return candidates
-def _apply_post_id_candidates(result: VideoRoundResult, candidates: List[Tuple[int, str]]):
+def _apply_post_id_candidates(
+ result: VideoRoundResult, candidates: List[Tuple[int, str]]
+):
for rank, value in candidates:
if rank < result.post_id_rank:
result.post_id_rank = rank
@@ -371,7 +432,9 @@ async def _iter_round_events(
rid = _pick_str(model_resp.get("responseId"))
if rid:
result.response_id = rid
- _append_unique_errors(result.stream_errors, model_resp.get("streamErrors"))
+ _append_unique_errors(
+ result.stream_errors, model_resp.get("streamErrors")
+ )
_apply_post_id_candidates(result, _extract_post_id_candidates(resp))
@@ -447,7 +510,9 @@ async def _collect_round_result(
source: str,
) -> VideoRoundResult:
result = VideoRoundResult()
- async for event_type, payload in _iter_round_events(response, model=model, source=source):
+ async for event_type, payload in _iter_round_events(
+ response, model=model, source=source
+ ):
if event_type == "done":
result = payload
return result
@@ -478,7 +543,9 @@ def _ensure_round_result(
final_round: bool,
):
if not result.post_id:
- err_type = "moderated_or_stream_errors" if result.stream_errors else "missing_post_id"
+ err_type = (
+ "moderated_or_stream_errors" if result.stream_errors else "missing_post_id"
+ )
raise UpstreamException(
message=f"Video round {round_index}/{total_rounds} missing post_id",
status_code=502,
@@ -553,6 +620,7 @@ async def _request_round_stream(
token: str,
message: str,
model_config_override: Dict[str, Any],
+ file_attachments: Optional[List[str]] = None,
) -> AsyncGenerator[bytes, None]:
async def _stream():
session = _new_session()
@@ -563,6 +631,7 @@ async def _stream():
token,
message=message,
model=_APP_CHAT_MODEL,
+ file_attachments=file_attachments,
tool_overrides={"videoGen": True},
model_config_override=model_config_override,
)
@@ -650,7 +719,9 @@ def ensure_role(self) -> List[str]:
self.role_sent = True
return [self._sse(role="assistant")]
- def emit_progress(self, *, round_index: int, total_rounds: int, progress: Any) -> List[str]:
+ def emit_progress(
+ self, *, round_index: int, total_rounds: int, progress: Any
+ ) -> List[str]:
if not self.show_think:
return []
@@ -661,7 +732,9 @@ def emit_progress(self, *, round_index: int, total_rounds: int, progress: Any) -
progress_text = _format_progress(progress)
chunks.append(
- self._sse(f"[round={round_index}/{total_rounds}] progress={progress_text}%\n")
+ self._sse(
+ f"[round={round_index}/{total_rounds}] progress={progress_text}%\n"
+ )
)
return chunks
@@ -763,30 +836,66 @@ async def generate(
model_config_override=model_config_override,
)
- async def generate_from_image(
+ async def generate_from_images(
self,
token: str,
prompt: str,
- image_url: str,
+ image_urls: list[str],
+ asset_ids: list[str],
aspect_ratio: str = "3:2",
video_length: int = 6,
resolution: str = "480p",
preset: str = "normal",
) -> AsyncGenerator[bytes, None]:
- """Single-round image-to-video generation stream."""
- post_id = await self.create_image_post(token, image_url)
- model_config_override = _build_base_config(
- post_id,
- aspect_ratio,
- resolution,
- video_length,
+ """Generate video from one or more reference images."""
+ if not image_urls:
+ raise ValidationException("At least one reference image is required")
+ if len(image_urls) != len(asset_ids):
+ raise ValidationException("Reference image metadata mismatch")
+ logger.info(
+ f"Image to video: prompt='{prompt[:50]}...', images={len(image_urls)}"
)
+ post_id = await self.create_post(token, prompt)
+ mode_map = {
+ "fun": "--mode=extremely-crazy",
+ "normal": "--mode=normal",
+ "spicy": "--mode=extremely-spicy-or-crazy",
+ }
+ mode_flag = mode_map.get(preset, "--mode=custom")
+ message = f"{prompt} {mode_flag}"
+ model_config_override = {
+ "modelMap": {
+ "videoGenModelConfig": {
+ "aspectRatio": aspect_ratio,
+ "imageReferences": image_urls,
+ "isReferenceToVideo": True,
+ "parentPostId": post_id,
+ "resolutionName": resolution,
+ "videoLength": video_length,
+ }
+ }
+ }
return await _request_round_stream(
token=token,
- message=_build_message(prompt, preset),
+ message=message,
model_config_override=model_config_override,
+ file_attachments=asset_ids,
)
+ @staticmethod
+ def _replace_reference_placeholders(prompt: str, asset_ids: list[str]) -> str:
+ """Replace @图N / @imageN placeholders with uploaded asset ids."""
+
+ def _replace(match: re.Match[str]) -> str:
+ index = int(match.group(1)) - 1
+ if index < 0 or index >= len(asset_ids):
+ raise ValidationException(
+ f"Reference placeholder {match.group(0)} has no matching uploaded image"
+ )
+ return f"@{asset_ids[index]}"
+
+ return _REFERENCE_PLACEHOLDER_RE.sub(_replace, prompt)
+
@staticmethod
async def completions(
model: str,
@@ -807,313 +916,389 @@ async def completions(
else:
show_think = reasoning_effort != "none"
- from app.services.grok.services.chat import MessageExtractor
from app.services.grok.utils.upload import UploadService
- prompt, _, image_attachments = MessageExtractor.extract(messages)
+ prompt, image_attachments = _extract_last_user_prompt_and_images(messages)
- pool_candidates = ModelService.pool_candidates_for_model(model)
- token_info = token_mgr.get_token_for_video(
- resolution=resolution,
- video_length=video_length,
- pool_candidates=pool_candidates,
- )
+ max_token_retries = max(1, int(get_config("retry.max_retry") or 1))
+ last_error: Exception | None = None
- if not token_info:
- raise AppException(
- message="No available tokens. Please try again later.",
- error_type=ErrorType.RATE_LIMIT.value,
- code="rate_limit_exceeded",
- status_code=429,
+ for attempt in range(max_token_retries):
+ pool_candidates = ModelService.pool_candidates_for_model(model)
+ token_info = token_mgr.get_token_for_video(
+ resolution=resolution,
+ video_length=video_length,
+ pool_candidates=pool_candidates,
)
- token = token_info.token
- if token.startswith("sso="):
- token = token[4:]
+ if not token_info:
+ if last_error:
+ raise last_error
+ raise AppException(
+ message="No available tokens. Please try again later.",
+ error_type=ErrorType.RATE_LIMIT.value,
+ code="rate_limit_exceeded",
+ status_code=429,
+ )
+
+ token = token_info.token
+ if token.startswith("sso="):
+ token = token[4:]
- pool_name = token_mgr.get_pool_name_for_token(token) or BASIC_POOL_NAME
- is_super_pool = pool_name != BASIC_POOL_NAME
+ pool_name = token_mgr.get_pool_name_for_token(token) or BASIC_POOL_NAME
+ is_super_pool = pool_name != BASIC_POOL_NAME
- requested_resolution = resolution
- should_upscale = requested_resolution == "720p" and pool_name == BASIC_POOL_NAME
- generation_resolution = "480p" if should_upscale else requested_resolution
- upscale_timing = _resolve_upscale_timing() if should_upscale else "complete"
+ requested_resolution = resolution
+ should_upscale = (
+ requested_resolution == "720p" and pool_name == BASIC_POOL_NAME
+ )
+ generation_resolution = "480p" if should_upscale else requested_resolution
+ upscale_timing = _resolve_upscale_timing() if should_upscale else "complete"
- target_length = int(video_length or 6)
- round_plan = _build_round_plan(target_length, is_super=is_super_pool)
- total_rounds = len(round_plan)
+ target_length = int(video_length or 6)
+ round_plan = _build_round_plan(target_length, is_super=is_super_pool)
- service = VideoService()
- message = _build_message(prompt, preset)
+ prompt_text = prompt
+ image_urls: List[str] = []
+ asset_ids: List[str] = []
- image_url = None
- if image_attachments:
- upload_service = UploadService()
try:
- if len(image_attachments) > 1:
- logger.info(
- "Video generation supports a single reference image; using the first one."
+ if image_attachments:
+ if len(image_attachments) > 7:
+ raise ValidationException(
+ "Video generation supports at most 7 reference images"
+ )
+ upload_service = UploadService()
+ try:
+ for attach_data in image_attachments:
+ asset_id, file_uri = await upload_service.upload_file(
+ attach_data, token
+ )
+ asset_ids.append(asset_id)
+ image_urls.append(f"https://assets.grok.com/{file_uri}")
+ prompt_text = VideoService._replace_reference_placeholders(
+ prompt_text, asset_ids
+ )
+ logger.info(
+ f"Images uploaded for video: count={len(image_urls)}"
+ )
+ finally:
+ await upload_service.close()
+ elif _REFERENCE_PLACEHOLDER_RE.search(prompt_text):
+ raise ValidationException(
+ "Reference placeholders require uploaded images"
)
- attach_data = image_attachments[0]
- _, file_uri = await upload_service.upload_file(attach_data, token)
- image_url = f"https://assets.grok.com/{file_uri}"
- logger.info(f"Image uploaded for video: {image_url}")
- finally:
- await upload_service.close()
-
- if image_url:
- seed_post_id = await service.create_image_post(token, image_url)
- else:
- seed_post_id = await service.create_post(token, prompt)
- model_info = ModelService.get(model)
- effort = (
- EffortType.HIGH
- if (model_info and model_info.cost.value == "high")
- else EffortType.LOW
- )
+ service = VideoService()
+ message = _build_message(prompt_text, preset)
+ seed_post_id = await service.create_post(token, prompt_text)
- async def _run_round_collect(
- plan: VideoRoundPlan,
- *,
- seed_id: str,
- last_id: str,
- original_id: Optional[str],
- source: str,
- ) -> VideoRoundResult:
- config_override = _build_round_config(
- plan,
- seed_post_id=seed_id,
- last_post_id=last_id,
- original_post_id=original_id,
- prompt=prompt,
- aspect_ratio=aspect_ratio,
- resolution_name=generation_resolution,
- )
- response = await _request_round_stream(
- token=token,
- message=message,
- model_config_override=config_override,
- )
- return await _collect_round_result(response, model=model, source=source)
-
- async def _stream_chain() -> AsyncGenerator[str, None]:
- writer = _VideoChainSSEWriter(model, show_think)
- seed_id = seed_post_id
- last_id = seed_id
- original_id: Optional[str] = seed_id
- final_result: Optional[VideoRoundResult] = None
+ model_info = ModelService.get(model)
+ effort = (
+ EffortType.HIGH
+ if (model_info and model_info.cost.value == "high")
+ else EffortType.LOW
+ )
- try:
- for plan in round_plan:
+ async def _run_round_collect(
+ plan: VideoRoundPlan,
+ *,
+ seed_id: str,
+ last_id: str,
+ original_id: Optional[str],
+ source: str,
+ ) -> VideoRoundResult:
config_override = _build_round_config(
plan,
seed_post_id=seed_id,
last_post_id=last_id,
original_post_id=original_id,
- prompt=prompt,
+ prompt=prompt_text,
aspect_ratio=aspect_ratio,
resolution_name=generation_resolution,
+ image_references=image_urls if plan.round_index == 1 else None,
)
response = await _request_round_stream(
token=token,
message=message,
model_config_override=config_override,
+ file_attachments=asset_ids if plan.round_index == 1 else None,
+ )
+ return await _collect_round_result(
+ response, model=model, source=source
)
- round_result = VideoRoundResult()
- async for event_type, payload in _iter_round_events(
- response,
- model=model,
- source=f"stream-round-{plan.round_index}",
- ):
- if event_type == "progress":
- for chunk in writer.emit_progress(
+ async def _stream_chain() -> AsyncGenerator[str, None]:
+ writer = _VideoChainSSEWriter(model, show_think)
+ seed_id = seed_post_id
+ last_id = seed_id
+ original_id: Optional[str] = seed_id
+ final_result: Optional[VideoRoundResult] = None
+
+ try:
+ for plan in round_plan:
+ config_override = _build_round_config(
+ plan,
+ seed_post_id=seed_id,
+ last_post_id=last_id,
+ original_post_id=original_id,
+ prompt=prompt_text,
+ aspect_ratio=aspect_ratio,
+ resolution_name=generation_resolution,
+ image_references=image_urls
+ if plan.round_index == 1
+ else None,
+ )
+ response = await _request_round_stream(
+ token=token,
+ message=message,
+ model_config_override=config_override,
+ file_attachments=asset_ids
+ if plan.round_index == 1
+ else None,
+ )
+
+ round_result = VideoRoundResult()
+ async for event_type, payload in _iter_round_events(
+ response,
+ model=model,
+ source=f"stream-round-{plan.round_index}",
+ ):
+ if event_type == "progress":
+ for chunk in writer.emit_progress(
+ round_index=plan.round_index,
+ total_rounds=plan.total_rounds,
+ progress=payload,
+ ):
+ yield chunk
+ elif event_type == "done":
+ round_result = payload
+
+ _ensure_round_result(
+ round_result,
round_index=plan.round_index,
total_rounds=plan.total_rounds,
- progress=payload,
+ final_round=(plan.round_index == plan.total_rounds),
+ )
+
+ if (
+ should_upscale
+ and upscale_timing == "single"
+ and round_result.video_url
):
+ for chunk in writer.emit_note(
+ f"[round={plan.round_index}/{plan.total_rounds}] 正在对当前轮结果进行超分辨率\n"
+ ):
+ yield chunk
+ upgraded_url, upscaled = await _upscale_video_url(
+ token, round_result.video_url
+ )
+ if upscaled:
+ round_result.video_url = upgraded_url
+ else:
+ logger.warning(
+ "Video upscale failed in single mode, fallback to 480p result"
+ )
+
+ if plan.round_index == 1 and round_result.post_id:
+ original_id = round_result.post_id
+ if round_result.post_id:
+ last_id = round_result.post_id
+
+ if plan.round_index == plan.total_rounds:
+ final_result = round_result
+
+ if final_result is None:
+ raise UpstreamException(
+ message="Video generation produced no final round",
+ status_code=502,
+ details={"type": "empty_video_stream"},
+ )
+
+ final_video_url = final_result.video_url
+ if should_upscale and upscale_timing == "complete":
+ for chunk in writer.emit_note("正在对视频进行超分辨率\n"):
yield chunk
- elif event_type == "done":
- round_result = payload
-
- _ensure_round_result(
- round_result,
- round_index=plan.round_index,
- total_rounds=plan.total_rounds,
- final_round=(plan.round_index == plan.total_rounds),
- )
+ final_video_url, upscaled = await _upscale_video_url(
+ token, final_video_url
+ )
+ if not upscaled:
+ logger.warning(
+ "Video upscale failed, fallback to 480p result"
+ )
- if should_upscale and upscale_timing == "single" and round_result.video_url:
- for chunk in writer.emit_note(
- f"[round={plan.round_index}/{plan.total_rounds}] 正在对当前轮结果进行超分辨率\n"
- ):
+ if _public_asset_enabled():
+ for chunk in writer.emit_note("正在生成可公开访问链接\n"):
+ yield chunk
+ final_video_url = await _create_public_video_link(
+ token, final_video_url
+ )
+
+ dl_service = DownloadService()
+ try:
+ rendered = await dl_service.render_video(
+ final_video_url,
+ token,
+ final_result.thumbnail_url,
+ )
+ finally:
+ await dl_service.close()
+
+ for chunk in writer.emit_content(rendered):
yield chunk
- upgraded_url, upscaled = await _upscale_video_url(
- token, round_result.video_url
+ for chunk in writer.finish():
+ yield chunk
+ except asyncio.CancelledError:
+ logger.debug(
+ "Video stream chain cancelled by client",
+ extra={"model": model},
+ )
+ raise
+ except UpstreamException as e:
+ if rate_limited(e):
+ await token_mgr.mark_rate_limited(token)
+ raise
+
+ async def _collect_chain() -> Dict[str, Any]:
+ seed_id = seed_post_id
+ last_id = seed_id
+ original_id: Optional[str] = seed_id
+ final_result: Optional[VideoRoundResult] = None
+
+ for plan in round_plan:
+ round_result = await _run_round_collect(
+ plan,
+ seed_id=seed_id,
+ last_id=last_id,
+ original_id=original_id,
+ source=f"collect-round-{plan.round_index}",
+ )
+
+ _ensure_round_result(
+ round_result,
+ round_index=plan.round_index,
+ total_rounds=plan.total_rounds,
+ final_round=(plan.round_index == plan.total_rounds),
+ )
+
+ if (
+ should_upscale
+ and upscale_timing == "single"
+ and round_result.video_url
+ ):
+ upgraded_url, upscaled = await _upscale_video_url(
+ token, round_result.video_url
+ )
+ if upscaled:
+ round_result.video_url = upgraded_url
+ else:
+ logger.warning(
+ "Video upscale failed in single mode, fallback to 480p result"
+ )
+
+ if plan.round_index == 1 and round_result.post_id:
+ original_id = round_result.post_id
+ if round_result.post_id:
+ last_id = round_result.post_id
+
+ if plan.round_index == plan.total_rounds:
+ final_result = round_result
+
+ if final_result is None:
+ raise UpstreamException(
+ message="Video generation produced no final round",
+ status_code=502,
+ details={"type": "empty_video_stream"},
)
- if upscaled:
- round_result.video_url = upgraded_url
- else:
+
+ final_video_url = final_result.video_url
+ if should_upscale and upscale_timing == "complete":
+ final_video_url, upscaled = await _upscale_video_url(
+ token, final_video_url
+ )
+ if not upscaled:
logger.warning(
- "Video upscale failed in single mode, fallback to 480p result"
+ "Video upscale failed, fallback to 480p result"
)
- if plan.round_index == 1 and round_result.post_id:
- original_id = round_result.post_id
- if round_result.post_id:
- last_id = round_result.post_id
+ if _public_asset_enabled():
+ final_video_url = await _create_public_video_link(
+ token, final_video_url
+ )
- if plan.round_index == plan.total_rounds:
- final_result = round_result
+ dl_service = DownloadService()
+ try:
+ content = await dl_service.render_video(
+ final_video_url,
+ token,
+ final_result.thumbnail_url,
+ )
+ finally:
+ await dl_service.close()
+
+ return {
+ "id": final_result.response_id,
+ "object": "chat.completion",
+ "created": int(time.time()),
+ "model": model,
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": content,
+ "refusal": None,
+ },
+ "finish_reason": "stop",
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ "total_tokens": 0,
+ },
+ }
- if final_result is None:
- raise UpstreamException(
- message="Video generation produced no final round",
- status_code=502,
- details={"type": "empty_video_stream"},
+ if is_stream:
+ return wrap_stream_with_usage(
+ _stream_chain(), token_mgr, token, model
)
- final_video_url = final_result.video_url
- if should_upscale and upscale_timing == "complete":
- for chunk in writer.emit_note("正在对视频进行超分辨率\n"):
- yield chunk
- final_video_url, upscaled = await _upscale_video_url(token, final_video_url)
- if not upscaled:
- logger.warning("Video upscale failed, fallback to 480p result")
-
- if _public_asset_enabled():
- for chunk in writer.emit_note("正在生成可公开访问链接\n"):
- yield chunk
- final_video_url = await _create_public_video_link(token, final_video_url)
+ try:
+ result = await _collect_chain()
+ except UpstreamException as e:
+ if rate_limited(e):
+ await token_mgr.mark_rate_limited(token)
+ raise
- dl_service = DownloadService()
try:
- rendered = await dl_service.render_video(
- final_video_url,
- token,
- final_result.thumbnail_url,
+ await token_mgr.consume(token, effort)
+ logger.debug(
+ f"Video completed, recorded usage (effort={effort.value})"
)
- finally:
- await dl_service.close()
+ except Exception as e:
+ logger.warning(f"Failed to record video usage: {e}")
- for chunk in writer.emit_content(rendered):
- yield chunk
- for chunk in writer.finish():
- yield chunk
- except asyncio.CancelledError:
- logger.debug("Video stream chain cancelled by client", extra={"model": model})
- raise
+ return result
except UpstreamException as e:
+ last_error = e
if rate_limited(e):
await token_mgr.mark_rate_limited(token)
- raise
-
- async def _collect_chain() -> Dict[str, Any]:
- seed_id = seed_post_id
- last_id = seed_id
- original_id: Optional[str] = seed_id
- final_result: Optional[VideoRoundResult] = None
-
- for plan in round_plan:
- round_result = await _run_round_collect(
- plan,
- seed_id=seed_id,
- last_id=last_id,
- original_id=original_id,
- source=f"collect-round-{plan.round_index}",
- )
-
- _ensure_round_result(
- round_result,
- round_index=plan.round_index,
- total_rounds=plan.total_rounds,
- final_round=(plan.round_index == plan.total_rounds),
- )
-
- if should_upscale and upscale_timing == "single" and round_result.video_url:
- upgraded_url, upscaled = await _upscale_video_url(
- token, round_result.video_url
+ logger.warning(
+ f"Token {token[:10]}... rate limited (429), "
+ f"trying next token (attempt {attempt + 1}/{max_token_retries})"
)
- if upscaled:
- round_result.video_url = upgraded_url
- else:
- logger.warning(
- "Video upscale failed in single mode, fallback to 480p result"
- )
-
- if plan.round_index == 1 and round_result.post_id:
- original_id = round_result.post_id
- if round_result.post_id:
- last_id = round_result.post_id
-
- if plan.round_index == plan.total_rounds:
- final_result = round_result
-
- if final_result is None:
- raise UpstreamException(
- message="Video generation produced no final round",
- status_code=502,
- details={"type": "empty_video_stream"},
- )
-
- final_video_url = final_result.video_url
- if should_upscale and upscale_timing == "complete":
- final_video_url, upscaled = await _upscale_video_url(token, final_video_url)
- if not upscaled:
- logger.warning("Video upscale failed, fallback to 480p result")
-
- if _public_asset_enabled():
- final_video_url = await _create_public_video_link(token, final_video_url)
-
- dl_service = DownloadService()
- try:
- content = await dl_service.render_video(
- final_video_url,
- token,
- final_result.thumbnail_url,
- )
- finally:
- await dl_service.close()
-
- return {
- "id": final_result.response_id,
- "object": "chat.completion",
- "created": int(time.time()),
- "model": model,
- "choices": [
- {
- "index": 0,
- "message": {
- "role": "assistant",
- "content": content,
- "refusal": None,
- },
- "finish_reason": "stop",
- }
- ],
- "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
- }
-
- if is_stream:
- return wrap_stream_with_usage(_stream_chain(), token_mgr, token, model)
-
- try:
- result = await _collect_chain()
- except UpstreamException as e:
- if rate_limited(e):
- await token_mgr.mark_rate_limited(token)
- raise
-
- try:
- await token_mgr.consume(token, effort)
- logger.debug(
- f"Video completed, recorded usage (effort={effort.value})"
- )
- except Exception as e:
- logger.warning(f"Failed to record video usage: {e}")
+ continue
+ raise
- return result
+ if last_error:
+ raise last_error
+ raise AppException(
+ message="No available tokens. Please try again later.",
+ error_type=ErrorType.RATE_LIMIT.value,
+ code="rate_limit_exceeded",
+ status_code=429,
+ )
class VideoStreamProcessor:
@@ -1165,7 +1350,9 @@ async def close(self):
await self._dl_service.close()
self._dl_service = None
- async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, None]:
+ async def process(
+ self, response: AsyncIterable[bytes]
+ ) -> AsyncGenerator[str, None]:
result = VideoRoundResult()
try:
async for event_type, payload in _iter_round_events(
@@ -1194,14 +1381,18 @@ async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, N
if self.upscale_on_finish:
for chunk in self.writer.emit_note("正在对视频进行超分辨率\n"):
yield chunk
- final_video_url, upscaled = await _upscale_video_url(self.token, final_video_url)
+ final_video_url, upscaled = await _upscale_video_url(
+ self.token, final_video_url
+ )
if not upscaled:
logger.warning("Video upscale failed, fallback to 480p result")
if self.enable_public_asset:
for chunk in self.writer.emit_note("正在生成可公开访问链接\n"):
yield chunk
- final_video_url = await _create_public_video_link(self.token, final_video_url)
+ final_video_url = await _create_public_video_link(
+ self.token, final_video_url
+ )
rendered = await self._get_dl().render_video(
final_video_url,
@@ -1213,7 +1404,9 @@ async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, N
for chunk in self.writer.finish():
yield chunk
except asyncio.CancelledError:
- logger.debug("Video stream cancelled by client", extra={"model": self.model})
+ logger.debug(
+ "Video stream cancelled by client", extra={"model": self.model}
+ )
raise
finally:
await self.close()
@@ -1265,12 +1458,16 @@ async def process(self, response: AsyncIterable[bytes]) -> Dict[str, Any]:
final_video_url = result.video_url
if self.upscale_on_finish:
- final_video_url, upscaled = await _upscale_video_url(self.token, final_video_url)
+ final_video_url, upscaled = await _upscale_video_url(
+ self.token, final_video_url
+ )
if not upscaled:
logger.warning("Video upscale failed, fallback to 480p result")
if self.enable_public_asset:
- final_video_url = await _create_public_video_link(self.token, final_video_url)
+ final_video_url = await _create_public_video_link(
+ self.token, final_video_url
+ )
content = await self._get_dl().render_video(
final_video_url,
@@ -1294,7 +1491,11 @@ async def process(self, response: AsyncIterable[bytes]) -> Dict[str, Any]:
"finish_reason": "stop",
}
],
- "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
+ "usage": {
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ "total_tokens": 0,
+ },
}
finally:
await self.close()
diff --git a/docs/README.en.md b/docs/README.en.md
index ff3b89815..8d240a7ac 100644
--- a/docs/README.en.md
+++ b/docs/README.en.md
@@ -196,7 +196,8 @@ curl http://localhost:8000/v1/chat/completions \
- `grok-imagine-1.0-fast` streaming output in `/chat/completions` only returns the final image, hiding intermediate preview images.
- `grok-imagine-1.0-fast` streaming URL output will retain the original image filename (without appending `-final`).
- `grok-imagine-1.0-edit` requires an image; if multiple are provided, the **last 3** images and last text are used.
-- `grok-imagine-1.0-video` supports text-to-video and image-to-video via `image_url` (**only the first image is used**).
+- `grok-imagine-1.0-video` supports text-to-video and multi-image reference video: pass up to `7` `image_url` blocks and use placeholders like `@图1`, `@图2` in the prompt; the server will replace them with the corresponding `assetId` values.
+- `@图N` placeholders map to `image_url` order; referencing a missing image index returns an error.
- Any other parameters will be discarded and ignored.
@@ -361,7 +362,7 @@ curl http://localhost:8000/v1/videos \
| `size` | string | Frame size (mapped to aspect_ratio) | `1280x720`, `720x1280`, `1792x1024`, `1024x1792`, `1024x1024` |
| `seconds` | integer | Target duration (seconds) | `6` ~ `30` |
| `quality` | string | Video quality (mapped to resolution) | `standard`, `high` |
-| `image_reference` | object/string | Reference image (optional) | `{"image_url":"https://..."}` or Data URI |
+| `image_reference` | array | Reference images (optional) | OpenAI-compatible content block array (`[{"type":"image_url"...}]`) or an array of URL strings; single-image requests should use a one-item array |
| `input_reference` | file | multipart reference image (optional) | `png`, `jpg`, `webp` |
**Notes**:
@@ -369,7 +370,7 @@ curl http://localhost:8000/v1/videos \
- Server-side chain extension now supports 6~30 seconds automatically, so **`/v1/video/extend` is not required**.
- `quality=standard` maps to `480p`; `quality=high` maps to `720p`.
- For basic-pool requests at `720p`, generation falls back to `480p` first, then upscales according to `video.upscale_timing`.
-- If both `image_reference` and `input_reference` are provided, references are processed in order; the video pipeline uses the first image only.
+- `image_reference` now uses array format only and supports up to 7 images; single-image requests should also use a one-item array. If both `image_reference` and `input_reference` are provided, references are processed and merged in order; you can use placeholders like `@图1`, `@图2` in prompts.
diff --git a/readme.md b/readme.md
index 75cae6ba5..b75c2fee9 100644
--- a/readme.md
+++ b/readme.md
@@ -197,7 +197,8 @@ curl http://localhost:8000/v1/chat/completions \
- `grok-imagine-1.0-fast` 流式 URL 出图会保持原始图片名(不追加 `-final` 后缀)。
- 当图片疑似被审查拦截导致无最终图时,若开启 `image.blocked_parallel_enabled`,服务端会按 `image.blocked_parallel_attempts` 自动并行补偿生成,并优先使用不同 token;若仍无满足 `image.final_min_bytes` 的最终图则返回失败。
- `grok-imagine-1.0-edit` 必须提供图片,多图默认取**最后 3 张**与最后一个文本。
-- `grok-imagine-1.0-video` 支持文生视频与图生视频(通过 `image_url` 传参考图,**仅取第 1 张**)。
+- `grok-imagine-1.0-video` 支持文生视频与多图参考视频:可通过多个 `image_url` 传最多 `7` 张参考图,并在文本中使用 `@图1`、`@图2` 这类占位符;服务端会自动替换为对应 `assetId`。
+- `@图N` 与 `image_url` 的顺序一一对应;若引用了不存在的图片序号,会直接报错。
- 除上述外的其他参数将自动丢弃并忽略。
@@ -362,7 +363,7 @@ curl http://localhost:8000/v1/videos \
| `size` | string | 画面比例(会映射到 aspect_ratio) | `1280x720`, `720x1280`, `1792x1024`, `1024x1792`, `1024x1024` |
| `seconds` | integer | 目标时长(秒) | `6` ~ `30` |
| `quality` | string | 视频质量(映射到 resolution) | `standard`, `high` |
-| `image_reference` | object/string | 参考图(可选) | `{"image_url":"https://..."}` 或 Data URI |
+| `image_reference` | array | 参考图(可选) | 兼容 OpenAI content block 数组格式 (`[{"type":"image_url"...}]`) 或纯 URL 字符串数组;单图也请传单元素数组 |
| `input_reference` | file | multipart 参考图(可选) | `png`, `jpg`, `webp` |
**注意事项**:
@@ -370,7 +371,7 @@ curl http://localhost:8000/v1/videos \
- 服务端已支持 6~30 秒自动链式扩展,**无需使用 `/v1/video/extend`**。
- `quality=standard` 对应 `480p`;`quality=high` 对应 `720p`。
- 基础号池请求 `720p` 时会先产出 `480p` 再按 `video.upscale_timing` 执行超分。
-- `image_reference` 与 `input_reference` 同时传入时,会按顺序作为参考图输入;视频链路只使用第 1 张。
+- `image_reference` 统一使用数组格式,最多可传 7 张参考图;单图场景也请传单元素数组。`input_reference` 主要以表单上传参考图;若两者同时传入,会按顺序作为参考图合并输入;可在提示词中使用 `@图1`、`@图2`。