diff --git a/_public/static/function/css/video.css b/_public/static/function/css/video.css index a634b7a97..82f339a67 100644 --- a/_public/static/function/css/video.css +++ b/_public/static/function/css/video.css @@ -222,9 +222,8 @@ body { .ref-name { font-size: 11px; color: var(--accents-4); - overflow: hidden; - text-overflow: ellipsis; - white-space: nowrap; + white-space: pre-wrap; + overflow-wrap: anywhere; max-width: 100%; display: inline-block; } @@ -253,6 +252,18 @@ body { resize: vertical; } +.prompt-tip { + margin-top: 8px; + font-size: 11px; + color: var(--accents-4); + line-height: 1.5; +} + +.ref-textarea { + min-height: 88px; + resize: vertical; +} + .status-header { display: flex; align-items: center; diff --git a/_public/static/function/js/video.js b/_public/static/function/js/video.js index 9aea422be..a82e3f963 100644 --- a/_public/static/function/js/video.js +++ b/_public/static/function/js/video.js @@ -31,12 +31,13 @@ let contentBuffer = ''; let collectingContent = false; let startAt = 0; - let fileDataUrl = ''; + let fileDataUrls = []; let elapsedTimer = null; let lastProgress = 0; let currentPreviewItem = null; let previewCount = 0; const DEFAULT_REASONING_EFFORT = 'low'; + const MAX_REFERENCE_IMAGES = 7; function toast(message, type) { if (typeof showToast === 'function') { @@ -229,15 +230,45 @@ } function clearFileSelection() { - fileDataUrl = ''; + fileDataUrls = []; if (imageFileInput) { imageFileInput.value = ''; } if (imageFileName) { - imageFileName.textContent = t('common.noFileSelected'); + imageFileName.textContent = t('video.noReferenceSelected'); } } + function updateReferenceSummary(names) { + if (!imageFileName) return; + if (!names || !names.length) { + imageFileName.textContent = t('video.noReferenceSelected'); + return; + } + imageFileName.textContent = names.join('\n'); + } + + function parseReferenceUrls(value) { + return (value || '') + .split(/\r?\n/) + .map(item => item.trim()) + .filter(Boolean); + } + + function 
getReferenceImages() { + const rawUrls = imageUrlInput ? parseReferenceUrls(imageUrlInput.value) : []; + if (fileDataUrls.length && rawUrls.length) { + toast(t('video.referenceConflict'), 'error'); + throw new Error('invalid_reference'); + } + const images = fileDataUrls.length ? [...fileDataUrls] : rawUrls; + if (images.length > MAX_REFERENCE_IMAGES) { + toast(t('video.referenceLimit'), 'error'); + throw new Error('too_many_references'); + } + return images; + } + function normalizeAuthHeader(authHeader) { if (!authHeader) return ''; if (authHeader.startsWith('Bearer ')) { @@ -260,12 +291,7 @@ async function createVideoTask(authHeader) { const prompt = promptInput ? promptInput.value.trim() : ''; - const rawUrl = imageUrlInput ? imageUrlInput.value.trim() : ''; - if (fileDataUrl && rawUrl) { - toast(t('video.referenceConflict'), 'error'); - throw new Error('invalid_reference'); - } - const imageUrl = fileDataUrl || rawUrl; + const imageUrls = getReferenceImages(); const res = await fetch('/v1/function/video/start', { method: 'POST', headers: { @@ -274,7 +300,7 @@ }, body: JSON.stringify({ prompt, - image_url: imageUrl || null, + image_urls: imageUrls, reasoning_effort: DEFAULT_REASONING_EFFORT, aspect_ratio: ratioSelect ? ratioSelect.value : '3:2', video_length: lengthSelect ? parseInt(lengthSelect.value, 10) : 6, @@ -604,31 +630,38 @@ if (imageFileInput) { imageFileInput.addEventListener('change', () => { - const file = imageFileInput.files && imageFileInput.files[0]; - if (!file) { + const files = imageFileInput.files ? 
Array.from(imageFileInput.files) : []; + if (!files.length) { + clearFileSelection(); + return; + } + if (files.length > MAX_REFERENCE_IMAGES) { clearFileSelection(); + toast(t('video.referenceLimit'), 'error'); return; } if (imageUrlInput && imageUrlInput.value.trim()) { imageUrlInput.value = ''; } - if (imageFileName) { - imageFileName.textContent = file.name; - } - const reader = new FileReader(); - reader.onload = () => { - if (typeof reader.result === 'string') { - fileDataUrl = reader.result; - } else { - fileDataUrl = ''; - toast(t('common.fileReadFailed'), 'error'); - } - }; - reader.onerror = () => { - fileDataUrl = ''; + Promise.all(files.map(file => new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = () => { + if (typeof reader.result === 'string') { + resolve({ name: file.name, data: reader.result }); + } else { + reject(new Error('read_failed')); + } + }; + reader.onerror = () => reject(new Error('read_failed')); + reader.readAsDataURL(file); + }))).then(items => { + fileDataUrls = items.map(item => item.data); + updateReferenceSummary(items.map((item, index) => `${index + 1}. ${item.name}`)); + }).catch(() => { + fileDataUrls = []; toast(t('common.fileReadFailed'), 'error'); - }; - reader.readAsDataURL(file); + updateReferenceSummary([]); + }); }); } @@ -646,9 +679,18 @@ if (imageUrlInput) { imageUrlInput.addEventListener('input', () => { - if (imageUrlInput.value.trim() && fileDataUrl) { + const urls = parseReferenceUrls(imageUrlInput.value); + if (urls.length > MAX_REFERENCE_IMAGES) { + toast(t('video.referenceLimit'), 'error'); + } + if (imageUrlInput.value.trim() && fileDataUrls.length) { clearFileSelection(); } + if (urls.length) { + updateReferenceSummary(urls.map((url, index) => `${index + 1}. 
${url}`)); + } else if (!fileDataUrls.length) { + updateReferenceSummary([]); + } }); } diff --git a/_public/static/function/pages/video.html b/_public/static/function/pages/video.html index 7b57f64f9..a56e176ee 100644 --- a/_public/static/function/pages/video.html +++ b/_public/static/function/pages/video.html @@ -51,15 +51,16 @@

Video
- + +
多图参考可在提示词中使用 @图1 到 @图7,按参考图顺序对应。
- +
- 未选择文件 + 未选择参考图
@@ -95,7 +96,7 @@

Video
- +
diff --git a/_public/static/i18n/locales/en.json b/_public/static/i18n/locales/en.json index f9252c136..999f699e9 100644 --- a/_public/static/i18n/locales/en.json +++ b/_public/static/i18n/locales/en.json @@ -503,13 +503,14 @@ "video": { "pageTitle": "Grok2API - Video Generation", "title": "Video Generation", - "subtitle": "Generate short videos with reference images and preset styles.", + "subtitle": "Generate short videos with up to 7 reference images, @图N placeholders, and preset styles.", "startGenerate": "Generate", "genSettings": "Generation Settings", "prompt": "Prompt", - "promptPlaceholder": "e.g.: neon rain at night on the street, slow motion, film grain", + "promptPlaceholder": "e.g.: @图1 neon rainy street at night, @图2 subject looking back and smiling, slow motion, film grain", + "promptTip": "Use @图1 to @图7 in the prompt to reference images by upload order.", "referenceImage": "Reference Image", - "referenceImagePlaceholder": "https://... or data:image/...", + "referenceImagePlaceholder": "One https://... or data:image/... 
per line, up to 7 images", "aspectRatio": "Aspect Ratio", "ratio3_2": "3:2 Landscape", "ratio2_3": "2:3 Portrait", @@ -538,7 +539,9 @@ "superResolution": "Super Resolution", "superResolutionInProgress": "Super resolution in progress", "alreadyGenerating": "Already generating", - "referenceConflict": "Reference image: choose either URL/Base64 or file upload", + "referenceConflict": "Reference images: choose either URL/Base64 list or file upload", + "referenceLimit": "A maximum of 7 reference images is supported", + "noReferenceSelected": "No reference images selected", "downloadFailed": "Download failed, please check if the video link is accessible", "sec6": "6s", "sec10": "10s", diff --git a/_public/static/i18n/locales/zh.json b/_public/static/i18n/locales/zh.json index 409aa3dd2..329ed50ba 100644 --- a/_public/static/i18n/locales/zh.json +++ b/_public/static/i18n/locales/zh.json @@ -503,13 +503,14 @@ "video": { "pageTitle": "Grok2API - Video 视频生成", "title": "Video 视频生成", - "subtitle": "生成短视频,支持参考图与多种预设风格。", + "subtitle": "生成短视频,支持最多 7 张参考图、@图N 引用与多种预设风格。", "startGenerate": "开始生成", "genSettings": "生成设置", "prompt": "提示词", - "promptPlaceholder": "例如:街头霓虹雨夜,慢镜头,胶片质感", + "promptPlaceholder": "例如:@图1街头霓虹雨夜,@图2人物回头微笑,慢镜头,胶片质感", + "promptTip": "多图参考可在提示词中使用 @图1 到 @图7,按参考图顺序对应。", "referenceImage": "参考图", - "referenceImagePlaceholder": "https://... 或 data:image/...", + "referenceImagePlaceholder": "每行一个 https://... 
或 data:image/...,最多 7 张", "aspectRatio": "画面比例", "ratio3_2": "3:2 横构图", "ratio2_3": "2:3 竖构图", @@ -538,7 +539,9 @@ "superResolution": "超分辨率", "superResolutionInProgress": "超分辨率中", "alreadyGenerating": "已在生成中", - "referenceConflict": "参考图只能选择其一:URL/Base64 或 本地上传", + "referenceConflict": "参考图只能选择其一:URL/Base64 列表 或 本地上传", + "referenceLimit": "参考图最多支持 7 张", + "noReferenceSelected": "未选择参考图", "downloadFailed": "下载失败,请检查视频链接是否可访问", "sec6": "6 秒", "sec10": "10 秒", diff --git a/app/api/v1/function/video.py b/app/api/v1/function/video.py index 2706ced98..ccd4cd36c 100644 --- a/app/api/v1/function/video.py +++ b/app/api/v1/function/video.py @@ -49,7 +49,7 @@ async def _new_session( video_length: int, resolution_name: str, preset: str, - image_url: Optional[str], + image_urls: Optional[List[str]], reasoning_effort: Optional[str], ) -> str: task_id = uuid.uuid4().hex @@ -62,7 +62,7 @@ async def _new_session( "video_length": video_length, "resolution_name": resolution_name, "preset": preset, - "image_url": image_url, + "image_urls": image_urls or [], "reasoning_effort": reasoning_effort, "created_at": now, } @@ -123,13 +123,23 @@ def _validate_image_url(image_url: str) -> None: ) +def _normalize_image_urls(values: Optional[List[str]]) -> List[str]: + normalized: List[str] = [] + if isinstance(values, list): + for item in values: + value = (item or "").strip() + if value: + normalized.append(value) + return normalized + + class VideoStartRequest(BaseModel): prompt: str aspect_ratio: Optional[str] = "3:2" video_length: Optional[int] = 6 resolution_name: Optional[str] = "480p" preset: Optional[str] = "normal" - image_url: Optional[str] = None + image_urls: Optional[List[str]] = None reasoning_effort: Optional[str] = None @@ -166,8 +176,12 @@ async def function_video_start(data: VideoStartRequest): detail="preset must be one of ['fun','normal','spicy','custom']", ) - image_url = (data.image_url or "").strip() or None - if image_url: + image_urls = 
_normalize_image_urls(data.image_urls) + if len(image_urls) > 7: + raise HTTPException( + status_code=400, detail="image_urls supports at most 7 references" + ) + for image_url in image_urls: _validate_image_url(image_url) reasoning_effort = (data.reasoning_effort or "").strip() or None @@ -185,7 +199,7 @@ async def function_video_start(data: VideoStartRequest): video_length, resolution_name, preset, - image_url, + image_urls, reasoning_effort, ) return {"task_id": task_id, "aspect_ratio": aspect_ratio} @@ -202,7 +216,11 @@ async def function_video_sse(request: Request, task_id: str = Query("")): video_length = int(session.get("video_length") or 6) resolution_name = str(session.get("resolution_name") or "480p") preset = str(session.get("preset") or "normal") - image_url = session.get("image_url") + image_urls = [ + str(item).strip() + for item in (session.get("image_urls") or []) + if str(item).strip() + ] reasoning_effort = session.get("reasoning_effort") async def event_stream(): @@ -218,14 +236,16 @@ async def event_stream(): yield "data: [DONE]\n\n" return - if image_url: + if image_urls: + content: List[Dict[str, Any]] = [{"type": "text", "text": prompt}] + for image_url in image_urls: + content.append( + {"type": "image_url", "image_url": {"url": image_url}} + ) messages: List[Dict[str, Any]] = [ { "role": "user", - "content": [ - {"type": "text", "text": prompt}, - {"type": "image_url", "image_url": {"url": image_url}}, - ], + "content": content, } ] else: diff --git a/app/api/v1/video.py b/app/api/v1/video.py index 650655589..61ae50b2f 100644 --- a/app/api/v1/video.py +++ b/app/api/v1/video.py @@ -46,8 +46,13 @@ class VideoCreateRequest(BaseModel): size: Optional[str] = Field("1792x1024", description="Output size") seconds: Optional[int] = Field(6, description="Video length in seconds") quality: Optional[str] = Field("standard", description="Quality: standard/high") - image_reference: Optional[Any] = Field(None, description="Structured image reference") - 
input_reference: Optional[Any] = Field(None, description="Multipart input reference file") + image_reference: Optional[Any] = Field( + None, + description="Image references using chat/completions content-block array format: [{type:'image_url', image_url:{url:'...'}}] or an array of plain URL strings", + ) + input_reference: Optional[Any] = Field( + None, description="Multipart input reference file" + ) class VideoExtendDirectRequest(BaseModel): @@ -57,7 +62,8 @@ class VideoExtendDirectRequest(BaseModel): prompt: str = Field(..., description="Prompt text mapped to message/originalPrompt") reference_id: str = Field( - ..., description="Reference id mapped to extendPostId/originalPostId/parentPostId" + ..., + description="Reference id mapped to extendPostId/originalPostId/parentPostId", ) start_time: float = Field(..., description="Mapped to videoExtensionStartTime") ratio: str = Field("2:3", description="Mapped to aspectRatio") @@ -72,7 +78,9 @@ def _raise_validation_error(exc: ValidationError) -> None: loc = first.get("loc", []) msg = first.get("msg", "Invalid request") code = first.get("type", "invalid_value") - param_parts = [str(x) for x in loc if not (isinstance(x, int) or str(x).isdigit())] + param_parts = [ + str(x) for x in loc if not (isinstance(x, int) or str(x).isdigit()) + ] param = ".".join(param_parts) if param_parts else None raise ValidationException(message=msg, param=param, code=code) raise ValidationException(message="Invalid request", code="invalid_value") @@ -165,58 +173,98 @@ def _validate_reference_value(value: str, param: str) -> str: ) -def _parse_image_reference(value: Any) -> Optional[str]: +def _parse_image_reference_item(value: Any, idx: int) -> str: + """Parse a single image reference item inside an array.""" + param_prefix = f"image_reference[{idx}]" if idx is not None else "image_reference" + + if isinstance(value, str): + stripped = value.strip() + if not stripped: + raise ValidationException( + message=f"{param_prefix} cannot be 
empty", + param=param_prefix, + code="invalid_reference", + ) + return _validate_reference_value(stripped, param_prefix) + + if isinstance(value, dict): + block_type = value.get("type") + if block_type != "image_url": + raise ValidationException( + message=f'{param_prefix} must have type="image_url"', + param=f"{param_prefix}.type", + code="invalid_reference", + ) + inner = value.get("image_url") + if not isinstance(inner, dict): + raise ValidationException( + message=f"{param_prefix}.image_url must be an object with a url field", + param=f"{param_prefix}.image_url", + code="invalid_reference", + ) + url = inner.get("url", "") + if not isinstance(url, str) or not url.strip(): + raise ValidationException( + message=f"{param_prefix}.image_url.url cannot be empty", + param=f"{param_prefix}.image_url.url", + code="invalid_reference", + ) + return _validate_reference_value(url.strip(), f"{param_prefix}.image_url.url") + + raise ValidationException( + message=( + f"{param_prefix} must be a URL string or " + f'{{"type": "image_url", "image_url": {{"url": "..."}}}}' + ), + param=param_prefix, + code="invalid_reference", + ) + + +def _parse_image_references(value: Any) -> List[str]: + """Parse image_reference into a list of validated URL strings. + + Uses the same content-block format as chat/completions. + Accepts: + - None / "" -> [] + - ["url", {"type": "image_url", ...}, ...] -> [url, ...] 
+ - JSON string of an array (for multipart/form-data) + """ if value is None or value == "": - return None + return [] if isinstance(value, str): stripped = value.strip() if not stripped: - return None - if stripped[0] in {"{", "["}: + return [] + if stripped[0] == "[": try: value = orjson.loads(stripped) except orjson.JSONDecodeError: - # allow plain url/data-uri in multipart text field as a practical fallback - return _validate_reference_value(stripped, "image_reference") + raise ValidationException( + message="image_reference must be a JSON array string", + param="image_reference", + code="invalid_reference", + ) else: - return _validate_reference_value(stripped, "image_reference") - - if not isinstance(value, dict): - raise ValidationException( - message=( - "image_reference must be an object with exactly one of " - "`image_url` or `file_id`" - ), - param="image_reference", - code="invalid_reference", - ) - - image_url = value.get("image_url") - file_id = value.get("file_id") - image_url = image_url.strip() if isinstance(image_url, str) else "" - file_id = file_id.strip() if isinstance(file_id, str) else "" - - has_image_url = bool(image_url) - has_file_id = bool(file_id) - if has_image_url == has_file_id: - raise ValidationException( - message="image_reference requires exactly one of image_url or file_id", - param="image_reference", - code="invalid_reference", - ) + raise ValidationException( + message="image_reference must be an array", + param="image_reference", + code="invalid_reference", + ) - if has_file_id: - raise ValidationException( - message=( - "image_reference.file_id is not supported in current reverse pipeline; " - "please use image_reference.image_url or multipart input_reference" - ), - param="image_reference.file_id", - code="unsupported_reference", - ) + if isinstance(value, list): + if not value: + return [] + return [ + _parse_image_reference_item(item, idx=i) for i, item in enumerate(value) + ] - return _validate_reference_value(image_url, 
"image_reference.image_url") + raise ValidationException( + message="image_reference must be an array", + param="image_reference", + code="invalid_reference", + ) async def _upload_to_data_uri(file: UploadFile, param: str) -> str: @@ -234,9 +282,8 @@ async def _upload_to_data_uri(file: UploadFile, param: str) -> str: async def _build_references_for_json(payload: BaseModel) -> List[str]: references: List[str] = [] - parsed_image_ref = _parse_image_reference(getattr(payload, "image_reference", None)) - if parsed_image_ref: - references.append(parsed_image_ref) + parsed_refs = _parse_image_references(getattr(payload, "image_reference", None)) + references.extend(parsed_refs) if getattr(payload, "input_reference", None) not in (None, ""): raise ValidationException( message="input_reference must be uploaded as multipart/form-data file", @@ -282,9 +329,8 @@ async def _build_payload_and_references_for_form( code="invalid_reference", ) - parsed_image_ref = _parse_image_reference(payload.image_reference) - if parsed_image_ref: - references.append(parsed_image_ref) + parsed_refs = _parse_image_references(payload.image_reference) + references.extend(parsed_refs) return payload, references @@ -300,7 +346,7 @@ def _multipart_create_schema(default_seconds: int) -> Dict[str, Any]: "quality": {"type": "string", "default": "standard"}, "image_reference": { "type": "string", - "description": "JSON string for image_reference object", + "description": "JSON string for image_reference array", }, "input_reference": {"type": "string", "format": "binary"}, }, @@ -434,7 +480,9 @@ async def create_video(request: Request): except ValidationError as exc: _raise_validation_error(exc) references = await _build_references_for_json(payload) - return await _create_video_from_payload(payload, references, require_extension=False) + return await _create_video_from_payload( + payload, references, require_extension=False + ) form = await request.form() payload, references = await 
_build_payload_and_references_for_form( @@ -447,7 +495,9 @@ async def create_video(request: Request): image_reference=form.get("image_reference"), input_reference=form.get("input_reference"), ) - return await _create_video_from_payload(payload, references, require_extension=False) + return await _create_video_from_payload( + payload, references, require_extension=False + ) @router.post( diff --git a/app/services/grok/services/video.py b/app/services/grok/services/video.py index e43d27bd8..0fe28b45c 100644 --- a/app/services/grok/services/video.py +++ b/app/services/grok/services/video.py @@ -24,7 +24,11 @@ from app.core.logger import logger from app.services.grok.services.model import ModelService from app.services.grok.utils.download import DownloadService -from app.services.grok.utils.process import _is_http2_error, _normalize_line, _with_idle_timeout +from app.services.grok.utils.process import ( + _is_http2_error, + _normalize_line, + _with_idle_timeout, +) from app.services.grok.utils.retry import rate_limited from app.services.grok.utils.stream import wrap_stream_with_usage from app.services.reverse.app_chat import AppChatReverse @@ -39,6 +43,7 @@ _VIDEO_SEM_VALUE = 0 _APP_CHAT_MODEL = "grok-3" _POST_ID_URL_PATTERN = r"/generated/([0-9a-fA-F-]{32,36})/" +_REFERENCE_PLACEHOLDER_RE = re.compile(r"@(?:(?:图|image|img)\s*(\d+))", re.IGNORECASE) @dataclass(frozen=True) @@ -68,6 +73,52 @@ def _pick_str(value: Any) -> str: return "" +def _extract_last_user_prompt_and_images( + messages: List[Dict[str, Any]], +) -> Tuple[str, List[str]]: + """Use only the last user turn so placeholder indices map to that turn's images.""" + for msg in reversed(messages or []): + role = msg.get("role") or "user" + if role != "user": + continue + + content = msg.get("content", "") + if isinstance(content, str): + return content.strip(), [] + if isinstance(content, dict): + content = [content] + if not isinstance(content, list): + return "", [] + + prompt_parts: List[str] = [] + 
image_urls: List[str] = [] + for item in content: + if not isinstance(item, dict): + continue + + item_type = item.get("type") + if item_type == "text": + text = item.get("text", "") + if isinstance(text, str) and text.strip(): + prompt_parts.append(text.strip()) + elif item_type == "image_url": + image_data = item.get("image_url", {}) + url = "" + if isinstance(image_data, dict): + url = image_data.get("url", "") + elif isinstance(image_data, str): + url = image_data + if isinstance(url, str) and url.strip(): + image_urls.append(url.strip()) + + prompt = "\n".join(prompt_parts).strip() + if not prompt and image_urls: + prompt = "Refer to the following content:" + return prompt, image_urls + + return "", [] + + def _extract_post_id_from_video_url(video_url: str) -> Optional[str]: if not isinstance(video_url, str) or not video_url: return None @@ -106,7 +157,9 @@ async def _create_public_video_link(token: str, video_url: str) -> str: async with _new_session() as session: response = await MediaPostLinkReverse.request(session, token, video_id) payload = response.json() if response is not None else {} - share_link = _pick_str(payload.get("shareLink")) if isinstance(payload, dict) else "" + share_link = ( + _pick_str(payload.get("shareLink")) if isinstance(payload, dict) else "" + ) if share_link: if share_link.endswith(".mp4"): logger.info(f"Video public link created: {share_link}") @@ -230,14 +283,20 @@ def _build_round_config( prompt: str, aspect_ratio: str, resolution_name: str, + image_references: Optional[List[str]] = None, ) -> Dict[str, Any]: if not plan.is_extension: - return _build_base_config( + config = _build_base_config( seed_post_id, aspect_ratio, resolution_name, plan.video_length, ) + if image_references: + video_config = config["modelMap"]["videoGenModelConfig"] + video_config["imageReferences"] = image_references + video_config["isReferenceToVideo"] = True + return config if not original_post_id: raise UpstreamException( @@ -305,7 +364,9 @@ def 
_extract_post_id_candidates(resp: Dict[str, Any]) -> List[Tuple[int, str]]: return candidates -def _apply_post_id_candidates(result: VideoRoundResult, candidates: List[Tuple[int, str]]): +def _apply_post_id_candidates( + result: VideoRoundResult, candidates: List[Tuple[int, str]] +): for rank, value in candidates: if rank < result.post_id_rank: result.post_id_rank = rank @@ -371,7 +432,9 @@ async def _iter_round_events( rid = _pick_str(model_resp.get("responseId")) if rid: result.response_id = rid - _append_unique_errors(result.stream_errors, model_resp.get("streamErrors")) + _append_unique_errors( + result.stream_errors, model_resp.get("streamErrors") + ) _apply_post_id_candidates(result, _extract_post_id_candidates(resp)) @@ -447,7 +510,9 @@ async def _collect_round_result( source: str, ) -> VideoRoundResult: result = VideoRoundResult() - async for event_type, payload in _iter_round_events(response, model=model, source=source): + async for event_type, payload in _iter_round_events( + response, model=model, source=source + ): if event_type == "done": result = payload return result @@ -478,7 +543,9 @@ def _ensure_round_result( final_round: bool, ): if not result.post_id: - err_type = "moderated_or_stream_errors" if result.stream_errors else "missing_post_id" + err_type = ( + "moderated_or_stream_errors" if result.stream_errors else "missing_post_id" + ) raise UpstreamException( message=f"Video round {round_index}/{total_rounds} missing post_id", status_code=502, @@ -553,6 +620,7 @@ async def _request_round_stream( token: str, message: str, model_config_override: Dict[str, Any], + file_attachments: Optional[List[str]] = None, ) -> AsyncGenerator[bytes, None]: async def _stream(): session = _new_session() @@ -563,6 +631,7 @@ async def _stream(): token, message=message, model=_APP_CHAT_MODEL, + file_attachments=file_attachments, tool_overrides={"videoGen": True}, model_config_override=model_config_override, ) @@ -650,7 +719,9 @@ def ensure_role(self) -> List[str]: 
self.role_sent = True return [self._sse(role="assistant")] - def emit_progress(self, *, round_index: int, total_rounds: int, progress: Any) -> List[str]: + def emit_progress( + self, *, round_index: int, total_rounds: int, progress: Any + ) -> List[str]: if not self.show_think: return [] @@ -661,7 +732,9 @@ def emit_progress(self, *, round_index: int, total_rounds: int, progress: Any) - progress_text = _format_progress(progress) chunks.append( - self._sse(f"[round={round_index}/{total_rounds}] progress={progress_text}%\n") + self._sse( + f"[round={round_index}/{total_rounds}] progress={progress_text}%\n" + ) ) return chunks @@ -763,30 +836,66 @@ async def generate( model_config_override=model_config_override, ) - async def generate_from_image( + async def generate_from_images( self, token: str, prompt: str, - image_url: str, + image_urls: list[str], + asset_ids: list[str], aspect_ratio: str = "3:2", video_length: int = 6, resolution: str = "480p", preset: str = "normal", ) -> AsyncGenerator[bytes, None]: - """Single-round image-to-video generation stream.""" - post_id = await self.create_image_post(token, image_url) - model_config_override = _build_base_config( - post_id, - aspect_ratio, - resolution, - video_length, + """Generate video from one or more reference images.""" + if not image_urls: + raise ValidationException("At least one reference image is required") + if len(image_urls) != len(asset_ids): + raise ValidationException("Reference image metadata mismatch") + logger.info( + f"Image to video: prompt='{prompt[:50]}...', images={len(image_urls)}" ) + post_id = await self.create_post(token, prompt) + mode_map = { + "fun": "--mode=extremely-crazy", + "normal": "--mode=normal", + "spicy": "--mode=extremely-spicy-or-crazy", + } + mode_flag = mode_map.get(preset, "--mode=custom") + message = f"{prompt} {mode_flag}" + model_config_override = { + "modelMap": { + "videoGenModelConfig": { + "aspectRatio": aspect_ratio, + "imageReferences": image_urls, + 
"isReferenceToVideo": True, + "parentPostId": post_id, + "resolutionName": resolution, + "videoLength": video_length, + } + } + } return await _request_round_stream( token=token, - message=_build_message(prompt, preset), + message=message, model_config_override=model_config_override, + file_attachments=asset_ids, ) + @staticmethod + def _replace_reference_placeholders(prompt: str, asset_ids: list[str]) -> str: + """Replace @图N / @imageN placeholders with uploaded asset ids.""" + + def _replace(match: re.Match[str]) -> str: + index = int(match.group(1)) - 1 + if index < 0 or index >= len(asset_ids): + raise ValidationException( + f"Reference placeholder {match.group(0)} has no matching uploaded image" + ) + return f"@{asset_ids[index]}" + + return _REFERENCE_PLACEHOLDER_RE.sub(_replace, prompt) + @staticmethod async def completions( model: str, @@ -807,313 +916,389 @@ async def completions( else: show_think = reasoning_effort != "none" - from app.services.grok.services.chat import MessageExtractor from app.services.grok.utils.upload import UploadService - prompt, _, image_attachments = MessageExtractor.extract(messages) + prompt, image_attachments = _extract_last_user_prompt_and_images(messages) - pool_candidates = ModelService.pool_candidates_for_model(model) - token_info = token_mgr.get_token_for_video( - resolution=resolution, - video_length=video_length, - pool_candidates=pool_candidates, - ) + max_token_retries = max(1, int(get_config("retry.max_retry") or 1)) + last_error: Exception | None = None - if not token_info: - raise AppException( - message="No available tokens. 
Please try again later.", - error_type=ErrorType.RATE_LIMIT.value, - code="rate_limit_exceeded", - status_code=429, + for attempt in range(max_token_retries): + pool_candidates = ModelService.pool_candidates_for_model(model) + token_info = token_mgr.get_token_for_video( + resolution=resolution, + video_length=video_length, + pool_candidates=pool_candidates, ) - token = token_info.token - if token.startswith("sso="): - token = token[4:] + if not token_info: + if last_error: + raise last_error + raise AppException( + message="No available tokens. Please try again later.", + error_type=ErrorType.RATE_LIMIT.value, + code="rate_limit_exceeded", + status_code=429, + ) + + token = token_info.token + if token.startswith("sso="): + token = token[4:] - pool_name = token_mgr.get_pool_name_for_token(token) or BASIC_POOL_NAME - is_super_pool = pool_name != BASIC_POOL_NAME + pool_name = token_mgr.get_pool_name_for_token(token) or BASIC_POOL_NAME + is_super_pool = pool_name != BASIC_POOL_NAME - requested_resolution = resolution - should_upscale = requested_resolution == "720p" and pool_name == BASIC_POOL_NAME - generation_resolution = "480p" if should_upscale else requested_resolution - upscale_timing = _resolve_upscale_timing() if should_upscale else "complete" + requested_resolution = resolution + should_upscale = ( + requested_resolution == "720p" and pool_name == BASIC_POOL_NAME + ) + generation_resolution = "480p" if should_upscale else requested_resolution + upscale_timing = _resolve_upscale_timing() if should_upscale else "complete" - target_length = int(video_length or 6) - round_plan = _build_round_plan(target_length, is_super=is_super_pool) - total_rounds = len(round_plan) + target_length = int(video_length or 6) + round_plan = _build_round_plan(target_length, is_super=is_super_pool) - service = VideoService() - message = _build_message(prompt, preset) + prompt_text = prompt + image_urls: List[str] = [] + asset_ids: List[str] = [] - image_url = None - if 
image_attachments: - upload_service = UploadService() try: - if len(image_attachments) > 1: - logger.info( - "Video generation supports a single reference image; using the first one." + if image_attachments: + if len(image_attachments) > 7: + raise ValidationException( + "Video generation supports at most 7 reference images" + ) + upload_service = UploadService() + try: + for attach_data in image_attachments: + asset_id, file_uri = await upload_service.upload_file( + attach_data, token + ) + asset_ids.append(asset_id) + image_urls.append(f"https://assets.grok.com/{file_uri}") + prompt_text = VideoService._replace_reference_placeholders( + prompt_text, asset_ids + ) + logger.info( + f"Images uploaded for video: count={len(image_urls)}" + ) + finally: + await upload_service.close() + elif _REFERENCE_PLACEHOLDER_RE.search(prompt_text): + raise ValidationException( + "Reference placeholders require uploaded images" ) - attach_data = image_attachments[0] - _, file_uri = await upload_service.upload_file(attach_data, token) - image_url = f"https://assets.grok.com/{file_uri}" - logger.info(f"Image uploaded for video: {image_url}") - finally: - await upload_service.close() - - if image_url: - seed_post_id = await service.create_image_post(token, image_url) - else: - seed_post_id = await service.create_post(token, prompt) - model_info = ModelService.get(model) - effort = ( - EffortType.HIGH - if (model_info and model_info.cost.value == "high") - else EffortType.LOW - ) + service = VideoService() + message = _build_message(prompt_text, preset) + seed_post_id = await service.create_post(token, prompt_text) - async def _run_round_collect( - plan: VideoRoundPlan, - *, - seed_id: str, - last_id: str, - original_id: Optional[str], - source: str, - ) -> VideoRoundResult: - config_override = _build_round_config( - plan, - seed_post_id=seed_id, - last_post_id=last_id, - original_post_id=original_id, - prompt=prompt, - aspect_ratio=aspect_ratio, - 
resolution_name=generation_resolution, - ) - response = await _request_round_stream( - token=token, - message=message, - model_config_override=config_override, - ) - return await _collect_round_result(response, model=model, source=source) - - async def _stream_chain() -> AsyncGenerator[str, None]: - writer = _VideoChainSSEWriter(model, show_think) - seed_id = seed_post_id - last_id = seed_id - original_id: Optional[str] = seed_id - final_result: Optional[VideoRoundResult] = None + model_info = ModelService.get(model) + effort = ( + EffortType.HIGH + if (model_info and model_info.cost.value == "high") + else EffortType.LOW + ) - try: - for plan in round_plan: + async def _run_round_collect( + plan: VideoRoundPlan, + *, + seed_id: str, + last_id: str, + original_id: Optional[str], + source: str, + ) -> VideoRoundResult: config_override = _build_round_config( plan, seed_post_id=seed_id, last_post_id=last_id, original_post_id=original_id, - prompt=prompt, + prompt=prompt_text, aspect_ratio=aspect_ratio, resolution_name=generation_resolution, + image_references=image_urls if plan.round_index == 1 else None, ) response = await _request_round_stream( token=token, message=message, model_config_override=config_override, + file_attachments=asset_ids if plan.round_index == 1 else None, + ) + return await _collect_round_result( + response, model=model, source=source ) - round_result = VideoRoundResult() - async for event_type, payload in _iter_round_events( - response, - model=model, - source=f"stream-round-{plan.round_index}", - ): - if event_type == "progress": - for chunk in writer.emit_progress( + async def _stream_chain() -> AsyncGenerator[str, None]: + writer = _VideoChainSSEWriter(model, show_think) + seed_id = seed_post_id + last_id = seed_id + original_id: Optional[str] = seed_id + final_result: Optional[VideoRoundResult] = None + + try: + for plan in round_plan: + config_override = _build_round_config( + plan, + seed_post_id=seed_id, + last_post_id=last_id, + 
original_post_id=original_id, + prompt=prompt_text, + aspect_ratio=aspect_ratio, + resolution_name=generation_resolution, + image_references=image_urls + if plan.round_index == 1 + else None, + ) + response = await _request_round_stream( + token=token, + message=message, + model_config_override=config_override, + file_attachments=asset_ids + if plan.round_index == 1 + else None, + ) + + round_result = VideoRoundResult() + async for event_type, payload in _iter_round_events( + response, + model=model, + source=f"stream-round-{plan.round_index}", + ): + if event_type == "progress": + for chunk in writer.emit_progress( + round_index=plan.round_index, + total_rounds=plan.total_rounds, + progress=payload, + ): + yield chunk + elif event_type == "done": + round_result = payload + + _ensure_round_result( + round_result, round_index=plan.round_index, total_rounds=plan.total_rounds, - progress=payload, + final_round=(plan.round_index == plan.total_rounds), + ) + + if ( + should_upscale + and upscale_timing == "single" + and round_result.video_url ): + for chunk in writer.emit_note( + f"[round={plan.round_index}/{plan.total_rounds}] 正在对当前轮结果进行超分辨率\n" + ): + yield chunk + upgraded_url, upscaled = await _upscale_video_url( + token, round_result.video_url + ) + if upscaled: + round_result.video_url = upgraded_url + else: + logger.warning( + "Video upscale failed in single mode, fallback to 480p result" + ) + + if plan.round_index == 1 and round_result.post_id: + original_id = round_result.post_id + if round_result.post_id: + last_id = round_result.post_id + + if plan.round_index == plan.total_rounds: + final_result = round_result + + if final_result is None: + raise UpstreamException( + message="Video generation produced no final round", + status_code=502, + details={"type": "empty_video_stream"}, + ) + + final_video_url = final_result.video_url + if should_upscale and upscale_timing == "complete": + for chunk in writer.emit_note("正在对视频进行超分辨率\n"): yield chunk - elif event_type 
== "done": - round_result = payload - - _ensure_round_result( - round_result, - round_index=plan.round_index, - total_rounds=plan.total_rounds, - final_round=(plan.round_index == plan.total_rounds), - ) + final_video_url, upscaled = await _upscale_video_url( + token, final_video_url + ) + if not upscaled: + logger.warning( + "Video upscale failed, fallback to 480p result" + ) - if should_upscale and upscale_timing == "single" and round_result.video_url: - for chunk in writer.emit_note( - f"[round={plan.round_index}/{plan.total_rounds}] 正在对当前轮结果进行超分辨率\n" - ): + if _public_asset_enabled(): + for chunk in writer.emit_note("正在生成可公开访问链接\n"): + yield chunk + final_video_url = await _create_public_video_link( + token, final_video_url + ) + + dl_service = DownloadService() + try: + rendered = await dl_service.render_video( + final_video_url, + token, + final_result.thumbnail_url, + ) + finally: + await dl_service.close() + + for chunk in writer.emit_content(rendered): yield chunk - upgraded_url, upscaled = await _upscale_video_url( - token, round_result.video_url + for chunk in writer.finish(): + yield chunk + except asyncio.CancelledError: + logger.debug( + "Video stream chain cancelled by client", + extra={"model": model}, + ) + raise + except UpstreamException as e: + if rate_limited(e): + await token_mgr.mark_rate_limited(token) + raise + + async def _collect_chain() -> Dict[str, Any]: + seed_id = seed_post_id + last_id = seed_id + original_id: Optional[str] = seed_id + final_result: Optional[VideoRoundResult] = None + + for plan in round_plan: + round_result = await _run_round_collect( + plan, + seed_id=seed_id, + last_id=last_id, + original_id=original_id, + source=f"collect-round-{plan.round_index}", + ) + + _ensure_round_result( + round_result, + round_index=plan.round_index, + total_rounds=plan.total_rounds, + final_round=(plan.round_index == plan.total_rounds), + ) + + if ( + should_upscale + and upscale_timing == "single" + and round_result.video_url + ): + 
upgraded_url, upscaled = await _upscale_video_url( + token, round_result.video_url + ) + if upscaled: + round_result.video_url = upgraded_url + else: + logger.warning( + "Video upscale failed in single mode, fallback to 480p result" + ) + + if plan.round_index == 1 and round_result.post_id: + original_id = round_result.post_id + if round_result.post_id: + last_id = round_result.post_id + + if plan.round_index == plan.total_rounds: + final_result = round_result + + if final_result is None: + raise UpstreamException( + message="Video generation produced no final round", + status_code=502, + details={"type": "empty_video_stream"}, ) - if upscaled: - round_result.video_url = upgraded_url - else: + + final_video_url = final_result.video_url + if should_upscale and upscale_timing == "complete": + final_video_url, upscaled = await _upscale_video_url( + token, final_video_url + ) + if not upscaled: logger.warning( - "Video upscale failed in single mode, fallback to 480p result" + "Video upscale failed, fallback to 480p result" ) - if plan.round_index == 1 and round_result.post_id: - original_id = round_result.post_id - if round_result.post_id: - last_id = round_result.post_id + if _public_asset_enabled(): + final_video_url = await _create_public_video_link( + token, final_video_url + ) - if plan.round_index == plan.total_rounds: - final_result = round_result + dl_service = DownloadService() + try: + content = await dl_service.render_video( + final_video_url, + token, + final_result.thumbnail_url, + ) + finally: + await dl_service.close() + + return { + "id": final_result.response_id, + "object": "chat.completion", + "created": int(time.time()), + "model": model, + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": content, + "refusal": None, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0, + }, + } - if final_result is None: - raise UpstreamException( - message="Video 
generation produced no final round", - status_code=502, - details={"type": "empty_video_stream"}, + if is_stream: + return wrap_stream_with_usage( + _stream_chain(), token_mgr, token, model ) - final_video_url = final_result.video_url - if should_upscale and upscale_timing == "complete": - for chunk in writer.emit_note("正在对视频进行超分辨率\n"): - yield chunk - final_video_url, upscaled = await _upscale_video_url(token, final_video_url) - if not upscaled: - logger.warning("Video upscale failed, fallback to 480p result") - - if _public_asset_enabled(): - for chunk in writer.emit_note("正在生成可公开访问链接\n"): - yield chunk - final_video_url = await _create_public_video_link(token, final_video_url) + try: + result = await _collect_chain() + except UpstreamException as e: + if rate_limited(e): + await token_mgr.mark_rate_limited(token) + raise - dl_service = DownloadService() try: - rendered = await dl_service.render_video( - final_video_url, - token, - final_result.thumbnail_url, + await token_mgr.consume(token, effort) + logger.debug( + f"Video completed, recorded usage (effort={effort.value})" ) - finally: - await dl_service.close() + except Exception as e: + logger.warning(f"Failed to record video usage: {e}") - for chunk in writer.emit_content(rendered): - yield chunk - for chunk in writer.finish(): - yield chunk - except asyncio.CancelledError: - logger.debug("Video stream chain cancelled by client", extra={"model": model}) - raise + return result except UpstreamException as e: + last_error = e if rate_limited(e): await token_mgr.mark_rate_limited(token) - raise - - async def _collect_chain() -> Dict[str, Any]: - seed_id = seed_post_id - last_id = seed_id - original_id: Optional[str] = seed_id - final_result: Optional[VideoRoundResult] = None - - for plan in round_plan: - round_result = await _run_round_collect( - plan, - seed_id=seed_id, - last_id=last_id, - original_id=original_id, - source=f"collect-round-{plan.round_index}", - ) - - _ensure_round_result( - round_result, - 
round_index=plan.round_index, - total_rounds=plan.total_rounds, - final_round=(plan.round_index == plan.total_rounds), - ) - - if should_upscale and upscale_timing == "single" and round_result.video_url: - upgraded_url, upscaled = await _upscale_video_url( - token, round_result.video_url + logger.warning( + f"Token {token[:10]}... rate limited (429), " + f"trying next token (attempt {attempt + 1}/{max_token_retries})" ) - if upscaled: - round_result.video_url = upgraded_url - else: - logger.warning( - "Video upscale failed in single mode, fallback to 480p result" - ) - - if plan.round_index == 1 and round_result.post_id: - original_id = round_result.post_id - if round_result.post_id: - last_id = round_result.post_id - - if plan.round_index == plan.total_rounds: - final_result = round_result - - if final_result is None: - raise UpstreamException( - message="Video generation produced no final round", - status_code=502, - details={"type": "empty_video_stream"}, - ) - - final_video_url = final_result.video_url - if should_upscale and upscale_timing == "complete": - final_video_url, upscaled = await _upscale_video_url(token, final_video_url) - if not upscaled: - logger.warning("Video upscale failed, fallback to 480p result") - - if _public_asset_enabled(): - final_video_url = await _create_public_video_link(token, final_video_url) - - dl_service = DownloadService() - try: - content = await dl_service.render_video( - final_video_url, - token, - final_result.thumbnail_url, - ) - finally: - await dl_service.close() - - return { - "id": final_result.response_id, - "object": "chat.completion", - "created": int(time.time()), - "model": model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": content, - "refusal": None, - }, - "finish_reason": "stop", - } - ], - "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, - } - - if is_stream: - return wrap_stream_with_usage(_stream_chain(), token_mgr, token, model) - - try: - 
result = await _collect_chain() - except UpstreamException as e: - if rate_limited(e): - await token_mgr.mark_rate_limited(token) - raise - - try: - await token_mgr.consume(token, effort) - logger.debug( - f"Video completed, recorded usage (effort={effort.value})" - ) - except Exception as e: - logger.warning(f"Failed to record video usage: {e}") + continue + raise - return result + if last_error: + raise last_error + raise AppException( + message="No available tokens. Please try again later.", + error_type=ErrorType.RATE_LIMIT.value, + code="rate_limit_exceeded", + status_code=429, + ) class VideoStreamProcessor: @@ -1165,7 +1350,9 @@ async def close(self): await self._dl_service.close() self._dl_service = None - async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, None]: + async def process( + self, response: AsyncIterable[bytes] + ) -> AsyncGenerator[str, None]: result = VideoRoundResult() try: async for event_type, payload in _iter_round_events( @@ -1194,14 +1381,18 @@ async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, N if self.upscale_on_finish: for chunk in self.writer.emit_note("正在对视频进行超分辨率\n"): yield chunk - final_video_url, upscaled = await _upscale_video_url(self.token, final_video_url) + final_video_url, upscaled = await _upscale_video_url( + self.token, final_video_url + ) if not upscaled: logger.warning("Video upscale failed, fallback to 480p result") if self.enable_public_asset: for chunk in self.writer.emit_note("正在生成可公开访问链接\n"): yield chunk - final_video_url = await _create_public_video_link(self.token, final_video_url) + final_video_url = await _create_public_video_link( + self.token, final_video_url + ) rendered = await self._get_dl().render_video( final_video_url, @@ -1213,7 +1404,9 @@ async def process(self, response: AsyncIterable[bytes]) -> AsyncGenerator[str, N for chunk in self.writer.finish(): yield chunk except asyncio.CancelledError: - logger.debug("Video stream cancelled by client", 
extra={"model": self.model}) + logger.debug( + "Video stream cancelled by client", extra={"model": self.model} + ) raise finally: await self.close() @@ -1265,12 +1458,16 @@ async def process(self, response: AsyncIterable[bytes]) -> Dict[str, Any]: final_video_url = result.video_url if self.upscale_on_finish: - final_video_url, upscaled = await _upscale_video_url(self.token, final_video_url) + final_video_url, upscaled = await _upscale_video_url( + self.token, final_video_url + ) if not upscaled: logger.warning("Video upscale failed, fallback to 480p result") if self.enable_public_asset: - final_video_url = await _create_public_video_link(self.token, final_video_url) + final_video_url = await _create_public_video_link( + self.token, final_video_url + ) content = await self._get_dl().render_video( final_video_url, @@ -1294,7 +1491,11 @@ async def process(self, response: AsyncIterable[bytes]) -> Dict[str, Any]: "finish_reason": "stop", } ], - "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0, + }, } finally: await self.close() diff --git a/docs/README.en.md b/docs/README.en.md index ff3b89815..8d240a7ac 100644 --- a/docs/README.en.md +++ b/docs/README.en.md @@ -196,7 +196,8 @@ curl http://localhost:8000/v1/chat/completions \ - `grok-imagine-1.0-fast` streaming output in `/chat/completions` only returns the final image, hiding intermediate preview images. - `grok-imagine-1.0-fast` streaming URL output will retain the original image filename (without appending `-final`). - `grok-imagine-1.0-edit` requires an image; if multiple are provided, the **last 3** images and last text are used. -- `grok-imagine-1.0-video` supports text-to-video and image-to-video via `image_url` (**only the first image is used**). 
+- `grok-imagine-1.0-video` supports text-to-video and video generation from multiple reference images: pass up to `7` `image_url` blocks and refer to them in the prompt with placeholders such as `@图1`, `@图2`; the server replaces each placeholder with the corresponding `assetId`. +- `@图N` refers to the N-th `image_url` block in request order; referencing an image index that was not provided returns an error. - Any other parameters will be discarded and ignored.
@@ -361,7 +362,7 @@ curl http://localhost:8000/v1/videos \ | `size` | string | Frame size (mapped to aspect_ratio) | `1280x720`, `720x1280`, `1792x1024`, `1024x1792`, `1024x1024` | | `seconds` | integer | Target duration (seconds) | `6` ~ `30` | | `quality` | string | Video quality (mapped to resolution) | `standard`, `high` | -| `image_reference` | object/string | Reference image (optional) | `{"image_url":"https://..."}` or Data URI | +| `image_reference` | array | Reference images (optional, up to 7) | OpenAI-compatible content block array (`[{"type":"image_url"...}]`) or an array of URL strings; a single image must still be passed as a one-item array | | `input_reference` | file | multipart reference image (optional) | `png`, `jpg`, `webp` | **Notes**: @@ -369,7 +370,7 @@ curl http://localhost:8000/v1/videos \ - Server-side chain extension now supports 6~30 seconds automatically, so **`/v1/video/extend` is not required**. - `quality=standard` maps to `480p`; `quality=high` maps to `720p`. - For basic-pool requests at `720p`, generation falls back to `480p` first, then upscales according to `video.upscale_timing`. -- If both `image_reference` and `input_reference` are provided, references are processed in order; the video pipeline uses the first image only. +- `image_reference` accepts only an array and supports up to 7 images; a single image must still be passed as a one-item array. If both `image_reference` and `input_reference` are provided, all references are merged in order, and the prompt can refer to them with placeholders such as `@图1`, `@图2`.
diff --git a/readme.md b/readme.md index 75cae6ba5..b75c2fee9 100644 --- a/readme.md +++ b/readme.md @@ -197,7 +197,8 @@ curl http://localhost:8000/v1/chat/completions \ - `grok-imagine-1.0-fast` 流式 URL 出图会保持原始图片名(不追加 `-final` 后缀)。 - 当图片疑似被审查拦截导致无最终图时,若开启 `image.blocked_parallel_enabled`,服务端会按 `image.blocked_parallel_attempts` 自动并行补偿生成,并优先使用不同 token;若仍无满足 `image.final_min_bytes` 的最终图则返回失败。 - `grok-imagine-1.0-edit` 必须提供图片,多图默认取**最后 3 张**与最后一个文本。 -- `grok-imagine-1.0-video` 支持文生视频与图生视频(通过 `image_url` 传参考图,**仅取第 1 张**)。 +- `grok-imagine-1.0-video` 支持文生视频与多图参考视频:可通过多个 `image_url` 传最多 `7` 张参考图,并在文本中使用 `@图1`、`@图2` 这类占位符;服务端会自动替换为对应 `assetId`。 +- `@图N` 与 `image_url` 的顺序一一对应;若引用了不存在的图片序号,会直接报错。 - 除上述外的其他参数将自动丢弃并忽略。
@@ -362,7 +363,7 @@ curl http://localhost:8000/v1/videos \ | `size` | string | 画面比例(会映射到 aspect_ratio) | `1280x720`, `720x1280`, `1792x1024`, `1024x1792`, `1024x1024` | | `seconds` | integer | 目标时长(秒) | `6` ~ `30` | | `quality` | string | 视频质量(映射到 resolution) | `standard`, `high` | -| `image_reference` | object/string | 参考图(可选) | `{"image_url":"https://..."}` 或 Data URI | +| `image_reference` | array | 参考图(可选) | 兼容 OpenAI content block 数组格式 (`[{"type":"image_url"...}]`) 或纯 URL 字符串数组;单图也请传单元素数组 | | `input_reference` | file | multipart 参考图(可选) | `png`, `jpg`, `webp` | **注意事项**: @@ -370,7 +371,7 @@ curl http://localhost:8000/v1/videos \ - 服务端已支持 6~30 秒自动链式扩展,**无需使用 `/v1/video/extend`**。 - `quality=standard` 对应 `480p`;`quality=high` 对应 `720p`。 - 基础号池请求 `720p` 时会先产出 `480p` 再按 `video.upscale_timing` 执行超分。 -- `image_reference` 与 `input_reference` 同时传入时,会按顺序作为参考图输入;视频链路只使用第 1 张。 +- `image_reference` 统一使用数组格式,最多可传 7 张参考图;单图场景也请传单元素数组。`input_reference` 主要以表单上传参考图;若两者同时传入,会按顺序作为参考图合并输入;可在提示词中使用 `@图1`、`@图2`。