diff --git a/omlx/admin/i18n/en.json b/omlx/admin/i18n/en.json index 4308eb41..28494276 100644 --- a/omlx/admin/i18n/en.json +++ b/omlx/admin/i18n/en.json @@ -282,6 +282,7 @@ "settings.models.badge.rep_penalty": "rep_penalty:", "settings.models.badge.tool_result_tokens": "tool_result_tokens:", "settings.models.badge.force_sampling": "force_sampling", + "settings.models.badge.speculative": "speculative", "modal.model_settings.section_label": "Model Settings", "modal.model_settings.model_type": "Model Type", @@ -304,6 +305,11 @@ "modal.model_settings.limit_tool_placeholder": "e.g. 2000", "modal.model_settings.force_sampling": "Force Sampling", "modal.model_settings.force_sampling_hint": "Override request sampling parameters with configured values", + "modal.model_settings.speculative_decoding": "Speculative Decoding", + "modal.model_settings.speculative_decoding_hint": "Speed up decoding with a smaller draft model", + "modal.model_settings.draft_model": "Draft Model", + "modal.model_settings.select_draft_model": "Select draft model", + "modal.model_settings.num_draft_tokens": "Draft Tokens", "modal.model_settings.chat_template_kwargs": "Chat Template Kwargs", "modal.model_settings.chat_template_kwargs_hint": "Parameters passed to chat template", "modal.model_settings.add_kwarg": "+ Add", diff --git a/omlx/admin/i18n/ja.json b/omlx/admin/i18n/ja.json index 33ea5dd4..4d115369 100644 --- a/omlx/admin/i18n/ja.json +++ b/omlx/admin/i18n/ja.json @@ -282,6 +282,7 @@ "settings.models.badge.rep_penalty": "rep_penalty:", "settings.models.badge.tool_result_tokens": "tool_result_tokens:", "settings.models.badge.force_sampling": "force_sampling", + "settings.models.badge.speculative": "speculative", "modal.model_settings.section_label": "モデル設定", "modal.model_settings.model_type": "モデルタイプ", @@ -304,6 +305,11 @@ "modal.model_settings.limit_tool_placeholder": "例: 2000", "modal.model_settings.force_sampling": "強制サンプリング", "modal.model_settings.force_sampling_hint": "設定値でリクエストのサンプリングパラメータを上書きします", + "modal.model_settings.speculative_decoding": "投機的デコーディング", + "modal.model_settings.speculative_decoding_hint": "小さなドラフトモデルでデコード速度を向上", + "modal.model_settings.draft_model": "ドラフトモデル", + "modal.model_settings.select_draft_model": "ドラフトモデルを選択", + "modal.model_settings.num_draft_tokens": "ドラフトトークン数", "modal.model_settings.chat_template_kwargs": "チャットテンプレート引数", "modal.model_settings.chat_template_kwargs_hint": "チャットテンプレートに渡すパラメータ", "modal.model_settings.add_kwarg": "+ 追加", diff --git a/omlx/admin/i18n/ko.json b/omlx/admin/i18n/ko.json index fad618d4..e11222f5 100644 --- a/omlx/admin/i18n/ko.json +++ b/omlx/admin/i18n/ko.json @@ -282,6 +282,7 @@ "settings.models.badge.rep_penalty": "rep_penalty:", "settings.models.badge.tool_result_tokens": "tool_result_tokens:", "settings.models.badge.force_sampling": "force_sampling", + "settings.models.badge.speculative": "speculative", "modal.model_settings.section_label": "모델 설정", "modal.model_settings.model_type": "모델 타입", @@ -304,6 +305,11 @@ "modal.model_settings.limit_tool_placeholder": "예: 2000", "modal.model_settings.force_sampling": "강제 샘플링", "modal.model_settings.force_sampling_hint": "설정된 값으로 요청의 샘플링 파라미터를 덮어씁니다", + "modal.model_settings.speculative_decoding": "추측 디코딩", + "modal.model_settings.speculative_decoding_hint": "작은 드래프트 모델로 디코딩 속도 향상", + "modal.model_settings.draft_model": "드래프트 모델", + "modal.model_settings.select_draft_model": "드래프트 모델 선택", + "modal.model_settings.num_draft_tokens": "드래프트 토큰 수", "modal.model_settings.chat_template_kwargs": "채팅 템플릿 피라미터 설정", "modal.model_settings.chat_template_kwargs_hint": "채팅 템플릿에 전달되는 파라미터", "modal.model_settings.add_kwarg": "+ 추가", diff --git a/omlx/admin/i18n/zh.json b/omlx/admin/i18n/zh.json index 53cf8dd7..ba7825ae 100644 --- a/omlx/admin/i18n/zh.json +++ b/omlx/admin/i18n/zh.json @@ -259,6 +259,7 @@ "settings.models.badge.rep_penalty": "rep_penalty:", "settings.models.badge.tool_result_tokens": "tool_result_tokens:", "settings.models.badge.force_sampling": "force_sampling", + "settings.models.badge.speculative": "speculative", "modal.model_settings.section_label": "模型设置", "modal.model_settings.model_type": "模型类型", "modal.model_settings.model_type_auto": "自动检测", @@ -280,6 +281,11 @@ "modal.model_settings.limit_tool_placeholder": "例如 2000", "modal.model_settings.force_sampling": "强制采样", "modal.model_settings.force_sampling_hint": "用配置值覆盖请求中的采样参数", + "modal.model_settings.speculative_decoding": "推测解码", + "modal.model_settings.speculative_decoding_hint": "使用小型草稿模型加速解码", + "modal.model_settings.draft_model": "草稿模型", + "modal.model_settings.select_draft_model": "选择草稿模型", + "modal.model_settings.num_draft_tokens": "草稿令牌数", "modal.model_settings.chat_template_kwargs": "聊天模板参数", "modal.model_settings.chat_template_kwargs_hint": "传递给聊天模板的参数", "modal.model_settings.add_kwarg": "+ 添加", diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py index fa20dbe1..4ad59fb1 100644 --- a/omlx/admin/routes.py +++ b/omlx/admin/routes.py @@ -68,6 +68,9 @@ class ModelSettingsRequest(BaseModel): chat_template_kwargs: Optional[Dict[str, Any]] = None forced_ct_kwargs: Optional[list[str]] = None ttl_seconds: Optional[int] = None + speculative_decoding: Optional[bool] = None + draft_model: Optional[str] = None + num_draft_tokens: Optional[int] = None is_pinned: Optional[bool] = None is_default: Optional[bool] = None @@ -1015,6 +1018,9 @@ async def list_models(is_admin: bool = Depends(require_admin)): "chat_template_kwargs": settings.chat_template_kwargs, "forced_ct_kwargs": settings.forced_ct_kwargs, "ttl_seconds": settings.ttl_seconds, + "speculative_decoding": settings.speculative_decoding, + "draft_model": settings.draft_model, + "num_draft_tokens": settings.num_draft_tokens, "is_pinned": settings.is_pinned, "is_default": settings.is_default, "display_name": settings.display_name, @@ -1170,6 +1176,12 @@ async def update_model_settings( current_settings.forced_ct_kwargs = request.forced_ct_kwargs if "ttl_seconds" in sent: current_settings.ttl_seconds = request.ttl_seconds + if "speculative_decoding" in sent: + current_settings.speculative_decoding = request.speculative_decoding or False + if "draft_model" in sent: + current_settings.draft_model = request.draft_model or None + if "num_draft_tokens" in sent: + current_settings.num_draft_tokens = request.num_draft_tokens if request.is_pinned is not None: current_settings.is_pinned = request.is_pinned # Also update the engine pool entry @@ -1184,15 +1196,21 @@ async def update_model_settings( settings_manager.set_settings(model_id, current_settings) # Warn if engine type actually changed while model is loaded + speculative_changed = ( + entry.engine is not None + and any(k in sent for k in ("speculative_decoding", "draft_model", "num_draft_tokens")) + ) requires_reload = ( - "model_type_override" in sent - and entry.engine is not None - and entry.engine_type != prev_engine_type + entry.engine is not None + and ( + ("model_type_override" in sent and entry.engine_type != prev_engine_type) + or speculative_changed + ) ) if requires_reload: logger.info( - f"Model type changed for loaded model {model_id} " - f"(now {entry.model_type}/{entry.engine_type}). " + f"Settings changed for loaded model {model_id} " + f"(engine_type={entry.engine_type}). " f"Reload required to take effect." ) @@ -1206,6 +1224,34 @@ async def update_model_settings( } +@router.get("/api/models/{model_id}/draft_candidates") +async def get_draft_candidates( + model_id: str, + is_admin: bool = Depends(require_admin), +): + """Get list of LLM models that can serve as draft models for speculative decoding.""" + engine_pool = _get_engine_pool() + if engine_pool is None: + raise HTTPException(status_code=503, detail="Server not initialized") + + entry = engine_pool.get_entry(model_id) + if entry is None: + raise HTTPException(status_code=404, detail=f"Model not found: {model_id}") + + candidates = [] + for mid in engine_pool.get_model_ids(): + if mid == model_id: + continue + e = engine_pool.get_entry(mid) + if e and e.model_type in ("llm", "vlm"): + candidates.append({ + "model_id": mid, + "estimated_size": e.estimated_size, + }) + + return {"candidates": candidates} + + @router.get("/api/models/{model_id}/generation_config") async def get_generation_config( model_id: str, diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js index 559fa28e..9f19396e 100644 --- a/omlx/admin/static/js/dashboard.js +++ b/omlx/admin/static/js/dashboard.js @@ -76,7 +76,11 @@ enableToolResultLimit: false, max_tool_result_tokens: null, ctKwargEntries: [], + speculative_decoding: false, + draft_model: '', + num_draft_tokens: 3, }, + draftModelCandidates: [], savingModelSettings: false, loadingGenDefaults: false, @@ -518,9 +522,28 @@ max_tool_result_tokens: settings.max_tool_result_tokens || null, ttl_seconds: settings.ttl_seconds ?? null, ctKwargEntries, + speculative_decoding: settings.speculative_decoding || false, + draft_model: settings.draft_model || '', + num_draft_tokens: settings.num_draft_tokens ?? 3, }; this.showModelSettingsModal = true; this.$nextTick(() => lucide.createIcons()); + + // Fetch draft model candidates for speculative decoding. + // Must load candidates BEFORE the select renders, otherwise + // Alpine resets x-model to '' when no matching option exists. + const savedDraftModel = settings.draft_model || ''; + if (model.model_type === 'llm' || model.model_type === 'vlm') { + fetch(`/admin/api/models/${encodeURIComponent(model.id)}/draft_candidates`) + .then(r => r.json()) + .then(data => { + this.draftModelCandidates = data.candidates || []; + this.modelSettings.draft_model = savedDraftModel; + }) + .catch(() => { this.draftModelCandidates = []; }); + } else { + this.draftModelCandidates = []; + } }, async saveModelSettings() { @@ -569,6 +592,9 @@ ? chatTemplateKwargs : null, forced_ct_kwargs: forcedCtKwargs.length > 0 ? forcedCtKwargs : null, + speculative_decoding: this.modelSettings.speculative_decoding, + draft_model: this.modelSettings.draft_model || null, + num_draft_tokens: this.modelSettings.num_draft_tokens || null, }; })()), }); diff --git a/omlx/admin/templates/dashboard/_modal_model_settings.html b/omlx/admin/templates/dashboard/_modal_model_settings.html index b5077e51..a7d6fc51 100644 --- a/omlx/admin/templates/dashboard/_modal_model_settings.html +++ b/omlx/admin/templates/dashboard/_modal_model_settings.html @@ -150,6 +150,43 @@

+
+
+
+ {{ t('modal.model_settings.speculative_decoding') }} +

{{ t('modal.model_settings.speculative_decoding_hint') }}

+
+ +
+ +
+ +
diff --git a/omlx/admin/templates/dashboard/_settings.html b/omlx/admin/templates/dashboard/_settings.html index 500d073f..6b1b228b 100644 --- a/omlx/admin/templates/dashboard/_settings.html +++ b/omlx/admin/templates/dashboard/_settings.html @@ -774,7 +774,7 @@

{{ t('settings.models.no

-
+