Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1126c82
hw1 load()
ArcaLunar Jan 29, 2026
302a738
hw1 iscontiguous()
ArcaLunar Jan 29, 2026
52b9d6a
hw1 view()
ArcaLunar Jan 29, 2026
ff386bf
hw1 permute()
ArcaLunar Jan 29, 2026
100fb93
hw1 slice()
ArcaLunar Jan 29, 2026
1a439b8
2.1 argmax
ArcaLunar Jan 29, 2026
8aeb26b
2.2 embedding; add openmp support
ArcaLunar Jan 29, 2026
4bb2a71
2.3 linear
ArcaLunar Jan 29, 2026
1b8576c
2.4 RMS Norm
ArcaLunar Jan 29, 2026
0663d91
2.5 RoPE
ArcaLunar Jan 29, 2026
f07019e
2.6 Self-Attn + openmp
ArcaLunar Jan 29, 2026
8435a48
2.7 SwiGLU
ArcaLunar Jan 29, 2026
139c28a
impl simple kv cache
ArcaLunar Jan 30, 2026
ae5060e
impl qwen2 model basic config functions; add config reading in Python
ArcaLunar Jan 30, 2026
273c4ef
impl weight loading
ArcaLunar Jan 30, 2026
8926d9e
impl full pipeline; fix issue: Python llaisysTensor cannot get import…
ArcaLunar Jan 30, 2026
f180950
correct until self-attn + kvcache
ArcaLunar Jan 30, 2026
82b0967
add naive attn
ArcaLunar Jan 30, 2026
100d722
pipeline ok, wrong output
ArcaLunar Jan 30, 2026
69d4758
prefill ok, decoding issue
ArcaLunar Jan 30, 2026
71403f1
fp32 first token ok, rest boom
ArcaLunar Jan 30, 2026
5f6b405
all prefilling ok
ArcaLunar Jan 30, 2026
71bc41a
use all prefilling first
ArcaLunar Jan 30, 2026
c81db06
fix size_t to int conversion on windows
ArcaLunar Jan 30, 2026
91b7581
fix decoding issue: improper offset in simple.cc (begin .... elementS…
ArcaLunar Jan 30, 2026
e199798
feat: nvidia runtime api
ArcaLunar Mar 6, 2026
bbc1979
feat+build: add cuda ops and update build
ArcaLunar Mar 7, 2026
025ceac
feat: implement cuda kernels for all ops
ArcaLunar Mar 7, 2026
571ca4c
format
ArcaLunar Mar 7, 2026
2253fd8
feat: impl kvcache with unified context
ArcaLunar Mar 7, 2026
90e3180
fix: qwen2.cc directly unrefs device-side ptr
ArcaLunar Mar 7, 2026
db5d5d2
feat: sampling operator, topk, topp, temperature
ArcaLunar Mar 10, 2026
ab611e6
feat: integrate sampling into model definition
ArcaLunar Mar 10, 2026
77ff1c5
feat: chat API and server
ArcaLunar Mar 10, 2026
d070eec
fix: chat server port issue
ArcaLunar Mar 10, 2026
896da9e
fix: logits on nvidia instead of cpu before sampling
ArcaLunar Mar 10, 2026
b553cb3
feat: add streaming output
ArcaLunar Mar 12, 2026
e624d05
feat: chatting tui
ArcaLunar Mar 15, 2026
8dd1458
update REPORT.md
ArcaLunar Mar 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 4 additions & 22 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,9 @@ IndentWidth: 4 # 缩进宽度,LLVM 默认值为 2,改
AccessModifierOffset: -4 # public/protected/private 访问控制符相对成员的偏移,与 IndentWidth 配合,LLVM 默认值为 -2
AlignOperands: AlignAfterOperator # 双目运算符的行间对齐,LLVM 默认值为 Align,改为带符号一起换行
BreakBeforeBinaryOperators: All # 在双目运算符之前换行,LLVM 默认值为 None,改为换行时总是把双目运算符放在行首,包括赋值(=)
ColumnLimit: 0 # 列宽限制,LLVM 默认值为 80,改为不限制
ColumnLimit: 80 # 列宽限制,LLVM 默认值为 80,保持默认值
AllowShortBlocksOnASingleLine: Always # 是否允许短块(单个语句的块)不换行,LLVM 默认值为 Never,改为允许
AllowShortLoopsOnASingleLine: true # 是否允许短循环不换行,LLVM 默认值为 false,改为允许
InsertBraces: true # 是否在 if/for/while/switch 等语句后插入大括号,LLVM 默认值为 false,改为允许
BreakBeforeBraces: Custom # 大括号换行配置,LLVM 默认值为 LLVM,改为自定义以使 BraceWrapping 生效
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
InsertBraces: false # 是否在 if/for/while/switch 等语句后插入大括号,LLVM 默认值为 false,保持默认不插入

BinPackParameters: OnePerLine
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,6 @@ htmlcov/
# Windows
Thumbs.db
ehthumbs.db
desktop.ini
desktop.ini

data
25 changes: 25 additions & 0 deletions REPORT.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
## CUDA Backend for Chat Server

首先设置环境变量 `LLAISYS_DEVICE=nvidia` 来启用 CUDA backend.

## 流式输出

然后运行 `python/chat_server.py` 启用 OpenAI 风格的 API

然后可以用 `curl` 来测试流式输出

```bash
curl -N http://127.0.0.1:9108/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Accept: text/event-stream" \
-d '{"model":"qwen2","messages":[{"role":"user","content":"Hi who are you?"}],"stream":true,"max_tokens":64,"temperature":0.8,"top_p":0.9,"top_k":40}'
```

### TUI Chatting

```bash
python -m llaisys.chat.tui --url http://127.0.0.1:9108

# 或者用 uv
uv run python -m llaisys.chat.tui --url http://127.0.0.1:9108
```
39 changes: 36 additions & 3 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#define LLAISYS_MODELS_QWEN2_H

#include "../tensor.h"
#include <cstdint>
#include <string>

__C {
struct LlaisysQwen2Meta {
Expand Down Expand Up @@ -31,12 +33,43 @@ __C {

struct LlaisysQwen2Model;

__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);
__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(
const LlaisysQwen2Meta *meta, llaisysDeviceType_t device,
int *device_ids, int ndevice);

__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);

__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(
struct LlaisysQwen2Model * model);

__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
/**
* @brief Inference function for Qwen2 Model. This function combines both
* prefill and decode through `prefill` flag.
* @note This function will reset KV Caches if `prefill` is true.
*
* @param token_ids input token ids
* @param pos_ids input position ids, used for RoPE
*/
__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model,
int64_t *token_ids, int64_t *pos_ids, size_t ntoken, bool prefill);

/**
* @brief Inference with configurable decoding strategy.
*
* This behaves the same as llaisysQwen2ModelInfer for prefill/decode flow,
* but selects the next token using temperature/top-k/top-p sampling.
*
* @param top_k Keep only top-k tokens before sampling (0 = disabled).
* @param top_p Nucleus threshold in (0, 1] (1.0 = disabled).
* @param temperature Positive temperature scalar.
*/
__export int64_t llaisysQwen2ModelInferSample(struct LlaisysQwen2Model * model,
int64_t *token_ids, int64_t *pos_ids,
size_t ntoken, bool prefill,
int top_k, float top_p, float temperature);

__export void llaisysQwen2SetWeights(struct LlaisysQwen2Model * model,
int name, int layer_id,
llaisysTensor_t tensor);
}
#endif // LLAISYS_MODELS_QWEN2_H
14 changes: 14 additions & 0 deletions include/llaisys/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@ __C {
__export void llaisysROPE(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t pos_ids, float theta);
__export void llaisysSelfAttention(llaisysTensor_t attn_val, llaisysTensor_t q, llaisysTensor_t k, llaisysTensor_t v, float scale);
__export void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up);

/**
* @brief Sample a token index from logits with temperature / top-k / top-p.
* @param out Shape {1}, dtype int64. Receives the sampled token index.
* @param logits Shape {vocab_size}, any float dtype. Must be contiguous.
* @param top_k Keep only top-k tokens (0 = disabled).
* @param top_p Nucleus threshold in (0, 1] (1.0 = disabled).
* @param temperature Positive temperature scalar.
*/
__export void llaisysSample(llaisysTensor_t out, llaisysTensor_t logits,
int top_k, float top_p, float temperature);

/** Set the per-thread RNG seed used by llaisysSample. */
__export void llaisysSampleSetSeed(uint64_t seed);
}

#endif
13 changes: 13 additions & 0 deletions python/chat_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import uvicorn

from llaisys.chat.server import build_runtime_from_env, create_app


def main() -> None:
    """Start the LLAISYS chat server under uvicorn.

    Host and port default to the documented 0.0.0.0:9108, but can be
    overridden with the LLAISYS_CHAT_HOST / LLAISYS_CHAT_PORT environment
    variables (the rest of the project is already configured via env vars,
    e.g. LLAISYS_MODEL_PATH and LLAISYS_DEVICE).
    """
    import os  # local import keeps the module's import surface unchanged

    runtime = build_runtime_from_env()
    app = create_app(runtime)
    host = os.environ.get("LLAISYS_CHAT_HOST", "0.0.0.0")
    port = int(os.environ.get("LLAISYS_CHAT_PORT", "9108"))
    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions python/llaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .ops import Ops
from . import models
from .models import *
from .libllaisys import LIB_LLAISYS

__all__ = [
"RuntimeAPI",
Expand All @@ -17,4 +18,5 @@
"Tensor",
"Ops",
"models",
"LIB_LLAISYS"
]
3 changes: 3 additions & 0 deletions python/llaisys/chat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .server import create_app

__all__ = ["create_app"]
209 changes: 209 additions & 0 deletions python/llaisys/chat/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import os
import time
import json
import threading
from dataclasses import dataclass, field
from typing import Literal

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from transformers import AutoTokenizer

from ..libllaisys import DeviceType
from ..models.qwen2 import Qwen2


class ChatMessage(BaseModel):
    """One turn of an OpenAI-style conversation."""

    role: Literal["system", "user", "assistant"]  # author of this turn
    content: str  # plain-text message body


class ChatCompletionRequest(BaseModel):
    """Body of POST /v1/chat/completions (OpenAI-compatible subset)."""

    model: str = Field(default="qwen2")  # model name; echoed back in the response
    messages: list[ChatMessage]  # conversation so far; must be non-empty
    max_tokens: int = Field(default=128, ge=1, le=4096)  # generation budget
    temperature: float = Field(default=0.8, gt=0.0)  # sampling temperature (> 0)
    top_p: float = Field(default=0.9, gt=0.0, le=1.0)  # nucleus threshold (1.0 = disabled)
    top_k: int = Field(default=40, ge=0)  # top-k cutoff (0 = disabled)
    stream: bool = False  # when true, reply with SSE chunks instead of one JSON body


class ChoiceMessage(BaseModel):
    """Assistant message carried inside a completion choice."""

    role: Literal["assistant"] = "assistant"  # responses are always assistant turns
    content: str  # generated reply text


class ChatCompletionChoice(BaseModel):
    """A single completion candidate (this server only ever returns one)."""

    index: int  # position within the choices list
    message: ChoiceMessage
    finish_reason: Literal["stop", "length"]  # why generation ended


class ChatCompletionUsage(BaseModel):
    """Token accounting, mirroring the OpenAI usage object."""

    prompt_tokens: int  # tokens in the rendered prompt
    completion_tokens: int  # newly generated tokens
    total_tokens: int  # prompt_tokens + completion_tokens


class ChatCompletionResponse(BaseModel):
    """Non-streaming response of POST /v1/chat/completions."""

    id: str  # request id, e.g. "chatcmpl-<millis>"
    object: Literal["chat.completion"] = "chat.completion"
    created: int  # unix timestamp (seconds)
    model: str  # echoed from the request
    choices: list[ChatCompletionChoice]
    usage: ChatCompletionUsage


@dataclass
class ChatRuntime:
    """Shared serving state: tokenizer, model, and a generation lock."""

    tokenizer: AutoTokenizer  # HF tokenizer: encode/decode + chat template
    model: Qwen2  # loaded LLAISYS Qwen2 model
    # Handlers hold this lock for the entire generation, serializing requests
    # (the model is used as a single-request-at-a-time resource here).
    lock: threading.Lock = field(default_factory=threading.Lock)


def _render_prompt(messages: list[ChatMessage], tokenizer: AutoTokenizer) -> str:
    """Render chat messages into a single prompt string.

    Prefers the tokenizer's built-in chat template and falls back to a
    simple "<role>\\ncontent\\n" transcript ending with an open assistant
    turn.

    Fix: the previous ``hasattr(tokenizer, "apply_chat_template")`` check
    was insufficient — HF tokenizers generally *have* the method, but it
    raises when no chat template is configured, so the fallback was
    unreachable exactly when it was needed. Use EAFP instead.
    """
    role_map = [{"role": m.role, "content": m.content} for m in messages]
    try:
        return tokenizer.apply_chat_template(
            role_map,
            tokenize=False,
            add_generation_prompt=True,
        )
    except (AttributeError, ValueError):
        # No usable chat template: fall back to the plain transcript format.
        pass

    chunks = [f"<{message['role']}>\n{message['content']}\n" for message in role_map]
    chunks.append("<assistant>\n")
    return "".join(chunks)


def _decode_new_text(tokenizer: AutoTokenizer, all_tokens: list[int], prompt_len: int) -> str:
    """Decode only the completion portion of *all_tokens* (past the prompt)."""
    completion = all_tokens[prompt_len:]
    return tokenizer.decode(completion, skip_special_tokens=True)


def _sse(event: dict) -> str:
return f"data: {json.dumps(event, ensure_ascii=False)}\n\n"


def create_app(runtime: ChatRuntime) -> FastAPI:
    """Build the FastAPI app exposing a minimal OpenAI-compatible chat API.

    Routes:
        GET  /health               -- liveness probe.
        POST /v1/chat/completions  -- blocking JSON or SSE-streaming completion.
    """
    app = FastAPI(title="LLAISYS Chat Server", version="0.1.0")

    @app.get("/health")
    def health() -> dict[str, str]:
        return {"status": "ok"}

    @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
    def chat_completions(req: ChatCompletionRequest):
        if not req.messages:
            raise HTTPException(status_code=400, detail="messages must not be empty")

        # Render the conversation once and tokenize it up front.
        prompt = _render_prompt(req.messages, runtime.tokenizer)
        prompt_tokens = runtime.tokenizer.encode(prompt)

        created = int(time.time())
        # Millisecond timestamp as a cheap request id (not collision-proof).
        response_id = f"chatcmpl-{int(time.time() * 1000)}"

        if req.stream:
            def event_stream():
                # Hold the lock for the whole generation: one request at a time.
                with runtime.lock:
                    # First chunk carries only the assistant role, per the
                    # OpenAI streaming convention.
                    yield _sse({
                        "id": response_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": req.model,
                        "choices": [{
                            "index": 0,
                            "delta": {"role": "assistant"},
                            "finish_reason": None,
                        }],
                    })

                    generated = []
                    last_text = ""
                    for token in runtime.model.generate_stream(
                        prompt_tokens,
                        max_new_tokens=req.max_tokens,
                        top_k=req.top_k,
                        top_p=req.top_p,
                        temperature=req.temperature,
                    ):
                        generated.append(token)
                        # Re-decode the full generated sequence each step and
                        # emit only the new suffix; this avoids emitting broken
                        # partial multi-byte characters from single tokens.
                        current_text = runtime.tokenizer.decode(generated, skip_special_tokens=True)
                        delta_text = current_text[len(last_text):]
                        if not delta_text:
                            continue
                        last_text = current_text
                        yield _sse({
                            "id": response_id,
                            "object": "chat.completion.chunk",
                            "created": created,
                            "model": req.model,
                            "choices": [{
                                "index": 0,
                                "delta": {"content": delta_text},
                                "finish_reason": None,
                            }],
                        })

                    # Terminal chunk with finish_reason, then the SSE sentinel.
                    # NOTE(review): finish_reason is always "stop", even when the
                    # max_tokens budget was exhausted — confirm against generate_stream.
                    yield _sse({
                        "id": response_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": req.model,
                        "choices": [{
                            "index": 0,
                            "delta": {},
                            "finish_reason": "stop",
                        }],
                    })
                    yield "data: [DONE]\n\n"

            return StreamingResponse(event_stream(), media_type="text/event-stream")

        with runtime.lock:
            # NOTE(review): the slicing below assumes generate() returns
            # prompt + completion tokens — confirm against Qwen2.generate.
            generated_tokens = runtime.model.generate(
                prompt_tokens,
                max_new_tokens=req.max_tokens,
                top_k=req.top_k,
                top_p=req.top_p,
                temperature=req.temperature,
            )

        answer_text = _decode_new_text(runtime.tokenizer, generated_tokens, len(prompt_tokens))
        completion_tokens = max(0, len(generated_tokens) - len(prompt_tokens))

        return ChatCompletionResponse(
            id=response_id,
            created=created,
            model=req.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=ChoiceMessage(content=answer_text),
                    finish_reason="stop",
                )
            ],
            usage=ChatCompletionUsage(
                prompt_tokens=len(prompt_tokens),
                completion_tokens=completion_tokens,
                total_tokens=len(prompt_tokens) + completion_tokens,
            ),
        )

    return app


def _parse_device(device_name: str) -> DeviceType:
    """Map a device name to a DeviceType (case-insensitive); default is CPU."""
    normalized = device_name.lower()
    return DeviceType.NVIDIA if normalized == "nvidia" else DeviceType.CPU


def build_runtime_from_env() -> ChatRuntime:
    """Construct the serving runtime from environment variables.

    LLAISYS_MODEL_PATH (default "./data") points at the model directory;
    LLAISYS_DEVICE (default "cpu") selects the backend device.
    """
    path = os.environ.get("LLAISYS_MODEL_PATH", "./data")
    backend = _parse_device(os.environ.get("LLAISYS_DEVICE", "cpu"))

    tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    qwen = Qwen2(model_path=path, device=backend)
    return ChatRuntime(tokenizer=tok, model=qwen)
Loading