23 changes: 23 additions & 0 deletions include/llaisys/models/qwen2.h
@@ -37,6 +37,29 @@ __C {

__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);

// Set a named weight tensor into the model. Returns 0 on success.
__export int llaisysQwen2ModelSetWeight(struct LlaisysQwen2Model * model, const char * name, llaisysTensor_t tensor);

// Optional finalize call after all weights are set. Returns 0 on success.
__export int llaisysQwen2ModelFinalize(struct LlaisysQwen2Model * model);

// Check whether a named weight has been set. Returns 1 if present, 0 otherwise.
__export uint8_t llaisysQwen2ModelHasWeight(struct LlaisysQwen2Model * model, const char * name);

// Run a forward pass over token_ids and return the next token id.
__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
// Variant that samples the next token with temperature / top-k / top-p and a fixed seed.
__export int64_t llaisysQwen2ModelInferSample(
    struct LlaisysQwen2Model * model,
    int64_t * token_ids,
    size_t ntoken,
    float temperature,
    size_t top_k,
    float top_p,
    uint64_t seed);

// KV cache APIs: create an opaque cache sized for max_tokens, append per-step
// key/value tensors, query the current length, and destroy it when done.
__export void *llaisysQwen2KVCreat(struct LlaisysQwen2Model * model, size_t max_tokens);
__export void llaisysQwen2KVDestroy(void *kv);
__export int llaisysQwen2KVAppend(void *kv, llaisysTensor_t k, llaisysTensor_t v);
__export size_t llaisysQwen2KVLen(void *kv);
}
#endif // LLAISYS_MODELS_QWEN2_H
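
For reference, a minimal ctypes sketch of how a caller might drive the new weight-loading entry points. The library name `libllaisys.so` and the opaque `c_void_p` handles are assumptions for illustration; the actual bindings live in the `python/llaisys` package.

```python
import ctypes

# Assumptions for illustration: the shared library is named "libllaisys.so",
# and model / llaisysTensor_t handles are treated as opaque void pointers.
lib = ctypes.CDLL("libllaisys.so")
lib.llaisysQwen2ModelSetWeight.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
lib.llaisysQwen2ModelSetWeight.restype = ctypes.c_int
lib.llaisysQwen2ModelHasWeight.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
lib.llaisysQwen2ModelHasWeight.restype = ctypes.c_uint8
lib.llaisysQwen2ModelFinalize.argtypes = [ctypes.c_void_p]
lib.llaisysQwen2ModelFinalize.restype = ctypes.c_int

def load_weights(model, named_tensors):
    # named_tensors: mapping of weight names to llaisysTensor_t handles.
    for name, tensor in named_tensors.items():
        if lib.llaisysQwen2ModelSetWeight(model, name.encode("utf-8"), tensor) != 0:
            raise RuntimeError(f"failed to set weight {name!r}")
    missing = [n for n in named_tensors
               if not lib.llaisysQwen2ModelHasWeight(model, n.encode("utf-8"))]
    if missing or lib.llaisysQwen2ModelFinalize(model) != 0:
        raise RuntimeError(f"model not fully initialized, missing: {missing}")
```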
7 changes: 7 additions & 0 deletions include/llaisys/ops.h
@@ -6,6 +6,13 @@
__C {
__export void llaisysAdd(llaisysTensor_t c, llaisysTensor_t a, llaisysTensor_t b);
__export void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals);
// Sample a token id from logits with temperature scaling and top-k / top-p filtering.
__export void llaisysRandomSample(
llaisysTensor_t out_idx,
llaisysTensor_t logits,
float temperature,
size_t top_k,
float top_p,
uint64_t seed);
__export void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight);
__export void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias);
__export void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in);
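
`llaisysRandomSample` follows the usual temperature / top-k / top-p pipeline. As a reference for the intended semantics only (the C implementation is not shown in this diff), a NumPy sketch:

```python
import numpy as np

def random_sample(logits, temperature=0.8, top_k=50, top_p=0.9, seed=0):
    """Reference semantics for llaisysRandomSample (assumed, not the C code)."""
    logits = np.asarray(logits, dtype=np.float64)
    if temperature <= 0.0:
        return int(np.argmax(logits))  # degenerate case: greedy
    # Temperature-scaled softmax.
    z = logits / temperature
    probs = np.exp(z - z.max())
    probs /= probs.sum()
    # Top-k: keep the k most probable tokens (0 means "no top-k cut").
    order = np.argsort(probs)[::-1]
    if top_k > 0:
        order = order[:top_k]
    # Top-p (nucleus): truncate once cumulative probability reaches top_p.
    cum = np.cumsum(probs[order])
    order = order[: int(np.searchsorted(cum, top_p) + 1)]
    kept = probs[order] / probs[order].sum()
    return int(np.random.default_rng(seed).choice(order, p=kept))
```

Top-k trims to the k most probable tokens first; top-p then truncates the sorted tail once cumulative probability reaches the threshold, and the survivors are renormalized before drawing with the seeded generator.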
3 changes: 3 additions & 0 deletions python/llaisys/chat/__init__.py
@@ -0,0 +1,3 @@
from .engine import ChatEngine

__all__ = ["ChatEngine"]
137 changes: 137 additions & 0 deletions python/llaisys/chat/cli.py
@@ -0,0 +1,137 @@
import argparse
import json
import sys
import urllib.error
import urllib.request
from typing import Any, Dict, List, Optional


def _post_json(url: str, payload: Dict[str, Any], stream: bool):
body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
headers = {
"Content-Type": "application/json",
"Accept": "text/event-stream" if stream else "application/json",
}
req = urllib.request.Request(url, data=body, headers=headers, method="POST")
return urllib.request.urlopen(req, timeout=600)


def _send_chat(
endpoint: str,
model: str,
messages: List[Dict[str, str]],
max_new_tokens: int,
temperature: float,
top_k: int,
top_p: float,
seed: Optional[int],
stream: bool,
) -> str:
payload = {
"model": model,
"messages": messages,
"max_new_tokens": int(max_new_tokens),
"temperature": float(temperature),
"top_k": int(top_k),
"top_p": float(top_p),
"stream": bool(stream),
}
if seed is not None:
payload["seed"] = int(seed)

if not stream:
with _post_json(endpoint, payload, stream=False) as resp:
data = json.loads(resp.read().decode("utf-8"))
return data["choices"][0]["message"]["content"]

assistant_text = ""
with _post_json(endpoint, payload, stream=True) as resp:
for raw in resp:
line = raw.decode("utf-8").strip()
if not line or not line.startswith("data:"):
continue
content = line[5:].strip()
if content == "[DONE]":
break

chunk = json.loads(content)
choices = chunk.get("choices", [])
if not choices:
continue
delta = choices[0].get("delta", {})
piece = delta.get("content", "")
if piece:
sys.stdout.write(piece)
sys.stdout.flush()
assistant_text += piece
sys.stdout.write("\n")
sys.stdout.flush()
return assistant_text


def main():
parser = argparse.ArgumentParser(description="LLAISYS interactive chat CLI")
parser.add_argument("--server", default="http://127.0.0.1:8000")
parser.add_argument("--model", default="qwen2")
parser.add_argument("--max-new-tokens", default=256, type=int)
parser.add_argument("--temperature", default=0.8, type=float)
parser.add_argument("--top-k", default=50, type=int)
parser.add_argument("--top-p", default=0.9, type=float)
parser.add_argument("--seed", default=None, type=int)
parser.add_argument("--system", default=None, type=str)
parser.add_argument("--stream", action="store_true")
args = parser.parse_args()

endpoint = args.server.rstrip("/") + "/v1/chat/completions"
messages: List[Dict[str, str]] = []
if args.system:
messages.append({"role": "system", "content": args.system})

print("Commands: /exit to quit, /reset to clear history.")
while True:
try:
user_text = input("You: ").strip()
except (EOFError, KeyboardInterrupt):
print()
break

if not user_text:
continue
if user_text in ("/exit", "/quit"):
break
if user_text == "/reset":
messages = []
if args.system:
messages.append({"role": "system", "content": args.system})
print("History cleared.")
continue

messages.append({"role": "user", "content": user_text})

try:
if args.stream:
sys.stdout.write("Assistant: ")
sys.stdout.flush()
assistant_text = _send_chat(
endpoint=endpoint,
model=args.model,
messages=messages,
max_new_tokens=args.max_new_tokens,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
seed=args.seed,
stream=args.stream,
)
if not args.stream:
print(f"Assistant: {assistant_text}")
messages.append({"role": "assistant", "content": assistant_text})
except urllib.error.HTTPError as exc:
detail = exc.read().decode("utf-8", errors="replace")
print(f"HTTP {exc.code}: {detail}")
except Exception as exc: # pragma: no cover - network/runtime errors
print(f"Request failed: {exc}")


if __name__ == "__main__":
main()
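
The streaming branch of `_send_chat` consumes OpenAI-style server-sent events. A hypothetical event in the shape this parser expects (the serving code is not part of this diff):

```python
import json

# Hypothetical streaming event; the field names mirror what _send_chat reads.
event = {"choices": [{"index": 0, "delta": {"content": "Hel"}}]}
sse_line = "data: " + json.dumps(event, ensure_ascii=False) + "\n\n"

# The stream terminates with a literal sentinel line:
done_line = "data: [DONE]\n\n"
```

A typical session, assuming the package layout in this diff makes the module runnable as `python -m llaisys.chat.cli --server http://127.0.0.1:8000 --stream`.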
112 changes: 112 additions & 0 deletions python/llaisys/chat/engine.py
@@ -0,0 +1,112 @@
from typing import Any, Dict, Iterable, List, Optional

from transformers import AutoTokenizer

from .. import DeviceType
from ..models import Qwen2


def _normalize_content(content: Any) -> str:
if isinstance(content, str):
return content
if isinstance(content, list):
chunks: List[str] = []
for item in content:
if isinstance(item, dict):
if item.get("type") == "text" and isinstance(item.get("text"), str):
chunks.append(item["text"])
elif isinstance(item, str):
chunks.append(item)
return "".join(chunks)
return str(content)


def _normalize_messages(messages: Iterable[Dict[str, Any]]) -> List[Dict[str, str]]:
conversation: List[Dict[str, str]] = []
for message in messages:
if not isinstance(message, dict):
continue
role = str(message.get("role", "user"))
content = _normalize_content(message.get("content", ""))
conversation.append({"role": role, "content": content})
return conversation


class ChatEngine:
def __init__(
self,
model_path: str,
device: str = "cpu",
max_new_tokens: int = 256,
top_k: int = 50,
top_p: float = 0.9,
temperature: float = 0.8,
):
self.model_path = model_path
self.max_new_tokens = int(max_new_tokens)
self.top_k = int(top_k)
self.top_p = float(top_p)
self.temperature = float(temperature)

self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if device.lower() == "nvidia":
model_device = DeviceType.NVIDIA
else:
model_device = DeviceType.CPU
self.model = Qwen2(model_path, model_device)

def complete(
self,
messages: Iterable[Dict[str, Any]],
max_new_tokens: Optional[int] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
temperature: Optional[float] = None,
seed: Optional[int] = None,
) -> Dict[str, Any]:
conversation = _normalize_messages(messages)
input_content = self.tokenizer.apply_chat_template(
conversation=conversation,
add_generation_prompt=True,
tokenize=False,
)
input_ids = self.tokenizer.encode(input_content)

out_ids = self.model.generate(
input_ids,
max_new_tokens=self.max_new_tokens if max_new_tokens is None else int(max_new_tokens),
top_k=self.top_k if top_k is None else int(top_k),
top_p=self.top_p if top_p is None else float(top_p),
temperature=self.temperature if temperature is None else float(temperature),
seed=seed,
)

prompt_tokens = len(input_ids)
if len(out_ids) >= prompt_tokens:
generated_ids = out_ids[prompt_tokens:]
else:
generated_ids = []
assistant_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

return {
"prompt_tokens": prompt_tokens,
"completion_tokens": len(generated_ids),
"total_tokens": prompt_tokens + len(generated_ids),
"output_ids": out_ids,
"generated_ids": generated_ids,
"assistant_text": assistant_text,
}

def iter_text_deltas(self, generated_ids: Iterable[int]):
seen = ""
buffer: List[int] = []
for token_id in generated_ids:
buffer.append(int(token_id))
current = self.tokenizer.decode(buffer, skip_special_tokens=True)
if current.startswith(seen):
delta = current[len(seen):]
else:
delta = current
seen = current
if delta:
yield delta
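
Tying the engine together, a short usage sketch; the checkpoint path is a placeholder, and `Qwen2.generate` is assumed to accept the keyword arguments that `complete` forwards:

```python
from llaisys.chat import ChatEngine

# Placeholder path: any local Qwen2 checkpoint with a tokenizer config.
engine = ChatEngine("/path/to/qwen2-checkpoint", device="cpu", max_new_tokens=64)

result = engine.complete([{"role": "user", "content": "Say hi in five words."}])
print(result["assistant_text"], result["completion_tokens"])

# Re-decode the generated ids incrementally, as a streaming server would.
for delta in engine.iter_text_deltas(result["generated_ids"]):
    print(delta, end="", flush=True)
print()
```

`iter_text_deltas` re-decodes the whole generated prefix on each token, so byte sequences that only become valid text after a later token arrive are emitted once, as complete characters, rather than as mojibake fragments.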