implement api /v1/chat/completion (#3)
* add swagger-stats for monitoring performance

* ignore volumes when performing lint

* fix name

* update README

* update env

Signed-off-by: cbh778899 <[email protected]>

* add needed environment variables

Signed-off-by: cbh778899 <[email protected]>

* add function for inference requests

Signed-off-by: cbh778899 <[email protected]>

* move all api routes to a separate function

Signed-off-by: cbh778899 <[email protected]>

* add post /completions

Signed-off-by: cbh778899 <[email protected]>

* add helper functions

Signed-off-by: cbh778899 <[email protected]>

---------

Signed-off-by: cbh778899 <[email protected]>
cbh778899 authored Jul 31, 2024
1 parent 98d50b7 commit 12d3e39
Showing 9 changed files with 203 additions and 11 deletions.
3 changes: 2 additions & 1 deletion .env
@@ -3,8 +3,9 @@ INFERENCE_ENG=llamacpp
INFERENCE_ENG_PORT=8080
INFERENCE_ENG_VERSION=server--b1-2321a5e
NUM_CPU_CORES=8.00
NUM_CPU_CORES_EMBEDDING=4.00
EMBEDDING_ENG=embedding_eng
EMBEDDING_ENG_PORT=8081
NUM_CPU_CORES_EMBEDDING=4.00
LANGUAGE_MODEL_NAME=Phi3-mini-4k-instruct-Q4.gguf
LANGUAGE_MODEL_URL=https://huggingface.co/aisuko/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi3-mini-4k-instruct-Q4.gguf?download=true
EMBEDDING_MODEL_NAME=all-MiniLM-L6-v2-Q4_K_M-v2.gguf
8 changes: 6 additions & 2 deletions Makefile
@@ -4,13 +4,16 @@ CONTAINER_NAME:=voyager:v0.1.0
APP_PORT:=8000
# compose build related
ENV_FILE:=.env

INFERENCE_ENG:=llamacpp
INFERENCE_ENG_PORT:=8080
INFERENCE_ENG_VERSION:=server--b1-2321a5e
NUM_CPU_CORES:=8.00
NUM_CPU_CORES_EMBEDDING:=4.00


EMBEDDING_ENG:=embedding_eng
EMBEDDING_ENG_PORT:=8081
NUM_CPU_CORES_EMBEDDING:=4.00
LANGUAGE_MODEL_NAME:=Phi3-mini-4k-instruct-Q4.gguf
LANGUAGE_MODEL_URL:=https://huggingface.co/aisuko/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi3-mini-4k-instruct-Q4.gguf?download=true
EMBEDDING_MODEL_NAME:=all-MiniLM-L6-v2-Q4_K_M-v2.gguf
@@ -33,8 +36,9 @@ env:
@echo "INFERENCE_ENG_PORT=$(INFERENCE_ENG_PORT)">> $(ENV_FILE)
@echo "INFERENCE_ENG_VERSION=$(INFERENCE_ENG_VERSION)">> $(ENV_FILE)
@echo "NUM_CPU_CORES=$(NUM_CPU_CORES)">> $(ENV_FILE)
@echo "NUM_CPU_CORES_EMBEDDING=$(NUM_CPU_CORES_EMBEDDING)">> $(ENV_FILE)
@echo "EMBEDDING_ENG=$(EMBEDDING_ENG)">> $(ENV_FILE)
@echo "EMBEDDING_ENG_PORT=$(EMBEDDING_ENG_PORT)">> $(ENV_FILE)
@echo "NUM_CPU_CORES_EMBEDDING=$(NUM_CPU_CORES_EMBEDDING)">> $(ENV_FILE)
@echo "LANGUAGE_MODEL_NAME=$(LANGUAGE_MODEL_NAME)">> $(ENV_FILE)
@echo "LANGUAGE_MODEL_URL=$(LANGUAGE_MODEL_URL)">> $(ENV_FILE)
@echo "EMBEDDING_MODEL_NAME=$(EMBEDDING_MODEL_NAME)">> $(ENV_FILE)
70 changes: 70 additions & 0 deletions actions/inference.js
@@ -0,0 +1,70 @@
import { formatOpenAIContext } from "../tools/formatContext.js";
import { generateFingerprint } from "../tools/generator.js";
import { post } from "../tools/request.js";

function generateResponseContent(id, object, model, system_fingerprint, stream, content, stopped) {
    const resp = {
        id,
        object,
        created: Date.now(),
        model,
        system_fingerprint,
        choices: [{
            index: 0,
            [stream ? 'delta':'message']: {
                role: 'assistant',
                content
            },
            logprobs: null,
            finish_reason: stopped ? 'stop' : null
        }],
    }
    if(!stream) {
        resp.usage = {
            prompt_tokens: 0,
            completion_tokens: 0,
            total_tokens: 0
        }
    }
    return resp;
}

export async function chatCompletion(req, res) {
    const api_key = (req.headers.authorization || '').split('Bearer ').pop();
    if(!api_key) {
        res.status(401).send('Not Authorized');
        return;
    }

    const system_fingerprint = generateFingerprint();
    let {messages, ...request_body} = req.body;
    request_body.prompt = formatOpenAIContext(messages);
    const model = request_body.model || process.env.LANGUAGE_MODEL_NAME

    if(request_body.stream) {
        res.setHeader("Content-Type", "text/event-stream");
        res.setHeader("Cache-Control", "no-cache");
        res.setHeader("X-Accel-Buffering", "no");
        res.setHeader("Connection", "Keep-Alive");

        const eng_resp = await post('completion', { body: request_body }, { getJSON: false });
        const reader = eng_resp.body.pipeThrough(new TextDecoderStream()).getReader();
        while(true) {
            const { value, done } = await reader.read();
            if(done) break;
            const data = value.split("data: ").pop()
            const json_data = JSON.parse(data)
            const { content, stop } = json_data;
            res.write(JSON.stringify(generateResponseContent(api_key, 'chat.completion.chunk', model, system_fingerprint, true, content, stop))+'\n\n');
        }
        res.end();
    } else {
        const eng_resp = await post('completion', { body: request_body });
        const { model, content } = eng_resp;
        const response_json = generateResponseContent(
            api_key, 'chat.completion', model, system_fingerprint,
            false, content, true
        )
        res.send(response_json);
    }
}
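For reference, a minimal client-side sketch of how the new endpoint could be exercised once the stack is up. The host, port, and API key below are assumptions for illustration (APP_PORT defaults to 8000 in the Makefile, and the handler only checks that a Bearer token is present):

// Hypothetical client call to the new route (not part of this commit).
const resp = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer voy-example-key' // any non-empty token passes the check
    },
    body: JSON.stringify({
        messages: [{ role: 'user', content: 'Hello!' }],
        stream: false
    })
});
console.log(await resp.json()); // OpenAI-style chat.completion object

With stream: true, the handler instead writes chat.completion.chunk objects to the response as they arrive from the inference engine.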
4 changes: 2 additions & 2 deletions docker-compose.yaml
@@ -12,7 +12,7 @@ services:
    expose:
      - 8080
    ports:
      - 8080:8080
      - ${INFERENCE_ENG_PORT}:8080
    command: ["-m", "models/${LANGUAGE_MODEL_NAME}","-c","8192"]

  embedding_eng:
@@ -28,7 +28,7 @@
    expose:
      - 8080
    ports:
      - 8082:8080
      - ${EMBEDDING_ENG_PORT}:8080
    command: ["-m", "models/${EMBEDDING_MODEL_NAME}","--embeddings","--pooling","mean","-c","512"]

  voyager:
20 changes: 14 additions & 6 deletions routes/index.js
@@ -21,12 +21,20 @@ function indexRoute() {
    return router;
}

function generateAPIRouters() {
    const api_router = Router();

    api_router.use('/chat', inferenceRoute());
    api_router.use('/token', tokenRoute());
    api_router.use('/tracing', tracingRoute());
    api_router.use('/embedding', embeddingRoute());
    api_router.use('/encoder', encoderRoute());
    api_router.use('/decoder', decoderRoute());

    return api_router;
}

export default function buildRoutes(app) {
    app.use('/', indexRoute());
    app.use('/inference', inferenceRoute());
    app.use('/token', tokenRoute());
    app.use('/tracing', tracingRoute());
    app.use('/embedding', embeddingRoute());
    app.use('/encoder', encoderRoute());
    app.use('/decoder', decoderRoute());
    app.use('/v1', generateAPIRouters());
}
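A sketch of how buildRoutes would be wired into an Express entry point; the app.js shown here is an assumption, not part of this diff:

// Hypothetical entry point demonstrating the new mounting scheme.
import express from 'express';
import buildRoutes from './routes/index.js';

const app = express();
app.use(express.json()); // chatCompletion reads req.body
buildRoutes(app);        // indexRoute at '/', API routers under '/v1'
app.listen(8000);        // matches APP_PORT in the Makefile

Since inferenceRoute() is mounted at '/chat' inside the '/v1' router, the completion handler resolves to POST /v1/chat/completions.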
4 changes: 4 additions & 0 deletions routes/inference.js
@@ -1,6 +1,10 @@
import { Router } from "express";
import { chatCompletion } from "../actions/inference.js";

export default function inferenceRoute() {
    const router = Router();

    router.post('/completions', chatCompletion);

    return router;
}
18 changes: 18 additions & 0 deletions tools/formatContext.js
@@ -0,0 +1,18 @@
const system_context = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

export function formatInferenceContext(history, question) {
    let context = system_context;
    context += history.map(({role, message}) => {
        return `### ${role === 'user' ? 'Human' : 'Assistant'}: ${message || ''}`
    }).join('\n');
    context += `\n### Human: ${question}\n### Assistant:`;
    return context;
}

export function formatOpenAIContext(messages) {
    let context = messages.map(({role, content}) => {
        return `### ${role}: ${content}`;
    }).join("\n");
    context += '\n### assistant:'
    return context;
}
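As a quick illustration, here is roughly what formatOpenAIContext produces for a typical messages array (the input below is invented for the example):

// Flattens OpenAI-style messages into the plain prompt string
// that chatCompletion forwards to the inference engine.
import { formatOpenAIContext } from './tools/formatContext.js';

const prompt = formatOpenAIContext([
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'Hello!' }
]);
console.log(prompt);
// ### system: You are a helpful assistant.
// ### user: Hello!
// ### assistant: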
11 changes: 11 additions & 0 deletions tools/generator.js
@@ -0,0 +1,11 @@
export function generateRandomString() {
    return Math.random().toString(32).slice(2)
}

export function generateFingerprint() {
    return 'fp_'+generateRandomString();
}

export function generateAPIKey() {
    return 'voy-'+[...Array(4)].map(generateRandomString).join('')
}
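For reference, a rough sketch of the shapes these helpers return; the exact values differ on every call since they derive from Math.random(), so they are not cryptographically secure:

import { generateFingerprint, generateAPIKey } from './tools/generator.js';

console.log(generateFingerprint()); // e.g. 'fp_4sml0u9jbcg'
console.log(generateAPIKey());      // 'voy-' followed by four random base-32 segments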
76 changes: 76 additions & 0 deletions tools/request.js
@@ -0,0 +1,76 @@
const BASE_URL = {
    "chat": `http://${process.env.INFERENCE_ENG || 'llamacpp'}:${process.env.INFERENCE_ENG_PORT || 8080}`,
    "rag": `http://${process.env.EMBEDDING_ENG || 'embedding_eng'}:${process.env.EMBEDDING_ENG_PORT || 8081}`
}

const default_options = {
    headers: {
        'Content-Type': 'application/json'
    }
}

/**
 * @typedef RequestOptions
 * @property {"rag"|"chat"} eng select between the rag engine and the chat engine, default value is `chat`
 * @property {Boolean} getJSON
 * * If set to `true`, this function will return the result of `await (await fetch(...)).json();`
 * and include an attribute `http_error: true` if any http error occurs during fetch().
 * * If set to `false`, this function will return the result of `await fetch(...);`, without error handling.
 * * default value is `true`
 */

/**
 * A wrapper around the native fetch API that fills in default headers and base urls
 * @param {String} url The url to send the request to
 * @param {RequestInit} options the options to init the request with
 * @param {RequestOptions} request_options extra options to be included
 * @returns {Promise<Response>|Object|{http_error: true}}
 */
export default async function request(url, options={}, request_options={}) {
    const eng = request_options.eng || "chat";
    const getJSON = Object.hasOwn(request_options, 'getJSON') ? request_options.getJSON : true

    // prepend the engine base url, adding a '/' only when the path does not already start with one
    url = `${BASE_URL[eng]}${url[0] !== '/' ? '/' : ''}${url}`;

    options = {
        ...default_options,
        ...options
    }

    if(options.body) {
        options.body = JSON.stringify(options.body)
    }

    const res = await fetch(url, options);
    if(getJSON) {
        if(res.ok) {
            return await res.json();
        } else {
            return { http_error: true }
        }
    } else {
        return res;
    }
}

/**
 * A quick GET wrapper around {@link request}
 * @param {String} url The url to send the request to
 * @param {RequestInit} options the options to init the request with
 * @param {RequestOptions} request_options extra options to be included
 * @returns {Promise<Response>|Object|{http_error: true}}
 */
export function get(url, options, request_options) {
    return request(url, {method: 'GET', ...options}, request_options);
}

/**
 * A quick POST wrapper around {@link request}
 * @param {String} url The url to send the request to
 * @param {RequestInit} options the options to init the request with
 * @param {RequestOptions} request_options extra options to be included
 * @returns {Promise<Response>|Object|{http_error: true}}
 */
export function post(url, options, request_options) {
    return request(url, {method: 'POST', ...options}, request_options);
}
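As a usage sketch, this is roughly how chatCompletion drives the wrapper in its two modes; the prompt strings are invented examples:

// Non-streaming: resolves to the engine's JSON body, or { http_error: true }.
const json = await post('completion', {
    body: { prompt: '### user: Hi\n### assistant:' }
});

// Streaming: getJSON=false returns the raw Response so the
// SSE body can be read incrementally.
const resp = await post('completion', {
    body: { prompt: '### user: Hi\n### assistant:', stream: true }
}, { getJSON: false });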
