implement api /v1/chat/completion (#3)
* add swagger-stats for monitoring performance

* ignore volumes when performing lint

* fix name

* update README

* update env

Signed-off-by: cbh778899 <[email protected]>

* add needed environment variables

Signed-off-by: cbh778899 <[email protected]>

* add function for inference requests

Signed-off-by: cbh778899 <[email protected]>

* move all api routes to a separate function

Signed-off-by: cbh778899 <[email protected]>

* add post /completions

Signed-off-by: cbh778899 <[email protected]>

* add helper functions

Signed-off-by: cbh778899 <[email protected]>

---------

Signed-off-by: cbh778899 <[email protected]>
cbh778899 authored Jul 31, 2024
1 parent 98d50b7 commit 12d3e39
Showing 9 changed files with 203 additions and 11 deletions.
3 changes: 2 additions & 1 deletion .env
@@ -3,8 +3,9 @@ INFERENCE_ENG=llamacpp
INFERENCE_ENG_PORT=8080
INFERENCE_ENG_VERSION=server--b1-2321a5e
NUM_CPU_CORES=8.00
NUM_CPU_CORES_EMBEDDING=4.00
EMBEDDING_ENG=embedding_eng
EMBEDDING_ENG_PORT=8081
NUM_CPU_CORES_EMBEDDING=4.00
LANGUAGE_MODEL_NAME=Phi3-mini-4k-instruct-Q4.gguf
LANGUAGE_MODEL_URL=https://huggingface.co/aisuko/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi3-mini-4k-instruct-Q4.gguf?download=true
EMBEDDING_MODEL_NAME=all-MiniLM-L6-v2-Q4_K_M-v2.gguf
8 changes: 6 additions & 2 deletions Makefile
@@ -4,13 +4,16 @@ CONTAINER_NAME:=voyager:v0.1.0
APP_PORT:=8000
# compose build related
ENV_FILE:=.env

INFERENCE_ENG:=llamacpp
INFERENCE_ENG_PORT:=8080
INFERENCE_ENG_VERSION:=server--b1-2321a5e
NUM_CPU_CORES:=8.00
NUM_CPU_CORES_EMBEDDING:=4.00


EMBEDDING_ENG:=embedding_eng
EMBEDDING_ENG_PORT:=8081
NUM_CPU_CORES_EMBEDDING:=4.00
LANGUAGE_MODEL_NAME:=Phi3-mini-4k-instruct-Q4.gguf
LANGUAGE_MODEL_URL:=https://huggingface.co/aisuko/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi3-mini-4k-instruct-Q4.gguf?download=true
EMBEDDING_MODEL_NAME:=all-MiniLM-L6-v2-Q4_K_M-v2.gguf
@@ -33,8 +36,9 @@ env:
@echo "INFERENCE_ENG_PORT=$(INFERENCE_ENG_PORT)">> $(ENV_FILE)
@echo "INFERENCE_ENG_VERSION=$(INFERENCE_ENG_VERSION)">> $(ENV_FILE)
@echo "NUM_CPU_CORES=$(NUM_CPU_CORES)">> $(ENV_FILE)
@echo "NUM_CPU_CORES_EMBEDDING=$(NUM_CPU_CORES_EMBEDDING)">> $(ENV_FILE)
@echo "EMBEDDING_ENG=$(EMBEDDING_ENG)">> $(ENV_FILE)
@echo "EMBEDDING_ENG_PORT=$(EMBEDDING_ENG_PORT)">> $(ENV_FILE)
@echo "NUM_CPU_CORES_EMBEDDING=$(NUM_CPU_CORES_EMBEDDING)">> $(ENV_FILE)
@echo "LANGUAGE_MODEL_NAME=$(LANGUAGE_MODEL_NAME)">> $(ENV_FILE)
@echo "LANGUAGE_MODEL_URL=$(LANGUAGE_MODEL_URL)">> $(ENV_FILE)
@echo "EMBEDDING_MODEL_NAME=$(EMBEDDING_MODEL_NAME)">> $(ENV_FILE)
70 changes: 70 additions & 0 deletions actions/inference.js
@@ -0,0 +1,70 @@
import { formatOpenAIContext } from "../tools/formatContext.js";
import { generateFingerprint } from "../tools/generator.js";
import { post } from "../tools/request.js";

function generateResponseContent(id, object, model, system_fingerprint, stream, content, stopped) {
    const resp = {
        id,
        object,
        created: Date.now(),
        model,
        system_fingerprint,
        choices: [{
            index: 0,
            [stream ? 'delta':'message']: {
                role: 'assistant',
                content
            },
            logprobs: null,
            finish_reason: stopped ? 'stop' : null
        }],
    }
    if(!stream) {
        resp.usage = {
            prompt_tokens: 0,
            completion_tokens: 0,
            total_tokens: 0
        }
    }
    return resp;
}

export async function chatCompletion(req, res) {
    const api_key = (req.headers.authorization || '').split('Bearer ').pop();
    if(!api_key) {
        res.status(401).send('Not Authorized');
        return;
    }

    const system_fingerprint = generateFingerprint();
    let {messages, ...request_body} = req.body;
    request_body.prompt = formatOpenAIContext(messages);
    const model = request_body.model || process.env.LANGUAGE_MODEL_NAME

    if(request_body.stream) {
        res.setHeader("Content-Type", "text/event-stream");
        res.setHeader("Cache-Control", "no-cache");
        res.setHeader("X-Accel-Buffering", "no");
        res.setHeader("Connection", "Keep-Alive");

        const eng_resp = await post('completion', { body: request_body }, { getJSON: false });
        const reader = eng_resp.body.pipeThrough(new TextDecoderStream()).getReader();
        while(true) {
            const { value, done } = await reader.read();
            if(done) break;
            const data = value.split("data: ").pop()
            const json_data = JSON.parse(data)
            const { content, stop } = json_data;
            res.write(JSON.stringify(generateResponseContent(api_key, 'chat.completion.chunk', model, system_fingerprint, true, content, stop))+'\n\n');
        }
        res.end();
    } else {
        const eng_resp = await post('completion', { body: request_body });
        const { model, content } = eng_resp;
        const response_json = generateResponseContent(
            api_key, 'chat.completion', model, system_fingerprint,
            false, content, true
        )
        res.send(response_json);
    }
}
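For reference, a minimal client-side sketch of how the new endpoint could be exercised once the stack is up. The host, port, and API key below are assumptions for illustration (APP_PORT defaults to 8000 in the Makefile, and the handler only checks that a Bearer token is present):

// Hypothetical client call to the new route (not part of this commit).
const resp = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer voy-example-key' // any non-empty token passes the check
    },
    body: JSON.stringify({
        messages: [{ role: 'user', content: 'Hello!' }],
        stream: false
    })
});
console.log(await resp.json()); // OpenAI-style chat.completion object

With stream: true, the handler instead writes chat.completion.chunk objects to the response as they arrive from the inference engine.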
4 changes: 2 additions & 2 deletions docker-compose.yaml
@@ -12,7 +12,7 @@ services:
    expose:
      - 8080
    ports:
      - 8080:8080
      - ${INFERENCE_ENG_PORT}:8080
    command: ["-m", "models/${LANGUAGE_MODEL_NAME}","-c","8192"]

  embedding_eng:
@@ -28,7 +28,7 @@
    expose:
      - 8080
    ports:
      - 8082:8080
      - ${EMBEDDING_ENG_PORT}:8080
    command: ["-m", "models/${EMBEDDING_MODEL_NAME}","--embeddings","--pooling","mean","-c","512"]

  voyager:
20 changes: 14 additions & 6 deletions routes/index.js
@@ -21,12 +21,20 @@ function indexRoute() {
    return router;
}

function generateAPIRouters() {
    const api_router = Router();

    api_router.use('/chat', inferenceRoute());
    api_router.use('/token', tokenRoute());
    api_router.use('/tracing', tracingRoute());
    api_router.use('/embedding', embeddingRoute());
    api_router.use('/encoder', encoderRoute());
    api_router.use('/decoder', decoderRoute());

    return api_router;
}

export default function buildRoutes(app) {
    app.use('/', indexRoute());
    app.use('/inference', inferenceRoute());
    app.use('/token', tokenRoute());
    app.use('/tracing', tracingRoute());
    app.use('/embedding', embeddingRoute());
    app.use('/encoder', encoderRoute());
    app.use('/decoder', decoderRoute());
    app.use('/v1', generateAPIRouters());
}
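A sketch of how buildRoutes would be wired into an Express entry point; the app.js shown here is an assumption, not part of this diff:

// Hypothetical entry point demonstrating the new mounting scheme.
import express from 'express';
import buildRoutes from './routes/index.js';

const app = express();
app.use(express.json()); // chatCompletion reads req.body
buildRoutes(app);        // indexRoute at '/', API routers under '/v1'
app.listen(8000);        // matches APP_PORT in the Makefile

Since inferenceRoute() is mounted at '/chat' inside the '/v1' router, the completion handler resolves to POST /v1/chat/completions.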
4 changes: 4 additions & 0 deletions routes/inference.js
@@ -1,6 +1,10 @@
import { Router } from "express";
import { chatCompletion } from "../actions/inference.js";

export default function inferenceRoute() {
    const router = Router();

    router.post('/completions', chatCompletion);

    return router;
}
18 changes: 18 additions & 0 deletions tools/formatContext.js
@@ -0,0 +1,18 @@
const system_context = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

export function formatInferenceContext(history, question) {
    let context = system_context;
    context += history.map(({role, message}) => {
        return `### ${role === 'user' ? 'Human' : 'Assistant'}: ${message || ''}`
    }).join('\n');
    context += `\n### Human: ${question}\n### Assistant:`;
    return context;
}

export function formatOpenAIContext(messages) {
    let context = messages.map(({role, content}) => {
        return `### ${role}: ${content}`;
    }).join("\n");
    context += '\n### assistant:'
    return context;
}
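As a quick illustration, here is roughly what formatOpenAIContext produces for a typical messages array (the input below is invented for the example):

// Flattens OpenAI-style messages into the plain prompt string
// that chatCompletion forwards to the inference engine.
import { formatOpenAIContext } from './tools/formatContext.js';

const prompt = formatOpenAIContext([
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'Hello!' }
]);
console.log(prompt);
// ### system: You are a helpful assistant.
// ### user: Hello!
// ### assistant: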
11 changes: 11 additions & 0 deletions tools/generator.js
@@ -0,0 +1,11 @@
export function generateRandomString() {
    return Math.random().toString(32).slice(2)
}

export function generateFingerprint() {
    return 'fp_'+generateRandomString();
}

export function generateAPIKey() {
    return 'voy-'+[...Array(4)].map(generateRandomString).join('')
}
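For reference, a rough sketch of the shapes these helpers return; the exact values differ on every call since they derive from Math.random(), so they are not cryptographically secure:

import { generateFingerprint, generateAPIKey } from './tools/generator.js';

console.log(generateFingerprint()); // e.g. 'fp_4sml0u9jbcg'
console.log(generateAPIKey());      // 'voy-' followed by four random base-32 segments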
76 changes: 76 additions & 0 deletions tools/request.js
@@ -0,0 +1,76 @@
const BASE_URL = {
    "chat": `http://${process.env.INFERENCE_ENG || 'llamacpp'}:${process.env.INFERENCE_ENG_PORT || 8080}`,
    "rag": `http://${process.env.EMBEDDING_ENG || 'embedding_eng'}:${process.env.EMBEDDING_ENG_PORT || 8081}`
}

const default_options = {
    headers: {
        'Content-Type': 'application/json'
    }
}

/**
 * @typedef RequestOptions
 * @property {"rag"|"chat"} eng select between the rag engine and the chat engine, default value is `chat`
 * @property {Boolean} getJSON
 * * If set to `true`, this function will return the result of `await (await fetch(...)).json();`
 * and include an attribute `http_error: true` if any http error occurs during fetch().
 * * If set to `false`, this function will return the result of `await fetch(...);`, without error handling.
 * * default value is `true`
 */

/**
 * A wrapper around the native fetch API that fills in default headers and base urls
 * @param {String} url The url to send the request to
 * @param {RequestInit} options the options to init the request with
 * @param {RequestOptions} request_options extra options to be included
 * @returns {Promise<Response>|Object|{http_error: true}}
 */
export default async function request(url, options={}, request_options={}) {
    const eng = request_options.eng || "chat";
    const getJSON = Object.hasOwn(request_options, 'getJSON') ? request_options.getJSON : true

    // prepend the engine base url, adding a '/' only when the path does not already start with one
    url = `${BASE_URL[eng]}${url[0] !== '/' ? '/' : ''}${url}`;

    options = {
        ...default_options,
        ...options
    }

    if(options.body) {
        options.body = JSON.stringify(options.body)
    }

    const res = await fetch(url, options);
    if(getJSON) {
        if(res.ok) {
            return await res.json();
        } else {
            return { http_error: true }
        }
    } else {
        return res;
    }
}

/**
 * A quick GET wrapper around {@link request}
 * @param {String} url The url to send the request to
 * @param {RequestInit} options the options to init the request with
 * @param {RequestOptions} request_options extra options to be included
 * @returns {Promise<Response>|Object|{http_error: true}}
 */
export function get(url, options, request_options) {
    return request(url, {method: 'GET', ...options}, request_options);
}

/**
 * A quick POST wrapper around {@link request}
 * @param {String} url The url to send the request to
 * @param {RequestInit} options the options to init the request with
 * @param {RequestOptions} request_options extra options to be included
 * @returns {Promise<Response>|Object|{http_error: true}}
 */
export function post(url, options, request_options) {
    return request(url, {method: 'POST', ...options}, request_options);
}
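As a usage sketch, this is roughly how chatCompletion drives the wrapper in its two modes; the prompt strings are invented examples:

// Non-streaming: resolves to the engine's JSON body, or { http_error: true }.
const json = await post('completion', {
    body: { prompt: '### user: Hi\n### assistant:' }
});

// Streaming: getJSON=false returns the raw Response so the
// SSE body can be read incrementally.
const resp = await post('completion', {
    body: { prompt: '### user: Hi\n### assistant:', stream: true }
}, { getJSON: false });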
