
Commit 165fe68

Add GPT-2 inference notebook using ORTEngine and Kinference
1 parent 293171f commit 165fe68

File tree

1 file changed: +302 -0 lines changed

notebooks/kinference/ORTGPT2.ipynb

@@ -0,0 +1,302 @@
{
"cells": [
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-02-10T13:27:05.671253Z",
"start_time": "2025-02-10T13:27:05.206956Z"
}
},
"source": [
"val kinferencerVersion = \"0.2.26\"\n",
"val ktorVersion = \"3.0.3\"\n",
"\n",
"USE {\n",
"    repositories {\n",
"        mavenCentral()\n",
"        maven(\"https://packages.jetbrains.team/maven/p/ki/maven\")\n",
"        maven(\"https://packages.jetbrains.team/maven/p/grazi/grazie-platform-public\")\n",
"    }\n",
"    dependencies {\n",
"        implementation(\"io.kinference:inference-core-jvm:$kinferencerVersion\")\n",
"        implementation(\"io.kinference:inference-ort-jvm:$kinferencerVersion\")\n",
"        implementation(\"io.kinference:serializer-protobuf-jvm:$kinferencerVersion\")\n",
"        implementation(\"io.kinference:utils-common-jvm:$kinferencerVersion\")\n",
"        implementation(\"io.kinference:ndarray-core-jvm:$kinferencerVersion\")\n",
"\n",
"        implementation(\"io.ktor:ktor-client-core-jvm:$ktorVersion\")\n",
"        implementation(\"io.ktor:ktor-client-cio-jvm:$ktorVersion\")\n",
"\n",
"        implementation(\"org.slf4j:slf4j-api:2.0.9\")\n",
"        implementation(\"org.slf4j:slf4j-simple:2.0.9\")\n",
"\n",
"        implementation(\"ai.djl:api:0.28.0\")\n",
"        implementation(\"ai.djl.huggingface:tokenizers:0.28.0\")\n",
"    }\n",
"}"
],
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-10T13:27:05.732096Z",
"start_time": "2025-02-10T13:27:05.678435Z"
}
},
"cell_type": "code",
"source": [
"import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer\n",
"import io.kinference.core.data.tensor.KITensor\n",
"import io.kinference.core.data.tensor.asTensor\n",
"import io.kinference.ndarray.arrays.FloatNDArray\n",
"import io.kinference.ndarray.arrays.FloatNDArray.Companion.invoke\n",
"import io.kinference.ort.ORTData\n",
"import io.kinference.ort.ORTEngine\n",
"import io.kinference.ort.data.tensor.ORTTensor\n",
"import io.kinference.utils.CommonDataLoader\n",
"import io.kinference.utils.inlines.InlineInt\n",
"import io.kinference.utils.toIntArray\n",
"import okio.Path.Companion.toPath\n",
"import io.kinference.core.KIONNXData\n",
"import io.kinference.ndarray.arrays.LongNDArray\n",
"import io.kinference.ndarray.arrays.NumberNDArrayCore\n",
"import io.ktor.client.HttpClient\n",
"import io.ktor.client.plugins.HttpTimeout\n",
"import io.ktor.client.request.prepareRequest\n",
"import io.ktor.client.statement.bodyAsChannel\n",
"import io.ktor.util.cio.writeChannel\n",
"import io.ktor.utils.io.copyAndClose\n",
"import java.io.File\n",
"import kotlinx.coroutines.runBlocking"
],
"outputs": [],
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-10T13:27:06.563564Z",
"start_time": "2025-02-10T13:27:06.070567Z"
}
},
"cell_type": "code",
"source": [
"/**\n",
" * Directory used to store cached files.\n",
" *\n",
" * This variable combines the user's current working directory\n",
" * with a \".cache\" subdirectory to create the path for storing cache files.\n",
" * It is used in various functions to check for existing files or directories,\n",
" * create new ones if they do not exist, and manage the caching of downloaded files.\n",
" */\n",
"val cacheDirectory = System.getProperty(\"user.dir\") + \"/.cache/\"\n",
"\n",
"/**\n",
" * Downloads a file from the given URL and saves it with the specified file name.\n",
" *\n",
" * Checks if the directory specified by `cacheDirectory` exists.\n",
" * If not, it creates the directory. If the file already exists,\n",
" * the download is skipped. Otherwise, the file is downloaded\n",
" * using an HTTP client with a 10-minute timeout setting.\n",
" *\n",
" * @param url The URL from which to download the file.\n",
" * @param fileName The name to use for the downloaded file.\n",
" * @param timeout Optional timeout duration for the download request, in milliseconds.\n",
" * Defaults to 600,000 milliseconds (10 minutes).\n",
" * Increase the timeout if you are not sure that the download of the particular model will fit into the default timeout.\n",
" */\n",
"suspend fun downloadFile(url: String, fileName: String, timeout: Long = 600_000) {\n",
"    // Ensure the predefined path is treated as a directory\n",
"    val directory = File(cacheDirectory)\n",
"\n",
"    // Check if the directory exists, if not create it\n",
"    if (!directory.exists()) {\n",
"        println(\"Predefined directory doesn't exist. Creating directory at $cacheDirectory.\")\n",
"        directory.mkdirs() // Create the directory if it doesn't exist\n",
"    }\n",
"\n",
"    // Check if the file already exists\n",
"    val file = File(directory, fileName)\n",
"    if (file.exists()) {\n",
"        println(\"File already exists at ${file.absolutePath}. Skipping download.\")\n",
"        return // Exit the function if the file exists\n",
"    }\n",
"\n",
"    // Create an instance of HttpClient with custom timeout settings\n",
"    val client = HttpClient {\n",
"        install(HttpTimeout) {\n",
"            requestTimeoutMillis = timeout\n",
"        }\n",
"    }\n",
"\n",
"    // Download the file and write to the specified output path\n",
"    client.prepareRequest(url).execute { response ->\n",
"        response.bodyAsChannel().copyAndClose(file.writeChannel())\n",
"    }\n",
"\n",
"    client.close()\n",
"}\n",
"\n",
"/**\n",
" * Extracts the token ID with the highest probability from the output tensor.\n",
" *\n",
" * @param output A map containing the output tensors identified by their names.\n",
" * @param tokensSize The number of tokens in the sequence.\n",
" * @param outputName The name of the tensor containing the logits.\n",
" * @return The ID of the top token.\n",
" */\n",
"suspend fun extractTopToken(output: Map<String, KIONNXData<*>>, tokensSize: Int, outputName: String): Long {\n",
"    val logits = output[outputName]!! as KITensor\n",
"    val sliced = logits.data.slice(\n",
"        starts = intArrayOf(0, 0, tokensSize - 1, 0), // First batch, first element in the second dimension, last token, first vocab entry\n",
"        ends = intArrayOf(1, 1, tokensSize, 50257),   // Same batch, same second dimension, one token step, whole vocab (50257)\n",
"        steps = intArrayOf(1, 1, 1, 1)                // Step of 1 for each dimension\n",
"    ) as NumberNDArrayCore\n",
"    val softmax = sliced.softmax(axis = -1)\n",
"    val topK = softmax.topK(\n",
"        axis = -1,      // Apply top-k along the last dimension (vocabulary size)\n",
"        k = 1,          // Retrieve the top 1 element\n",
"        largest = true, // We want the largest probabilities (most probable tokens)\n",
"        sorted = false  // Sorting is unnecessary since we are only retrieving the top 1\n",
"    )\n",
"    val tokenId = (topK.second as LongNDArray)[intArrayOf(0, 0, 0, 0)]\n",
"\n",
"    return tokenId\n",
"}\n",
"\n",
"suspend fun convertToKITensorMap(outputs: Map<String, ORTData<*>>): Map<String, KITensor> {\n",
"    return outputs.map { (name, ortTensor) ->\n",
"        val ortTensor = ortTensor as ORTTensor\n",
"        val data = ortTensor.toFloatArray()\n",
"        val shape = ortTensor.shape.toIntArray()\n",
"        val ndArray = FloatNDArray(shape) { idx: InlineInt -> data[idx.value] }\n",
"        val kiTensor = ndArray.asTensor(name)\n",
"        return@map name to kiTensor\n",
"    }.toMap()\n",
"}"
],
"outputs": [],
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-10T13:27:06.737944Z",
"start_time": "2025-02-10T13:27:06.715791Z"
}
},
"cell_type": "code",
"source": [
"// Constants for input and output tensor names used in the GPT-2 model\n",
"val INPUT_TENSOR_NAME = \"input1\"\n",
"val OUTPUT_TENSOR_NAME = \"output1\" // We use only the logits tensor"
],
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-10T13:27:07.461104Z",
"start_time": "2025-02-10T13:27:07.440849Z"
}
},
"cell_type": "code",
"source": [
"val modelUrl = \"https://github.com/onnx/models/raw/main/validated/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx\"\n",
"val modelName = \"gpt2-lm-head-10\"\n"
],
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-10T13:27:38.180925Z",
"start_time": "2025-02-10T13:27:09.793527Z"
}
},
"cell_type": "code",
"source": [
"runBlocking {\n",
"    println(\"Downloading model from: $modelUrl\")\n",
"    downloadFile(modelUrl, \"$modelName.onnx\") // GPT-2 from the model zoo is around 650 MB; adjust your timeout if needed\n",
"\n",
"    println(\"Loading model...\")\n",
"    val model = ORTEngine.loadModel(\"$cacheDirectory/$modelName.onnx\".toPath())\n",
"\n",
"    val tokenizer = HuggingFaceTokenizer.newInstance(\"gpt2\", mapOf(\"modelMaxLength\" to \"1024\"))\n",
"    val testString = \"Neurogenesis is most active during embryonic development and is responsible for producing \" +\n",
"        \"all the various types of neurons of the organism, but it continues throughout adult life \" +\n",
"        \"in a variety of organisms. Once born, neurons do not divide (see mitosis), and many will \" +\n",
"        \"live the lifespan of the animal, except under extraordinary and usually pathogenic circumstances.\"\n",
"    val encoded = tokenizer.encode(testString)\n",
"    val tokens = encoded.ids\n",
"    val tokensSize = tokens.size\n",
"\n",
"    val predictionLength = 34\n",
"    val outputTokens = LongArray(predictionLength) { 0 }\n",
"\n",
"    val input = ORTTensor(tokens, longArrayOf(1, 1, tokensSize.toLong()))\n",
"    var currentContext = input.clone(INPUT_TENSOR_NAME)\n",
"\n",
"    print(\"Here goes the test text for generation:\\n$testString\")\n",
"\n",
"    for (idx in 0 until predictionLength) {\n",
"        val inputTensor = listOf(currentContext)\n",
"        val output = model.predict(inputTensor)\n",
"\n",
"        outputTokens[idx] = extractTopToken(convertToKITensorMap(output), tokensSize + idx, OUTPUT_TENSOR_NAME)\n",
"\n",
"        val newTokenArray = tokens + outputTokens.slice(IntRange(0, idx))\n",
"        currentContext = ORTTensor(newTokenArray, longArrayOf(1, 1, tokensSize + idx + 1L), INPUT_TENSOR_NAME)\n",
"        print(tokenizer.decode(longArrayOf(outputTokens[idx])))\n",
"    }\n",
"    println(\"\\n\\nDone\")\n",
"}"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading model from: https://github.com/onnx/models/raw/main/validated/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx\n",
"Loading model...\n",
"Here goes the test text for generation:\n",
"Neurogenesis is most active during embryonic development and is responsible for producing all the various types of neurons of the organism, but it continues throughout adult life in a variety of organisms. Once born, neurons do not divide (see mitosis), and many will live the lifespan of the animal, except under extraordinary and usually pathogenic circumstances.\n",
"\n",
"The most common type of neurogenesis is the development of the hippocampus, which is the area of the brain that contains the hippocampus's electrical and chemical signals.\n",
"\n",
"Done\n"
]
}
],
"execution_count": 6
}
],
"metadata": {
"kernelspec": {
"display_name": "Kotlin",
"language": "kotlin",
"name": "kotlin"
},
"language_info": {
"name": "kotlin",
"version": "1.9.23",
"mimetype": "text/x-kotlin",
"file_extension": ".kt",
"pygments_lexer": "kotlin",
"codemirror_mode": "text/x-kotlin",
"nbconvert_exporter": ""
},
"ktnbPluginMetadata": {
"projectLibraries": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}
