InfiniTensor · vankari · Jan 16, 2026 · Jan 17, 2026 · Jan 17, 2026 · Jan 22, 2026
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -7,14 +7,18 @@ on:
       - 'LICENSE'
 
 jobs:
-  build:
-    name: Build
+  build-cuda:
+    name: Build and test (CUDA, ${{ matrix.target }})
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-latest]
-        type: [release]
+        include:
+          - target: ubuntu
+            os: ubuntu-latest
     runs-on: ${{ matrix.os }}
+    env:
+      TMPDIR: ${{ github.workspace }}/.tmp
+      PYTHONUTF8: "1"
     steps:
 
     - name: checkout code
@@ -24,37 +28,144 @@ jobs:
       uses: xmake-io/github-action-setup-xmake@v1
       with:
         xmake-version: latest
-
-    - name: Xmake Build & Install
-      run: | 
-        xmake
+
+    - name: install cuda toolkit
+      uses: Jimver/cuda-toolkit@v0.2.24
+      with:
+        cuda: "12.8.0"
+
+    - name: prepare tmp (linux)
+      if: matrix.target == 'ubuntu'
+      run: |
+        mkdir -p "$TMPDIR"
+      shell: bash
+
+    - name: prepare tmp (windows)
+      if: matrix.target == 'windows'
+      run: |
+        New-Item -ItemType Directory -Force -Path $env:TMPDIR | Out-Null
+      shell: pwsh
+
+    - name: check toolchain (linux)
+      if: matrix.target == 'ubuntu'
+      run: |
+        command -v python
+        command -v pip
+        command -v xmake
+        command -v nvcc
+        python --version
+        pip --version
+        xmake --version
+        nvcc --version
+      shell: bash
+
+    - name: check toolchain (windows)
+      if: matrix.target == 'windows'
+      run: |
+        Get-Command python
+        Get-Command pip
+        Get-Command xmake
+        if (-not $env:CUDA_PATH) {
+          $nvccCmd = Get-Command nvcc -ErrorAction SilentlyContinue
+          if ($nvccCmd) {
+            $env:CUDA_PATH = Split-Path (Split-Path $nvccCmd.Source -Parent) -Parent
+          }
+        }
+        if (-not $env:CUDA_PATH) {
+          throw "CUDA_PATH is not set and nvcc not found in PATH"
+        }
+        "$env:CUDA_PATH\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+        python --version
+        pip --version
+        xmake --version
+        & "$env:CUDA_PATH\bin\nvcc.exe" --version
+      shell: pwsh
+
+    - name: detect nvidia gpu (linux)
+      if: matrix.target == 'ubuntu'
+      run: |
+        # A GPU counts only if nvidia-smi exists AND succeeds; a failing probe must skip the GPU tests.
+        if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi; then
+          echo "HAS_NVIDIA_GPU=1" >> "$GITHUB_ENV"
+        else
+          echo "HAS_NVIDIA_GPU=0" >> "$GITHUB_ENV"
+        fi
+      shell: bash
+
+    - name: detect nvidia gpu (windows)
+      if: matrix.target == 'windows'
+      run: |
+        if (Get-Command nvidia-smi -ErrorAction SilentlyContinue) {
+          nvidia-smi
+          "HAS_NVIDIA_GPU=1" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+        } else {
+          "HAS_NVIDIA_GPU=0" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+        }
+      shell: pwsh
+
+    - name: Xmake CUDA Build & Install (linux)
+      if: matrix.target == 'ubuntu'
+      run: |
+        xmake f --nv-gpu=y -c -v
+        xmake -v
+        xmake install
+      shell: bash
+
+    - name: Xmake CUDA Build & Install (windows)
+      if: matrix.target == 'windows'
+      run: |
+        if (-not $env:CUDA_PATH) {
+          throw "CUDA_PATH is required for windows CUDA build"
+        }
+        $env:Path = "$env:CUDA_PATH\bin;$env:Path"
+        xmake f -p windows -a x64 --toolchain=msvc --cuda="$env:CUDA_PATH" --nv-gpu=y -c -v
+        xmake -v
         xmake install
+      shell: pwsh
 
-    - name: Install Python
+    - name: Install Python (linux)
+      if: matrix.target == 'ubuntu'
       run: | 
         cd python
-        pip install .
+        pip install ./llaisyscore/
+        pip install ./server-project/
         cd ..
+      shell: bash
 
-    - name: Assignment-0
+    - name: Install Python (windows)
+      if: matrix.target == 'windows'
       run: |
-        python test/test_runtime.py --device cpu
+        Set-Location python
+        pip install ./llaisyscore/
+        pip install ./server-project/
+        Set-Location ..
+      shell: pwsh
 
-    - name: Assignment-1
+    - name: CUDA runtime api test
+      if: env.HAS_NVIDIA_GPU == '1'
       run: |
-        python test/test_tensor.py
-
-    - name: Assignment-2
+        python test/test_runtime.py --device nvidia
+
+    - name: CUDA ops tests
+      if: env.HAS_NVIDIA_GPU == '1'
+      run: |
+        python test/ops/add.py --device nvidia
+        python test/ops/argmax.py --device nvidia
+        python test/ops/embedding.py --device nvidia
+        python test/ops/linear.py --device nvidia
+        python test/ops/random_sample.py --device nvidia
+        python test/ops/rms_norm.py --device nvidia
+        python test/ops/rope.py --device nvidia
+        python test/ops/self_attention.py --device nvidia
+        python test/ops/swiglu.py --device nvidia
+
+    - name: CUDA infer test
+      if: env.HAS_NVIDIA_GPU == '1'
       run: |
-        python test/ops/add.py 
-        python test/ops/argmax.py
-        python test/ops/embedding.py
-        python test/ops/linear.py 
-        python test/ops/rms_norm.py
-        python test/ops/rope.py
-        python test/ops/self_attention.py
-        python test/ops/swiglu.py
+        python test/test_infer.py --device nvidia --test
 
-    - name: Assignment-3
+    - name: skip gpu tests when no gpu
+      if: env.HAS_NVIDIA_GPU != '1'
       run: |
-        python test/test_infer.py --test
+        echo "No NVIDIA GPU available on this runner, skipped runtime/ops/infer GPU tests."
+
diff --git a/.gitignore b/.gitignore
@@ -9,7 +9,8 @@ lib/
 *.dll
 *.dylib
 *.pyd
-
+# tmpfile
+.tmp/
 # MacOS Cache
 .DS_Store
 

diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h
@@ -29,14 +29,20 @@ __C {
         llaisysTensor_t *mlp_down_w;
     };
 
-    struct LlaisysQwen2Model;
+    struct LlaisysQwen2Model // NOTE(review): replaces the former opaque forward declaration; the layout is now visible to every includer of this header
+    {
+        struct LlaisysQwen2Meta *meta;       // pointer to model metadata; ownership not shown here — TODO confirm
+        struct LlaisysQwen2Weights *weights; // same pointer type that llaisysQwen2ModelWeights returns
+        llaisysDeviceType_t device;          // presumably the device type given to llaisysQwen2ModelCreate — verify
+        int *device_ids;                     // presumably an array of ndevice ids — TODO confirm length and ownership
+        int ndevice;                         // presumably the element count of device_ids — verify
+    };
 
     __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);
-
     __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);
-
     __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
-
-    __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
+    __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken,
+                                            llaisysTensor_t *kcache, llaisysTensor_t *vcache, size_t past_len,
+                                            float temperature, int top_k, float top_p, int64_t seed);
 }
 #endif // LLAISYS_MODELS_QWEN2_H
diff --git a/include/llaisys/ops.h b/include/llaisys/ops.h
@@ -13,6 +13,8 @@ __C {
     __export void llaisysROPE(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t pos_ids, float theta);
     __export void llaisysSelfAttention(llaisysTensor_t attn_val, llaisysTensor_t q, llaisysTensor_t k, llaisysTensor_t v, float scale);
     __export void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up);
+    __export void llaisysRandomSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t logits,
+                                      float temperature, int top_k, float top_p, int64_t seed);
 }
 
 #endif
diff --git a/include/llaisys/tensor.h b/include/llaisys/tensor.h
@@ -63,6 +63,16 @@ __C {
         size_t dim,
         size_t start,
         size_t end);
+    __export llaisysTensor_t tensorReshape(
+        llaisysTensor_t tensor,
+        size_t * shape,
+        size_t ndim);
+
+    __export llaisysTensor_t tensorTo(
+        llaisysTensor_t tensor,
+        llaisysDeviceType_t device_type,
+        int device_id);
+
 }
 
 #endif // LLAISYS_TENSOR_H
diff --git a/python/README.md b/python/README.md
@@ -0,0 +1,25 @@
+# llaisys-server
+
+Standalone server package for Project #3.
+
+## Install order
+
+1. Install core package (the absolute paths below are from the author's checkout; substitute your own):
+
+```bash
+python3 -m pip install -e /home/vankari/code/llaisys/python/llaisyscore --user --break-system-packages
+```
+
+2. Install server package:
+
+```bash
+python3 -m pip install -e /home/vankari/code/llaisys/python/server-project --user --break-system-packages
+```
+
+## Run server
+
+```bash
+cd /home/vankari/code/llaisys/python
+python3 -m uvicorn server.app:app --host 0.0.0.0 --port 8000
+```
+### Attention! The `--break-system-packages` flag was enabled for a public server; it should be disabled on a personal computer.
diff --git a/python/llaisys/__init__.py b/python/llaisys/__init__.py
@@ -6,6 +6,7 @@
 from .tensor import Tensor
 from .ops import Ops
 from . import models
+from . import backend
 from .models import *
 
 __all__ = [
@@ -17,4 +18,5 @@
     "Tensor",
     "Ops",
     "models",
+    "backend",
 ]
diff --git a/python/llaisys/backend/__init__.py b/python/llaisys/backend/__init__.py
@@ -0,0 +1,3 @@
+from .inference_backend import InferenceBackend, SessionState
+
+__all__ = ["InferenceBackend", "SessionState"]
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,7 +9,8 @@ lib/ @@
     *.dll
     *.dylib
     *.pyd
+    # tmpfile
+    .tmp/
     # MacOS Cache
     .DS_Store
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .inference_backend import InferenceBackend, SessionState

		__all__ = ["InferenceBackend", "SessionState"]