69 changes: 0 additions & 69 deletions .ci/scripts/test_qnn_static_llama.sh

This file was deleted.

94 changes: 94 additions & 0 deletions .ci/scripts/test_qnn_static_llm.sh
@@ -0,0 +1,94 @@
#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -euxo pipefail

source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

# Default to empty so the check below works under `set -u` when no argument is given.
TASK_NAME="${1:-}"
if [[ -z "${TASK_NAME}" ]]; then
  echo "Missing task name, exiting..."
  exit 1
fi


# Download the QNN SDK; if it is already installed, this just exports the environment paths.
source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
install_qnn

# This script lives in .ci/scripts, so the repo root is two levels up (matching the ../../ used above).
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
export PYTHONPATH=".."
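# Stage the flatbuffer schemas and the prebuilt QNN adaptor modules where the
# Python tests expect to import them from.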
cp schema/program.fbs exir/_serialize/program.fbs
cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"

# The static llama CI itself does not need graphviz, but test_qnn_delegate.py imports it.
pip install graphviz

set +e
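# errexit is disabled above so each test's exit status can be captured into
# exit_code* variables and reported before this script decides whether to fail.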

echo "Executing task: $TASK_NAME"
if [[ "${TASK_NAME}" == "stories_110m" ]]; then
# Download stories llama110m artifacts
download_stories_model_artifacts
echo "Creating tokenizer.bin"
$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin

# Compile only as weight sharing is not applicable on x86.
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
exit_code1=$?

# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
exit_code2=$?

# Check the exit codes and print messages
if [ $exit_code1 -ne 0 ]; then
echo "Static Llama compile only with weight sharing test failed. $exit_code1."
fi

if [ $exit_code2 -ne 0 ]; then
echo "Static Llama accuracy test failed. $exit_code2."
fi

if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
exit 1
else
exit 0
fi

elif [[ "${TASK_NAME}" == "stories_260k_bc" ]]; then

# Check BC
bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh
exit_code1=$?
if [ $exit_code1 -ne 0 ]; then
exit 1
else
exit 0
fi

elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_smollm2 --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64
exit_code1=$?
if [ $exit_code1 -ne 0 ]; then
exit 1
else
exit 0
fi
else
echo "Unsupported task: $TASK_NAME"
exit 1
fi
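
For local runs outside CI, the script takes the task name as its only argument. A minimal invocation sketch, assuming the QNN SDK plus the build-x86/ and build-android/ trees from the CI image are already in place:

  cd /path/to/executorch   # repo root; the copy steps above are relative to it
  PYTHON_EXECUTABLE=python3 bash .ci/scripts/test_qnn_static_llm.sh smollm2_135m
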
13 changes: 7 additions & 6 deletions .github/workflows/pull.yml
@@ -550,20 +550,22 @@ jobs:
         # Test llama2
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"

-  test-static-llama-qnn-linux:
-    name: test-static-llama-qnn-linux
+  test-static-llm-qnn-linux:
+    name: test-static-llm-qnn-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     strategy:
+      matrix:
+        task: [stories_110m, stories_260k_bc, smollm2_135m]
       fail-fast: false
     with:
-      runner: linux.2xlarge
+      runner: linux.24xlarge
       docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 180
+      timeout: 900
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
@@ -580,8 +582,7 @@ jobs:
         # Setup install_requirements for llama
         PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh

-        # Test static llama weight sharing and accuracy
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh ${{ matrix.task }}

   test-qnn-models-linux:
     name: test-qnn-models-linux
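
With the matrix above, this single job definition fans out into three independent runs, one per task; each run is equivalent to invoking (a sketch of the expansion):

  PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh stories_110m
  PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh stories_260k_bc
  PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh smollm2_135m

fail-fast: false keeps the remaining tasks running when one of them fails.
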
3 changes: 2 additions & 1 deletion backends/qualcomm/tests/test_qnn_delegate.py
@@ -5344,8 +5344,9 @@ def test_static_smollm2(self):
         if "Error" in msg:
             self.fail(msg["Error"])
         else:
             self.assertLessEqual(msg["wiki_ppl"], 25)
-            self.assertGreaterEqual(msg["inference_speed"], 200)
+            if not self.enable_x86_64:
+                self.assertGreaterEqual(msg["inference_speed"], 200)

     def test_static_smollm3(self):
         if not self.required_envs():
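
Aside: if more static-LLM tests adopt this gating, it could be factored into a small helper. A sketch, not part of this PR, assuming the same msg dict shape and enable_x86_64 flag:

  def assert_llm_quality(self, msg, max_wiki_ppl, min_tokens_per_sec):
      # Perplexity is checked on every backend.
      self.assertLessEqual(msg["wiki_ppl"], max_wiki_ppl)
      # Inference speed is only meaningful on device; the x86_64 path runs
      # on the host and does not give representative throughput numbers.
      if not self.enable_x86_64:
          self.assertGreaterEqual(msg["inference_speed"], min_tokens_per_sec)
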