diff --git a/.github/workflows/sycl-docs.yml b/.github/workflows/sycl-docs.yml index 5c1e8e425111b..6b748ec9c7ebb 100644 --- a/.github/workflows/sycl-docs.yml +++ b/.github/workflows/sycl-docs.yml @@ -49,7 +49,13 @@ jobs: mkdir clang mv $GITHUB_WORKSPACE/build/tools/sycl/doc/html/* . mv $GITHUB_WORKSPACE/build/tools/clang/docs/html/* clang/ + cp -r $GITHUB_WORKSPACE/repo/devops/scripts/benchmarks/html benchmarks touch .nojekyll + # Update benchmarking dashboard configuration + cat << 'EOF' > benchmarks/config.js + remoteDataUrl = 'https://raw.githubusercontent.com/intel/llvm-ci-perf-results/refs/heads/unify-ci/UR_DNP_INTEL_06_03/data.json'; + defaultCompareNames = ["Baseline_PVC_L0"]; + EOF # Upload the generated docs as an artifact and deploy to GitHub Pages. - name: Upload artifact uses: actions/upload-pages-artifact@v3 diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index 2f3c02bf334ed..3a93c2aae254c 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -114,6 +114,15 @@ on: default: '' required: False + benchmark_upload_results: + type: string + default: 'false' + required: False + benchmark_build_hash: + type: string + default: '' + required: False + workflow_dispatch: inputs: runner: @@ -126,6 +135,7 @@ on: - '["cts-cpu"]' - '["Linux", "build"]' - '["cuda"]' + - '["Linux", "bmg"]' - '["PVC_PERF"]' image: type: choice @@ -154,6 +164,7 @@ on: - e2e - cts - compute-benchmarks + - benchmark_v2 env: description: | @@ -337,3 +348,14 @@ jobs: env: RUNNER_TAG: ${{ inputs.runner }} GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }} + + - name: Run benchmarks + if: inputs.tests_selector == 'benchmark_v2' + uses: ./devops/actions/run-tests/benchmark_v2 + with: + target_devices: ${{ inputs.target_devices }} + upload_results: ${{ inputs.benchmark_upload_results }} + build_hash: ${{ inputs.benchmark_build_hash }} + env: + RUNNER_TAG: ${{ inputs.runner }} + GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/sycl-ur-perf-benchmarking.yml b/.github/workflows/sycl-ur-perf-benchmarking.yml new file mode 100644 index 0000000000000..8e860bce6a384 --- /dev/null +++ b/.github/workflows/sycl-ur-perf-benchmarking.yml @@ -0,0 +1,129 @@ +name: Run Benchmarks + +on: + schedule: + - cron: '0 1 * * *' # 2 hrs earlier than sycl-nightly.yml + workflow_call: + inputs: + commit_hash: + type: string + required: false + default: '' + upload_results: + type: string # true/false: workflow_dispatch does not support booleans + required: true + runner: + type: string + required: true + backend: + type: string + required: true + reset_intel_gpu: + type: string # true/false: workflow_dispatch does not support booleans + required: true + default: true + + workflow_dispatch: + inputs: + commit_hash: + description: Commit hash to build intel/llvm from + type: string + required: false + default: '' + upload_results: + description: 'Save and upload results' + type: choice + options: + - false + - true + default: true + runner: + type: choice + options: + - '["PVC_PERF"]' + backend: + description: Backend to use + type: choice + options: + - 'level_zero:gpu' + # TODO L0 V2 support + reset_intel_gpu: + description: Reset Intel GPUs + type: choice + options: + - false + - true + default: true + +permissions: read-all + +jobs: + build_sycl: + name: Build SYCL from PR + if: inputs.commit_hash != '' + uses: ./.github/workflows/sycl-linux-build.yml + with: + 
build_ref: ${{ inputs.commit_hash }} + build_cache_root: "/__w/" + build_artifact_suffix: "default" + build_cache_suffix: "default" + # Docker image has last nightly pre-installed and added to the PATH + build_image: "ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest" + cc: clang + cxx: clang++ + changes: '[]' + + run_benchmarks_build: + name: Run Benchmarks (on PR Build) + needs: [ build_sycl ] + if: inputs.commit_hash != '' + strategy: + matrix: + # Set default values if not specified: + include: + - runner: ${{ inputs.runner || '["PVC_PERF"]' }} + backend: ${{ inputs.backend || 'level_zero:gpu' }} + reset_intel_gpu: ${{ inputs.reset_intel_gpu || 'true' }} + ref: ${{ inputs.commit_hash }} + uses: ./.github/workflows/sycl-linux-run-tests.yml + secrets: inherit + with: + # TODO support other benchmarks + name: Run compute-benchmarks (${{ matrix.runner }}, ${{ matrix.backend }}) + runner: ${{ matrix.runner }} + image: ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest + image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + target_devices: ${{ matrix.backend }} + reset_intel_gpu: ${{ matrix.reset_intel_gpu }} + tests_selector: benchmark_v2 + benchmark_upload_results: ${{ inputs.upload_results }} + benchmark_build_hash: ${{ inputs.commit_hash }} + repo_ref: ${{ matrix.ref }} + devops_ref: ${{ github.ref }} + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.build_sycl.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.build_sycl.outputs.artifact_decompress_command }} + + run_benchmarks_nightly: + name: Run Benchmarks (on Nightly Build) + if: inputs.commit_hash == '' + strategy: + matrix: + # Set default values if not specified: + include: + - runner: ${{ inputs.runner || '["PVC_PERF"]' }} + backend: ${{ inputs.backend || 'level_zero:gpu' }} + reset_intel_gpu: ${{ inputs.reset_intel_gpu || 'true' }} + uses: ./.github/workflows/sycl-linux-run-tests.yml + secrets: inherit + with: + # TODO support other benchmarks + name: Run compute-benchmarks (${{ matrix.runner }}, ${{ matrix.backend }}) + runner: ${{ matrix.runner }} + image: ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest + image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + target_devices: ${{ matrix.backend }} + reset_intel_gpu: ${{ matrix.reset_intel_gpu }} + tests_selector: benchmark_v2 + benchmark_upload_results: ${{ inputs.upload_results }} + repo_ref: ${{ github.ref }} diff --git a/.github/workflows/ur-benchmarks-reusable.yml b/.github/workflows/ur-benchmarks-reusable.yml index 66ffcecd70314..d7c32edfdfc2a 100644 --- a/.github/workflows/ur-benchmarks-reusable.yml +++ b/.github/workflows/ur-benchmarks-reusable.yml @@ -1,12 +1,220 @@ name: Benchmarks Reusable -# This workflow is a WIP: This workflow file acts as a placeholder. +on: + workflow_call: + inputs: + str_name: + required: true + type: string + pr_no: + required: true + # even though this is a number, this is a workaround for issues with + # reusable workflow calls that result in "Unexpected value '0'" error. 
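+        # A value of '0' means the benchmarks run against the sycl main branch rather than a PR.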
+ type: string + bench_script_params: + required: false + type: string + default: '' + sycl_config_params: + required: false + type: string + default: '' + upload_report: + required: false + type: boolean + default: false + compute_runtime_commit: + required: false + type: string + default: '' -on: [ workflow_call ] +permissions: + contents: read + pull-requests: write jobs: - do-nothing: - runs-on: ubuntu-latest + bench-run: + name: Build SYCL, Run Benchmarks + strategy: + matrix: + adapter: [ + {str_name: "${{ inputs.str_name }}", + sycl_config: "${{ inputs.sycl_config_params }}" + } + ] + build_type: [Release] + compiler: [{c: clang, cxx: clang++}] + + runs-on: "PVC_PERF" + steps: - - run: echo 'This workflow is a WIP.' - + - name: Add comment to PR + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + if: ${{ always() && inputs.pr_no != 0 }} + with: + script: | + const pr_no = '${{ inputs.pr_no }}'; + const adapter = '${{ matrix.adapter.str_name }}'; + const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; + const params = '${{ inputs.bench_script_params }}'; + const body = `Compute Benchmarks ${adapter} run (with params: ${params}):\n${url}`; + + github.rest.issues.createComment({ + issue_number: pr_no, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }) + + - name: Checkout SYCL + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + path: sycl-repo + + # We need to fetch special ref for proper PR's merge commit. Note, this ref may be absent if the PR is already merged. + - name: Fetch PR's merge commit + if: ${{ inputs.pr_no != 0 }} + working-directory: ${{github.workspace}}/sycl-repo + run: | + git fetch -- https://github.com/${{github.repository}} +refs/pull/${{ inputs.pr_no }}/*:refs/remotes/origin/pr/${{ inputs.pr_no }}/* + git checkout origin/pr/${{ inputs.pr_no }}/merge + git rev-parse origin/pr/${{ inputs.pr_no }}/merge + + # TODO: As long as we didn't merge this workflow into main, we should allow both scripts location + - name: Establish bench scripts location + run: | + if [ -d "${{github.workspace}}/sycl-repo/devops/scripts/benchmarks" ]; then + echo "Bench scripts are in devops/scripts" + echo "BENCH_SCRIPTS_DIR=${{github.workspace}}/sycl-repo/devops/scripts/benchmarks" >> $GITHUB_ENV + elif [ -d "${{github.workspace}}/sycl-repo/unified-runtime/scripts/benchmarks" ]; then + echo "Bench scripts are in unified-runtime/scripts" + echo "BENCH_SCRIPTS_DIR=${{github.workspace}}/sycl-repo/unified-runtime/scripts/benchmarks" >> $GITHUB_ENV + else + echo "Bench scripts are absent...?" + exit 1 + fi + + - name: Create virtual environment + run: python -m venv .venv + + - name: Activate virtual environment and install pip packages + run: | + source .venv/bin/activate + pip install -r ${BENCH_SCRIPTS_DIR}/requirements.txt + + - name: Configure SYCL + run: > + python3 sycl-repo/buildbot/configure.py + -t ${{matrix.build_type}} + -o ${{github.workspace}}/sycl_build + --cmake-gen "Ninja" + --cmake-opt="-DLLVM_INSTALL_UTILS=ON" + --cmake-opt="-DSYCL_PI_TESTS=OFF" + --cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache + --cmake-opt=-DCMAKE_CXX_COMPILER_LAUNCHER=ccache + ${{matrix.adapter.sycl_config}} + + - name: Build SYCL + run: cmake --build ${{github.workspace}}/sycl_build -j $(nproc) + + # We need a complete installed UR for compute-benchmarks. 
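+      # The resulting ur_install tree is passed to the benchmark script below via --ur.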
+ - name: Configure UR + run: > + cmake -DCMAKE_BUILD_TYPE=${{matrix.build_type}} + -S${{github.workspace}}/sycl-repo/unified-runtime + -B${{github.workspace}}/ur_build + -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/ur_install + -DUR_BUILD_TESTS=OFF + -DUR_BUILD_ADAPTER_L0=ON + -DUR_BUILD_ADAPTER_L0_V2=ON + -DUMF_DISABLE_HWLOC=ON + + - name: Build UR + run: cmake --build ${{github.workspace}}/ur_build -j $(nproc) + + - name: Install UR + run: cmake --install ${{github.workspace}}/ur_build + + - name: Compute core range + run: | + # Compute the core range for the first NUMA node; second node is for UMF jobs. + # Skip the first 4 cores - the kernel is likely to schedule more work on these. + CORES="$(lscpu | awk ' + /NUMA node0 CPU|On-line CPU/ {line=$0} + END { + split(line, a, " ") + split(a[4], b, ",") + sub(/^0/, "4", b[1]) + print b[1] + }')" + echo "Selected core: $CORES" + echo "CORES=$CORES" >> $GITHUB_ENV + + ZE_AFFINITY_MASK=0 + echo "ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK" >> $GITHUB_ENV + + - name: Run benchmarks + working-directory: ${{ github.workspace }} + id: benchmarks + run: > + source .venv/bin/activate && + taskset -c "${{ env.CORES }}" ${BENCH_SCRIPTS_DIR}/main.py + ~/llvm_bench_workdir + --sycl ${{ github.workspace }}/sycl_build + --ur ${{ github.workspace }}/ur_install + --adapter ${{ matrix.adapter.str_name }} + --compare baseline + --compute-runtime ${{ inputs.compute_runtime_commit }} + --build-igc + ${{ inputs.upload_report && '--output-html' || '' }} + ${{ inputs.pr_no != 0 && '--output-markdown' || '' }} + ${{ inputs.bench_script_params }} + + - name: Print benchmark results + run: | + cat ${{ github.workspace }}/benchmark_results.md || true + + - name: Add comment to PR + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + if: ${{ always() && inputs.pr_no != 0 }} + with: + script: | + let markdown = "" + try { + const fs = require('fs'); + markdown = fs.readFileSync('benchmark_results.md', 'utf8'); + } catch(err) { + } + + const pr_no = '${{ inputs.pr_no }}'; + const adapter = '${{ matrix.adapter.str_name }}'; + const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; + const test_status = '${{ steps.benchmarks.outcome }}'; + const job_status = '${{ job.status }}'; + const params = '${{ inputs.bench_script_params }}'; + const body = `Benchmarks ${adapter} run (${params}):\n${url}\nJob status: ${job_status}. 
Test status: ${test_status}.\n ${markdown}`; + + github.rest.issues.createComment({ + issue_number: pr_no, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }) + + - name: Rename benchmark results file + if: ${{ always() && inputs.upload_report }} + run: mv benchmark_results.html benchmark_results_${{ inputs.pr_no }}.html + + - name: Upload HTML report + if: ${{ always() && inputs.upload_report }} + uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 + with: + path: benchmark_results_${{ inputs.pr_no }}.html + key: benchmark-results-${{ inputs.pr_no }}-${{ matrix.adapter.str_name }}-${{ github.run_id }} + + # TODO: As long as we didn't merge this workflow into main, we should allow both scripts location + - name: Get information about platform + if: ${{ always() }} + run: | + ${{github.workspace}}/sycl-repo/devops/scripts/get_system_info.sh || true + ${{github.workspace}}/sycl-repo/unified-runtime/.github/scripts/get_system_info.sh || true diff --git a/.github/workflows/ur-benchmarks.yml b/.github/workflows/ur-benchmarks.yml index 23fbb1ad903b4..cde4bfa828d71 100644 --- a/.github/workflows/ur-benchmarks.yml +++ b/.github/workflows/ur-benchmarks.yml @@ -1,12 +1,53 @@ name: Benchmarks -# This workflow is a WIP: this workflow file acts as a placeholder. +on: + workflow_dispatch: + inputs: + str_name: + description: Adapter + type: choice + required: true + default: 'level_zero' + options: + - level_zero + - level_zero_v2 + pr_no: + description: PR number (0 is sycl main branch) + type: number + required: true + bench_script_params: + description: Benchmark script arguments + type: string + required: false + default: '' + sycl_config_params: + description: Extra params for SYCL configuration + type: string + required: false + default: '' + compute_runtime_commit: + description: 'Compute Runtime commit' + type: string + required: false + default: '' + upload_report: + description: 'Upload HTML report' + type: boolean + required: false + default: false -on: [ workflow_dispatch ] +permissions: + contents: read + pull-requests: write jobs: - do-nothing: - runs-on: ubuntu-latest - steps: - - run: echo 'This workflow is a WIP.' 
- + manual: + name: Compute Benchmarks + uses: ./.github/workflows/ur-benchmarks-reusable.yml + with: + str_name: ${{ inputs.str_name }} + pr_no: ${{ inputs.pr_no }} + bench_script_params: ${{ inputs.bench_script_params }} + sycl_config_params: ${{ inputs.sycl_config_params }} + compute_runtime_commit: ${{ inputs.compute_runtime_commit }} + upload_report: ${{ inputs.upload_report }} diff --git a/.github/workflows/ur-build-hw.yml b/.github/workflows/ur-build-hw.yml index a0f94ab10f538..eebac4e424a4b 100644 --- a/.github/workflows/ur-build-hw.yml +++ b/.github/workflows/ur-build-hw.yml @@ -156,4 +156,4 @@ jobs: - name: Get information about platform if: ${{ always() }} - run: ${{github.workspace}}/unified-runtime/.github/scripts/get_system_info.sh + run: ${{github.workspace}}/devops/scripts/get_system_info.sh diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml index e357e2bddec30..03b7d4ad776fd 100644 --- a/devops/actions/run-tests/benchmark/action.yml +++ b/devops/actions/run-tests/benchmark/action.yml @@ -95,7 +95,6 @@ runs: if: always() shell: bash run: | - # TODO -- waiting on security clearance # Load configuration values $(python ./devops/scripts/benchmarking/load_config.py ./devops constants) diff --git a/devops/actions/run-tests/benchmark_v2/action.yml b/devops/actions/run-tests/benchmark_v2/action.yml new file mode 100644 index 0000000000000..bab571ec16ff2 --- /dev/null +++ b/devops/actions/run-tests/benchmark_v2/action.yml @@ -0,0 +1,135 @@ +name: 'Run benchmarks' + +# This action assumes the following prerequisites: +# +# - SYCL is placed in ./toolchain -- TODO change this +# - /devops has been checked out in ./devops. +# - env.GITHUB_TOKEN was properly set, because according to Github, that's +# apparently the recommended way to pass a secret into a github action: + +# https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions#accessing-your-secrets +# +# - env.RUNNER_TAG set to the runner tag used to run this workflow: Currently, +# only specific runners are fully supported. + +inputs: + target_devices: + type: string + required: True + upload_results: + type: string + required: True + build_hash: + type: string + required: False + default: '' + +runs: + using: "composite" + steps: + - name: Check specified runner type / target backend + shell: bash + env: + TARGET_DEVICE: ${{ inputs.target_devices }} + RUNNER_NAME: ${{ runner.name }} + run: | + case "$RUNNER_TAG" in + '["PVC_PERF"]' ) ;; + *) + echo "#" + echo "# WARNING: Only specific tuned runners are fully supported." + echo "# This workflow is not guaranteed to work with other runners." + echo "#" ;; + esac + + # Ensure runner name has nothing injected + # TODO: in terms of security, is this overkill? + if [ -z "$(printf '%s' "$RUNNER_NAME" | grep -oE '^[a-zA-Z0-9_-]+$')" ]; then + echo "Bad runner name, please ensure runner name is [a-zA-Z0-9_-]." + exit 1 + fi + echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV + + # input.target_devices is not directly used, as this allows code injection + case "$TARGET_DEVICE" in + level_zero:*) ;; + *) + echo "#" + echo "# WARNING: Only level_zero backend is fully supported." + echo "# This workflow is not guaranteed to work with other backends." 
+ echo "#" ;; + esac + echo "ONEAPI_DEVICE_SELECTOR=$TARGET_DEVICE" >> $GITHUB_ENV + + - name: Compute CPU core range to run benchmarks on + shell: bash + run: | + # Compute the core range for the first NUMA node; second node is used by + # UMF. Skip the first 4 cores as the kernel is likely to schedule more + # work on these. + CORES="$(lscpu | awk ' + /NUMA node0 CPU|On-line CPU/ {line=$0} + END { + split(line, a, " ") + split(a[4], b, ",") + sub(/^0/, "4", b[1]) + print b[1] + }')" + echo "CPU core range to use: $CORES" + echo "CORES=$CORES" >> $GITHUB_ENV + + ZE_AFFINITY_MASK=0 + echo "ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK" >> $GITHUB_ENV + - name: Checkout results repo + shell: bash + run: | + git clone -b unify-ci https://github.com/intel/llvm-ci-perf-results + - name: Run compute-benchmarks + env: + BUILD_HASH: ${{ inputs.build_hash }} + shell: bash + run: | + # TODO generate summary + display helpful message here + export CMPLR_ROOT=./toolchain + echo "-----" + sycl-ls + echo "-----" + pip install --user --break-system-packages -r ./devops/scripts/benchmarks/requirements.txt + echo "-----" + mkdir -p "./llvm-ci-perf-results/$RUNNER_NAME" + + # TODO accomodate for different GPUs and backends + SAVE_NAME="Baseline_PVC_L0" + if [ -n "$BUILD_HASH" ]; then + SAVE_NAME="Commit_PVC_$BUILD_HASH" + fi + + taskset -c "$CORES" ./devops/scripts/benchmarks/main.py \ + "$(realpath ./llvm_test_workdir)" \ + --sycl "$(realpath ./toolchain)" \ + --save "$SAVE_NAME" \ + --output-html remote \ + --results-dir "./llvm-ci-perf-results/$RUNNER_NAME" \ + --output-dir "./llvm-ci-perf-results/$RUNNER_NAME" \ + --preset Minimal + echo "-----" + - name: Push compute-benchmarks results + if: inputs.upload_results == 'true' && always() + shell: bash + run: | + # TODO redo configuration + # $(python ./devops/scripts/benchmarking/load_config.py ./devops constants) + + cd "./llvm-ci-perf-results" + git config user.name "SYCL Benchmarking Bot" + git config user.email "sys_sycl_benchmarks@intel.com" + git pull + git add . + # Make sure changes have been made + if git diff --quiet && git diff --cached --quiet; then + echo "No new results added, skipping push." + else + git commit -m "[GHA] Upload compute-benchmarks results from https://github.com/intel/llvm/actions/runs/${{ github.run_id }}" + git push "https://$GITHUB_TOKEN@github.com/intel/llvm-ci-perf-results.git" unify-ci + fi + diff --git a/unified-runtime/scripts/benchmarks/README.md b/devops/scripts/benchmarks/README.md similarity index 91% rename from unified-runtime/scripts/benchmarks/README.md rename to devops/scripts/benchmarks/README.md index 004fe14eca35b..fcadded3cad51 100644 --- a/unified-runtime/scripts/benchmarks/README.md +++ b/devops/scripts/benchmarks/README.md @@ -6,6 +6,8 @@ Scripts for running performance tests on SYCL and Unified Runtime. - [Velocity Bench](https://github.com/oneapi-src/Velocity-Bench) - [Compute Benchmarks](https://github.com/intel/compute-benchmarks/) +- [LlamaCpp Benchmarks](https://github.com/ggerganov/llama.cpp) +- [SYCL-Bench](https://github.com/unisa-hpc/sycl-bench) ## Running @@ -27,8 +29,6 @@ You can also include additional benchmark parameters, such as environment variab Once all the required information is entered, click the "Run workflow" button to initiate a new workflow run. This will execute the benchmarks and then post the results as a comment on the specified Pull Request. -By default, all benchmark runs are compared against `baseline`, which is a well-established set of the latest data. 
- You must be a member of the `oneapi-src` organization to access these features. ## Comparing results @@ -37,8 +37,8 @@ By default, the benchmark results are not stored. To store them, use the option You can compare benchmark results using `--compare` option. The comparison will be presented in a markdown output file (see below). If you want to calculate the relative performance of the new results against the previously saved data, use `--compare ` (i.e. `--compare baseline`). In case of comparing only stored data without generating new results, use `--dry-run --compare --compare --relative-perf `, where `name1` indicates the baseline for the relative performance calculation and `--dry-run` prevents the script for running benchmarks. Listing more than two `--compare` options results in displaying only execution time, without statistical analysis. -Baseline, as well as baseline-v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results -are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html). +Baseline_L0, as well as Baseline_L0v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results +are stored [here](https://oneapi-src.github.io/unified-runtime/performance/). ## Output formats You can display the results in the form of a HTML file by using `--ouptut-html` and a markdown file by using `--output-markdown`. Due to character limits for posting PR comments, the final content of the markdown file might be reduced. In order to obtain the full markdown output, use `--output-markdown full`. diff --git a/unified-runtime/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py similarity index 52% rename from unified-runtime/scripts/benchmarks/benches/base.py rename to devops/scripts/benchmarks/benches/base.py index d1bb5fb53b83a..4c2973d250e3d 100644 --- a/unified-runtime/scripts/benchmarks/benches/base.py +++ b/devops/scripts/benchmarks/benches/base.py @@ -1,16 +1,37 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from dataclasses import dataclass import os import shutil from pathlib import Path -from .result import Result +from utils.result import BenchmarkMetadata, BenchmarkTag, Result from options import options from utils.utils import download, run -import urllib.request -import tarfile + +benchmark_tags = [ + BenchmarkTag("SYCL", "Benchmark uses SYCL runtime"), + BenchmarkTag("UR", "Benchmark uses Unified Runtime API"), + BenchmarkTag("L0", "Benchmark uses Level Zero API directly"), + BenchmarkTag("UMF", "Benchmark uses Unified Memory Framework directly"), + BenchmarkTag("micro", "Microbenchmark focusing on a specific functionality"), + BenchmarkTag("application", "Real application-based performance test"), + BenchmarkTag("proxy", "Benchmark that simulates real application use-cases"), + BenchmarkTag("submit", "Tests kernel submission performance"), + BenchmarkTag("math", "Tests math computation performance"), + BenchmarkTag("memory", "Tests memory transfer or bandwidth performance"), + BenchmarkTag("allocation", "Tests memory allocation performance"), + BenchmarkTag("graph", "Tests graph-based execution performance"), + BenchmarkTag("latency", "Measures operation latency"), + BenchmarkTag("throughput", "Measures operation throughput"), + BenchmarkTag("inference", "Tests ML/AI inference performance"), + BenchmarkTag("image", "Image processing benchmark"), + BenchmarkTag("simulation", "Physics or scientific simulation benchmark"), +] + +benchmark_tags_dict = {tag.name: tag for tag in benchmark_tags} class Benchmark: @@ -55,19 +76,25 @@ def create_data_path(self, name, skip_data_dir=False): data_path = os.path.join(self.directory, name) else: data_path = os.path.join(self.directory, "data", name) - if options.rebuild and Path(data_path).exists(): + if options.redownload and Path(data_path).exists(): shutil.rmtree(data_path) Path(data_path).mkdir(parents=True, exist_ok=True) return data_path - def download(self, name, url, file, untar=False, unzip=False, skip_data_dir=False): + def download( + self, + name, + url, + file, + untar=False, + unzip=False, + skip_data_dir=False, + checksum="", + ): self.data_path = self.create_data_path(name, skip_data_dir) - return download(self.data_path, url, file, untar, unzip) - - def name(self): - raise NotImplementedError() + return download(self.data_path, url, file, untar, unzip, checksum) def lower_is_better(self): return True @@ -87,6 +114,30 @@ def stddev_threshold(self): def get_suite_name(self) -> str: return self.suite.name() + def name(self): + raise NotImplementedError() + + def description(self): + return "" + + def notes(self) -> str: + return None + + def unstable(self) -> str: + return None + + def get_tags(self) -> list[str]: + return [] + + def get_metadata(self) -> BenchmarkMetadata: + return BenchmarkMetadata( + type="benchmark", + description=self.description(), + notes=self.notes(), + unstable=self.unstable(), + tags=self.get_tags(), + ) + class Suite: def benchmarks(self) -> list[Benchmark]: @@ -97,3 +148,6 @@ def name(self) -> str: def setup(self): return + + def additionalMetadata(self) -> dict[str, BenchmarkMetadata]: + return {} diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py new file mode 100644 index 0000000000000..8dc7315af9e76 --- /dev/null +++ b/devops/scripts/benchmarks/benches/compute.py @@ -0,0 +1,704 @@ +# Copyright (C) 2024-2025 Intel Corporation +# Part of the Unified-Runtime Project, under the 
Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import csv +import io +from utils.utils import run, git_clone, create_build_path +from .base import Benchmark, Suite +from utils.result import BenchmarkMetadata, Result +from options import options +from enum import Enum + + +class RUNTIMES(Enum): + SYCL = "sycl" + LEVEL_ZERO = "l0" + UR = "ur" + + +def runtime_to_name(runtime: RUNTIMES) -> str: + return { + RUNTIMES.SYCL: "SYCL", + RUNTIMES.LEVEL_ZERO: "Level Zero", + RUNTIMES.UR: "Unified Runtime", + }[runtime] + + +def runtime_to_tag_name(runtime: RUNTIMES) -> str: + return { + RUNTIMES.SYCL: "SYCL", + RUNTIMES.LEVEL_ZERO: "L0", + RUNTIMES.UR: "UR", + }[runtime] + + +class ComputeBench(Suite): + def __init__(self, directory): + self.directory = directory + + def name(self) -> str: + return "Compute Benchmarks" + + def git_url(self) -> str: + return "https://github.com/intel/compute-benchmarks.git" + + def git_hash(self) -> str: + return "b5cc46acf61766ab00da04e85bd4da4f7591eb21" + + def setup(self): + if options.sycl is None: + return + + repo_path = git_clone( + self.directory, + "compute-benchmarks-repo", + self.git_url(), + self.git_hash(), + ) + build_path = create_build_path(self.directory, "compute-benchmarks-build") + + configure_command = [ + "cmake", + f"-B {build_path}", + f"-S {repo_path}", + f"-DCMAKE_BUILD_TYPE=Release", + f"-DBUILD_SYCL=ON", + f"-DSYCL_COMPILER_ROOT={options.sycl}", + f"-DALLOW_WARNINGS=ON", + ] + + if options.ur is not None: + configure_command += [ + f"-DBUILD_UR=ON", + f"-Dunified-runtime_DIR={options.ur}/lib/cmake/unified-runtime", + ] + + run(configure_command, add_sycl=True) + + run(f"cmake --build {build_path} -j {options.build_jobs}", add_sycl=True) + + self.built = True + + def additionalMetadata(self) -> dict[str, BenchmarkMetadata]: + # TODO: group metadata should be automatically generated based on the benchmarks... 
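+        # A single shared metadata object covers all four SubmitKernel group variants returned below.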
+ submit_kernel_metadata = BenchmarkMetadata( + type="group", + description="Measures CPU time overhead of submitting kernels through different APIs.", + notes="Each layer builds on top of the previous layer, adding functionality and overhead.\n" + "The first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\n" + "The UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\n" + "Work is ongoing to reduce the overhead of the SYCL API\n", + tags=["submit", "micro", "SYCL", "UR", "L0"], + ) + + return { + "SubmitKernel In Order": submit_kernel_metadata, + "SubmitKernel Out Of Order": submit_kernel_metadata, + "SubmitKernel In Order With Completion": submit_kernel_metadata, + "SubmitKernel Out Of Order With Completion": submit_kernel_metadata, + "SinKernelGraph": BenchmarkMetadata( + type="group", + unstable="This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + tags=["submit", "memory", "proxy", "SYCL", "UR", "L0", "graph"], + ), + "SubmitGraph": BenchmarkMetadata( + type="group", tags=["submit", "micro", "SYCL", "UR", "L0", "graph"] + ), + } + + def enabled_runtimes(self, supported_runtimes=None): + # all runtimes in the RUNTIMES enum + runtimes = supported_runtimes or list(RUNTIMES) + + # Filter out UR if not available + if options.ur is None: + runtimes = [r for r in runtimes if r != RUNTIMES.UR] + + return runtimes + + def benchmarks(self) -> list[Benchmark]: + if options.sycl is None: + return [] + + if options.ur_adapter == "cuda": + return [] + + benches = [] + + # Add SubmitKernel benchmarks using loops + for runtime in self.enabled_runtimes(): + for in_order_queue in [0, 1]: + for measure_completion in [0, 1]: + benches.append( + SubmitKernel(self, runtime, in_order_queue, measure_completion) + ) + + # Add SinKernelGraph benchmarks + for runtime in self.enabled_runtimes(): + for with_graphs in [0, 1]: + for num_kernels in [5, 100]: + benches.append( + GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels) + ) + + # Add ULLS benchmarks + for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]): + benches.append(UllsEmptyKernel(self, runtime, 1000, 256)) + benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1)) + + # Add GraphApiSubmitGraph benchmarks + for runtime in self.enabled_runtimes([RUNTIMES.SYCL]): + for in_order_queue in [0, 1]: + for num_kernels in [4, 10, 32]: + for measure_completion_time in [0, 1]: + benches.append( + GraphApiSubmitGraph( + self, + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + ) + ) + + # Add other benchmarks + benches += [ + QueueInOrderMemcpy(self, 0, "Device", "Device", 1024), + QueueInOrderMemcpy(self, 0, "Host", "Device", 1024), + QueueMemcpy(self, "Device", "Device", 1024), + StreamMemory(self, "Triad", 10 * 1024, "Device"), + ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024), + ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024), + VectorSum(self), + ] + + # Add UR-specific benchmarks + if options.ur is not None: + benches += [ + MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1), + MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1), + MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0), + ] + + return benches + + +def parse_unit_type(compute_unit): + if "[count]" in compute_unit: + return "instr" + elif "[us]" in compute_unit: + return "μs" + return compute_unit.replace("[", "").replace("]", "") + + +class ComputeBenchmark(Benchmark): + def 
__init__(self, bench, name, test): + super().__init__(bench.directory, bench) + self.bench = bench + self.bench_name = name + self.test = test + + def bin_args(self) -> list[str]: + return [] + + def extra_env_vars(self) -> dict: + return {} + + def setup(self): + self.benchmark_bin = os.path.join( + self.bench.directory, "compute-benchmarks-build", "bin", self.bench_name + ) + + def explicit_group(self): + return "" + + def description(self) -> str: + return "" + + def run(self, env_vars) -> list[Result]: + command = [ + f"{self.benchmark_bin}", + f"--test={self.test}", + "--csv", + "--noHeaders", + ] + + command += self.bin_args() + env_vars.update(self.extra_env_vars()) + + result = self.run_bench(command, env_vars) + parsed_results = self.parse_output(result) + ret = [] + for label, median, stddev, unit in parsed_results: + extra_label = " CPU count" if parse_unit_type(unit) == "instr" else "" + explicit_group = ( + self.explicit_group() + extra_label + if self.explicit_group() != "" + else "" + ) + ret.append( + Result( + label=self.name() + extra_label, + explicit_group=explicit_group, + value=median, + stddev=stddev, + command=command, + env=env_vars, + stdout=result, + unit=parse_unit_type(unit), + git_url=self.bench.git_url(), + git_hash=self.bench.git_hash(), + ) + ) + return ret + + def parse_output(self, output): + csv_file = io.StringIO(output) + reader = csv.reader(csv_file) + next(reader, None) + results = [] + while True: + data_row = next(reader, None) + if data_row is None: + break + try: + label = data_row[0] + mean = float(data_row[1]) + median = float(data_row[2]) + # compute benchmarks report stddev as % + stddev = mean * (float(data_row[3].strip("%")) / 100.0) + unit = data_row[7] + results.append((label, median, stddev, unit)) + except (ValueError, IndexError) as e: + raise ValueError(f"Error parsing output: {e}") + if len(results) == 0: + raise ValueError("Benchmark output does not contain data.") + return results + + def teardown(self): + return + + +class SubmitKernel(ComputeBenchmark): + def __init__(self, bench, runtime: RUNTIMES, ioq, measure_completion=0): + self.ioq = ioq + self.runtime = runtime + self.measure_completion = measure_completion + super().__init__( + bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel" + ) + + def get_tags(self): + return ["submit", "latency", runtime_to_tag_name(self.runtime), "micro"] + + def name(self): + order = "in order" if self.ioq else "out of order" + completion_str = " with measure completion" if self.measure_completion else "" + return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}" + + def explicit_group(self): + order = "In Order" if self.ioq else "Out Of Order" + completion_str = " With Completion" if self.measure_completion else "" + return f"SubmitKernel {order}{completion_str}" + + def description(self) -> str: + order = "in-order" if self.ioq else "out-of-order" + runtime_name = runtime_to_name(self.runtime) + + completion_desc = "" + if self.runtime == RUNTIMES.UR: + completion_desc = f", {'including' if self.measure_completion else 'excluding'} kernel completion time" + + l0_specific = "" + if self.runtime == RUNTIMES.LEVEL_ZERO: + l0_specific = " Uses immediate command lists" + + return ( + f"Measures CPU time overhead of submitting {order} kernels through {runtime_name} API{completion_desc}. " + f"Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. 
{l0_specific}" + ) + + def bin_args(self) -> list[str]: + return [ + f"--Ioq={self.ioq}", + "--DiscardEvents=0", + f"--MeasureCompletion={self.measure_completion}", + "--iterations=100000", + "--Profiling=0", + "--NumKernels=10", + "--KernelExecTime=1", + ] + + +class ExecImmediateCopyQueue(ComputeBenchmark): + def __init__(self, bench, ioq, isCopyOnly, source, destination, size): + self.ioq = ioq + self.isCopyOnly = isCopyOnly + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}" + + def description(self) -> str: + order = "in-order" if self.ioq else "out-of-order" + operation = "copy-only" if self.isCopyOnly else "copy and command submission" + return ( + f"Measures SYCL {order} queue overhead for {operation} from {self.source} to " + f"{self.destination} memory with {self.size} bytes. Tests immediate execution overheads." + ) + + def get_tags(self): + return ["memory", "submit", "latency", "SYCL", "micro"] + + def bin_args(self) -> list[str]: + return [ + "--iterations=100000", + f"--ioq={self.ioq}", + f"--IsCopyOnly={self.isCopyOnly}", + "--MeasureCompletionTime=0", + f"--src={self.destination}", + f"--dst={self.destination}", + f"--size={self.size}", + ] + + +class QueueInOrderMemcpy(ComputeBenchmark): + def __init__(self, bench, isCopyOnly, source, destination, size): + self.isCopyOnly = isCopyOnly + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy") + + def name(self): + return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}" + + def description(self) -> str: + operation = "copy-only" if self.isCopyOnly else "copy and command submission" + return ( + f"Measures SYCL in-order queue memory copy performance for {operation} from " + f"{self.source} to {self.destination} with {self.size} bytes, executed 100 times per iteration." + ) + + def get_tags(self): + return ["memory", "latency", "SYCL", "micro"] + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--IsCopyOnly={self.isCopyOnly}", + f"--sourcePlacement={self.source}", + f"--destinationPlacement={self.destination}", + f"--size={self.size}", + "--count=100", + ] + + +class QueueMemcpy(ComputeBenchmark): + def __init__(self, bench, source, destination, size): + self.source = source + self.destination = destination + self.size = size + super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy") + + def name(self): + return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}" + + def description(self) -> str: + return ( + f"Measures general SYCL queue memory copy performance from {self.source} to " + f"{self.destination} with {self.size} bytes per operation." 
+ ) + + def get_tags(self): + return ["memory", "latency", "SYCL", "micro"] + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--sourcePlacement={self.source}", + f"--destinationPlacement={self.destination}", + f"--size={self.size}", + ] + + +class StreamMemory(ComputeBenchmark): + def __init__(self, bench, type, size, placement): + self.type = type + self.size = size + self.placement = placement + super().__init__(bench, "memory_benchmark_sycl", "StreamMemory") + + def name(self): + return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}" + + def description(self) -> str: + return ( + f"Measures {self.placement} memory bandwidth using {self.type} pattern with " + f"{self.size} bytes. Higher values (GB/s) indicate better performance." + ) + + # measurement is in GB/s + def lower_is_better(self): + return False + + def get_tags(self): + return ["memory", "throughput", "SYCL", "micro"] + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--type={self.type}", + f"--size={self.size}", + f"--memoryPlacement={self.placement}", + "--useEvents=0", + "--contents=Zeros", + "--multiplier=1", + "--vectorSize=1", + ] + + +class VectorSum(ComputeBenchmark): + def __init__(self, bench): + super().__init__(bench, "miscellaneous_benchmark_sycl", "VectorSum") + + def name(self): + return f"miscellaneous_benchmark_sycl VectorSum" + + def description(self) -> str: + return ( + "Measures performance of vector addition across 3D grid (512x256x256 elements) " + "using SYCL." + ) + + def get_tags(self): + return ["math", "throughput", "SYCL", "micro"] + + def bin_args(self) -> list[str]: + return [ + "--iterations=1000", + "--numberOfElementsX=512", + "--numberOfElementsY=256", + "--numberOfElementsZ=256", + ] + + +class MemcpyExecute(ComputeBenchmark): + def __init__( + self, + bench, + numOpsPerThread, + numThreads, + allocSize, + iterations, + srcUSM, + dstUSM, + useEvent, + ): + self.numOpsPerThread = numOpsPerThread + self.numThreads = numThreads + self.allocSize = allocSize + self.iterations = iterations + self.srcUSM = srcUSM + self.dstUSM = dstUSM + self.useEvents = useEvent + super().__init__(bench, "multithread_benchmark_ur", "MemcpyExecute") + + def name(self): + return ( + f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}" + + (" without events" if not self.useEvents else "") + ) + + def description(self) -> str: + src_type = "device" if self.srcUSM == 1 else "host" + dst_type = "device" if self.dstUSM == 1 else "host" + events = "with" if self.useEvents else "without" + return ( + f"Measures multithreaded memory copy performance with {self.numThreads} threads " + f"each performing {self.numOpsPerThread} operations on {self.allocSize} bytes " + f"from {src_type} to {dst_type} memory {events} events." 
+ ) + + def get_tags(self): + return ["memory", "latency", "UR", "micro"] + + def bin_args(self) -> list[str]: + return [ + "--Ioq=1", + f"--UseEvents={self.useEvents}", + "--MeasureCompletion=1", + "--UseQueuePerThread=1", + f"--AllocSize={self.allocSize}", + f"--NumThreads={self.numThreads}", + f"--NumOpsPerThread={self.numOpsPerThread}", + f"--iterations={self.iterations}", + f"--SrcUSM={self.srcUSM}", + f"--DstUSM={self.dstUSM}", + ] + + +class GraphApiSinKernelGraph(ComputeBenchmark): + def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels): + self.withGraphs = withGraphs + self.numKernels = numKernels + self.runtime = runtime + super().__init__( + bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph" + ) + + def explicit_group(self): + return f"SinKernelGraph {self.numKernels}" + + def description(self) -> str: + execution = "using graphs" if self.withGraphs else "without graphs" + return ( + f"Measures {self.runtime.value.upper()} performance when executing {self.numKernels} " + f"sin kernels {execution}. Tests overhead and benefits of graph-based execution." + ) + + def name(self): + return f"graph_api_benchmark_{self.runtime.value} SinKernelGraph graphs:{self.withGraphs}, numKernels:{self.numKernels}" + + def unstable(self) -> str: + return "This benchmark combines both eager and graph execution, and may not be representative of real use cases." + + def get_tags(self): + return [ + "graph", + runtime_to_tag_name(self.runtime), + "proxy", + "submit", + "memory", + "latency", + ] + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--numKernels={self.numKernels}", + f"--withGraphs={self.withGraphs}", + "--withCopyOffload=1", + "--immediateAppendCmdList=0", + ] + + +class GraphApiSubmitGraph(ComputeBenchmark): + def __init__( + self, bench, runtime: RUNTIMES, inOrderQueue, numKernels, measureCompletionTime + ): + self.inOrderQueue = inOrderQueue + self.numKernels = numKernels + self.runtime = runtime + self.measureCompletionTime = measureCompletionTime + super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph") + + def explicit_group(self): + return f"SubmitGraph {self.numKernels}" + + def description(self) -> str: + return ( + f"Measures {self.runtime.value.upper()} performance when executing {self.numKernels} " + f"trivial kernels using graphs. Tests overhead and benefits of graph-based execution." 
+ ) + + def name(self): + return f"graph_api_benchmark_{self.runtime.value} SubmitGraph numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}" + + def get_tags(self): + return [ + "graph", + runtime_to_tag_name(self.runtime), + "micro", + "submit", + "latency", + ] + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--NumKernels={self.numKernels}", + f"--MeasureCompletionTime={self.measureCompletionTime}", + f"--InOrderQueue={self.inOrderQueue}", + "--Profiling=0", + "--KernelExecutionTime=1", + ] + + +class UllsEmptyKernel(ComputeBenchmark): + def __init__(self, bench, runtime: RUNTIMES, wgc, wgs): + self.wgc = wgc + self.wgs = wgs + self.runtime = runtime + super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel") + + def explicit_group(self): + return f"EmptyKernel {self.wgc} {self.wgs}" + + def description(self) -> str: + return "" + + def name(self): + return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}" + + def get_tags(self): + return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"] + + def bin_args(self) -> list[str]: + return [ + "--iterations=10000", + f"--wgs={self.wgs}", + f"--wgc={self.wgs}", + ] + + +class UllsKernelSwitch(ComputeBenchmark): + def __init__( + self, + bench, + runtime: RUNTIMES, + count, + kernelTime, + barrier, + hostVisible, + ioq, + ctrBasedEvents, + ): + self.count = count + self.kernelTime = kernelTime + self.barrier = barrier + self.hostVisible = hostVisible + self.ctrBasedEvents = ctrBasedEvents + self.runtime = runtime + self.ioq = ioq + super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch") + + def explicit_group(self): + return f"KernelSwitch {self.count} {self.kernelTime}" + + def description(self) -> str: + return "" + + def name(self): + return f"ulls_benchmark_{self.runtime.value} KernelSwitch count {self.count} kernelTime {self.kernelTime}" + + def get_tags(self): + return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"] + + def bin_args(self) -> list[str]: + return [ + "--iterations=1000", + f"--count={self.count}", + f"--kernelTime={self.kernelTime}", + f"--barrier={self.barrier}", + f"--hostVisible={self.hostVisible}", + f"--ioq={self.ioq}", + f"--ctrBasedEvents={self.ctrBasedEvents}", + ] diff --git a/unified-runtime/scripts/benchmarks/benches/llamacpp.py b/devops/scripts/benchmarks/benches/llamacpp.py similarity index 77% rename from unified-runtime/scripts/benchmarks/benches/llamacpp.py rename to devops/scripts/benchmarks/benches/llamacpp.py index 6524c95a9f56f..86d41ed525292 100644 --- a/unified-runtime/scripts/benchmarks/benches/llamacpp.py +++ b/devops/scripts/benchmarks/benches/llamacpp.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -8,10 +8,10 @@ from pathlib import Path from utils.utils import download, git_clone from .base import Benchmark, Suite -from .result import Result +from utils.result import Result from utils.utils import run, create_build_path from options import options -from .oneapi import get_oneapi +from utils.oneapi import get_oneapi import os @@ -25,6 +25,12 @@ def __init__(self, directory): def name(self) -> str: return "llama.cpp bench" + def git_url(self) -> str: + return "https://github.com/ggerganov/llama.cpp" + + def git_hash(self) -> str: + return "1ee9eea094fe5846c7d8d770aa7caa749d246b23" + def setup(self): if options.sycl is None: return @@ -32,8 +38,8 @@ def setup(self): repo_path = git_clone( self.directory, "llamacpp-repo", - "https://github.com/ggerganov/llama.cpp", - "1ee9eea094fe5846c7d8d770aa7caa749d246b23", + self.git_url(), + self.git_hash(), ) self.models_dir = os.path.join(self.directory, "models") @@ -43,6 +49,7 @@ def setup(self): self.models_dir, "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf", "Phi-3-mini-4k-instruct-q4.gguf", + checksum="fc4f45c9729874a33a527465b2ec78189a18e5726b7121182623feeae38632ace4f280617b01d4a04875acf49d263ee4", ) self.oneapi = get_oneapi() @@ -62,11 +69,11 @@ def setup(self): f'-DCMAKE_CXX_FLAGS=-I"{self.oneapi.mkl_include()}"', f"-DCMAKE_SHARED_LINKER_FLAGS=-L{self.oneapi.compiler_lib()} -L{self.oneapi.mkl_lib()}", ] - print(f"{self.__class__.__name__}: Run {configure_command}") + run(configure_command, add_sycl=True) - print(f"{self.__class__.__name__}: Run cmake --build {self.build_path} -j") + run( - f"cmake --build {self.build_path} -j", + f"cmake --build {self.build_path} -j {options.build_jobs}", add_sycl=True, ld_library=self.oneapi.ld_libraries(), ) @@ -92,6 +99,17 @@ def setup(self): def name(self): return f"llama.cpp" + def description(self) -> str: + return ( + "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. " + "Runs both prompt processing (initial context processing) and text generation benchmarks with " + "different batch sizes. Higher values indicate better performance. Uses the Phi-3-mini-4k-instruct " + "quantized model and leverages SYCL with oneDNN for acceleration." + ) + + def get_tags(self): + return ["SYCL", "application", "inference", "throughput"] + def lower_is_better(self): return False @@ -130,6 +148,8 @@ def run(self, env_vars) -> list[Result]: env=env_vars, stdout=result, unit="token/s", + git_url=self.bench.git_url(), + git_hash=self.bench.git_hash(), ) ) return results diff --git a/unified-runtime/scripts/benchmarks/benches/syclbench.py b/devops/scripts/benchmarks/benches/syclbench.py similarity index 86% rename from unified-runtime/scripts/benchmarks/benches/syclbench.py rename to devops/scripts/benchmarks/benches/syclbench.py index f7cf571a7ecd7..9854c92d338fc 100644 --- a/unified-runtime/scripts/benchmarks/benches/syclbench.py +++ b/devops/scripts/benchmarks/benches/syclbench.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -8,7 +8,7 @@ import io from utils.utils import run, git_clone, create_build_path from .base import Benchmark, Suite -from .result import Result +from utils.result import Result from options import options @@ -23,6 +23,12 @@ def __init__(self, directory): def name(self) -> str: return "SYCL-Bench" + def git_url(self) -> str: + return "https://github.com/unisa-hpc/sycl-bench.git" + + def git_hash(self) -> str: + return "31fc70be6266193c4ba60eb1fe3ce26edee4ca5b" + def setup(self): if options.sycl is None: return @@ -31,8 +37,8 @@ def setup(self): repo_path = git_clone( self.directory, "sycl-bench-repo", - "https://github.com/mateuszpn/sycl-bench.git", - "1e6ab2cfd004a72c5336c26945965017e06eab71", + self.git_url(), + self.git_hash(), ) configure_command = [ @@ -51,7 +57,7 @@ def setup(self): ] run(configure_command, add_sycl=True) - run(f"cmake --build {build_path} -j", add_sycl=True) + run(f"cmake --build {build_path} -j {options.build_jobs}", add_sycl=True) self.built = True @@ -65,14 +71,14 @@ def benchmarks(self) -> list[Benchmark]: DagTaskS(self), HostDevBandwidth(self), LocalMem(self), - Pattern_L2(self), - Reduction(self), + # Pattern_L2(self), # validation failure + # Reduction(self), # validation failure ScalarProd(self), SegmentReduction(self), - UsmAccLatency(self), + # UsmAccLatency(self), # validation failure UsmAllocLatency(self), - UsmInstrMix(self), - UsmPinnedOverhead(self), + # UsmInstrMix(self), # validation failure + # UsmPinnedOverhead(self), # validation failure VecAdd(self), # *** sycl-bench single benchmarks # TwoDConvolution(self), # run time < 1ms @@ -82,20 +88,20 @@ def benchmarks(self) -> list[Benchmark]: Atax(self), # Atomic_reduction(self), # run time < 1ms Bicg(self), - Correlation(self), - Covariance(self), - Gemm(self), - Gesumv(self), - Gramschmidt(self), + # Correlation(self), # validation failure + # Covariance(self), # validation failure + # Gemm(self), # validation failure + # Gesumv(self), # validation failure + # Gramschmidt(self), # validation failure KMeans(self), LinRegCoeff(self), # LinRegError(self), # run time < 1ms - MatmulChain(self), + # MatmulChain(self), # validation failure MolDyn(self), - Mvt(self), + # Mvt(self), # validation failure Sf(self), - Syr2k(self), - Syrk(self), + # Syr2k(self), # validation failure + # Syrk(self), # validation failure ] @@ -105,7 +111,6 @@ def __init__(self, bench, name, test): self.bench = bench self.bench_name = name self.test = test - self.done = False def bin_args(self) -> list[str]: return [] @@ -113,16 +118,26 @@ def bin_args(self) -> list[str]: def extra_env_vars(self) -> dict: return {} + def get_tags(self): + base_tags = ["SYCL", "micro"] + if "Memory" in self.bench_name or "mem" in self.bench_name.lower(): + base_tags.append("memory") + if "Reduction" in self.bench_name: + base_tags.append("math") + if "Bandwidth" in self.bench_name: + base_tags.append("throughput") + if "Latency" in self.bench_name: + base_tags.append("latency") + return base_tags + def setup(self): self.benchmark_bin = os.path.join( self.directory, "sycl-bench-build", self.bench_name ) def run(self, env_vars) -> list[Result]: - if self.done: - return self.outputfile = os.path.join(self.bench.directory, self.test + ".csv") - print(f"{self.__class__.__name__}: Results in {self.outputfile}") + command = [ f"{self.benchmark_bin}", f"--warmup-run", @@ -143,25 +158,27 @@ def run(self, env_vars) -> list[Result]: if not row[0].startswith("#"): res_list.append( 
Result( - label=row[0], + label=f"{self.name()} {row[0]}", value=float(row[12]) * 1000, # convert to ms passed=(row[1] == "PASS"), command=command, env=env_vars, stdout=row, unit="ms", + git_url=self.bench.git_url(), + git_hash=self.bench.git_hash(), ) ) - self.done = True - return res_list - def teardown(self): - print(f"Removing {self.outputfile}...") os.remove(self.outputfile) - return + + return res_list def name(self): - return self.test + return f"{self.bench.name()} {self.test}" + + def teardown(self): + return # multi benchmarks diff --git a/devops/scripts/benchmarks/benches/test.py b/devops/scripts/benchmarks/benches/test.py new file mode 100644 index 0000000000000..ad1e8c9e57735 --- /dev/null +++ b/devops/scripts/benchmarks/benches/test.py @@ -0,0 +1,106 @@ +# Copyright (C) 2024-2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import random +from utils.utils import git_clone +from .base import Benchmark, Suite +from utils.result import BenchmarkMetadata, Result +from utils.utils import run, create_build_path +from options import options +import os + + +class TestSuite(Suite): + def __init__(self): + return + + def setup(self): + return + + def name(self) -> str: + return "Test Suite" + + def benchmarks(self) -> list[Benchmark]: + bench_configs = [ + ("Memory Bandwidth", 2000, 200, "Foo Group", None, None), + ("Latency", 100, 20, "Bar Group", "A Latency test note!", None), + ("Throughput", 1500, 150, "Foo Group", None, None), + ("FLOPS", 3000, 300, "Foo Group", None, "Unstable FLOPS test!"), + ("Cache Miss Rate", 250, 25, "Bar Group", "Test Note", "And another note!"), + ] + + result = [] + for base_name, base_value, base_diff, group, notes, unstable in bench_configs: + for variant in range(6): + value_multiplier = 1.0 + (variant * 0.2) + name = f"{base_name} {variant+1}" + value = base_value * value_multiplier + diff = base_diff * value_multiplier + + result.append( + TestBench(self, name, value, diff, group, notes, unstable) + ) + + return result + + def additionalMetadata(self) -> dict[str, BenchmarkMetadata]: + return { + "Foo Group": BenchmarkMetadata( + type="group", + description="This is a test benchmark for Foo Group.", + notes="This is a test note for Foo Group.\n" "Look, multiple lines!", + ), + "Bar Group": BenchmarkMetadata( + type="group", + description="This is a test benchmark for Bar Group.", + unstable="This is an unstable note for Bar Group.", + ), + } + + +class TestBench(Benchmark): + def __init__(self, suite, name, value, diff, group="", notes=None, unstable=None): + super().__init__("", suite) + self.bname = name + self.value = value + self.diff = diff + self.group = group + self.notes_text = notes + self.unstable_text = unstable + + def name(self): + return self.bname + + def lower_is_better(self): + return True + + def setup(self): + return + + def description(self) -> str: + return f"This is a test benchmark for {self.bname}." 
+ + def notes(self) -> str: + return self.notes_text + + def unstable(self) -> str: + return self.unstable_text + + def run(self, env_vars) -> list[Result]: + random_value = self.value + random.uniform(-1 * (self.diff), self.diff) + return [ + Result( + label=self.name(), + explicit_group=self.group, + value=random_value, + command=["test", "--arg1", "foo"], + env={"A": "B"}, + stdout="no output", + unit="ms", + ) + ] + + def teardown(self): + return diff --git a/unified-runtime/scripts/benchmarks/benches/umf.py b/devops/scripts/benchmarks/benches/umf.py similarity index 55% rename from unified-runtime/scripts/benchmarks/benches/umf.py rename to devops/scripts/benchmarks/benches/umf.py index c7b767f02bbe1..1b7726b4db819 100644 --- a/unified-runtime/scripts/benchmarks/benches/umf.py +++ b/devops/scripts/benchmarks/benches/umf.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -6,13 +6,14 @@ import random from utils.utils import git_clone from .base import Benchmark, Suite -from .result import Result +from utils.result import Result from utils.utils import run, create_build_path from options import options -from .oneapi import get_oneapi +from utils.oneapi import get_oneapi import os import csv import io +import re def isUMFAvailable(): @@ -22,8 +23,6 @@ def isUMFAvailable(): class UMFSuite(Suite): def __init__(self, directory): self.directory = directory - if not isUMFAvailable(): - print("UMF not provided. Related benchmarks will not run") def name(self) -> str: return "UMF" @@ -47,87 +46,36 @@ def benchmarks(self) -> list[Benchmark]: return benches -class ComputeUMFBenchmark(Benchmark): - def __init__(self, bench, name): +class GBench(Benchmark): + def __init__(self, bench): super().__init__(bench.directory, bench) self.bench = bench - self.bench_name = name + self.bench_name = "umf-benchmark" self.oneapi = get_oneapi() + self.umf_lib = options.umf + "lib" - self.col_name = None - self.col_iterations = None - self.col_real_time = None - self.col_cpu_time = None - self.col_time_unit = None + self.fragmentation_prefix = "FRAGMENTATION_" - self.col_statistics_time = None + self.num_cols_with_memory = 13 - def bin_args(self) -> list[str]: - return [] - - def extra_env_vars(self) -> dict: - return {} - - def setup(self): - if not isUMFAvailable(): - print("UMF prefix path not provided") - return - - self.benchmark_bin = os.path.join(options.umf, "benchmark", self.bench_name) - - def run(self, env_vars) -> list[Result]: - command = [ - f"{self.benchmark_bin}", - ] - - command += self.bin_args() - env_vars.update(self.extra_env_vars()) - - result = self.run_bench( - command, env_vars, add_sycl=False, ld_library=[self.oneapi.tbb_lib()] - ) - parsed = self.parse_output(result) - results = [] - for r in parsed: - (config, pool, mean) = r - label = f"{config} {pool}" - results.append( - Result( - label=label, - value=mean, - command=command, - env=env_vars, - stdout=result, - unit="ns", - explicit_group=config, - ) - ) - return results - - # Implementation with self.col_* indices could lead to the division by None - def get_mean(self, datarow): - raise NotImplementedError() - - def teardown(self): - return - - -class GBench(ComputeUMFBenchmark): - def __init__(self, bench): - super().__init__(bench, "umf-benchmark") - - self.col_name = 0 - self.col_iterations = 1 - 
self.col_real_time = 2 - self.col_cpu_time = 3 - self.col_time_unit = 4 + self.col_name = "name" + self.col_iterations = "iterations" + self.col_real_time = "real_time" + self.col_cpu_time = "cpu_time" + self.col_time_unit = "time_unit" + self.col_memory_overhead = "memory_overhead" self.idx_pool = 0 self.idx_config = 1 self.name_separator = "/" self.col_statistics_time = self.col_real_time + self.col_statistics_memory = self.col_memory_overhead + + self.is_preloaded = False + + self.lib_to_be_replaced = None def name(self): return self.bench_name @@ -138,17 +86,23 @@ def name(self): def bin_args(self): return ["--benchmark_format=csv"] - # the default unit - # might be changed globally with --benchmark_time_unit={ns|us|ms|s} - # the change affects only benchmark where time unit has not been set - # explicitly - def unit(self): - return "ns" - # these benchmarks are not stable, so set this at a large value def stddev_threshold(self) -> float: return 0.2 # 20% + def extra_env_vars(self) -> dict: + return {} + + def setup(self): + if not isUMFAvailable(): + print("UMF prefix path not provided") + return + + self.benchmark_bin = os.path.join(options.umf, "benchmark", self.bench_name) + + def is_memory_statistics_included(self, data_row): + return len(data_row) == self.num_cols_with_memory + def get_pool_and_config(self, full_name): list_split = full_name.split(self.name_separator, 1) if len(list_split) != 2: @@ -156,71 +110,115 @@ def get_pool_and_config(self, full_name): return list_split[self.idx_pool], list_split[self.idx_config] - def get_mean(self, datarow): + def get_mean_time(self, datarow): return float(datarow[self.col_statistics_time]) - def parse_output(self, output): - csv_file = io.StringIO(output) - reader = csv.reader(csv_file) + def get_memory_overhead(self, datarow): + return float(datarow[self.col_statistics_memory]) - data_row = next(reader, None) - if data_row is None: - raise ValueError("Benchmark output does not contain data.") + def get_unit_time_or_overhead(self, config): + if re.search(f"^{self.fragmentation_prefix}", config): + return "%" - results = [] - for row in reader: - try: - full_name = row[self.col_name] - pool, config = self.get_pool_and_config(full_name) - mean = self.get_mean(row) - results.append((config, pool, mean)) - except KeyError as e: - raise ValueError(f"Error parsing output: {e}") + # the default time unit + # might be changed globally with --benchmark_time_unit={ns|us|ms|s} + # the change affects only benchmark where time unit has not been set + # explicitly + return "ns" - return results + def get_names_of_benchmarks_to_be_run(self, command, env_vars): + list_all_command = command + ["--benchmark_list_tests"] + if self.is_preloaded: + list_all_command += ["--benchmark_filter=" + self.lib_to_be_replaced] -class GBenchPreloaded(GBench): - def __init__(self, bench, lib_to_be_replaced, replacing_lib): - super().__init__(bench) + all_names = self.run_bench( + list_all_command, env_vars, add_sycl=False, ld_library=[self.umf_lib] + ).splitlines() - self.lib_to_be_replaced = lib_to_be_replaced - self.replacing_lib = replacing_lib + return all_names - def bin_args(self): - full_args = super().bin_args() - full_args.append(f"--benchmark_filter={self.lib_to_be_replaced}") + def run(self, env_vars) -> list[Result]: + command = [f"{self.benchmark_bin}"] - return full_args + all_names = self.get_names_of_benchmarks_to_be_run(command, env_vars) - def get_preloaded_name(self, pool_name) -> str: - new_pool_name = pool_name.replace(self.lib_to_be_replaced, 
self.replacing_lib) + command += self.bin_args() + env_vars.update(self.extra_env_vars()) - return new_pool_name + results = [] + + for name in all_names: + specific_benchmark = command + ["--benchmark_filter=^" + name + "$"] + + result = self.run_bench( + specific_benchmark, env_vars, add_sycl=False, ld_library=[self.umf_lib] + ) + + parsed = self.parse_output(result) + for r in parsed: + (explicit_group, pool, value) = r + label = f"{explicit_group} {pool}" + results.append( + Result( + label=label, + value=value, + command=command, + env=env_vars, + stdout=result, + unit=self.get_unit_time_or_overhead(explicit_group), + explicit_group=explicit_group, + ) + ) + + return results def parse_output(self, output): csv_file = io.StringIO(output) - reader = csv.reader(csv_file) - - data_row = next(reader, None) - if data_row is None: - raise ValueError("Benchmark output does not contain data.") + reader = csv.DictReader(csv_file) results = [] + for row in reader: try: full_name = row[self.col_name] pool, config = self.get_pool_and_config(full_name) - mean = self.get_mean(row) - updated_pool = self.get_preloaded_name(pool) - updated_config = self.get_preloaded_name(config) + statistics_time = self.get_mean_time(row) + + if self.is_preloaded: + pool = self.get_preloaded_pool_name(pool) + + results.append((config, pool, statistics_time)) + + if self.is_memory_statistics_included(row): + statistics_overhead = self.get_memory_overhead(row) + config = self.fragmentation_prefix + config + + results.append((config, pool, statistics_overhead)) - results.append((updated_config, updated_pool, mean)) except KeyError as e: raise ValueError(f"Error parsing output: {e}") return results + def teardown(self): + return + + +class GBenchPreloaded(GBench): + def __init__(self, bench, lib_to_be_replaced, replacing_lib): + super().__init__(bench) + + self.is_preloaded = True + + self.lib_to_be_replaced = lib_to_be_replaced + self.replacing_lib = replacing_lib + + def get_preloaded_pool_name(self, pool_name) -> str: + new_pool_name = pool_name.replace(self.lib_to_be_replaced, self.replacing_lib) + + return new_pool_name + class GBenchGlibc(GBenchPreloaded): def __init__(self, bench, replacing_lib): diff --git a/unified-runtime/scripts/benchmarks/benches/velocity.py b/devops/scripts/benchmarks/benches/velocity.py similarity index 74% rename from unified-runtime/scripts/benchmarks/benches/velocity.py rename to devops/scripts/benchmarks/benches/velocity.py index b7d06cbe4a3a2..493298dea8b10 100644 --- a/unified-runtime/scripts/benchmarks/benches/velocity.py +++ b/devops/scripts/benchmarks/benches/velocity.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -7,10 +7,10 @@ import shutil from utils.utils import git_clone from .base import Benchmark, Suite -from .result import Result +from utils.result import Result from utils.utils import run, create_build_path from options import options -from .oneapi import get_oneapi +from utils.oneapi import get_oneapi import shutil import os @@ -26,6 +26,12 @@ def __init__(self, directory): def name(self) -> str: return "Velocity Bench" + def git_url(self) -> str: + return "https://github.com/oneapi-src/Velocity-Bench/" + + def git_hash(self) -> str: + return "b22215c16f789100449c34bf4eaa3fb178983d69" + def setup(self): if options.sycl is None: return @@ -33,8 +39,8 @@ def setup(self): self.repo_path = git_clone( self.directory, "velocity-bench-repo", - "https://github.com/oneapi-src/Velocity-Bench/", - "b22215c16f789100449c34bf4eaa3fb178983d69", + self.git_url(), + self.git_hash(), ) def benchmarks(self) -> list[Benchmark]: @@ -101,7 +107,7 @@ def setup(self): run(configure_command, {"CC": "clang", "CXX": "clang++"}, add_sycl=True) run( - f"cmake --build {build_path} -j", + f"cmake --build {build_path} -j {options.build_jobs}", add_sycl=True, ld_library=self.ld_libraries(), ) @@ -115,6 +121,12 @@ def extra_env_vars(self) -> dict: def parse_output(self, stdout: str) -> float: raise NotImplementedError() + def description(self) -> str: + return "" + + def get_tags(self): + return ["SYCL", "application"] + def run(self, env_vars) -> list[Result]: env_vars.update(self.extra_env_vars()) @@ -133,6 +145,8 @@ def run(self, env_vars) -> list[Result]: env=env_vars, stdout=result, unit=self.unit, + git_url=self.vb.git_url(), + git_hash=self.vb.git_hash(), ) ] @@ -147,6 +161,12 @@ def __init__(self, vb: VelocityBench): def name(self): return "Velocity-Bench Hashtable" + def description(self) -> str: + return ( + "Measures hash table search performance using an efficient lock-free algorithm with linear probing. " + "Reports throughput in millions of keys processed per second. Higher values indicate better performance." + ) + def bin_args(self) -> list[str]: return ["--no-verify"] @@ -162,6 +182,9 @@ def parse_output(self, stdout: str) -> float: "{self.__class__.__name__}: Failed to parse keys per second from benchmark output." ) + def get_tags(self): + return ["SYCL", "application", "throughput"] + class Bitcracker(VelocityBase): def __init__(self, vb: VelocityBench): @@ -170,6 +193,13 @@ def __init__(self, vb: VelocityBench): def name(self): return "Velocity-Bench Bitcracker" + def description(self) -> str: + return ( + "Password-cracking application for BitLocker-encrypted memory units. " + "Uses dictionary attack to find user or recovery passwords. " + "Measures total time required to process 60000 passwords." + ) + def bin_args(self) -> list[str]: self.data_path = os.path.join(self.vb.repo_path, "bitcracker", "hash_pass") @@ -193,6 +223,9 @@ def parse_output(self, stdout: str) -> float: "{self.__class__.__name__}: Failed to parse benchmark output." 
) + def get_tags(self): + return ["SYCL", "application", "throughput"] + class SobelFilter(VelocityBase): def __init__(self, vb: VelocityBench): @@ -204,11 +237,19 @@ def download_deps(self): "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz", untar=True, + checksum="7fc62aa729792ede80ed8ae70fb56fa443d479139c5888ed4d4047b98caec106687a0f05886a9ced77922ccba7f65e66", ) def name(self): return "Velocity-Bench Sobel Filter" + def description(self) -> str: + return ( + "Popular RGB-to-grayscale image conversion technique that applies a gaussian filter " + "to reduce edge artifacts. Processes a large 32K x 32K image and measures " + "the time required to apply the filter." + ) + def bin_args(self) -> list[str]: return [ "-i", @@ -231,6 +272,9 @@ def parse_output(self, stdout: str) -> float: "{self.__class__.__name__}: Failed to parse benchmark output." ) + def get_tags(self): + return ["SYCL", "application", "image", "throughput"] + class QuickSilver(VelocityBase): def __init__(self, vb: VelocityBench): @@ -249,6 +293,13 @@ def run(self, env_vars) -> list[Result]: def name(self): return "Velocity-Bench QuickSilver" + def description(self) -> str: + return ( + "Solves a simplified dynamic Monte Carlo particle-transport problem used in HPC. " + "Replicates memory access patterns, communication patterns, and branching of Mercury workloads. " + "Reports a figure of merit in MMS/CTT where higher values indicate better performance." + ) + def lower_is_better(self): return False @@ -271,6 +322,9 @@ def parse_output(self, stdout: str) -> float: "{self.__class__.__name__}: Failed to parse benchmark output." ) + def get_tags(self): + return ["SYCL", "application", "simulation", "throughput"] + class Easywave(VelocityBase): def __init__(self, vb: VelocityBench): @@ -279,14 +333,22 @@ def __init__(self, vb: VelocityBench): def download_deps(self): self.download( "easywave", - "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", + "https://gitlab.oca.eu/AstroGeoGPM/eazyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz", untar=True, + checksum="3b0cd0efde10122934ba6db8451b8c41f4f95a3370fc967fc5244039ef42aae7e931009af1586fa5ed2143ade8ed47b1", ) def name(self): return "Velocity-Bench Easywave" + def description(self) -> str: + return ( + "A tsunami wave simulator used for researching tsunami generation and wave propagation. " + "Measures the elapsed time in milliseconds to simulate a specified tsunami event " + "based on real-world data." + ) + def bin_args(self) -> list[str]: return [ "-grid", @@ -327,6 +389,9 @@ def parse_output(self, stdout: str) -> float: os.path.join(options.benchmark_cwd, "easywave.log") ) + def get_tags(self): + return ["SYCL", "application", "simulation"] + class CudaSift(VelocityBase): def __init__(self, vb: VelocityBench): @@ -341,6 +406,13 @@ def download_deps(self): def name(self): return "Velocity-Bench CudaSift" + def description(self) -> str: + return ( + "Implementation of the SIFT (Scale Invariant Feature Transform) algorithm " + "for detecting, describing, and matching local features in images. " + "Measures average processing time in milliseconds." 
+ ) + def parse_output(self, stdout: str) -> float: match = re.search(r"Avg workload time = (\d+\.\d+) ms", stdout) if match: @@ -348,6 +420,9 @@ def parse_output(self, stdout: str) -> float: else: raise ValueError("Failed to parse benchmark output.") + def get_tags(self): + return ["SYCL", "application", "image"] + class DLCifar(VelocityBase): def __init__(self, vb: VelocityBench): @@ -364,6 +439,7 @@ def download_deps(self): "cifar-10-binary.tar.gz", untar=True, skip_data_dir=True, + checksum="974b1bd62da0cb3b7a42506d42b1e030c9a0cb4a0f2c359063f9c0e65267c48f0329e4493c183a348f44ddc462eaf814", ) return @@ -382,6 +458,13 @@ def extra_cmake_args(self): def name(self): return "Velocity-Bench dl-cifar" + def description(self) -> str: + return ( + "Deep learning image classification workload based on the CIFAR-10 dataset " + "of 60,000 32x32 color images in 10 classes. Uses neural networks to " + "classify input images and measures total calculation time." + ) + def parse_output(self, stdout: str) -> float: match = re.search( r"dl-cifar - total time for whole calculation: (\d+\.\d+) s", stdout @@ -391,6 +474,9 @@ def parse_output(self, stdout: str) -> float: else: raise ValueError("Failed to parse benchmark output.") + def get_tags(self): + return ["SYCL", "application", "inference", "image"] + class DLMnist(VelocityBase): def __init__(self, vb: VelocityBench): @@ -407,6 +493,7 @@ def download_deps(self): "train-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True, + checksum="f40eb179f7c3d2637e789663bde56d444a23e4a0a14477a9e6ed88bc39c8ad6eaff68056c0cd9bb60daf0062b70dc8ee", ) self.download( "datasets", @@ -414,6 +501,7 @@ def download_deps(self): "train-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True, + checksum="ba9c11bf9a7f7c2c04127b8b3e568cf70dd3429d9029ca59b7650977a4ac32f8ff5041fe42bc872097487b06a6794e00", ) self.download( "datasets", @@ -421,6 +509,7 @@ def download_deps(self): "t10k-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True, + checksum="1bf45877962fd391f7abb20534a30fd2203d0865309fec5f87d576dbdbefdcb16adb49220afc22a0f3478359d229449c", ) self.download( "datasets", @@ -428,6 +517,7 @@ def download_deps(self): "t10k-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True, + checksum="ccc1ee70f798a04e6bfeca56a4d0f0de8d8eeeca9f74641c1e1bfb00cf7cc4aa4d023f6ea1b40e79bb4707107845479d", ) def extra_cmake_args(self): @@ -445,6 +535,13 @@ def extra_cmake_args(self): def name(self): return "Velocity-Bench dl-mnist" + def description(self) -> str: + return ( + "Digit recognition based on the MNIST database, one of the oldest and most popular " + "databases of handwritten digits. Uses neural networks to identify digits " + "and measures total calculation time." + ) + def bin_args(self): return ["-conv_algo", "ONEDNN_AUTO"] @@ -465,6 +562,9 @@ def parse_output(self, stdout: str) -> float: else: raise ValueError("Failed to parse benchmark output.") + def get_tags(self): + return ["SYCL", "application", "inference", "image"] + class SVM(VelocityBase): def __init__(self, vb: VelocityBench): @@ -488,6 +588,13 @@ def extra_cmake_args(self): def name(self): return "Velocity-Bench svm" + def description(self) -> str: + return ( + "Implementation of Support Vector Machine, a popular classical machine learning technique. " + "Uses supervised learning models with associated algorithms to analyze data " + "for classification and regression analysis. Measures total elapsed time." 
+ ) + def bin_args(self): return [ f"{self.code_path}/a9a", @@ -500,3 +607,6 @@ def parse_output(self, stdout: str) -> float: return float(match.group(1)) else: raise ValueError("Failed to parse benchmark output.") + + def get_tags(self): + return ["SYCL", "application", "inference"] diff --git a/unified-runtime/scripts/benchmarks/history.py b/devops/scripts/benchmarks/history.py similarity index 68% rename from unified-runtime/scripts/benchmarks/history.py rename to devops/scripts/benchmarks/history.py index 7902aa4f04c35..0b80c54ad7393 100644 --- a/unified-runtime/scripts/benchmarks/history.py +++ b/devops/scripts/benchmarks/history.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -6,14 +6,14 @@ import os import json from pathlib import Path -from benches.result import Result, BenchmarkRun +import socket +from utils.result import Result, BenchmarkRun from options import Compare, options from datetime import datetime, timezone from utils.utils import run class BenchmarkHistory: - benchmark_run_index_max = 0 runs = [] def __init__(self, dir): @@ -35,42 +35,55 @@ def load(self, n: int): # Get all JSON files in the results directory benchmark_files = list(results_dir.glob("*.json")) - # Extract index numbers and sort files by index number - def extract_index(file_path: Path) -> int: + # Extract timestamp and sort files by it + def extract_timestamp(file_path: Path) -> str: try: - return int(file_path.stem.split("_")[0]) - except (IndexError, ValueError): - return -1 + return file_path.stem.split("_")[-1] + except IndexError: + return "" - benchmark_files = [ - file for file in benchmark_files if extract_index(file) != -1 - ] - benchmark_files.sort(key=extract_index) + benchmark_files.sort(key=extract_timestamp, reverse=True) # Load the first n benchmark files benchmark_runs = [] - for file_path in benchmark_files[n::-1]: + for file_path in benchmark_files[:n]: benchmark_run = self.load_result(file_path) if benchmark_run: benchmark_runs.append(benchmark_run) - if benchmark_files: - self.benchmark_run_index_max = extract_index(benchmark_files[-1]) - self.runs = benchmark_runs def create_run(self, name: str, results: list[Result]) -> BenchmarkRun: try: - result = run("git rev-parse --short HEAD") + script_dir = os.path.dirname(os.path.abspath(__file__)) + result = run("git rev-parse --short HEAD", cwd=script_dir) git_hash = result.stdout.decode().strip() + + # Get the GitHub repo URL from git remote + remote_result = run("git remote get-url origin", cwd=script_dir) + remote_url = remote_result.stdout.decode().strip() + + # Convert SSH or HTTPS URL to owner/repo format + if remote_url.startswith("git@github.com:"): + # SSH format: git@github.com:owner/repo.git + github_repo = remote_url.split("git@github.com:")[1].rstrip(".git") + elif remote_url.startswith("https://github.com/"): + # HTTPS format: https://github.com/owner/repo.git + github_repo = remote_url.split("https://github.com/")[1].rstrip(".git") + else: + github_repo = None + except: git_hash = "unknown" + github_repo = None return BenchmarkRun( name=name, git_hash=git_hash, + github_repo=github_repo, date=datetime.now(tz=timezone.utc), results=results, + hostname=socket.gethostname(), ) def save(self, save_name, results: list[Result], to_file=True): @@ -84,12 +97,9 @@ def save(self, save_name, results: 
list[Result], to_file=True): results_dir = Path(os.path.join(self.dir, "results")) os.makedirs(results_dir, exist_ok=True) - self.benchmark_run_index_max += 1 - file_path = Path( - os.path.join( - results_dir, f"{self.benchmark_run_index_max}_{save_name}.json" - ) - ) + # Use formatted timestamp for the filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + file_path = Path(os.path.join(results_dir, f"{save_name}_{timestamp}.json")) with file_path.open("w") as file: json.dump(serialized, file, indent=4) print(f"Benchmark results saved to {file_path}") @@ -120,6 +130,7 @@ def compute_average(self, data: list[BenchmarkRun]): name=first_run.name, git_hash="average", date=first_run.date, # should this be different? + hostname=first_run.hostname, ) return average_benchmark_run diff --git a/devops/scripts/benchmarks/html/config.js b/devops/scripts/benchmarks/html/config.js new file mode 100644 index 0000000000000..3e67ae1dce8e5 --- /dev/null +++ b/devops/scripts/benchmarks/html/config.js @@ -0,0 +1,2 @@ +//remoteDataUrl = 'https://example.com/data.json'; +//defaultCompareNames = ['baseline']; diff --git a/devops/scripts/benchmarks/html/data.js b/devops/scripts/benchmarks/html/data.js new file mode 100644 index 0000000000000..a5b96c72834ba --- /dev/null +++ b/devops/scripts/benchmarks/html/data.js @@ -0,0 +1,3 @@ +benchmarkRuns = []; + +defaultCompareNames = []; diff --git a/devops/scripts/benchmarks/html/index.html b/devops/scripts/benchmarks/html/index.html new file mode 100644 index 0000000000000..c0b4cd3d06b4f --- /dev/null +++ b/devops/scripts/benchmarks/html/index.html @@ -0,0 +1,82 @@ + + + + + + + Benchmark Results + + + + + + + + +
+ <!-- index.html body: the markup did not survive extraction; only the visible text remains.
+      Recoverable structure: a "Benchmark Results" page heading, an "Options" panel containing
+      "Display Options", "Suites", and "Tags" filter sections, followed by chart sections titled
+      "Historical Results", "Historical Layer Comparisons", and "Comparisons". -->
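Since the markup of the added index.html was lost above, the following is a minimal skeleton of what the page plausibly looks like, reconstructed from the element IDs, classes, and global handlers referenced by scripts.js and styles.css later in this patch. It is an illustrative sketch only, not the file added by the PR; the CDN script URLs, default checkbox states, and exact layout are assumptions.

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Benchmark Results</title>
  <link rel="stylesheet" href="styles.css">
  <!-- Chart.js plus a date adapter are needed for the 'timeseries' x-axis used in scripts.js;
       these CDN URLs are assumptions, not taken from the original file. -->
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/chartjs-adapter-date-fns"></script>
  <!-- config.js/data.js are expected to define remoteDataUrl, benchmarkRuns, defaultCompareNames, etc. -->
  <script src="config.js"></script>
  <script src="data.js"></script>
  <script src="scripts.js"></script>
</head>
<body>
  <div class="container">
    <h1>Benchmark Results</h1>
    <div id="loading-indicator" class="loading-indicator">Loading...</div>

    <!-- Run selector: options are populated by setupRunSelector() in scripts.js -->
    <div class="run-selector">
      <select id="run-select"></select>
      <button onclick="addSelectedRun()">Add</button>
      <div id="selected-runs" class="selected-runs"></div>
    </div>

    <!-- Benchmark name filter: read as a regex by filterCharts() -->
    <div class="filter-container">
      <input type="text" id="bench-filter" placeholder="Filter benchmarks (regex)">
    </div>

    <details class="options-container">
      <summary>Options</summary>
      <div class="options-content">
        <div class="filter-section">
          <h3>Display Options</h3>
          <div class="display-options">
            <label><input type="checkbox" id="show-notes" checked> Show notes</label>
            <label><input type="checkbox" id="show-unstable"> Show unstable benchmarks</label>
          </div>
        </div>
        <div class="filter-section">
          <h3>Suites</h3>
          <div id="suite-filters"></div>
        </div>
        <div class="filter-section">
          <h3>Tags
            <button class="tag-action-button" onclick="toggleAllTags(true)">Select all</button>
            <button class="tag-action-button" onclick="toggleAllTags(false)">Clear</button>
          </h3>
          <div id="tag-filters"></div>
        </div>
      </div>
    </details>

    <!-- Chart sections: drawCharts() appends one .chart-container per benchmark or group -->
    <div class="timeseries">
      <h2>Historical Results</h2>
      <div class="charts"></div>
    </div>
    <div class="layer-comparisons">
      <h2>Historical Layer Comparisons</h2>
      <div class="charts"></div>
    </div>
    <div class="bar-charts">
      <h2>Comparisons</h2>
      <div class="charts"></div>
    </div>
  </div>
</body>
</html>

Every id and class above (run-select, selected-runs, bench-filter, show-notes, show-unstable, suite-filters, tag-filters, loading-indicator, and the .timeseries/.layer-comparisons/.bar-charts sections with .charts children) is taken from the DOM lookups in scripts.js, and the onclick handlers correspond to the functions it exports on window; only the surrounding layout is guessed.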
+ + diff --git a/devops/scripts/benchmarks/html/scripts.js b/devops/scripts/benchmarks/html/scripts.js new file mode 100644 index 0000000000000..74716bec6b82f --- /dev/null +++ b/devops/scripts/benchmarks/html/scripts.js @@ -0,0 +1,976 @@ +// Copyright (C) 2024-2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Core state +let activeRuns = new Set(defaultCompareNames); +let chartInstances = new Map(); +let suiteNames = new Set(); +let timeseriesData, barChartsData, allRunNames; +let activeTags = new Set(); +let layerComparisonsData; + +// DOM Elements +let runSelect, selectedRunsDiv, suiteFiltersContainer, tagFiltersContainer; + +const colorPalette = [ + 'rgb(255, 50, 80)', + 'rgb(255, 145, 15)', + 'rgb(255, 220, 0)', + 'rgb(20, 200, 50)', + 'rgb(0, 130, 255)', + 'rgb(180, 60, 255)', + 'rgb(255, 40, 200)', + 'rgb(0, 210, 180)', + 'rgb(255, 90, 0)', + 'rgb(110, 220, 0)', + 'rgb(240, 100, 170)', + 'rgb(30, 175, 255)', + 'rgb(180, 210, 0)', + 'rgb(130, 0, 220)', + 'rgb(255, 170, 0)', + 'rgb(0, 170, 110)', + 'rgb(220, 80, 60)', + 'rgb(80, 115, 230)', + 'rgb(210, 190, 0)', +]; + +// Run selector functions +function updateSelectedRuns(forceUpdate = true) { + selectedRunsDiv.innerHTML = ''; + activeRuns.forEach(name => { + selectedRunsDiv.appendChild(createRunElement(name)); + }); + if (forceUpdate) + updateCharts(); +} + +function createRunElement(name) { + const runElement = document.createElement('span'); + runElement.className = 'selected-run'; + runElement.innerHTML = `${name} `; + return runElement; +} + +function addSelectedRun() { + const selectedRun = runSelect.value; + if (selectedRun && !activeRuns.has(selectedRun)) { + activeRuns.add(selectedRun); + updateSelectedRuns(); + } +} + +function removeRun(name) { + activeRuns.delete(name); + updateSelectedRuns(); +} + +// Chart creation and update +function createChart(data, containerId, type) { + if (chartInstances.has(containerId)) { + chartInstances.get(containerId).destroy(); + } + + const ctx = document.getElementById(containerId).getContext('2d'); + const options = { + responsive: true, + plugins: { + title: { + display: true, + text: data.label + }, + subtitle: { + display: true, + text: data.lower_is_better ? "Lower is better" : "Higher is better" + }, + tooltip: { + callbacks: { + label: (context) => { + if (type === 'time') { + const point = context.raw; + return [ + `${point.seriesName}:`, + `Value: ${point.y.toFixed(2)} ${data.unit}`, + `Stddev: ${point.stddev.toFixed(2)} ${data.unit}`, + `Git Hash: ${point.gitHash}`, + ]; + } else { + return [`${context.dataset.label}:`, + `Value: ${context.parsed.y.toFixed(2)} ${data.unit}`, + ]; + } + } + } + } + }, + scales: { + y: { + title: { + display: true, + text: data.unit + }, + grace: '20%', + } + } + }; + + if (type === 'time') { + options.interaction = { + mode: 'nearest', + intersect: false + }; + options.onClick = (event, elements) => { + if (elements.length > 0) { + const point = elements[0].element.$context.raw; + if (point.gitHash && point.gitRepo) { + window.open(`https://github.com/${point.gitRepo}/commit/${point.gitHash}`, '_blank'); + } + } + }; + options.scales.x = { + type: 'timeseries', + time: { + unit: 'day' + }, + ticks: { + maxRotation: 45, + minRotation: 45, + autoSkip: true, + maxTicksLimit: 10 + } + }; + } + + const chartConfig = { + type: type === 'time' ? 'line' : 'bar', + data: type === 'time' ? 
{ + datasets: createTimeseriesDatasets(data) + } : { + labels: data.labels, + datasets: data.datasets + }, + options: options + }; + + const chart = new Chart(ctx, chartConfig); + chartInstances.set(containerId, chart); + return chart; +} + +function createTimeseriesDatasets(data) { + return Object.entries(data.runs).map(([name, runData], index) => ({ + label: name, + data: runData.points.map(p => ({ + seriesName: name, + x: p.date, + y: p.value, + gitHash: p.git_hash, + gitRepo: p.github_repo, + stddev: p.stddev + })), + borderColor: colorPalette[index % colorPalette.length], + backgroundColor: colorPalette[index % colorPalette.length], + borderWidth: 1, + pointRadius: 3, + pointStyle: 'circle', + pointHoverRadius: 5 + })); +} + +function updateCharts() { + const filterRunData = (chart) => ({ + ...chart, + runs: Object.fromEntries( + Object.entries(chart.runs).filter(([_, data]) => + activeRuns.has(data.runName) + ) + ) + }); + + const filteredTimeseriesData = timeseriesData.map(filterRunData); + const filteredLayerComparisonsData = layerComparisonsData.map(filterRunData); + + const filteredBarChartsData = barChartsData.map(chart => ({ + ...chart, + labels: chart.labels.filter(label => activeRuns.has(label)), + datasets: chart.datasets.map(dataset => ({ + ...dataset, + data: dataset.data.filter((_, i) => activeRuns.has(chart.labels[i])) + })) + })); + + drawCharts(filteredTimeseriesData, filteredBarChartsData, filteredLayerComparisonsData); +} + +function drawCharts(filteredTimeseriesData, filteredBarChartsData, filteredLayerComparisonsData) { + // Clear existing charts + document.querySelectorAll('.charts').forEach(container => container.innerHTML = ''); + chartInstances.forEach(chart => chart.destroy()); + chartInstances.clear(); + + // Create timeseries charts + filteredTimeseriesData.forEach((data, index) => { + const containerId = `timeseries-${index}`; + const container = createChartContainer(data, containerId, 'benchmark'); + document.querySelector('.timeseries .charts').appendChild(container); + createChart(data, containerId, 'time'); + }); + + // Create layer comparison charts + filteredLayerComparisonsData.forEach((data, index) => { + const containerId = `layer-comparison-${index}`; + const container = createChartContainer(data, containerId, 'group'); + document.querySelector('.layer-comparisons .charts').appendChild(container); + createChart(data, containerId, 'time'); + }); + + // Create bar charts + filteredBarChartsData.forEach((data, index) => { + const containerId = `barchart-${index}`; + const container = createChartContainer(data, containerId, 'group'); + document.querySelector('.bar-charts .charts').appendChild(container); + createChart(data, containerId, 'bar'); + }); + + // Apply current filters + filterCharts(); +} + +function createChartContainer(data, canvasId, type) { + const container = document.createElement('div'); + container.className = 'chart-container'; + container.setAttribute('data-label', data.label); + container.setAttribute('data-suite', data.suite); + + // Check if this benchmark is marked as unstable + const metadata = metadataForLabel(data.label, type); + if (metadata && metadata.unstable) { + container.setAttribute('data-unstable', 'true'); + + // Add unstable warning + const unstableWarning = document.createElement('div'); + unstableWarning.className = 'benchmark-unstable'; + unstableWarning.textContent = metadata.unstable; + unstableWarning.style.display = isUnstableEnabled() ? 
'block' : 'none'; + container.appendChild(unstableWarning); + } + + // Add description if present in metadata (moved outside of details) + if (metadata && metadata.description) { + const descElement = document.createElement('div'); + descElement.className = 'benchmark-description'; + descElement.textContent = metadata.description; + container.appendChild(descElement); + } + + // Add notes if present + if (metadata && metadata.notes) { + const noteElement = document.createElement('div'); + noteElement.className = 'benchmark-note'; + noteElement.textContent = metadata.notes; + noteElement.style.display = isNotesEnabled() ? 'block' : 'none'; + container.appendChild(noteElement); + } + + // Add tags if present + if (metadata && metadata.tags) { + container.setAttribute('data-tags', metadata.tags.join(',')); + + // Add tags display + const tagsContainer = document.createElement('div'); + tagsContainer.className = 'benchmark-tags'; + + metadata.tags.forEach(tag => { + const tagElement = document.createElement('span'); + tagElement.className = 'tag'; + tagElement.textContent = tag; + tagElement.setAttribute('data-tag', tag); + + // Add tooltip with tag description + if (benchmarkTags[tag]) { + tagElement.setAttribute('title', benchmarkTags[tag].description); + } + + tagsContainer.appendChild(tagElement); + }); + + container.appendChild(tagsContainer); + } + + const canvas = document.createElement('canvas'); + canvas.id = canvasId; + container.appendChild(canvas); + + // Create details section for extra info + const details = document.createElement('details'); + const summary = document.createElement('summary'); + summary.textContent = "Details"; + + // Add subtle download button to the summary + const downloadButton = document.createElement('button'); + downloadButton.className = 'download-button'; + downloadButton.textContent = 'Download'; + downloadButton.onclick = (event) => { + event.stopPropagation(); // Prevent details toggle + downloadChart(canvasId, data.label); + }; + summary.appendChild(downloadButton); + details.appendChild(summary); + + // Create and append extra info + const extraInfo = document.createElement('div'); + extraInfo.className = 'extra-info'; + latestRunsLookup = createLatestRunsLookup(benchmarkRuns); + extraInfo.innerHTML = generateExtraInfo(latestRunsLookup, data, 'benchmark'); + details.appendChild(extraInfo); + + container.appendChild(details); + + return container; +} + +function metadataForLabel(label, type) { + for (const [key, metadata] of Object.entries(benchmarkMetadata)) { + if (metadata.type === type && label.startsWith(key)) { + return metadata; + } + } + + return null; +} + +// Pre-compute a lookup for the latest run per label +function createLatestRunsLookup(benchmarkRuns) { + const latestRunsMap = new Map(); + + benchmarkRuns.forEach(run => { + // Yes, we need to convert the date every time. I checked. 
+ const runDate = new Date(run.date); + run.results.forEach(result => { + const label = result.label; + if (!latestRunsMap.has(label) || runDate > new Date(latestRunsMap.get(label).date)) { + latestRunsMap.set(label, { + run, + result + }); + } + }); + }); + + return latestRunsMap; +} + +function extractLabels(data) { + // For layer comparison charts + if (data.benchmarkLabels) { + return data.benchmarkLabels; + } + + // For bar charts + if (data.datasets) { + return data.datasets.map(dataset => dataset.label); + } + + // For time series charts + return [data.label]; +} + +function generateExtraInfo(latestRunsLookup, data) { + const labels = extractLabels(data); + + return labels.map(label => { + const metadata = metadataForLabel(label, 'benchmark'); + const latestRun = latestRunsLookup.get(label); + + let html = '
'; + + if (metadata && latestRun) { + html += `${label}: ${formatCommand(latestRun.result)}
`; + + if (metadata.description) { + html += `Description: ${metadata.description}`; + } + + if (metadata.notes) { + html += `
Notes: ${metadata.notes}`; + } + + if (metadata.unstable) { + html += `
⚠️ Unstable: ${metadata.unstable}`; + } + } else { + html += `${label}: No data available`; + } + + html += '
'; + return html; + }).join(''); +} + +function formatCommand(run) { + const envVars = Object.entries(run.env || {}).map(([key, value]) => `${key}=${value}`).join(' '); + let command = run.command ? [...run.command] : []; + + return `${envVars} ${command.join(' ')}`.trim(); +} + +function downloadChart(canvasId, label) { + const chart = chartInstances.get(canvasId); + if (chart) { + const link = document.createElement('a'); + link.href = chart.toBase64Image('image/png', 1) + link.download = `${label}.png`; + link.click(); + } +} + +// URL and filtering functions +function getQueryParam(param) { + const urlParams = new URLSearchParams(window.location.search); + return urlParams.get(param); +} + +function updateURL() { + const url = new URL(window.location); + const regex = document.getElementById('bench-filter').value; + const activeSuites = getActiveSuites(); + const activeRunsList = Array.from(activeRuns); + const activeTagsList = Array.from(activeTags); + + if (regex) { + url.searchParams.set('regex', regex); + } else { + url.searchParams.delete('regex'); + } + + if (activeSuites.length > 0 && activeSuites.length != suiteNames.size) { + url.searchParams.set('suites', activeSuites.join(',')); + } else { + url.searchParams.delete('suites'); + } + + // Add tags to URL + if (activeTagsList.length > 0) { + url.searchParams.set('tags', activeTagsList.join(',')); + } else { + url.searchParams.delete('tags'); + } + + // Handle the runs parameter + if (activeRunsList.length > 0) { + // Check if the active runs are the same as default runs + const defaultRuns = new Set(defaultCompareNames || []); + const isDefaultRuns = activeRunsList.length === defaultRuns.size && + activeRunsList.every(run => defaultRuns.has(run)); + + if (isDefaultRuns) { + // If it's just the default runs, omit the parameter entirely + url.searchParams.delete('runs'); + } else { + url.searchParams.set('runs', activeRunsList.join(',')); + } + } else { + url.searchParams.delete('runs'); + } + + // Add toggle states to URL + if (isNotesEnabled()) { + url.searchParams.delete('notes'); + } else { + url.searchParams.set('notes', 'false'); + } + + if (!isUnstableEnabled()) { + url.searchParams.delete('unstable'); + } else { + url.searchParams.set('unstable', 'true'); + } + + history.replaceState(null, '', url); +} + +function filterCharts() { + const regexInput = document.getElementById('bench-filter').value; + const regex = new RegExp(regexInput, 'i'); + const activeSuites = getActiveSuites(); + + document.querySelectorAll('.chart-container').forEach(container => { + const label = container.getAttribute('data-label'); + const suite = container.getAttribute('data-suite'); + const isUnstable = container.getAttribute('data-unstable') === 'true'; + const tags = container.getAttribute('data-tags') ? + container.getAttribute('data-tags').split(',') : []; + + // Check if benchmark has all active tags (if any are selected) + const hasAllActiveTags = activeTags.size === 0 || + Array.from(activeTags).every(tag => tags.includes(tag)); + + // Hide unstable benchmarks if showUnstable is false + const shouldShow = regex.test(label) && + activeSuites.includes(suite) && + (isUnstableEnabled() || !isUnstable) && + hasAllActiveTags; + + container.style.display = shouldShow ? 
'' : 'none'; + }); + + updateURL(); +} + +function getActiveSuites() { + return Array.from(document.querySelectorAll('.suite-checkbox:checked')) + .map(checkbox => checkbox.getAttribute('data-suite')); +} + +// Data processing +function processTimeseriesData(benchmarkRuns) { + const resultsByLabel = {}; + + benchmarkRuns.forEach(run => { + run.results.forEach(result => { + if (!resultsByLabel[result.label]) { + resultsByLabel[result.label] = { + label: result.label, + suite: result.suite, + unit: result.unit, + lower_is_better: result.lower_is_better, + runs: {} + }; + } + + addRunDataPoint(resultsByLabel[result.label], run, result, run.name); + }); + }); + + return Object.values(resultsByLabel); +} + +function processBarChartsData(benchmarkRuns) { + const groupedResults = {}; + + benchmarkRuns.forEach(run => { + run.results.forEach(result => { + if (!result.explicit_group) return; + + if (!groupedResults[result.explicit_group]) { + // Look up group metadata + const groupMetadata = metadataForLabel(result.explicit_group); + + groupedResults[result.explicit_group] = { + label: result.explicit_group, + suite: result.suite, + unit: result.unit, + lower_is_better: result.lower_is_better, + labels: [], + datasets: [], + // Add metadata if available + description: groupMetadata?.description || null, + notes: groupMetadata?.notes || null, + unstable: groupMetadata?.unstable || null + }; + } + + const group = groupedResults[result.explicit_group]; + + if (!group.labels.includes(run.name)) { + group.labels.push(run.name); + } + + let dataset = group.datasets.find(d => d.label === result.label); + if (!dataset) { + const datasetIndex = group.datasets.length; + dataset = { + label: result.label, + data: new Array(group.labels.length).fill(null), + backgroundColor: colorPalette[datasetIndex % colorPalette.length], + borderColor: colorPalette[datasetIndex % colorPalette.length], + borderWidth: 1 + }; + group.datasets.push(dataset); + } + + const runIndex = group.labels.indexOf(run.name); + if (dataset.data[runIndex] == null) + dataset.data[runIndex] = result.value; + }); + }); + + return Object.values(groupedResults); +} + +function getLayerTags(metadata) { + const layerTags = new Set(); + if (metadata?.tags) { + metadata.tags.forEach(tag => { + if (tag.startsWith('SYCL') || tag.startsWith('UR') || tag === 'L0') { + layerTags.add(tag); + } + }); + } + return layerTags; +} + +function processLayerComparisonsData(benchmarkRuns) { + const groupedResults = {}; + + benchmarkRuns.forEach(run => { + run.results.forEach(result => { + if (!result.explicit_group) return; + + // Skip if no metadata available + const metadata = metadataForLabel(result.explicit_group, 'group'); + if (!metadata) return; + + // Get all benchmark labels in this group + const labelsInGroup = new Set( + benchmarkRuns.flatMap(r => + r.results + .filter(res => res.explicit_group === result.explicit_group) + .map(res => res.label) + ) + ); + + // Check if this group compares different layers + const uniqueLayers = new Set(); + labelsInGroup.forEach(label => { + const labelMetadata = metadataForLabel(label, 'benchmark'); + const layerTags = getLayerTags(labelMetadata); + layerTags.forEach(tag => uniqueLayers.add(tag)); + }); + + // Only process groups that compare different layers + if (uniqueLayers.size <= 1) return; + + if (!groupedResults[result.explicit_group]) { + groupedResults[result.explicit_group] = { + label: result.explicit_group, + suite: result.suite, + unit: result.unit, + lower_is_better: result.lower_is_better, + runs: {}, + 
benchmarkLabels: [], + description: metadata?.description || null, + notes: metadata?.notes || null, + unstable: metadata?.unstable || null + }; + } + + const group = groupedResults[result.explicit_group]; + const name = result.label + ' (' + run.name + ')'; + + // Add the benchmark label if it's not already in the array + if (!group.benchmarkLabels.includes(result.label)) { + group.benchmarkLabels.push(result.label); + } + + addRunDataPoint(group, run, result, name); + }); + }); + + return Object.values(groupedResults); +} + +function createRunDataStructure(run, result, label) { + return { + runName: run.name, + points: [{ + date: new Date(run.date), + value: result.value, + stddev: result.stddev, + git_hash: run.git_hash, + github_repo: run.github_repo, + label: label || result.label + }] + }; +} + +function addRunDataPoint(group, run, result, name = null) { + const runKey = name || result.label + ' (' + run.name + ')'; + + if (!group.runs[runKey]) { + group.runs[runKey] = { + runName: run.name, + points: [] + }; + } + + group.runs[runKey].points.push({ + date: new Date(run.date), + value: result.value, + stddev: result.stddev, + git_hash: run.git_hash, + github_repo: run.github_repo, + }); + + return group; +} + +// Setup functions +function setupRunSelector() { + runSelect = document.getElementById('run-select'); + selectedRunsDiv = document.getElementById('selected-runs'); + + allRunNames.forEach(name => { + const option = document.createElement('option'); + option.value = name; + option.textContent = name; + runSelect.appendChild(option); + }); + + updateSelectedRuns(false); +} + +function setupSuiteFilters() { + suiteFiltersContainer = document.getElementById('suite-filters'); + + benchmarkRuns.forEach(run => { + run.results.forEach(result => { + suiteNames.add(result.suite); + }); + }); + + suiteNames.forEach(suite => { + const label = document.createElement('label'); + const checkbox = document.createElement('input'); + checkbox.type = 'checkbox'; + checkbox.className = 'suite-checkbox'; + checkbox.dataset.suite = suite; + checkbox.checked = true; + label.appendChild(checkbox); + label.appendChild(document.createTextNode(' ' + suite)); + suiteFiltersContainer.appendChild(label); + suiteFiltersContainer.appendChild(document.createTextNode(' ')); + }); +} + +function isNotesEnabled() { + const notesToggle = document.getElementById('show-notes'); + return notesToggle.checked; +} + +function isUnstableEnabled() { + const unstableToggle = document.getElementById('show-unstable'); + return unstableToggle.checked; +} + +function setupToggles() { + const notesToggle = document.getElementById('show-notes'); + const unstableToggle = document.getElementById('show-unstable'); + + notesToggle.addEventListener('change', function() { + // Update all note elements visibility + document.querySelectorAll('.benchmark-note').forEach(note => { + note.style.display = isNotesEnabled() ? 'block' : 'none'; + }); + updateURL(); + }); + + unstableToggle.addEventListener('change', function() { + // Update all unstable warning elements visibility + document.querySelectorAll('.benchmark-unstable').forEach(warning => { + warning.style.display = isUnstableEnabled() ? 
'block' : 'none'; + }); + filterCharts(); + }); + + // Initialize from URL params if present + const notesParam = getQueryParam('notes'); + const unstableParam = getQueryParam('unstable'); + + if (notesParam !== null) { + let showNotes = notesParam === 'true'; + notesToggle.checked = showNotes; + } + + if (unstableParam !== null) { + let showUnstable = unstableParam === 'true'; + unstableToggle.checked = showUnstable; + } +} + +function setupTagFilters() { + tagFiltersContainer = document.getElementById('tag-filters'); + + const allTags = []; + + if (benchmarkTags) { + for (const tag in benchmarkTags) { + if (!allTags.includes(tag)) { + allTags.push(tag); + } + } + } + + // Create tag filter elements + allTags.forEach(tag => { + const tagContainer = document.createElement('div'); + tagContainer.className = 'tag-filter'; + + const checkbox = document.createElement('input'); + checkbox.type = 'checkbox'; + checkbox.id = `tag-${tag}`; + checkbox.className = 'tag-checkbox'; + checkbox.dataset.tag = tag; + + const label = document.createElement('label'); + label.htmlFor = `tag-${tag}`; + label.textContent = tag; + + // Add info icon with tooltip if tag description exists + if (benchmarkTags[tag]) { + const infoIcon = document.createElement('span'); + infoIcon.className = 'tag-info'; + infoIcon.textContent = 'ⓘ'; + infoIcon.title = benchmarkTags[tag].description; + label.appendChild(infoIcon); + } + + checkbox.addEventListener('change', function() { + if (this.checked) { + activeTags.add(tag); + } else { + activeTags.delete(tag); + } + filterCharts(); + }); + + tagContainer.appendChild(checkbox); + tagContainer.appendChild(label); + tagFiltersContainer.appendChild(tagContainer); + }); +} + +function toggleAllTags(select) { + const checkboxes = document.querySelectorAll('.tag-checkbox'); + + checkboxes.forEach(checkbox => { + checkbox.checked = select; + const tag = checkbox.dataset.tag; + + if (select) { + activeTags.add(tag); + } else { + activeTags.delete(tag); + } + }); + + filterCharts(); +} + +function initializeCharts() { + // Process raw data + timeseriesData = processTimeseriesData(benchmarkRuns); + barChartsData = processBarChartsData(benchmarkRuns); + layerComparisonsData = processLayerComparisonsData(benchmarkRuns); + allRunNames = [...new Set(benchmarkRuns.map(run => run.name))]; + + // Set up active runs + const runsParam = getQueryParam('runs'); + if (runsParam) { + const runsFromUrl = runsParam.split(','); + + // Start with an empty set + activeRuns = new Set(); + + // Process each run from URL + runsFromUrl.forEach(run => { + if (run === 'default') { + // Special case: include all default runs + (defaultCompareNames || []).forEach(defaultRun => { + if (allRunNames.includes(defaultRun)) { + activeRuns.add(defaultRun); + } + }); + } else if (allRunNames.includes(run)) { + // Add the specific run if it exists + activeRuns.add(run); + } + }); + } else { + // No runs parameter, use defaults + activeRuns = new Set(defaultCompareNames || []); + } + + // Setup UI components + setupRunSelector(); + setupSuiteFilters(); + setupTagFilters(); + setupToggles(); + + // Apply URL parameters + const regexParam = getQueryParam('regex'); + const suitesParam = getQueryParam('suites'); + const tagsParam = getQueryParam('tags'); + + if (regexParam) { + document.getElementById('bench-filter').value = regexParam; + } + + if (suitesParam) { + const suites = suitesParam.split(','); + document.querySelectorAll('.suite-checkbox').forEach(checkbox => { + checkbox.checked = 
suites.includes(checkbox.getAttribute('data-suite')); + }); + } + + // Apply tag filters from URL + if (tagsParam) { + const tags = tagsParam.split(','); + tags.forEach(tag => { + const checkbox = document.querySelector(`.tag-checkbox[data-tag="${tag}"]`); + if (checkbox) { + checkbox.checked = true; + activeTags.add(tag); + } + }); + } + + // Setup event listeners + document.querySelectorAll('.suite-checkbox').forEach(checkbox => { + checkbox.addEventListener('change', filterCharts); + }); + document.getElementById('bench-filter').addEventListener('input', filterCharts); + + // Draw initial charts + updateCharts(); +} + +// Make functions available globally for onclick handlers +window.addSelectedRun = addSelectedRun; +window.removeRun = removeRun; +window.toggleAllTags = toggleAllTags; + +// Load data based on configuration +function loadData() { + const loadingIndicator = document.getElementById('loading-indicator'); + loadingIndicator.style.display = 'block'; // Show loading indicator + + if (typeof remoteDataUrl !== 'undefined' && remoteDataUrl !== '') { + // Fetch data from remote URL + fetch(remoteDataUrl) + .then(response => response.json()) + .then(data => { + benchmarkRuns = data.runs || data; + benchmarkMetadata = data.metadata || benchmarkMetadata || {}; + benchmarkTags = data.tags || benchmarkTags || {}; + initializeCharts(); + }) + .catch(error => { + console.error('Error fetching remote data:', error); + loadingIndicator.textContent = 'Fetching remote data failed.'; + }) + .finally(() => { + loadingIndicator.style.display = 'none'; // Hide loading indicator + }); + } else { + // Use local data (benchmarkRuns and benchmarkMetadata should be defined in data.js) + initializeCharts(); + loadingIndicator.style.display = 'none'; // Hide loading indicator + } +} + +// Initialize when DOM is ready +document.addEventListener('DOMContentLoaded', () => { + loadData(); +}); diff --git a/devops/scripts/benchmarks/html/styles.css b/devops/scripts/benchmarks/html/styles.css new file mode 100644 index 0000000000000..3e9c3bd22fc37 --- /dev/null +++ b/devops/scripts/benchmarks/html/styles.css @@ -0,0 +1,357 @@ +body { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + margin: 0; + padding: 16px; + background: #f8f9fa; +} +.container { + max-width: 1100px; + margin: 0 auto; +} +h1, h2 { + color: #212529; + text-align: center; + margin-bottom: 24px; + font-weight: 500; +} +.chart-container { + background: white; + border-radius: 8px; + padding: 24px; + margin-bottom: 24px; + box-shadow: 0 1px 3px rgba(0,0,0,0.1); +} +@media (max-width: 768px) { + body { + padding: 12px; + } + .chart-container { + padding: 16px; + border-radius: 6px; + } + h1 { + font-size: 24px; + margin-bottom: 16px; + } +} +.filter-container { + text-align: center; + margin-bottom: 24px; +} +.filter-container input { + padding: 8px; + font-size: 16px; + border: 1px solid #ccc; + border-radius: 4px; + width: 400px; + max-width: 100%; +} +.suite-filter-container { + text-align: center; + margin-bottom: 24px; + padding: 16px; + background: #e9ecef; + border-radius: 8px; +} +.suite-checkbox { + margin: 0 8px; +} +details { + margin-bottom: 24px; +} +summary { + display: flex; + justify-content: space-between; + align-items: center; + font-size: 16px; + font-weight: 500; + cursor: pointer; + padding: 12px 16px; + background: #dee2e6; + border-radius: 8px; + user-select: none; +} +summary:hover { + background: #ced4da; +} +summary::marker { + display: none; +} +summary::-webkit-details-marker { + 
display: none; +} +summary::after { + content: "▼"; + font-size: 12px; + margin-left: 8px; + transition: transform 0.3s; +} +details[open] summary::after { + transform: rotate(180deg); +} +.extra-info { + padding: 8px; + background: #f8f9fa; + border-radius: 8px; + margin-top: 8px; +} +.run-selector { + text-align: center; + margin-bottom: 24px; + padding: 16px; + background: #e9ecef; + border-radius: 8px; +} +.run-selector select { + width: 300px; + padding: 8px; + margin-right: 8px; +} +.run-selector button { + padding: 8px 16px; + background: #0068B5; + color: white; + border: none; + border-radius: 4px; + cursor: pointer; +} +.run-selector button:hover { + background: #00C7FD; +} +.selected-runs { + margin-top: 12px; +} +.selected-run { + display: inline-block; + padding: 4px 8px; + margin: 4px; + background: #e2e6ea; + border-radius: 4px; +} +.selected-run button { + margin-left: 8px; + padding: 0 4px; + background: none; + border: none; + color: #dc3545; + cursor: pointer; +} +.download-button { + background: none; + border: none; + color: #0068B5; + cursor: pointer; + font-size: 16px; + padding: 4px; + margin-left: 8px; +} +.download-button:hover { + color: #00C7FD; +} +.loading-indicator { + text-align: center; + font-size: 18px; + color: #0068B5; + margin-bottom: 20px; +} +.extra-info-entry { + border: 1px solid #ddd; + padding: 10px; + margin-bottom: 10px; + background-color: #f9f9f9; + border-radius: 5px; +} +.extra-info-entry strong { + display: block; + margin-bottom: 5px; +} +.extra-info-entry em { + color: #555; +} +.display-options-container { + text-align: center; + margin-bottom: 24px; + padding: 16px; + background: #e9ecef; + border-radius: 8px; +} +.display-options-container label { + margin: 0 12px; + cursor: pointer; +} +.display-options-container input { + margin-right: 8px; +} +.benchmark-note { + background-color: #cfe2ff; + color: #084298; + padding: 10px; + margin-bottom: 10px; + border-radius: 5px; + border-left: 4px solid #084298; + white-space: pre-line; +} +.benchmark-unstable { + background-color: #f8d7da; + color: #842029; + padding: 10px; + margin-bottom: 10px; + border-radius: 5px; + border-left: 4px solid #842029; + white-space: pre-line; +} +.note-text { + color: #084298; +} +.unstable-warning { + color: #842029; + font-weight: bold; +} +.unstable-text { + color: #842029; +} +.options-container { + margin-bottom: 24px; + background: #e9ecef; + border-radius: 8px; + overflow: hidden; +} +.options-container summary { + padding: 12px 16px; + font-weight: 500; + cursor: pointer; + background: #dee2e6; + user-select: none; +} +.options-container summary:hover { + background: #ced4da; +} +.options-content { + padding: 16px; + display: flex; + flex-wrap: wrap; + gap: 24px; +} +.filter-section { + flex: 1; + min-width: 300px; +} +.filter-section h3 { + margin-top: 0; + margin-bottom: 12px; + font-size: 18px; + font-weight: 500; + text-align: left; + display: flex; + align-items: center; +} +#suite-filters { + display: flex; + flex-wrap: wrap; + max-height: 200px; + overflow-y: auto; + border: 1px solid #dee2e6; + border-radius: 4px; + padding: 8px; + background-color: #f8f9fa; +} +.display-options { + display: flex; + flex-direction: column; + gap: 8px; +} +.display-options label { + display: flex; + align-items: center; + cursor: pointer; +} +.display-options input { + margin-right: 8px; +} +.benchmark-description { + background-color: #f2f2f2; + color: #333; + padding: 10px; + margin-bottom: 10px; + border-radius: 5px; + border-left: 4px solid #6c757d; + 
white-space: pre-line; + font-style: italic; +} +/* Tag styles */ +.benchmark-tags { + display: flex; + flex-wrap: wrap; + gap: 4px; + margin-bottom: 10px; +} + +.tag { + display: inline-block; + background-color: #e2e6ea; + color: #495057; + padding: 2px 8px; + border-radius: 12px; + font-size: 12px; + cursor: help; +} + +.tag-filter { + display: inline-flex; + align-items: center; + margin: 4px; +} + +.tag-filter label { + margin-left: 4px; + cursor: pointer; + display: flex; + align-items: center; +} + +.tag-info { + color: #0068B5; + margin-left: 4px; + cursor: help; + font-size: 12px; +} + +#tag-filters { + display: flex; + flex-wrap: wrap; + max-height: 200px; + overflow-y: auto; + border: 1px solid #dee2e6; + border-radius: 4px; + padding: 8px; + background-color: #f8f9fa; +} + +.tag-action-button { + padding: 2px 8px; + background: #e2e6ea; + border: none; + border-radius: 4px; + cursor: pointer; + font-size: 12px; + margin-left: 8px; + vertical-align: middle; +} + +.tag-action-button:hover { + background: #ced4da; +} + +.remove-tag { + background: none; + border: none; + color: white; + margin-left: 4px; + cursor: pointer; + font-size: 16px; + padding: 0 4px; +} + +.remove-tag:hover { + color: #f8d7da; +} diff --git a/unified-runtime/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py similarity index 74% rename from unified-runtime/scripts/benchmarks/main.py rename to devops/scripts/benchmarks/main.py index 4ad90b39b9001..14e5fe1a04624 100755 --- a/unified-runtime/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -17,6 +17,7 @@ from history import BenchmarkHistory from utils.utils import prepare_workdir from utils.compute_runtime import * +from presets import enabled_suites, presets import argparse import re @@ -27,23 +28,27 @@ def run_iterations( - benchmark: Benchmark, env_vars, iters: int, results: dict[str, list[Result]] + benchmark: Benchmark, + env_vars, + iters: int, + results: dict[str, list[Result]], + failures: dict[str, str], ): for iter in range(iters): - print(f"running {benchmark.name()}, iteration {iter}... ", end="", flush=True) + print(f"running {benchmark.name()}, iteration {iter}... ", flush=True) bench_results = benchmark.run(env_vars) if bench_results is None: - print(f"did not finish (OK for sycl-bench).") + failures[benchmark.name()] = "benchmark produced no results!" break for bench_result in bench_results: - # TODO: report failures in markdown/html ? if not bench_result.passed: - print(f"complete ({bench_result.label}: verification FAILED)") + failures[bench_result.label] = "verification failed" + print(f"complete ({bench_result.label}: verification failed).") continue print( - f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})." + f"{benchmark.name()} complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})." 
) bench_result.name = bench_result.label @@ -132,6 +137,18 @@ def process_results( return valid_results, processed +def collect_metadata(suites): + metadata = {} + + for s in suites: + metadata.update(s.additionalMetadata()) + suite_benchmarks = s.benchmarks() + for benchmark in suite_benchmarks: + metadata[benchmark.name()] = benchmark.get_metadata() + + return metadata + + def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) @@ -142,22 +159,29 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): options.extra_ld_libraries.extend(cr.ld_libraries()) options.extra_env_vars.update(cr.env_vars()) - suites = ( - [ - ComputeBench(directory), - VelocityBench(directory), - SyclBench(directory), - LlamaCppBench(directory), - UMFSuite(directory), - # TestSuite() - ] - if not options.dry_run - else [] - ) + suites = [ + ComputeBench(directory), + VelocityBench(directory), + SyclBench(directory), + LlamaCppBench(directory), + UMFSuite(directory), + TestSuite(), + ] + + # Collect metadata from all benchmarks without setting them up + metadata = collect_metadata(suites) + + # If dry run, we're done + if options.dry_run: + suites = [] benchmarks = [] + failures = {} for s in suites: + if s.name() not in enabled_suites(options.preset): + continue + suite_benchmarks = s.benchmarks() if filter: suite_benchmarks = [ @@ -170,25 +194,26 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): print(f"Setting up {type(s).__name__}") try: s.setup() - except: + except Exception as e: + failures[s.name()] = f"Suite setup failure: {e}" print(f"{type(s).__name__} setup failed. Benchmarks won't be added.") else: print(f"{type(s).__name__} setup complete.") benchmarks += suite_benchmarks - for b in benchmarks: - print(b.name()) - for benchmark in benchmarks: try: - print(f"Setting up {benchmark.name()}... ") + if options.verbose: + print(f"Setting up {benchmark.name()}... ") benchmark.setup() - print(f"{benchmark.name()} setup complete.") + if options.verbose: + print(f"{benchmark.name()} setup complete.") except Exception as e: if options.exit_on_failure: raise e else: + failures[benchmark.name()] = f"Benchmark setup failure: {e}" print(f"failed: {e}") results = [] @@ -199,7 +224,11 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): processed: list[Result] = [] for _ in range(options.iterations_stddev): run_iterations( - benchmark, merged_env_vars, options.iterations, intermediate_results + benchmark, + merged_env_vars, + options.iterations, + intermediate_results, + failures, ) valid, processed = process_results( intermediate_results, benchmark.stddev_threshold() @@ -211,12 +240,16 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): if options.exit_on_failure: raise e else: + failures[benchmark.name()] = f"Benchmark run failure: {e}" print(f"failed: {e}") for benchmark in benchmarks: - print(f"tearing down {benchmark.name()}... ", end="", flush=True) + # this never has any useful information anyway, so hide it behind verbose + if options.verbose: + print(f"tearing down {benchmark.name()}... 
", flush=True) benchmark.teardown() - print("complete.") + if options.verbose: + print("{benchmark.name()} teardown complete.") this_name = options.current_run_name chart_data = {} @@ -224,7 +257,10 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): if not options.dry_run: chart_data = {this_name: results} - history = BenchmarkHistory(directory) + results_dir = directory + if options.custom_results_dir: + results_dir = Path(options.custom_results_dir) + history = BenchmarkHistory(results_dir) # limit how many files we load. # should this be configurable? history.load(1000) @@ -241,14 +277,18 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): if options.output_markdown: markdown_content = generate_markdown( - this_name, chart_data, options.output_markdown + this_name, chart_data, failures, options.output_markdown ) - with open("benchmark_results.md", "w") as file: + md_path = options.output_directory + if options.output_directory is None: + md_path = os.getcwd() + + with open(os.path.join(md_path, "benchmark_results.md"), "w") as file: file.write(markdown_content) print( - f"Markdown with benchmark results has been written to {os.getcwd()}/benchmark_results.md" + f"Markdown with benchmark results has been written to {md_path}/benchmark_results.md" ) saved_name = save_name if save_name is not None else this_name @@ -262,14 +302,10 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): compare_names.append(saved_name) if options.output_html: - html_content = generate_html(history.runs, "intel/llvm", compare_names) - - with open("benchmark_results.html", "w") as file: - file.write(html_content) - - print( - f"HTML with benchmark results has been written to {os.getcwd()}/benchmark_results.html" - ) + html_path = options.output_directory + if options.output_directory is None: + html_path = os.path.join(os.path.dirname(__file__), "html") + generate_html(history.runs, compare_names, html_path, metadata) def validate_and_parse_env_args(env_args): @@ -297,7 +333,7 @@ def validate_and_parse_env_args(env_args): parser.add_argument( "--adapter", type=str, - help="Options to build the Unified Runtime as part of the benchmark", + help="Unified Runtime adapter to use.", default="level_zero", ) parser.add_argument( @@ -305,6 +341,11 @@ def validate_and_parse_env_args(env_args): help="Do not rebuild the benchmarks from scratch.", action="store_true", ) + parser.add_argument( + "--redownload", + help="Always download benchmark data dependencies, even if they already exist.", + action="store_true", + ) parser.add_argument( "--env", type=str, @@ -347,12 +388,6 @@ def validate_and_parse_env_args(env_args): help="Regex pattern to filter benchmarks by name.", default=None, ) - parser.add_argument( - "--epsilon", - type=float, - help="Threshold to consider change of performance significant", - default=options.epsilon, - ) parser.add_argument( "--verbose", help="Print output of all the commands.", action="store_true" ) @@ -379,7 +414,17 @@ def validate_and_parse_env_args(env_args): help="Specify whether markdown output should fit the content size limit for request validation", ) parser.add_argument( - "--output-html", help="Create HTML output", action="store_true", default=False + "--output-html", + help="Create HTML output. 
Local output is for direct local viewing of the html file, remote is for server deployment.", + nargs="?", + const=options.output_html, + choices=["local", "remote"], + ) + parser.add_argument( + "--output-dir", + type=str, + help="Location for output files, if --output-html or --output_markdown was specified.", + default=None, ) parser.add_argument( "--dry-run", @@ -423,6 +468,25 @@ def validate_and_parse_env_args(env_args): help="Directory for cublas library", default=None, ) + parser.add_argument( + "--preset", + type=str, + choices=[p for p in presets.keys()], + help="Benchmark preset to run.", + default=options.preset, + ) + parser.add_argument( + "--results-dir", + type=str, + help="Specify a custom results directory", + default=options.custom_results_dir, + ) + parser.add_argument( + "--build-jobs", + type=int, + help="Number of build jobs to run simultaneously", + default=options.build_jobs, + ) args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) @@ -430,10 +494,10 @@ def validate_and_parse_env_args(env_args): options.workdir = args.benchmark_directory options.verbose = args.verbose options.rebuild = not args.no_rebuild + options.redownload = args.redownload options.sycl = args.sycl options.iterations = args.iterations options.timeout = args.timeout - options.epsilon = args.epsilon options.ur = args.ur options.ur_adapter = args.adapter options.exit_on_failure = args.exit_on_failure @@ -448,12 +512,19 @@ def validate_and_parse_env_args(env_args): options.current_run_name = args.relative_perf options.cudnn_directory = args.cudnn_directory options.cublas_directory = args.cublas_directory + options.preset = args.preset + options.custom_results_dir = args.results_dir + options.build_jobs = args.build_jobs if args.build_igc and args.compute_runtime is None: parser.error("--build-igc requires --compute-runtime to be set") if args.compute_runtime is not None: options.build_compute_runtime = True options.compute_runtime_tag = args.compute_runtime + if args.output_dir is not None: + if not os.path.isdir(args.output_dir): + parser.error("Specified --output-dir is not a valid path") + options.output_directory = os.path.abspath(args.output_dir) benchmark_filter = re.compile(args.filter) if args.filter else None diff --git a/unified-runtime/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py similarity index 76% rename from unified-runtime/scripts/benchmarks/options.py rename to devops/scripts/benchmarks/options.py index 2e92675264544..c852e50c71372 100644 --- a/unified-runtime/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -1,5 +1,8 @@ from dataclasses import dataclass, field from enum import Enum +import multiprocessing + +from presets import presets class Compare(Enum): @@ -21,6 +24,7 @@ class Options: ur_adapter: str = None umf: str = None rebuild: bool = True + redownload: bool = False benchmark_cwd: str = "INVALID" timeout: float = 600 iterations: int = 3 @@ -28,18 +32,20 @@ class Options: compare: Compare = Compare.LATEST compare_max: int = 10 # average/median over how many results output_markdown: MarkdownSize = MarkdownSize.SHORT - output_html: bool = False + output_html: str = "local" + output_directory: str = None dry_run: bool = False - # these two should probably be merged into one setting stddev_threshold: float = 0.02 - epsilon: float = 0.02 iterations_stddev: int = 5 build_compute_runtime: bool = False extra_ld_libraries: list[str] = field(default_factory=list) extra_env_vars: dict = 
field(default_factory=dict) - compute_runtime_tag: str = "25.05.32567.12" + compute_runtime_tag: str = "25.05.32567.18" build_igc: bool = False current_run_name: str = "This PR" + preset: str = "Full" + custom_results_dir = None + build_jobs: int = multiprocessing.cpu_count() options = Options() diff --git a/devops/scripts/benchmarks/output_html.py b/devops/scripts/benchmarks/output_html.py new file mode 100644 index 0000000000000..319e796a3831d --- /dev/null +++ b/devops/scripts/benchmarks/output_html.py @@ -0,0 +1,58 @@ +# Copyright (C) 2024-2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import json +import os +from options import options +from utils.result import BenchmarkMetadata, BenchmarkOutput +from benches.base import benchmark_tags, benchmark_tags_dict + + +def generate_html( + benchmark_runs: list, + compare_names: list[str], + html_path: str, + metadata: dict[str, BenchmarkMetadata], +): + benchmark_runs.sort(key=lambda run: run.date, reverse=True) + + # Create the comprehensive output object + output = BenchmarkOutput( + runs=benchmark_runs, + metadata=metadata, + tags=benchmark_tags_dict, + default_compare_names=compare_names, + ) + + if options.output_html == "local": + data_path = os.path.join(html_path, "data.js") + with open(data_path, "w") as f: + # For local format, we need to write JavaScript variable assignments + f.write("benchmarkRuns = ") + json.dump(json.loads(output.to_json())["runs"], f, indent=2) + f.write(";\n\n") + + f.write("benchmarkMetadata = ") + json.dump(json.loads(output.to_json())["metadata"], f, indent=2) + f.write(";\n\n") + + f.write("benchmarkTags = ") + json.dump(json.loads(output.to_json())["tags"], f, indent=2) + f.write(";\n\n") + + f.write("defaultCompareNames = ") + json.dump(output.default_compare_names, f, indent=2) + f.write(";\n") + + print(f"See {os.getcwd()}/html/index.html for the results.") + else: + # For remote format, we write a single JSON file + data_path = os.path.join(html_path, "data.json") + with open(data_path, "w") as f: + json.dump(json.loads(output.to_json()), f, indent=2) + + print( + f"Upload {data_path} to a location set in config.js remoteDataUrl argument." + ) diff --git a/unified-runtime/scripts/benchmarks/output_markdown.py b/devops/scripts/benchmarks/output_markdown.py similarity index 92% rename from unified-runtime/scripts/benchmarks/output_markdown.py rename to devops/scripts/benchmarks/output_markdown.py index dd6711cec6365..3295968603d0c 100644 --- a/unified-runtime/scripts/benchmarks/output_markdown.py +++ b/devops/scripts/benchmarks/output_markdown.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import collections -from benches.result import Result +from utils.result import Result from options import options, MarkdownSize import ast @@ -79,7 +79,7 @@ def get_improved_regressed_summary(is_improved: bool, rows_count: int): "\n
\n" "\n" f"{title} {rows_count} " - f"(threshold {options.epsilon*100:.2f}%)\n" + f"(threshold {options.stddev_threshold*100:.2f}%)\n" "\n\n" ) @@ -138,17 +138,6 @@ def generate_markdown_details( env_dict = res.env command = res.command - # If data is collected from already saved results, - # the content is parsed as strings - if isinstance(res.env, str): - # Since the scripts would be used solely on data prepared - # by our scripts, this should be safe - # However, maybe needs an additional blessing - # https://docs.python.org/3/library/ast.html#ast.literal_eval - env_dict = ast.literal_eval(res.env) - if isinstance(res.command, str): - command = ast.literal_eval(res.command) - section = ( "\n
\n" f"{res.label}\n\n" @@ -179,7 +168,7 @@ def generate_markdown_details( return "\nBenchmark details contain too many chars to display\n" -def generate_summary_table_and_chart( +def generate_summary_table( chart_data: dict[str, list[Result]], baseline_name: str, markdown_size: MarkdownSize ): summary_table = get_chart_markdown_header( @@ -276,7 +265,7 @@ def generate_summary_table_and_chart( delta = oln.diff - 1 oln.row += f" {delta*100:.2f}%" - if abs(delta) > options.epsilon: + if abs(delta) > options.stddev_threshold: if delta > 0: improved_rows.append(oln.row + " | \n") else: @@ -374,10 +363,27 @@ def generate_summary_table_and_chart( return "\n# Summary\n" "Benchmark output is too large to display\n\n" +def generate_failures_section(failures: dict[str, str]) -> str: + if not failures: + return "" + + section = "\n# Failures\n" + section += "| Name | Failure |\n" + section += "|---|---|\n" + + for name, failure in failures.items(): + section += f"| {name} | {failure} |\n" + + return section + + def generate_markdown( - name: str, chart_data: dict[str, list[Result]], markdown_size: MarkdownSize + name: str, + chart_data: dict[str, list[Result]], + failures: dict[str, str], + markdown_size: MarkdownSize, ): - (summary_line, summary_table) = generate_summary_table_and_chart( + (summary_line, summary_table) = generate_summary_table( chart_data, name, markdown_size ) @@ -396,4 +402,6 @@ def generate_markdown( ) generated_markdown += "\n# Details\n" f"{markdown_details}\n" - return generated_markdown + failures_section = generate_failures_section(failures) + + return failures_section + generated_markdown diff --git a/devops/scripts/benchmarks/presets.py b/devops/scripts/benchmarks/presets.py new file mode 100644 index 0000000000000..3f191766deb8c --- /dev/null +++ b/devops/scripts/benchmarks/presets.py @@ -0,0 +1,38 @@ +# Copyright (C) 2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +presets: dict[str, list[str]] = { + "Full": [ + "Compute Benchmarks", + "llama.cpp bench", + "SYCL-Bench", + "Velocity Bench", + "UMF", + ], + "SYCL": [ + "Compute Benchmarks", + "llama.cpp bench", + "SYCL-Bench", + "Velocity Bench", + ], + "Minimal": [ + "Compute Benchmarks", + ], + "Normal": [ + "Compute Benchmarks", + "llama.cpp bench", + "Velocity Bench", + ], + "Test": [ + "Test Suite", + ], +} + + +def enabled_suites(preset: str) -> list[str]: + try: + return presets[preset] + except KeyError: + raise ValueError(f"Preset '{preset}' not found.") diff --git a/unified-runtime/scripts/benchmarks/requirements.txt b/devops/scripts/benchmarks/requirements.txt similarity index 85% rename from unified-runtime/scripts/benchmarks/requirements.txt rename to devops/scripts/benchmarks/requirements.txt index 99ba0caab55c2..9f0381ceef6c2 100644 --- a/unified-runtime/scripts/benchmarks/requirements.txt +++ b/devops/scripts/benchmarks/requirements.txt @@ -2,3 +2,4 @@ matplotlib==3.9.2 mpld3==0.5.10 dataclasses-json==0.6.7 PyYAML==6.0.1 +Mako==1.3.9 diff --git a/unified-runtime/scripts/benchmarks/utils/compute_runtime.py b/devops/scripts/benchmarks/utils/compute_runtime.py similarity index 94% rename from unified-runtime/scripts/benchmarks/utils/compute_runtime.py rename to devops/scripts/benchmarks/utils/compute_runtime.py index 74d8ff4eb5345..e617168f37a76 100644 --- a/unified-runtime/scripts/benchmarks/utils/compute_runtime.py +++ b/devops/scripts/benchmarks/utils/compute_runtime.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -62,7 +62,7 @@ def build_gmmlib(self, repo, commit): f"-DCMAKE_BUILD_TYPE=Release", ] run(configure_command) - run(f"cmake --build {self.gmmlib_build} -j") + run(f"cmake --build {self.gmmlib_build} -j {options.build_jobs}") run(f"cmake --install {self.gmmlib_build}") return self.gmmlib_install @@ -87,7 +87,7 @@ def build_level_zero(self, repo, commit): f"-DCMAKE_BUILD_TYPE=Release", ] run(configure_command) - run(f"cmake --build {self.level_zero_build} -j") + run(f"cmake --build {self.level_zero_build} -j {options.build_jobs}") run(f"cmake --install {self.level_zero_build}") return self.level_zero_install @@ -142,8 +142,11 @@ def build_igc(self, repo, commit): ] run(configure_command) - # set timeout to 30min. IGC takes A LONG time to build if building from scratch. - run(f"cmake --build {self.igc_build} -j", timeout=600 * 3) + # set timeout to 2h. IGC takes A LONG time to build if building from scratch. + run( + f"cmake --build {self.igc_build} -j {options.build_jobs}", + timeout=60 * 60 * 2, + ) # cmake --install doesn't work... 
run("make install", cwd=self.igc_build) return self.igc_install @@ -214,7 +217,7 @@ def build_compute_runtime(self): configure_command.append(f"-DIGC_DIR={self.igc}") run(configure_command) - run(f"cmake --build {self.compute_runtime_build} -j") + run(f"cmake --build {self.compute_runtime_build} -j {options.build_jobs}") return self.compute_runtime_build diff --git a/unified-runtime/scripts/benchmarks/benches/oneapi.py b/devops/scripts/benchmarks/utils/oneapi.py similarity index 78% rename from unified-runtime/scripts/benchmarks/benches/oneapi.py rename to devops/scripts/benchmarks/utils/oneapi.py index 0547f6646e39e..fc27b9a8b2d3e 100644 --- a/unified-runtime/scripts/benchmarks/benches/oneapi.py +++ b/devops/scripts/benchmarks/utils/oneapi.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -7,29 +7,33 @@ from utils.utils import download, run from options import options import os +import hashlib class OneAPI: - # random unique number for benchmark oneAPI installation - ONEAPI_BENCHMARK_INSTANCE_ID = 987654 - def __init__(self): self.oneapi_dir = os.path.join(options.workdir, "oneapi") Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True) - # delete if some option is set? + self.oneapi_instance_id = self.generate_unique_oneapi_id(self.oneapi_dir) # can we just hardcode these links? self.install_package( "dnnl", "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh", + "6866feb5b8dfefd6ff45d6bfabed44f01d7fba8fd452480ae1fd86b92e9481ae052c24842da14f112f672f5c4859945b", ) self.install_package( "mkl", "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh", + "122bb84cf943ea27753cb399c81ab2ae218ebd51b789c74d273240157722925ab4d5a43cb0b5de41b854f2c5a59a4002", ) return - def install_package(self, name, url): + def generate_unique_oneapi_id(self, path): + hash_object = hashlib.md5(path.encode()) + return hash_object.hexdigest() + + def install_package(self, name, url, checksum): package_path = os.path.join(self.oneapi_dir, name) if Path(package_path).exists(): print( @@ -37,11 +41,13 @@ def install_package(self, name, url): ) return - package = download(self.oneapi_dir, url, f"package_{name}.sh") + package = download( + self.oneapi_dir, url, f"package_{name}.sh", checksum=checksum + ) try: print(f"installing {name}") run( - f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}" + f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance {self.oneapi_instance_id}" ) except: print("oneAPI installation likely exists already") diff --git a/devops/scripts/benchmarks/utils/result.py b/devops/scripts/benchmarks/utils/result.py new file mode 100644 index 0000000000000..b9ebfdcb60952 --- /dev/null +++ b/devops/scripts/benchmarks/utils/result.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024-2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from dataclasses import dataclass, field +from typing import Optional, Dict, List, Any +from dataclasses_json import config, dataclass_json +from datetime import datetime + + +@dataclass_json +@dataclass +class Result: + label: str + value: float + command: list[str] + env: dict[str, str] + stdout: str + passed: bool = True + unit: str = "" + explicit_group: str = "" + # stddev can be optionally set by the benchmark, + # if not set, it will be calculated automatically. + stddev: float = 0.0 + # values below should not be set by the benchmark + name: str = "" + lower_is_better: bool = True + suite: str = "Unknown" + git_url: str = "" + git_hash: str = "" + +@dataclass_json +@dataclass +class BenchmarkRun: + results: list[Result] + name: str = "This PR" + hostname: str = "Unknown" + git_hash: str = "" + github_repo: str = None + date: datetime = field( + default=None, + metadata=config(encoder=datetime.isoformat, decoder=datetime.fromisoformat), + ) + + +@dataclass_json +@dataclass +class BenchmarkTag: + name: str + description: str = "" + + +@dataclass_json +@dataclass +class BenchmarkMetadata: + type: str = "benchmark" # or 'group' + description: Optional[str] = None + notes: Optional[str] = None + unstable: Optional[str] = None + tags: list[str] = field(default_factory=list) # Changed to list of tag names + + +@dataclass_json +@dataclass +class BenchmarkOutput: + runs: list[BenchmarkRun] + metadata: Dict[str, BenchmarkMetadata] + tags: Dict[str, BenchmarkTag] + default_compare_names: List[str] = field(default_factory=list) diff --git a/unified-runtime/scripts/benchmarks/utils/utils.py b/devops/scripts/benchmarks/utils/utils.py similarity index 81% rename from unified-runtime/scripts/benchmarks/utils/utils.py rename to devops/scripts/benchmarks/utils/utils.py index 3a516e8d724f7..54f2ef7fb9c1f 100644 --- a/unified-runtime/scripts/benchmarks/utils/utils.py +++ b/devops/scripts/benchmarks/utils/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -12,6 +12,7 @@ import urllib # nosec B404 from options import options from pathlib import Path +import hashlib def run( @@ -45,6 +46,12 @@ def run( env.update(env_vars) + if options.verbose: + command_str = " ".join(command) + env_str = " ".join(f"{key}={value}" for key, value in env_vars.items()) + full_command_str = f"{env_str} {command_str}".strip() + print(f"Running: {full_command_str}") + result = subprocess.run( command, cwd=cwd, @@ -107,7 +114,7 @@ def prepare_workdir(dir, version): shutil.rmtree(dir) else: raise Exception( - f"The directory {dir} exists but is a benchmark work directory." + f"The directory {dir} exists but is not a benchmark work directory." 
) os.makedirs(dir) @@ -128,11 +135,26 @@ def create_build_path(directory, name): return build_path -def download(dir, url, file, untar=False, unzip=False): +def calculate_checksum(file_path): + sha_hash = hashlib.sha384() + with open(file_path, "rb") as f: + for byte_block in iter(lambda: f.read(4096), b""): + sha_hash.update(byte_block) + return sha_hash.hexdigest() + + +def download(dir, url, file, untar=False, unzip=False, checksum=""): data_file = os.path.join(dir, file) if not Path(data_file).exists(): print(f"{data_file} does not exist, downloading") urllib.request.urlretrieve(url, data_file) + calculated_checksum = calculate_checksum(data_file) + if calculated_checksum != checksum: + print( + f"Checksum mismatch: expected {checksum}, got {calculated_checksum}. Refusing to continue." + ) + exit(1) + if untar: file = tarfile.open(data_file) file.extractall(dir) diff --git a/unified-runtime/scripts/benchmarks/workflow.png b/devops/scripts/benchmarks/workflow.png similarity index 100% rename from unified-runtime/scripts/benchmarks/workflow.png rename to devops/scripts/benchmarks/workflow.png diff --git a/unified-runtime/.github/scripts/get_system_info.sh b/devops/scripts/get_system_info.sh similarity index 100% rename from unified-runtime/.github/scripts/get_system_info.sh rename to devops/scripts/get_system_info.sh diff --git a/unified-runtime/scripts/benchmarks/benches/compute.py b/unified-runtime/scripts/benchmarks/benches/compute.py deleted file mode 100644 index 4658a3414e16a..0000000000000 --- a/unified-runtime/scripts/benchmarks/benches/compute.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (C) 2024-2025 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import os -import csv -import io -from utils.utils import run, git_clone, create_build_path -from .base import Benchmark, Suite -from .result import Result -from options import options -from enum import Enum - -class ComputeBench(Suite): - def __init__(self, directory): - self.directory = directory - - def name(self) -> str: - return "Compute Benchmarks" - - def setup(self): - if options.sycl is None: - return - - repo_path = git_clone( - self.directory, - "compute-benchmarks-repo", - "https://github.com/intel/compute-benchmarks.git", - "dfdbf2ff9437ee159627cc2cd9159c289da1a7ba", - ) - build_path = create_build_path(self.directory, "compute-benchmarks-build") - - configure_command = [ - "cmake", - f"-B {build_path}", - f"-S {repo_path}", - f"-DCMAKE_BUILD_TYPE=Release", - f"-DBUILD_SYCL=ON", - f"-DSYCL_COMPILER_ROOT={options.sycl}", - f"-DALLOW_WARNINGS=ON", - ] - - if options.ur is not None: - configure_command += [ - f"-DBUILD_UR=ON", - f"-Dunified-runtime_DIR={options.ur}/lib/cmake/unified-runtime", - ] - - print(f"{self.__class__.__name__}: Run {configure_command}") - run(configure_command, add_sycl=True) - print(f"{self.__class__.__name__}: Run cmake --build {build_path} -j") - run(f"cmake --build {build_path} -j", add_sycl=True) - - self.built = True - - def benchmarks(self) -> list[Benchmark]: - if options.sycl is None: - return [] - - if options.ur_adapter == "cuda": - return [] - - benches = [ - SubmitKernelL0(self, 0), - SubmitKernelL0(self, 1), - SubmitKernelSYCL(self, 0), - SubmitKernelSYCL(self, 1), - QueueInOrderMemcpy(self, 0, "Device", "Device", 1024), - QueueInOrderMemcpy(self, 0, "Host", "Device", 1024), - QueueMemcpy(self, "Device", "Device", 1024), - StreamMemory(self, 
"Triad", 10 * 1024, "Device"), - ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024), - ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024), - VectorSum(self), - MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1), - MemcpyExecute(self, 400, 8, 1024, 100, 1, 1, 1), - MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1), - MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0), - GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 5), - GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 5), - GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 100), - GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 100), - GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 0, 5), - GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 1, 5), - GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 0, 100), - GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 1, 100), - ] - - if options.ur is not None: - benches += [ - SubmitKernelUR(self, 0, 0), - SubmitKernelUR(self, 1, 0), - SubmitKernelUR(self, 1, 1), - GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 5), - GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 5), - GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 100), - GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 100), - ] - - return benches - - -def parse_unit_type(compute_unit): - if "[count]" in compute_unit: - return "instr" - elif "[us]" in compute_unit: - return "μs" - return compute_unit.replace("[", "").replace("]", "") - - -class ComputeBenchmark(Benchmark): - def __init__(self, bench, name, test): - super().__init__(bench.directory, bench) - self.bench = bench - self.bench_name = name - self.test = test - - def bin_args(self) -> list[str]: - return [] - - def extra_env_vars(self) -> dict: - return {} - - def setup(self): - self.benchmark_bin = os.path.join( - self.bench.directory, "compute-benchmarks-build", "bin", self.bench_name - ) - - def explicit_group(self): - return "" - - def run(self, env_vars) -> list[Result]: - command = [ - f"{self.benchmark_bin}", - f"--test={self.test}", - "--csv", - "--noHeaders", - ] - - command += self.bin_args() - env_vars.update(self.extra_env_vars()) - - result = self.run_bench(command, env_vars) - parsed_results = self.parse_output(result) - ret = [] - for label, median, stddev, unit in parsed_results: - extra_label = " CPU count" if parse_unit_type(unit) == "instr" else "" - explicit_group = ( - self.explicit_group() + extra_label - if self.explicit_group() != "" - else "" - ) - ret.append( - Result( - label=self.name() + extra_label, - explicit_group=explicit_group, - value=median, - stddev=stddev, - command=command, - env=env_vars, - stdout=result, - unit=parse_unit_type(unit), - ) - ) - return ret - - def parse_output(self, output): - csv_file = io.StringIO(output) - reader = csv.reader(csv_file) - next(reader, None) - results = [] - while True: - data_row = next(reader, None) - if data_row is None: - break - try: - label = data_row[0] - mean = float(data_row[1]) - median = float(data_row[2]) - # compute benchmarks report stddev as % - stddev = mean * (float(data_row[3].strip("%")) / 100.0) - unit = data_row[7] - results.append((label, median, stddev, unit)) - except (ValueError, IndexError) as e: - raise ValueError(f"Error parsing output: {e}") - if len(results) == 0: - raise ValueError("Benchmark output does not contain data.") - return results - - def teardown(self): - return - - -class SubmitKernelSYCL(ComputeBenchmark): - def __init__(self, bench, ioq): - self.ioq = ioq - super().__init__(bench, "api_overhead_benchmark_sycl", "SubmitKernel") - - def name(self): - order = "in order" if 
self.ioq else "out of order" - return f"api_overhead_benchmark_sycl SubmitKernel {order}" - - def explicit_group(self): - return "SubmitKernel" - - def bin_args(self) -> list[str]: - return [ - f"--Ioq={self.ioq}", - "--DiscardEvents=0", - "--MeasureCompletion=0", - "--iterations=100000", - "--Profiling=0", - "--NumKernels=10", - "--KernelExecTime=1", - ] - - -class SubmitKernelUR(ComputeBenchmark): - def __init__(self, bench, ioq, measureCompletion): - self.ioq = ioq - self.measureCompletion = measureCompletion - super().__init__(bench, "api_overhead_benchmark_ur", "SubmitKernel") - - def name(self): - order = "in order" if self.ioq else "out of order" - return f"api_overhead_benchmark_ur SubmitKernel {order}" + ( - " with measure completion" if self.measureCompletion else "" - ) - - def explicit_group(self): - return "SubmitKernel" - - def bin_args(self) -> list[str]: - return [ - f"--Ioq={self.ioq}", - "--DiscardEvents=0", - f"--MeasureCompletion={self.measureCompletion}", - "--iterations=100000", - "--Profiling=0", - "--NumKernels=10", - "--KernelExecTime=1", - ] - - -class SubmitKernelL0(ComputeBenchmark): - def __init__(self, bench, ioq): - self.ioq = ioq - super().__init__(bench, "api_overhead_benchmark_l0", "SubmitKernel") - - def name(self): - order = "in order" if self.ioq else "out of order" - return f"api_overhead_benchmark_l0 SubmitKernel {order}" - - def explicit_group(self): - return "SubmitKernel" - - def bin_args(self) -> list[str]: - return [ - f"--Ioq={self.ioq}", - "--DiscardEvents=0", - "--MeasureCompletion=0", - "--iterations=100000", - "--Profiling=0", - "--NumKernels=10", - "--KernelExecTime=1", - ] - - -class ExecImmediateCopyQueue(ComputeBenchmark): - def __init__(self, bench, ioq, isCopyOnly, source, destination, size): - self.ioq = ioq - self.isCopyOnly = isCopyOnly - self.source = source - self.destination = destination - self.size = size - super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue") - - def name(self): - order = "in order" if self.ioq else "out of order" - return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}" - - def bin_args(self) -> list[str]: - return [ - "--iterations=100000", - f"--ioq={self.ioq}", - f"--IsCopyOnly={self.isCopyOnly}", - "--MeasureCompletionTime=0", - f"--src={self.destination}", - f"--dst={self.destination}", - f"--size={self.size}", - ] - - -class QueueInOrderMemcpy(ComputeBenchmark): - def __init__(self, bench, isCopyOnly, source, destination, size): - self.isCopyOnly = isCopyOnly - self.source = source - self.destination = destination - self.size = size - super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy") - - def name(self): - return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}" - - def bin_args(self) -> list[str]: - return [ - "--iterations=10000", - f"--IsCopyOnly={self.isCopyOnly}", - f"--sourcePlacement={self.source}", - f"--destinationPlacement={self.destination}", - f"--size={self.size}", - "--count=100", - ] - - -class QueueMemcpy(ComputeBenchmark): - def __init__(self, bench, source, destination, size): - self.source = source - self.destination = destination - self.size = size - super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy") - - def name(self): - return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}" - - def bin_args(self) -> list[str]: - return [ - "--iterations=10000", - 
f"--sourcePlacement={self.source}", - f"--destinationPlacement={self.destination}", - f"--size={self.size}", - ] - - -class StreamMemory(ComputeBenchmark): - def __init__(self, bench, type, size, placement): - self.type = type - self.size = size - self.placement = placement - super().__init__(bench, "memory_benchmark_sycl", "StreamMemory") - - def name(self): - return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}" - - # measurement is in GB/s - def lower_is_better(self): - return False - - def bin_args(self) -> list[str]: - return [ - "--iterations=10000", - f"--type={self.type}", - f"--size={self.size}", - f"--memoryPlacement={self.placement}", - "--useEvents=0", - "--contents=Zeros", - "--multiplier=1", - ] - - -class VectorSum(ComputeBenchmark): - def __init__(self, bench): - super().__init__(bench, "miscellaneous_benchmark_sycl", "VectorSum") - - def name(self): - return f"miscellaneous_benchmark_sycl VectorSum" - - def bin_args(self) -> list[str]: - return [ - "--iterations=1000", - "--numberOfElementsX=512", - "--numberOfElementsY=256", - "--numberOfElementsZ=256", - ] - - -class MemcpyExecute(ComputeBenchmark): - def __init__( - self, - bench, - numOpsPerThread, - numThreads, - allocSize, - iterations, - srcUSM, - dstUSM, - useEvent, - ): - self.numOpsPerThread = numOpsPerThread - self.numThreads = numThreads - self.allocSize = allocSize - self.iterations = iterations - self.srcUSM = srcUSM - self.dstUSM = dstUSM - self.useEvents = useEvent - super().__init__(bench, "multithread_benchmark_ur", "MemcpyExecute") - - def name(self): - return ( - f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}" - + (" without events" if not self.useEvents else "") - ) - - def bin_args(self) -> list[str]: - return [ - "--Ioq=1", - f"--UseEvents={self.useEvents}", - "--MeasureCompletion=1", - "--UseQueuePerThread=1", - f"--AllocSize={self.allocSize}", - f"--NumThreads={self.numThreads}", - f"--NumOpsPerThread={self.numOpsPerThread}", - f"--iterations={self.iterations}", - f"--SrcUSM={self.srcUSM}", - f"--DstUSM={self.dstUSM}", - ] - - -class RUNTIMES(Enum): - SYCL = "sycl" - LEVEL_ZERO = "l0" - UR = "ur" - - -class GraphApiSinKernelGraph(ComputeBenchmark): - def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels): - self.withGraphs = withGraphs - self.numKernels = numKernels - self.runtime = runtime - super().__init__( - bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph" - ) - - def explicit_group(self): - return f"SinKernelGraph {self.numKernels}" - - def name(self): - return f"graph_api_benchmark_{self.runtime.value} SinKernelGraph graphs:{self.withGraphs}, numKernels:{self.numKernels}" - - def bin_args(self) -> list[str]: - return [ - "--iterations=10000", - f"--numKernels={self.numKernels}", - f"--withGraphs={self.withGraphs}", - "--withCopyOffload=1", - "--immediateAppendCmdList=0", - ] - - -class GraphApiSubmitExecGraph(ComputeBenchmark): - def __init__(self, bench, ioq, submit, numKernels): - self.ioq = ioq - self.submit = submit - self.numKernels = numKernels - super().__init__(bench, "graph_api_benchmark_sycl", "SubmitExecGraph") - - def name(self): - return f"graph_api_benchmark_sycl SubmitExecGraph ioq:{self.ioq}, submit:{self.submit}, numKernels:{self.numKernels}" - - def explicit_group(self): - if self.submit: - return "SubmitGraph" - else: - return "ExecGraph" - - def bin_args(self) -> 
list[str]: - return [ - "--iterations=100", - f"--measureSubmit={self.submit}", - f"--ioq={self.ioq}", - f"--numKernels={self.numKernels}", - ] diff --git a/unified-runtime/scripts/benchmarks/benches/result.py b/unified-runtime/scripts/benchmarks/benches/result.py deleted file mode 100644 index 52a098d91c24a..0000000000000 --- a/unified-runtime/scripts/benchmarks/benches/result.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from dataclasses import dataclass -from typing import Optional -from dataclasses_json import dataclass_json -from datetime import datetime - - -@dataclass_json -@dataclass -class Result: - label: str - value: float - command: str - env: str - stdout: str - passed: bool = True - unit: str = "" - explicit_group: str = "" - # stddev can be optionally set by the benchmark, - # if not set, it will be calculated automatically. - stddev: float = 0.0 - # values below should not be set by the benchmark - name: str = "" - lower_is_better: bool = True - git_hash: str = "" - date: Optional[datetime] = None - suite: str = "Unknown" - - -@dataclass_json -@dataclass -class BenchmarkRun: - results: list[Result] - name: str = "This PR" - git_hash: str = "" - date: datetime = None diff --git a/unified-runtime/scripts/benchmarks/benches/test.py b/unified-runtime/scripts/benchmarks/benches/test.py deleted file mode 100644 index 06eac12b25344..0000000000000 --- a/unified-runtime/scripts/benchmarks/benches/test.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import random -from utils.utils import git_clone -from .base import Benchmark, Suite -from .result import Result -from utils.utils import run, create_build_path -from options import options -import os - - -class TestSuite(Suite): - def __init__(self): - return - - def setup(self): - return - - def benchmarks(self) -> list[Benchmark]: - bench_configs = [ - ("Memory Bandwidth", 2000, 200, "Foo Group"), - ("Latency", 100, 20, "Bar Group"), - ("Throughput", 1500, 150, "Foo Group"), - ("FLOPS", 3000, 300, "Foo Group"), - ("Cache Miss Rate", 250, 25, "Bar Group"), - ] - - result = [] - for base_name, base_value, base_diff, group in bench_configs: - for variant in range(6): - value_multiplier = 1.0 + (variant * 0.2) - name = f"{base_name} {variant+1}" - value = base_value * value_multiplier - diff = base_diff * value_multiplier - - result.append(TestBench(name, value, diff, group)) - - return result - - -class TestBench(Benchmark): - def __init__(self, name, value, diff, group=""): - self.bname = name - self.value = value - self.diff = diff - self.group = group - super().__init__("") - - def name(self): - return self.bname - - def lower_is_better(self): - return True - - def setup(self): - return - - def run(self, env_vars) -> list[Result]: - random_value = self.value + random.uniform(-1 * (self.diff), self.diff) - return [ - Result( - label=self.name(), - explicit_group=self.group, - value=random_value, - command="", - env={"A": "B"}, - stdout="no output", - unit="ms", - ) - ] - - def teardown(self): - return diff --git a/unified-runtime/scripts/benchmarks/benchmark_results.html.template b/unified-runtime/scripts/benchmarks/benchmark_results.html.template deleted file mode 100644 index 1deeedad66b00..0000000000000 --- a/unified-runtime/scripts/benchmarks/benchmark_results.html.template +++ /dev/null @@ -1,192 +0,0 @@ - - - - - - Benchmark Results - - - - -
[192-line deleted HTML template; the markup is not recoverable from this extract. The removed page was titled "Benchmark Results" and contained suite filter checkboxes (${suite_checkboxes_html}), a "Historical Results" section rendering ${timeseries_charts_html}, and a "Comparisons" section rendering ${bar_charts_html}.]
- - diff --git a/unified-runtime/scripts/benchmarks/output_html.py b/unified-runtime/scripts/benchmarks/output_html.py deleted file mode 100644 index 4ba395bc3aac6..0000000000000 --- a/unified-runtime/scripts/benchmarks/output_html.py +++ /dev/null @@ -1,340 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import re -import os -from pathlib import Path -import matplotlib.pyplot as plt -import mpld3 -from collections import defaultdict -from dataclasses import dataclass -import matplotlib.dates as mdates -from benches.result import BenchmarkRun, Result -import numpy as np -from string import Template - - -@dataclass -class BenchmarkMetadata: - unit: str - suite: str - lower_is_better: bool - - -@dataclass -class BenchmarkSeries: - label: str - metadata: BenchmarkMetadata - runs: list[BenchmarkRun] - - -@dataclass -class BenchmarkChart: - label: str - suite: str - html: str - - -def tooltip_css() -> str: - return ".mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}" - - -def create_time_series_chart( - benchmarks: list[BenchmarkSeries], github_repo: str -) -> list[BenchmarkChart]: - plt.close("all") - - num_benchmarks = len(benchmarks) - if num_benchmarks == 0: - return [] - - html_charts = [] - - for _, benchmark in enumerate(benchmarks): - fig, ax = plt.subplots(figsize=(10, 4)) - - all_values = [] - all_stddevs = [] - - for run in benchmark.runs: - sorted_points = sorted(run.results, key=lambda x: x.date) - dates = [point.date for point in sorted_points] - values = [point.value for point in sorted_points] - stddevs = [point.stddev for point in sorted_points] - - all_values.extend(values) - all_stddevs.extend(stddevs) - - ax.errorbar(dates, values, yerr=stddevs, fmt="-", label=run.name, alpha=0.5) - scatter = ax.scatter(dates, values, picker=True) - - tooltip_labels = [ - f"Date: {point.date.strftime('%Y-%m-%d %H:%M:%S')}\n" - f"Value: {point.value:.2f} {benchmark.metadata.unit}\n" - f"Stddev: {point.stddev:.2f} {benchmark.metadata.unit}\n" - f"Git Hash: {point.git_hash}" - for point in sorted_points - ] - - targets = [ - f"https://github.com/{github_repo}/commit/{point.git_hash}" - for point in sorted_points - ] - - tooltip = mpld3.plugins.PointHTMLTooltip( - scatter, tooltip_labels, css=tooltip_css(), targets=targets - ) - mpld3.plugins.connect(fig, tooltip) - - ax.set_title(benchmark.label, pad=20) - performance_indicator = ( - "lower is better" - if benchmark.metadata.lower_is_better - else "higher is better" - ) - ax.text( - 0.5, - 1.05, - f"({performance_indicator})", - ha="center", - transform=ax.transAxes, - style="italic", - fontsize=7, - color="#666666", - ) - - ax.set_xlabel("") - unit = benchmark.metadata.unit - ax.set_ylabel(f"Value ({unit})" if unit else "Value") - ax.grid(True, alpha=0.2) - ax.legend(bbox_to_anchor=(1, 1), loc="upper left") - ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter("%Y-%m-%d %H:%M:%S")) - - plt.tight_layout() - html_charts.append( - BenchmarkChart( - html=mpld3.fig_to_html(fig), - label=benchmark.label, - suite=benchmark.metadata.suite, - ) - ) - plt.close(fig) - - return html_charts - - -@dataclass -class ExplicitGroup: - name: str - nnames: int - metadata: BenchmarkMetadata - runs: dict[str, dict[str, Result]] - - -def create_explicit_groups( - benchmark_runs: list[BenchmarkRun], compare_names: 
list[str] -) -> list[ExplicitGroup]: - groups = {} - - for run in benchmark_runs: - if run.name in compare_names: - for res in run.results: - if res.explicit_group != "": - if res.explicit_group not in groups: - groups[res.explicit_group] = ExplicitGroup( - name=res.explicit_group, - nnames=len(compare_names), - metadata=BenchmarkMetadata( - unit=res.unit, - lower_is_better=res.lower_is_better, - suite=res.suite, - ), - runs={}, - ) - - group = groups[res.explicit_group] - if res.label not in group.runs: - group.runs[res.label] = {name: None for name in compare_names} - - if group.runs[res.label][run.name] is None: - group.runs[res.label][run.name] = res - - return list(groups.values()) - - -def create_grouped_bar_charts(groups: list[ExplicitGroup]) -> list[BenchmarkChart]: - plt.close("all") - - html_charts = [] - - for group in groups: - fig, ax = plt.subplots(figsize=(10, 6)) - - x = np.arange(group.nnames) - x_labels = [] - width = 0.8 / len(group.runs) - - max_height = 0 - - for i, (run_name, run_results) in enumerate(group.runs.items()): - offset = width * i - - positions = x + offset - x_labels = run_results.keys() - valid_data = [r.value if r is not None else 0 for r in run_results.values()] - rects = ax.bar(positions, valid_data, width, label=run_name) - # This is a hack to disable all bar_label. Setting labels to empty doesn't work. - # We create our own labels below for each bar, this works better in mpld3. - ax.bar_label(rects, fmt="") - - for rect, run, res in zip(rects, run_results.keys(), run_results.values()): - if res is None: - continue - - height = rect.get_height() - if height > max_height: - max_height = height - - ax.text( - rect.get_x() + rect.get_width() / 2.0, - height + 1, - f"{res.value:.1f}", - ha="center", - va="bottom", - fontsize=9, - ) - - tooltip_labels = [ - f"Date: {res.date.strftime('%Y-%m-%d %H:%M:%S')}\n" - f"Run: {run}\n" - f"Label: {res.label}\n" - f"Value: {res.value:.2f} {res.unit}\n" - f"Stddev: {res.stddev:.2f} {res.unit}\n" - ] - tooltip = mpld3.plugins.LineHTMLTooltip( - rect, tooltip_labels, css=tooltip_css() - ) - mpld3.plugins.connect(ax.figure, tooltip) - - # normally we'd just set legend to be outside - # the chart, but this is not supported by mpld3. - # instead, we adjust the y axis to account for - # the height of the bars. - legend_height = len(group.runs) * 0.1 - ax.set_ylim(0, max_height * (1 + legend_height)) - - ax.set_xticks([]) - ax.grid(True, axis="y", alpha=0.2) - ax.set_ylabel(f"Value ({group.metadata.unit})") - ax.legend(loc="upper left") - ax.set_title(group.name, pad=20) - performance_indicator = ( - "lower is better" if group.metadata.lower_is_better else "higher is better" - ) - ax.text( - 0.5, - 1.03, - f"({performance_indicator})", - ha="center", - transform=ax.transAxes, - style="italic", - fontsize=7, - color="#666666", - ) - - for idx, label in enumerate(x_labels): - # this is a hack to get labels to show above the legend - # we normalize the idx to transAxes transform and offset it a little. 
- x_norm = (idx + 0.3 - ax.get_xlim()[0]) / ( - ax.get_xlim()[1] - ax.get_xlim()[0] - ) - ax.text(x_norm, 1.03, label, transform=ax.transAxes, color="#666666") - - plt.tight_layout() - html_charts.append( - BenchmarkChart( - label=group.name, - html=mpld3.fig_to_html(fig), - suite=group.metadata.suite, - ) - ) - plt.close(fig) - - return html_charts - - -def process_benchmark_data( - benchmark_runs: list[BenchmarkRun], compare_names: list[str] -) -> list[BenchmarkSeries]: - benchmark_metadata: dict[str, BenchmarkMetadata] = {} - run_map: dict[str, dict[str, list[Result]]] = defaultdict(lambda: defaultdict(list)) - - for run in benchmark_runs: - if run.name not in compare_names: - continue - - for result in run.results: - if result.label not in benchmark_metadata: - benchmark_metadata[result.label] = BenchmarkMetadata( - unit=result.unit, - lower_is_better=result.lower_is_better, - suite=result.suite, - ) - - result.date = run.date - result.git_hash = run.git_hash - run_map[result.label][run.name].append(result) - - benchmark_series = [] - for label, metadata in benchmark_metadata.items(): - runs = [ - BenchmarkRun(name=run_name, results=results) - for run_name, results in run_map[label].items() - ] - benchmark_series.append( - BenchmarkSeries(label=label, metadata=metadata, runs=runs) - ) - - return benchmark_series - - -def generate_html( - benchmark_runs: list[BenchmarkRun], github_repo: str, compare_names: list[str] -) -> str: - benchmarks = process_benchmark_data(benchmark_runs, compare_names) - - timeseries = create_time_series_chart(benchmarks, github_repo) - timeseries_charts_html = "\n".join( - f'
{ts.html}
' - for ts in timeseries - ) - - explicit_groups = create_explicit_groups(benchmark_runs, compare_names) - - bar_charts = create_grouped_bar_charts(explicit_groups) - bar_charts_html = "\n".join( - f'
{bc.html}
' - for bc in bar_charts - ) - - suite_names = {t.suite for t in timeseries} - suite_checkboxes_html = " ".join( - f'' - for suite in suite_names - ) - - script_path = os.path.dirname(os.path.realpath(__file__)) - results_template_path = Path(script_path, "benchmark_results.html.template") - with open(results_template_path, "r") as file: - html_template = file.read() - - template = Template(html_template) - data = { - "suite_checkboxes_html": suite_checkboxes_html, - "timeseries_charts_html": timeseries_charts_html, - "bar_charts_html": bar_charts_html, - } - - return template.substitute(data)
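For reference, the hunks above gate suite selection behind the new --preset flag via presets.enabled_suites(). The sketch below is a rough illustration only, not part of the patch: FakeSuite and the driver code are hypothetical stand-ins, while the preset names come from presets.py as added above, and the import assumes the sketch runs from devops/scripts/benchmarks.

# Sketch of the preset-based suite filtering introduced in presets.py / main.py.
# FakeSuite is a hypothetical stand-in; only name() matters for the filter.
from presets import enabled_suites, presets


class FakeSuite:
    def __init__(self, name: str):
        self._name = name

    def name(self) -> str:
        return self._name


all_suites = [FakeSuite(n) for n in presets["Full"]]

# Same membership check main.py now performs before setting a suite up:
selected = [s for s in all_suites if s.name() in enabled_suites("Normal")]
print([s.name() for s in selected])
# expected: ['Compute Benchmarks', 'llama.cpp bench', 'Velocity Bench']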
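Similarly, utils.download() now refuses a freshly downloaded file whose SHA-384 digest does not match the checksum supplied by callers such as OneAPI.install_package(). A small, hypothetical helper (not part of the patch) for producing the expected digest when adding a new package entry might look like the following; it simply mirrors the block-wise hashing of calculate_checksum() in utils.py.

# Hypothetical helper: print the SHA-384 digest utils.download() compares against.
import hashlib
import sys


def sha384_of(path: str) -> str:
    digest = hashlib.sha384()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(4096), b""):
            digest.update(block)
    return digest.hexdigest()


if __name__ == "__main__":
    # e.g. python3 sha384_of.py package_dnnl.sh
    print(sha384_of(sys.argv[1]))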