diff --git a/.bazelrc b/.bazelrc
index 8b87092b699b..529661c15c8b 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -24,8 +24,16 @@ build -c opt
 
 build --config=short_logs
 
-# PyTorch/XLA uses exceptions to communicate with Python.
-build --copt=-fexceptions
+# Use the standalone spawn strategy; the compiler is selected with --config=gcc or --config=clang.
+build --spawn_strategy=standalone
+
+###########################################################################
+
+build:gcc --action_env=CC=gcc
+build:gcc --action_env=CXX=g++
+
+build:clang --action_env=CC=/usr/lib/llvm-17/bin/clang
+build:clang --action_env=CXX=/usr/lib/llvm-17/bin/clang++
 
 # Why we use the sandbox mode:
 #
@@ -254,7 +262,9 @@ build:linux --copt="-Werror=unused-result"
 build:linux --copt="-Wswitch"
 build:linux --copt="-Werror=switch"
 # Required for building with clang
-build:linux --copt="-Wno-error=unused-but-set-variable"
+# build:linux --copt="-Wno-error=unused-but-set-variable"
+build:linux --copt="-Wno-gnu-offsetof-extensions"
+build:linux --copt="-Wno-unused-command-line-argument"
 
 # Only include debug info for files not under XLA.
 build:dbg -c dbg
diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh
index 6d35cfb2dabe..cee111b433d3 100755
--- a/.github/scripts/run_tests.sh
+++ b/.github/scripts/run_tests.sh
@@ -1,25 +1,16 @@
 set -ex
 
+# Runs ./test/run_tests.sh inside $1; when $2 (default 0) is not "0", also combines coverage data and exports lcov.
 function run_torch_xla_python_tests() {
   XLA_DIR=$1
   USE_COVERAGE="${2:-0}"
 
   pushd $XLA_DIR
   echo "Running Python Tests"
+  ./test/run_tests.sh
   if [ "$USE_COVERAGE" != "0" ]; then
-    pip install coverage==6.5.0 --upgrade
-    pip install coverage-lcov
-    pip install toml
-    ./test/run_tests.sh
     coverage combine
-    mkdir lcov && cp .coverage lcov/
-    coverage-lcov --data_file_path lcov/.coverage
-    coverage html
-    cp lcov.info htmlcov/
-    mv htmlcov ~/
-    chmod -R 755 ~/htmlcov
-  else
-    ./test/run_tests.sh
+    coverage lcov --omit="/tmp/*" --ignore-errors -o $COVERAGE_FILE.info
   fi
   popd
 }
@@ -81,6 +72,10 @@ function run_torch_xla_benchmark_tests() {
   echo "Running Torchbench Tests"
   test/benchmarks/run_torchbench_tests.sh "${TORCHBENCH_MODELS[@]}"
   popd
+  if [ "$USE_COVERAGE" != "0" ]; then
+    coverage combine
+    coverage lcov --omit="/tmp/*" --ignore-errors -o $COVERAGE_FILE.info
+  fi
 }
 
 PYTORCH_DIR=$1
diff --git a/.github/workflows/_build_plugin.yml b/.github/workflows/_build_plugin.yml
index 9e8376d262bf..57d95f1405d6 100644
--- a/.github/workflows/_build_plugin.yml
+++ b/.github/workflows/_build_plugin.yml
@@ -16,11 +16,6 @@ on:
         type: number
         description: Timeout in minutes for the build job
         default: 120
-      has_code_changes:
-        required: false
-        type: string
-        description: Whether to run full workflow or not
-        default: 'true'
     secrets:
       gcloud-service-key:
         required: true
@@ -37,31 +32,23 @@ jobs:
       BAZEL_REMOTE_CACHE: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
     steps:
       - name: Checkout actions
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           sparse-checkout: |
             .github/workflows/setup
           path: .actions
       - name: Setup
-        if: inputs.has_code_changes == 'true'
         uses: ./.actions/.github/workflows/setup
         with:
           torch-commit: ${{ inputs.torch-commit }}
           cuda: true
       - name: Build
-        if: inputs.has_code_changes == 'true'
         shell: bash
         run: |
           cd pytorch/xla/infra/ansible
           ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5,8.6 src_root=${GITHUB_WORKSPACE} cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
       - name: Upload wheel
-        if: inputs.has_code_changes == 'true'
         uses: actions/upload-artifact@v4
         with:
           name: cuda-plugin
           path: /dist/*.whl
-      - name: Report no code changes
-        if: inputs.has_code_changes == 'false'
-        run: |
-          echo "No code changes were detected that require running the full test suite."
diff --git a/.github/workflows/_build_torch_with_cuda.yml b/.github/workflows/_build_torch_with_cuda.yml
index 8e5db19d5b79..ad8bf50632e4 100644
--- a/.github/workflows/_build_torch_with_cuda.yml
+++ b/.github/workflows/_build_torch_with_cuda.yml
@@ -20,11 +20,6 @@ on:
         type: number
         description: Timeout in minutes for the build job
         default: 120
-      has_code_changes:
-        required: false
-        type: string
-        description: Whether to run full workflow or not
-        default: 'true'
 jobs:
   build:
     runs-on: ${{ inputs.runner }}
@@ -38,31 +33,23 @@ jobs:
       MAX_JOBS: 24
     steps:
       - name: Checkout actions
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           sparse-checkout: |
             .github/workflows/setup
           path: .actions
       - name: Setup
-        if: inputs.has_code_changes == 'true'
         uses: ./.actions/.github/workflows/setup
         with:
           torch-commit: ${{ inputs.torch-commit }}
           cuda: true
       - name: Build PyTorch with CUDA enabled
-        if: inputs.has_code_changes == 'true'
         shell: bash
         run: |
           cd pytorch
           python setup.py bdist_wheel
       - name: Upload wheel
-        if: inputs.has_code_changes == 'true'
         uses: actions/upload-artifact@v4
         with:
           name: torch-with-cuda
           path: pytorch/dist/*.whl
-      - name: Report no code changes
-        if: inputs.has_code_changes == 'false'
-        run: |
-          echo "No code changes were detected that require running the full test suite."
diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml
index 02acff92085c..ebbae40bc65c 100644
--- a/.github/workflows/_build_torch_xla.yml
+++ b/.github/workflows/_build_torch_xla.yml
@@ -20,11 +20,6 @@ on:
         type: number
         description: Timeout in minutes for the build job
         default: 120
-      has_code_changes:
-        required: false
-        type: string
-        description: Whether to run full workflow or not
-        default: 'true'
     secrets:
       gcloud-service-key:
         required: true
@@ -44,37 +39,28 @@ jobs:
       # Need to check out local composite actions before using them
       # https://github.com/orgs/community/discussions/11771
       - name: Checkout actions
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           sparse-checkout: |
             .github/workflows/setup
           path: .actions
       - name: Setup
-        if: inputs.has_code_changes == 'true'
         uses: ./.actions/.github/workflows/setup
         with:
           torch-commit: ${{ inputs.torch-commit }}
       - name: Build
-        if: inputs.has_code_changes == 'true'
         shell: bash
         run: |
           cd pytorch/xla/infra/ansible
           ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
       - name: Upload wheel
-        if: inputs.has_code_changes == 'true'
         uses: actions/upload-artifact@v4
         with:
           name: torch-xla-wheels
           path: /dist/*.whl
       - name: Upload CPP test binaries
-        if: inputs.has_code_changes == 'true'
         uses: actions/upload-artifact@v4
         with:
           name: cpp-test-bin
           path: /tmp/test/bin
-      - name: Report no code changes
-        if: inputs.has_code_changes == 'false'
-        run: |
-          echo "No code changes were detected that require running the full test suite."
 
diff --git a/.github/workflows/_check_code_changes.yml b/.github/workflows/_check_code_changes.yml
deleted file mode 100644
index 8615efd09264..000000000000
--- a/.github/workflows/_check_code_changes.yml
+++ /dev/null
@@ -1,111 +0,0 @@
-name: Check Code Changes
-
-on:
-  workflow_call:
-    inputs:
-      event_name:
-        required: true
-        type: string
-      # For pull_request, base_sha is github.event.pull_request.base.sha (target branch tip)
-      # For push, base_sha is github.event.before
-      base_sha:
-        required: true
-        type: string
-      # For pull_request, head_sha is github.event.pull_request.head.sha (PR branch tip)
-      # For push, head_sha is github.sha
-      head_sha:
-        required: true
-        type: string
-    outputs:
-      has_code_changes:
-        description: "True if non-markdown code files were changed or event is workflow_dispatch/schedule, false otherwise."
-        value: ${{ jobs.check_files.outputs.has_code_changes }}
-
-jobs:
-  check_files:
-    runs-on: ubuntu-24.04
-    outputs:
-      has_code_changes: ${{ steps.perform_check.outputs.has_code_changes }}
-    steps:
-      - name: Checkout code for diff (if needed)
-        # Checkout only if a diff is actually needed
-        if: inputs.event_name != 'workflow_dispatch' && inputs.event_name != 'schedule'
-        uses: actions/checkout@v4
-        with:
-          # Fetch all history for all branches and tags.
-          # This is necessary for `git diff A...B` (three-dot diff) to find the merge base
-          # and correctly diff PR changes against the point where it diverged.
-          # It's also needed for `git diff A B` if A and B are far apart.
-          fetch-depth: 0
-
-      - name: Perform file content check
-        id: perform_check
-        run: |
-          echo "Event Name: ${{ inputs.event_name }}"
-          echo "Base SHA input (for PR: target branch; for Push: before SHA): ${{ inputs.base_sha }}"
-          echo "Head SHA input (for PR: PR head; for Push: current SHA): ${{ inputs.head_sha }}"
-
-          # Handle workflow_dispatch and schedule events first
-          if [[ "${{ inputs.event_name }}" == "workflow_dispatch" || "${{ inputs.event_name }}" == "schedule" ]]; then
-            echo "Event is ${{ inputs.event_name }}. Assuming code changes or full run needed."
-            echo "has_code_changes=true" >> "$GITHUB_OUTPUT"
-            exit 0 # Exit early, no diff needed
-          fi
-
-          # Handle initial push (base SHA is all zeros)
-          # For an initial push, all files in the head_sha are considered "changed" (new).
-          if [[ "${{ inputs.base_sha }}" == "0000000000000000000000000000000000000000" ]]; then
-            echo "Initial push (base SHA is zeros). Assuming code changes."
-            # We can list all files in the current commit (inputs.head_sha) if needed,
-            # but for simplicity, just assuming code changes is often sufficient.
-            # To be precise, one could do: git ls-tree -r --name-only ${{ inputs.head_sha }} > changed_files.txt
-            # And then apply the markdown filter. For now, we'll assume changes.
-            echo "has_code_changes=true" >> "$GITHUB_OUTPUT"
-            exit 0
-          fi
-
-          # Handle cases where base and head are the same (e.g., re-run on a specific commit, or a push with no new commits)
-          # This can happen if a workflow is re-run, or if a branch is pushed without new commits (e.g., force push to same SHA).
-          if [[ "${{ inputs.base_sha }}" == "${{ inputs.head_sha }}" ]]; then
-            echo "Base SHA is the same as Head SHA. No file changes. Assuming no code changes for skipping purposes."
-            echo "has_code_changes=false" >> "$GITHUB_OUTPUT"
-            exit 0
-          fi
-
-          # Ensure SHAs are valid before attempting diff
-          # (git rev-parse --verify will exit with non-zero if SHA is not found)
-          git rev-parse --verify ${{ inputs.base_sha }}^{commit} >/dev/null 2>&1 || { echo "Error: Base SHA ${{ inputs.base_sha }} not found or invalid."; exit 1; }
-          git rev-parse --verify ${{ inputs.head_sha }}^{commit} >/dev/null 2>&1 || { echo "Error: Head SHA ${{ inputs.head_sha }} not found or invalid."; exit 1; }
-
-
-          # Determine the diff command based on the event type
-          if [[ "${{ inputs.event_name }}" == "pull_request" ]]; then
-            # For pull requests, use three-dot diff (A...B).
-            # This shows changes on the PR branch (inputs.head_sha)
-            # since it diverged from the target branch (inputs.base_sha).
-            # inputs.base_sha is github.event.pull_request.base.sha
-            # inputs.head_sha is github.event.pull_request.head.sha
-            echo "Pull Request: Diffing ${{ inputs.base_sha }}...${{ inputs.head_sha }}"
-            git diff --name-only --no-renames ${{ inputs.base_sha }}...${{ inputs.head_sha }} > changed_files.txt
-          else # For 'push' and potentially other events not explicitly handled above
-            # For pushes, use two-dot diff (A B).
-            # inputs.base_sha is github.event.before
-            # inputs.head_sha is github.sha
-            echo "Push or other event: Diffing ${{ inputs.base_sha }} ${{ inputs.head_sha }}"
-            git diff --name-only --no-renames ${{ inputs.base_sha }} ${{ inputs.head_sha }} > changed_files.txt
-          fi
-
-          echo "Changed files:"
-          cat changed_files.txt
-
-          if [ ! -s changed_files.txt ]; then # Check if changed_files.txt is empty
-            echo "No files changed in the diff."
-            echo "has_code_changes=false" >> "$GITHUB_OUTPUT"
-          elif grep -q -v -E '\.md$' changed_files.txt; then
-            echo "Non-markdown code changes detected."
-            echo "has_code_changes=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "Only markdown changes detected or no non-markdown changes found in diff."
-            echo "has_code_changes=false" >> "$GITHUB_OUTPUT"
-          fi
-        shell: bash
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 6d1e1881b296..b3fc13041a5b 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -29,14 +29,14 @@ on:
         default: false
         description: Whether to install CUDA plugin package
       torch-commit:
-          required: true
-          type: string
-          description: torch-commit
-      has_code_changes:
-        required: false
+        required: true
+        type: string
+        description: torch-commit
+      device-type:
+        required: true
         type: string
-        description: Whether to run full workflow or not
-        default: 'true'
+        description: Device type for naming the coverage results.
+
     secrets:
       gcloud-service-key:
         required: true
@@ -46,30 +46,38 @@ jobs:
     runs-on: ${{ inputs.runner }}
     container:
       image: ${{ inputs.dev-image }}
-      options: "${{ inputs.install-cuda-plugin == true && '--gpus all' || '' }} --shm-size 16g"
+      options: "${{ inputs.install-cuda-plugin && '--gpus all' || '' }} --shm-size 16g"
     strategy:
       fail-fast: false
       matrix:
         include:
           # Use readable strings as they define the workflow titles.
-          - run_benchmark_tests: 'benchmark_tests'
-          - run_python_tests: 'python_tests'
+          - name: 'benchmark_tests'
+            run_benchmark_tests: 'benchmark_tests'
+          - name: 'python_tests-xla_op1'
+            run_python_tests: 'python_tests'
             run_xla_op_tests1: 'xla_op1'
-          - run_python_tests: 'python_tests'
+          - name: 'python_tests-xla_op2'
+            run_python_tests: 'python_tests'
             run_xla_op_tests2: 'xla_op2'
-          - run_python_tests: 'python_tests'
+          - name: 'python_tests-xla_op3'
+            run_python_tests: 'python_tests'
             run_xla_op_tests3: 'xla_op3'
-          - run_python_tests: 'python_tests'
+          - name: 'python_tests-torch_mp_op'
+            run_python_tests: 'python_tests'
             run_torch_mp_op_tests: 'torch_mp_op'
-          - run_cpp_tests: 'cpp_tests'
+          - name: 'cpp_tests-1'
+            run_cpp_tests: 'cpp_tests'
             run_cpp_tests1: 'cpp_tests1'
-          - run_cpp_tests: 'cpp_tests'
+          - name: 'cpp_tests-2'
+            run_cpp_tests: 'cpp_tests'
             run_cpp_tests2: 'cpp_tests2'
     timeout-minutes: ${{ inputs.timeout-minutes }}
     env:
       GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
       GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
       USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
+      COVERAGE_DIR: '/tmp/lcov'
       RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }}
       RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
       RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
@@ -82,14 +90,12 @@ jobs:
       BAZEL_REMOTE_CACHE: 1
     steps:
       - name: Checkout actions
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           sparse-checkout: |
             .github/workflows/setup
           path: .actions
       - name: Setup
-        if: inputs.has_code_changes == 'true'
         uses: ./.actions/.github/workflows/setup
         with:
           torch-commit: ${{ inputs.torch-commit }}
@@ -97,42 +103,38 @@ jobs:
           wheels-artifact: torch-xla-wheels
           cuda-plugin-artifact: ${{ inputs.install-cuda-plugin && 'cuda-plugin' || null }}
       - name: Fetch CPP test binaries
-        if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests
         uses: actions/download-artifact@v4
         with:
           name: cpp-test-bin
           path: /tmp/test/bin
+        if: ${{ matrix.run_cpp_tests }}
       # GitHub Actions doesn't preserve executable permissions
       # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
       - name: Set CPP test permissions
-        if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests
         run: |
           chmod +x /tmp/test/bin/*
           ls -l /tmp/test/bin
+        if: ${{ matrix.run_cpp_tests }}
       - name: Check GPU
-        if: inputs.has_code_changes == 'true' && inputs.install-cuda-plugin
         run: nvidia-smi
+        if: ${{ inputs.install-cuda-plugin }}
       - name: Install test deps
-        if: inputs.has_code_changes == 'true'
         shell: bash
         run: |
           # TODO: Add these in setup.py
           pip install fsspec
           pip install rich
       - name: Checkout PyTorch Repo
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           repository: pytorch/pytorch
           path: pytorch
           ref: ${{ inputs.torch-commit }}
       - name: Checkout PyTorch/XLA Repo
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           path: pytorch/xla
       - name: Extra CI deps
-        if: inputs.has_code_changes == 'true'
         shell: bash
         run: |
           set -x
@@ -142,56 +144,27 @@ jobs:
           if [[ ! -z "$RUN_BENCHMARK_TESTS" ]]; then
             pip install -r pytorch/xla/benchmarks/requirements.txt
           fi
-      - name: Test
-        if: inputs.has_code_changes == 'true'
+      - name: Extra Coverage deps
         shell: bash
-        run: pytorch/xla/.github/scripts/run_tests.sh pytorch/ pytorch/xla/ $USE_COVERAGE
-      - name: Upload coverage results
-        if: inputs.has_code_changes == 'true' && inputs.collect-coverage
+        run: |
+          set -x
+          pip install -U coverage
+        if: ${{ inputs.collect-coverage }}
+      - name: Test
         shell: bash
         env:
-          CIRCLE_WORKFLOW_ID: ${{ github.run_id }}
-          CIRCLE_BUILD_NUM: ${{ github.run_number }}
-          BENCHMARK_TEST_NAME: ${{ env.RUN_BENCHMARK_TESTS }}
-          PYTHON_TEST_NAME: ${{ env.RUN_PYTHON_TESTS }}${{ env.RUN_XLA_OP_TESTS1 }}${{ env.RUN_XLA_OP_TESTS2 }}${{ env.RUN_XLA_OP_TESTS3 }}${{ env.RUN_TORCH_MP_OP_TESTS }}
-          CPP_TEST_NAME: ${{ env.RUN_CPP_TESTS1 }}${{ env.RUN_CPP_TESTS2 }}
+          COVERAGE_FILE: "${{ env.COVERAGE_DIR }}/py-coverage"
+          LLVM_PROFILE_FILE: "${{ env.COVERAGE_DIR }}/cpp-coverage/data-%p.profraw"
         run: |
-            # TODO(yeounoh) collect coverage report as needed.
-            if [ -n "${BENCHMARK_TEST_NAME}" ]; then
-                exit 0
-            fi
-            docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}"
-            if [ -n "${GPU_FLAG:-}" ]; then
-              if [ -n "${PYTHON_TEST_NAME}" ]; then
-                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
-                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
-              fi
-              if [ -n "${CPP_TEST_NAME}" ]; then
-                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
-                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
-              fi
-            else
-              if [ -n "${PYTHON_TEST_NAME}" ]; then
-                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
-                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
-              fi
-
-              if [ -n "${CPP_TEST_NAME}" ]; then
-                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
-                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
-              fi
-
-              if [ "${CPP_TEST_NAME}" == "cpp_tests1" ]; then
-                ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
-                echo $ABS_METADATA > abs_metadata.json
-                gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
+          pytorch/xla/.github/scripts/run_tests.sh pytorch/ pytorch/xla/ $USE_COVERAGE
+          if [ "$USE_COVERAGE" != "0" ]; then  # Merge the profraw files so they take less space, then remove the raw files.
+            llvm-profdata-17 merge $COVERAGE_DIR/cpp-coverage/*.profraw -o $COVERAGE_DIR/cpp-coverage.profdata
+            rm -r $COVERAGE_DIR/cpp-coverage
+          fi
+      - name: Upload coverage results
+        uses: actions/upload-artifact@v4
+        with:
+          name: "coverage-${{ inputs.device-type }}-${{ matrix.name }}"
+          path: "${{ env.COVERAGE_DIR }}"
+        if: ${{ inputs.collect-coverage }}
 
-                INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
-                echo $INC_METADATA > inc_metadata.json
-                gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
-              fi
-            fi
-      - name: Report no code changes
-        if: inputs.has_code_changes == 'false'
-        run: |
-          echo "No code changes were detected that require running the full test suite."
diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml
index 9861e4fba161..df7343ef1bd4 100644
--- a/.github/workflows/_test_requiring_torch_cuda.yml
+++ b/.github/workflows/_test_requiring_torch_cuda.yml
@@ -27,11 +27,7 @@ on:
         required: true
         type: string
         description: torch-commit
-      has_code_changes:
-        required: false
-        type: string
-        description: Whether to run full workflow or not
-        default: 'true'
+
 jobs:
   test:
     container:
@@ -40,26 +36,28 @@ jobs:
     strategy:
       matrix:
         include:
-          - run_python_tests: 'python_tests'
+          - name: 'torch_with_cuda_python_tests'
+            run_python_tests: 'python_tests'
             runner: ${{ inputs.runner }}
-          - run_triton_tests: 'triton_tests'
+          - name: 'torch_with_cuda_triton_tests'
+            run_triton_tests: 'triton_tests'
             runner: 'linux.g5.4xlarge.nvidia.gpu'
     runs-on: ${{ matrix.runner }}
     timeout-minutes: ${{ inputs.timeout-minutes }}
     env:
       USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
+      COVERAGE_DIR: '/tmp/lcov'
       BAZEL_JOBS: ''  # Let bazel decide the parallelism based on the number of CPUs.
       BAZEL_REMOTE_CACHE: 1
+      PJRT_DEVICE: 'CUDA'
     steps:
       - name: Checkout actions
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           sparse-checkout: |
             .github/workflows/setup
           path: .actions
       - name: Setup
-        if: inputs.has_code_changes == 'true'
         uses: ./.actions/.github/workflows/setup
         with:
           torch-commit: ${{ inputs.torch-commit }}
@@ -68,10 +66,8 @@ jobs:
           cuda-plugin-artifact: cuda-plugin
           cuda-torch-artifact: torch-with-cuda
       - name: Check GPU
-        if: inputs.has_code_changes == 'true'
         run: nvidia-smi
       - name: Install wheels
-        if: inputs.has_code_changes == 'true'
         shell: bash
         run: |
           pip install /tmp/wheels/*.whl
@@ -86,42 +82,93 @@ jobs:
           python -c "import torch; assert torch.cuda.is_available()"
           echo "CUDA is available for PyTorch."
       - name: Checkout PyTorch Repo
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           repository: pytorch/pytorch
           path: pytorch
           ref: ${{ inputs.torch-commit }}
       - name: Checkout PyTorch/XLA Repo
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           path: pytorch/xla
       - name: Extra CI deps
-        if: inputs.has_code_changes == 'true' && matrix.run_triton_tests
         shell: bash
         run: |
           set -x
           pip install -U --pre jax jaxlib "jax-cuda12-plugin[with_cuda]" jax-cuda12-pjrt -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
+        if: ${{ matrix.run_triton_tests }}
+      - name: Extra Coverage deps
+        shell: bash
+        run: |
+          set -x
+          pip install -U coverage
+        if: ${{ inputs.collect-coverage }}
       - name: Install Triton
-        if: inputs.has_code_changes == 'true'
         shell: bash
         run: |
           cd pytorch
           make triton
       - name: Python Tests
-        if: inputs.has_code_changes == 'true' && matrix.run_python_tests
         shell: bash
+        env:
+          COVERAGE_FILE: "${{ env.COVERAGE_DIR }}/py-coverage"
+          LLVM_PROFILE_FILE: "${{ env.COVERAGE_DIR }}/cpp-coverage/data-%p.profraw"
         run: |
           set -xue
-          PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v
-          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
+
+          TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')")
+
+          function run_coverage {
+            if [ "$USE_COVERAGE" != "0" ]; then
+              coverage run --source="$TORCH_XLA_DIR" -p "$@"
+            else
+              python3 "$@"
+            fi
+          }
+
+          run_coverage pytorch/xla/test/test_operations.py -v
+          run_coverage pytorch/xla/test/dynamo/test_dynamo.py -v
+
+          if [ "$USE_COVERAGE" != "0" ]; then
+            # Combine all Python coverage files, then merge the profraw files (smaller) and remove the raw ones.
+            coverage combine
+            coverage lcov --omit="/tmp/*" --ignore-errors -o $COVERAGE_FILE.info
+            llvm-profdata-17 merge $COVERAGE_DIR/cpp-coverage/*.profraw -o $COVERAGE_DIR/cpp-coverage.profdata
+            rm -r $COVERAGE_DIR/cpp-coverage
+          fi
+        if: ${{ matrix.run_python_tests }}
       - name: Triton Tests
-        if: inputs.has_code_changes == 'true' && matrix.run_triton_tests
         shell: bash
+        env:
+          COVERAGE_FILE: "${{ env.COVERAGE_DIR }}/py-coverage"
+          LLVM_PROFILE_FILE: "${{ env.COVERAGE_DIR }}/cpp-coverage/data-%p.profraw"
+          TRITON_PTXAS_PATH: "/usr/local/cuda-12.3/bin/ptxas"
         run: |
-          PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.3/bin/ptxas python pytorch/xla/test/test_triton.py
-      - name: Report no code changes
-        if: inputs.has_code_changes == 'false'
-        run: |
-          echo "No code changes were detected that require running the full test suite."
+          set -x
+
+          TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')")
+
+          function run_coverage {
+            if [ "$USE_COVERAGE" != "0" ]; then
+              coverage run --source="$TORCH_XLA_DIR" -p "$@"
+            else
+              python3 "$@"
+            fi
+          }
+
+          run_coverage pytorch/xla/test/test_triton.py
+
+          if [ "$USE_COVERAGE" != "0" ]; then
+            # Combine all Python coverage files, then merge the profraw files (smaller) and remove the raw ones.
+            coverage combine
+            coverage lcov --omit="/tmp/*" --ignore-errors -o $COVERAGE_FILE.info
+            llvm-profdata-17 merge $COVERAGE_DIR/cpp-coverage/*.profraw -o $COVERAGE_DIR/cpp-coverage.profdata
+            rm -r $COVERAGE_DIR/cpp-coverage
+          fi
+        if: ${{ matrix.run_triton_tests }}
+      - name: Upload coverage results
+        uses: actions/upload-artifact@v4
+        with:
+          name: "coverage-${{ matrix.name }}"
+          path: "${{ env.COVERAGE_DIR }}"
+        if: ${{ inputs.collect-coverage }}
diff --git a/.github/workflows/_tpu_ci.yml b/.github/workflows/_tpu_ci.yml
index 6c28d21a267c..c2c15a6b3d71 100644
--- a/.github/workflows/_tpu_ci.yml
+++ b/.github/workflows/_tpu_ci.yml
@@ -7,11 +7,11 @@ on:
         type: number
         description: Timeout in minutes for the job run
         default: 120
-      has_code_changes:
+      collect-coverage:
         required: false
-        type: string
-        description: Whether to run full workflow or not
-        default: 'true'
+        type: boolean
+        description: Set to true to collect coverage information
+        default: false
 jobs:
   tpu-test:
     runs-on: v4-runner-set
@@ -19,26 +19,29 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        test_script:
-          - test/tpu/run_tests.sh
-          - test/tpu/run_expensive_test_1.sh
-          - test/tpu/run_expensive_test_2.sh
+        include:
+          - name: 'run_tests'
+            test_script: 'test/tpu/run_tests.sh'
+          - name: 'run_expensive_test_1'
+            test_script: 'test/tpu/run_expensive_test_1.sh'
+          - name: 'run_expensive_test_2'
+            test_script: 'test/tpu/run_expensive_test_2.sh'
+    env:
+      USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
+      COVERAGE_DIR: '/tmp/lcov'
     steps:
       - name: Checkout actions
-        if: inputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           sparse-checkout: |
             .github/workflows/setup
           path: .actions
       - name: Setup
-        if: inputs.has_code_changes == 'true'
         uses: ./.actions/.github/workflows/setup
         with:
           torch-commit: ${{ inputs.torch-commit }}
           wheels-artifact: torch-xla-wheels
       - name: Install test dependencies
-        if: inputs.has_code_changes == 'true'
         shell: bash
         run: |
           # TODO: Add these in setup.py
@@ -48,16 +51,28 @@ jobs:
           # libtpu is needed for pallas tests.
           pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-wheels/index.html -f https://storage.googleapis.com/libtpu-releases/index.html
           pip install --upgrade protobuf
+      - name: Extra Coverage deps
+        shell: bash
+        run: |
+          set -x
+          pip install -U coverage
+        if: ${{ inputs.collect-coverage }}
       - name: Run Tests (${{ matrix.test_script }})
-        if: inputs.has_code_changes == 'true'
         env:
           PJRT_DEVICE: TPU
           TPU_LOG_DIR: tpu_logs_${{ strategy.job-index }}
+          COVERAGE_FILE: "${{ env.COVERAGE_DIR }}/py-coverage"
+          LLVM_PROFILE_FILE: "${{ env.COVERAGE_DIR }}/cpp-coverage/data-%p.profraw"
         run: |
           cd pytorch/xla
           ${{ matrix.test_script }}
-      - name: Report no code changes
-        # Only report the first instance
-        if: inputs.has_code_changes == 'false' && strategy.job-index == 0
-        run: |
-          echo "No code changes were detected that require running the full test suite."
+          # Merge the profraw files, so that they take less space.
+          llvm-profdata-17 merge $COVERAGE_DIR/cpp-coverage/*.profraw -o $COVERAGE_DIR/cpp-coverage.profdata
+          # Remove the collected raw coverage files.
+          rm -r $COVERAGE_DIR/cpp-coverage
+      - name: Upload coverage results
+        uses: actions/upload-artifact@v4
+        with:
+          name: "coverage-tpu-${{ matrix.name }}"
+          path: "${{ env.COVERAGE_DIR }}"
+        if: ${{ inputs.collect-coverage }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index da3f8e8190e5..c0b765a32848 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -15,47 +15,32 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  check_code_changes:
-    name: Check Code Changes
-    uses: ./.github/workflows/_check_code_changes.yml
-    with:
-      event_name: ${{ github.event_name }}
-      # For pull_request, use PR's base and head. For push, use event's before and sha.
-      base_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.base.sha || github.event.before }}
-      head_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
 
   get-torch-commit:
-    needs: [check_code_changes]
     runs-on: ubuntu-24.04
     outputs:
       torch_commit: ${{ steps.commit.outputs.torch_commit }}
     steps:
-      - name: Get latest torch commit
-        id: commit
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
+      - id: commit
+        name: Get latest torch commit
         run: |
           echo "torch_commit=$(git ls-remote https://github.com/pytorch/pytorch.git HEAD | awk '{print $1}')" >> "$GITHUB_OUTPUT"
-      - name: Report no code changes
-        if: needs.check_code_changes.outputs.has_code_changes == 'false'
-        run: |
-          echo "No code changes were detected that require running the full test suite."
 
   build-torch-xla:
     name: "Build PyTorch/XLA"
     uses: ./.github/workflows/_build_torch_xla.yml
-    needs: [check_code_changes, get-torch-commit]
+    needs: get-torch-commit
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm
       torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
       timeout-minutes: 240
-      has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
   build-torch-with-cuda:
     name: "Build PyTorch with CUDA"
     uses: ./.github/workflows/_build_torch_with_cuda.yml
-    needs: [check_code_changes, get-torch-commit]
+    needs: get-torch-commit
     with:
       # TODO: bump CUDA version to either 12.4 or 12.6 (supported by PyTorch).
       # Ref: https://github.com/pytorch/xla/issues/8700
@@ -64,65 +49,61 @@ jobs:
       # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
       runner: linux.24xlarge
       timeout-minutes: 120
-      has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
 
   build-cuda-plugin:
     name: "Build XLA CUDA plugin"
     uses: ./.github/workflows/_build_plugin.yml
-    needs: [check_code_changes]
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
-      has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
   test-python-cpu:
     name: "CPU tests"
     uses: ./.github/workflows/_test.yml
-    needs: [build-torch-xla, check_code_changes, get-torch-commit]
+    needs: [build-torch-xla, get-torch-commit]
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm
       timeout-minutes: 120
-      collect-coverage: false
+      collect-coverage: true
       torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-      has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
+      device-type: "cpu"
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
   test-cuda:
     name: "GPU tests"
     uses: ./.github/workflows/_test.yml
-    needs: [build-torch-xla, build-cuda-plugin, check_code_changes, get-torch-commit]
+    needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
       runner: linux.g4dn.12xlarge.nvidia.gpu
       timeout-minutes: 300
-      collect-coverage: false
+      collect-coverage: true
       install-cuda-plugin: true
       torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-      has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
+      device-type: "cuda"
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
   test-cuda-with-pytorch-cuda-enabled:
     name: "GPU tests requiring torch CUDA"
     uses: ./.github/workflows/_test_requiring_torch_cuda.yml
-    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, check_code_changes, get-torch-commit]
+    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
       runner: linux.8xlarge.nvidia.gpu
       timeout-minutes: 300
-      collect-coverage: false
+      collect-coverage: true
       torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-      has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
 
   test-tpu:
     name: "TPU tests"
     uses: ./.github/workflows/_tpu_ci.yml
-    needs: [build-torch-xla, check_code_changes]
+    needs: build-torch-xla
     with:
-      timeout-minutes: 300
-      has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
+      timeout-minutes: 300 
+      collect-coverage: true
     if: github.event_name == 'push' || github.event_name == 'pull_request'
 
   push-docs:
diff --git a/.github/workflows/lintercheck.yml b/.github/workflows/lintercheck.yml
index edd012e9c008..63b8028ea3ff 100644
--- a/.github/workflows/lintercheck.yml
+++ b/.github/workflows/lintercheck.yml
@@ -8,23 +8,12 @@ on:
       - r[0-9]+.[0-9]+
 
 jobs:
-  check_code_changes:
-    name: Check Code Changes
-    uses: ./.github/workflows/_check_code_changes.yml
-    with:
-      event_name: ${{ github.event_name }}
-      # For pull_request, use PR's base and head. For push, use event's before and sha.
-      base_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.base.sha || github.event.before }}
-      head_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
   linter_check:
     runs-on: ubuntu-24.04
-    needs: [check_code_changes]
     steps:
       - name: Checkout repo
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
         uses: actions/checkout@v3
       - name: Setup Python
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
         uses: actions/setup-python@v4
         with:
           python-version: '3.10'
@@ -32,9 +21,7 @@ jobs:
       - run: pip install yapf==0.40.2  # N.B.: keep in sync with `torchax/dev-requirements.txt`, `infra/ansible/config/pip.yaml`
 
       - name: Check no TORCH_PIN
-        if: >
-          (github.event_name == 'push' && github.event.ref == 'refs/heads/master') &&
-          needs.check_code_changes.outputs.has_code_changes == 'true'
+        if: github.event_name == 'push' && github.event.ref == 'refs/heads/master'
         shell: bash
         run: |
           TORCH_PIN=./.torch_pin
@@ -60,7 +47,6 @@ jobs:
             echo "PASSED *.cc file extension check"
           fi
       - name: Run clang-format
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
         shell: bash
         env:
           CLANG_FORMAT: clang-format-16
@@ -86,7 +72,6 @@ jobs:
             echo "PASSED C++ format"
           fi
       - name: Run yapf
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
         shell: bash
         run: |
           git_status=$(git status --porcelain)
@@ -107,7 +92,3 @@ jobs:
           else
             echo "PASSED Python format"
           fi
-      - name: Report no code changes
-        if: needs.check_code_changes.outputs.has_code_changes == 'false'
-        run: |
-          echo "No code changes were detected that require running the full test suite."
diff --git a/.github/workflows/torchax.yml b/.github/workflows/torchax.yml
index 2f1e930f48b5..87b117130ebf 100644
--- a/.github/workflows/torchax.yml
+++ b/.github/workflows/torchax.yml
@@ -15,41 +15,28 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  check_code_changes:
-    name: Check Code Changes
-    uses: ./.github/workflows/_check_code_changes.yml
-    with:
-      event_name: ${{ github.event_name }}
-      # For pull_request, use PR's base and head. For push, use event's before and sha.
-      base_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.base.sha || github.event.before }}
-      head_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
   torchax-cpu:
     runs-on: ubuntu-24.04
-    needs: [check_code_changes]
     strategy:
       matrix:
         python-version: ['3.10', '3.11', '3.12']
     steps:
       - name: Checkout repo
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
         uses: actions/checkout@v4
         with:
           sparse-checkout: |
             torchax
       - name: Setup Python
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
         shell: bash
         working-directory: torchax
         run: |
           pip install -r test-requirements.txt
           pip install -e .[cpu]
       - name: Run tests
-        if: needs.check_code_changes.outputs.has_code_changes == 'true'
         working-directory: torchax
         shell: bash
         run: |
@@ -66,8 +53,3 @@ jobs:
           done
           # Run distributed tests.
           XLA_FLAGS=--xla_force_host_platform_device_count=4 pytest -n 0 test_dist/
-          echo "Tests completed."
-      - name: Report no code changes
-        if: needs.check_code_changes.outputs.has_code_changes == 'false'
-        run: |
-          echo "No code changes were detected that require running the full test suite."
diff --git a/BUILD b/BUILD
index c9a2578c722e..be3b9c5da508 100644
--- a/BUILD
+++ b/BUILD
@@ -26,11 +26,18 @@ cc_binary(
         "-fopenmp",
         "-fPIC",
         "-fwrapv",
+        "-fprofile-instr-generate",
+        "-fcoverage-mapping",
+        "-O0",
+        "-g",
     ],
     linkopts = [
         "-Wl,-rpath,$$ORIGIN/torch_xla/lib",  # for libtpu
         "-Wl,-soname,_XLAC.so",
         "-lstdc++fs",  # For std::filesystem
+        "-fprofile-instr-generate",
+        "-O0",
+        "-g",
     ],
     linkshared = 1,
     visibility = ["//visibility:public"],
@@ -51,9 +58,16 @@ cc_binary(
     copts = [
         "-fopenmp",
         "-fPIC",
+        "-fprofile-instr-generate",
+        "-fcoverage-mapping",
+        "-O0",
+        "-g",
     ],
     linkopts = [
         "-Wl,-soname,_XLAC_cuda_functions.so",
+        "-fprofile-instr-generate",
+        "-O0",
+        "-g",
     ],
     linkshared = 1,
     visibility = ["//visibility:public"],
diff --git a/bazel/rules_def.bzl b/bazel/rules_def.bzl
index 3a089bb79405..c83da77aa2df 100644
--- a/bazel/rules_def.bzl
+++ b/bazel/rules_def.bzl
@@ -5,11 +5,23 @@ load(
     "xla_cc_test",
 )
 
-def ptxla_cc_library(
+def cov_library(  # Thin wrapper over native.cc_library that appends coverage-instrumentation flags.
         deps = [],
         copts = [],
+        linkopts = [],
         **kwargs):
     native.cc_library(
+        copts = copts + ["-fprofile-instr-generate", "-fcoverage-mapping", "-O0", "-g"],  # NOTE(review): Clang source-based coverage flags, applied unconditionally; build_util.py defaults to --config=gcc -- confirm GCC builds accept them.
+        linkopts = linkopts + ["-fprofile-instr-generate", "-O0", "-g"],  # Appended after any caller-supplied linkopts.
+        deps = deps,
+        **kwargs
+    )
+
+def ptxla_cc_library(
+        deps = [],
+        copts = [],
+        **kwargs):
+    cov_library(
         copts = copts + ["-isystemexternal/torch"],  # Required for system includes.
         deps = deps + [
             "@torch//:headers",
@@ -26,6 +38,7 @@ def ptxla_cc_test(
         linkstatic = True,
         copts = copts + [
             "-isystemexternal/torch",  # Required for system includes.
+            "-fexceptions",  # Required for testing crashes.
         ],
         deps = deps + [
             "@pybind11//:pybind11_embed",  # libpython
diff --git a/build_util.py b/build_util.py
index 487f5116323e..c94ddd014ec2 100644
--- a/build_util.py
+++ b/build_util.py
@@ -15,6 +15,11 @@ def bazel_options_from_env() -> Iterable[str]:
   if check_env_flag('DEBUG'):
     bazel_flags.append('--config=dbg')
 
+  if check_env_flag('USE_CLANG'):
+    bazel_flags.append('--config=clang')
+  else:
+    bazel_flags.append('--config=gcc')
+
   if check_env_flag('TPUVM_MODE'):
     bazel_flags.append('--config=tpu')
 
diff --git a/infra/ansible/roles/build_srcs/tasks/main.yaml b/infra/ansible/roles/build_srcs/tasks/main.yaml
index db5562acb5c9..98a355ad1a67 100644
--- a/infra/ansible/roles/build_srcs/tasks/main.yaml
+++ b/infra/ansible/roles/build_srcs/tasks/main.yaml
@@ -46,7 +46,7 @@
   ansible.builtin.command:
     cmd: python setup.py bdist_wheel
     chdir: "{{ (src_root, 'pytorch/xla') | path_join }}"
-  environment: "{{ env_vars }}"
+  environment: "{{ env_vars | combine({'USE_CLANG': '1'}) }}"
 
 - name: Find XLA *.whl files in pytorch/xla/dist
   ansible.builtin.find:
@@ -190,6 +190,7 @@
   args:
     executable: bash
     chdir: "{{ src_root }}"
+  environment: "{{ env_vars | combine({'USE_CLANG': '1'}) }}"
   when: build_cpp_tests
 
 - name: Read Torchvision pin
diff --git a/test/benchmarks/run_tests.sh b/test/benchmarks/run_tests.sh
index fc0f1bfa1bf2..5ce6c7163963 100755
--- a/test/benchmarks/run_tests.sh
+++ b/test/benchmarks/run_tests.sh
@@ -4,8 +4,11 @@ CDIR="$(cd "$(dirname "$0")" ; pwd -P)"
 LOGFILE=/tmp/pytorch_benchmarks_test.log
 VERBOSITY=0
 
+BENCHMARKS_DIR="$CDIR/../../benchmarks/"
+TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')")
+
 # Make benchmark module available as it is not part of torch_xla.
-export PYTHONPATH=$PYTHONPATH:$CDIR/../../benchmarks/
+export PYTHONPATH=$PYTHONPATH:$BENCHMARKS_DIR
 
 # Note [Keep Going]
 #
@@ -30,6 +33,14 @@ do
 done
 shift $(($OPTIND - 1))
 
+function run_coverage {  # Launch a Python target; goes through coverage.py unless USE_COVERAGE is unset or "0".
+  if [ "${USE_COVERAGE:-0}" = "0" ]; then
+    python3 "$@"
+  else
+    coverage run --source="$TORCH_XLA_DIR,$BENCHMARKS_DIR" -p "$@"
+  fi
+}
+
 function run_make_tests {
   MAKE_V=""
   if [ "$VERBOSITY" != "0" ]; then
@@ -42,10 +53,10 @@ function run_python_tests {
   # HACK: don't confuse local `torch_xla` folder with installed package
   # Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559
   pushd $CDIR
-  python3 "test_experiment_runner.py"
-  python3 "test_benchmark_experiment.py"
-  python3 "test_benchmark_model.py"
-  python3 "test_result_analyzer.py"
+  run_coverage "test_experiment_runner.py"
+  run_coverage "test_benchmark_experiment.py"
+  run_coverage "test_benchmark_model.py"
+  run_coverage "test_result_analyzer.py"
   popd
 }
 
diff --git a/test/run_tests.sh b/test/run_tests.sh
index 99bab29ce3a9..252ed694ba92 100755
--- a/test/run_tests.sh
+++ b/test/run_tests.sh
@@ -32,7 +32,6 @@ _TORCH_XLA_DIR=$(
   cd ~
   dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')"
 )
-COVERAGE_FILE="$_TEST_DIR/../.coverage"
 
 function run_coverage {
   if ! test_is_selected "$1"; then
diff --git a/test/tpu/run_tests.sh b/test/tpu/run_tests.sh
index 41dfdf1c56b0..fc910118b148 100755
--- a/test/tpu/run_tests.sh
+++ b/test/tpu/run_tests.sh
@@ -9,6 +9,7 @@ _TPU_DIR="$(
 
 # Absolute path to the test/ directory.
 _TEST_DIR="$(dirname "$_TPU_DIR")"
+_TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')")
 
 source "${_TEST_DIR}/utils/run_tests_utils.sh"
 
@@ -34,94 +35,115 @@ function run_test {
   if ! test_is_selected "$1"; then
     return
   fi
-  python3 "$@"
+  run_coverage "$@"
 }
 
-# TODO: merge with other run_tests
-if test_is_selected $_TEST_DIR/test_mat_mul_precision.py; then
-  (cd $_TEST_DIR && python3 -m unittest test_mat_mul_precision.TestMatMulPrecision.test_high)
-  (cd $_TEST_DIR && python3 -m unittest test_mat_mul_precision.TestMatMulPrecision.test_default)
-  (cd $_TEST_DIR && python3 -m unittest test_mat_mul_precision.TestMatMulPrecision.test_highest)
-  (cd $_TEST_DIR && python3 -m unittest test_mat_mul_precision.TestMatMulPrecision.test_all)
-fi
-run_test "$_TEST_DIR/test_mat_mul_precision_get_and_set.py"
-run_test "$_TEST_DIR/test_operations.py" -v
-run_test "$_TEST_DIR/test_xla_graph_execution.py" -v
-run_test "$_TEST_DIR/pjrt/test_runtime_tpu.py"
-run_test "$_TEST_DIR/pjrt/test_collective_ops_tpu.py"
-run_test "$_TEST_DIR/spmd/test_mp_input_sharding.py"
-run_test "$_TEST_DIR/test_mp_collective_matmul.py"
-run_save_tensor_hlo run_test "$_TEST_DIR/spmd/test_spmd_lowering_context.py"
-run_test "$_TEST_DIR/spmd/test_xla_sharding.py"
-run_test "$_TEST_DIR/spmd/test_xla_virtual_device.py"
-run_test "$_TEST_DIR/spmd/test_xla_distributed_checkpoint.py"
-run_test "$_TEST_DIR/spmd/test_train_spmd_linear_model.py"
-run_test "$_TEST_DIR/spmd/test_xla_spmd_python_api_interaction.py"
-run_test "$_TEST_DIR/spmd/test_xla_auto_sharding.py"
-run_test "$_TEST_DIR/spmd/test_fsdp_v2.py"
-run_test "$_TEST_DIR/test_gradient_accumulation.py"
-XLA_EXPERIMENTAL=nonzero:masked_select:nms run_test "$_TEST_DIR/ds/test_dynamic_shape_models.py" -v
-run_test "$_TEST_DIR/test_autocast.py"
-run_test "$_TEST_DIR/test_fp8.py"
-run_test "$_TEST_DIR/test_grad_checkpoint.py"
-run_test "$_TEST_DIR/test_grad_checkpoint.py" "$@" --test_autocast
-run_test "$_TEST_DIR/dynamo/test_dynamo.py"
-run_test "$_TEST_DIR/dynamo/test_dynamo_dynamic_shape.py"
-run_test "$_TEST_DIR/spmd/test_spmd_debugging.py"
-XLA_PARAMETER_WRAPPING_THREADSHOLD=1 run_test "$_TEST_DIR/spmd/test_spmd_parameter_wrapping.py"
-run_test "$_TEST_DIR/pjrt/test_dtypes.py"
-run_test "$_TEST_DIR/pjrt/test_dynamic_plugin_tpu.py"
-run_test "$_TEST_DIR/test_while_loop.py"
-run_test "$_TEST_DIR/scan/test_scan.py"
-run_test "$_TEST_DIR/scan/test_scan_spmd.py"
-run_test "$_TEST_DIR/scan/test_scan_pallas.py"
-run_test "$_TEST_DIR/scan/test_scan_layers.py"
-run_test "$_TEST_DIR/test_gru.py"
-run_test "$_TEST_DIR/test_assume_pure.py"
-run_test "$_TEST_DIR/test_assume_pure_spmd.py"
-run_test "$_TEST_DIR/test_as_stride_use_slice.py"
-run_xla_hlo_debug run_test "$_TEST_DIR/scan/test_scan_debug.py"
-run_test "$_TEST_DIR/test_pallas.py" -v
-run_test "$_TEST_DIR/test_pallas_spmd.py"
-XLA_DISABLE_FUNCTIONALIZATION=1 run_test "$_TEST_DIR/test_pallas_spmd.py"
-run_test "$_TEST_DIR/test_splash_attention.py"
-run_test "$_TEST_DIR/test_profiler_session.py"
-run_test "$_TEST_DIR/test_input_output_aliases.py"
-run_test "$_TEST_DIR/test_gmm.py"
-run_test "$_TEST_DIR/eager/test_eager_spmd.py"
-run_test "$_TEST_DIR/torch_distributed/test_torch_distributed_all_gather_xla_backend.py"
-run_test "$_TEST_DIR/torch_distributed/test_torch_distributed_all_reduce_xla_backend.py"
-run_test "$_TEST_DIR/torch_distributed/test_torch_distributed_multi_all_reduce_xla_backend.py"
-run_test "$_TEST_DIR/torch_distributed/test_torch_distributed_reduce_scatter_xla_backend.py"
-run_test "$_TEST_DIR/quantized_ops/test_dot_general.py"
-run_xla_ir_hlo_debug run_test "$_TEST_DIR/test_user_computation_debug_cache.py"
-run_test "$_TEST_DIR/test_data_type.py"
-run_test "$_TEST_DIR/test_compilation_cache_utils.py"
-
-# run examples, each test should takes <2 minutes
-run_test "$_TEST_DIR/../examples/data_parallel/train_resnet_spmd_data_parallel.py"
-run_test "$_TEST_DIR/../examples/fsdp/train_decoder_only_fsdp_v2.py"
-run_test "$_TEST_DIR/../examples/train_resnet_amp.py"
-run_test "$_TEST_DIR/../examples/train_decoder_only_base.py"
-run_test "$_TEST_DIR/../examples/train_decoder_only_base.py" scan.decoder_with_scan.DecoderWithScan \
-    --num-steps 30 # TODO(https://github.com/pytorch/xla/issues/8632): Reduce scan tracing overhead
+function run_coverage {  # Launch a Python target; goes through coverage.py unless USE_COVERAGE is unset or "0".
+  if [ "${USE_COVERAGE:-0}" = "0" ]; then
+    python3 "$@"
+  else
+    coverage run --source="$_TORCH_XLA_DIR" -p "$@"
+  fi
+}
 
-# HACK: don't confuse local `torch_xla` folder with installed package
-# Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559
-# Egaer tests will take more HBM, only run them on TPU v4 CI
-TPU_VERSION=$(python -c "import sys; sys.path.remove(''); import torch_xla; print(torch_xla._internal.tpu.version())")
-if [[ -n "$TPU_VERSION" && "$TPU_VERSION" == "4" ]]; then
-    run_test "$_TEST_DIR/dynamo/test_traceable_collectives.py"
-    run_test "$_TEST_DIR/../examples/data_parallel/train_resnet_xla_ddp.py"
-    run_test "$_TEST_DIR/../examples/fsdp/train_resnet_fsdp_auto_wrap.py"
-    run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager.py"
-    run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_spmd_data_parallel.py"
-    run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_with_compile.py"
-    run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_multi_process.py"
-    XLA_EXPERIMENTAL=nonzero:masked_select:nms run_test "$_TEST_DIR/ds/test_dynamic_shapes.py" -v
-fi
+function run {  # Runs the TPU test list; per-file tests go through run_test so test_is_selected filtering applies.
+  # TODO: merge with other run_tests
+  if test_is_selected $_TEST_DIR/test_mat_mul_precision.py; then
+    (cd $_TEST_DIR && run_coverage -m unittest test_mat_mul_precision.TestMatMulPrecision.test_high)
+    (cd $_TEST_DIR && run_coverage -m unittest test_mat_mul_precision.TestMatMulPrecision.test_default)
+    (cd $_TEST_DIR && run_coverage -m unittest test_mat_mul_precision.TestMatMulPrecision.test_highest)
+    (cd $_TEST_DIR && run_coverage -m unittest test_mat_mul_precision.TestMatMulPrecision.test_all)
+  fi
+  run_test "$_TEST_DIR/test_mat_mul_precision_get_and_set.py"
+  run_test "$_TEST_DIR/test_operations.py" -v
+  run_test "$_TEST_DIR/pjrt/test_runtime_tpu.py"
+  run_test "$_TEST_DIR/pjrt/test_collective_ops_tpu.py"
+  run_test "$_TEST_DIR/spmd/test_mp_input_sharding.py"
+  run_test "$_TEST_DIR/test_mp_collective_matmul.py"
+  run_save_tensor_hlo run_test "$_TEST_DIR/spmd/test_spmd_lowering_context.py"
+  run_test "$_TEST_DIR/spmd/test_xla_sharding.py"
+  run_test "$_TEST_DIR/spmd/test_xla_virtual_device.py"
+  run_test "$_TEST_DIR/spmd/test_xla_distributed_checkpoint.py"
+  run_test "$_TEST_DIR/spmd/test_train_spmd_linear_model.py"
+  run_test "$_TEST_DIR/spmd/test_xla_spmd_python_api_interaction.py"
+  run_test "$_TEST_DIR/spmd/test_xla_auto_sharding.py"
+  run_test "$_TEST_DIR/spmd/test_fsdp_v2.py"
+  run_test "$_TEST_DIR/test_gradient_accumulation.py"
+  XLA_EXPERIMENTAL=nonzero:masked_select:nms run_test "$_TEST_DIR/ds/test_dynamic_shape_models.py" -v
+  run_test "$_TEST_DIR/test_autocast.py"
+  run_test "$_TEST_DIR/test_fp8.py"
+  run_test "$_TEST_DIR/test_grad_checkpoint.py"
+  run_test "$_TEST_DIR/test_grad_checkpoint.py" "$@" --test_autocast
+  run_test "$_TEST_DIR/dynamo/test_dynamo.py"
+  run_test "$_TEST_DIR/dynamo/test_dynamo_dynamic_shape.py"
+  run_test "$_TEST_DIR/spmd/test_spmd_debugging.py"
+  XLA_PARAMETER_WRAPPING_THREADSHOLD=1 run_test "$_TEST_DIR/spmd/test_spmd_parameter_wrapping.py"
+  run_test "$_TEST_DIR/pjrt/test_dtypes.py"
+  run_test "$_TEST_DIR/pjrt/test_dynamic_plugin_tpu.py"
+  run_test "$_TEST_DIR/test_while_loop.py"
+  run_test "$_TEST_DIR/scan/test_scan.py"
+  run_test "$_TEST_DIR/scan/test_scan_spmd.py"
+  run_test "$_TEST_DIR/scan/test_scan_pallas.py"
+  run_test "$_TEST_DIR/scan/test_scan_layers.py"
+  run_test "$_TEST_DIR/test_gru.py"
+  run_test "$_TEST_DIR/test_assume_pure.py"
+  run_test "$_TEST_DIR/test_assume_pure_spmd.py"
+  run_test "$_TEST_DIR/test_as_stride_use_slice.py"
+  run_xla_hlo_debug run_test "$_TEST_DIR/scan/test_scan_debug.py"
+  run_test "$_TEST_DIR/test_pallas.py" -v
+  run_test "$_TEST_DIR/test_pallas_spmd.py"
+  XLA_DISABLE_FUNCTIONALIZATION=1 run_test "$_TEST_DIR/test_pallas_spmd.py"
+  run_test "$_TEST_DIR/test_splash_attention.py"
+  run_test "$_TEST_DIR/test_profiler_session.py"
+  run_test "$_TEST_DIR/test_multi_queries_paged_attention_kernel.py"
+  run_test "$_TEST_DIR/test_ragged_paged_attention_kernel.py"
+  run_test "$_TEST_DIR/test_input_output_aliases.py"
+  run_test "$_TEST_DIR/test_gmm.py"
+  run_test "$_TEST_DIR/eager/test_eager_spmd.py"
+  run_test "$_TEST_DIR/torch_distributed/test_torch_distributed_all_gather_xla_backend.py"
+  run_test "$_TEST_DIR/torch_distributed/test_torch_distributed_all_reduce_xla_backend.py"
+  run_test "$_TEST_DIR/torch_distributed/test_torch_distributed_multi_all_reduce_xla_backend.py"
+  run_test "$_TEST_DIR/torch_distributed/test_torch_distributed_reduce_scatter_xla_backend.py"
+  run_test "$_TEST_DIR/quantized_ops/test_dot_general.py"
+  run_xla_ir_hlo_debug run_test "$_TEST_DIR/test_user_computation_debug_cache.py"
+  run_test "$_TEST_DIR/test_data_type.py"
+  run_test "$_TEST_DIR/test_compilation_cache_utils.py"
+
+  # run examples, each test should take <2 minutes
+  run_test "$_TEST_DIR/../examples/data_parallel/train_resnet_spmd_data_parallel.py"
+  run_test "$_TEST_DIR/../examples/fsdp/train_decoder_only_fsdp_v2.py"
+  run_test "$_TEST_DIR/../examples/train_resnet_amp.py"
+  run_test "$_TEST_DIR/../examples/train_decoder_only_base.py"
+  run_test "$_TEST_DIR/../examples/train_decoder_only_base.py" scan.decoder_with_scan.DecoderWithScan \
+      --num-steps 30  # TODO(https://github.com/pytorch/xla/issues/8632): Reduce scan tracing overhead
+
+  # HACK: don't confuse local `torch_xla` folder with installed package
+  # Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559
+  # Eager tests will take more HBM, only run them on TPU v4 CI
+  TPU_VERSION=$(python -c "import sys; sys.path.remove(''); import torch_xla; print(torch_xla._internal.tpu.version())")
+  if [[ -n "$TPU_VERSION" && "$TPU_VERSION" == "4" ]]; then
+      run_test "$_TEST_DIR/dynamo/test_traceable_collectives.py"
+      run_test "$_TEST_DIR/../examples/data_parallel/train_resnet_xla_ddp.py"
+      run_test "$_TEST_DIR/../examples/fsdp/train_resnet_fsdp_auto_wrap.py"
+      run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager.py"
+      run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_spmd_data_parallel.py"
+      run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_with_compile.py"
+      run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_multi_process.py"
+      XLA_EXPERIMENTAL=nonzero:masked_select:nms run_test "$_TEST_DIR/ds/test_dynamic_shapes.py" -v
+  fi
+
+  if [[ -n "$TPU_VERSION" && "$TPU_VERSION" != "6" ]]; then
+      # Test `tpu-info` CLI compatibility
+      run_test "$_TPU_DIR/tpu_info/test_cli.py"
+  fi
+}
 
-if [[ -n "$TPU_VERSION" && "$TPU_VERSION" != "6" ]]; then
-    # Test `tpu-info` CLI compatibility
-    run_test "$_TPU_DIR/tpu_info/test_cli.py"
+if [ "${USE_COVERAGE:-0}" != "0" ]; then  # Treat an unset USE_COVERAGE as "0", matching run_coverage.
+  PYTHONBIN="$(python -m site --user-base)/bin"
+  ls -l "$PYTHONBIN"
+  run "$@"  # Forward the script's arguments: inside run(), "$@" expands to the function's own arguments.
+  "$PYTHONBIN/coverage" combine
+  "$PYTHONBIN/coverage" lcov --omit="/tmp/*" --ignore-errors -o "$COVERAGE_FILE.info"
+else
+  run "$@"
 fi
diff --git a/torch_xla/csrc/BUILD b/torch_xla/csrc/BUILD
index 22143e984fd7..a17a8888130e 100644
--- a/torch_xla/csrc/BUILD
+++ b/torch_xla/csrc/BUILD
@@ -1,6 +1,7 @@
 load(
     "//bazel:rules_def.bzl",
     "ptxla_cc_library",
+    "cov_library",
 )
 
 genrule(
@@ -208,7 +209,7 @@ ptxla_cc_library(
     ],
 )
 
-cc_library(
+cov_library(
     name = "einsum_utilities",
     hdrs = ["ops/einsum_utilities.h"],
     deps = [
@@ -247,7 +248,7 @@ ptxla_cc_library(
     ],
 )
 
-cc_library(
+cov_library(
     name = "version",
     srcs = ["version.cpp"],
     hdrs = ["version.h"],
@@ -310,7 +311,7 @@ ptxla_cc_library(
     ],
 )
 
-cc_library(
+cov_library(
     name = "shape_helper",
     srcs = ["shape_helper.cpp"],
     hdrs = ["shape_helper.h"],
@@ -320,7 +321,7 @@ cc_library(
     ],
 )
 
-cc_library(
+cov_library(
     name = "thread_pool",
     srcs = ["thread_pool.cpp"],
     hdrs = ["thread_pool.h"],
diff --git a/torch_xla/csrc/helpers.cpp b/torch_xla/csrc/helpers.cpp
index f689d2dcfa9d..c79ff5e2e220 100644
--- a/torch_xla/csrc/helpers.cpp
+++ b/torch_xla/csrc/helpers.cpp
@@ -3,9 +3,6 @@
 #include <torch/csrc/lazy/core/helpers.h>
 #include <torch/csrc/lazy/core/util.h>
 
-#include <iterator>
-#include <limits>
-
 #include "absl/strings/str_join.h"
 #include "torch_xla/csrc/convert_ops.h"
 #include "torch_xla/csrc/dtype.h"
diff --git a/torch_xla/csrc/ops/custom_call.cpp b/torch_xla/csrc/ops/custom_call.cpp
index 82bcb40162be..b7980f75f573 100644
--- a/torch_xla/csrc/ops/custom_call.cpp
+++ b/torch_xla/csrc/ops/custom_call.cpp
@@ -13,7 +13,7 @@ CustomCall::CustomCall(
     xla::Shape output_shape, bool has_side_effect,
     const std::string& backend_config, const int api_version,
     const std::unordered_map<std::string, std::string>& frontend_attributes)
-    : XlaNode(xla_custom_call, inputs, std::move(output_shape),
+    : XlaNode(xla_custom_call, inputs, output_shape,
               /*num_outputs=*/output_shape.tuple_shapes_size(),
               torch::lazy::MHash(call_target)),
       call_target_(call_target),
diff --git a/torch_xla/csrc/ops/gpu_custom_call.cpp b/torch_xla/csrc/ops/gpu_custom_call.cpp
index 708bb91a7b51..26581f94899b 100644
--- a/torch_xla/csrc/ops/gpu_custom_call.cpp
+++ b/torch_xla/csrc/ops/gpu_custom_call.cpp
@@ -9,7 +9,7 @@ namespace torch_xla {
 GpuCustomCall::GpuCustomCall(torch::lazy::OpList inputs,
                              xla::Shape output_shape,
                              const std::string& payload)
-    : XlaNode(xla_gpu_custom_call, inputs, std::move(output_shape),
+    : XlaNode(xla_gpu_custom_call, inputs, output_shape,  // no move; used below
               /*num_outputs=*/output_shape.tuple_shapes_size(),
               torch::lazy::MHash(payload)),
       payload_(payload) {}
diff --git a/torch_xla/csrc/ops/tpu_custom_call.cpp b/torch_xla/csrc/ops/tpu_custom_call.cpp
index a17d8f2c74d7..2a902ae4f387 100644
--- a/torch_xla/csrc/ops/tpu_custom_call.cpp
+++ b/torch_xla/csrc/ops/tpu_custom_call.cpp
@@ -9,7 +9,7 @@ namespace torch_xla {
 TpuCustomCall::TpuCustomCall(torch::lazy::OpList inputs,
                              xla::Shape output_shape,
                              const std::string& payload)
-    : XlaNode(xla_tpu_custom_call, inputs, std::move(output_shape),
+    : XlaNode(xla_tpu_custom_call, inputs, output_shape,  // no move; used below
               /*num_outputs=*/output_shape.tuple_shapes_size(),
               torch::lazy::MHash(payload)),
       payload_(payload) {}
diff --git a/torch_xla/csrc/runtime/stablehlo_composite_helper.cpp b/torch_xla/csrc/runtime/stablehlo_composite_helper.cpp
index 101b36908555..51a065f9a271 100644
--- a/torch_xla/csrc/runtime/stablehlo_composite_helper.cpp
+++ b/torch_xla/csrc/runtime/stablehlo_composite_helper.cpp
@@ -16,8 +16,6 @@
 namespace torch_xla {
 namespace runtime {
 
-namespace {
-
 using nlohmann::json;
 
 static bool IsXlaMarkTensorOp(mlir::Operation* op) {
@@ -529,8 +527,6 @@ class RemoveXlaMarkTensorOpsPass
   }
 };
 
-}  // namespace
-
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 CreateBuildStableHLOCompositePass() {
   return std::make_unique<BuildStableHLOCompositePass>();
diff --git a/torch_xla/csrc/runtime/xla_mlir_debuginfo_helper.cpp b/torch_xla/csrc/runtime/xla_mlir_debuginfo_helper.cpp
index 5aee4e828b83..1eff2f4496e1 100644
--- a/torch_xla/csrc/runtime/xla_mlir_debuginfo_helper.cpp
+++ b/torch_xla/csrc/runtime/xla_mlir_debuginfo_helper.cpp
@@ -9,8 +9,6 @@
 namespace torch_xla {
 namespace runtime {
 
-namespace {
-
 // Defined in torch_xla/experimental/xla_mlir_debuginfo.py
 static constexpr char XLA_MLIR_DEBUGINFO_BEGIN[] = "<XLA_MLIR_DEBUGINFO_BEGIN>";
 static constexpr char XLA_MLIR_DEBUGINFO_END[] = "<XLA_MLIR_DEBUGINFO_END>";
@@ -81,8 +79,6 @@ class PrepareXlaMlirDebuginfoPass : public mlir::OperationPass<mlir::ModuleOp> {
   }
 };
 
-}  // namespace
-
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 CreatePrepareXlaMlirDebuginfoPass() {
   return std::make_unique<PrepareXlaMlirDebuginfoPass>();