diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh index 6d35cfb2dabe..ff5e7fa90cd9 100755 --- a/.github/scripts/run_tests.sh +++ b/.github/scripts/run_tests.sh @@ -1,25 +1,16 @@ set -ex + function run_torch_xla_python_tests() { XLA_DIR=$1 USE_COVERAGE="${2:-0}" pushd $XLA_DIR echo "Running Python Tests" + ./test/run_tests.sh if [ "$USE_COVERAGE" != "0" ]; then - pip install coverage==6.5.0 --upgrade - pip install coverage-lcov - pip install toml - ./test/run_tests.sh coverage combine - mkdir lcov && cp .coverage lcov/ - coverage-lcov --data_file_path lcov/.coverage - coverage html - cp lcov.info htmlcov/ - mv htmlcov ~/ - chmod -R 755 ~/htmlcov - else - ./test/run_tests.sh + coverage-lcov --data_file_path $COVERAGE_FILE --output_file_path $COVERAGE_FILE.info fi popd } @@ -81,6 +72,10 @@ function run_torch_xla_benchmark_tests() { echo "Running Torchbench Tests" test/benchmarks/run_torchbench_tests.sh "${TORCHBENCH_MODELS[@]}" popd + if [ "$USE_COVERAGE" != "0" ]; then + coverage combine + coverage-lcov --data_file_path $COVERAGE_FILE --output_file_path $COVERAGE_FILE.info + fi } PYTORCH_DIR=$1 diff --git a/.github/workflows/_build_plugin.yml b/.github/workflows/_build_plugin.yml index 15a750acc629..2a4899394029 100644 --- a/.github/workflows/_build_plugin.yml +++ b/.github/workflows/_build_plugin.yml @@ -16,11 +16,6 @@ on: type: number description: Timeout in minutes for the build job default: 120 - has_code_changes: - required: false - type: string - description: Whether to run full workflow or not - default: 'true' secrets: gcloud-service-key: required: true @@ -37,31 +32,23 @@ jobs: BAZEL_REMOTE_CACHE: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} steps: - name: Checkout actions - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: sparse-checkout: | .github/workflows/setup path: .actions - name: Setup - if: inputs.has_code_changes == 'true' uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} cuda: true - name: Build - if: inputs.has_code_changes == 'true' shell: bash run: | cd pytorch/xla/infra/ansible ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5,8.6 src_root=${GITHUB_WORKSPACE} cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps - name: Upload wheel - if: inputs.has_code_changes == 'true' uses: actions/upload-artifact@v4 with: name: cuda-plugin path: /dist/*.whl - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." diff --git a/.github/workflows/_build_torch_with_cuda.yml b/.github/workflows/_build_torch_with_cuda.yml index 8e5db19d5b79..ad8bf50632e4 100644 --- a/.github/workflows/_build_torch_with_cuda.yml +++ b/.github/workflows/_build_torch_with_cuda.yml @@ -20,11 +20,6 @@ on: type: number description: Timeout in minutes for the build job default: 120 - has_code_changes: - required: false - type: string - description: Whether to run full workflow or not - default: 'true' jobs: build: runs-on: ${{ inputs.runner }} @@ -38,31 +33,23 @@ jobs: MAX_JOBS: 24 steps: - name: Checkout actions - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: sparse-checkout: | .github/workflows/setup path: .actions - name: Setup - if: inputs.has_code_changes == 'true' uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} cuda: true - name: Build PyTorch with CUDA enabled - if: inputs.has_code_changes == 'true' shell: bash run: | cd pytorch python setup.py bdist_wheel - name: Upload wheel - if: inputs.has_code_changes == 'true' uses: actions/upload-artifact@v4 with: name: torch-with-cuda path: pytorch/dist/*.whl - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml index 7988a09ce237..aea671c095d5 100644 --- a/.github/workflows/_build_torch_xla.yml +++ b/.github/workflows/_build_torch_xla.yml @@ -20,11 +20,6 @@ on: type: number description: Timeout in minutes for the build job default: 120 - has_code_changes: - required: false - type: string - description: Whether to run full workflow or not - default: 'true' secrets: gcloud-service-key: required: true @@ -40,41 +35,38 @@ jobs: BAZEL_REMOTE_CACHE: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} BAZEL_JOBS: 16 BUILD_CPP_TESTS: 1 + GCNO_OUTPUT_DIR: "/tmp/torch-xla-gcno" steps: # Need to check out local composite actions before using them # https://github.com/orgs/community/discussions/11771 - name: Checkout actions - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: sparse-checkout: | .github/workflows/setup path: .actions - name: Setup - if: inputs.has_code_changes == 'true' uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} - name: Build - if: inputs.has_code_changes == 'true' shell: bash run: | cd pytorch/xla/infra/ansible ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps - name: Upload wheel - if: inputs.has_code_changes == 'true' uses: actions/upload-artifact@v4 with: name: torch-xla-wheels path: /dist/*.whl - name: Upload CPP test binaries - if: inputs.has_code_changes == 'true' uses: actions/upload-artifact@v4 with: name: cpp-test-bin path: /tmp/test/bin - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." + - name: Upload GCNO files + uses: actions/upload-artifact@v4 + with: + name: torch-xla-gcnos + path: ${{ env.GCNO_OUTPUT_DIR }} diff --git a/.github/workflows/_check_code_changes.yml b/.github/workflows/_check_code_changes.yml deleted file mode 100644 index 8615efd09264..000000000000 --- a/.github/workflows/_check_code_changes.yml +++ /dev/null @@ -1,111 +0,0 @@ -name: Check Code Changes - -on: - workflow_call: - inputs: - event_name: - required: true - type: string - # For pull_request, base_sha is github.event.pull_request.base.sha (target branch tip) - # For push, base_sha is github.event.before - base_sha: - required: true - type: string - # For pull_request, head_sha is github.event.pull_request.head.sha (PR branch tip) - # For push, head_sha is github.sha - head_sha: - required: true - type: string - outputs: - has_code_changes: - description: "True if non-markdown code files were changed or event is workflow_dispatch/schedule, false otherwise." - value: ${{ jobs.check_files.outputs.has_code_changes }} - -jobs: - check_files: - runs-on: ubuntu-24.04 - outputs: - has_code_changes: ${{ steps.perform_check.outputs.has_code_changes }} - steps: - - name: Checkout code for diff (if needed) - # Checkout only if a diff is actually needed - if: inputs.event_name != 'workflow_dispatch' && inputs.event_name != 'schedule' - uses: actions/checkout@v4 - with: - # Fetch all history for all branches and tags. - # This is necessary for `git diff A...B` (three-dot diff) to find the merge base - # and correctly diff PR changes against the point where it diverged. - # It's also needed for `git diff A B` if A and B are far apart. - fetch-depth: 0 - - - name: Perform file content check - id: perform_check - run: | - echo "Event Name: ${{ inputs.event_name }}" - echo "Base SHA input (for PR: target branch; for Push: before SHA): ${{ inputs.base_sha }}" - echo "Head SHA input (for PR: PR head; for Push: current SHA): ${{ inputs.head_sha }}" - - # Handle workflow_dispatch and schedule events first - if [[ "${{ inputs.event_name }}" == "workflow_dispatch" || "${{ inputs.event_name }}" == "schedule" ]]; then - echo "Event is ${{ inputs.event_name }}. Assuming code changes or full run needed." - echo "has_code_changes=true" >> "$GITHUB_OUTPUT" - exit 0 # Exit early, no diff needed - fi - - # Handle initial push (base SHA is all zeros) - # For an initial push, all files in the head_sha are considered "changed" (new). - if [[ "${{ inputs.base_sha }}" == "0000000000000000000000000000000000000000" ]]; then - echo "Initial push (base SHA is zeros). Assuming code changes." - # We can list all files in the current commit (inputs.head_sha) if needed, - # but for simplicity, just assuming code changes is often sufficient. - # To be precise, one could do: git ls-tree -r --name-only ${{ inputs.head_sha }} > changed_files.txt - # And then apply the markdown filter. For now, we'll assume changes. - echo "has_code_changes=true" >> "$GITHUB_OUTPUT" - exit 0 - fi - - # Handle cases where base and head are the same (e.g., re-run on a specific commit, or a push with no new commits) - # This can happen if a workflow is re-run, or if a branch is pushed without new commits (e.g., force push to same SHA). - if [[ "${{ inputs.base_sha }}" == "${{ inputs.head_sha }}" ]]; then - echo "Base SHA is the same as Head SHA. No file changes. Assuming no code changes for skipping purposes." - echo "has_code_changes=false" >> "$GITHUB_OUTPUT" - exit 0 - fi - - # Ensure SHAs are valid before attempting diff - # (git rev-parse --verify will exit with non-zero if SHA is not found) - git rev-parse --verify ${{ inputs.base_sha }}^{commit} >/dev/null 2>&1 || { echo "Error: Base SHA ${{ inputs.base_sha }} not found or invalid."; exit 1; } - git rev-parse --verify ${{ inputs.head_sha }}^{commit} >/dev/null 2>&1 || { echo "Error: Head SHA ${{ inputs.head_sha }} not found or invalid."; exit 1; } - - - # Determine the diff command based on the event type - if [[ "${{ inputs.event_name }}" == "pull_request" ]]; then - # For pull requests, use three-dot diff (A...B). - # This shows changes on the PR branch (inputs.head_sha) - # since it diverged from the target branch (inputs.base_sha). - # inputs.base_sha is github.event.pull_request.base.sha - # inputs.head_sha is github.event.pull_request.head.sha - echo "Pull Request: Diffing ${{ inputs.base_sha }}...${{ inputs.head_sha }}" - git diff --name-only --no-renames ${{ inputs.base_sha }}...${{ inputs.head_sha }} > changed_files.txt - else # For 'push' and potentially other events not explicitly handled above - # For pushes, use two-dot diff (A B). - # inputs.base_sha is github.event.before - # inputs.head_sha is github.sha - echo "Push or other event: Diffing ${{ inputs.base_sha }} ${{ inputs.head_sha }}" - git diff --name-only --no-renames ${{ inputs.base_sha }} ${{ inputs.head_sha }} > changed_files.txt - fi - - echo "Changed files:" - cat changed_files.txt - - if [ ! -s changed_files.txt ]; then # Check if changed_files.txt is empty - echo "No files changed in the diff." - echo "has_code_changes=false" >> "$GITHUB_OUTPUT" - elif grep -q -v -E '\.md$' changed_files.txt; then - echo "Non-markdown code changes detected." - echo "has_code_changes=true" >> "$GITHUB_OUTPUT" - else - echo "Only markdown changes detected or no non-markdown changes found in diff." - echo "has_code_changes=false" >> "$GITHUB_OUTPUT" - fi - shell: bash diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index 0ca3fa33475f..f6a8d1b0814f 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -11,11 +11,6 @@ on: type: string description: Runner type for the test default: linux.4xlarge - has_code_changes: - required: false - type: string - description: Whether to run full workflow or not - default: 'true' secrets: torchxla-bot-token: required: true @@ -29,50 +24,42 @@ jobs: BRANCH_NAME: ${{ github.ref_name }} steps: - name: Fetch wheels - if: inputs.has_code_changes == 'true' uses: actions/download-artifact@v4 with: name: torch-xla-wheels path: /tmp/wheels/ - name: Install wheels - if: inputs.has_code_changes == 'true' shell: bash run: | pip install /tmp/wheels/*.whl - name: Checkout PyTorch/XLA Repo - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: path: pytorch/xla - name: Build docs - if: inputs.has_code_changes == 'true' shell: bash run: | cd pytorch/xla/docs pip install -r requirements.txt sphinx-build -b html source build - name: Checkout GitHub Pages - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: path: gh-pages ref: gh-pages token: ${{ github.event_name == 'push' && secrets.torchxla-bot-token || github.token }} - name: Merge changes - if: inputs.has_code_changes == 'true' shell: bash run: | subdir=${{ env.BRANCH_NAME == 'master' && 'master' || format('{0}/{1}', 'release', env.BRANCH_NAME) }} mkdir -p gh-pages/$subdir cp -fR pytorch/xla/docs/build/* gh-pages/$subdir - name: Upload preview as artifact - if: inputs.has_code_changes == 'true' uses: actions/upload-artifact@v4 with: name: github-pages path: pytorch/xla/docs/build/ - name: Deploy - if: inputs.has_code_changes == 'true' && github.event_name == 'push' shell: bash run: | cd gh-pages @@ -81,7 +68,4 @@ jobs: git add . -v git diff --cached --exit-code || git commit -m "Update doc from commit ${{ github.sha }}" git push origin gh-pages - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." + if: github.event_name == 'push' diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml index 2b1ed0cb7f95..07fb978d7e92 100644 --- a/.github/workflows/_test.yml +++ b/.github/workflows/_test.yml @@ -29,14 +29,14 @@ on: default: false description: Whether to install CUDA plugin package torch-commit: - required: true - type: string - description: torch-commit - has_code_changes: - required: false + required: true + type: string + description: torch-commit + device-type: + required: true type: string - description: Whether to run full workflow or not - default: 'true' + description: Device type for naming the coverage results. + secrets: gcloud-service-key: required: true @@ -46,30 +46,38 @@ jobs: runs-on: ${{ inputs.runner }} container: image: ${{ inputs.dev-image }} - options: "${{ inputs.install-cuda-plugin == true && '--gpus all' || '' }} --shm-size 16g" + options: "${{ inputs.install-cuda-plugin && '--gpus all' || '' }} --shm-size 16g" strategy: fail-fast: false matrix: include: # Use readable strings as they define the workflow titles. - - run_benchmark_tests: 'benchmark_tests' - - run_python_tests: 'python_tests' + - name: 'benchmark_tests' + run_benchmark_tests: 'benchmark_tests' + - name: 'python_tests-xla_op1' + run_python_tests: 'python_tests' run_xla_op_tests1: 'xla_op1' - - run_python_tests: 'python_tests' + - name: 'python_tests-xla_op2' + run_python_tests: 'python_tests' run_xla_op_tests2: 'xla_op2' - - run_python_tests: 'python_tests' + - name: 'python_tests-xla_op3' + run_python_tests: 'python_tests' run_xla_op_tests3: 'xla_op3' - - run_python_tests: 'python_tests' + - name: 'python_tests-torch_mp_op' + run_python_tests: 'python_tests' run_torch_mp_op_tests: 'torch_mp_op' - - run_cpp_tests: 'cpp_tests' + - name: 'cpp_tests-1' + run_cpp_tests: 'cpp_tests' run_cpp_tests1: 'cpp_tests1' - - run_cpp_tests: 'cpp_tests' + - name: 'cpp_tests-2' + run_cpp_tests: 'cpp_tests' run_cpp_tests2: 'cpp_tests2' timeout-minutes: ${{ inputs.timeout-minutes }} env: GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }} GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }} + COVERAGE_DIR: '/tmp/lcov' RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }} RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }} RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }} @@ -82,14 +90,12 @@ jobs: BAZEL_REMOTE_CACHE: 1 steps: - name: Checkout actions - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: sparse-checkout: | .github/workflows/setup path: .actions - name: Setup - if: inputs.has_code_changes == 'true' uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} @@ -97,42 +103,38 @@ jobs: wheels-artifact: torch-xla-wheels cuda-plugin-artifact: ${{ inputs.install-cuda-plugin && 'cuda-plugin' || null }} - name: Fetch CPP test binaries - if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests uses: actions/download-artifact@v4 with: name: cpp-test-bin path: /tmp/test/bin + if: ${{ matrix.run_cpp_tests }} # GitHub Actions doesn't preserve executable permissions # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss - name: Set CPP test permissions - if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests run: | chmod +x /tmp/test/bin/* ls -l /tmp/test/bin + if: ${{ matrix.run_cpp_tests }} - name: Check GPU - if: inputs.has_code_changes == 'true' && inputs.install-cuda-plugin run: nvidia-smi + if: ${{ inputs.install-cuda-plugin }} - name: Install test deps - if: inputs.has_code_changes == 'true' shell: bash run: | # TODO: Add these in setup.py pip install fsspec pip install rich - name: Checkout PyTorch Repo - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: repository: pytorch/pytorch path: pytorch ref: ${{ inputs.torch-commit }} - name: Checkout PyTorch/XLA Repo - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: path: pytorch/xla - name: Extra CI deps - if: inputs.has_code_changes == 'true' shell: bash run: | set -x @@ -142,56 +144,22 @@ jobs: if [[ ! -z "$RUN_BENCHMARK_TESTS" ]]; then pip install -r pytorch/xla/benchmarks/requirements.txt fi + - name: Extra Coverage deps + shell: bash + run: | + set -x + pip install -U coverage coverage-lcov + if: ${{ inputs.collect-coverage }} - name: Test - if: inputs.has_code_changes == 'true' shell: bash + env: + COVERAGE_FILE: "${{ env.COVERAGE_DIR }}/py-coverage" + GCOV_PREFIX: "${{ env.COVERAGE_DIR }}/cpp-coverage" run: pytorch/xla/.github/scripts/run_tests.sh pytorch/ pytorch/xla/ $USE_COVERAGE - name: Upload coverage results - if: inputs.has_code_changes == 'true' && inputs.collect-coverage - shell: bash - env: - CIRCLE_WORKFLOW_ID: ${{ github.run_id }} - CIRCLE_BUILD_NUM: ${{ github.run_number }} - BENCHMARK_TEST_NAME: ${{ env.RUN_BENCHMARK_TESTS }} - PYTHON_TEST_NAME: ${{ env.RUN_PYTHON_TESTS }}${{ env.RUN_XLA_OP_TESTS1 }}${{ env.RUN_XLA_OP_TESTS2 }}${{ env.RUN_XLA_OP_TESTS3 }}${{ env.RUN_TORCH_MP_OP_TESTS }} - CPP_TEST_NAME: ${{ env.RUN_CPP_TESTS1 }}${{ env.RUN_CPP_TESTS2 }} - run: | - # TODO(yeounoh) collect coverage report as needed. - if [ -n "${BENCHMARK_TEST_NAME}" ]; then - exit 0 - fi - docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}" - if [ -n "${GPU_FLAG:-}" ]; then - if [ -n "${PYTHON_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out - fi - if [ -n "${CPP_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out - fi - else - if [ -n "${PYTHON_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out - fi - - if [ -n "${CPP_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out - fi - - if [ "${CPP_TEST_NAME}" == "cpp_tests1" ]; then - ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' - echo $ABS_METADATA > abs_metadata.json - gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json + uses: actions/upload-artifact@v4 + with: + name: "lcov-${{ inputs.device-type }}-${{ matrix.name }}" + path: "${{ env.COVERAGE_DIR }}" + if: ${{ inputs.collect-coverage }} - INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' - echo $INC_METADATA > inc_metadata.json - gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json - fi - fi - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml index 2c579184e914..c15efcc44421 100644 --- a/.github/workflows/_test_requiring_torch_cuda.yml +++ b/.github/workflows/_test_requiring_torch_cuda.yml @@ -27,11 +27,7 @@ on: required: true type: string description: torch-commit - has_code_changes: - required: false - type: string - description: Whether to run full workflow or not - default: 'true' + jobs: test: container: @@ -40,26 +36,28 @@ jobs: strategy: matrix: include: - - run_python_tests: 'python_tests' + - name: 'torch_with_cuda_python_tests' + run_python_tests: 'python_tests' runner: ${{ inputs.runner }} - - run_triton_tests: 'triton_tests' + - name: 'torch_with_cuda_triton_tests' + run_triton_tests: 'triton_tests' runner: 'linux.g5.4xlarge.nvidia.gpu' runs-on: ${{ matrix.runner }} timeout-minutes: ${{ inputs.timeout-minutes }} env: USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }} + COVERAGE_DIR: '/tmp/lcov' BAZEL_JOBS: 16 BAZEL_REMOTE_CACHE: 1 + PJRT_DEVICE: 'CUDA' steps: - name: Checkout actions - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: sparse-checkout: | .github/workflows/setup path: .actions - name: Setup - if: inputs.has_code_changes == 'true' uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} @@ -68,10 +66,8 @@ jobs: cuda-plugin-artifact: cuda-plugin cuda-torch-artifact: torch-with-cuda - name: Check GPU - if: inputs.has_code_changes == 'true' run: nvidia-smi - name: Install wheels - if: inputs.has_code_changes == 'true' shell: bash run: | pip install /tmp/wheels/*.whl @@ -86,42 +82,83 @@ jobs: python -c "import torch; assert torch.cuda.is_available()" echo "CUDA is available for PyTorch." - name: Checkout PyTorch Repo - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: repository: pytorch/pytorch path: pytorch ref: ${{ inputs.torch-commit }} - name: Checkout PyTorch/XLA Repo - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: path: pytorch/xla - name: Extra CI deps - if: inputs.has_code_changes == 'true' && matrix.run_triton_tests shell: bash run: | set -x pip install -U --pre jax jaxlib "jax-cuda12-plugin[with_cuda]" jax-cuda12-pjrt -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html + if: ${{ matrix.run_triton_tests }} + - name: Extra Coverage deps + shell: bash + run: | + set -x + pip install -U coverage coverage-lcov + if: ${{ inputs.collect-coverage }} - name: Install Triton - if: inputs.has_code_changes == 'true' shell: bash run: | cd pytorch make triton - name: Python Tests - if: inputs.has_code_changes == 'true' && matrix.run_python_tests shell: bash + env: + COVERAGE_FILE: "${{ env.COVERAGE_DIR }}/py-coverage" + GCOV_PREFIX: "${{ env.COVERAGE_DIR }}/cpp-coverage" run: | set -xue - PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v - PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v + + TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')") + + function run_coverage { + if [ "$USE_COVERAGE" != "0" ]; then + coverage run --source="$TORCH_XLA_DIR" -p "$@" + else + python3 "$@" + fi + } + + run_coverage pytorch/xla/test/test_operations.py -v + run_coverage pytorch/xla/test/dynamo/test_dynamo.py -v + + coverage combine + coverage-lcov --data_file_path $COVERAGE_FILE --output_file_path $COVERAGE_FILE.info + if: ${{ matrix.run_python_tests }} - name: Triton Tests - if: inputs.has_code_changes == 'true' && matrix.run_triton_tests shell: bash + env: + COVERAGE_FILE: "${{ env.COVERAGE_DIR }}/py-coverage" + GCOV_PREFIX: "${{ env.COVERAGE_DIR }}/cpp-coverage" + TRITON_PTXAS_PATH: "/usr/local/cuda-12.3/bin/ptxas" run: | - PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.3/bin/ptxas python pytorch/xla/test/test_triton.py - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." + set -x + + TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')") + + function run_coverage { + if [ "$USE_COVERAGE" != "0" ]; then + coverage run --source="$TORCH_XLA_DIR" -p "$@" + else + python3 "$@" + fi + } + + run_coverage pytorch/xla/test/test_triton.py + + coverage combine + coverage-lcov --data_file_path $COVERAGE_FILE --output_file_path $COVERAGE_FILE.info + if: ${{ matrix.run_triton_tests }} + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: "lcov-${{ matrix.name }}" + path: "${{ env.COVERAGE_DIR }}" + if: ${{ inputs.collect-coverage }} diff --git a/.github/workflows/_tpu_ci.yml b/.github/workflows/_tpu_ci.yml index 263ee7f8e68d..4b27565c6801 100644 --- a/.github/workflows/_tpu_ci.yml +++ b/.github/workflows/_tpu_ci.yml @@ -7,31 +7,31 @@ on: type: number description: Timeout in minutes for the job run default: 120 - has_code_changes: + collect-coverage: required: false - type: string - description: Whether to run full workflow or not - default: 'true' + type: boolean + description: Set to true to collect coverage information + default: false jobs: tpu-test: runs-on: v4-runner-set timeout-minutes: ${{ inputs.timeout-minutes }} + env: + USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }} + COVERAGE_DIR: '/tmp/lcov' steps: - name: Checkout actions - if: inputs.has_code_changes == 'true' uses: actions/checkout@v4 with: sparse-checkout: | .github/workflows/setup path: .actions - name: Setup - if: inputs.has_code_changes == 'true' uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} wheels-artifact: torch-xla-wheels - name: Install test dependencies - if: inputs.has_code_changes == 'true' shell: bash run: | # TODO: Add these in setup.py @@ -41,15 +41,24 @@ jobs: # libtpu is needed for pallas tests. pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-wheels/index.html -f https://storage.googleapis.com/libtpu-releases/index.html pip install --upgrade protobuf + - name: Extra Coverage deps + shell: bash + run: | + set -x + pip install -U coverage coverage-lcov + if: ${{ inputs.collect-coverage }} - name: Run Tests - if: inputs.has_code_changes == 'true' env: PJRT_DEVICE: TPU TPU_LOG_DIR: tpu_logs + COVERAGE_FILE: "${{ env.COVERAGE_DIR }}/py-coverage" + GCOV_PREFIX: "${{ env.COVERAGE_DIR }}/cpp-coverage" run: | cd pytorch/xla test/tpu/run_tests.sh - - name: Report no code changes - if: inputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: "lcov-tpu" + path: "${{ env.COVERAGE_DIR }}" + if: ${{ inputs.collect-coverage }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4c7b108c1e71..71d17e8c84be 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -15,47 +15,32 @@ concurrency: cancel-in-progress: true jobs: - check_code_changes: - name: Check Code Changes - uses: ./.github/workflows/_check_code_changes.yml - with: - event_name: ${{ github.event_name }} - # For pull_request, use PR's base and head. For push, use event's before and sha. - base_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.base.sha || github.event.before }} - head_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} get-torch-commit: - needs: [check_code_changes] runs-on: ubuntu-24.04 outputs: torch_commit: ${{ steps.commit.outputs.torch_commit }} steps: - - name: Get latest torch commit - id: commit - if: needs.check_code_changes.outputs.has_code_changes == 'true' + - id: commit + name: Get latest torch commit run: | echo "torch_commit=$(git ls-remote https://github.com/pytorch/pytorch.git HEAD | awk '{print $1}')" >> "$GITHUB_OUTPUT" - - name: Report no code changes - if: needs.check_code_changes.outputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." build-torch-xla: name: "Build PyTorch/XLA" uses: ./.github/workflows/_build_torch_xla.yml - needs: [check_code_changes, get-torch-commit] + needs: get-torch-commit with: dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}} timeout-minutes: 240 - has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }} secrets: gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }} build-torch-with-cuda: name: "Build PyTorch with CUDA" uses: ./.github/workflows/_build_torch_with_cuda.yml - needs: [check_code_changes, get-torch-commit] + needs: get-torch-commit with: # TODO: bump CUDA version to either 12.4 or 12.6 (supported by PyTorch). # Ref: https://github.com/pytorch/xla/issues/8700 @@ -64,65 +49,61 @@ jobs: # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner. runner: linux.24xlarge timeout-minutes: 120 - has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }} build-cuda-plugin: name: "Build XLA CUDA plugin" uses: ./.github/workflows/_build_plugin.yml - needs: [check_code_changes] with: dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3 - has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }} secrets: gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }} test-python-cpu: name: "CPU tests" uses: ./.github/workflows/_test.yml - needs: [build-torch-xla, check_code_changes, get-torch-commit] + needs: [build-torch-xla, get-torch-commit] with: dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm timeout-minutes: 120 - collect-coverage: false + collect-coverage: true torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}} - has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }} + device-type: "cpu" secrets: gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }} test-cuda: name: "GPU tests" uses: ./.github/workflows/_test.yml - needs: [build-torch-xla, build-cuda-plugin, check_code_changes, get-torch-commit] + needs: [build-torch-xla, build-cuda-plugin, get-torch-commit] with: dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3 runner: linux.g4dn.12xlarge.nvidia.gpu timeout-minutes: 300 - collect-coverage: false + collect-coverage: true install-cuda-plugin: true torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}} - has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }} + device-type: "cuda" secrets: gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }} test-cuda-with-pytorch-cuda-enabled: name: "GPU tests requiring torch CUDA" uses: ./.github/workflows/_test_requiring_torch_cuda.yml - needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, check_code_changes, get-torch-commit] + needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit] with: dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3 runner: linux.8xlarge.nvidia.gpu timeout-minutes: 300 - collect-coverage: false + collect-coverage: true torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}} - has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }} test-tpu: name: "TPU tests" uses: ./.github/workflows/_tpu_ci.yml - needs: [build-torch-xla, check_code_changes] + needs: build-torch-xla with: - timeout-minutes: 300 - has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }} + timeout-minutes: 300 + collect-coverage: true if: github.event_name == 'push' || github.event_name == 'pull_request' push-docs: diff --git a/.github/workflows/lintercheck.yml b/.github/workflows/lintercheck.yml index 27e3769ac99b..e45454e475f3 100644 --- a/.github/workflows/lintercheck.yml +++ b/.github/workflows/lintercheck.yml @@ -8,23 +8,12 @@ on: - r[0-9]+.[0-9]+ jobs: - check_code_changes: - name: Check Code Changes - uses: ./.github/workflows/_check_code_changes.yml - with: - event_name: ${{ github.event_name }} - # For pull_request, use PR's base and head. For push, use event's before and sha. - base_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.base.sha || github.event.before }} - head_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} linter_check: runs-on: ubuntu-24.04 - needs: [check_code_changes] steps: - name: Checkout repo - if: needs.check_code_changes.outputs.has_code_changes == 'true' uses: actions/checkout@v3 - name: Setup Python - if: needs.check_code_changes.outputs.has_code_changes == 'true' uses: actions/setup-python@v4 with: python-version: '3.10' @@ -32,9 +21,7 @@ jobs: - run: pip install yapf==0.40.2 # N.B.: keep in sync with `torchax/dev-requirements.txt`, `infra/ansible/config/pip.yaml` - name: Check no TORCH_PIN - if: > - (github.event_name == 'push' && github.event.ref == 'refs/heads/master') && - needs.check_code_changes.outputs.has_code_changes == 'true' + if: github.event_name == 'push' && github.event.ref == 'refs/heads/master' shell: bash run: | TORCH_PIN=./.torch_pin @@ -45,7 +32,6 @@ jobs: echo "No ${TORCH_PIN} found, safe to land..." fi - name: Run clang-format - if: needs.check_code_changes.outputs.has_code_changes == 'true' shell: bash env: CLANG_FORMAT: clang-format-16 @@ -71,7 +57,6 @@ jobs: echo "PASSED C++ format" fi - name: Run yapf - if: needs.check_code_changes.outputs.has_code_changes == 'true' shell: bash run: | git_status=$(git status --porcelain) @@ -92,7 +77,3 @@ jobs: else echo "PASSED Python format" fi - - name: Report no code changes - if: needs.check_code_changes.outputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." diff --git a/.github/workflows/torchax.yml b/.github/workflows/torchax.yml index deb1cf165d2e..b04e3c24c9f7 100644 --- a/.github/workflows/torchax.yml +++ b/.github/workflows/torchax.yml @@ -15,41 +15,28 @@ concurrency: cancel-in-progress: true jobs: - check_code_changes: - name: Check Code Changes - uses: ./.github/workflows/_check_code_changes.yml - with: - event_name: ${{ github.event_name }} - # For pull_request, use PR's base and head. For push, use event's before and sha. - base_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.base.sha || github.event.before }} - head_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} torchax-cpu: runs-on: ubuntu-24.04 - needs: [check_code_changes] strategy: matrix: python-version: ['3.10'] steps: - name: Checkout repo - if: needs.check_code_changes.outputs.has_code_changes == 'true' uses: actions/checkout@v4 with: sparse-checkout: | torchax - name: Setup Python - if: needs.check_code_changes.outputs.has_code_changes == 'true' uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install - if: needs.check_code_changes.outputs.has_code_changes == 'true' shell: bash working-directory: torchax run: | pip install -r test-requirements.txt pip install -e .[cpu] - name: Run tests - if: needs.check_code_changes.outputs.has_code_changes == 'true' working-directory: torchax shell: bash run: | @@ -72,8 +59,3 @@ jobs: pytest test/test_view.py pytest test/test_util.py XLA_FLAGS=--xla_force_host_platform_device_count=4 pytest -n 0 test_dist/ - echo "Tests completed." - - name: Report no code changes - if: needs.check_code_changes.outputs.has_code_changes == 'false' - run: | - echo "No code changes were detected that require running the full test suite." diff --git a/BUILD b/BUILD index c9a2578c722e..6a8c210f86ae 100644 --- a/BUILD +++ b/BUILD @@ -5,6 +5,7 @@ load( load("@python//:defs.bzl", "compile_pip_requirements") load("@python_version_repo//:py_version.bzl", "REQUIREMENTS") +load("//bazel:rules_def.bzl", "cov_library") compile_pip_requirements( name = "requirements", @@ -26,11 +27,18 @@ cc_binary( "-fopenmp", "-fPIC", "-fwrapv", + "-fprofile-arcs", + "-ftest-coverage", + "-O0", + "-g", ], linkopts = [ "-Wl,-rpath,$$ORIGIN/torch_xla/lib", # for libtpu "-Wl,-soname,_XLAC.so", "-lstdc++fs", # For std::filesystem + "-fprofile-arcs", + "-ftest-coverage", + "-lgcov", ], linkshared = 1, visibility = ["//visibility:public"], @@ -51,9 +59,16 @@ cc_binary( copts = [ "-fopenmp", "-fPIC", + "-fprofile-arcs", + "-ftest-coverage", + "-O0", + "-g", ], linkopts = [ "-Wl,-soname,_XLAC_cuda_functions.so", + "-fprofile-arcs", + "-ftest-coverage", + "-lgcov", ], linkshared = 1, visibility = ["//visibility:public"], diff --git a/bazel/rules_def.bzl b/bazel/rules_def.bzl index 1645514acbcb..dd31b06e9a7a 100644 --- a/bazel/rules_def.bzl +++ b/bazel/rules_def.bzl @@ -5,11 +5,23 @@ load( "xla_cc_test", ) -def ptxla_cc_library( +def cov_library( deps = [], copts = [], + linkopts = [], **kwargs): native.cc_library( + copts = copts + ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"], + linkopts = linkopts + ["-fprofile-arcs", "-ftest-coverage"], + deps = deps, + **kwargs + ) + +def ptxla_cc_library( + deps = [], + copts = [], + **kwargs): + cov_library( copts = copts + ["-isystemexternal/torch"], # Required for system includes. deps = deps + [ "@torch//:headers", diff --git a/infra/ansible/roles/build_srcs/tasks/main.yaml b/infra/ansible/roles/build_srcs/tasks/main.yaml index db5562acb5c9..e028728ebc94 100644 --- a/infra/ansible/roles/build_srcs/tasks/main.yaml +++ b/infra/ansible/roles/build_srcs/tasks/main.yaml @@ -192,6 +192,25 @@ chdir: "{{ src_root }}" when: build_cpp_tests +- name: Collect GCNO files + ansible.builtin.shell: | + set -x + + # Bazel output directory. + OUTPUT_BASE=$(bazel info output_base) + echo "OUTPUT_BASE: $OUTPUT_BASE" + + CSRC=$(find "$OUTPUT_BASE/" -path "*/bazel-out/k8-opt/bin/torch_xla/csrc") + echo "CSRC: $CSRC" + + # Go to the directory with the build files of torch_xla/csrc. + cd "$CSRC" + + # Find all GCNO files, and install them into a temporary directory. + find . -name "*.gcno" -exec install -D \{} $GCNO_OUTPUT_DIR/\{} \; + + set +x + - name: Read Torchvision pin ansible.builtin.command: cat {{ (src_root, 'pytorch') | path_join }}/.github/ci_commit_pins/vision.txt register: torchvision_pin diff --git a/test/benchmarks/run_tests.sh b/test/benchmarks/run_tests.sh index fce6140a4fec..3769341dd9b2 100755 --- a/test/benchmarks/run_tests.sh +++ b/test/benchmarks/run_tests.sh @@ -4,8 +4,11 @@ CDIR="$(cd "$(dirname "$0")" ; pwd -P)" LOGFILE=/tmp/pytorch_benchmarks_test.log VERBOSITY=0 +BENCHMARKS_DIR="$CDIR/../../benchmarks/" +TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')") + # Make benchmark module available as it is not part of torch_xla. -export PYTHONPATH=$PYTHONPATH:$CDIR/../../benchmarks/ +export PYTHONPATH=$PYTHONPATH:$BENCHMARKS_DIR # Note [Keep Going] # @@ -30,6 +33,14 @@ do done shift $(($OPTIND - 1)) +function run_coverage { + if [ "${USE_COVERAGE:-0}" != "0" ]; then + coverage run --source="$TORCH_XLA_DIR,$BENCHMARKS_DIR" -p "$@" + else + python3 "$@" + fi +} + function run_make_tests { MAKE_V="" if [ "$VERBOSITY" != "0" ]; then @@ -42,10 +53,10 @@ function run_python_tests { # HACK: don't confuse local `torch_xla` folder with installed package # Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559 pushd $CDIR - python3 "test_experiment_runner.py" - python3 "test_benchmark_experiment.py" - python3 "test_benchmark_model.py" - python3 "test_result_analyzer.py" + run_coverage "test_experiment_runner.py" + run_coverage "test_benchmark_experiment.py" + run_coverage "test_benchmark_model.py" + run_coverage "test_result_analyzer.py" popd } diff --git a/test/run_tests.sh b/test/run_tests.sh index 243f8ee365cd..82247f759de9 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -46,7 +46,6 @@ export XLA_DUMP_FATAL_STACK=1 export CPU_NUM_DEVICES=4 TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')") -COVERAGE_FILE="$CDIR/../.coverage" function run_coverage { if [ "${USE_COVERAGE:-0}" != "0" ]; then diff --git a/test/tpu/run_tests.sh b/test/tpu/run_tests.sh index f0df6dec8651..69c791777487 100755 --- a/test/tpu/run_tests.sh +++ b/test/tpu/run_tests.sh @@ -2,93 +2,109 @@ set -xue CDIR="$(cd "$(dirname "$0")" ; pwd -P)" TEST_CDIR="$(dirname "$CDIR")" +TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')") source "${TEST_CDIR}/utils/run_tests_utils.sh" -# TODO: merge with other run_tests -(cd $TEST_CDIR && python3 -m unittest test_mat_mul_precision.TestMatMulPrecision.test_high) -(cd $TEST_CDIR && python3 -m unittest test_mat_mul_precision.TestMatMulPrecision.test_default) -(cd $TEST_CDIR && python3 -m unittest test_mat_mul_precision.TestMatMulPrecision.test_highest) -(cd $TEST_CDIR && python3 -m unittest test_mat_mul_precision.TestMatMulPrecision.test_all) -python3 "$TEST_CDIR/test_mat_mul_precision_get_and_set.py" -python3 "$TEST_CDIR/test_operations.py" -v -python3 "$TEST_CDIR/pjrt/test_runtime_tpu.py" -python3 "$TEST_CDIR/pjrt/test_collective_ops_tpu.py" -python3 "$TEST_CDIR/spmd/test_mp_input_sharding.py" -python3 "$TEST_CDIR/test_mp_collective_matmul.py" -run_save_tensor_hlo python3 "$TEST_CDIR/spmd/test_spmd_lowering_context.py" -python3 "$TEST_CDIR/spmd/test_xla_sharding.py" -python3 "$TEST_CDIR/spmd/test_xla_virtual_device.py" -python3 "$TEST_CDIR/spmd/test_xla_distributed_checkpoint.py" -python3 "$TEST_CDIR/spmd/test_train_spmd_linear_model.py" -python3 "$TEST_CDIR/spmd/test_xla_spmd_python_api_interaction.py" -python3 "$TEST_CDIR/spmd/test_xla_auto_sharding.py" -python3 "$TEST_CDIR/spmd/test_fsdp_v2.py" -python3 "$TEST_CDIR/test_gradient_accumulation.py" -XLA_EXPERIMENTAL=nonzero:masked_select:nms python3 "$TEST_CDIR/ds/test_dynamic_shape_models.py" -v -python3 "$TEST_CDIR/test_autocast.py" -python3 "$TEST_CDIR/test_fp8.py" -python3 "$TEST_CDIR/test_grad_checkpoint.py" -python3 "$TEST_CDIR/test_grad_checkpoint.py" "$@" --test_autocast -python3 "$TEST_CDIR/dynamo/test_dynamo.py" -python3 "$TEST_CDIR/dynamo/test_dynamo_dynamic_shape.py" -python3 "$TEST_CDIR/spmd/test_spmd_debugging.py" -XLA_PARAMETER_WRAPPING_THREADSHOLD=1 python3 "$TEST_CDIR/spmd/test_spmd_parameter_wrapping.py" -python3 "$TEST_CDIR/pjrt/test_dtypes.py" -python3 "$TEST_CDIR/pjrt/test_dynamic_plugin_tpu.py" -python3 "$TEST_CDIR/test_while_loop.py" -python3 "$TEST_CDIR/scan/test_scan.py" -python3 "$TEST_CDIR/scan/test_scan_spmd.py" -python3 "$TEST_CDIR/scan/test_scan_pallas.py" -python3 "$TEST_CDIR/scan/test_scan_layers.py" -python3 "$TEST_CDIR/test_gru.py" -python3 "$TEST_CDIR/test_assume_pure.py" -python3 "$TEST_CDIR/test_assume_pure_spmd.py" -python3 "$TEST_CDIR/test_as_stride_use_slice.py" -run_xla_hlo_debug python3 "$TEST_CDIR/scan/test_scan_debug.py" -python3 "$TEST_CDIR/test_pallas.py" -v -python3 "$TEST_CDIR/test_pallas_spmd.py" -XLA_DISABLE_FUNCTIONALIZATION=1 python3 "$TEST_CDIR/test_pallas_spmd.py" -python3 "$TEST_CDIR/test_splash_attention.py" -python3 "$TEST_CDIR/test_profiler_session.py" -python3 "$TEST_CDIR/test_multi_queries_paged_attention_kernel.py" -python3 "$TEST_CDIR/test_ragged_paged_attention_kernel.py" -python3 "$TEST_CDIR/test_input_output_aliases.py" -python3 "$TEST_CDIR/test_gmm.py" -python3 "$TEST_CDIR/eager/test_eager_spmd.py" -python3 "$TEST_CDIR/torch_distributed/test_torch_distributed_all_gather_xla_backend.py" -python3 "$TEST_CDIR/torch_distributed/test_torch_distributed_all_reduce_xla_backend.py" -python3 "$TEST_CDIR/torch_distributed/test_torch_distributed_multi_all_reduce_xla_backend.py" -python3 "$TEST_CDIR/torch_distributed/test_torch_distributed_reduce_scatter_xla_backend.py" -python3 "$TEST_CDIR/quantized_ops/test_dot_general.py" -run_xla_ir_hlo_debug python3 "$TEST_CDIR/test_user_computation_debug_cache.py" -python3 "$TEST_CDIR/test_data_type.py" -python3 "$TEST_CDIR/test_compilation_cache_utils.py" +function run_coverage { + if [ "${USE_COVERAGE:-0}" != "0" ]; then + coverage run --source="$TORCH_XLA_DIR" -p "$@" + else + python3 "$@" + fi +} -# run examples, each test should takes <2 minutes -python3 "$TEST_CDIR/../examples/data_parallel/train_resnet_spmd_data_parallel.py" -python3 "$TEST_CDIR/../examples/fsdp/train_decoder_only_fsdp_v2.py" -python3 "$TEST_CDIR/../examples/train_resnet_amp.py" -python3 "$TEST_CDIR/../examples/train_decoder_only_base.py" -python3 "$TEST_CDIR/../examples/train_decoder_only_base.py" scan.decoder_with_scan.DecoderWithScan \ - --num-steps 30 # TODO(https://github.com/pytorch/xla/issues/8632): Reduce scan tracing overhead +function run { + # TODO: merge with other run_tests + run_coverage "$TEST_CDIR/test_operations.py" -v + run_coverage "$TEST_CDIR/pjrt/test_runtime_tpu.py" + run_coverage "$TEST_CDIR/pjrt/test_collective_ops_tpu.py" + run_coverage "$TEST_CDIR/spmd/test_mp_input_sharding.py" + run_coverage "$TEST_CDIR/test_mp_collective_matmul.py" + run_save_tensor_hlo run_coverage "$TEST_CDIR/spmd/test_spmd_lowering_context.py" + run_coverage "$TEST_CDIR/spmd/test_xla_sharding.py" + run_coverage "$TEST_CDIR/spmd/test_xla_virtual_device.py" + run_coverage "$TEST_CDIR/spmd/test_xla_distributed_checkpoint.py" + run_coverage "$TEST_CDIR/spmd/test_train_spmd_linear_model.py" + run_coverage "$TEST_CDIR/spmd/test_xla_spmd_python_api_interaction.py" + run_coverage "$TEST_CDIR/spmd/test_xla_auto_sharding.py" + run_coverage "$TEST_CDIR/spmd/test_fsdp_v2.py" + run_coverage "$TEST_CDIR/test_gradient_accumulation.py" + XLA_EXPERIMENTAL=nonzero:masked_select:nms run_coverage "$TEST_CDIR/ds/test_dynamic_shape_models.py" -v + run_coverage "$TEST_CDIR/test_autocast.py" + run_coverage "$TEST_CDIR/test_fp8.py" + run_coverage "$TEST_CDIR/test_grad_checkpoint.py" + run_coverage "$TEST_CDIR/test_grad_checkpoint.py" "$@" --test_autocast + run_coverage "$TEST_CDIR/dynamo/test_dynamo.py" + run_coverage "$TEST_CDIR/dynamo/test_dynamo_dynamic_shape.py" + run_coverage "$TEST_CDIR/spmd/test_spmd_debugging.py" + XLA_PARAMETER_WRAPPING_THREADSHOLD=1 run_coverage "$TEST_CDIR/spmd/test_spmd_parameter_wrapping.py" + run_coverage "$TEST_CDIR/pjrt/test_dtypes.py" + run_coverage "$TEST_CDIR/pjrt/test_dynamic_plugin_tpu.py" + run_coverage "$TEST_CDIR/test_while_loop.py" + run_coverage "$TEST_CDIR/scan/test_scan.py" + run_coverage "$TEST_CDIR/scan/test_scan_spmd.py" + run_coverage "$TEST_CDIR/scan/test_scan_pallas.py" + run_coverage "$TEST_CDIR/scan/test_scan_layers.py" + run_coverage "$TEST_CDIR/test_gru.py" + run_coverage "$TEST_CDIR/test_assume_pure.py" + run_coverage "$TEST_CDIR/test_assume_pure_spmd.py" + run_coverage "$TEST_CDIR/test_as_stride_use_slice.py" + run_xla_hlo_debug run_coverage "$TEST_CDIR/scan/test_scan_debug.py" + run_coverage "$TEST_CDIR/test_pallas.py" -v + run_coverage "$TEST_CDIR/test_pallas_spmd.py" + XLA_DISABLE_FUNCTIONALIZATION=1 run_coverage "$TEST_CDIR/test_pallas_spmd.py" + run_coverage "$TEST_CDIR/test_splash_attention.py" + run_coverage "$TEST_CDIR/test_profiler_session.py" + run_coverage "$TEST_CDIR/test_multi_queries_paged_attention_kernel.py" + run_coverage "$TEST_CDIR/test_ragged_paged_attention_kernel.py" + run_coverage "$TEST_CDIR/test_input_output_aliases.py" + run_coverage "$TEST_CDIR/test_gmm.py" + run_coverage "$TEST_CDIR/eager/test_eager_spmd.py" + run_coverage "$TEST_CDIR/torch_distributed/test_torch_distributed_all_gather_xla_backend.py" + run_coverage "$TEST_CDIR/torch_distributed/test_torch_distributed_all_reduce_xla_backend.py" + run_coverage "$TEST_CDIR/torch_distributed/test_torch_distributed_multi_all_reduce_xla_backend.py" + run_coverage "$TEST_CDIR/torch_distributed/test_torch_distributed_reduce_scatter_xla_backend.py" + run_coverage "$TEST_CDIR/quantized_ops/test_dot_general.py" + run_xla_ir_hlo_debug run_coverage "$TEST_CDIR/test_user_computation_debug_cache.py" + run_coverage "$TEST_CDIR/test_data_type.py" + run_coverage "$TEST_CDIR/test_compilation_cache_utils.py" + + # run examples, each test should takes <2 minutes + run_coverage "$TEST_CDIR/../examples/data_parallel/train_resnet_spmd_data_parallel.py" + run_coverage "$TEST_CDIR/../examples/fsdp/train_decoder_only_fsdp_v2.py" + run_coverage "$TEST_CDIR/../examples/train_resnet_amp.py" + run_coverage "$TEST_CDIR/../examples/train_decoder_only_base.py" + run_coverage "$TEST_CDIR/../examples/train_decoder_only_base.py" scan.decoder_with_scan.DecoderWithScan \ + --num-steps 30 # TODO(https://github.com/pytorch/xla/issues/8632): Reduce scan tracing overhead + + # HACK: don't confuse local `torch_xla` folder with installed package + # Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559 + # Egaer tests will take more HBM, only run them on TPU v4 CI + TPU_VERSION=$(python -c "import sys; sys.path.remove(''); import torch_xla; print(torch_xla._internal.tpu.version())") + if [[ -n "$TPU_VERSION" && "$TPU_VERSION" == "4" ]]; then + run_coverage "$TEST_CDIR/dynamo/test_traceable_collectives.py" + run_coverage "$TEST_CDIR/../examples/data_parallel/train_resnet_xla_ddp.py" + run_coverage "$TEST_CDIR/../examples/fsdp/train_resnet_fsdp_auto_wrap.py" + run_coverage "$TEST_CDIR/../examples/eager/train_decoder_only_eager.py" + run_coverage "$TEST_CDIR/../examples/eager/train_decoder_only_eager_spmd_data_parallel.py" + run_coverage "$TEST_CDIR/../examples/eager/train_decoder_only_eager_with_compile.py" + run_coverage "$TEST_CDIR/../examples/eager/train_decoder_only_eager_multi_process.py" + XLA_EXPERIMENTAL=nonzero:masked_select:nms run_coverage "$TEST_CDIR/ds/test_dynamic_shapes.py" -v + fi + + if [[ -n "$TPU_VERSION" && "$TPU_VERSION" != "6" ]]; then + # Test `tpu-info` CLI compatibility + run_coverage "$CDIR/tpu_info/test_cli.py" + fi +} -# HACK: don't confuse local `torch_xla` folder with installed package -# Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559 -# Egaer tests will take more HBM, only run them on TPU v4 CI -TPU_VERSION=$(python -c "import sys; sys.path.remove(''); import torch_xla; print(torch_xla._internal.tpu.version())") -if [[ -n "$TPU_VERSION" && "$TPU_VERSION" == "4" ]]; then - python3 "$TEST_CDIR/dynamo/test_traceable_collectives.py" - python3 "$TEST_CDIR/../examples/data_parallel/train_resnet_xla_ddp.py" - python3 "$TEST_CDIR/../examples/fsdp/train_resnet_fsdp_auto_wrap.py" - python3 "$TEST_CDIR/../examples/eager/train_decoder_only_eager.py" - python3 "$TEST_CDIR/../examples/eager/train_decoder_only_eager_spmd_data_parallel.py" - python3 "$TEST_CDIR/../examples/eager/train_decoder_only_eager_with_compile.py" - python3 "$TEST_CDIR/../examples/eager/train_decoder_only_eager_multi_process.py" - XLA_EXPERIMENTAL=nonzero:masked_select:nms python3 "$TEST_CDIR/ds/test_dynamic_shapes.py" -v -fi - -if [[ -n "$TPU_VERSION" && "$TPU_VERSION" != "6" ]]; then - # Test `tpu-info` CLI compatibility - python3 "$CDIR/tpu_info/test_cli.py" +if [ "$USE_COVERAGE" != "0" ]; then + PYTHONBIN="$(python -m site --user-base)/bin" + ls -l "$PYTHONBIN" + run + $PYTHONBIN/coverage combine + $PYTHONBIN/coverage-lcov --data_file_path $COVERAGE_FILE --output_file_path $COVERAGE_FILE.info +else + run fi diff --git a/torch_xla/csrc/BUILD b/torch_xla/csrc/BUILD index 7dc2026e42d2..7885dbd1d87d 100644 --- a/torch_xla/csrc/BUILD +++ b/torch_xla/csrc/BUILD @@ -1,6 +1,7 @@ load( "//bazel:rules_def.bzl", "ptxla_cc_library", + "cov_library", ) genrule( @@ -208,7 +209,7 @@ ptxla_cc_library( ], ) -cc_library( +cov_library( name = "einsum_utilities", hdrs = ["ops/einsum_utilities.h"], deps = [ @@ -247,7 +248,7 @@ ptxla_cc_library( ], ) -cc_library( +cov_library( name = "version", srcs = ["version.cpp"], hdrs = ["version.h"], @@ -310,7 +311,7 @@ ptxla_cc_library( ], ) -cc_library( +cov_library( name = "shape_helper", srcs = ["shape_helper.cpp"], hdrs = ["shape_helper.h"], @@ -320,7 +321,7 @@ cc_library( ], ) -cc_library( +cov_library( name = "thread_pool", srcs = ["thread_pool.cc"], hdrs = ["thread_pool.h"],