diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index 4cec3e2f6d72..aa82d36aa7ce 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -75,14 +75,7 @@ export PYTORCH_BUILD_NUMBER=1
 : <<'BLOCK_COMMENT'
 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
-
-# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
-TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
-
-# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
-if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
-  TRITON_CONSTRAINT="platform_system == 'Linux'"
-fi
+TRITON_CONSTRAINT="platform_system == 'Linux'"
 
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
   TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2
index f53472571993..bf7db5866e78 100644
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@@ -77,6 +77,9 @@ jobs:
       runs_on: linux.s390x
       ALPINE_IMAGE: "docker.io/s390x/alpine"
       timeout-minutes: 420
+      {%- elif config["gpu_arch_type"] == "rocm" %}
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       runs_on: linux.24xlarge.ephemeral
diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
index f5eca8751840..bc671ae80ae2 100644
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@@ -333,6 +333,7 @@ jobs:
       LIBTORCH_CONFIG: release
       LIBTORCH_VARIANT: shared-with-deps
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: libtorch-rocm6_3-shared-with-deps-release
       build_environment: linux-binary-libtorch
     secrets:
@@ -446,6 +447,7 @@ jobs:
       LIBTORCH_CONFIG: release
       LIBTORCH_VARIANT: shared-with-deps
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: libtorch-rocm6_4-shared-with-deps-release
       build_environment: linux-binary-libtorch
     secrets:
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index c996437a3b9f..5f9eaab976a6 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -323,6 +323,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_10-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -433,6 +434,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_10-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -912,6 +914,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.11"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_11-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -1022,6 +1025,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.11"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_11-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -1501,6 +1505,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.12"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_12-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -1611,6 +1616,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.12"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_12-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -2090,6 +2096,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.13"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_13-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -2200,6 +2207,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.13"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_13-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -2679,6 +2687,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.13t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_13t-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -2789,6 +2798,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.13t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_13t-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -3268,6 +3278,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.14"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_14-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -3378,6 +3389,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.14"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_14-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
@@ -3857,6 +3869,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.3
       DESIRED_PYTHON: "3.14t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_14t-rocm6_3
       build_environment: linux-binary-manywheel
     secrets:
@@ -3967,6 +3980,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.14t"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_14t-rocm6_4
       build_environment: linux-binary-manywheel
     secrets:
diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
index 9593391217ac..9df4835757c4 100644
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@@ -60,6 +60,7 @@ jobs:
       DOCKER_IMAGE_TAG_PREFIX: rocm6.4
       DESIRED_PYTHON: "3.10"
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
       build_name: manywheel-py3_10-rocm6_4
       build_environment: linux-binary-manywheel-rocm
     secrets:
diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml
index dd262d31b8fc..dcdc2cd0ba24 100644
--- a/.github/workflows/operator_benchmark.yml
+++ b/.github/workflows/operator_benchmark.yml
@@ -14,6 +14,10 @@ on:
   schedule:
     # Run at 07:00 UTC every Sunday
     - cron: 0 7 * * 0
+  pull_request:
+    paths:
+      - benchmarks/operator_benchmark/**
+      - .github/workflows/operator_benchmark.yml
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
diff --git a/Dockerfile b/Dockerfile
index f73dfcc1af3a..331cf00593cb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ ARG CUDA_PATH=cu121
 ARG INSTALL_CHANNEL=whl/nightly
 # Automatically set by buildx
 # pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
-RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0
+RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0
 
 ARG TARGETPLATFORM
 
diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp
index d858df073397..6c58de099648 100644
--- a/aten/src/ATen/mps/EmptyTensor.cpp
+++ b/aten/src/ATen/mps/EmptyTensor.cpp
@@ -12,7 +12,7 @@
 
 #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
 #define MPS_ERROR_RUNTIME_TOO_LOW \
-  "The MPS backend is supported on MacOS 13.0+.", \
+  "The MPS backend is supported on MacOS 14.0+. ", \
   "Current OS version can be queried using `sw_vers`"
 #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
   "as the MPS framework doesn't support float64. Please use float32 instead."
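Note on the EmptyTensor.cpp hunk above: it only rewords MPS_ERROR_RUNTIME_TOO_LOW; the runtime availability check itself lives elsewhere. As a hedged sketch using only the public torch.backends.mps API (nothing introduced by this patch), caller code can distinguish the two failure modes the two macros describe:

    import torch

    if torch.backends.mps.is_available():
        t = torch.ones(3, device="mps")  # MPS is usable on this machine
    elif torch.backends.mps.is_built():
        # Compiled with MPS, but the OS is too old: allocating on "mps" would
        # raise with the MPS_ERROR_RUNTIME_TOO_LOW message edited above.
        print("MPS built but unavailable on this macOS version")
    else:
        # MPS_ERROR_NOT_COMPILED path: this build has no MPS support at all.
        print("PyTorch was compiled without MPS")
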
diff --git a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv
index 873f14d20127..9a7b6797e982 100644
--- a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv
+++ b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv
@@ -1,5 +1,5 @@
 Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time
-PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497
+PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459
 PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181
 PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826
 PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449
@@ -376,10 +376,10 @@ PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",sho
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547
-PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739
+PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375
 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333
 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664
-PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875
+PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525
 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458
 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728
@@ -388,10 +388,10 @@ PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplace
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295
 PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189
@@ -1316,4 +1316,4 @@ PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtyp
 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763
 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667
 PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333
-PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667
\ No newline at end of file
+PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667
diff --git a/torch/__init__.py b/torch/__init__.py
index a5c072396e1d..0625ad60bfff 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -302,7 +302,7 @@ def _get_cuda_dep_paths(path: str, lib_folder: str, lib_name: str) -> list[str]:
     return nvidia_lib_paths + lib_paths
 
 
-def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None:
+def _preload_cuda_deps(lib_folder: str, lib_name: str, required: bool = True) -> None:  # type: ignore[valid-type]
     """Preloads cuda deps if they could not be found otherwise."""
     # Should only be called on Linux if default path resolution have failed
     assert platform.system() == "Linux", "Should only be called on Linux"
@@ -313,9 +313,10 @@ def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None:
         if candidate_lib_paths:
             lib_path = candidate_lib_paths[0]
             break
-    if not lib_path:
+    if not lib_path and required:
         raise ValueError(f"{lib_name} not found in the system path {sys.path}")
-    ctypes.CDLL(lib_path)
+    if lib_path:
+        ctypes.CDLL(lib_path)
 
 
 # See Note [Global dependencies]
@@ -354,8 +355,6 @@ def _load_global_deps() -> None:
     except OSError as err:
         # Can only happen for wheel with cuda libs as PYPI deps
         # As PyTorch is not purelib, but nvidia-*-cu12 is
-        from torch.version import cuda as cuda_version
-
         cuda_libs: dict[str, str] = {
             "cublas": "libcublas.so.*[0-9]",
             "cudnn": "libcudnn.so.*[0-9]",
@@ -369,7 +368,6 @@ def _load_global_deps() -> None:
             "cusparselt": "libcusparseLt.so.*[0-9]",
             "cusolver": "libcusolver.so.*[0-9]",
             "nccl": "libnccl.so.*[0-9]",
-            "nvtx": "libnvToolsExt.so.*[0-9]",
             "nvshmem": "libnvshmem_host.so.*[0-9]",
             "cufile": "libcufile.so.*[0-9]",
         }
@@ -381,6 +379,9 @@ def _load_global_deps() -> None:
             raise err
         for lib_folder, lib_name in cuda_libs.items():
             _preload_cuda_deps(lib_folder, lib_name)
+
+        # libnvToolsExt is an optional dependency
+        _preload_cuda_deps("nvtx", "libnvToolsExt.so.*[0-9]", required=False)
     ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)
 
 
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 7202a9638756..902d2fe6ce0f 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -2418,10 +2418,6 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]:
 
     # If not given or set as native, determine what's best for the GPU / CUDA version that can be found
     if not _arch_list or _arch_list == "native":
-        if not _arch_list:
-            logger.warning(
-                "TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. \n"
-                "If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.")
         arch_list = []
         # the assumption is that the extension should run on any of the currently visible cards,
         # which could be of different types - therefore all archs for visible cards should be included
@@ -2440,6 +2436,15 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]:
                 arch_list.append(arch)
         arch_list = sorted(arch_list)
         arch_list[-1] += '+PTX'
+
+        if not _arch_list:
+            # Only log on rank 0 in distributed settings to avoid spam
+            if not torch.distributed.is_available() or not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+                arch_list_str = ';'.join(arch_list)
+                logger.debug(
+                    "TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='%s' "
+                    "for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override.",
+                    arch_list_str)
     else:
         # Deal with lists that are ' ' separated (only deal with ';' after)
         _arch_list = _arch_list.replace(' ', ';')