9 changes: 1 addition & 8 deletions .circleci/scripts/binary_populate_env.sh
@@ -75,14 +75,7 @@ export PYTORCH_BUILD_NUMBER=1
: <<'BLOCK_COMMENT'
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)

# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"

# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
TRITON_CONSTRAINT="platform_system == 'Linux'"
fi
TRITON_CONSTRAINT="platform_system == 'Linux'"

if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
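For context: the TRITON_REQUIREMENT built above relies on a PEP 508 environment marker, so the same wheel metadata ships everywhere while pip installs triton only on matching platforms. A minimal sketch using the `packaging` library (the version string below is illustrative, not the pinned one):

```python
# Minimal sketch: how pip evaluates the environment marker appended to the
# triton requirement. The pinned version here is illustrative.
from packaging.requirements import Requirement

req = Requirement("triton==3.5.0; platform_system == 'Linux'")
print(req.name, str(req.specifier), str(req.marker))
# pip installs the dependency only when the marker evaluates to True on the
# target machine (True on Linux regardless of arch, False on macOS/Windows).
print("install on this host?", req.marker.evaluate())
```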
3 changes: 3 additions & 0 deletions .github/templates/linux_binary_build_workflow.yml.j2
@@ -77,6 +77,9 @@ jobs:
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
{%- elif config["gpu_arch_type"] == "rocm" %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
{%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.24xlarge.ephemeral
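The new branch extends the generator's per-arch dispatch: rocm builds now get a runner prefix from get-label-type and a 300-minute timeout. A minimal sketch of how such a template branch renders, assuming standard Jinja delimiters and illustrative values:

```python
# Minimal sketch of a gpu_arch_type branch as the workflow generator might
# render it; standard Jinja delimiters and illustrative values.
from jinja2 import Template

tpl = Template(
    "{%- if config['gpu_arch_type'] == 'rocm' %}\n"
    "runner_prefix: \"{{ label_type }}\"\n"
    "timeout-minutes: 300\n"
    "{%- else %}\n"
    "timeout-minutes: 240\n"
    "{%- endif %}"
)
print(tpl.render(config={"gpu_arch_type": "rocm"}, label_type="lf."))
# -> runner_prefix: "lf."
#    timeout-minutes: 300
```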
2 changes: 2 additions & 0 deletions .github/workflows/generated-linux-binary-libtorch-nightly.yml

Some generated files are not rendered by default.

14 changes: 14 additions & 0 deletions .github/workflows/generated-linux-binary-manywheel-nightly.yml

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions .github/workflows/operator_benchmark.yml
@@ -14,6 +14,10 @@ on:
schedule:
# Run at 07:00 UTC every Sunday
- cron: 0 7 * * 0
pull_request:
paths:
- benchmarks/operator_benchmark/**
- .github/workflows/operator_benchmark.yml

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
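With the new pull_request trigger, the existing concurrency group keys PR runs on the PR number (falling back to the commit SHA for scheduled runs), so a force-push cancels the stale run. A minimal sketch of that keying logic, with illustrative values:

```python
# Minimal sketch of the concurrency-group key: PR events share a group per
# PR number so a new push cancels the in-flight run; scheduled runs fall
# back to the commit SHA. Values are illustrative.
def concurrency_group(workflow: str, pr_number: int | None,
                      sha: str, is_dispatch: bool) -> str:
    # `pr_number or sha` mirrors the `||` fallback in the expression above
    return f"{workflow}-{pr_number or sha}-{str(is_dispatch).lower()}"

print(concurrency_group("operator_benchmark", 12345, "abc123", False))
# -> operator_benchmark-12345-false
print(concurrency_group("operator_benchmark", None, "abc123", False))
# -> operator_benchmark-abc123-false
```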
2 changes: 1 addition & 1 deletion Dockerfile
@@ -53,7 +53,7 @@ ARG CUDA_PATH=cu121
ARG INSTALL_CHANNEL=whl/nightly
# Automatically set by buildx
# Pinning the conda version here; see: https://github.com/pytorch/pytorch/issues/164574
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0

ARG TARGETPLATFORM

2 changes: 1 addition & 1 deletion aten/src/ATen/mps/EmptyTensor.cpp
@@ -12,7 +12,7 @@

#define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
#define MPS_ERROR_RUNTIME_TOO_LOW \
"The MPS backend is supported on MacOS 13.0+.", \
"The MPS backend is supported on MacOS 14.0+. ", \
"Current OS version can be queried using `sw_vers`"
#define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
"as the MPS framework doesn't support float64. Please use float32 instead."
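The message now reflects that the MPS backend requires macOS 14.0+. A minimal sketch of how a caller can probe this from Python, using the public torch.backends.mps checks:

```python
# Minimal sketch: probing MPS support from Python after this change.
import torch

if torch.backends.mps.is_available():
    x = torch.ones(2, device="mps")  # MPS usable on this machine
elif torch.backends.mps.is_built():
    # Compiled with MPS, but the OS is too old or there is no supported GPU;
    # `sw_vers` in a shell reports the current macOS version.
    print("MPS built but unavailable; macOS 14.0+ is required")
else:
    print("this PyTorch build was compiled without MPS support")
```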
@@ -1,5 +1,5 @@
Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time
PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497
PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459
PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181
PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826
PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449
@@ -376,10 +376,10 @@
PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588
PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969
PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547
PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739
PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375
PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333
PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664
PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875
PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525
PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458
PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719
PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728
@@ -388,10 +388,10 @@
PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647
PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768
PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619
PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118
PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475
PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702
PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613
PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436
PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995
PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813
PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295
PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189
@@ -1316,4 +1316,4 @@
PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763
PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667
PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333
PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667
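The rows above come from the operator benchmark's expected-results CSV; each case name encodes shape, contiguity, in-placeness, and dtype, and Execution Time is the measured latency (microseconds, by our reading of the harness). A minimal parsing sketch over one sample row:

```python
# Minimal sketch: reading one row of the expected-results CSV. The header
# names are taken from the file; the time unit is our assumption.
import csv
import io

sample = (
    "Benchmarking Framework,Benchmarking Module Name,Case Name,"
    "tag,run_backward,Execution Time\n"
    "PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667\n"
)
row = next(csv.DictReader(io.StringIO(sample)))
print(row["Case Name"], float(row["Execution Time"]))
# -> gelu_M512_N512_cpu 31.33166667
```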
13 changes: 7 additions & 6 deletions torch/__init__.py
@@ -302,7 +302,7 @@ def _get_cuda_dep_paths(path: str, lib_folder: str, lib_name: str) -> list[str]:
return nvidia_lib_paths + lib_paths


def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None:
def _preload_cuda_deps(lib_folder: str, lib_name: str, required: bool = True) -> None: # type: ignore[valid-type]
"""Preloads cuda deps if they could not be found otherwise."""
# Should only be called on Linux if default path resolution has failed
assert platform.system() == "Linux", "Should only be called on Linux"
@@ -313,9 +313,10 @@ def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None:
if candidate_lib_paths:
lib_path = candidate_lib_paths[0]
break
if not lib_path:
if not lib_path and required:
raise ValueError(f"{lib_name} not found in the system path {sys.path}")
ctypes.CDLL(lib_path)
if lib_path:
ctypes.CDLL(lib_path)


# See Note [Global dependencies]
@@ -354,8 +355,6 @@ def _load_global_deps() -> None:
except OSError as err:
# Can only happen for wheel with cuda libs as PYPI deps
# As PyTorch is not purelib, but nvidia-*-cu12 is
from torch.version import cuda as cuda_version

cuda_libs: dict[str, str] = {
"cublas": "libcublas.so.*[0-9]",
"cudnn": "libcudnn.so.*[0-9]",
@@ -369,7 +368,6 @@ def _load_global_deps() -> None:
"cusparselt": "libcusparseLt.so.*[0-9]",
"cusolver": "libcusolver.so.*[0-9]",
"nccl": "libnccl.so.*[0-9]",
"nvtx": "libnvToolsExt.so.*[0-9]",
"nvshmem": "libnvshmem_host.so.*[0-9]",
"cufile": "libcufile.so.*[0-9]",
}
@@ -381,6 +379,9 @@ def _load_global_deps() -> None:
raise err
for lib_folder, lib_name in cuda_libs.items():
_preload_cuda_deps(lib_folder, lib_name)

# libnvToolsExt is an optional dependency
_preload_cuda_deps("nvtx", "libnvToolsExt.so.*[0-9]", required=False)
ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)


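The net effect in torch/__init__.py: libnvToolsExt moves out of the mandatory preload table, so wheels built against CUDA stacks that dropped NVTX no longer fail at import. A minimal standalone sketch of the optional-preload pattern (the nvidia/<folder>/lib layout is illustrative, not PyTorch's exact lookup):

```python
# Minimal standalone sketch of the optional-preload pattern: search
# sys.path roots for a versioned .so and raise only when the library is
# required. Paths and names are illustrative.
import ctypes
import glob
import os
import sys


def preload(lib_folder: str, lib_name: str, required: bool = True) -> None:
    lib_path = None
    for root in sys.path:
        candidates = glob.glob(
            os.path.join(root, "nvidia", lib_folder, "lib", lib_name))
        if candidates:
            lib_path = candidates[0]
            break
    if not lib_path and required:
        raise ValueError(f"{lib_name} not found in the system path {sys.path}")
    if lib_path:
        ctypes.CDLL(lib_path)


# nvtx is optional: a missing libnvToolsExt no longer aborts `import torch`
preload("nvtx", "libnvToolsExt.so.*[0-9]", required=False)
```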
13 changes: 9 additions & 4 deletions torch/utils/cpp_extension.py
@@ -2418,10 +2418,6 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]:

# If not given or set as native, determine what's best for the GPU / CUDA version that can be found
if not _arch_list or _arch_list == "native":
if not _arch_list:
logger.warning(
"TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. \n"
"If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.")
arch_list = []
# the assumption is that the extension should run on any of the currently visible cards,
# which could be of different types - therefore all archs for visible cards should be included
@@ -2440,6 +2436,15 @@ def _get_cuda_arch_flags(cflags: Optional[list[str]] = None) -> list[str]:
arch_list.append(arch)
arch_list = sorted(arch_list)
arch_list[-1] += '+PTX'

if not _arch_list:
# Only log on rank 0 in distributed settings to avoid spam
if not torch.distributed.is_available() or not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
arch_list_str = ';'.join(arch_list)
logger.debug(
"TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='%s' "
"for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override.",
arch_list_str)
else:
# Deal with lists that are ' ' separated (only deal with ';' after)
_arch_list = _arch_list.replace(' ', ';')
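The relocated message also drops from warning to debug level and is gated to rank 0, so multi-process jobs log the detected arch list once instead of per worker. A minimal sketch of that gate, assuming torch.distributed is importable:

```python
# Minimal sketch of the rank-0 gate: emit the arch-list message once per
# job rather than once per process when torch.distributed is initialized.
import logging

import torch

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def log_once(msg: str) -> None:
    dist = torch.distributed
    # Non-distributed runs and rank 0 log; other ranks stay quiet.
    if not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0:
        logger.debug(msg)


log_once("TORCH_CUDA_ARCH_LIST is not set, using detected architectures")
```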