
Commit

Merge branch 'master' into disable_logger_for_PT2.6
loadams authored Feb 13, 2025
2 parents 86fabd7 + 5a361e1 commit 5b9cd65
Showing 11 changed files with 35 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/nv-accelerate-v100.yml
@@ -19,7 +19,7 @@ concurrency:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -29,7 +29,7 @@ jobs:

- name: Install pytorch
run: |
- pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
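The same change repeats across the workflow files in this commit: the self-hosted runner label and the PyTorch wheel index both move from CUDA 12.1 (cu121) to CUDA 12.4 (cu124). A post-install sanity check along the lines of the sketch below (not part of this commit; the expected version string is an assumption) would catch a wheel built against a different CUDA toolkit:

import sys
import torch

EXPECTED_CUDA = "12.4"  # assumed expectation for the cu124 runners

print("torch:", torch.__version__)
print("built against CUDA:", torch.version.cuda)
if not (torch.version.cuda or "").startswith(EXPECTED_CUDA):
    sys.exit(f"expected a cu{EXPECTED_CUDA.replace('.', '')} wheel, got CUDA {torch.version.cuda}")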
4 changes: 2 additions & 2 deletions .github/workflows/nv-ds-chat.yml
@@ -27,7 +27,7 @@ permissions:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -37,7 +37,7 @@ jobs:

- name: Install pytorch
run: |
- pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
10 changes: 5 additions & 5 deletions .github/workflows/nv-inference.yml
@@ -22,7 +22,7 @@ concurrency:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -32,7 +32,7 @@ jobs:

- name: Install pytorch
run: |
- pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu121
+ pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -58,8 +58,8 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.1"
- pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.1"
- pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.1"
+ #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.4"
+ pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.4"
+ pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.4"
# run ds_report again to check updated op list
ds_report
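The --torch_ver and --cuda_ver arguments are custom pytest options defined in the DeepSpeed test suite's conftest; they let version-specific tests skip themselves when the installed torch/CUDA combination does not match. A minimal sketch of how such options are typically registered (option names taken from the invocations above; the body is illustrative, not DeepSpeed's actual conftest):

# conftest.py (sketch)
import pytest

def pytest_addoption(parser):
    # Register the flags used by the CI commands above.
    parser.addoption("--torch_ver", default=None, type=str)
    parser.addoption("--cuda_ver", default=None, type=str)

@pytest.fixture(scope="session")
def cuda_ver(request):
    # Tests can request this fixture and skip when the declared CUDA version does not apply.
    return request.config.getoption("--cuda_ver")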
4 changes: 2 additions & 2 deletions .github/workflows/nv-lightning-v100.yml
@@ -19,7 +19,7 @@ concurrency:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -29,7 +29,7 @@ jobs:

- name: Install pytorch
run: |
- pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
4 changes: 2 additions & 2 deletions .github/workflows/nv-mii.yml
@@ -27,7 +27,7 @@ concurrency:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -37,7 +37,7 @@ jobs:

- name: Install pytorch
run: |
- pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
+ pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
6 changes: 3 additions & 3 deletions .github/workflows/nv-nightly.yml
@@ -18,7 +18,7 @@ permissions:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -28,7 +28,7 @@ jobs:

- name: Install pytorch
run: |
- pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -58,7 +58,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.5" --cuda_ver="12.1"
+ pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.6" --cuda_ver="12.4"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
8 changes: 4 additions & 4 deletions .github/workflows/nv-torch-latest-v100.yml
@@ -19,7 +19,7 @@ concurrency:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -29,7 +29,7 @@ jobs:

- name: Install pytorch
run: |
- pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -55,5 +55,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.5" --cuda_ver="12.1"
- pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.5" --cuda_ver="12.1"
+ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.6" --cuda_ver="12.4"
+ pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"
4 changes: 2 additions & 2 deletions .github/workflows/nv-torch-nightly-v100.yml
@@ -18,7 +18,7 @@ permissions:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -28,7 +28,7 @@ jobs:

- name: Install pytorch
run: |
- pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121
+ pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
4 changes: 2 additions & 2 deletions .github/workflows/nv-transformers-v100.yml
@@ -18,7 +18,7 @@ concurrency:

jobs:
unit-tests:
- runs-on: [self-hosted, nvidia, cu121, v100]
+ runs-on: [self-hosted, nvidia, cu124, v100]

steps:
- uses: actions/checkout@v4
@@ -29,7 +29,7 @@ jobs:
- name: Install pytorch
run: |
# use the same pytorch version as transformers CI
- pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu121 --index-url https://download.pytorch.org/whl/cu121
+ pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu124 --index-url https://download.pytorch.org/whl/cu124
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
10 changes: 7 additions & 3 deletions deepspeed/comm/comm.py
@@ -704,9 +704,13 @@ def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True)
    master_addr = None
    if rank == 0:
        import shlex
-       hostname_cmd = shlex.split("hostname -I")
-       result = subprocess.check_output(hostname_cmd)
-       master_addr = result.decode('utf-8').split()[0]
+       try:
+           hostname_cmd = shlex.split("hostname -I")
+           result = subprocess.check_output(hostname_cmd)
+           master_addr = result.decode('utf-8').split()[0]
+       except subprocess.CalledProcessError:  # hostname -I not available (e.g. on macOS)
+           import socket
+           master_addr = socket.gethostbyname(socket.gethostname())
    master_addr = comm.bcast(master_addr, root=0)

    # Determine local rank by assuming hostnames are unique
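Read on its own, the new discovery logic is: try `hostname -I` first, and fall back to resolving the local hostname when that command fails. A standalone sketch of the same fallback (get_master_addr() is a hypothetical wrapper; in comm.py the logic runs only on rank 0 inside mpi_discovery and the result is then broadcast with comm.bcast):

import shlex
import socket
import subprocess

def get_master_addr() -> str:
    # Mirror the fallback added above: prefer `hostname -I`, which lists the
    # host's assigned IP addresses, and take the first entry.
    try:
        result = subprocess.check_output(shlex.split("hostname -I"))
        return result.decode("utf-8").split()[0]
    except subprocess.CalledProcessError:
        # `hostname -I` is not available (e.g. on macOS); resolve the hostname instead.
        return socket.gethostbyname(socket.gethostname())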
5 changes: 4 additions & 1 deletion op_builder/fp_quantizer.py
@@ -94,7 +94,10 @@ def sources(self):
        ]

    def extra_ldflags(self):
-       return ['-lcurand']
+       if not self.is_rocm_pytorch():
+           return ['-lcurand']
+       else:
+           return []

    def include_paths(self):
        return ['csrc/fp_quantizer/includes', 'csrc/includes']
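The op-builder change gates a CUDA-only linker flag on the backend: cuRAND ships with the CUDA toolkit, so ROCm builds should not ask the linker for it. A generic sketch of the pattern (is_rocm_build is a stand-in for the builder's is_rocm_pytorch() check):

def select_ldflags(is_rocm_build: bool) -> list:
    # Only CUDA builds link against cuRAND; ROCm toolchains ship their own
    # RNG libraries, so an unconditional -lcurand would fail at link time there.
    if not is_rocm_build:
        return ['-lcurand']
    return []

print(select_ldflags(is_rocm_build=False))  # ['-lcurand']
print(select_ldflags(is_rocm_build=True))   # []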
