Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[submodule "pytorch"]
path = pytorch
url = https://github.com/QDelta/pytorch.git
branch = 2.7.1-phantora
url = https://github.com/ruogu-alter/pytorch.git
branch = 2.9.0-phantora
8 changes: 6 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS phantora-pytorch
FROM nvidia/cuda:12.6.3-devel-ubuntu22.04 AS phantora-pytorch

RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive \
Expand Down Expand Up @@ -28,7 +28,7 @@ ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
ENV TORCH_CUDA_ARCH_LIST="8.0;9.0"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV PYTORCH_BUILD_VERSION=2.7.1
ENV PYTORCH_BUILD_VERSION=2.9.0
ENV PYTORCH_BUILD_NUMBER=1
ENV USE_CUDNN=0
ENV USE_CUSPARSELT=0
Expand All @@ -52,6 +52,10 @@ RUN curl -Lo torchtitan-requirements.txt https://raw.githubusercontent.com/pytor
python3 -m pip install --no-cache-dir -r torchtitan-requirements.txt
RUN python3 -m pip install --no-cache-dir megatron-core==0.13.1 transformers==4.41.2 deepspeed==0.17.5 torchtitan==0.1.0

RUN python3 -m pip install --no-cache-dir --no-deps colossalai==0.5.0
RUN python3 -m pip install --no-cache-dir peft==0.10.0
RUN python3 -m pip install --no-cache-dir galore-torch==1.0

# DeepSpeed needs passwordless ssh
COPY config/sshconfig /root/.ssh/config
COPY config/id_ed25519 /root/.ssh/id_ed25519
Expand Down
2 changes: 1 addition & 1 deletion pytorch
Submodule pytorch updated 7824 files
12 changes: 12 additions & 0 deletions stub/cudart.c
Original file line number Diff line number Diff line change
Expand Up @@ -555,12 +555,24 @@ _dummy() // accept any number of arguments
}

// Stub for cudaGetDriverEntryPoint.
// CUDA 12 (CUDART_VERSION >= 12000) added the driverStatus out-parameter,
// so the matching signature is selected at compile time.
cudaError_t
#if CUDART_VERSION >= 12000
cudaGetDriverEntryPoint(const char* symbol,
void** funcPtr,
unsigned long long flags,
enum cudaDriverEntryPointQueryResult* driverStatus)
#else
cudaGetDriverEntryPoint(const char* symbol,
void** funcPtr,
unsigned long long flags)
#endif
{
// TODO: every requested driver symbol currently resolves to the no-op
// _dummy defined earlier in this file; symbol and flags are ignored.
*funcPtr = _dummy;
#if CUDART_VERSION >= 12000
// driverStatus is optional for callers, so guard against NULL.
if (driverStatus) {
*driverStatus = cudaDriverEntryPointSuccess;
}
#endif
return cudaSuccess;
}

Expand Down
26 changes: 25 additions & 1 deletion stub/cudart_noimpl.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,13 +232,23 @@ cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph)
}

// Stub: stream-capture introspection is not supported by this shim.
// CUDA 12 promoted the former _v2 interface (capture id, owning graph,
// dependency list) to cudaStreamGetCaptureInfo itself, hence the
// compile-time signature switch.
cudaError_t
#if CUDART_VERSION >= 12000
cudaStreamGetCaptureInfo(cudaStream_t stream,
enum cudaStreamCaptureStatus* captureStatus_out,
unsigned long long* id_out,
cudaGraph_t* graph_out,
const cudaGraphNode_t** dependencies_out,
size_t* numDependencies_out)
#else
cudaStreamGetCaptureInfo(cudaStream_t stream,
enum cudaStreamCaptureStatus* pCaptureStatus,
unsigned long long* pId)
#endif
{
NOT_IMPLEMENTED;
}

#if CUDART_VERSION < 12000
cudaError_t
cudaStreamGetCaptureInfo_v2(cudaStream_t stream,
enum cudaStreamCaptureStatus* captureStatus_out,
Expand All @@ -249,6 +259,7 @@ cudaStreamGetCaptureInfo_v2(cudaStream_t stream,
{
NOT_IMPLEMENTED;
}
#endif

cudaError_t
cudaStreamUpdateCaptureDependencies(cudaStream_t stream,
Expand All @@ -270,7 +281,8 @@ cudaEventRecordWithFlags(cudaEvent_t event,
cudaStream_t stream,
unsigned int flags)
{
NOT_IMPLEMENTED;
(void)flags;
return cudaEventRecord(event, stream);
}

cudaError_t
Expand Down Expand Up @@ -1687,11 +1699,17 @@ cudaGraphDestroyNode(cudaGraphNode_t node)
}

// Stub: CUDA graph instantiation is not supported by this shim.
// CUDA 12 replaced the (error node, log buffer) out-parameters with a
// single flags argument, so the signature is chosen at compile time.
cudaError_t
#if CUDART_VERSION >= 12000
cudaGraphInstantiate(cudaGraphExec_t* pGraphExec,
cudaGraph_t graph,
unsigned long long flags)
#else
cudaGraphInstantiate(cudaGraphExec_t* pGraphExec,
cudaGraph_t graph,
cudaGraphNode_t* pErrorNode,
char* pLogBuffer,
size_t bufferSize)
#endif
{
NOT_IMPLEMENTED;
}
Expand Down Expand Up @@ -1852,10 +1870,16 @@ cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec,
#endif

// Stub: updating an instantiated graph is not supported by this shim.
// CUDA 12 bundles the update outcome into cudaGraphExecUpdateResultInfo;
// earlier versions returned the error node and result separately.
cudaError_t
#if CUDART_VERSION >= 12000
cudaGraphExecUpdate(cudaGraphExec_t hGraphExec,
cudaGraph_t hGraph,
cudaGraphExecUpdateResultInfo* resultInfo)
#else
cudaGraphExecUpdate(cudaGraphExec_t hGraphExec,
cudaGraph_t hGraph,
cudaGraphNode_t* hErrorNode_out,
enum cudaGraphExecUpdateResult* updateResult_out)
#endif
{
NOT_IMPLEMENTED;
}
Expand Down
3 changes: 3 additions & 0 deletions tests/docker/ColossalAI/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/compose.yaml
/netconfig.toml
/config.sh
113 changes: 113 additions & 0 deletions tests/docker/ColossalAI/config_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""Generate the docker-compose / network / shell configuration for the
ColossalAI docker tests.

Writes three git-ignored files next to this script:
  compose.yaml   -- one ``simulator`` service plus ``--nhost`` host services
  netconfig.toml -- Phantora topology sized to the host count
  config.sh      -- EVAL_NHOST / EVAL_NGPU, sourced by run.sh
"""

# NOTE(review): the indentation inside these templates was lost in the
# source capture; it is reconstructed here with conventional 2-space YAML
# nesting -- confirm against the file as checked in.

# Doubled braces ({{ }}) survive str.format() as literal braces, so the
# ${PHANTORA_LOG:-info} shell default reaches the compose file intact.
SIMULATOR_TEMPLATE = r"""
  simulator:
    image: "phantora:latest"
    volumes:
      - /run/phantora:/run/phantora
      - ./netconfig.toml:/netconfig.toml:ro
    pid: host
    ipc: host
    environment:
      - PHANTORA_LOG=${{PHANTORA_LOG:-info}}
      - PHANTORA_SOCKET_PREFIX=/run/phantora/phantora
    command: /phantora/dist/phantora_server --netconfig /netconfig.toml
    cpuset: '{cpuset}'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
"""

# One simulated training host: mounts the shared Phantora socket dir and the
# tests tree, then idles until run.sh execs a command into it.
HOST_TEMPLATE = r"""
  host-{host_id}:
    image: "phantora:latest"
    volumes:
      - /run/phantora:/run/phantora
      - ../..:/phantora/tests:ro
    pid: host
    ipc: host
    environment:
      - CUDA_DEVICE_MAX_CONNECTIONS=1
      - PHANTORA_NGPU={ngpu}
      - PHANTORA_VRAM_MIB={vram_mib}
      - PHANTORA_IGNORE_CPU_TIME=1
      - PHANTORA_SOCKET_PREFIX=/run/phantora/phantora
    hostname: host-{host_id}
    command: sleep infinity
    cpuset: '{cpuset}'
    depends_on:
      - simulator
"""

# rack_size is fixed at 2, so nracks below is ceil(nhosts / 2).
NETCONFIG_TEMPLATE = r"""
host_mapping = {host_list}

[simulator]
loopback_speed = 2880
fairness = "PerFlowMaxMin"

[topology]
type = "TwoLayerMultiPath"

[topology.args]
nspines = 2
nracks = {nracks}
rack_size = 2
host_bw = 800
rack_uplink_port_bw = 800
load_balancer_type = "EcmpEverything"
"""

if __name__ == '__main__':
    import argparse
    from os.path import dirname, realpath, join
    from multiprocessing import cpu_count

    script_dir = dirname(realpath(__file__))

    # Default CPU split: simulator pinned to one core, host containers
    # share the upper half of the remaining cores.
    nproc = cpu_count()
    if nproc <= 2:
        default_sim_core = str(nproc - 1)
        default_host_cpuset = str(nproc - 1)
    else:
        default_sim_core = str(nproc // 2)
        default_host_cpuset = f"{nproc // 2 + 1}-{nproc - 1}"

    parser = argparse.ArgumentParser(
        description="Generate compose.yaml, netconfig.toml and config.sh"
    )
    parser.add_argument("--nhost", type=int, default=4,
                        help="number of simulated hosts")
    parser.add_argument("--ngpu", type=int, default=4,
                        help="simulated GPUs per host")
    parser.add_argument("--vram_mib", type=int, default=143771,
                        help="simulated VRAM per GPU, in MiB")
    parser.add_argument("--cpuset_sim", type=str, default=default_sim_core,
                        help="cpuset for the simulator container")
    parser.add_argument("--cpuset_host", type=str, default=default_host_cpuset,
                        help="cpuset shared by all host containers")
    args = parser.parse_args()

    nhosts = args.nhost
    ngpu = args.ngpu

    # The templates each begin with a newline, so "services:" needs none.
    with open(join(script_dir, "compose.yaml"), "w") as f:
        f.write("services:")
        f.write(SIMULATOR_TEMPLATE.format(cpuset=args.cpuset_sim))
        for i in range(1, nhosts + 1):
            f.write(
                HOST_TEMPLATE.format(
                    host_id=i,
                    ngpu=ngpu,
                    vram_mib=args.vram_mib,
                    cpuset=args.cpuset_host,
                )
            )

    with open(join(script_dir, "netconfig.toml"), "w") as f:
        # str() of a list of single-quoted names is valid TOML (literal
        # strings), e.g. ['host-1', 'host-2'].
        host_list = str([f"host-{i}" for i in range(1, nhosts + 1)])
        f.write(
            NETCONFIG_TEMPLATE.format(
                host_list=host_list, nracks=(nhosts + 1) // 2
            )
        )

    with open(join(script_dir, "config.sh"), "w") as f:
        f.write(f"EVAL_NHOST={nhosts}\n")
        f.write(f"EVAL_NGPU={ngpu}\n")
17 changes: 17 additions & 0 deletions tests/docker/ColossalAI/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Launch the ColossalAI test across the simulated cluster.
# Usage: run.sh [extra args forwarded to test_ColossalAI.py]
#
# Fixes over the previous revision: fail fast on missing config.sh or
# docker errors (set -eu), quote all expansions so paths with spaces
# survive word splitting, and use $(...) instead of legacy backticks.
set -eu

WORKDIR=$(dirname "$(realpath "$0")")
# config.sh is generated by config_gen.py and defines EVAL_NHOST / EVAL_NGPU.
source "$WORKDIR/config.sh"

compose_file=$WORKDIR/compose.yaml

docker compose -f "$compose_file" down --remove-orphans
docker compose -f "$compose_file" up -d
sleep 1  # give the simulator a moment to create its sockets

# $* joins the forwarded script arguments with spaces inside the command
# string (same expansion the previous $@-in-quotes produced).
cmd="/phantora/dist/phantora_run torchrun --nproc_per_node $EVAL_NGPU --nnodes $EVAL_NHOST --rdzv_backend c10d --rdzv_endpoint=\"host-1:12345\" /phantora/tests/test_ColossalAI.py $*"

# Start workers 2..N detached, then run the rendezvous host in the foreground.
for w in $(seq 2 "$EVAL_NHOST"); do
    docker compose -f "$compose_file" exec -it -d "host-$w" bash -c "$cmd"
done
docker compose -f "$compose_file" exec -it host-1 bash -c "$cmd"
6 changes: 6 additions & 0 deletions tests/docker/ColossalAI/stop.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Tear down the ColossalAI test cluster and remove Phantora's sockets.
# Fixes over the previous revision: $(...) instead of legacy backticks and
# quoted expansions so a workdir containing spaces cannot split.

WORKDIR=$(dirname "$(realpath "$0")")
compose_file=$WORKDIR/compose.yaml

docker compose -f "$compose_file" down --remove-orphans
# Socket files are created root-owned under /run/phantora by the containers.
sudo rm -f /run/phantora/phantora*
2 changes: 2 additions & 0 deletions tests/phantora_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def time_pair() -> float:
LIB.get_time_double.restype = ctypes.c_double
_get_time = LIB.get_time_double
_perf_counter = _time.perf_counter
_wall_time = _time.time

def time() -> float:
_read_timer()
Expand All @@ -34,6 +35,7 @@ def time_pair() -> float:
return t, t_wall

_time.perf_counter = time
#_time.time = time

# seems cannot patch `assert_ints_same_as_other_ranks`
# maybe due to decorator, but cannot reproduce in a mini example
Expand Down
Loading