Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[submodule "pytorch"]
path = pytorch
url = https://github.com/QDelta/pytorch.git
branch = 2.7.1-phantora
url = https://github.com/ruogu-alter/pytorch.git
branch = 2.9.0-phantora
8 changes: 6 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS phantora-pytorch
FROM nvidia/cuda:12.6.3-devel-ubuntu22.04 AS phantora-pytorch

RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive \
Expand Down Expand Up @@ -28,7 +28,7 @@ ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
ENV TORCH_CUDA_ARCH_LIST="8.0;9.0"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV PYTORCH_BUILD_VERSION=2.7.1
ENV PYTORCH_BUILD_VERSION=2.9.0
ENV PYTORCH_BUILD_NUMBER=1
ENV USE_CUDNN=0
ENV USE_CUSPARSELT=0
Expand All @@ -52,6 +52,10 @@ RUN curl -Lo torchtitan-requirements.txt https://raw.githubusercontent.com/pytor
python3 -m pip install --no-cache-dir -r torchtitan-requirements.txt
RUN python3 -m pip install --no-cache-dir megatron-core==0.13.1 transformers==4.41.2 deepspeed==0.17.5 torchtitan==0.1.0

RUN python3 -m pip install --no-cache-dir --no-deps colossalai==0.5.0
RUN python3 -m pip install --no-cache-dir peft==0.10.0
RUN python3 -m pip install --no-cache-dir galore-torch==1.0

# DeepSpeed needs passwordless ssh
COPY config/sshconfig /root/.ssh/config
COPY config/id_ed25519 /root/.ssh/id_ed25519
Expand Down
2 changes: 1 addition & 1 deletion pytorch
Submodule pytorch updated 7824 files
12 changes: 12 additions & 0 deletions stub/cudart.c
Original file line number Diff line number Diff line change
Expand Up @@ -555,12 +555,24 @@ _dummy() // accept any number of arguments
}

// Stub for cudaGetDriverEntryPoint.
// CUDA 12 (CUDART_VERSION >= 12000) added the driverStatus out-parameter,
// so the matching signature is selected at compile time.
cudaError_t
#if CUDART_VERSION >= 12000
cudaGetDriverEntryPoint(const char* symbol,
void** funcPtr,
unsigned long long flags,
enum cudaDriverEntryPointQueryResult* driverStatus)
#else
cudaGetDriverEntryPoint(const char* symbol,
void** funcPtr,
unsigned long long flags)
#endif
{
// TODO: every requested driver symbol currently resolves to the no-op
// _dummy defined earlier in this file; symbol and flags are ignored.
*funcPtr = _dummy;
#if CUDART_VERSION >= 12000
// driverStatus is optional for callers, so guard against NULL.
if (driverStatus) {
*driverStatus = cudaDriverEntryPointSuccess;
}
#endif
return cudaSuccess;
}

Expand Down
26 changes: 25 additions & 1 deletion stub/cudart_noimpl.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,13 +232,23 @@ cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t* pGraph)
}

// Stub: stream-capture introspection is not supported by this shim.
// CUDA 12 promoted the former _v2 interface (capture id, owning graph,
// dependency list) to cudaStreamGetCaptureInfo itself, hence the
// compile-time signature switch.
cudaError_t
#if CUDART_VERSION >= 12000
cudaStreamGetCaptureInfo(cudaStream_t stream,
enum cudaStreamCaptureStatus* captureStatus_out,
unsigned long long* id_out,
cudaGraph_t* graph_out,
const cudaGraphNode_t** dependencies_out,
size_t* numDependencies_out)
#else
cudaStreamGetCaptureInfo(cudaStream_t stream,
enum cudaStreamCaptureStatus* pCaptureStatus,
unsigned long long* pId)
#endif
{
NOT_IMPLEMENTED;
}

#if CUDART_VERSION < 12000
cudaError_t
cudaStreamGetCaptureInfo_v2(cudaStream_t stream,
enum cudaStreamCaptureStatus* captureStatus_out,
Expand All @@ -249,6 +259,7 @@ cudaStreamGetCaptureInfo_v2(cudaStream_t stream,
{
NOT_IMPLEMENTED;
}
#endif

cudaError_t
cudaStreamUpdateCaptureDependencies(cudaStream_t stream,
Expand All @@ -270,7 +281,8 @@ cudaEventRecordWithFlags(cudaEvent_t event,
cudaStream_t stream,
unsigned int flags)
{
NOT_IMPLEMENTED;
(void)flags;
return cudaEventRecord(event, stream);
}

cudaError_t
Expand Down Expand Up @@ -1687,11 +1699,17 @@ cudaGraphDestroyNode(cudaGraphNode_t node)
}

// Stub: CUDA graph instantiation is not supported by this shim.
// CUDA 12 replaced the (error node, log buffer) out-parameters with a
// single flags argument, so the signature is chosen at compile time.
cudaError_t
#if CUDART_VERSION >= 12000
cudaGraphInstantiate(cudaGraphExec_t* pGraphExec,
cudaGraph_t graph,
unsigned long long flags)
#else
cudaGraphInstantiate(cudaGraphExec_t* pGraphExec,
cudaGraph_t graph,
cudaGraphNode_t* pErrorNode,
char* pLogBuffer,
size_t bufferSize)
#endif
{
NOT_IMPLEMENTED;
}
Expand Down Expand Up @@ -1852,10 +1870,16 @@ cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec,
#endif

// Stub: updating an instantiated graph is not supported by this shim.
// CUDA 12 bundles the update outcome into cudaGraphExecUpdateResultInfo;
// earlier versions returned the error node and result separately.
cudaError_t
#if CUDART_VERSION >= 12000
cudaGraphExecUpdate(cudaGraphExec_t hGraphExec,
cudaGraph_t hGraph,
cudaGraphExecUpdateResultInfo* resultInfo)
#else
cudaGraphExecUpdate(cudaGraphExec_t hGraphExec,
cudaGraph_t hGraph,
cudaGraphNode_t* hErrorNode_out,
enum cudaGraphExecUpdateResult* updateResult_out)
#endif
{
NOT_IMPLEMENTED;
}
Expand Down
3 changes: 3 additions & 0 deletions tests/docker/ColossalAI/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/compose.yaml
/netconfig.toml
/config.sh
113 changes: 113 additions & 0 deletions tests/docker/ColossalAI/config_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""Generate the docker-compose / network / shell configuration for the
ColossalAI docker tests.

Writes three git-ignored files next to this script:
  compose.yaml   -- one ``simulator`` service plus ``--nhost`` host services
  netconfig.toml -- Phantora topology sized to the host count
  config.sh      -- EVAL_NHOST / EVAL_NGPU, sourced by run.sh
"""

# NOTE(review): the indentation inside these templates was lost in the
# source capture; it is reconstructed here with conventional 2-space YAML
# nesting -- confirm against the file as checked in.

# Doubled braces ({{ }}) survive str.format() as literal braces, so the
# ${PHANTORA_LOG:-info} shell default reaches the compose file intact.
SIMULATOR_TEMPLATE = r"""
  simulator:
    image: "phantora:latest"
    volumes:
      - /run/phantora:/run/phantora
      - ./netconfig.toml:/netconfig.toml:ro
    pid: host
    ipc: host
    environment:
      - PHANTORA_LOG=${{PHANTORA_LOG:-info}}
      - PHANTORA_SOCKET_PREFIX=/run/phantora/phantora
    command: /phantora/dist/phantora_server --netconfig /netconfig.toml
    cpuset: '{cpuset}'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
"""

# One simulated training host: mounts the shared Phantora socket dir and the
# tests tree, then idles until run.sh execs a command into it.
HOST_TEMPLATE = r"""
  host-{host_id}:
    image: "phantora:latest"
    volumes:
      - /run/phantora:/run/phantora
      - ../..:/phantora/tests:ro
    pid: host
    ipc: host
    environment:
      - CUDA_DEVICE_MAX_CONNECTIONS=1
      - PHANTORA_NGPU={ngpu}
      - PHANTORA_VRAM_MIB={vram_mib}
      - PHANTORA_IGNORE_CPU_TIME=1
      - PHANTORA_SOCKET_PREFIX=/run/phantora/phantora
    hostname: host-{host_id}
    command: sleep infinity
    cpuset: '{cpuset}'
    depends_on:
      - simulator
"""

# rack_size is fixed at 2, so nracks below is ceil(nhosts / 2).
NETCONFIG_TEMPLATE = r"""
host_mapping = {host_list}

[simulator]
loopback_speed = 2880
fairness = "PerFlowMaxMin"

[topology]
type = "TwoLayerMultiPath"

[topology.args]
nspines = 2
nracks = {nracks}
rack_size = 2
host_bw = 800
rack_uplink_port_bw = 800
load_balancer_type = "EcmpEverything"
"""

if __name__ == '__main__':
    import argparse
    from os.path import dirname, realpath, join
    from multiprocessing import cpu_count

    script_dir = dirname(realpath(__file__))

    # Default CPU split: simulator pinned to one core, host containers
    # share the upper half of the remaining cores.
    nproc = cpu_count()
    if nproc <= 2:
        default_sim_core = str(nproc - 1)
        default_host_cpuset = str(nproc - 1)
    else:
        default_sim_core = str(nproc // 2)
        default_host_cpuset = f"{nproc // 2 + 1}-{nproc - 1}"

    parser = argparse.ArgumentParser(
        description="Generate compose.yaml, netconfig.toml and config.sh"
    )
    parser.add_argument("--nhost", type=int, default=4,
                        help="number of simulated hosts")
    parser.add_argument("--ngpu", type=int, default=4,
                        help="simulated GPUs per host")
    parser.add_argument("--vram_mib", type=int, default=143771,
                        help="simulated VRAM per GPU, in MiB")
    parser.add_argument("--cpuset_sim", type=str, default=default_sim_core,
                        help="cpuset for the simulator container")
    parser.add_argument("--cpuset_host", type=str, default=default_host_cpuset,
                        help="cpuset shared by all host containers")
    args = parser.parse_args()

    nhosts = args.nhost
    ngpu = args.ngpu

    # The templates each begin with a newline, so "services:" needs none.
    with open(join(script_dir, "compose.yaml"), "w") as f:
        f.write("services:")
        f.write(SIMULATOR_TEMPLATE.format(cpuset=args.cpuset_sim))
        for i in range(1, nhosts + 1):
            f.write(
                HOST_TEMPLATE.format(
                    host_id=i,
                    ngpu=ngpu,
                    vram_mib=args.vram_mib,
                    cpuset=args.cpuset_host,
                )
            )

    with open(join(script_dir, "netconfig.toml"), "w") as f:
        # str() of a list of single-quoted names is valid TOML (literal
        # strings), e.g. ['host-1', 'host-2'].
        host_list = str([f"host-{i}" for i in range(1, nhosts + 1)])
        f.write(
            NETCONFIG_TEMPLATE.format(
                host_list=host_list, nracks=(nhosts + 1) // 2
            )
        )

    with open(join(script_dir, "config.sh"), "w") as f:
        f.write(f"EVAL_NHOST={nhosts}\n")
        f.write(f"EVAL_NGPU={ngpu}\n")
17 changes: 17 additions & 0 deletions tests/docker/ColossalAI/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Launch the ColossalAI test across the simulated cluster.
# Usage: run.sh [extra args forwarded to test_ColossalAI.py]
#
# Fixes over the previous revision: fail fast on missing config.sh or
# docker errors (set -eu), quote all expansions so paths with spaces
# survive word splitting, and use $(...) instead of legacy backticks.
set -eu

WORKDIR=$(dirname "$(realpath "$0")")
# config.sh is generated by config_gen.py and defines EVAL_NHOST / EVAL_NGPU.
source "$WORKDIR/config.sh"

compose_file=$WORKDIR/compose.yaml

docker compose -f "$compose_file" down --remove-orphans
docker compose -f "$compose_file" up -d
sleep 1  # give the simulator a moment to create its sockets

# $* joins the forwarded script arguments with spaces inside the command
# string (same expansion the previous $@-in-quotes produced).
cmd="/phantora/dist/phantora_run torchrun --nproc_per_node $EVAL_NGPU --nnodes $EVAL_NHOST --rdzv_backend c10d --rdzv_endpoint=\"host-1:12345\" /phantora/tests/test_ColossalAI.py $*"

# Start workers 2..N detached, then run the rendezvous host in the foreground.
for w in $(seq 2 "$EVAL_NHOST"); do
    docker compose -f "$compose_file" exec -it -d "host-$w" bash -c "$cmd"
done
docker compose -f "$compose_file" exec -it host-1 bash -c "$cmd"
6 changes: 6 additions & 0 deletions tests/docker/ColossalAI/stop.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Tear down the ColossalAI test cluster and remove Phantora's sockets.
# Fixes over the previous revision: $(...) instead of legacy backticks and
# quoted expansions so a workdir containing spaces cannot split.

WORKDIR=$(dirname "$(realpath "$0")")
compose_file=$WORKDIR/compose.yaml

docker compose -f "$compose_file" down --remove-orphans
# Socket files are created root-owned under /run/phantora by the containers.
sudo rm -f /run/phantora/phantora*
2 changes: 2 additions & 0 deletions tests/phantora_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def time_pair() -> float:
LIB.get_time_double.restype = ctypes.c_double
_get_time = LIB.get_time_double
_perf_counter = _time.perf_counter
_wall_time = _time.time

def time() -> float:
_read_timer()
Expand All @@ -34,6 +35,7 @@ def time_pair() -> float:
return t, t_wall

_time.perf_counter = time
#_time.time = time

# seems cannot patch `assert_ints_same_as_other_ranks`
# maybe due to decorator, but cannot reproduce in a mini example
Expand Down
Loading