Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
06173ad
Happy Init
pkourdis Dec 6, 2025
7b40721
comments and format
Chao1Han Nov 3, 2025
8c16c18
typo
Chao1Han Nov 3, 2025
d5aa7e7
revert split support and leave for separate PR
zhangxiaoli73 Nov 6, 2025
bde7460
move some APIs if we cannot support now
zhangxiaoli73 Nov 6, 2025
43ffef8
fix ptr
Chao1Han Nov 10, 2025
35424a6
support xccl test
Chao1Han Nov 7, 2025
02f98f3
support allreduce
zhangxiaoli73 Nov 10, 2025
a27f699
Revert "support xccl test"
Chao1Han Nov 10, 2025
09063d6
Add env check
Chao1Han Nov 10, 2025
09b00cf
feat(xpu): add Intel extension support for free memory query
pkourdis Dec 5, 2025
5eaa25b
refactor(xpu): move free memory warning to device initialization
pkourdis Nov 26, 2025
3c68257
improve(xpu): refine free memory warning message
pkourdis Dec 2, 2025
0c087f3
fix(xccl): suppress unused variable warning for device_prop
pkourdis Dec 2, 2025
89b144d
perf(xpu): add branch prediction hints for Intel free memory extension
pkourdis Dec 3, 2025
6574bde
Apply suggestion from @frost-intel
frost-intel Dec 3, 2025
82affe6
style(xccl): Apply clang-format to XpuApi.cpp file
pkourdis Dec 5, 2025
15c1bd6
fix(xccl): prevent exception propagation from destructor
pkourdis Dec 6, 2025
c0f6a65
Address empty input tensor for all_reduce operation from unittest
newtdms Dec 5, 2025
4c5ee99
Fix typo
newtdms Dec 5, 2025
e798ec7
Fix comment
pkourdis Dec 5, 2025
d65c4f5
Add a fix for PREMUL_SUM when world size is 1 due to oneCCL bug
frost-intel Dec 12, 2025
8611648
Workaround for reduce PREMUL_SUM
frost-intel Dec 15, 2025
15b5f81
refactor(xccl): simplify preReduce function using std::visit
pkourdis Dec 15, 2025
63b8d70
make preReduce static
frost-intel Dec 15, 2025
a68aab7
Add XCCL backend documentation to README
siju-samuel Nov 27, 2025
2402ed7
feat(xccl): add placeholder for getMemAllocator
pkourdis Dec 6, 2025
f7f00c1
refactor: replace CUDA-specific device selection with accelerator API
pkourdis Nov 26, 2025
b81c952
Update readme about XCCL backend source code building
zhangxiaoli73 Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@ option(USE_NCCLX "Whether to build NCCLX or not" ON)
option(USE_GLOO "Whether to build Gloo or not" ON)
option(USE_RCCL "Whether to build RCCL or not" OFF)
option(USE_RCCLX "Whether to build RCCLX or not" OFF)
option(USE_XCCL "Whether to build XCCL or not" OFF)
option(USE_TRANSPORT "Whether to build TRANSPORT or not" ON)
message(STATUS " USE_NCCL : ${USE_NCCL}")
message(STATUS " USE_NCCLX : ${USE_NCCLX}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_RCCL : ${USE_RCCL}")
message(STATUS " USE_RCCLX : ${USE_RCCLX}")
message(STATUS " USE_XCCL : ${USE_XCCL}")
message(STATUS " USE_TRANSPORT : ${USE_TRANSPORT}")

if(NOT DEFINED ENV{TORCH_CUDA_ARCH_LIST})
Expand Down Expand Up @@ -123,6 +125,9 @@ endif()
if (USE_RCCLX)
include(comms/torchcomms/rcclx/CMakeLists.txt)
endif()
if (USE_XCCL)
include(comms/torchcomms/xccl/CMakeLists.txt)
endif()
if (USE_TRANSPORT)
include(comms/torchcomms/transport/CMakeLists.txt)
endif()
Expand Down
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ torchcomms requires the following software and hardware:
- Python 3.10 or higher
- PyTorch 2.8 or higher
- CUDA-capable GPU (for NCCL/NCCLX or RCCL backends)
- Intel XPU (for XCCL backend)

## Installation

Expand Down Expand Up @@ -119,6 +120,25 @@ export RCCLX_LIB=${BUILD_DIR}/lib
```


##### XCCL Backend

Source the Intel oneAPI environment (update the path to your oneAPI installation):
```bash
export INTEL_ONEAPI=/path/to/intel/oneapi # e.g., /opt/intel/oneapi or ~/intel/oneapi
source $INTEL_ONEAPI/compiler/latest/env/vars.sh
source $INTEL_ONEAPI/ccl/latest/env/vars.sh
```

Enable XCCL backend and install
```bash
export USE_XCCL=ON
export USE_NCCL=OFF
export USE_NCCLX=OFF
export USE_TRANSPORT=OFF
pip install --no-build-isolation -v .
```


#### Install torchcomms:

```bash
Expand All @@ -138,6 +158,7 @@ export USE_NCCLX=ON # Default: ON
export USE_GLOO=ON # Default: ON
export USE_RCCL=OFF # Default: OFF
export USE_RCCLX=OFF # Default: OFF
export USE_XCCL=OFF # Default: OFF
```

Then run:
Expand Down
332 changes: 332 additions & 0 deletions comms/torchcomms/device/XpuApi.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,332 @@
#include "comms/torchcomms/device/XpuApi.hpp"
#include <ATen/xpu/XPUContext.h>
#include <c10/xpu/XPUFunctions.h>
#include <c10/xpu/XPUStream.h>
#include <sstream>
#include <stdexcept>
#include "comms/torchcomms/TorchCommLogging.hpp"

namespace torch {
namespace comms {

xpu_result_t DefaultXpuApi::setDevice(int device) {
  // Delegate device selection to PyTorch's XPU runtime. Any failure (e.g.
  // an out-of-range device index) surfaces as an exception, which we map to
  // an error code instead of letting it propagate.
  try {
    ::c10::xpu::set_device(device);
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_VALUE;
  }
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::getDeviceProperties(
    xpuDeviceProp* prop,
    int device) {
  // Fills a CUDA-style device property struct from SYCL device-info queries.
  // Returns XPU_ERROR_INVALID_VALUE for a null output pointer or when any
  // SYCL query throws (e.g. an invalid device index).
  if (!prop) {
    return XPU_ERROR_INVALID_VALUE;
  }

  try {
    sycl::device sycl_device = ::c10::xpu::get_raw_device(device);

    // Get device name (truncated to fit; always null-terminated).
    // NOTE(review): assumes prop->name holds at least 256 bytes — confirm
    // the buffer size declared in XpuApi.hpp.
    std::string device_name = sycl_device.get_info<sycl::info::device::name>();
    strncpy(prop->name, device_name.c_str(), 255);
    prop->name[255] = '\0';

    // Get memory info
    prop->totalGlobalMem =
        sycl_device.get_info<sycl::info::device::global_mem_size>();

    // Warn when the Intel free-memory extension is missing; memGetInfo()
    // will then report total global memory as the free-memory estimate.
    if (!sycl_device.has(sycl::aspect::ext_intel_free_memory)) [[unlikely]] {
      TC_LOG(WARNING)
          << "Free memory queries are unsupported on this SYCL device; using total global memory as the free-memory estimate.";
    }

    // Get compute capabilities
    auto max_work_group_size =
        sycl_device.get_info<sycl::info::device::max_work_group_size>();
    auto max_work_item_sizes =
        sycl_device.get_info<sycl::info::device::max_work_item_sizes<3>>();
    auto max_compute_units =
        sycl_device.get_info<sycl::info::device::max_compute_units>();

    // Map SYCL concepts onto the CUDA-style fields: compute units stand in
    // for multiprocessors and work-group limits for per-block limits.
    prop->multiProcessorCount = max_compute_units;
    prop->maxThreadsPerBlock = max_work_group_size;
    prop->maxThreadsDim[0] = max_work_item_sizes[0];
    prop->maxThreadsDim[1] = max_work_item_sizes[1];
    prop->maxThreadsDim[2] = max_work_item_sizes[2];

    return XPU_SUCCESS;
  } catch (const std::exception& e) {
    return XPU_ERROR_INVALID_VALUE;
  }
}

xpu_result_t DefaultXpuApi::memGetInfo(size_t* free, size_t* total) {
  // Reports free and total global memory (in bytes) for the current XPU
  // device. Accurate free-memory queries require the Intel
  // `ext_intel_free_memory` SYCL extension; without it we fall back to
  // reporting the total as the free amount (a warning about this is emitted
  // from getDeviceProperties).
  if (!free || !total) {
    return XPU_ERROR_INVALID_VALUE;
  }

  try {
    int device = ::c10::xpu::current_device();
    sycl::device& sycl_device = ::c10::xpu::get_raw_device(device);

    *total = sycl_device.get_info<sycl::info::device::global_mem_size>();
    // The extension is expected to be present in the common case, hence the
    // branch hints.
    if (sycl_device.has(sycl::aspect::ext_intel_free_memory)) [[likely]] {
      *free =
          sycl_device.get_info<sycl::ext::intel::info::device::free_memory>();
    } else [[unlikely]] {
      // Best-effort estimate: without the extension we cannot do better.
      *free = *total;
    }

    return XPU_SUCCESS;
  } catch (const std::exception& e) {
    return XPU_ERROR_INVALID_VALUE;
  }
}

xpu_result_t DefaultXpuApi::getDeviceCount(int* count) {
  // Reports the number of XPU devices visible to the PyTorch runtime.
  if (count == nullptr) {
    return XPU_ERROR_INVALID_VALUE;
  }
  try {
    *count = ::c10::xpu::device_count();
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_VALUE;
  }
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::streamCreateWithPriority(
    xpuStream_t& stream,
    unsigned int /*flags*/,
    int priority) {
  // Hands out a stream from PyTorch's XPU stream pool. Only a two-level
  // priority scheme is supported: negative values map to the high-priority
  // pool, everything else to the normal pool. `flags` is intentionally
  // ignored (name commented out to silence -Wunused-parameter) and kept
  // only for CUDA-like API parity.
  try {
    // Map priority: priority < 0 = high, priority >= 0 = normal
    bool isHighPriority = (priority < 0);
    stream = ::c10::xpu::getStreamFromPool(isHighPriority);
    return XPU_SUCCESS;
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_VALUE;
  }
}

xpu_result_t DefaultXpuApi::streamDestroy(const xpuStream_t& /*stream*/) {
  // Streams come from PyTorch's stream pool and are owned by the PyTorch
  // XPU runtime, so there is nothing to release here. The parameter name is
  // commented out to silence -Wunused-parameter.
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::streamWaitEvent(
    const xpuStream_t& stream,
    xpuEvent_t& event,
    unsigned int /*flags*/) {
  // Makes `stream` wait until `event` completes by blocking the stream on
  // the event (device-side ordering; the host is not blocked). `flags` is
  // intentionally ignored (name commented out to silence
  // -Wunused-parameter).
  try {
    event.block(stream);
    return XPU_SUCCESS;
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_HANDLE;
  }
}

// Thin pass-through to PyTorch's per-device current-stream lookup.
xpuStream_t DefaultXpuApi::getCurrentXPUStream(int device_index) {
  return ::c10::xpu::getCurrentXPUStream(device_index);
}

xpu_result_t DefaultXpuApi::streamSynchronize(const xpuStream_t& stream) {
  // Blocks the host until all commands submitted to this stream's
  // underlying SYCL queue have finished, surfacing asynchronous errors as
  // an error code.
  try {
    stream.queue().wait_and_throw();
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_HANDLE;
  }
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::streamIsCapturing(
    const xpuStream_t& /*stream*/,
    xpuStreamCaptureStatus* pCaptureStatus) {
  // Stream capture does not exist on XPU/SYCL, so every stream always
  // reports "not capturing". The stream parameter is intentionally unused
  // (name commented out to silence -Wunused-parameter).
  if (!pCaptureStatus) {
    return XPU_ERROR_INVALID_VALUE;
  }

  *pCaptureStatus = xpuStreamCaptureStatusNone;
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::streamGetCaptureInfo(
    const xpuStream_t& /*stream*/,
    xpuStreamCaptureStatus* pCaptureStatus,
    unsigned long long* pId) {
  // Stream capture does not exist on XPU/SYCL: always reports "not
  // capturing" with a capture id of 0. `pId` is optional and only written
  // when non-null. The stream parameter is intentionally unused (name
  // commented out to silence -Wunused-parameter).
  if (!pCaptureStatus) {
    return XPU_ERROR_INVALID_VALUE;
  }

  *pCaptureStatus = xpuStreamCaptureStatusNone;
  if (pId) {
    *pId = 0;
  }
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::malloc(void** devPtr, size_t size) {
  // Allocates `size` bytes of device-only USM memory on the current XPU
  // device via SYCL's malloc_device. A zero-byte request succeeds and
  // stores a null pointer.
  if (devPtr == nullptr) {
    return XPU_ERROR_INVALID_VALUE;
  }
  if (size == 0) {
    *devPtr = nullptr;
    return XPU_SUCCESS;
  }

  try {
    const int device = ::c10::xpu::current_device();
    *devPtr = sycl::malloc_device(
        size,
        ::c10::xpu::get_raw_device(device),
        ::c10::xpu::get_device_context());
    return (*devPtr != nullptr) ? XPU_SUCCESS : XPU_ERROR_OUT_OF_MEMORY;
  } catch (const std::exception&) {
    return XPU_ERROR_OUT_OF_MEMORY;
  }
}

xpu_result_t DefaultXpuApi::free(void* devPtr) {
  // Releases USM memory previously obtained from malloc(). Freeing a null
  // pointer is a successful no-op.
  if (devPtr == nullptr) {
    return XPU_SUCCESS;
  }

  try {
    sycl::free(devPtr, ::c10::xpu::get_device_context());
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_VALUE;
  }
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::memcpyAsync(
    void* dst,
    const void* src,
    size_t count,
    const xpuStream_t& stream) {
  // Enqueues an asynchronous copy of `count` bytes on the stream's SYCL
  // queue; the copy may still be in flight when this function returns.
  if (dst == nullptr || src == nullptr) {
    return XPU_ERROR_INVALID_VALUE;
  }
  if (count == 0) {
    return XPU_SUCCESS; // nothing to copy
  }

  try {
    stream.queue().memcpy(dst, src, count);
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_VALUE;
  }
  return XPU_SUCCESS;
}


xpu_result_t DefaultXpuApi::eventCreate(xpuEvent_t& event) {
  // Replaces `event` with a freshly constructed XPUEvent that does not
  // record timing information.
  try {
    event = ::at::xpu::XPUEvent(/*enable_timing=*/false);
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_VALUE;
  }
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::eventCreateWithFlags(
    xpuEvent_t& event,
    unsigned int flags) {
  // Creates an XPUEvent, enabling timing when bit 0x1 of `flags` is set.
  // NOTE(review): this encoding differs from CUDA, where timing is on by
  // default and cudaEventDisableTiming is 0x2 (0x1 is BlockingSync) —
  // confirm the intended flag values against the declarations in
  // XpuApi.hpp.
  try {
    bool enable_timing = (flags & 0x1) != 0;
    event = ::at::xpu::XPUEvent(enable_timing);
    return XPU_SUCCESS;
  } catch (const std::exception& e) {
    return XPU_ERROR_INVALID_VALUE;
  }
}

xpu_result_t DefaultXpuApi::eventDestroy(const xpuEvent_t& /*event*/) {
  // XPUEvent cleans itself up via RAII, so destruction is a no-op here.
  // The parameter name is commented out to silence -Wunused-parameter.
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::eventRecord(
    xpuEvent_t& event,
    const xpuStream_t& stream) {
  // Records `event` on `stream`; the event completes once the work
  // submitted to the stream before this call has finished.
  try {
    event.record(stream);
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_HANDLE;
  }
  return XPU_SUCCESS;
}

xpu_result_t DefaultXpuApi::eventQuery(const xpuEvent_t& event) {
  // Non-blocking completion check: XPU_SUCCESS when the event has finished,
  // XPU_ERROR_NOT_READY while its work is still pending.
  try {
    if (event.query()) {
      return XPU_SUCCESS;
    }
    return XPU_ERROR_NOT_READY;
  } catch (const std::exception&) {
    return XPU_ERROR_INVALID_HANDLE;
  }
}

// Graph Operations (Unsupported)
xpu_result_t DefaultXpuApi::userObjectCreate(
    xpuUserObject_t* /*object_out*/,
    void* /*ptr*/,
    xpuHostFn_t /*destroy*/,
    unsigned int /*initialRefcount*/,
    unsigned int /*flags*/) {
  // XPU/SYCL doesn't support user objects; this stub always reports the
  // feature as unsupported and writes no outputs. Parameter names are
  // commented out to silence -Wunused-parameter.
  return XPU_ERROR_UNSUPPORTED;
}

xpu_result_t DefaultXpuApi::graphRetainUserObject(
    xpuGraph_t /*graph*/,
    xpuUserObject_t /*object*/,
    unsigned int /*count*/,
    unsigned int /*flags*/) {
  // Currently, XPU/SYCL doesn't support graphs; this stub always reports
  // the feature as unsupported. Parameter names are commented out to
  // silence -Wunused-parameter.
  return XPU_ERROR_UNSUPPORTED;
}

xpu_result_t DefaultXpuApi::streamGetCaptureInfo_v2(
    const xpuStream_t& /*stream*/,
    xpuStreamCaptureStatus* /*captureStatus_out*/,
    unsigned long long* /*id_out*/,
    xpuGraph_t* /*graph_out*/,
    const xpuGraphNode_t** /*dependencies_out*/,
    size_t* /*numDependencies_out*/) {
  // Currently, XPU/SYCL doesn't support graphs (or stream capture); this
  // extended query is a stub that always reports the feature as unsupported
  // and writes no outputs. Parameter names are commented out to silence
  // -Wunused-parameter.
  return XPU_ERROR_UNSUPPORTED;
}

// Error Handling
const char* DefaultXpuApi::getErrorString(xpu_result_t error) {
  // Translates an xpu_result_t into a short, human-readable description.
  // Unrecognized codes map to a generic message.
  if (error == XPU_SUCCESS) {
    return "success";
  }
  if (error == XPU_ERROR_INVALID_VALUE) {
    return "invalid value";
  }
  if (error == XPU_ERROR_NOT_READY) {
    return "not ready";
  }
  if (error == XPU_ERROR_INVALID_HANDLE) {
    return "invalid handle";
  }
  if (error == XPU_ERROR_OUT_OF_MEMORY) {
    return "out of memory";
  }
  if (error == XPU_ERROR_UNSUPPORTED) {
    return "unsupported feature";
  }
  return "unknown error";
}

} // namespace comms
} // namespace torch
Loading