ggml/src/ggml-cann/ggml-cann.cpp: 23 additions, 21 deletions
@@ -2497,27 +2497,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
 }
 
-/**
- * @brief Determines if a tensor operation should be offloaded to the CANN
- * backend.
- *
- * This function checks if a given tensor operation should be offloaded to the
- * CANN backend based on the operation type and the size of the tensor. It
- * returns true if the second dimension (ne[1]) of the tensor is greater than or
- * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS.
- *
- * @param backend Pointer to the CANN backend.
- * @param op Pointer to the tensor operation to check.
- * @return bool Returns true if the operation should be offloaded, otherwise
- * false.
- */
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    GGML_UNUSED(dev);
-
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
-}
-
 /**
  * @brief Records an event on the CANN backend stream.
  *
@@ -2593,6 +2572,7 @@ struct ggml_backend_cann_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -2669,6 +2649,26 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
     return ggml_backend_cann_host_buffer_type();
 }
 
+/**
+ * @brief Determines if a tensor operation should be offloaded to the CANN
+ * backend.
+ *
+ * This function checks if a given tensor operation should be offloaded to the
+ * CANN backend based on the operation type and the size of the tensor. It
+ * returns true if the second dimension (ne[1]) of the tensor is greater than or
+ * equal to the device's configured minimum batch size and the operation is not GGML_OP_GET_ROWS.
+ *
+ * @param dev Pointer to the CANN backend device.
+ * @param op Pointer to the tensor operation to check.
+ * @return bool Returns true if the operation should be offloaded, otherwise
+ * false.
+ */
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+
+    return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
+}
+
 /**
  * @brief Creates a new event for the CANN backend device.
  *
@@ -2785,12 +2785,14 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
         if (!initialized) {
             aclInit(nullptr);
             ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             for (int i = 0; i < ggml_cann_info().device_count; i++) {
                 ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
                 dev_ctx->description = aclrtGetSocName();
                 dev_ctx->device = i;
                 dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
                 dev_ctx->op_offload_min_batch_size = min_batch_size;
+                ggml_cann_set_device(i);
                 ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
                                                                   /* .reg = */ &reg,
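Each backend reads the new threshold with the same getenv/atoi lookup shown in the registration code above. A minimal standalone sketch of that lookup and the edge cases atoi implies (the helper name is illustrative, not part of the change):

// Standalone sketch, not part of the diff: mirrors the expression
//     getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32
#include <cstdio>
#include <cstdlib>

static int op_offload_min_batch_from_env() {
    const char * s = std::getenv("GGML_OP_OFFLOAD_MIN_BATCH");
    return s ? std::atoi(s) : 32;
}

int main() {
    // unset            -> 32 (the previous hard-coded default)
    // "64"             -> 64
    // "0", "" or "abc" -> 0  (atoi returns 0, so every op passes the size check)
    // "-1"             -> -1 (likewise, the threshold is effectively disabled)
    std::printf("min batch size: %d\n", op_offload_min_batch_from_env());
    return 0;
}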
ggml/src/ggml-cuda/ggml-cuda.cu: 5 additions, 4 deletions
@@ -4107,6 +4107,7 @@ struct ggml_backend_cuda_device_context {
     std::string name;
     std::string description;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4660,11 +4661,9 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-
-    return get_op_batch_size(op) >= min_batch_size;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
 
-    GGML_UNUSED(dev);
+    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
@@ -4832,6 +4831,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
             ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                 ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4845,6 +4845,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                 char pci_bus_id[16] = {};
                 snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                 dev_ctx->pci_bus_id = pci_bus_id;
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface = */ ggml_backend_cuda_device_interface,
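Note that in every backend the variable is read once, inside the if (!initialized) block of the registration function, and the value is copied into each device context. It therefore has to be set before the first registration call; changing it afterwards does not affect devices that are already registered. A small illustrative sketch (POSIX setenv; the include paths are assumptions, adjust for the actual build):

// Illustrative sketch, not part of the diff: set the variable before the
// backend registry is first touched, since the value is cached at registration.
#include <cstdlib>
#include "ggml-backend.h"   // assumed include path
#include "ggml-cuda.h"      // assumed include path

int main() {
    setenv("GGML_OP_OFFLOAD_MIN_BATCH", "64", 1);      // POSIX; use _putenv_s on Windows
    ggml_backend_reg_t reg = ggml_backend_cuda_reg();  // devices registered here see 64
    (void) reg;
    return 0;
}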
ggml/src/ggml-metal/ggml-metal-device.h: 2 additions, 0 deletions
@@ -219,6 +219,8 @@ struct ggml_metal_device_props {
     bool use_shared_buffers;
 
     bool supports_gpu_family_apple7;
+
+    int op_offload_min_batch_size;
 };
 
 ggml_metal_device_t ggml_metal_device_init(void);
ggml/src/ggml-metal/ggml-metal-device.m: 2 additions, 0 deletions
@@ -782,6 +782,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 
     dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
 
+    dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
+
     dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
     dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
     dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
ggml/src/ggml-metal/ggml-metal.cpp: 2 additions, 5 deletions
@@ -625,14 +625,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
 
     return (op->op == GGML_OP_MUL_MAT ||
             op->op == GGML_OP_MUL_MAT_ID) &&
-           get_op_batch_size(op) >= min_batch_size;
-
-    GGML_UNUSED(dev);
-    GGML_UNUSED(op);
+           get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }
 
 static ggml_backend_device_i ggml_backend_metal_device_i = {
ggml/src/ggml-sycl/ggml-sycl.cpp: 5 additions, 3 deletions
@@ -4286,6 +4286,7 @@ struct ggml_backend_sycl_device_context {
     int device;
     std::string name;
     std::string description;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_sycl_device_get_name(ggml_backend_dev_t dev) {
@@ -4674,9 +4675,8 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }
 
 static bool ggml_backend_sycl_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
-    return get_op_batch_size(op) >= min_batch_size;
-    GGML_UNUSED(dev);
+    ggml_backend_sycl_device_context * sycl_ctx = (ggml_backend_sycl_device_context *)dev->context;
+    return get_op_batch_size(op) >= sycl_ctx->op_offload_min_batch_size;
 }
 
 static ggml_backend_event_t
@@ -4799,6 +4799,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
             ggml_backend_sycl_reg_context * ctx = new ggml_backend_sycl_reg_context;
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
 
             for (int i = 0; i < ggml_sycl_info().device_count; i++) {
                 ggml_backend_sycl_device_context * dev_ctx = new ggml_backend_sycl_device_context;
@@ -4812,6 +4813,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
                     prop, dpct::dev_mgr::instance().get_device(i))));
 
                 dev_ctx->description = prop.get_name();
+                dev_ctx->op_offload_min_batch_size = min_batch_size;
 
                 ggml_backend_dev_t dev = new ggml_backend_device {
                     /* .iface = */ ggml_backend_sycl_device_interface,
ggml/src/ggml-vulkan/ggml-vulkan.cpp: 6 additions, 5 deletions
@@ -14077,6 +14077,7 @@ struct ggml_backend_vk_device_context {
     std::string description;
     bool is_integrated_gpu;
     std::string pci_bus_id;
+    int op_offload_min_batch_size;
 };
 
 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -14651,12 +14652,10 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba
 }
 
 static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    const int min_batch_size = 32;
+    ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context;
 
-    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
-           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
-
-    UNUSED(dev);
+    return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 }
 
 static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) {
@@ -14737,6 +14736,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
         static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
+            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
             for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
                 ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
                 char desc[256];
@@ -14746,6 +14746,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                 ctx->description = desc;
                 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
                 ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
+                ctx->op_offload_min_batch_size = min_batch_size;
                 devices.push_back(new ggml_backend_device {
                     /* .iface = */ ggml_backend_vk_device_i,
                     /* .reg = */ reg,
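To make the effect of the threshold concrete, the following self-contained sketch reproduces the Vulkan-style check shown above with a simplified stand-in for ggml_tensor (the struct, enum, and values are illustrative, not the real ggml types):

// Illustrative sketch, not part of the diff: same shape of test as
// ggml_backend_vk_device_offload_op, with fake types standing in for ggml's.
#include <cstdint>
#include <cstdio>

enum fake_op { OP_MUL_MAT, OP_MUL_MAT_ID, OP_GET_ROWS };
struct fake_tensor { fake_op op; int64_t ne[4]; };

static bool offload_op(const fake_tensor & t, int min_batch) {
    return (t.ne[1] >= min_batch && t.op != OP_GET_ROWS) ||
           (t.ne[2] >= min_batch && t.op == OP_MUL_MAT_ID);
}

int main() {
    fake_tensor mm   = { OP_MUL_MAT,  {4096, 48, 1, 1} };  // 48 rows in the batch dimension
    fake_tensor rows = { OP_GET_ROWS, {4096, 64, 1, 1} };  // excluded by op type
    std::printf("mul_mat,  threshold 32: %d\n", offload_op(mm, 32));    // 1: 48 >= 32
    std::printf("mul_mat,  threshold 64: %d\n", offload_op(mm, 64));    // 0: 48 <  64
    std::printf("get_rows, threshold 32: %d\n", offload_op(rows, 32));  // 0
    return 0;
}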