diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 4e36909f45e..19c7c7cae0e 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -254,7 +254,7 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) { void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(backend); GGML_ASSERT(tensor); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); if (backend->iface.set_tensor_async == NULL) { @@ -268,7 +268,7 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(backend); GGML_ASSERT(tensor); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); if (backend->iface.get_tensor_async == NULL) { @@ -283,7 +283,7 @@ void ggml_backend_tensor_set_2d_async(ggml_backend_t backend, struct ggml_tensor size_t n_copies, size_t stride_tensor, size_t stride_data) { GGML_ASSERT(backend); GGML_ASSERT(tensor); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) { for (size_t i = 0; i < n_copies; i++) { @@ -295,7 +295,7 @@ void ggml_backend_tensor_set_2d_async(ggml_backend_t backend, struct ggml_tensor return; } - GGML_ASSERT(tensor->data != 
NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); backend->iface.set_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data); } @@ -304,7 +304,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_ size_t n_copies, size_t stride_tensor, size_t stride_data) { GGML_ASSERT(backend); GGML_ASSERT(tensor); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) { for (size_t i = 0; i < n_copies; i++) { @@ -316,7 +316,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_ return; } - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data); } @@ -330,7 +330,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz return; } - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); buf->iface.set_tensor(buf, tensor, data, offset, size); @@ -345,7 +345,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz return; } - GGML_ASSERT(tensor->data 
!= NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); buf->iface.get_tensor(buf, tensor, data, offset, size); @@ -367,7 +367,7 @@ void ggml_backend_tensor_set_2d(struct ggml_tensor * tensor, const void * data, return; } - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); buf->iface.set_tensor_2d(buf, tensor, data, offset, size, n_copies, stride_tensor, stride_data); @@ -389,7 +389,7 @@ void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, return; } - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); buf->iface.get_tensor_2d(buf, tensor, data, offset, size, n_copies, stride_tensor, stride_data); @@ -404,7 +404,7 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size } GGML_ASSERT(buf != NULL && "tensor buffer not set"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT((tensor->data != NULL || (tensor->view_src != NULL && tensor->view_src->data != NULL)) && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer"); diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index d3e95f678a5..74ce6ae9e1c 100644 --- 
a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -1299,14 +1299,19 @@ void launch_fattn( #ifdef GGML_USE_HIP // HIP/ROCm: bypass the memory pool for f16 temp buffers. - // The legacy pool (ggml_cuda_pool_leg) retains peak-sized allocations permanently. - // For quantized KV dequant, this means the f16 temp buffer stays allocated, - // consuming more VRAM than the quantized KV compression saves — causing OOM. - // Using raw alloc+free ensures the memory is released after the kernel completes. + // The legacy pool (ggml_cuda_pool_leg) retains peak-sized allocations permanently + // because free() stores buffers for reuse rather than releasing them. + // On HIP without VMM support (RDNA 3/4), this means the f16 dequant temp buffers + // for quantized KV stay allocated after use, consuming more VRAM than the KV + // compression saves — causing OOM at context lengths where an f16 KV cache still fits. + // Using raw cudaMalloc/cudaFree ensures memory is released after the kernel completes. 
+ // Ref: https://github.com/ggml-org/llama.cpp/issues/22107 struct hip_f16_alloc { half * ptr = nullptr; cudaStream_t stream; hip_f16_alloc(cudaStream_t s) : stream(s) {} + hip_f16_alloc(const hip_f16_alloc &) = delete; + hip_f16_alloc & operator=(const hip_f16_alloc &) = delete; ~hip_f16_alloc() { if (ptr) { // Cast to void: hipStreamSynchronize / hipFree are [[nodiscard]] under diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a296d0ab446..831ff31ffa7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4600,6 +4600,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_1], "set_rows_q5_1" #itype, set_rows_q5_1 ## itype ## _len, set_rows_q5_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q8_0], "set_rows_q8_0" #itype, set_rows_q8_0 ## itype ## _len, set_rows_q8_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TURBO3_0], "set_rows_turbo3_0" #itype, set_rows_turbo3_0 ## itype ## _len, set_rows_turbo3_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_TURBO4_0], "set_rows_turbo4_0" #itype, set_rows_turbo4_0 ## itype ## _len, set_rows_turbo4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, 
device->pipeline_set_rows ## itype [GGML_TYPE_IQ4_NL], "set_rows_iq4_nl" #itype, set_rows_iq4_nl ## itype ## _len, set_rows_iq4_nl ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); SET_ROWS(_i32) @@ -15814,6 +15816,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: + case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: case GGML_TYPE_IQ4_NL: return true; default: @@ -15838,6 +15842,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: + case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: case GGML_TYPE_IQ4_NL: return true; default: @@ -15853,6 +15859,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: + case GGML_TYPE_TURBO3_0: + case GGML_TYPE_TURBO4_0: case GGML_TYPE_IQ4_NL: return true; default: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index d99b2b5d802..91ab4c00427 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -749,7 +749,7 @@ void process_shaders() { string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); } - for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) { + for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl", "turbo3_0", "turbo4_0"}) { string_to_spv("set_rows_" + t + "_i32", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("set_rows_" 
+ t + "_i64", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); }