From f58c91289d03d020a4ef4e67930a461f3ef0607d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Mar 2025 16:49:41 +0200
Subject: [PATCH 1/3] llama : make loras compatible with repacking

ggml-ci
---
 src/llama-adapter.cpp | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index b448614e471d6..f2d4c9df8dafe 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -247,6 +247,27 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }
     }
 
+    // get extra buffer types of the CPU
+    std::vector<ggml_backend_buffer_type_t> buft_extra;
+    {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+
+        // add the default CPU buffer type which will be used as a fallback if the lora needs to be loaded to an extra buft
+        buft_extra.emplace_back(ggml_backend_dev_buffer_type(cpu_dev));
+
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_extra.emplace_back(*extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    }
+
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
@@ -263,7 +284,20 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
-        ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
+
+        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        for (auto & ex : buft_extra) {
+            if (ex == buft) {
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+                buft = buft_extra[0];
+                break;
+            }
+        }
+
+        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+        ggml_context * dev_ctx = ctx_for_buft(buft);
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()

From 9aef6acdcc135b8580c7405cb0adbc24cce713fd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Mar 2025 18:29:54 +0200
Subject: [PATCH 2/3] cont : simplify

ggml-ci
---
 src/llama-adapter.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index f2d4c9df8dafe..b9766dc96d786 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -253,9 +253,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
 
-        // add the default CPU buffer type which will be used as a fallback if the lora needs to be loaded to an extra buft
-        buft_extra.emplace_back(ggml_backend_dev_buffer_type(cpu_dev));
-
         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
             ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
 
@@ -289,8 +286,11 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
         for (auto & ex : buft_extra) {
             if (ex == buft) {
-                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
-                buft = buft_extra[0];
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+
                 break;
             }
         }

From 1f9dc73cf3a91600879595aa439f8c0c277eeb02 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 27 Mar 2025 08:23:50 +0200
Subject: [PATCH 3/3] cont : add TODO

[no ci]
---
 src/llama-adapter.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index b9766dc96d786..7ac54d2391fd0 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -248,6 +248,8 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     }
 
     // get extra buffer types of the CPU
+    // TODO: a more general solution for non-CPU extra bufts should be implemented in the future
+    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
     {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
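
For reference, here is the fallback logic the series converges on, condensed into a standalone sketch. It only uses the ggml backend calls that appear in the patches themselves; the helper name `pick_lora_buft` and the exact headers are illustrative assumptions, not part of the patch:

```cpp
#include "ggml-backend.h"
#include "ggml-cpu.h"   // assumed location of ggml_backend_dev_get_extra_bufts_t

// hypothetical helper: decide which buffer type a lora tensor should use,
// given the buffer type of the base model weight it applies to
static ggml_backend_buffer_type_t pick_lora_buft(ggml_backend_buffer_type_t model_buft) {
    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

    // extra buffer types of the CPU backend (e.g. repacked weight layouts) are
    // discovered at runtime through the backend registry
    auto get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

    if (get_extra_bufts_fn) {
        // the function returns a NULL-terminated array of buffer types
        for (auto * cur = get_extra_bufts_fn(cpu_dev); cur && *cur; ++cur) {
            if (*cur == model_buft) {
                // the base weight lives in a repacked buft where the lora
                // tensors cannot go -> fall back to the default CPU buft
                return ggml_backend_dev_buffer_type(cpu_dev);
            }
        }
    }

    // otherwise keep the lora in the same buft as the base weight
    return model_buft;
}
```

Note the simplification in PATCH 2: the default CPU buffer type is re-derived at the fallback site instead of being stored as the `buft_extra[0]` sentinel, which is why the first hunk of that patch drops the `emplace_back` of the default buft. As the TODO in PATCH 3 points out, this only handles the CPU backend's extra bufts; devices other than the CPU that expose extra buffer types would need the more general solution referenced there.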