Update to latest llama.cpp breaking API changes #254

Open · wants to merge 7 commits into master
4 changes: 3 additions & 1 deletion .github/workflows/test-gpu.yaml
@@ -45,10 +45,12 @@ jobs:
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y pip wget
- name: Build and test
run: |
set -o pipefail
GPU_TESTS=true BUILD_TYPE=cublas CMAKE_ARGS="-DLLAMA_METAL=OFF -DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" \
make test 2>&1 | tee test_log.log
set +o pipefail
if grep -q "using CUDA for GPU acceleration" test_log.log; then
echo "All good";
echo "GPU was used";
else
echo "No CUDA found";
exit 1;
17 changes: 11 additions & 6 deletions Makefile
@@ -205,6 +205,9 @@ llama.cpp/grammar-parser.o:
llama.cpp/ggml-alloc.o:
cd build && cp -rf CMakeFiles/ggml.dir/ggml-alloc.c.o ../llama.cpp/ggml-alloc.o

llama.cpp/ggml-backend.o:
cd build && cp -rf CMakeFiles/ggml.dir/ggml-backend.c.o ../llama.cpp/ggml-backend.o

llama.cpp/ggml.o: prepare
mkdir -p build
cd build && CC="$(CC)" CXX="$(CXX)" cmake ../llama.cpp $(CMAKE_ARGS) && VERBOSE=1 cmake --build . --config Release && cp -rf CMakeFiles/ggml.dir/ggml.c.o ../llama.cpp/ggml.o
@@ -227,16 +230,18 @@ llama.cpp/llama.o:
llama.cpp/common.o:
cd build && cp -rf common/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o

binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o llama.cpp/grammar-parser.o llama.cpp/ggml-alloc.o

binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o llama.cpp/grammar-parser.o llama.cpp/ggml-backend.o llama.cpp/ggml-alloc.o
$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/common binding.cpp -o binding.o -c $(LDFLAGS)

## https://github.com/ggerganov/llama.cpp/pull/1902
prepare:
cd llama.cpp && patch -p1 < ../patches/1902-cuda.patch
cd llama.cpp && \
patch -p1 < ../patches/1902-cuda.patch
touch $@

libbinding.a: prepare binding.o llama.cpp/k_quants.o llama.cpp/grammar-parser.o llama.cpp/ggml-alloc.o $(EXTRA_TARGETS)
ar src libbinding.a llama.cpp/ggml.o llama.cpp/k_quants.o $(EXTRA_TARGETS) llama.cpp/ggml-alloc.o llama.cpp/common.o llama.cpp/grammar-parser.o llama.cpp/llama.o binding.o
libbinding.a: prepare binding.o llama.cpp/k_quants.o llama.cpp/grammar-parser.o llama.cpp/ggml-alloc.o llama.cpp/ggml-backend.o $(EXTRA_TARGETS)
ar src libbinding.a llama.cpp/ggml.o llama.cpp/k_quants.o $(EXTRA_TARGETS) llama.cpp/ggml-alloc.o llama.cpp/ggml-backend.o llama.cpp/common.o llama.cpp/grammar-parser.o llama.cpp/llama.o binding.o

clean:
rm -rf *.o
@@ -245,7 +250,7 @@ clean:
rm -rf build

ggllm-test-model.bin:
wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O ggllm-test-model.bin
wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q5_K_M.gguf -O ggllm-test-model.bin

test: ggllm-test-model.bin libbinding.a
C_INCLUDE_PATH=${INCLUDE_PATH} CGO_LDFLAGS=${CGO_LDFLAGS} LIBRARY_PATH=${LIBRARY_PATH} TEST_MODEL=ggllm-test-model.bin go run github.com/onsi/ginkgo/v2/ginkgo --label-filter="$(TEST_LABEL)" --flake-attempts 5 -v -r ./...
C_INCLUDE_PATH=${INCLUDE_PATH} CGO_LDFLAGS=${CGO_LDFLAGS} LIBRARY_PATH=${LIBRARY_PATH} TEST_MODEL=$(abspath ./)/ggllm-test-model.bin go run github.com/onsi/ginkgo/v2/ginkgo --label-filter="$(TEST_LABEL)" -v -r ./...
130 changes: 69 additions & 61 deletions binding.cpp

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions binding.h
@@ -21,22 +21,21 @@ void* load_model(const char *fname,
bool mlock,
bool embeddings,
bool mmap,
bool low_vram,
int n_gpu,
int n_batch,
const char *maingpu,
const char *tensorsplit,
bool numa,
float rope_freq_base,
float rope_freq_scale,
bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity
bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool perplexity
);

int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tokenSize, float * res_embeddings);

void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
void* llama_allocate_params(const char *prompt, int seed, int threads, int batch_threads, int tokens,
int top_k, float top_p, float temp, float repeat_penalty,
int repeat_last_n, bool ignore_eos, bool memory_f16,
int n_batch, int n_keep, const char** antiprompt, int antiprompt_count,
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 103 files
16 changes: 8 additions & 8 deletions llama.go
@@ -38,10 +38,10 @@ func New(model string, opts ...ModelOption) (*LLama, error) {

result := C.load_model(modelPath,
C.int(mo.ContextSize), C.int(mo.Seed),
C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM),
C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap),
C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA),
C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale),
C.bool(MulMatQ), loraAdapter, loraBase, C.bool(mo.Perplexity),
C.bool(MulMatQ), loraAdapter, loraBase, C.float(mo.LoraScale), C.bool(mo.Perplexity),
)

if result == nil {
@@ -112,7 +112,7 @@ func (l *LLama) TokenEmbeddings(tokens []int, opts ...PredictOption) ([]float32,
// float tfs_z, float typical_p, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl, const char *logit_bias, const char *session_file, bool prompt_cache_all, bool mlock, bool mmap, const char *maingpu, const char *tensorsplit , bool prompt_cache_ro,
// float rope_freq_base, float rope_freq_scale, float negative_prompt_scale, const char* negative_prompt
// );
params := C.llama_allocate_params(C.CString(""), C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(C.CString(""), C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), nil, C.int(0),
@@ -154,7 +154,7 @@ func (l *LLama) Embeddings(text string, opts ...PredictOption) ([]float32, error
pass = &reversePrompt[0]
}

params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
@@ -193,7 +193,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
pass = &reversePrompt[0]
}

params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
@@ -238,7 +238,7 @@ func (l *LLama) SpeculativeSampling(ll *LLama, text string, opts ...PredictOptio
pass = &reversePrompt[0]
}

params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
@@ -296,7 +296,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
pass = &reversePrompt[0]
}

params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
@@ -346,7 +346,7 @@ func (l *LLama) TokenizeString(text string, opts ...PredictOption) (int32, []int
var fakeDblPtr **C.char

// copy pasted and modified minimally. Should I simplify down / do we need an "allocate defaults"
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.BatchThreads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), fakeDblPtr, C.int(0),
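Note: every llama_allocate_params call in llama.go now passes po.BatchThreads immediately after po.Threads, matching the extra batch_threads argument added to binding.h above. A minimal caller-side sketch of the updated API follows; it assumes the module path github.com/go-skynet/go-llama.cpp, and the model path is a placeholder. The option helpers used here all appear elsewhere in this PR.

```go
package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Placeholder model path, for illustration only.
	model, err := llama.New("./ggllm-test-model.bin",
		llama.SetContext(128),
		llama.SetGPULayers(10),
	)
	if err != nil {
		panic(err)
	}

	// BatchThreads defaults to -1 (leave the batch thread count to llama.cpp);
	// here it is pinned explicitly alongside the usual sampling options.
	text, err := model.Predict("[INST] How much is 2+2? [/INST]",
		llama.SetTemperature(1.0),
		llama.SetTopP(0.8),
		llama.SetTopK(40),
		llama.SetBatchThreads(4),
	)
	if err != nil {
		panic(err)
	}
	fmt.Println(text)
}
```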
14 changes: 10 additions & 4 deletions llama_test.go
@@ -71,7 +71,7 @@ how much is 2+2?
Expect(err).ToNot(HaveOccurred())
Expect(model).ToNot(BeNil())
text, err := model.SpeculativeSampling(model2, `[INST] Answer to the following question:
how much is 2+2?
Do a simple math calculation: How much is 2+2?
[/INST]`, llama.SetNDraft(16),
)
Expect(err).ToNot(HaveOccurred(), text)
@@ -97,7 +97,9 @@ how much is 2+2?
getModel := func() (*LLama, error) {
model, err := New(
testModelPath,
llama.EnableF16Memory, llama.SetContext(128), llama.EnableEmbeddings, llama.SetGPULayers(10),
llama.EnableF16Memory,
llama.SetContext(128),
llama.SetGPULayers(10),
)
Expect(err).ToNot(HaveOccurred())
Expect(model).ToNot(BeNil())
@@ -111,8 +113,12 @@ how much is 2+2?

model, err := getModel()
text, err := model.Predict(`[INST] Answer to the following question:
how much is 2+2?
[/INST]`)
Do a simple math calculation: How much is 2+2?
[/INST]`,
SetTemperature(1.0),
SetTopP(0.8),
SetTopK(40),
)
Expect(err).ToNot(HaveOccurred(), text)
Expect(text).To(ContainSubstring("4"), text)
})
21 changes: 15 additions & 6 deletions options.go
@@ -7,7 +7,6 @@ type ModelOptions struct {
F16Memory bool
MLock bool
MMap bool
LowVRAM bool
Embeddings bool
NUMA bool
NGPULayers int
@@ -16,6 +15,7 @@ type ModelOptions struct {
FreqRopeBase float32
FreqRopeScale float32
MulMatQ *bool
LoraScale float32
LoraBase string
LoraAdapter string
Perplexity bool
@@ -29,6 +29,7 @@ type PredictOptions struct {
DebugMode bool
StopPrompts []string
IgnoreEOS bool
BatchThreads int

TailFreeSamplingZ float32
TypicalP float32
@@ -68,7 +69,6 @@ var DefaultModelOptions ModelOptions = ModelOptions{
MLock: false,
Embeddings: false,
MMap: true,
LowVRAM: false,
NBatch: 512,
FreqRopeBase: 10000,
FreqRopeScale: 1.0,
@@ -79,6 +79,7 @@ var DefaultOptions PredictOptions = PredictOptions{
Threads: 4,
Tokens: 128,
Penalty: 1.1,
BatchThreads: -1,
Repeat: 64,
Batch: 512,
NKeep: 64,
@@ -109,6 +110,18 @@ func SetLoraBase(s string) ModelOption {
}
}

func SetBatchThreads(b int) PredictOption {
return func(p *PredictOptions) {
p.BatchThreads = b
}
}

func SetLoraScale(f float32) ModelOption {
return func(p *ModelOptions) {
p.LoraScale = f
}
}

func SetLoraAdapter(s string) ModelOption {
return func(p *ModelOptions) {
p.LoraAdapter = s
@@ -219,10 +232,6 @@ func SetNegativePrompt(np string) PredictOption {
}
}

var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
p.LowVRAM = true
}

var EnableNUMA ModelOption = func(p *ModelOptions) {
p.NUMA = true
}
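Note: the LowVRAM field and the EnabelLowVRAM option are removed here, while SetLoraScale and SetBatchThreads are new; per the patched create_gpt_params below, a zero scale with a non-empty lora_base is bumped to 1.0. A hedged sketch of loading a model with a LoRA adapter under the new options, with placeholder paths and the same assumed module path as above:

```go
package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Placeholder paths, for illustration only.
	model, err := llama.New("./codellama-7b-instruct.Q5_K_M.gguf",
		llama.SetContext(512),
		llama.SetGPULayers(10),
		llama.SetLoraAdapter("./my-adapter.gguf"),
		llama.SetLoraBase("./codellama-7b-instruct.Q5_K_M.gguf"),
		llama.SetLoraScale(0.7), // new: adapter strength forwarded to llama.cpp
		// llama.EnabelLowVRAM,  // removed by this PR, along with ModelOptions.LowVRAM
	)
	if err != nil {
		panic(err)
	}
	fmt.Println("loaded with LoRA:", model != nil)
}
```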
38 changes: 21 additions & 17 deletions patches/1902-cuda.patch
@@ -1,20 +1,25 @@
diff --git a/common/common.cpp b/common/common.cpp
index 2597ba0..e42ae73 100644
index ec181c6..9ba699b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1268,3 +1268,218 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
@@ -1345,3 +1345,222 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}
+
+gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base) {
+gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base, float lora_scale) {
+ gpt_params* lparams = new gpt_params;
+ fprintf(stderr, "%s: loading model %s\n", __func__, fname.c_str());
+
+ // Initialize the 'model' member with the 'fname' parameter
+ lparams->model = fname;
+ lparams->lora_base = lora_base;
+ lparams->lora_adapter = lora;
+ if (lora_scale == 0 && !lora_base.empty()) {
+ lora_scale = 1.0f;
+ }
+ if (!lora.empty()) {
+ lparams->lora_adapter.push_back(std::make_tuple(lora, lora_scale));
+ }
+ if (lparams->lora_adapter.empty()) {
+ lparams->use_mmap = false;
+ }
@@ -30,14 +35,14 @@ index 2597ba0..e42ae73 100644
+ return lparams;
+}
+
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity) {
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all) {
+ // load the model
+ gpt_params * lparams;
+// Temporary workaround for https://github.com/go-skynet/go-llama.cpp/issues/218
+#ifdef GGML_USE_CUBLAS
+ lparams = create_gpt_params_cuda(fname);
+#else
+ lparams = create_gpt_params(fname, lora, lora_base);
+ lparams = create_gpt_params(fname, lora, lora_base, lora_scale);
+#endif
+ llama_model * model;
+ llama_binding_state * state;
@@ -49,10 +54,8 @@ index 2597ba0..e42ae73 100644
+ lparams->embedding = embeddings;
+ lparams->use_mlock = mlock;
+ lparams->n_gpu_layers = n_gpu_layers;
+ lparams->perplexity = perplexity;
+ lparams->logits_all = logits_all;
+ lparams->use_mmap = mmap;
+
+ lparams->low_vram = low_vram;
+ if (rope_freq_base != 0.0f) {
+ lparams->rope_freq_base = rope_freq_base;
+ } else {
@@ -114,8 +117,9 @@ index 2597ba0..e42ae73 100644
+ int idx) {
+
+ struct gpt_params params = *g_params;
+
+ const int n_ctx = llama_n_ctx(ctx);
+ const int n_vocab = llama_n_vocab(ctx);
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+ const float temp = params.temp;
+ const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
@@ -133,7 +137,7 @@ index 2597ba0..e42ae73 100644
+
+ llama_token id = 0;
+
+ float * logits = llama_get_logits(ctx) + idx * n_vocab;
+ float * logits = llama_get_logits_ith(ctx, idx);
+
+ // Apply params.logit_bias map
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -184,19 +188,19 @@ index 2597ba0..e42ae73 100644
+ if (mirostat == 1) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ const int mirostat_m = 100;
+ llama_sample_temperature(ctx, &cur_p, temp);
+ llama_sample_temp(ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+ } else if (mirostat == 2) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ llama_sample_temperature(ctx, &cur_p, temp);
+ llama_sample_temp(ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+ } else {
+ // Temperature sampling
+ llama_sample_top_k (ctx, &cur_p, top_k, 1);
+ llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
+ llama_sample_typical (ctx, &cur_p, typical_p, 1);
+ llama_sample_top_p (ctx, &cur_p, top_p, 1);
+ llama_sample_temperature(ctx, &cur_p, temp);
+ llama_sample_temp(ctx, &cur_p, temp);
+
+ {
+ const int n_top = 10;
@@ -223,10 +227,10 @@ index 2597ba0..e42ae73 100644
+}
\ No newline at end of file
diff --git a/common/common.h b/common/common.h
index 18aea38..ca7a168 100644
index 0e2d3fa..9992d2b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -209,3 +209,19 @@ std::string get_sortable_timestamp();
@@ -221,3 +221,19 @@ std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
Expand All @@ -236,7 +240,7 @@ index 18aea38..ca7a168 100644
+ llama_model * model;
+};
+
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity);
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base, float lora_scale, bool logits_all);
+
+llama_token llama_sample_token_binding(
+ struct llama_context * ctx,