Updates llama.cpp, sync with ggufv2
mudler committed Aug 30, 2023
1 parent 9072315 commit 2249559
Showing 3 changed files with 76 additions and 52 deletions.
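
Most of the binding changes below track the GGUF-era llama.cpp API: llama_token_to_str becomes llama_token_to_piece, the tokenizer flag formerly named is_spm becomes add_bos, and the sampling code now works on a candidates array named cur_p. A minimal detokenization sketch against the updated helpers (the detokenize wrapper is illustrative and not part of the commit; it assumes common.h from this llama.cpp revision):

#include <string>
#include <vector>
#include "common.h"   // assumed: declares llama_token_to_piece(ctx, token) -> std::string

// Illustrative helper: concatenate the text pieces of a token sequence.
static std::string detokenize(llama_context * ctx, const std::vector<llama_token> & ids) {
    std::string text;
    for (const llama_token id : ids) {
        text += llama_token_to_piece(ctx, id);   // renamed from llama_token_to_str in this sync
    }
    return text;
}
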
105 changes: 66 additions & 39 deletions binding.cpp
@@ -9,6 +9,7 @@
#include <cstdio>
#include <cstring>
#include <fstream>
#include <sstream>
#include <iostream>
#include <string>
#include <vector>
@@ -47,9 +48,9 @@ int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings) {

int n_past = 0;

const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
auto embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);


if (embd_inp.size() > 0) {
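
The renamed flag above still keys off the vocabulary type, but the new name states what it controls: whether a beginning-of-sequence token is prepended when tokenizing. A small illustrative wrapper using only the calls visible in this hunk (the wrapper itself is not part of the commit):

#include <string>
#include <vector>
#include "common.h"   // assumed: llama_tokenize(ctx, text, add_bos) as used in this diff

// Prepend BOS only for SentencePiece-style (SPM) vocabularies.
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
    return ::llama_tokenize(ctx, prompt, add_bos);
}
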
@@ -78,7 +79,7 @@ int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tok
gpt_params params = *params_p;

for (int i = 0; i < tokenSize; i++) {
auto token_str = llama_token_to_str(ctx, tokens[i]);
auto token_str = llama_token_to_piece(ctx, tokens[i]);
std::vector<std::string> my_vector;
std::string str_token(token_str); // create a new std::string from the char*
params_p->prompt += str_token;
@@ -106,15 +107,20 @@ int eval(void* params_ptr,void* state_pr,char *text) {
// evaluate prompt
return llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params_p->n_threads);
}

static llama_context ** g_ctx;
static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;

int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
gpt_params* params_p = (gpt_params*) params_ptr;
llama_binding_state* state = (llama_binding_state*) state_pr;
llama_context* ctx = state->ctx;

gpt_params params = *params_p;

g_params = &params;
const int n_ctx = llama_n_ctx(ctx);

if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -175,11 +181,11 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
}
}
}
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;

std::vector<llama_token> embd_inp;
if ( !params.prompt.empty() || session_tokens.empty() ) {
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
} else {
embd_inp = session_tokens;
}
@@ -193,10 +199,8 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
params.cfg_negative_prompt.insert(0, 1, ' ');
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);

std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
}
@@ -209,7 +213,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

// debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0;
if (session_tokens.size()) {
if (session_tokens.size() > 0) {
for (llama_token id : session_tokens) {
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
break;
@@ -246,7 +250,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
}

@@ -280,18 +284,27 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
std::vector<llama_token> last_n_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);

bool is_antiprompt = false;
bool input_echo = true;
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;

int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
int n_session_consumed = 0;
int n_past_guidance = 0;

std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;

// the first thing we will do is to output the prompt, so set color accordingly

std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;

std::string res = "";

// do one empty run to warm up the model
{
const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
@@ -306,7 +319,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
auto max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int)embd.size() > max_embd_size) {
auto skipped_tokens = embd.size() - max_embd_size;
const int skipped_tokens = (int) embd.size() - max_embd_size;
printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
embd.resize(max_embd_size);
}
@@ -330,7 +343,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx, embd[i]));
// printf("%s", llama_token_to_piece(ctx, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
@@ -384,7 +397,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
input_size = embd_guidance.size();
//fprintf(stderr, "\n---------------------\n");
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
//fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
//fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
//}
//fprintf(stderr, "\n---------------------\n");
} else {
@@ -464,44 +477,49 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}

llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };

// Apply penalties
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p,
llama_sample_repetition_penalty(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
logits[llama_token_nl(ctx)] = nl_logit;
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
if (grammar != NULL) {
llama_sample_grammar(ctx, &candidates_p, grammar);
llama_sample_grammar(ctx, &cur_p, grammar);
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
id = llama_sample_token_greedy(ctx, &cur_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
llama_sample_top_k(ctx, &cur_p, top_k, 1);
llama_sample_tail_free(ctx, &cur_p, tfs_z, 1);
llama_sample_typical(ctx, &cur_p, typical_p, 1);
llama_sample_top_p(ctx, &cur_p, top_p, 1);
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token(ctx, &cur_p);
}
}
// printf("`%d`", candidates_p.size);
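
For reference, the sampling path above reduces to the following sketch when penalties, tail-free/typical sampling, grammar and mirostat are left out. llama_get_logits and llama_n_vocab are assumed from the surrounding (unshown) code; every other call appears verbatim in the hunk, and the helper itself is illustrative, not part of the commit.

#include <vector>
#include "llama.h"

// Greedy when temp <= 0, otherwise top-k / top-p / temperature sampling over cur_p.
static llama_token sample_next(llama_context * ctx, float temp, int top_k, float top_p) {
    float * logits    = llama_get_logits(ctx);   // logits of the last evaluated token
    const int n_vocab = llama_n_vocab(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };

    if (temp <= 0) {
        return llama_sample_token_greedy(ctx, &cur_p);   // greedy decoding
    }
    llama_sample_top_k(ctx, &cur_p, top_k, 1);
    llama_sample_top_p(ctx, &cur_p, top_p, 1);
    llama_sample_temperature(ctx, &cur_p, temp);
    return llama_sample_token(ctx, &cur_p);
}
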
@@ -521,7 +539,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

// call the token callback, no need to check if one is actually registered, that will
// be handled on the Go side.
auto token_str = llama_token_to_str(ctx, id);
auto token_str = llama_token_to_piece(ctx, id);
if (!tokenCallback(state_pr, (char*)token_str.c_str())) {
break;
}
@@ -539,7 +557,16 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
}

for (auto id : embd) {
res += llama_token_to_str(ctx, id).c_str();
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());

if (embd.size() > 1) {
input_tokens.push_back(id);
} else {
output_tokens.push_back(id);
output_ss << token_str;
}
res += llama_token_to_piece(ctx, id).c_str();
}

// if not currently processing queued inputs;
@@ -548,7 +575,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
if (params.antiprompt.size()) {
std::string last_output;
for (auto id : last_n_tokens) {
last_output += llama_token_to_str(ctx, id);
last_output += llama_token_to_piece(ctx, id);
}

// Check if each of the reverse prompts appears at the end of the output.
@@ -560,7 +587,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {
? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
: 0;

if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
break;
}
}
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 62 files
+35 −9 .devops/llama-cpp-clblast.srpm.spec
+25 −1 .devops/llama-cpp-cublas.srpm.spec
+36 −9 .devops/llama-cpp.srpm.spec
+5 −1 .gitignore
+2 −1 CMakeLists.txt
+50 −21 Makefile
+5 −2 README.md
+118 −22 ci/run.sh
+373 −12 common/common.cpp
+42 −1 common/common.h
+643 −0 common/log.h
+1 −1 convert-falcon-hf-to-gguf.py
+1 −1 convert-gptneox-hf-to-gguf.py
+1 −1 convert-llama-7b-pth-to-gguf.py
+1 −1 convert-llama-hf-to-gguf.py
+139 −77 convert.py
+3 −3 examples/beam_search/beam_search.cpp
+1 −1 examples/chat.sh
+2 −6 examples/convert-llama2c-to-ggml/README.md
+218 −123 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+1 −1 examples/embd-input/embd-input-lib.cpp
+1 −4 examples/embedding/embedding.cpp
+5 −0 examples/gguf/CMakeLists.txt
+3 −0 examples/gguf/gguf.cpp
+6 −1 examples/llama-bench/llama-bench.cpp
+2 −1 examples/llm.vim
+269 −111 examples/main/main.cpp
+152 −30 examples/perplexity/perplexity.cpp
+2 −2 examples/quantize/quantize.cpp
+2 −2 examples/save-load-state/save-load-state.cpp
+16 −13 examples/server/README.md
+29 −10 examples/server/server.cpp
+2 −2 examples/simple/simple.cpp
+7 −7 examples/train-text-from-scratch/README.md
+492 −0 examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
+1,152 −2,250 examples/train-text-from-scratch/train-text-from-scratch.cpp
+6 −6 flake.lock
+34 −20 flake.nix
+13 −50 ggml-alloc.c
+23 −11 ggml-cuda.cu
+1 −0 ggml-metal.h
+113 −56 ggml-metal.m
+410 −328 ggml.c
+45 −19 ggml.h
+29 −7 gguf-py/gguf/gguf.py
+2 −2 k_quants.c
+157 −95 llama.cpp
+10 −5 llama.h
+140 −0 run_with_preset.py
+26 −0 scripts/convert-gg.sh
+3 −0 scripts/qnt-all.sh
+3 −0 scripts/run-all-perf.sh
+3 −0 scripts/run-all-ppl.sh
+9 −2 tests/CMakeLists.txt
+3 −0 tests/test-c.c
+37 −17 tests/test-grad0.cpp
+178 −0 tests/test-tokenizer-0-falcon.cpp
+83 −0 tests/test-tokenizer-0-falcon.py
+182 −0 tests/test-tokenizer-0-llama.cpp
+95 −0 tests/test-tokenizer-0-llama.py
+0 −140 tests/test-tokenizer-0.cpp
+3 −11 tests/test-tokenizer-1.cpp
21 changes: 9 additions & 12 deletions patches/1902-cuda.patch
@@ -1,13 +1,12 @@
diff --git a/common/common.cpp b/common/common.cpp
index 53002ba..996ed9f 100644
index ed09fc2..ced02e8 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -744,3 +744,83 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok

return std::string(result.data(), result.size());
@@ -1107,3 +1107,82 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}
+
+
+gpt_params* create_gpt_params(const std::string& fname,const std::string& lora,const std::string& lora_base) {
+ gpt_params* lparams = new gpt_params;
+ fprintf(stderr, "%s: loading model %s\n", __func__, fname.c_str());
@@ -88,19 +87,17 @@ index 53002ba..996ed9f 100644
+}
\ No newline at end of file
diff --git a/common/common.h b/common/common.h
index 17d271e..15a85a8 100644
index 5a37968..8b09050 100644
--- a/common/common.h
+++ b/common/common.h
@@ -123,3 +123,11 @@ std::vector<llama_token> llama_tokenize(
std::string llama_token_to_str(
const struct llama_context * ctx,
llama_token token);
+
@@ -165,3 +165,10 @@ std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
+struct llama_binding_state {
+ llama_context * ctx;
+ llama_model * model;
+};
+
+void* load_binding_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit, bool numa, float rope_freq_base, float rope_freq_scale, bool mul_mat_q, const char *lora, const char *lora_base);
\ No newline at end of file
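
The patched common.h above keeps exposing the binding loader. A hypothetical call site under those declarations follows; every argument value is a placeholder, and the NULL-on-failure check is an assumption, not something the patch guarantees.

#include "common.h"   // assumed: patched header declaring llama_binding_state and load_binding_model

int main() {
    // Parameter order follows the declaration added to common.h by this patch.
    void * raw = load_binding_model(
        "model.gguf",  // fname (placeholder path)
        2048,          // n_ctx
        42,            // n_seed
        true,          // memory_f16
        false,         // mlock
        false,         // embeddings
        true,          // mmap
        false,         // low_vram
        0,             // n_gpu_layers
        512,           // n_batch
        "",            // maingpu
        "",            // tensorsplit
        false,         // numa
        10000.0f,      // rope_freq_base
        1.0f,          // rope_freq_scale
        true,          // mul_mat_q
        "",            // lora
        "");           // lora_base

    llama_binding_state * state = (llama_binding_state *) raw;
    if (state == nullptr) {    // assumption: the loader returns NULL on failure
        return 1;
    }
    llama_context * ctx = state->ctx;   // hand ctx to llama_predict / get_embeddings from the binding
    (void) ctx;
    return 0;
}
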
