Commit b81c03c

Merge branch 'master' into xsn/glm4v

2 parents: 7d53c0f + c45f89d

File tree

118 files changed: +4218 -1809 lines changed

Large commits have some content hidden by default; only a subset of the 118 changed files is shown below.

.github/ISSUE_TEMPLATE/011-bug-results.yml

Lines changed: 6 additions & 3 deletions
@@ -11,7 +11,7 @@ body:
         (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
         If you encountered the issue while using an external UI (e.g. ollama),
         please reproduce your issue using one of the examples/binaries in this repository.
-        The `llama-cli` binary can be used for simple and reproducible model inference.
+        The `llama-completion` binary can be used for simple and reproducible model inference.
   - type: textarea
     id: version
     attributes:
@@ -74,9 +74,12 @@ body:
         Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
         that information would be very much appreciated by us.
+
+        If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+        If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
       placeholder: >
-        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-        When I use -ngl 0 it works correctly.
+        e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+        With short prompts or `-fa off` it works correctly.
         Here are the exact commands that I used: ...
     validations:
       required: true
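For context, a reproduction along the lines the updated template asks for might look like the following sketch. The model path and prompt are placeholders; `llama-completion`, `-fit`, and `--verbose` are the names referenced in the template text above.

    # baseline with parameter fitting disabled, as the template requests
    ./bin/llama-completion -m model.gguf -fit off -p "test prompt"

    # if the issue only reproduces with fitting enabled, capture logs both ways
    ./bin/llama-completion -m model.gguf -fit on -p "test prompt" 2>&1 | tee fit-on.log
    ./bin/llama-completion -m model.gguf -fit on --verbose -p "test prompt" 2>&1 | tee fit-on-verbose.log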

.github/workflows/build.yml

Lines changed: 4 additions & 2 deletions
@@ -20,7 +20,8 @@ on:
         '**/*.swift',
         '**/*.m',
         '**/*.metal',
-        '**/*.comp'
+        '**/*.comp',
+        '**/*.glsl'
       ]
 
   pull_request:
@@ -40,7 +41,8 @@ on:
         '**/*.swift',
         '**/*.m',
         '**/*.metal',
-        '**/*.comp'
+        '**/*.comp',
+        '**/*.glsl'
       ]
 
 concurrency:

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@
 /out/
 /tmp/
 /autogen-*.md
+/common/build-info.cpp
 
 # Deprecated
CODEOWNERS

Lines changed: 2 additions & 1 deletion
@@ -87,7 +87,8 @@
 /tests/ @ggerganov
 /tests/test-chat-.* @pwilkin
 /tools/batched-bench/ @ggerganov
-/tools/main/ @ggerganov
+/tools/cli/ @ngxson
+/tools/completion/ @ggerganov
 /tools/mtmd/ @ngxson
 /tools/perplexity/ @ggerganov
 /tools/quantize/ @ggerganov

README.md

Lines changed: 3 additions & 2 deletions
@@ -313,7 +313,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua
 
 To learn more about model quantization, [read this documentation](tools/quantize/README.md)
 
-## [`llama-cli`](tools/main)
+## [`llama-cli`](tools/cli)
 
 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
 
@@ -525,7 +525,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
 
 ## Other documentation
 
-- [main (cli)](tools/main/README.md)
+- [cli](tools/cli/README.md)
+- [completion](tools/completion/README.md)
 - [server](tools/server/README.md)
 - [GBNF grammars](grammars/README.md)
ci/run.sh

Lines changed: 6 additions & 0 deletions
@@ -398,6 +398,8 @@ function gg_run_qwen3_0_6b {
     ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
     ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
 
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
     (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -523,6 +525,8 @@ function gg_run_embd_bge_small {
 
     ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
 
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
 
@@ -563,6 +567,8 @@ function gg_run_rerank_tiny {
 
     model_f16="${path_models}/ggml-model-f16.gguf"
 
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     # for this model, the SEP token is "</s>"
     (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
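A small shell detail in the new `llama-fit-params` lines: the `2>&1 | tee -a` sits inside the subshell parentheses, while the neighbouring `llama-completion` and `llama-embedding` lines redirect outside them. The placement decides whether the report printed by `time` lands in the log. A minimal sketch (`some-tool` is a stand-in, not a binary from this repository):

    # pipe outside the subshell: the subshell's stderr, including the
    # timing report from `time`, is captured in run.log
    (time ./bin/some-tool --model m.gguf) 2>&1 | tee -a run.log

    # pipe inside the subshell: the tool's output is logged, but the
    # timing report goes to the subshell's stderr, i.e. the terminal
    (time ./bin/some-tool --model m.gguf 2>&1 | tee -a run.log)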
common/arg.cpp

Lines changed: 33 additions & 2 deletions
@@ -20,6 +20,7 @@
 #include <nlohmann/json.hpp>
 
 #include <algorithm>
+#include <cinttypes>
 #include <climits>
 #include <cstdarg>
 #include <fstream>
@@ -529,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
 
-    if (!params.tensor_buft_overrides.empty()) {
+    // pad tensor_buft_overrides for llama_params_fit:
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
@@ -1415,7 +1418,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.sampling.top_k = value;
                 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
             }
-        ).set_sparam());
+        ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
     add_opt(common_arg(
         {"--top-p"}, "N",
         string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
@@ -2153,6 +2156,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_MAIN_GPU"));
+    add_opt(common_arg(
+        { "-fit", "--fit" }, "[on|off]",
+        string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.fit_params = true;
+            } else if (is_falsey(value)) {
+                params.fit_params = false;
+            } else {
+                throw std::runtime_error(
+                    string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_ARG_FIT"));
+    add_opt(common_arg(
+        { "-fitt", "--fit-target" }, "MiB",
+        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
+        [](common_params & params, int value) {
+            params.fit_params_target = value * size_t(1024*1024);
+        }
+    ).set_env("LLAMA_ARG_FIT_TARGET"));
+    add_opt(common_arg(
+        { "-fitc", "--fit-ctx" }, "N",
+        string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+        [](common_params & params, int value) {
+            params.fit_params_min_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_FIT_CTX"));
     add_opt(common_arg(
         {"--check-tensors"},
         string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
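Taken together, the new options can be exercised as in the sketch below. The model path and prompt are placeholders; the flags, defaults, and environment variable names are the ones defined in the hunk above.

    # fit unset parameters to device memory, keeping a 1024 MiB margin per
    # device and never shrinking the context below 4096 tokens
    ./bin/llama-completion -m model.gguf -fit on -fitt 1024 -fitc 4096 -p "..."

    # the same configuration via the corresponding environment variables
    LLAMA_ARG_FIT=on LLAMA_ARG_FIT_TARGET=1024 LLAMA_ARG_FIT_CTX=4096 \
        ./bin/llama-completion -m model.gguf -p "..."

Note that `--top-k` also gains `LLAMA_ARG_TOP_K` in this diff, so it too can now be set from the environment.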
