20 | 20 | #include <nlohmann/json.hpp> |
21 | 21 |
22 | 22 | #include <algorithm> |
| 23 | +#include <cinttypes> |
23 | 24 | #include <climits> |
24 | 25 | #include <cstdarg> |
25 | 26 | #include <fstream> |
@@ -529,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context |
529 | 530 | params.kv_overrides.back().key[0] = 0; |
530 | 531 | } |
531 | 532 |
532 | | - if (!params.tensor_buft_overrides.empty()) { |
| 533 | + // pad tensor_buft_overrides for llama_params_fit: |
| 534 | + const size_t ntbo = llama_max_tensor_buft_overrides(); |
| 535 | + while (params.tensor_buft_overrides.size() < ntbo) { |
533 | 536 | params.tensor_buft_overrides.push_back({nullptr, nullptr}); |
534 | 537 | } |
535 | 538 |
@@ -1415,7 +1418,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex |
1415 | 1418 | params.sampling.top_k = value; |
1416 | 1419 | params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K; |
1417 | 1420 | } |
1418 | | - ).set_sparam()); |
| 1421 | + ).set_sparam().set_env("LLAMA_ARG_TOP_K")); |
1419 | 1422 | add_opt(common_arg( |
1420 | 1423 | {"--top-p"}, "N", |
1421 | 1424 | string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), |
@@ -2153,6 +2156,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex |
2153 | 2156 | } |
2154 | 2157 | } |
2155 | 2158 | ).set_env("LLAMA_ARG_MAIN_GPU")); |
| 2159 | + add_opt(common_arg( |
| 2160 | + { "-fit", "--fit" }, "[on|off]", |
| 2161 | + string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"), |
| 2162 | + [](common_params & params, const std::string & value) { |
| 2163 | + if (is_truthy(value)) { |
| 2164 | + params.fit_params = true; |
| 2165 | + } else if (is_falsey(value)) { |
| 2166 | + params.fit_params = false; |
| 2167 | + } else { |
| 2168 | + throw std::runtime_error( |
| 2169 | + string_format("error: unknown value for --fit: '%s'\n", value.c_str())); |
| 2170 | + } |
| 2171 | + } |
| 2172 | + ).set_env("LLAMA_ARG_FIT")); |
| 2173 | + add_opt(common_arg( |
| 2174 | + { "-fitt", "--fit-target" }, "MiB", |
| 2175 | + string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)), |
| 2176 | + [](common_params & params, int value) { |
| 2177 | + params.fit_params_target = value * size_t(1024*1024); |
| 2178 | + } |
| 2179 | + ).set_env("LLAMA_ARG_FIT_TARGET")); |
| 2180 | + add_opt(common_arg( |
| 2181 | + { "-fitc", "--fit-ctx" }, "N", |
| 2182 | + string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx), |
| 2183 | + [](common_params & params, int value) { |
| 2184 | + params.fit_params_min_ctx = value; |
| 2185 | + } |
| 2186 | + ).set_env("LLAMA_ARG_FIT_CTX")); |
2156 | 2187 | add_opt(common_arg( |
2157 | 2188 | {"--check-tensors"}, |
2158 | 2189 | string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), |
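
For reference, a minimal usage sketch of the options added in this diff. The flag names and environment variables come from the hunks above; the binary name (llama-cli) and the model path are assumptions for illustration only:

    # enable fitting of unset arguments to device memory, targeting a 512 MiB
    # per-device margin and never reducing the ctx size below 4096
    llama-cli -m model.gguf --fit on --fit-target 512 --fit-ctx 4096

    # the same configuration via the environment variables registered above
    LLAMA_ARG_FIT=on LLAMA_ARG_FIT_TARGET=512 LLAMA_ARG_FIT_CTX=4096 llama-cli -m model.gguf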