Commit a6457ba

Merge branch 'main' of https://github.com/abetlen/llama-cpp-python into main
2 parents: af3ed50 + 165b4dc

File tree

7 files changed: +51 -24 lines changed

.github/workflows/build-and-release.yaml

+2 -2

@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/[email protected].0
+        uses: pypa/[email protected].1
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
           platforms: linux/arm64

       - name: Build wheels
-        uses: pypa/[email protected].0
+        uses: pypa/[email protected].1
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

Makefile

+7 -1

@@ -13,7 +13,13 @@ build:
 	python3 -m pip install --verbose -e .

 build.debug:
-	CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
+	python3 -m pip install \
+		--verbose \
+		--config-settings=cmake.verbose=true \
+		--config-settings=logging.level=INFO \
+		--config-settings=install.strip=false \
+		--config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \
+		--editable .

 build.cuda:
 	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .

llama_cpp/llama.py

+8 -5

@@ -6,6 +6,7 @@
 import time
 import json
 import ctypes
+import typing
 import fnmatch
 import multiprocessing

@@ -249,24 +250,26 @@ def __init__(
                 self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, bool):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
-                    self._kv_overrides_array[i].value.bool_value = v
+                    self._kv_overrides_array[i].value.val_bool = v
                 elif isinstance(v, int):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
-                    self._kv_overrides_array[i].value.int_value = v
+                    self._kv_overrides_array[i].value.val_i64 = v
                 elif isinstance(v, float):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
-                    self._kv_overrides_array[i].value.float_value = v
+                    self._kv_overrides_array[i].value.val_f64 = v
                 elif isinstance(v, str):  # type: ignore
                     v_bytes = v.encode("utf-8")
                     if len(v_bytes) > 128:  # TODO: Make this a constant
                         raise ValueError(f"Value for {k} is too long: {v}")
                     v_bytes = v_bytes.ljust(128, b"\0")
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
                     # copy min(v_bytes, 128) to str_value
+                    address = typing.cast(int, ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset)
+                    buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char))
                     ctypes.memmove(
-                        self._kv_overrides_array[i].value.str_value,
+                        buffer_start,
                         v_bytes,
-                        min(len(v_bytes), 128),
+                        128,
                     )
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")

llama_cpp/llama_chat_format.py

+5 -2

@@ -3098,7 +3098,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
         "{% endif %}"
     )

-class Llama3VisionAlpha(Llava15ChatHandler):
+class Llama3VisionAlphaChatHandler(Llava15ChatHandler):
     # question = "<image>" + q

     # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
@@ -3159,6 +3159,10 @@ class Llama3VisionAlpha(Llava15ChatHandler):
         "{% endif %}"
     )

+# alias
+Llama3VisionAlpha = Llama3VisionAlphaChatHandler
+
+
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
     llama: llama.Llama,
@@ -3193,7 +3197,6 @@ def chatml_function_calling(
     llama_types.CreateChatCompletionResponse,
     Iterator[llama_types.CreateChatCompletionStreamResponse],
 ]:
-    print(logprobs)
     function_calling_template = (
         "{% for message in messages %}"
         "<|im_start|>{{ message.role }}\n"

llama_cpp/llama_cpp.py

+25 -12

@@ -300,6 +300,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
 #     LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
 #     LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
 #     LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+#     LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -315,6 +316,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
 LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
+LLAMA_VOCAB_PRE_TYPE_SMAUG = 14


 # // note: these values should be synchronized with ggml_rope
@@ -611,17 +613,17 @@ class llama_batch(ctypes.Structure):
 # };
 class llama_model_kv_override_value(ctypes.Union):
     _fields_ = [
-        ("int_value", ctypes.c_int64),
-        ("float_value", ctypes.c_double),
-        ("bool_value", ctypes.c_bool),
-        ("str_value", ctypes.c_char * 128),
+        ("val_i64", ctypes.c_int64),
+        ("val_f64", ctypes.c_double),
+        ("val_bool", ctypes.c_bool),
+        ("val_str", ctypes.c_char * 128),
     ]

     if TYPE_CHECKING:
-        int_value: int
-        float_value: float
-        bool_value: bool
-        str_value: bytes
+        val_i64: int
+        val_f64: float
+        val_bool: bool
+        val_str: bytes


 class llama_model_kv_override(ctypes.Structure):
@@ -718,6 +720,8 @@ class llama_model_params(ctypes.Structure):
     ]


+# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+# // https://github.com/ggerganov/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t seed;  // RNG seed, -1 for random
 #     uint32_t n_ctx; // text context, 0 = from model
@@ -744,15 +748,14 @@ class llama_model_params(ctypes.Structure):
 #     ggml_backend_sched_eval_callback cb_eval;
 #     void * cb_eval_user_data;

-#     enum ggml_type type_k; // data type for K cache
-#     enum ggml_type type_v; // data type for V cache
+#     enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+#     enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-#     bool flash_attn;  // whether to use flash attention
-
+#     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]

 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
@@ -2454,6 +2457,16 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
     ...


+# // Identify if Token Id is a control token or a render-able token
+# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
+    """Identify if Token Id is a control token or a render-able token"""
+    ...
+
+
 # // Special tokens

llama_cpp/server/model.py

+3 -1

@@ -183,7 +183,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
             num_pred_tokens=settings.draft_model_num_pred_tokens
         )

-    kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
+    kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None
     if settings.kv_overrides is not None:
         assert isinstance(settings.kv_overrides, list)
         kv_overrides = {}
@@ -197,6 +197,8 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
                 kv_overrides[key] = int(value)
             elif value_type == "float":
                 kv_overrides[key] = float(value)
+            elif value_type == "str":
+                kv_overrides[key] = value
             else:
                 raise ValueError(f"Unknown value type {value_type}")
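Together with the `llama.py` and `llama_cpp.py` changes above, string-valued overrides can now flow end to end alongside the bool/int/float ones. A sketch of the Python-level API (the model path and metadata keys are illustrative):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="model.gguf",  # placeholder path
    kv_overrides={
        "tokenizer.ggml.add_bos_token": False,  # bool override
        "general.name": "my-renamed-model",     # str override; must encode to at most 128 UTF-8 bytes
    },
)
```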

vendor/llama.cpp (submodule pointer updated)
