@@ -772,7 +772,7 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
-        logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         flash_attn (bool): whether to use flash attention
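Note: the deprecation note above points at per-token logit selection via llama_batch.logits. A minimal sketch of what that looks like with the low-level llama_cpp ctypes bindings, assuming llama_batch_init/llama_batch_free and the batch fields as wrapped by these bindings; the token ids and batch capacity are made up for illustration:

    import llama_cpp

    # Instead of llama_context_params.logits_all, mark the individual positions
    # in the batch that should produce logits (here: only the final token).
    batch = llama_cpp.llama_batch_init(8, 0, 1)  # capacity 8, no embeddings, 1 sequence
    tokens = [1, 15043, 2787]                    # made-up token ids
    batch.n_tokens = len(tokens)
    for i, tok in enumerate(tokens):
        batch.token[i] = tok
        batch.pos[i] = i
        batch.n_seq_id[i] = 1
        batch.seq_id[i][0] = 0
        batch.logits[i] = 0                      # no logits for this position
    batch.logits[batch.n_tokens - 1] = 1         # request logits for the last token only
    # ... llama_cpp.llama_decode(ctx, batch) with an initialized context ...
    llama_cpp.llama_batch_free(batch)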
@@ -2469,10 +2469,10 @@ def llama_synchronize(ctx: llama_context_p, /):
24692469 "llama_get_logits" , [llama_context_p_ctypes ], ctypes .POINTER (ctypes .c_float )
24702470)
24712471def llama_get_logits (ctx : llama_context_p , / ) -> CtypesArray [ctypes .c_float ]:
2472- """Token logits obtained from the last call to llama_eval ()
2473- The logits for the last token are stored in the last row
2474- Logits for which llama_batch.logits[i] == 0 are undefined
2475- Rows: n_tokens provided with llama_batch
2472+ """Token logits obtained from the last call to llama_decode ()
2473+ The logits for which llama_batch.logits[i] != 0 are stored contiguously
2474+ in the order they have appeared in the batch.
2475+ Rows: number of tokens for which llama_batch.logits[i] != 0
24762476 Cols: n_vocab
24772477
24782478 Returns:
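Note: assuming the batch setup sketched above (exactly one position with llama_batch.logits[i] != 0) and a context ctx on which llama_decode() has already run, reading the contiguously stored rows could look like the following; llama_n_vocab/llama_get_model are assumed to have the signatures used by this version of the bindings:

    import llama_cpp

    n_vocab = llama_cpp.llama_n_vocab(llama_cpp.llama_get_model(ctx))
    n_outputs = 1                             # positions with llama_batch.logits[i] != 0
    logits = llama_cpp.llama_get_logits(ctx)  # float* laid out as (n_outputs, n_vocab)
    # Row r corresponds to the r-th token in the batch that requested logits.
    last_row = [logits[(n_outputs - 1) * n_vocab + j] for j in range(n_vocab)]
    best_token = max(range(n_vocab), key=lambda j: last_row[j])  # greedy pick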