Commit 3b3c1d6

Add beam search. Invoke by adding "beam_width": 2 (for example) to the /v1/completions POST body.
1 parent d644199 commit 3b3c1d6
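
For reference, a minimal client-side sketch of invoking the new option, assuming the server is running locally on its default address (http://localhost:8000) and that the requests package is installed; the /v1/completions endpoint and the beam_width field come from the diff below, everything else here is illustrative:

import requests

# POST a completion request with beam search enabled (beam_width > 0).
# beam_width is the field added to CreateCompletionRequest in this commit;
# 0 (the default) keeps the ordinary sampling path.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "The Fibonacci sequence begins",
        "max_tokens": 32,
        "beam_width": 2,
    },
)
print(response.json()["choices"][0]["text"])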

File tree: 4 files changed, +106 -3 lines changed


llama_cpp/llama.py (+71 -1)
@@ -17,6 +17,7 @@
     Callable,
 )
 from collections import deque, OrderedDict
+from dataclasses import dataclass
 
 import diskcache
 import ctypes

@@ -199,6 +200,42 @@ class StoppingCriteriaList(List[StoppingCriteria]):
     def __call__(self, input_ids: List[int], logits: List[float]) -> bool:
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
 
+# Custom data that is accessible to the beam_search_callback() function.
+@dataclass
+class beam_search_callback_data:
+    ctx: llama_cpp.llama_context_p
+    response_tokens: List[int]
+
+def beam_view_to_string(ctx, beam_view):
+    string = f"p({beam_view.p}): "
+    for i in range(beam_view.n_tokens):
+        string += str(llama_cpp.llama_token_to_str(ctx, beam_view.tokens[i]))
+    return string
+
+def is_at_eos(tokens, n_tokens):
+    return 0 < n_tokens and tokens[n_tokens - 1] == llama_cpp.llama_token_eos()
+
+# beam_search_callback requires a global dictionary to pass data via the object id.
+beam_search_dictionary = {}
+
+# beam_search_callback() must flag beams when they reach end-of-sentence.
+# TODO: Use stop_sequences.
+def beam_search_callback(callback_data_id, beams_state):
+    for i in range(beams_state.n_beams):
+        beam_view = beams_state.beam_views[i]
+        if not beam_view.eos and is_at_eos(beam_view.tokens, beam_view.n_tokens):
+            beam_view.eos = True  # Flag beams as EOS as required.
+    callback_data = beam_search_dictionary[callback_data_id]
+    # Collect tokens into callback_data.response_tokens
+    if 0 < beams_state.common_prefix_length:
+        assert 0 < beams_state.n_beams
+        tokens = ctypes.cast(beams_state.beam_views[0].tokens, ctypes.POINTER(ctypes.c_int * beams_state.common_prefix_length)).contents
+        callback_data.response_tokens.extend(tokens)
+
+    # DEBUG print beams and their relative probabilities
+    # print(f"\n\nCurrent beams (last_call={beams_state.last_call}):\n")
+    # for i in range(beams_state.n_beams):
+    #     print(f"beams[{i}]", beam_view_to_string(callback_data.ctx, beams_state.beam_views[i]))
 
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""

@@ -475,6 +512,7 @@ def eval(self, tokens: Sequence[int]):
             tokens: The list of tokens to evaluate.
         """
         assert self.ctx is not None
+
         n_ctx = self._n_ctx
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]

@@ -719,6 +757,7 @@ def generate(
         logits_processor: Optional[LogitsProcessorList] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         grammar: Optional[LlamaGrammar] = None,
+        beam_width: int = 0,
     ) -> Generator[int, Optional[Sequence[int]], None]:
         """Create a generator of tokens from a prompt.
 

@@ -760,6 +799,28 @@
         if grammar is not None:
             grammar.reset()
 
+        if 0 < beam_width:
+            print("beam_width=", beam_width)
+            self.eval(tokens)
+            callback_data = beam_search_callback_data(self.ctx, [])
+            beam_search_dictionary[id(callback_data)] = callback_data
+            callback = llama_cpp.llama_beam_search_callback(beam_search_callback)
+            n_remain = llama_cpp.llama_n_ctx(self.ctx) - self.n_tokens
+            llama_cpp.llama_beam_search(self.ctx, callback, id(callback_data),
+                                        beam_width,
+                                        self.n_tokens,
+                                        n_remain,
+                                        self.n_threads)
+            beam_search_dictionary.pop(id(callback_data))
+            # Ideally we would yield from within the callback, but that is impossible.
+            for token in callback_data.response_tokens:
+                string = llama_cpp.llama_token_to_str(self.ctx, token)
+                np.append(self.input_ids, [token])
+                np.append(self.scores, [0.0])
+                self.n_tokens += 1
+                yield token
+            return
+
         while True:
             self.eval(tokens)
             token = self.sample(

@@ -776,6 +837,7 @@
                 logits_processor=logits_processor,
                 grammar=grammar,
             )
+
             if stopping_criteria is not None and stopping_criteria(
                 self._input_ids.tolist(), self._scores[-1, :].tolist()
             ):

@@ -878,6 +940,7 @@ def _create_completion(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
+        beam_width: int = 0,
     ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
         assert self.ctx is not None
 

@@ -956,6 +1019,7 @@ def _create_completion(
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
             grammar=grammar,
+            beam_width=beam_width,
         ):
             if token == self._token_eos:
                 text = self.detokenize(completion_tokens)

@@ -1301,6 +1365,7 @@ def create_completion(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
+        beam_width: int = 0,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         """Generate text from a prompt.
 

@@ -1316,6 +1381,7 @@ def create_completion(
             repeat_penalty: The penalty to apply to repeated tokens.
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
+            beam_width: Number of beams to use in beam search. 0 disables.
 
         Raises:
             ValueError: If the requested tokens exceed the context window.

@@ -1345,7 +1411,8 @@
             model=model,
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
-            grammar=grammar
+            grammar=grammar,
+            beam_width=beam_width,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks

@@ -1376,6 +1443,7 @@ def __call__(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
+        beam_width: int = 0,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         """Generate text from a prompt.
 

@@ -1391,6 +1459,7 @@ def __call__(
             repeat_penalty: The penalty to apply to repeated tokens.
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
+            beam_width: Number of beams to use in beam search. 0 disables.
 
         Raises:
             ValueError: If the requested tokens exceed the context window.

@@ -1421,6 +1490,7 @@ def __call__(
             stopping_criteria=stopping_criteria,
             logits_processor=logits_processor,
             grammar=grammar,
+            beam_width=beam_width,
         )
 
     def _convert_text_completion_to_chat(
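
As a quick illustration of the high-level path, the sketch below passes the new beam_width argument through __call__()/create_completion(); the model path is a placeholder, but the parameter itself is the one added in the diff above:

from llama_cpp import Llama

# Placeholder model path; any model file that Llama() can load will do.
llm = Llama(model_path="./models/7B/ggml-model.bin")

# beam_width > 0 routes generate() through llama_cpp.llama_beam_search()
# instead of the usual token-by-token sampling loop; 0 disables beam search.
output = llm(
    "Q: Name the planets in the solar system. A:",
    max_tokens=64,
    beam_width=2,
)
print(output["choices"][0]["text"])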

llama_cpp/llama_cpp.py (+32)
@@ -1312,6 +1312,38 @@ def llama_grammar_accept_token(
 ]
 _lib.llama_grammar_accept_token.restype = None
 
+# Beam search types and function
+class llama_beam_view(Structure):
+    _fields_ = [
+        ("tokens", POINTER(c_int)),
+        ("n_tokens", c_size_t),
+        ("p", c_float),
+        ("eos", c_bool)
+    ]
+
+class llama_beams_state(Structure):
+    _fields_ = [
+        ("beam_views", POINTER(llama_beam_view)),
+        ("n_beams", c_size_t),
+        ("common_prefix_length", c_size_t),
+        ("last_call", c_bool)
+    ]
+
+# typedef void (*llama_beam_search_callback_fn_t)(void* callback_data, llama_beams_state);
+llama_beam_search_callback = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_state)
+
+def llama_beam_search(ctx: llama_context_p,
+                      callback: llama_beam_search_callback,
+                      callback_data: c_void_p,
+                      n_beams: c_size_t,
+                      n_past: c_int,
+                      n_predict: c_int,
+                      n_threads: c_int):
+    return _lib.llama_beam_search(ctx, callback, callback_data, n_beams, n_past, n_predict, n_threads)
+
+_lib.llama_beam_search.argtypes = [llama_context_p, llama_beam_search_callback, c_void_p, c_size_t, c_int, c_int, c_int]
+_lib.llama_beam_search.restype = None
+
 # Performance information
 
 
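To show how these low-level pieces fit together, here is an illustrative sketch of wiring a Python callback through the new binding; the callback body is only an example (the real one used by the wrapper is beam_search_callback() in llama.py above), and the final call is commented out because it needs an already-created llama_context_p:

import llama_cpp

def print_beams(callback_data, beams_state):
    # Inspect each candidate beam; a real callback should also set
    # beam_view.eos when a beam reaches end-of-sentence.
    for i in range(beams_state.n_beams):
        view = beams_state.beam_views[i]
        print(f"beam {i}: p={view.p}, n_tokens={view.n_tokens}, eos={view.eos}")

# Wrap the Python function in the CFUNCTYPE declared above so it can be
# handed to the C library.
callback = llama_cpp.llama_beam_search_callback(print_beams)

# With a loaded context (ctx), the search would be started roughly like this:
# llama_cpp.llama_beam_search(ctx, callback, None, n_beams=2,
#                             n_past=n_tokens, n_predict=n_remain, n_threads=4)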

llama_cpp/server/app.py (+2 -1)
@@ -70,7 +70,7 @@ class Settings(BaseSettings):
         ge=0,
         description="Last n tokens to keep for repeat penalty calculation.",
     )
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    logits_all: bool = Field(default=False, description="Whether to return logits.")
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",

@@ -525,6 +525,7 @@ class CreateCompletionRequest(BaseModel):
     top_k: int = top_k_field
     repeat_penalty: float = repeat_penalty_field
     logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
+    beam_width: int = 0
 
     model_config = {
         "json_schema_extra": {

vendor/llama.cpp (submodule update)
