Commit c1ae815

fix(misc): Format
1 parent b77e507 commit c1ae815

35 files changed: +2270 -1332 lines changed
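
Note: every hunk below is consistent with Black's default style (double-quoted strings, trailing commas in exploded calls, argument lists wrapped at 88 columns, two blank lines before top-level defs). The commit message does not name the tool, so treat Black as an assumption. A minimal sketch of reproducing one of the changes below programmatically:

    import black  # assumption: the formatter behind this commit is not stated

    src = "parser.add_argument(\"-i\", \"--spm-infill\", action='store_true')\n"
    formatted = black.format_str(src, mode=black.FileMode())
    print(formatted)  # parser.add_argument("-i", "--spm-infill", action="store_true")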

examples/batch-processing/server.py

+1

@@ -25,6 +25,7 @@
 
 import openai.types.chat as types
 
+
 @app.post("/v1/chat/completions")
 def create_chat_completions():
     return {"message": "Hello World"}

examples/gradio_chat/local.py

+15 -7

@@ -6,25 +6,26 @@
 llama = llama_cpp.Llama.from_pretrained(
     repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
     filename="*q8_0.gguf",
-    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
-    verbose=False
+    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+        "Qwen/Qwen1.5-0.5B"
+    ),
+    verbose=False,
 )
 
 model = "gpt-3.5-turbo"
 
+
 def predict(message, history):
     messages = []
 
     for user_message, assistant_message in history:
         messages.append({"role": "user", "content": user_message})
         messages.append({"role": "assistant", "content": assistant_message})
-
+
     messages.append({"role": "user", "content": message})
 
     response = llama.create_chat_completion_openai_v1(
-        model=model,
-        messages=messages,
-        stream=True
+        model=model, messages=messages, stream=True
     )
 
     text = ""
@@ -52,7 +53,14 @@ def predict(message, history):
 """
 
 with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css, fill_height=True) as demo:
-    gr.ChatInterface(predict, fill_height=True, examples=["What is the capital of France?", "Who was the first person on the moon?"])
+    gr.ChatInterface(
+        predict,
+        fill_height=True,
+        examples=[
+            "What is the capital of France?",
+            "Who was the first person on the moon?",
+        ],
+    )
 
 
 if __name__ == "__main__":
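
The first hunk cuts off at text = "". For context, a sketch of how the rest of predict typically consumes the stream created by the reformatted call (the chunk attribute access assumes the OpenAI-style ChatCompletionChunk objects that the *_openai_v1 variant returns; the loop body itself is outside this hunk):

    # continuation of predict(): `response` is the stream created above
    text = ""
    for chunk in response:
        content = chunk.choices[0].delta.content
        if content:
            text += content
            yield text  # Gradio's ChatInterface renders the growing partial reply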

examples/gradio_chat/server.py

+12 -9

@@ -2,26 +2,22 @@
 
 from openai import OpenAI
 
-client = OpenAI(
-    base_url="http://localhost:8000/v1",
-    api_key="llama.cpp"
-)
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="llama.cpp")
 
 model = "gpt-3.5-turbo"
 
+
 def predict(message, history):
     messages = []
 
     for user_message, assistant_message in history:
         messages.append({"role": "user", "content": user_message})
         messages.append({"role": "assistant", "content": assistant_message})
-
+
     messages.append({"role": "user", "content": message})
 
     response = client.chat.completions.create(
-        model=model,
-        messages=messages,
-        stream=True
+        model=model, messages=messages, stream=True
     )
 
     text = ""
@@ -49,7 +45,14 @@ def predict(message, history):
 """
 
 with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css, fill_height=True) as demo:
-    gr.ChatInterface(predict, fill_height=True, examples=["What is the capital of France?", "Who was the first person on the moon?"])
+    gr.ChatInterface(
+        predict,
+        fill_height=True,
+        examples=[
+            "What is the capital of France?",
+            "Who was the first person on the moon?",
+        ],
+    )
 
 
 if __name__ == "__main__":
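
This client targets a local OpenAI-compatible endpoint (base_url="http://localhost:8000/v1" with the dummy key "llama.cpp"). A hedged sketch of a one-off, non-streaming request against the same client and model, handy for checking the server is reachable before wiring it into the Gradio UI (the prompt string is illustrative, not part of the example):

    # quick connectivity check using the `client` and `model` defined above
    reply = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Say hello in one word."}],
    )
    print(reply.choices[0].message.content)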

examples/hf_pull/main.py

+9 -12

@@ -5,29 +5,26 @@
 llama = llama_cpp.Llama.from_pretrained(
     repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
     filename="*q8_0.gguf",
-    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
-    verbose=False
+    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+        "Qwen/Qwen1.5-0.5B"
+    ),
+    verbose=False,
 )
 
 response = llama.create_chat_completion(
-    messages=[
-        {
-            "role": "user",
-            "content": "What is the capital of France?"
-        }
-    ],
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
     response_format={
         "type": "json_object",
         "schema": {
             "type": "object",
             "properties": {
                 "country": {"type": "string"},
-                "capital": {"type": "string"}
+                "capital": {"type": "string"},
             },
             "required": ["country", "capital"],
-        }
+        },
     },
-    stream=True
+    stream=True,
 )
 
 for chunk in response:
@@ -36,4 +33,4 @@
         continue
     print(delta["content"], end="", flush=True)
 
-print()
+print()
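
Because the request constrains output with a JSON-object response_format (a schema with country and capital), the streamed deltas concatenate into one JSON document. A sketch of collecting and parsing it instead of printing each piece (a stream can only be iterated once, so this replaces the print loop; the chunk/delta shape matches the loop shown above):

    import json

    parts = []
    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            parts.append(delta["content"])

    answer = json.loads("".join(parts))  # e.g. {"country": "France", "capital": "Paris"}
    print(answer["capital"])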

examples/high_level_api/fastapi_server.py

+1

@@ -24,6 +24,7 @@
 To actually see the implementation of the server, see llama_cpp/server/app.py
 
 """
+
 import os
 import uvicorn
 

examples/high_level_api/high_level_api_infill.py

+12 -8

@@ -6,16 +6,16 @@
 parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin")
 parser.add_argument("-p", "--prompt", type=str, default="def add(")
 parser.add_argument("-s", "--suffix", type=str, default="\n return sum\n\n")
-parser.add_argument("-i", "--spm-infill", action='store_true')
+parser.add_argument("-i", "--spm-infill", action="store_true")
 args = parser.parse_args()
 
 llm = Llama(model_path=args.model, n_gpu_layers=-1, spm_infill=args.spm_infill)
 
 output = llm.create_completion(
-    temperature = 0.0,
-    repeat_penalty = 1.0,
-    prompt = args.prompt,
-    suffix = args.suffix,
+    temperature=0.0,
+    repeat_penalty=1.0,
+    prompt=args.prompt,
+    suffix=args.suffix,
 )
 
 # Models sometimes repeat suffix in response, attempt to filter that
@@ -25,9 +25,13 @@
 unwanted_response_length = len(unwanted_response_suffix)
 
 filtered = False
-if unwanted_response_suffix and response_stripped[-unwanted_response_length:] == unwanted_response_suffix:
+if (
+    unwanted_response_suffix
+    and response_stripped[-unwanted_response_length:] == unwanted_response_suffix
+):
     response = response_stripped[:-unwanted_response_length]
     filtered = True
 
-print(f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m")
-
+print(
+    f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m"
+)
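
The second hunk uses response, response_stripped, and unwanted_response_suffix without showing their definitions, which sit between the two hunks. A hedged reconstruction of that glue code, assuming create_completion returns the usual OpenAI-style completion dict (these exact lines are not shown in this diff):

    # assumed lines between the two hunks above
    response = output["choices"][0]["text"]
    response_stripped = response.rstrip()
    unwanted_response_suffix = args.suffix.rstrip()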

examples/low_level_api/Chat.py

+28 -24

@@ -3,10 +3,12 @@
 from common import GptParams
 from low_level_api_chat_cpp import LLaMAInteract
 
+
 def env_or_def(env, default):
-    if (env in os.environ):
-        return os.environ[env]
-    return default
+    if env in os.environ:
+        return os.environ[env]
+    return default
+
 
 AI_NAME = env_or_def("AI_NAME", "ChatLLaMa")
 MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
@@ -15,10 +17,10 @@ def env_or_def(env, default):
 N_THREAD = int(env_or_def("N_THREAD", "8"))
 
 today = datetime.datetime.today()
-DATE_YEAR=today.strftime("%Y")
-DATE_TIME=today.strftime("%H:%M")
+DATE_YEAR = today.strftime("%Y")
+DATE_TIME = today.strftime("%H:%M")
 
-prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
+prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
 {AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision.
 There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
 The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
@@ -45,27 +47,29 @@ def env_or_def(env, default):
 {AI_NAME}: Blue.
 {USER_NAME}: What time is it?
 {AI_NAME}: It is {DATE_TIME}.
-{USER_NAME}:""" + " ".join(sys.argv[1:])
+{USER_NAME}:""" + " ".join(
+    sys.argv[1:]
+)
 
 print("Loading model...")
 params = GptParams(
-    n_ctx=2048,
-    temp=0.7,
-    top_k=40,
-    top_p=0.5,
-    repeat_last_n=256,
-    n_batch=1024,
-    repeat_penalty=1.17647,
-    model=MODEL,
-    n_threads=N_THREAD,
-    n_predict=N_PREDICTS,
-    use_color=True,
-    interactive=True,
-    antiprompt=[f"{USER_NAME}:"],
-    input_prefix=" ",
-    input_suffix=f"{AI_NAME}:",
-    prompt=prompt,
+    n_ctx=2048,
+    temp=0.7,
+    top_k=40,
+    top_p=0.5,
+    repeat_last_n=256,
+    n_batch=1024,
+    repeat_penalty=1.17647,
+    model=MODEL,
+    n_threads=N_THREAD,
+    n_predict=N_PREDICTS,
+    use_color=True,
+    interactive=True,
+    antiprompt=[f"{USER_NAME}:"],
+    input_prefix=" ",
+    input_suffix=f"{AI_NAME}:",
+    prompt=prompt,
 )
 
 with LLaMAInteract(params) as m:
-    m.interact()
+    m.interact()
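
This hunk rewrites env_or_def's condition without the redundant parentheses. The helper behaves like a plain environment lookup with a default; a tiny self-contained sketch of the equivalence (illustrative only):

    import os

    def env_or_def(env, default):
        if env in os.environ:
            return os.environ[env]
        return default

    # same result as the stdlib one-liner
    assert env_or_def("AI_NAME", "ChatLLaMa") == os.environ.get("AI_NAME", "ChatLLaMa")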

examples/low_level_api/Miku.py

+25 -21

@@ -3,18 +3,20 @@
 from common import GptParams
 from low_level_api_chat_cpp import LLaMAInteract
 
+
 def env_or_def(env, default):
-    if (env in os.environ):
-        return os.environ[env]
-    return default
+    if env in os.environ:
+        return os.environ[env]
+    return default
+
 
 AI_NAME = env_or_def("AI_NAME", "Miku")
 MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
 USER_NAME = env_or_def("USER_NAME", "Anon")
 N_PREDICTS = int(env_or_def("N_PREDICTS", "4096"))
 N_THREAD = int(env_or_def("N_THREAD", "0"))
 
-prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer.
+prompt = f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer.
 {AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 {AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
 {AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
@@ -32,28 +34,30 @@ def env_or_def(env, default):
 {AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
 {AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that!
 {AI_NAME}: What do you like to do in your free time? ^_^
-{USER_NAME}:""" + " ".join(sys.argv[1:])
+{USER_NAME}:""" + " ".join(
+    sys.argv[1:]
+)
 
 print("Loading model...")
 params = GptParams(
-    n_batch=1024,
-    n_ctx=2048,
-    n_keep=-1,
-    repeat_last_n=256,
-    repeat_penalty=1.17647,
-    temp=0.7,
-    top_k=40,
-    top_p=0.5,
-    model=MODEL,
-    n_predict=N_PREDICTS,
-    use_color=True,
-    interactive=True,
-    antiprompt=[f"{USER_NAME}:"],
-    prompt=prompt,
+    n_batch=1024,
+    n_ctx=2048,
+    n_keep=-1,
+    repeat_last_n=256,
+    repeat_penalty=1.17647,
+    temp=0.7,
+    top_k=40,
+    top_p=0.5,
+    model=MODEL,
+    n_predict=N_PREDICTS,
+    use_color=True,
+    interactive=True,
+    antiprompt=[f"{USER_NAME}:"],
+    prompt=prompt,
 )
 
 if N_THREAD > 0:
-    params.n_threads = N_THREAD
+    params.n_threads = N_THREAD
 
 with LLaMAInteract(params) as m:
-    m.interact()
+    m.interact()
