diff --git a/examples/llama.swiftui/.gitignore b/examples/llama.swiftui/.gitignore index 4f405ccc66e1ae..9bce6af399ba96 100644 --- a/examples/llama.swiftui/.gitignore +++ b/examples/llama.swiftui/.gitignore @@ -1 +1 @@ -xcuserdata \ No newline at end of file +xcuserdata diff --git a/examples/llama.swiftui/README.md b/examples/llama.swiftui/README.md index 44049dc81c5b3f..fa68e6ed8e34db 100644 --- a/examples/llama.swiftui/README.md +++ b/examples/llama.swiftui/README.md @@ -1,7 +1,7 @@ # llama.swiftui -Local inference of llama.cpp on an iPhone. -So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well. +Local inference of llama.cpp on an iPhone. +So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well. https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545 diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 7f8b0f0b6d04a9..5f0e7db50404bd 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -11,28 +11,28 @@ actor LlamaContext { private var context: OpaquePointer private var batch: llama_batch private var tokens_list: [llama_token] - + var n_len: Int32 = 512 var n_cur: Int32 = 0 var n_decode: Int32 = 0 - + init(model: OpaquePointer, context: OpaquePointer) { self.model = model self.context = context self.tokens_list = [] self.batch = llama_batch_init(512, 0, 1) } - + deinit { llama_free(context) llama_free_model(model) llama_backend_free() } - + static func createContext(path: String) throws -> LlamaContext { llama_backend_init(false) let model_params = llama_model_default_params() - + let model = llama_load_model_from_file(path, model_params) guard let model else { print("Could not load model at \(path)") @@ -43,41 +43,41 @@ actor LlamaContext { ctx_params.n_ctx = 2048 ctx_params.n_threads = 8 ctx_params.n_threads_batch = 8 - + let context = llama_new_context_with_model(model, ctx_params) guard let context else { print("Could not load context!") throw LlamaError.couldNotInitializeContext } - + return LlamaContext(model: model, context: context) } - + func get_n_tokens() -> Int32 { return batch.n_tokens; } - + func completion_init(text: String) { print("attempting to complete \"\(text)\"") - + tokens_list = tokenize(text: text, add_bos: true) - + let n_ctx = llama_n_ctx(context) let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count) - + print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)") if n_kv_req > n_ctx { print("error: n_kv_req > n_ctx, the required KV cache size is not big enough") } - + for id in tokens_list { print(token_to_piece(token: id)) } - + // batch = llama_batch_init(512, 0) // done in init() batch.n_tokens = Int32(tokens_list.count) - + for i1 in 0.. String { var new_token_id: llama_token = 0 - + let n_vocab = llama_n_vocab(model) let logits = llama_get_logits_ith(context, batch.n_tokens - 1) - + var candidates = Array() candidates.reserveCapacity(Int(n_vocab)) - + for token_id in 0.. [llama_token] { let n_tokens = text.count + (add_bos ? 1 : 0) let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false) - + var swiftTokens: [llama_token] = [] for i in 0.. String { let result = UnsafeMutablePointer.allocate(capacity: 8) result.initialize(repeating: Int8(0), count: 8) - + let _ = llama_token_to_piece(model, token, result, 8) - + let resultStr = String(cString: result) - + result.deallocate() - + return resultStr } -} +} \ No newline at end of file diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index ed1573f4e2d499..babc60cdcc9dcc 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -3,7 +3,7 @@ import Foundation @MainActor class LlamaState: ObservableObject { @Published var messageLog = "" - + private var llamaContext: LlamaContext? private var modelUrl: URL? { Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models") @@ -17,7 +17,6 @@ class LlamaState: ObservableObject { } } - private func loadModel() throws { messageLog += "Loading model...\n" if let modelUrl { @@ -27,7 +26,7 @@ class LlamaState: ObservableObject { messageLog += "Could not locate model\n" } } - + func complete(text: String) async { guard let llamaContext else { return @@ -35,7 +34,7 @@ class LlamaState: ObservableObject { messageLog += "Attempting to complete text...\n" await llamaContext.completion_init(text: text) messageLog += "\(text)" - + while await llamaContext.n_cur <= llamaContext.n_len { let result = await llamaContext.completion_loop() messageLog += "\(result)" diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 070ecde78c3e06..0bd16a806d10fa 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -2,15 +2,15 @@ import SwiftUI struct ContentView: View { @StateObject var llamaState = LlamaState() - + @State private var multiLineText = "" - + var body: some View { VStack { ScrollView(.vertical) { Text(llamaState.messageLog) } - + TextEditor(text: $multiLineText) .frame(height: 200) .padding() @@ -27,7 +27,7 @@ struct ContentView: View { } .padding() } - + func sendText() { Task { await llamaState.complete(text: multiLineText)