diff --git a/README.md b/README.md index f2caeb8..c39bfa0 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,12 @@ Have you ever wanted to inference a baby Llama 2 model in pure Mojo? No? Well, now you can! -supported version: [Mojo 24.3](https://docs.modular.com/mojo/changelog#v243-2024-05-02) +supported version: [Mojo 0.5.0](https://docs.modular.com/mojo/changelog.html#v0.5.0-2023-11-2) With the release of [Mojo](https://www.modular.com/blog/mojo-its-finally-here), I was inspired to take my Python port of [llama2.py](https://github.com/tairov/llama2.py) and transition it to Mojo. The result? A version that leverages Mojo's SIMD & vectorization primitives, boosting the Python performance by nearly 250x. -Impressively, after few native improvements the Mojo version outperforms the original `llama2.c` by 30% in multi-threaded inference. As well as it outperforms `llama.cpp` on baby-llama inference on CPU by 20%. +Impressively, after few native improvements the Mojo version outperforms the original `llama2.c` compiled in `runfast` mode by 30% in Multi-threaded mode. As well as it outperforms `llama.cpp` on baby-llama inference on CPU by 20%. This showcases the potential of hardware-level optimizations through Mojo's advanced features. ## supported models @@ -31,7 +31,7 @@ At the moment, the following models were successfully executed via `llama2.mojo` **Mac M1 Max (6 threads)** -| Model | [llama2.c](https://github.com/karpathy/llama2.c) (OMP/parallelized) | **llama2.mojo** (parallelized) | llama.cpp (CPU, 6 threads) | [llama2.py](https://github.com/tairov/llama2.py) | +| Model | [llama2.c](https://github.com/karpathy/llama2.c) (OMP/parallelized) | **llama2.mojo** (parallelized) | llama.cpp (cpu) | [llama2.py](https://github.com/tairov/llama2.py) | |-----------------|---------------------------------------------------------------------|--------------------------------|-----------------|--------------------------------------------------| | stories15M.bin | 730 tok/s | 1025 tok/s | 890 tok/s | 38 tok/s (pypi) | | stories42M.bin | 270 tok/s | 490 tok/s | 420 tok/s | - | @@ -133,15 +133,6 @@ readers to easily find the latest updates and extensions to the project. model, and applications of the `mojo` programming language. Citing the project helps growth of the knowledge community around these topics. We appreciate your support through referencing `llama2.mojo`! -## in research & industry - -Our project has been used in academic. If you’ve published a paper or article that uses this project, please send a PR, so we can feature it here -* https://arxiv.org/abs/2410.17736 - MojoBench: Language Modeling and Benchmarks for Mojo -* https://arxiv.org/abs/2505.04080 - MojoFrame: Dataframe Library in Mojo Language -* https://ieeexplore.ieee.org/abstract/document/10883176/metrics - A Comprehensive Review of Mojo: A High-Performance Programming Language -* https://arxiv.org/abs/2502.01651 - Fine-tuning LLaMA 2 interference: a comparative study of language implementations for optimal efficiency - - ## play with Tinyllama-1.1B-Chat-v0.2 The [TinyLlama](https://github.com/jzhang38/TinyLlama) is a 1.1B Llama model trained on 3 trillion tokens. This @@ -202,5 +193,3 @@ This function takes an integer n as a parameter and returns the next Fibonacci n ## license MIT - - diff --git a/llama2.mojo b/llama2.mojo index 134c6f0..229226d 100644 --- a/llama2.mojo +++ b/llama2.mojo @@ -1,14 +1,14 @@ from algorithm import sum -from algorithm import vectorize, parallelize +from algorithm import vectorize, parallelize, unroll, tile from builtin import string -from math import round +from math import round, log2 from memory import memset_zero, memcpy, stack_allocation -from memory.unsafe import DTypePointer, bitcast -from tensor import rand -from sys.info import num_performance_cores +from memory.buffer import Buffer +from memory.unsafe import DTypePointer +from random import rand +from runtime.llcl import num_cores from sys import argv from tensor import Tensor, TensorShape, TensorSpec -from collections import List, Dict # The SIMD vector width. from sys.info import simdwidthof @@ -17,43 +17,17 @@ import os import random import time -alias NUM_CONFIG_INT = 7 var workers = 0 alias nelts = (4 * simdwidthof[DType.float32]()) +alias PointerString = Pointer[UInt8] alias BufferPtrType = DTypePointer[DType.uint8] alias BufferPtrFloat32 = DTypePointer[DType.float32] -alias PointerStrings = Pointer[String] +alias PointerStrings = Pointer[PointerString] alias TensorF32 = Tensor[DType.float32] -@register_passable -struct Accumulator[T: DType, width: Int]: - # ideally this could be SIMD[T, width] but the width - # in accumulate() method is compared by identity - var data: DTypePointer[T] - - @always_inline - fn __init__() -> Self: - # allocate a DTypePointer on stack that doesn't need to be freed. - var data = stack_allocation[width, T]() - memset_zero(data, width) - return Self {data: data} - - @always_inline - fn accumulate[_width: Int](inout self, val: SIMD[T, _width]) -> None: - # This is a hack to make sure both SIMD have _width length. - # SIMD[T, width] += SIMD[T, _width] is always an error. - var newVal = self.data.load[width=_width]() + val - self.data.store[width=_width](newVal) - - @always_inline - fn total(self) -> SIMD[T, 1]: - return self.data.load[width=width]().reduce_add() - - -@value struct TensorSlice: # Provides a view into a tensor representing a 1D slice on its first or first 2 dimensions. # Same function signatures as Tensor but without owning the data. @@ -61,7 +35,7 @@ struct TensorSlice: var _shape: TensorShape fn __init__(inout self, t: TensorF32, layer: Int) raises: - var elements_per_layer = t.num_elements() // t.dim(0) + let elements_per_layer = t.num_elements() // t.dim(0) self._data = t.data().offset(layer * elements_per_layer) if t.rank() == 2: self._shape = TensorShape(t.dim(1)) @@ -73,8 +47,8 @@ struct TensorSlice: raise Error("TensorSlice: rank greater than 3 not implemented.") fn __init__(inout self, t: TensorF32, layer: Int, row: Int) raises: - var elements_per_layer = t.num_elements() // t.dim(0) - var elements_per_row = elements_per_layer // t.dim(1) + let elements_per_layer = t.num_elements() // t.dim(0) + let elements_per_row = elements_per_layer // t.dim(1) self._data = t.data().offset( layer * elements_per_layer + row * elements_per_row ) @@ -84,8 +58,8 @@ struct TensorSlice: # Compiler complains if _shape not defined self._shape = TensorShape(1) raise Error( - "Trying to slice a 1D Tensor by layer and row. This requires a" - " 3D Tensor." + "Trying to slice a 1D Tensor by layer and row. This requires a 3D" + " Tensor." ) else: # Compiler complains if _shape not defined @@ -107,132 +81,257 @@ struct TensorSlice: fn rank(self) -> Int: return self._shape.rank() - fn load[width: Int](self, idx: Int) -> SIMD[DType.float32, nelts]: - return self._data.load[width=nelts](idx) + fn simd_load[nelts: Int](self, idx: Int) -> SIMD[DType.float32, nelts]: + return self._data.simd_load[nelts](idx) - fn load[width: Int](self, *indices: Int) -> SIMD[DType.float32, nelts]: + fn simd_load[nelts: Int](self, *indices: Int) -> SIMD[DType.float32, nelts]: if len(VariadicList(indices)) > 2: print( - "Warning: TensorSlice only supports 1D and 2D indexing. " - " Results are unlikely to be correct." + "Warning: TensorSlice only supports 1D and 2D indexing. Results are" + " unlikely to be correct." ) - return self.load[width=nelts](indices[0] * self._shape[1] + indices[1]) + return self.simd_load[nelts](indices[0] * self._shape[1] + indices[1]) - fn load[ - width: Int + fn simd_load[ + nelts: Int ](self, indices: StaticIntTuple[2]) -> SIMD[DType.float32, nelts]: - return self._data.load[width=nelts]( - indices[0] * self._shape[1] + indices[1] - ) + return self._data.simd_load[nelts](indices[0] * self._shape[1] + indices[1]) fn __getitem__(self, idx: Int) -> SIMD[DType.float32, 1]: - return self._data.load[width=1](idx) + return self._data.simd_load[1](idx) - fn store[nelts: Int](self, idx: Int, val: SIMD[DType.float32, nelts]): - return self._data.store[width=nelts](idx, val) + fn simd_store[nelts: Int](self, idx: Int, val: SIMD[DType.float32, nelts]): + return self._data.simd_store[nelts](idx, val) fn __setitem__(self, idx: Int, val: SIMD[DType.float32, 1]): - return self.store[1](idx, val) + return self.simd_store[1](idx, val) + + +fn read_val_int(inout buf: FileBuf) raises -> Int: + # DTypePointer[DType.ui8](buf.data).bitcast[DType.ui8]() + let data = buf.data.offset(buf.get_offset()).bitcast[DType.int32]() + let result = data.load(0) + buf.move_offset(4) + return result.to_int() + + +fn read_val_float32(inout buf: FileBuf) raises -> Float32: + # DTypePointer[DType.ui8](buf.data).bitcast[DType.ui8]() + let val = buf.data.offset(buf.get_offset()).bitcast[DType.float32]().load(0) + buf.move_offset(4) + return val + + +fn read_val_str(inout buf: FileBuf, slen: Int) raises -> PointerString: + let str = PointerString.alloc(slen + 1) + for i in range(slen): + str.store(i, buf.data.load(buf.get_offset())) + buf.move_offset(1) + str.store(slen, 0) + + return str + + +fn str_len(s: PointerString) -> Int: + var len = 0 + while s[len] != 0: + len += 1 + return len # not optimal concat -fn str_concat(s1: String, s2: String) -> String: - var l1 = len(s1) - var l2 = len(s2) - var str = List[Int8](capacity=l1 + l2 + 1) - memcpy(str.data, s1._buffer.data, l1) - memcpy(str.data + l1, s2._buffer.data, l2) - str[l1 + l2] = 0 - str.size = l1 + l2 + 1 - return str^ - - -fn string_compare(a: String, b: String) -> Int: +fn str_concat(s1: PointerString, s2: PointerString) -> PointerString: + let l1 = str_len(s1) + let l2 = str_len(s2) + let str = PointerString.alloc(l1 + l2 + 1) + memcpy[UInt8](str, s1, l1) + memcpy[UInt8](str.offset(l1), s2, l2) + str.store(l1 + l2, 0) + return str + + +fn str_to_ptr(s: String) -> PointerString: + let ret = PointerString.alloc(len(s) + 1) + for i in range(len(s)): + ret.store(i, ord(s[i])) + ret.store(len(s), 0) + return ret + + +fn string_compare(a: PointerString, b: PointerString) -> Int: var index = 0 - while a._buffer[index] != 0 and b._buffer[index] != 0: - if a._buffer[index] < b._buffer[index]: + while a[index] != 0 and b[index] != 0: + if a[index] < b[index]: return -1 - if a._buffer[index] > b._buffer[index]: + if a[index] > b[index]: return 1 index += 1 - if a._buffer[index] != 0 and b._buffer[index] == 0: + if a[index] != 0 and b[index] == 0: return 1 - if a._buffer[index] == 0 and b._buffer[index] != 0: + if a[index] == 0 and b[index] != 0: return -1 - _ = (a, b) - return 0 + return 0 -fn wrap(token: String) -> String: - alias a = String("\\n") - alias b = String("\\t") - alias c = String("'") - alias d = String('"') - if token == a: - return String(List[Int8](0x0A, 0)) - if token == b: - return String(List[Int8](0x09, 0)) - if token == c: - return String(List[Int8](0x27, 0)) - if token == d: - return String(List[Int8](0x22, 0)) +# Quicksort helper function to find the partition position +fn partition( + inout array: PointerStrings, inout indices: DynamicVector[Int], low: Int, high: Int +) -> Int: + let pivot = array[high] + var ii = low - 1 + for jj in range(low, high): + if string_compare(pivot, array[jj]) == 1: + # If element smaller than pivot, swap + ii = ii + 1 + + let tmp = array[ii] + let tmp_idx = indices[ii] + array.store(ii, array[jj]) + indices[ii] = indices[jj] + array.store(jj, tmp) + indices[jj] = tmp_idx + + # Swap the pivot element + let tmp = array[ii + 1] + let tmp_idx = indices[ii + 1] + array.store(ii + 1, array[high]) + indices[ii + 1] = indices[high] + array.store(high, tmp) + indices[high] = tmp_idx + + return ii + 1 + + +fn quicksort( + inout array: PointerStrings, inout indices: DynamicVector[Int], low: Int, high: Int +): + if low < high: + let pi = partition(array, indices, low, high) + quicksort(array, indices, low, pi - 1) + quicksort(array, indices, pi + 1, high) + + +struct FileBuf: + var data: BufferPtrType + var offset: Int + var size: Int + + fn __init__(inout self): + self.data = BufferPtrType() + self.offset = 0 + self.size = 0 + + fn __del__(owned self): + self.data.free() + + fn move_offset(inout self, size: Int) raises: + let new_offset = self.offset + size + if new_offset > self.size: + raise Error("Resulting offset will be past the end of the FileBuf") + if new_offset < 0: + raise Error("Resulting offset will be before the beginning of the FileBuf") + self.offset = new_offset + + fn bitcast_offset_f32(inout self, size: Int) raises -> BufferPtrFloat32: + let ret = self.data.offset(self.offset).bitcast[DType.float32]() + self.move_offset(size * sizeof[DType.float32]()) + return ret + + fn get_offset(self) raises -> Int: + if self.offset > self.size: + raise Error("Offset is past the end of the FileBuf") + if self.offset < 0: + raise Error("Offset is before the beginning of the FileBuf") + return self.offset + + +fn wrap(token: PointerString) -> PointerString: + if string_compare(token, str_to_ptr("\\n")) == 0: + return str_to_ptr("<0x0A>") + if string_compare(token, str_to_ptr("\\t")) == 0: + return str_to_ptr("<0x09>") + if string_compare(token, str_to_ptr("'")) == 0: + return str_to_ptr("<0x27>") + elif string_compare(token, str_to_ptr('"')) == 0: + return str_to_ptr("<0x22>") return token -fn string_from_bytes(owned bytes: List[Int8]) -> String: - bytes.append(0) - return bytes^ - - -@value struct Tokenizer: - var vocab: List[String] - var vocab_scores: List[Float32] + var vocab: PointerStrings + var vocab_scores: BufferPtrFloat32 var max_token_length: Int var vocab_size: Int - var map_vocab_to_index: Dict[String, Int] - - fn __init__(inout self, vocab_size: Int, filename: String) raises: - with open(filename, "rb") as f: - - @parameter - fn read_bytes_as[dtype: DType](size: Int) raises -> SIMD[dtype, 1]: - # a List that keeps ownership of the pointer - var bytes = f.read_bytes(size) - # copy one element of new type after casting pointer - var result = bytes.data.bitcast[SIMD[dtype, 1]]()[0] - # orginal List and data can be destroyed - _ = bytes - return result - - self.vocab_size = vocab_size - self.vocab_scores = List[Float32](capacity=self.vocab_size) - self.vocab = List[String](capacity=self.vocab_size) - self.map_vocab_to_index = Dict[String, Int]() - self.max_token_length = int(read_bytes_as[DType.int32](4)) - - # read vocab_scores & vocab values (tokens) - for i in range(self.vocab_size): - var score = read_bytes_as[DType.float32](4) - var slen = int(read_bytes_as[DType.int32](4)) - var token = string_from_bytes(f.read_bytes(slen)) - self.vocab.append(token^) - self.vocab_scores.append(score) - self.map_vocab_to_index[self.vocab[i]] = i - - fn find(self, token_o: String) -> Int: - var token = wrap(token_o) - var index = self.map_vocab_to_index.find(token) - if index: - return index.value()[] + var sorted_vocab: PointerStrings + var sorted_indices: DynamicVector[Int] + + fn __init__(inout self, vocab_size: Int, inout buf: FileBuf) raises -> None: + self.vocab_size = vocab_size + self.max_token_length = read_val_int(buf) + self.vocab_scores = BufferPtrFloat32.alloc(self.vocab_size) + self.vocab = PointerStrings.alloc(self.vocab_size) + # lazy load sorted vocab + self.sorted_vocab = PointerStrings.alloc(0) + self.sorted_indices = DynamicVector[Int](0) + + # read vocab_scores & vocab values (tokens) + for i in range(0, self.vocab_size): + let score = read_val_float32(buf) + let slen = read_val_int(buf) + let token = read_val_str(buf, slen) + self.store_token(i, token, score) + return None + + fn __del__(owned self): + for i in range(0, self.vocab_size): + self.vocab[i].free() + self.vocab.free() + self.vocab_scores.free() + self.sorted_vocab.free() + + fn store_token( + inout self, index: Int, owned token: PointerString, score: Float32 + ) -> None: + self.vocab_scores.store(index, score) + self.vocab.store(index, token) + + # sort vocab by string_compare + fn sort(inout self) -> None: + if len(self.sorted_indices) < self.vocab_size: + self.sorted_indices = DynamicVector[Int](self.vocab_size) + self.sorted_vocab = PointerStrings.alloc(self.vocab_size) + for ii in range(self.vocab_size): + self.sorted_vocab.store(ii, self.vocab[ii]) + self.sorted_indices.push_back(ii) + + let n = self.vocab_size + quicksort(self.sorted_vocab, self.sorted_indices, 0, n - 1) + return None + + # Binary search that returns -1 if string is not found + fn find(inout self, token_o: PointerString) -> Int: + let token = wrap(token_o) + let n = self.vocab_size + if len(self.sorted_indices) < n: + self.sort() + var left = 0 + var right = n - 1 + while left <= right: + let mid = left + (right - left) // 2 + let comparison = string_compare(self.sorted_vocab[mid], token) + if comparison == 0: + return self.sorted_indices[mid] + if comparison < 0: + left = mid + 1 + else: + right = mid - 1 return -1 -@value struct Config: var dim: Int var kv_dim: Int @@ -244,42 +343,20 @@ struct Config: var vocab_size: Int var seq_len: Int var head_size: Int - var shared_weights: Bool - - fn __init__(inout self, fileName: String, print_config: Bool) raises: - var f = open(fileName, "r") - # reading 7 vars of type DType.int32 from the file - var bytes_of_config_params = NUM_CONFIG_INT * sizeof[DType.int32]() - # config_data_raw id Tensor[DType.int8] with bytes_of_config_params elements - var config_data_raw = f.read_bytes(bytes_of_config_params) - f.close() - # correct Tensor type and shape for easy reading, without copying data - var int32_ptr = config_data_raw.steal_data().bitcast[Int32]() - var config_data = Tensor(TensorShape(NUM_CONFIG_INT), int32_ptr) - self.dim = int(config_data[0]) - self.hidden_dim = int(config_data[1]) - self.n_layers = int(config_data[2]) - self.n_heads = int(config_data[3]) - self.n_kv_heads = int(config_data[4]) - self.vocab_size = int(config_data[5]) - self.seq_len = int(config_data[6]) - self.head_size = self.dim // self.n_heads - self.kv_dim = (self.n_kv_heads * self.dim) // self.n_heads - self.kv_mul = self.n_heads // self.n_kv_heads - # negative vocab size is hacky way of signaling unshared weights. bit yikes. - self.shared_weights = self.vocab_size > 0 - if not self.shared_weights: - self.vocab_size = -self.vocab_size - - if print_config: - print("config: dim, hidden_dim", self.dim, self.hidden_dim) - print("config: n_layers, n_heads", self.n_layers, self.n_heads) - print("config: vocab_size, seq_len", self.vocab_size, self.seq_len) - print("config: head_size", self.head_size) - print("config: kv_dim, kv_mul", self.kv_dim, self.kv_mul) - - -@value + + fn __init__(inout self): + self.dim = 0 + self.hidden_dim = 0 + self.n_layers = 0 + self.n_heads = 0 + self.n_kv_heads = 0 + self.vocab_size = 0 + self.seq_len = 0 + self.kv_dim = 0 + self.kv_mul = 0 + self.head_size = 0 + + struct RunState: var x: TensorF32 # activation at current time stamp (dim,) var xb: TensorF32 # same, but inside a residual branch (dim,) @@ -303,19 +380,14 @@ struct RunState: self.q = TensorF32(config.dim) self.att = TensorF32(config.n_heads, config.seq_len) self.logits = TensorF32(config.vocab_size) - self.key_cache = TensorF32( - config.n_layers, config.seq_len, config.kv_dim - ) - self.value_cache = TensorF32( - config.n_layers, config.seq_len, config.kv_dim - ) + self.key_cache = TensorF32(config.n_layers, config.seq_len, config.kv_dim) + self.value_cache = TensorF32(config.n_layers, config.seq_len, config.kv_dim) # So their updates flow to the caches, k and v are slices with shared memory. # Initialize with placeholders. The real tensors reference layer and position during forward pass. self.k = TensorSlice(TensorF32(TensorShape(1, config.kv_dim)), 1) self.v = TensorSlice(TensorF32(TensorShape(1, config.kv_dim)), 1) -@value struct TransformerWeights: var token_embedding_table: TensorF32 var freq_cis_real: TensorF32 @@ -332,83 +404,115 @@ struct TransformerWeights: var rms_final_weight: TensorF32 var wcls: TensorF32 - fn __init__(inout self, file_name: String, config: Config) raises: - var bytes_read = 0 - var f = open(file_name, "r") - - # throw away config data - _ = f.read_bytes(NUM_CONFIG_INT * sizeof[DType.int32]()) - bytes_read += NUM_CONFIG_INT * sizeof[DType.int32]() - - @parameter - fn read_weights(*dims: Int) raises -> TensorF32: - var shape = TensorShape(dims) - # The created tensor takes a 1D shape equal to bytes read - # So we can't reshape to target shape because dims don't match - var tmp = f.read_bytes( - shape.num_elements() * sizeof[DType.float32]() + fn __init__( + inout self, config: Config, shared_weights: Int, inout buf: FileBuf + ) raises: + fn load_weights(inout buf: FileBuf, *dims: Int) raises -> TensorF32: + # Ensure returned Tensor doesn't share a pointer with FileBuf + let shape = TensorShape(dims) + let result_data = BufferPtrFloat32.alloc(shape.num_elements()) + memcpy( + result_data, + buf.bitcast_offset_f32(shape.num_elements()), + shape.num_elements(), ) - bytes_read += shape.num_elements() * sizeof[DType.float32]() - var data = tmp.steal_data().bitcast[Float32]() - - return TensorF32(shape, data) - - self.token_embedding_table = read_weights(config.vocab_size, config.dim) - self.rms_att_weight = read_weights(config.n_layers, config.dim) - self.wq = read_weights(config.n_layers, config.dim, config.dim) - self.wk = read_weights(config.n_layers, config.kv_dim, config.dim) - self.wv = read_weights(config.n_layers, config.kv_dim, config.dim) - self.wo = read_weights(config.n_layers, config.dim, config.dim) - self.rms_ffn_weight = read_weights(config.n_layers, config.dim) - self.w1 = read_weights(config.n_layers, config.hidden_dim, config.dim) - self.w2 = read_weights(config.n_layers, config.dim, config.hidden_dim) - self.w3 = read_weights(config.n_layers, config.hidden_dim, config.dim) - self.rms_final_weight = read_weights(config.dim) + return TensorF32(result_data, shape) + + self.token_embedding_table = load_weights(buf, config.vocab_size, config.dim) + self.rms_att_weight = load_weights(buf, config.n_layers, config.dim) + self.wq = load_weights(buf, config.n_layers, config.dim, config.dim) + self.wk = load_weights(buf, config.n_layers, config.kv_dim, config.dim) + self.wv = load_weights(buf, config.n_layers, config.kv_dim, config.dim) + self.wo = load_weights(buf, config.n_layers, config.dim, config.dim) + self.rms_ffn_weight = load_weights(buf, config.n_layers, config.dim) + self.w1 = load_weights(buf, config.n_layers, config.hidden_dim, config.dim) + self.w2 = load_weights(buf, config.n_layers, config.dim, config.hidden_dim) + self.w3 = load_weights(buf, config.n_layers, config.hidden_dim, config.dim) + self.rms_final_weight = load_weights(buf, config.dim) # maybe need modifying for different model # config.head_size // 2 for stories and tinyllama-1.1 - self.freq_cis_real = read_weights(config.seq_len, config.head_size // 2) - self.freq_cis_imag = read_weights(config.seq_len, config.head_size // 2) - if config.shared_weights: + self.freq_cis_real = load_weights(buf, config.seq_len, config.head_size // 2) + self.freq_cis_imag = load_weights(buf, config.seq_len, config.head_size // 2) + if shared_weights: self.wcls = self.token_embedding_table else: - self.wcls = read_weights(config.vocab_size, config.dim) - f.close() - print( - "Total bytes read:", - bytes_read, - "Estimated checkpoint size: ", - bytes_read // 1024 // 1024, - "MB", - ) + self.wcls = load_weights(buf, config.vocab_size, config.dim) + + +fn read_file(file_name: String, inout buf: FileBuf) raises: + var fd = open(file_name, "r") + let data = fd.read() + fd.close() + + let cp_size = data._buffer.size + let cp_buf: BufferPtrType = BufferPtrType.alloc(cp_size) + + let data_ptr = data._as_ptr().bitcast[DType.uint8]() + + for i in range(cp_size): + cp_buf.store(i, data_ptr.load(i)) + + # don't free data + _ = data + + buf.data = cp_buf + buf.size = cp_size + buf.offset = 0 + return None + + +fn config_init(inout config: Config, inout buf: FileBuf) raises: + config.dim = read_val_int(buf) + config.hidden_dim = read_val_int(buf) + config.n_layers = read_val_int(buf) + config.n_heads = read_val_int(buf) + config.n_kv_heads = read_val_int(buf) + config.vocab_size = read_val_int(buf) + config.seq_len = read_val_int(buf) + config.head_size = config.dim // config.n_heads + config.kv_dim = (config.n_kv_heads * config.dim) // config.n_heads + config.kv_mul = config.n_heads // config.n_kv_heads + return None + + +@always_inline +fn accum(inout a: TensorF32, b: TensorF32) -> None: + let size = a.dim(0) + + @parameter + fn _acc[_nelts: Int](j: Int): + a.simd_store[_nelts](j, a.simd_load[_nelts](j) + b.simd_load[_nelts](j)) + + vectorize[nelts, _acc](size) @always_inline fn rmsnorm( - inout o: BufferPtrFloat32, - x: BufferPtrFloat32, - weight: BufferPtrFloat32, - size: Int, + inout o: BufferPtrFloat32, x: BufferPtrFloat32, weight: BufferPtrFloat32, size: Int ) -> None: # Calculate sum of squares - var tmp = Accumulator[DType.float32, nelts]() + var tmp = SIMD[DType.float32, nelts](0) @parameter fn _sum2[_nelts: Int](j: Int): - tmp.accumulate(x.offset(j).load[width=_nelts](0) ** 2) + if _nelts < nelts: + tmp[0] += (x.offset(j).simd_load[_nelts](0) ** 2).reduce_add() + else: + tmp += x.offset(j).simd_load[nelts](0) ** 2 - vectorize[_sum2, nelts](size) + vectorize[nelts, _sum2](size) - var ss: Float32 = tmp.total() + var ss: Float32 = tmp.reduce_add() ss = ss / size + 1e-5 ss = 1.0 / math.sqrt(ss) # Normalize and scale @parameter fn _norm[_nelts: Int](j: Int): - var val = weight.load[width=_nelts](j) * ss * x.load[width=_nelts](j) - o.offset(j).store[width=_nelts](0, val) + let val = weight.simd_load[_nelts](j) * ss * x.simd_load[_nelts](j) + o.offset(j).simd_store[_nelts](0, val) - vectorize[_norm, nelts](size) + vectorize[nelts, _norm](size) @always_inline @@ -432,66 +536,83 @@ fn softmax(inout x: TensorF32, start: Int, end: Int): @parameter fn _max[_nelts: Int](ii: Int): - var val = x.load[width=_nelts](start + ii).reduce_max() + let val = x.simd_load[_nelts](start + ii).reduce_max() if val > max_val: max_val = val - vectorize[_max, nelts](end - start) + vectorize[nelts, _max](end - start) - var acc = Accumulator[DType.float32, nelts]() + var ssum: Float32 = 0.0 @parameter fn _exp[_nelts: Int](ii: Int): - var val = math.exp(x.load[width=_nelts](start + ii) - max_val) - x.store[width=_nelts](start + ii, val) - acc.accumulate(val) - - vectorize[_exp, nelts](end - start) + x.simd_store[_nelts]( + start + ii, math.exp(x.simd_load[_nelts](start + ii) - max_val) + ) + ssum += x.simd_load[_nelts](start + ii).reduce_add() - var ssum = acc.total() + vectorize[nelts, _exp](end - start) @parameter fn _norm[_nelts: Int](ii: Int): - x.store[width=_nelts]( - start + ii, x.load[width=_nelts](start + ii) / ssum - ) + x.simd_store[_nelts](start + ii, x.simd_load[_nelts](start + ii) / ssum) - vectorize[_norm, nelts](end - start) + vectorize[nelts, _norm](end - start) @always_inline fn batch_matmul[ n: Int ]( - C: StaticTuple[BufferPtrFloat32, n], + C: StaticTuple[n, BufferPtrFloat32], A: BufferPtrFloat32, - B: StaticTuple[BufferPtrFloat32, n], + B: StaticTuple[n, BufferPtrFloat32], rows: Int, cols: Int, ): + alias nelts_list = VariadicList(128, 64, 32, 16, 8, 4, 2, 1) + @parameter fn compute_row(i: Int): - var tmp = StaticTuple[Accumulator[DType.float32, nelts], n]() + var tmp = StaticTuple[n, BufferPtrFloat32]() - @unroll - for k in range(n): - tmp[k] = Accumulator[DType.float32, nelts]() + @parameter + fn init[k: Int](): + tmp[k] = stack_allocation[nelts, DType.float32]() + memset_zero(tmp[k], nelts) - var row_offset = i * cols + unroll[n, init]() + let row_offset = i * cols + + var j = 0 @parameter - fn dot[_nelts: Int](j: Int): - var a = A.load[width=_nelts](j) + fn dot[z: Int](): + # we want the list to only contain nelts value that are 4 * simdwidth or less, if we use bigger values we get undefined behavior + @parameter + if nelts_list[z] <= nelts: + let range = cols - cols % nelts_list[z] + while j < range: + let a = A.simd_load[nelts_list[z]](j) - @unroll - for k in range(n): - tmp[k].accumulate(a * B[k].load[width=_nelts](row_offset + j)) + @parameter + fn _multiply_tail[k: Int](): + tmp[k].simd_store[nelts_list[z]]( + 0, + tmp[k].simd_load[nelts_list[z]](0) + + a * B[k].simd_load[nelts_list[z]](row_offset + j), + ) - vectorize[dot, nelts](cols) + unroll[n, _multiply_tail]() + j += nelts_list[z] - @unroll - for k in range(n): - C[k].store(i, tmp[k].total()) + unroll[len(nelts_list), dot]() + + @parameter + fn _reduce[k: Int](): + C[k].store(i, tmp[k].simd_load[nelts](0).reduce_add()) + + unroll[n, _reduce]() parallelize[compute_row](rows, workers) @@ -501,9 +622,9 @@ fn matmul(C: TensorF32, A: TensorF32, B: TensorF32) raises: # B (d,n) @ A (n,) -> C (d,) matmul_dimension_checks(A.shape(), B.shape()) batch_matmul[1]( - StaticTuple[BufferPtrFloat32, 1](C.data()), + StaticTuple[1, BufferPtrFloat32](C.data()), A.data(), - StaticTuple[BufferPtrFloat32, 1](B.data()), + StaticTuple[1, BufferPtrFloat32](B.data()), B.dim(0), B.dim(1), ) @@ -514,9 +635,9 @@ fn matmul(C: TensorF32, A: TensorF32, B: TensorSlice) raises: # B (d,n) @ A (n,) -> C (d,) matmul_dimension_checks(A.shape(), B.shape()) batch_matmul[1]( - StaticTuple[BufferPtrFloat32, 1](C.data()), + StaticTuple[1, BufferPtrFloat32](C.data()), A.data(), - StaticTuple[BufferPtrFloat32, 1](B.data()), + StaticTuple[1, BufferPtrFloat32](B.data()), B.dim(0), B.dim(1), ) @@ -527,11 +648,11 @@ fn matmul(C: TensorSlice, A: TensorF32, B: TensorSlice) raises: # B (d,n) @ A (n,) -> C (d,) matmul_dimension_checks(A.shape(), B.shape()) batch_matmul[1]( - StaticTuple[BufferPtrFloat32, 1]( + StaticTuple[1, BufferPtrFloat32]( C.data(), ), A.data(), - StaticTuple[BufferPtrFloat32, 1](B.data()), + StaticTuple[1, BufferPtrFloat32](B.data()), B.dim(0), B.dim(1), ) @@ -540,8 +661,7 @@ fn matmul(C: TensorSlice, A: TensorF32, B: TensorSlice) raises: fn matmul_dimension_checks(a: TensorShape, b: TensorShape) raises: if a[0] != b[1]: raise Error( - "matmul dimension mismatch. A rows (dim 0) not equal to B columns" - " (dim 1)" + "matmul dimension mismatch. A rows (dim 0) not equal to B columns (dim 1)" ) if b.rank() != 2: raise Error("matmul expects B to be a 2D matrix") @@ -557,22 +677,22 @@ fn rope_rotation_llama( config: Config, ) -> None: # stories model, llama2 - var head_size = config.head_size + let head_size = config.head_size @parameter fn head_loop(i: Int): # Simple vectorization with (head_size // 2) steps gave junk transformer output. # Maybe because the nelt ranges end up overlapping between the steps. for j in range(0, config.head_size, 2): - var fcr = freq_cis_real_row[j // 2] - var fci = freq_cis_imag_row[j // 2] - var q0 = state.q[i * head_size + j] - var q1 = state.q[i * head_size + j + 1] + let fcr = freq_cis_real_row[j // 2] + let fci = freq_cis_imag_row[j // 2] + let q0 = state.q[i * head_size + j] + let q1 = state.q[i * head_size + j + 1] state.q[i * head_size + j] = q0 * fcr - q1 * fci state.q[i * head_size + j + 1] = q0 * fci + q1 * fcr if i < config.n_kv_heads: - var k0 = state.k[i * head_size + j] - var k1 = state.k[i * head_size + j + 1] + let k0 = state.k[i * head_size + j] + let k1 = state.k[i * head_size + j + 1] state.k[i * head_size + j] = k0 * fcr - k1 * fci state.k[i * head_size + j + 1] = k0 * fci + k1 * fcr @@ -588,35 +708,35 @@ fn transformer( weights: TransformerWeights, ) raises -> None: # A few convenience variables - var dim = config.dim - var hidden_dim = config.hidden_dim - var head_size = config.head_size - var kv_dim = config.kv_dim - var kv_mul = config.kv_mul + let dim = config.dim + let hidden_dim = config.hidden_dim + let head_size = config.head_size + let kv_dim = config.kv_dim + let kv_mul = config.kv_mul # Copy the token embedding into x - var content_row = weights.token_embedding_table.data().offset(token * dim) - memcpy(state.x.data(), content_row, dim) + let content_row = weights.token_embedding_table.data().offset(token * dim) + memcpy[DType.float32](state.x.data(), content_row, dim) # Pluck out the "pos" row of freq_cis_real and freq_cis_imag - var freq_cis_real_row = TensorSlice(weights.freq_cis_real, pos) - var freq_cis_imag_row = TensorSlice(weights.freq_cis_imag, pos) + let freq_cis_real_row = TensorSlice(weights.freq_cis_real, pos) + let freq_cis_imag_row = TensorSlice(weights.freq_cis_imag, pos) # Forward all the layers for l in range(config.n_layers): # Attention rmsnorm rmsnorm(state.xb, state.x, TensorSlice(weights.rms_att_weight, l)) # QKV matmuls for this position - var loff = l * config.seq_len * config.kv_dim + let loff = l * config.seq_len * config.kv_dim state.k = TensorSlice(state.key_cache, l, pos) state.v = TensorSlice(state.value_cache, l, pos) if kv_dim == dim: batch_matmul[3]( - StaticTuple[BufferPtrFloat32, 3]( + StaticTuple[3, BufferPtrFloat32]( state.q.data(), state.k.data(), state.v.data() ), state.xb.data(), - StaticTuple[BufferPtrFloat32, 3]( + StaticTuple[3, BufferPtrFloat32]( TensorSlice(weights.wq, l).data(), TensorSlice(weights.wk, l).data(), TensorSlice(weights.wv, l).data(), @@ -627,13 +747,10 @@ fn transformer( else: matmul(state.q, state.xb, TensorSlice(weights.wq, l)) batch_matmul[2]( - StaticTuple[BufferPtrFloat32, 2]( - state.k.data(), state.v.data() - ), + StaticTuple[2, BufferPtrFloat32](state.k.data(), state.v.data()), state.xb.data(), - StaticTuple[BufferPtrFloat32, 2]( - TensorSlice(weights.wk, l).data(), - TensorSlice(weights.wv, l).data(), + StaticTuple[2, BufferPtrFloat32]( + TensorSlice(weights.wk, l).data(), TensorSlice(weights.wv, l).data() ), kv_dim, dim, @@ -648,26 +765,26 @@ fn transformer( @parameter fn loop_over_heads(h: Int): # Get the query vector for this head - var q_offset = h * head_size + let q_offset = h * head_size # Index of attention scores for this head - var att_offset = h * config.seq_len + let att_offset = h * config.seq_len # Iterate over all timesteps, including the current one for t in range(pos + 1): # Starting index of the key vector for this head and at this timestep - var k_offset = loff + t * kv_dim + (h // kv_mul) * head_size + let k_offset = loff + t * kv_dim + (h // kv_mul) * head_size # Calculate the attention score as the dot product of q and k var score: Float32 = 0.0 @parameter fn score_fn[_nelts: Int](i: Int): score += ( - state.q.load[width=_nelts](q_offset + i) - * state.key_cache.load[width=_nelts](k_offset + i) + state.q.simd_load[_nelts](q_offset + i) + * state.key_cache.simd_load[_nelts](k_offset + i) ).reduce_add() - vectorize[score_fn, nelts](head_size) + vectorize[nelts, score_fn](head_size) score /= math.sqrt[DType.float32, 1](head_size) # Save the score to the attention buffer @@ -676,39 +793,38 @@ fn transformer( # Softmax the scores to get attention weights, from 0..pos inclusively softmax(state.att, att_offset, att_offset + pos + 1) # Weighted sum of the values, store back into xb - var xb_offset = h * head_size + let xb_offset = h * head_size for t in range(pos + 1): # Starting index of the value vector for this head and at this timestep - var v_offset = loff + t * kv_dim + (h // kv_mul) * head_size + let v_offset = loff + t * kv_dim + (h // kv_mul) * head_size # Get the attention weight for this timestep - var a = state.att[att_offset + t] + let a = state.att[att_offset + t] # Accumulate the weighted value into xb @parameter fn xb_accumulate[_nelts: Int](i: Int): - var xbi = state.xb.load[width=_nelts]( + let xbi = state.xb.simd_load[_nelts]( xb_offset + i - ) + a * state.value_cache.load[width=_nelts](v_offset + i) - state.xb.store[width=_nelts](xb_offset + i, xbi) + ) + a * state.value_cache.simd_load[_nelts](v_offset + i) + state.xb.simd_store[_nelts](xb_offset + i, xbi) - vectorize[xb_accumulate, nelts](head_size) + vectorize[nelts, xb_accumulate](head_size) parallelize[loop_over_heads](config.n_heads, workers) # Final matrix multiplication to get the output of the attention matmul(state.xb2, state.xb, TensorSlice(weights.wo, l)) # Residual connection back into x - state.x = state.x + state.xb2 + accum(state.x, state.xb2) # FFN rmsnorm rmsnorm(state.xb, state.x, TensorSlice(weights.rms_ffn_weight, l)) # Calculate self.w1(x) and self.w3(x) for FFN batch_matmul[2]( - StaticTuple[BufferPtrFloat32, 2](state.hb.data(), state.hb2.data()), + StaticTuple[2, BufferPtrFloat32](state.hb.data(), state.hb2.data()), state.xb.data(), - StaticTuple[BufferPtrFloat32, 2]( - TensorSlice(weights.w1, l).data(), - TensorSlice(weights.w3, l).data(), + StaticTuple[2, BufferPtrFloat32]( + TensorSlice(weights.w1, l).data(), TensorSlice(weights.w3, l).data() ), hidden_dim, dim, @@ -716,20 +832,18 @@ fn transformer( @parameter fn silu[_nelts: Int](i: Int): - var initial_hb = state.hb.load[width=_nelts](i) + let initial_hb = state.hb.simd_load[_nelts](i) # Apply SiLU activation function (silu(x) = x * sigmoid(x)) - var hbi = initial_hb * (1.0 / (1.0 + math.exp(-initial_hb))) + let hbi = initial_hb * (1.0 / (1.0 + math.exp(-initial_hb))) # Elementwise multiply with w3(x) - state.hb.store[width=_nelts]( - i, hbi * state.hb2.load[width=_nelts](i) - ) + state.hb.simd_store[_nelts](i, hbi * state.hb2.simd_load[_nelts](i)) - vectorize[silu, nelts](hidden_dim) + vectorize[nelts, silu](hidden_dim) # Final matrix multiplication to get the output of the FFN matmul(state.xb, state.hb, TensorSlice(weights.w2, l)) # Residual connection - state.x = state.x + state.xb + accum(state.x, state.xb) # Final rmsnorm rmsnorm(state.x, state.x, weights.rms_final_weight) @@ -738,11 +852,22 @@ fn transformer( matmul(state.logits, state.x, weights.wcls) +fn argmax(v: TensorF32) -> Int: + # return argmax of v + var max_i: Int = 0 + var max_p: Float32 = v[0] + for i in range(v.dim(0)): + if v[i] > max_p: + max_i = i + max_p = v[i] + return max_i + + fn sample(probabilities: TensorF32) -> Int: - var n = probabilities.dim(0) + let n = probabilities.dim(0) # Sample index from probabilities, they must sum to 1 # get random value within (min, max) float32 range - var r = rand[DType.float32](1) + let r = rand[DType.float32](1) var cdf: Float32 = 0.0 for i in range(n): cdf += probabilities[i] @@ -751,14 +876,14 @@ fn sample(probabilities: TensorF32) -> Int: return n - 1 # In case of rounding errors -fn bpe_encode(inout tokens: List[Int], text: String, tok: Tokenizer): +fn bpe_encode(inout tokens: DynamicVector[Int], text: String, inout tok: Tokenizer): for pos in range(len(text)): - var char = text[pos] - var id = tok.find(char) + let char = str_to_ptr(text[pos]) + let id = tok.find(char) if id == -1: print("Not a good prompt token at pos ", pos) return - tokens.append(id) + tokens.push_back(id) while True: var best_score = Float32(-1e10) @@ -767,10 +892,10 @@ fn bpe_encode(inout tokens: List[Int], text: String, tok: Tokenizer): for i in range(len(tokens) - 1): # Check if we can merge the pair (tokens[i], tokens[i+1]) - var str = str_concat(tok.vocab[tokens[i]], tok.vocab[tokens[i + 1]]) - var id = tok.find(str) - if id != -1 and tok.vocab_scores[id] > best_score: - best_score = tok.vocab_scores[id] + let str = str_concat(tok.vocab[tokens[i]], tok.vocab[tokens[i + 1]]) + let id = tok.find(str) + if id != -1 and tok.vocab_scores.load(id) > best_score: + best_score = tok.vocab_scores.load(id) best_id = id best_idx = i @@ -781,12 +906,33 @@ fn bpe_encode(inout tokens: List[Int], text: String, tok: Tokenizer): # Merge the consecutive pair (best_idx, best_idx+1) into new token best_id tokens[best_idx] = best_id # Delete token at position best_idx+1, shift the entire sequence back 1 - var _tokens = List[Int]() + var _tokens = DynamicVector[Int]() for i in range(0, best_idx + 1): - _tokens.append(tokens[i]) + _tokens.push_back(tokens[i]) for i in range(best_idx + 2, len(tokens)): - _tokens.append(tokens[i]) - tokens = _tokens^ + _tokens.push_back(tokens[i]) + tokens = _tokens + + +fn str2num(d: Int) -> Int: + # covert Hex to decimal + if d >= ord("A"): + return d - ord("A") + 10 + return d - ord("0") + + +fn print_str(s: PointerString): + # print raw byte like <0x0A> + if (s[1].to_int() == ord("0")) and (s[2].to_int() == ord("x")): + let d1: Int = s[3].to_int() + let d2: Int = s[4].to_int() + print_no_newline(chr(str2num(d1) * 16 + str2num(d2))) + return + # print all chars till null character + var p: Int = 0 + while s[p].to_int() != 0: + print_no_newline(chr(s[p].to_int())) + p += 1 fn time_in_ms() -> Int: @@ -797,33 +943,30 @@ fn time_in_ms() -> Int: fn print_usage(): print("Usage: mojo llama2.mojo [options]") print( - 'Example: mojo llama2.mojo stories15M.bin -s 99 -n 256 -t 0.5 -i "Llama' - ' is an animal"' + 'Example: mojo llama2.mojo stories15M.bin -s 99 -n 256 -t 0.5 -i "Llama is an' + ' animal"' ) print("Options:") print(" -s random seed, default time.now()") print(" -t temperature in [0,1.0], default 1.0") - print( - " -n number of steps to run for, default 256. 0 = max_seq_len" - ) + print(" -n number of steps to run for, default 256. 0 = max_seq_len") print(" -i input prompt") print(" -z tokenizer path") print(" -j number of workers to use, default num_cores()") fn main() raises: - workers = num_performance_cores() + workers = num_cores() var tokenizer = StringRef("tokenizer.bin") var checkpoint = StringRef("stories15M.bin") var temperature = 0.9 var steps = 256 var prompt = String("") var rng_seed: Int = time.now() - var print_config = 0 @parameter fn argparse() raises -> Int: - var args = argv() + let args = argv() if len(args) < 2: return 0 checkpoint = args[1] @@ -840,10 +983,8 @@ fn main() raises: prompt = args[i + 1] if args[i] == "-j": workers = atol(args[i + 1]) - if args[i] == "-pc": - print_config = atol(args[i + 1]) if args[i] == "-t": - var val = args[i + 1] + let val = args[i + 1] temperature = 0.0 # hacky parse float, keep only 1 digit for c in range(0, len(val)): @@ -857,23 +998,43 @@ fn main() raises: return 0 return 1 - var res = argparse() + let res = argparse() if res == 0: print_usage() return print("num parallel workers:", workers, " SIMD width:", nelts) random.seed(rng_seed) - var config = Config(checkpoint, print_config == 1) - var weights = TransformerWeights(checkpoint, config) + var fbuf: FileBuf = FileBuf() + var tbuf: FileBuf = FileBuf() + var config: Config = Config() + + read_file(checkpoint, fbuf) + config_init(config, fbuf) + + # negative vocab size is hacky way of signaling unshared weights. bit yikes. + let shared_weights = 1 if config.vocab_size > 0 else 0 + config.vocab_size = ( + -config.vocab_size if config.vocab_size < 0 else config.vocab_size + ) + + let weights: TransformerWeights = TransformerWeights(config, shared_weights, fbuf) if steps <= 0 or steps > config.seq_len: steps = config.seq_len - var tok = Tokenizer(config.vocab_size, tokenizer) + # Read in the tokenizer.bin file + read_file(tokenizer, tbuf) + var tok = Tokenizer(config.vocab_size, tbuf) + # print the layers number and vocab size print( - "n layers:", + "checkpoint size: ", + fbuf.size, + "[", + fbuf.size // 1024 // 1024, + "MB ]", + "| n layers:", config.n_layers, "| vocab size:", tok.vocab_size, @@ -883,7 +1044,7 @@ fn main() raises: var state = RunState(config) # Process the prompt, if any - var prompt_tokens = List[Int]() + var prompt_tokens = DynamicVector[Int]() if prompt: bpe_encode(prompt_tokens, prompt, tok) @@ -906,7 +1067,7 @@ fn main() raises: # Sample the next token if temperature == 0.0: # Greedy argmax sampling: take the token with the highest probability - next_token = int(state.logits.argmax()[0]) + next_token = argmax(state.logits) else: # Apply the temperature to the logits for q in range(config.vocab_size): @@ -920,11 +1081,11 @@ fn main() raises: # Finish generating when EOS, BOS appear if next_token == 1 or next_token == 2: break - var token_str: String = tok.vocab[next_token] - if token == 1 and token_str._buffer[0] == ord(" "): - token_str = token_str[1:] + var token_str: PointerString = tok.vocab[next_token] + if token == 1 and token_str[0] == ord(" "): + token_str = token_str.offset(1) - print(token_str, end="") + print_str(token_str) # Advance forward token = next_token @@ -933,5 +1094,5 @@ fn main() raises: if start == 0: start = time_in_ms() - var end = time_in_ms() + let end = time_in_ms() print("\nachieved tok/s: ", (pos - 1) / (end - start) * 1000)