From bf65f31a402506328c771b60bee1d1b01e797131 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 05:23:13 +0000 Subject: [PATCH 001/193] Bump thiserror from 1.0.63 to 1.0.64 Bumps [thiserror](https://github.com/dtolnay/thiserror) from 1.0.63 to 1.0.64. - [Release notes](https://github.com/dtolnay/thiserror/releases) - [Commits](https://github.com/dtolnay/thiserror/compare/1.0.63...1.0.64) --- updated-dependencies: - dependency-name: thiserror dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3532176a..1f724a85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1133,18 +1133,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", From d2f482c4b2d3b1ec525dae6b993e7f0a923766a6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 00:05:15 +0000 Subject: [PATCH 002/193] Bump docker/setup-buildx-action from 3.6.1 to 3.7.1 Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3.6.1 to 3.7.1. - [Release notes](https://github.com/docker/setup-buildx-action/releases) - [Commits](https://github.com/docker/setup-buildx-action/compare/988b5a0280414f521da01fcc63a27aeeb4b104db...c47758b77c9736f4b2ef4073d4d51994fabfe349) --- updated-dependencies: - dependency-name: docker/setup-buildx-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/llama-cpp-rs-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index 9e6fe4b8..4f439879 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -49,7 +49,7 @@ jobs: with: platforms: arm64,amd64 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@988b5a0280414f521da01fcc63a27aeeb4b104db + uses: docker/setup-buildx-action@c47758b77c9736f4b2ef4073d4d51994fabfe349 - name: Build uses: docker/build-push-action@v6 with: From 48d81c2d24a15aa2ab2cb058c6b71dd7a0b61e17 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 05:08:23 +0000 Subject: [PATCH 003/193] Bump actions/checkout from 4.2.1 to 4.2.2 Bumps [actions/checkout](https://github.com/actions/checkout) from 4.2.1 to 4.2.2. 
- [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871...11bd71901bbe5b1630ceea73d27597364c9af683) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/llama-cpp-rs-check.yml | 8 ++++---- .github/workflows/publish-upon-release.yml | 2 +- .github/workflows/update-llama-cpp.yml | 2 +- .github/workflows/update-toml-version.yaml | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index 9e6fe4b8..124adebd 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 with: submodules: recursive - name: Install Compile Deps @@ -43,7 +43,7 @@ jobs: target: [ linux/arm64, linux/amd64 ] steps: - name: checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - name: Setup QEMU uses: docker/setup-qemu-action@49b3bc8e6bdd4a60e6116a5414239cba5943d3cf with: @@ -61,7 +61,7 @@ jobs: runs-on: macos-latest steps: - name: checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 with: submodules: recursive - name: Setup Rust @@ -73,7 +73,7 @@ jobs: runs-on: windows-latest steps: - name: checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 with: submodules: recursive - name: Setup Rust diff --git a/.github/workflows/publish-upon-release.yml b/.github/workflows/publish-upon-release.yml index 1e3cc18b..b470e3f1 100644 --- a/.github/workflows/publish-upon-release.yml +++ b/.github/workflows/publish-upon-release.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 with: submodules: recursive - name: Publish crates for llama-cpp-sys-2 diff --git a/.github/workflows/update-llama-cpp.yml b/.github/workflows/update-llama-cpp.yml index 48e83f7e..230b3ee6 100644 --- a/.github/workflows/update-llama-cpp.yml +++ b/.github/workflows/update-llama-cpp.yml @@ -15,7 +15,7 @@ jobs: steps: - name: Set date run: echo "DATE=$(date -I)" >> $GITHUB_ENV - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 name: Checkout latest with: submodules: recursive diff --git a/.github/workflows/update-toml-version.yaml b/.github/workflows/update-toml-version.yaml index f7446d3e..5055e8ba 100644 --- a/.github/workflows/update-toml-version.yaml +++ b/.github/workflows/update-toml-version.yaml @@ -15,7 +15,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 with: submodules: recursive From 6638c432024eb59d60e2a9a2e7696638e8389b46 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 05:23:32 +0000 Subject: [PATCH 004/193] Bump anyhow from 1.0.86 to 1.0.91 Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.86 to 1.0.91. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.86...1.0.91) --- updated-dependencies: - dependency-name: anyhow dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 899deed9..1cc45385 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,9 +68,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.86" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +checksum = "c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" [[package]] name = "base64" diff --git a/Cargo.toml b/Cargo.toml index b7abe72e..545463f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.4" cc = "1.1.28" -anyhow = "1.0.86" +anyhow = "1.0.91" clap = "4.5.19" encoding_rs = "0.8.34" From d9475ccc60813d16b704febe433e784cf46e5b75 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Mon, 28 Oct 2024 16:08:18 +0100 Subject: [PATCH 005/193] also search in lib64 --- llama-cpp-sys-2/build.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 33b0ee19..fe3d8aa9 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -65,7 +65,7 @@ fn extract_lib_names(out_dir: &Path, build_shared_libs: bool) -> Vec { "*.a" } }; - let libs_dir = out_dir.join("lib"); + let libs_dir = out_dir.join("lib*"); let pattern = libs_dir.join(lib_pattern); debug_log!("Extract libs {}", pattern.display()); @@ -265,11 +265,13 @@ fn main() { // Search paths println!("cargo:rustc-link-search={}", out_dir.join("lib").display()); + println!("cargo:rustc-link-search={}", out_dir.join("lib64").display()); println!("cargo:rustc-link-search={}", build_dir.display()); // Link libraries let llama_libs_kind = if build_shared_libs { "dylib" } else { "static" }; let llama_libs = extract_lib_names(&out_dir, build_shared_libs); + assert_ne!(llama_libs.len(), 0); for lib in llama_libs { debug_log!( @@ -349,4 +351,4 @@ fn main() { } } } -} \ No newline at end of file +} From 35f570d9a09ed0bc116667690122f5d642d8fbb6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:43:16 +0000 Subject: [PATCH 006/193] Bump bindgen from 0.69.4 to 0.69.5 Bumps [bindgen](https://github.com/rust-lang/rust-bindgen) from 0.69.4 to 0.69.5. - [Release notes](https://github.com/rust-lang/rust-bindgen/releases) - [Changelog](https://github.com/rust-lang/rust-bindgen/blob/v0.69.5/CHANGELOG.md) - [Commits](https://github.com/rust-lang/rust-bindgen/compare/v0.69.4...v0.69.5) --- updated-dependencies: - dependency-name: bindgen dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1cc45385..e39a35e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,9 +80,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bindgen" -version = "0.69.4" +version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ "bitflags", "cexpr", diff --git a/Cargo.toml b/Cargo.toml index 545463f2..dc201d11 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ tracing = "0.1" hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" -bindgen = "0.69.4" +bindgen = "0.69.5" cc = "1.1.28" anyhow = "1.0.91" clap = "4.5.19" From 3a7f9a8582819575fd455a4be6849a41a80c307b Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 29 Oct 2024 16:45:19 +0000 Subject: [PATCH 007/193] Bump version to 0.1.84 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1cc45385..eea8457c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.83" +version = "0.1.84" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.83" +version = "0.1.84" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.83" +version = "0.1.84" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.83" +version = "0.1.84" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 3815fed7..44f9c1ba 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.83" +version = "0.1.84" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index e14648ad..1d27ef3f 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.83" +version = "0.1.84" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 7f18b2df..6f2c5f5e 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.83" +version = "0.1.84" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 5e26631a..ec7285f0 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = 
"0.1.83" +version = "0.1.84" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 602eae9f2a7a201f2d931402e27cae83553c6a1c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 05:28:36 +0000 Subject: [PATCH 008/193] chore(deps): bump anyhow from 1.0.91 to 1.0.93 Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.91 to 1.0.93. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.91...1.0.93) --- updated-dependencies: - dependency-name: anyhow dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1cc45385..8330dcdb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,9 +68,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.91" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "base64" diff --git a/Cargo.toml b/Cargo.toml index 545463f2..6bf5a1d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.4" cc = "1.1.28" -anyhow = "1.0.91" +anyhow = "1.0.93" clap = "4.5.19" encoding_rs = "0.8.34" From cb0ecd90ff696ddc3fe13edd270bd3f0137885fb Mon Sep 17 00:00:00 2001 From: Lou Ting Date: Thu, 21 Nov 2024 11:25:51 +0800 Subject: [PATCH 009/193] wrap llama_batch_get_one --- llama-cpp-2/src/llama_batch.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/llama-cpp-2/src/llama_batch.rs b/llama-cpp-2/src/llama_batch.rs index e52bfa9e..3efb7965 100644 --- a/llama-cpp-2/src/llama_batch.rs +++ b/llama-cpp-2/src/llama_batch.rs @@ -149,6 +149,25 @@ impl LlamaBatch { } } + /// llama_batch_get_one + /// Return batch for single sequence of tokens starting at pos_0 + /// + /// NOTE: this is a helper function to facilitate transition to the new batch API + /// + pub fn get_one(tokens: &[LlamaToken], pos_0: llama_pos, seq_id: llama_seq_id) -> Self { + unsafe { + let ptr = tokens.as_ptr() as *mut i32; + let batch = + llama_cpp_sys_2::llama_batch_get_one(ptr, tokens.len() as i32, pos_0, seq_id); + + crate::llama_batch::LlamaBatch { + allocated: 0, + initialized_logits: vec![], + llama_batch: batch, + } + } + } + /// Returns the number of tokens in the batch. #[must_use] pub fn n_tokens(&self) -> i32 { @@ -170,7 +189,9 @@ impl Drop for LlamaBatch { /// # } fn drop(&mut self) { unsafe { - llama_batch_free(self.llama_batch); + if self.allocated > 0 { + llama_batch_free(self.llama_batch); + } } } } From 81c2b05d32fd7703f40c483c316c02b8463283ec Mon Sep 17 00:00:00 2001 From: Lou Ting Date: Tue, 26 Nov 2024 16:25:45 +0800 Subject: [PATCH 010/193] get_one --- llama-cpp-2/src/llama_batch.rs | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/llama-cpp-2/src/llama_batch.rs b/llama-cpp-2/src/llama_batch.rs index 3efb7965..31fb9d54 100644 --- a/llama-cpp-2/src/llama_batch.rs +++ b/llama-cpp-2/src/llama_batch.rs @@ -1,6 +1,6 @@ //! Safe wrapper around `llama_batch`. 
-use crate::token::LlamaToken; +use crate::token::{self, LlamaToken}; use llama_cpp_sys_2::{llama_batch, llama_batch_free, llama_batch_init, llama_pos, llama_seq_id}; /// A safe wrapper around `llama_batch`. @@ -20,6 +20,9 @@ pub enum BatchAddError { /// There was not enough space in the batch to add the token. #[error("Insufficient Space of {0}")] InsufficientSpace(usize), + /// Empty buffer is provided for get_one + #[error("Empty buffer")] + EmptyBuffer, } impl LlamaBatch { @@ -154,18 +157,24 @@ impl LlamaBatch { /// /// NOTE: this is a helper function to facilitate transition to the new batch API /// - pub fn get_one(tokens: &[LlamaToken], pos_0: llama_pos, seq_id: llama_seq_id) -> Self { - unsafe { - let ptr = tokens.as_ptr() as *mut i32; - let batch = - llama_cpp_sys_2::llama_batch_get_one(ptr, tokens.len() as i32, pos_0, seq_id); - - crate::llama_batch::LlamaBatch { - allocated: 0, - initialized_logits: vec![], - llama_batch: batch, - } + pub fn get_one( + tokens: &[LlamaToken], + pos_0: llama_pos, + seq_id: llama_seq_id, + ) -> Result { + if tokens.is_empty() { + return Err(BatchAddError::EmptyBuffer); } + let batch = unsafe { + let ptr = tokens.as_ptr() as *mut i32; + llama_cpp_sys_2::llama_batch_get_one(ptr, tokens.len() as i32, pos_0, seq_id) + }; + let batch = Self { + allocated: 0, + initialized_logits: vec![(tokens.len() - 1) as i32], + llama_batch: batch, + }; + Ok(batch) } /// Returns the number of tokens in the batch. From 2822e3ae86a66b571cc6de371feefb938f4f1da7 Mon Sep 17 00:00:00 2001 From: Lou Ting Date: Tue, 26 Nov 2024 16:27:53 +0800 Subject: [PATCH 011/193] fmt --- llama-cpp-2/src/llama_batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-2/src/llama_batch.rs b/llama-cpp-2/src/llama_batch.rs index 31fb9d54..8a2fd376 100644 --- a/llama-cpp-2/src/llama_batch.rs +++ b/llama-cpp-2/src/llama_batch.rs @@ -1,6 +1,6 @@ //! Safe wrapper around `llama_batch`. -use crate::token::{self, LlamaToken}; +use crate::token::LlamaToken; use llama_cpp_sys_2::{llama_batch, llama_batch_free, llama_batch_init, llama_pos, llama_seq_id}; /// A safe wrapper around `llama_batch`. 
From 05033098eeb02f98ca12b714ba1c51ef60127a15 Mon Sep 17 00:00:00 2001 From: volesen Date: Tue, 26 Nov 2024 14:16:36 +0100 Subject: [PATCH 012/193] Bump llama.cpp to nearest breaking change (Sampling API) --- llama-cpp-sys-2/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 8f1d81a0..df270ef7 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 8f1d81a0b6f50b9bad72db0b6fcd299ad9ecd48c +Subproject commit df270ef74596da8f1178f08991f4c51f18c9ee82 From 6901d12b4785d5df0ea4eb65c1c8b4b8f96dfafe Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Tue, 26 Nov 2024 14:45:34 +0100 Subject: [PATCH 013/193] remove old/deprecated code --- llama-cpp-2/src/context.rs | 13 - llama-cpp-2/src/context/params.rs | 31 -- llama-cpp-2/src/context/sample.rs | 141 -------- llama-cpp-2/src/grammar.rs | 491 ---------------------------- llama-cpp-2/src/lib.rs | 2 - llama-cpp-2/src/timing.rs | 183 ----------- llama-cpp-2/src/token/data_array.rs | 398 ---------------------- 7 files changed, 1259 deletions(-) delete mode 100644 llama-cpp-2/src/context/sample.rs delete mode 100644 llama-cpp-2/src/grammar.rs delete mode 100644 llama-cpp-2/src/timing.rs diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index 80ee8f75..91b7926c 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -7,7 +7,6 @@ use std::slice; use crate::llama_batch::LlamaBatch; use crate::model::{LlamaLoraAdapter, LlamaModel}; -use crate::timing::LlamaTimings; use crate::token::data::LlamaTokenData; use crate::token::LlamaToken; use crate::{ @@ -17,7 +16,6 @@ use crate::{ pub mod kv_cache; pub mod params; -pub mod sample; pub mod session; /// Safe wrapper around `llama_context`. @@ -265,17 +263,6 @@ impl<'model> LlamaContext<'model> { unsafe { slice::from_raw_parts(data, len) } } - /// Reset the timings for the context. - pub fn reset_timings(&mut self) { - unsafe { llama_cpp_sys_2::llama_reset_timings(self.context.as_ptr()) } - } - - /// Returns the timings for the context. - pub fn timings(&mut self) -> LlamaTimings { - let timings = unsafe { llama_cpp_sys_2::llama_get_timings(self.context.as_ptr()) }; - LlamaTimings { timings } - } - /// Sets a lora adapter. 
/// /// # Errors diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs index 14eca8b0..3de7ba53 100644 --- a/llama-cpp-2/src/context/params.rs +++ b/llama-cpp-2/src/context/params.rs @@ -116,37 +116,6 @@ unsafe impl Send for LlamaContextParams {} unsafe impl Sync for LlamaContextParams {} impl LlamaContextParams { - /// Set the seed of the context - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_2::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// let params = params.with_seed(1234); - /// assert_eq!(params.seed(), 1234); - /// ``` - #[must_use] - pub fn with_seed(mut self, seed: u32) -> Self { - self.context_params.seed = seed; - self - } - - /// Get the seed of the context - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_2::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_seed(1234); - /// assert_eq!(params.seed(), 1234); - /// ``` - #[must_use] - pub fn seed(&self) -> u32 { - self.context_params.seed - } - /// Set the side of the context /// /// # Examples diff --git a/llama-cpp-2/src/context/sample.rs b/llama-cpp-2/src/context/sample.rs deleted file mode 100644 index cc0f85ee..00000000 --- a/llama-cpp-2/src/context/sample.rs +++ /dev/null @@ -1,141 +0,0 @@ -//! Sampling functions for the context. - -use crate::context::LlamaContext; -use crate::grammar::LlamaGrammar; -use crate::token::data_array::LlamaTokenDataArray; -use crate::token::LlamaToken; - -#[cfg(feature = "sampler")] -pub mod sampler; - -impl LlamaContext<'_> { - /// Accept a token into the grammar. - pub fn grammar_accept_token(&mut self, grammar: &mut LlamaGrammar, token: LlamaToken) { - unsafe { - llama_cpp_sys_2::llama_grammar_accept_token( - grammar.grammar.as_ptr(), - self.context.as_ptr(), - token.0, - ); - } - } - - /// Perform grammar sampling. - pub fn sample_grammar( - &mut self, - llama_token_data_array: &mut LlamaTokenDataArray, - llama_grammar: &LlamaGrammar, - ) { - unsafe { - llama_token_data_array.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_grammar( - self.context.as_ptr(), - c_llama_token_data_array, - llama_grammar.grammar.as_ptr(), - ); - }); - } - } - - /// See [`LlamaTokenDataArray::sample_temp`] - pub fn sample_temp(&mut self, token_data: &mut LlamaTokenDataArray, temperature: f32) { - token_data.sample_temp(Some(self), temperature); - } - - /// Sample a token greedily. Note that this *does not* take into account anything that has modified the probabilities - it only looks at logits. - /// - /// Most of the time [`LlamaTokenDataArray::sample_softmax`] or [`LlamaTokenDataArray::sample_token`] should be used instead. 
- /// - /// # Panics - /// - /// - if `token_data` is empty - #[must_use] - pub fn sample_token_greedy(&mut self, mut token_data: LlamaTokenDataArray) -> LlamaToken { - assert!(!token_data.data.is_empty(), "no tokens"); - let mut data_arr = llama_cpp_sys_2::llama_token_data_array { - data: token_data - .data - .as_mut_ptr() - .cast::(), - size: token_data.data.len(), - sorted: token_data.sorted, - }; - let token = unsafe { - llama_cpp_sys_2::llama_sample_token_greedy( - self.context.as_ptr(), - std::ptr::addr_of_mut!(data_arr), - ) - }; - LlamaToken(token) - } - - /// See [`LlamaTokenDataArray::sample_tail_free`] - pub fn sample_tail_free( - &mut self, - token_data: &mut LlamaTokenDataArray, - z: f32, - min_keep: usize, - ) { - token_data.sample_tail_free(Some(self), z, min_keep); - } - - /// See [`LlamaTokenDataArray::sample_typical`] - pub fn sample_typical( - &mut self, - token_data: &mut LlamaTokenDataArray, - p: f32, - min_keep: usize, - ) { - token_data.sample_typical(Some(self), p, min_keep); - } - - /// See [`LlamaTokenDataArray::sample_top_p`] - pub fn sample_top_p(&mut self, token_data: &mut LlamaTokenDataArray, p: f32, min_keep: usize) { - token_data.sample_top_p(Some(self), p, min_keep); - } - - /// Minimum P sampling as described in [#3841](https://github.com/ggerganov/llama.cpp/pull/3841) - pub fn sample_min_p( - &mut self, - llama_token_data: &mut LlamaTokenDataArray, - p: f32, - min_keep: usize, - ) { - let ctx = self.context.as_ptr(); - unsafe { - llama_token_data.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_min_p(ctx, c_llama_token_data_array, p, min_keep); - }); - } - } - - /// See [`LlamaTokenDataArray::sample_top_k`] - pub fn sample_top_k(&mut self, token_data: &mut LlamaTokenDataArray, k: i32, min_keep: usize) { - token_data.sample_top_k(Some(self), k, min_keep); - } - - /// See [`LlamaTokenDataArray::sample_softmax`] - pub fn sample_token_softmax(&mut self, token_data: &mut LlamaTokenDataArray) { - token_data.sample_softmax(Some(self)); - } - - /// See [`LlamaTokenDataArray::sample_repetition_penalty`] - pub fn sample_repetition_penalty( - &mut self, - token_data: &mut LlamaTokenDataArray, - last_tokens: &[LlamaToken], - penalty_last_n: usize, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - ) { - token_data.sample_repetition_penalty( - Some(self), - last_tokens, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - ); - } -} diff --git a/llama-cpp-2/src/grammar.rs b/llama-cpp-2/src/grammar.rs deleted file mode 100644 index 667a870b..00000000 --- a/llama-cpp-2/src/grammar.rs +++ /dev/null @@ -1,491 +0,0 @@ -//! The grammar module contains the grammar parser and the grammar struct. -//! -//! This allows creating a llama-cpp grammar. This is essentially a translation of the parser in -//! `common` to rust - -use std::collections::BTreeMap; -use std::fmt::{Debug, Formatter}; - -use llama_cpp_sys_2::{llama_grammar, llama_grammar_element, llama_gretype}; -use std::ptr::NonNull; -use std::str::FromStr; -use tracing::error; - -/// Details of extraneous characters after a rule error. -#[derive(thiserror::Error, Debug)] -#[error("Extraneous chars after rule {name:?}: {chars:?}")] -pub struct ExtraneousCharsAfterRule { - /// The name of the rule being parsed - pub name: String, - /// the extraneous characters - pub chars: String, - /// the rest of the input, this is still to be parsed. - pub rest: String, -} - -/// There was an error parsing the grammar. 
-#[derive(thiserror::Error, Debug)] -#[allow(clippy::module_name_repetitions)] -pub enum GrammarParseError { - /// There was an unexpected end of input. - #[error("Unexpected end of input")] - UnexpectedEndOfInput { - /// the stage of parsing that was being performed when we ran out of input. - parse_stage: &'static str, - }, - /// There was unexpected characters after a rule name but before "::=". There can only be whitespace. - #[error("Unexpected Chars after name {name:?} and before \"::=\": {chars}")] - UnexpectedCharsAfterName { - /// the name of the rule being parsed - name: String, - /// the unexpected characters - chars: String, - }, - /// There was no "::=" after a rule name. - #[error("Expected ::= after name {name:?}")] - ExpectedEqualsAfterName { - /// the name of the rule being parsed - name: String, - }, - /// There was no closing bracket in a nested rule. - #[error("Expected closing bracket in nested rule {name:?}")] - MissingClosingBracketInNestedRule { - /// the name of the rule being parsed - name: String, - }, - /// There was no rule before a postfix operator. - #[error("Missing rule before postfix operator in {name:?}")] - ExpectedRuleBeforePostfixOperator { - /// the name of the rule being parsed - name: String, - }, - /// There was an incorrect hex size. - #[error("Expected hex number with size {expected_size}, but number was {actual:?}")] - IncorrectHexSize { - /// the expected size of the hex number - expected_size: usize, - /// the actual hex number - actual: String, - }, - /// An unknown escape character was found. - #[error("Unknown escape {escape:?}")] - UnknownEscape { - /// the unknown character - escape: char, - }, - /// Failed to parse hex from a string. - #[error("Failed to parse hex from {string}: {error}")] - ParseHexError { - /// the error that occurred when parsing the hex - #[source] - error: std::num::ParseIntError, - /// the string that was being parsed - string: String, - }, - /// there was not space after the name - // todo: is this actually an error? - #[error("Missing space after name in {rest:?}")] - MissingSpaceAfterName { - /// the rest of the input, this is still to be parsed. - rest: String, - }, - /// There was unexpected characters after the rule. - #[error("{0}")] - ExtraneousCharsAfterRule(ExtraneousCharsAfterRule), -} - -/// A grammar for llama-cpp. 
-#[allow(clippy::module_name_repetitions)] -pub struct LlamaGrammar { - parse: ParseState, - pub(crate) grammar: NonNull, -} - -impl Clone for LlamaGrammar { - fn clone(&self) -> Self { - let grammar = unsafe { llama_cpp_sys_2::llama_grammar_copy(self.grammar.as_ptr()) }; - Self { - parse: self.parse.clone(), - grammar: NonNull::new(grammar).expect("copied grammar should never be null"), - } - } -} - -unsafe impl Send for LlamaGrammar {} - -unsafe impl Sync for LlamaGrammar {} - -#[allow(clippy::module_name_repetitions)] -impl Debug for LlamaGrammar { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LlamaGrammar") - .field("grammar", &self.grammar) - .field("parse", &self.parse) - .finish() - } -} - -#[derive(Debug, Clone, PartialEq)] -struct ParseState { - symbol_ids: BTreeMap, - rules: Vec>, -} - -impl ParseState { - fn new() -> Self { - Self { - symbol_ids: BTreeMap::new(), - rules: Vec::new(), - } - } - - fn get_symbol_id(&mut self, name: &str) -> u32 { - let next_id = - u32::try_from(self.symbol_ids.len()).expect("too many rules (must fit into u32)"); - let result = self.symbol_ids.entry(name.to_string()).or_insert(next_id); - *result - } - - fn generate_symbol_id(&mut self, name: &str) -> u32 { - let next_id = - u32::try_from(self.symbol_ids.len()).expect("too many rules (must fit into u32)"); - let generated_name = format!("{name}_{next_id}"); - let None = self.symbol_ids.insert(generated_name, next_id) else { - panic!("Failed to create unique name for {name}"); - }; - next_id - } - - fn parse_rule<'a>(&mut self, rest: &'a str) -> Result, GrammarParseError> { - let rest = Self::consume_whitespace_and_comments(rest, true); - if rest.is_empty() { - return Ok(None); - } - let (name, rest) = Self::parse_name(rest)?; - let rest = rest.trim_start(); - let rule_id = self.get_symbol_id(name); - - let (after_name, rest) = - rest.split_once("::=") - .ok_or_else(|| GrammarParseError::ExpectedEqualsAfterName { - name: name.to_string(), - })?; - - if !after_name.is_empty() { - return Err(GrammarParseError::UnexpectedCharsAfterName { - name: name.to_string(), - chars: after_name.to_string(), - }); - } - - let rest = self.parse_alternatives(name, rule_id, rest, false)?; - - let Some((after_rule, rest)) = rest.split_once('\n') else { - return Ok(None); - }; - - if !after_rule.chars().all(char::is_whitespace) { - return Err(GrammarParseError::ExtraneousCharsAfterRule( - ExtraneousCharsAfterRule { - name: name.to_string(), - chars: after_rule.to_string(), - rest: rest.to_string(), - }, - )); - } - - Ok(Some(rest)) - } - - fn consume_whitespace_and_comments(mut rest: &str, allow_newlines: bool) -> &str { - loop { - rest = rest.trim_start_matches( - |c: char| if allow_newlines { true } else { c != '\n' } && c.is_whitespace(), - ); - if rest.starts_with('#') { - rest = rest.split_once('\n').map_or("", |(_comment, rest)| rest); - } else { - break; - } - } - rest - } - - fn parse_alternatives<'a>( - &mut self, - name: &str, - id: u32, - rest: &'a str, - nested: bool, - ) -> Result<&'a str, GrammarParseError> { - let mut rule = Vec::new(); - let rest = self.parse_sequence(rest.trim_start(), name, &mut rule, nested)?; - let mut rest = Self::consume_whitespace_and_comments(rest, nested); - while rest.starts_with('|') { - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_ALT, - value: 0, - }); - rest = Self::consume_whitespace_and_comments(&rest[1..], true); - rest = self.parse_sequence(rest, name, &mut rule, nested)?; - } - rule.push(llama_grammar_element 
{ - type_: llama_cpp_sys_2::LLAMA_GRETYPE_END, - value: 0, - }); - self.add_rule(id, rule); - Ok(rest) - } - - fn add_rule(&mut self, id: u32, rule: Vec) { - let id = id as usize; - if self.rules.len() <= id { - self.rules.resize(id + 1, Vec::new()); - } - self.rules[id] = rule; - } - - #[allow(clippy::too_many_lines)] - fn parse_sequence<'a>( - &mut self, - mut rest: &'a str, - name: &str, - rule: &mut Vec, - nested: bool, - ) -> Result<&'a str, GrammarParseError> { - let mut last_sym_start = rule.len(); - while !rest.is_empty() { - let first_char = - rest.chars() - .next() - .ok_or(GrammarParseError::UnexpectedEndOfInput { - parse_stage: "sequence", - })?; - if first_char == '"' { - rest = &rest[1..]; - last_sym_start = rule.len(); - while !rest.starts_with('"') { - let (c, r) = Self::parse_char(rest)?; - rest = r; - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_CHAR, - value: c as _, - }); - } - rest = Self::consume_whitespace_and_comments(&rest[1..], nested); - } else if first_char == '[' { - rest = &rest[1..]; - let start_type = if rest.starts_with('^') { - rest = &rest[1..]; - llama_cpp_sys_2::LLAMA_GRETYPE_CHAR_NOT - } else { - llama_cpp_sys_2::LLAMA_GRETYPE_CHAR - }; - last_sym_start = rule.len(); - while !rest.starts_with(']') { - let (c, r) = Self::parse_char(rest)?; - rest = r; - let gre_type = if last_sym_start < rule.len() { - llama_cpp_sys_2::LLAMA_GRETYPE_CHAR_ALT - } else { - start_type - }; - rule.push(llama_grammar_element { - type_: gre_type, - value: c as _, - }); - if rest.starts_with('-') && rest.get(1..).is_some_and(|r| !r.starts_with(']')) { - let (c, r) = Self::parse_char(&rest[1..])?; - rest = r; - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_CHAR_RNG_UPPER, - value: c as _, - }); - } - } - rest = Self::consume_whitespace_and_comments(&rest[1..], nested); - } else if first_char.is_alphabetic() { - let (name, r) = Self::parse_name(rest)?; - rest = Self::consume_whitespace_and_comments(r, nested); - let ref_rule_id = self.get_symbol_id(name); - last_sym_start = rule.len(); - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_RULE_REF, - value: ref_rule_id, - }); - } else if first_char == '(' { - rest = rest[1..].trim_start(); - let sub_rule_id = self.generate_symbol_id(name); - rest = self.parse_alternatives(name, sub_rule_id, rest, true)?; - last_sym_start = rule.len(); - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_RULE_REF, - value: sub_rule_id, - }); - if !rest.starts_with(')') { - return Err(GrammarParseError::MissingClosingBracketInNestedRule { - name: name.to_string(), - }); - } - rest = Self::consume_whitespace_and_comments(&rest[1..], nested); - } else if first_char == '*' || first_char == '+' || first_char == '?' 
{ - if last_sym_start == rule.len() { - return Err(GrammarParseError::ExpectedRuleBeforePostfixOperator { - name: name.to_string(), - }); - } - let sub_rule_id = self.generate_symbol_id(name); - let mut sub_rule: Vec = - rule.iter().skip(last_sym_start).copied().collect(); - if rest.starts_with(['*', '+']) { - sub_rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_RULE_REF, - value: sub_rule_id, - }); - } - sub_rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_ALT, - value: 0, - }); - if rest.starts_with('+') { - sub_rule.extend(rule.iter().skip(last_sym_start).copied()); - } - sub_rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_END, - value: 0, - }); - self.add_rule(sub_rule_id, sub_rule); - - rule.truncate(last_sym_start); - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_RULE_REF, - value: sub_rule_id, - }); - - rest = Self::consume_whitespace_and_comments(&rest[1..], nested); - } else { - break; - } - } - - Ok(rest) - } - - fn parse_hex(rest: &str, size: usize) -> Result<(llama_gretype, &str), GrammarParseError> { - if rest.len() < size { - return Err(GrammarParseError::IncorrectHexSize { - expected_size: size, - actual: rest.to_string(), - }); - } - - let (hex, rest) = rest.split_at(size); - let value = - u32::from_str_radix(hex, 16).map_err(|error| GrammarParseError::ParseHexError { - string: hex.to_string(), - error, - })?; - - Ok((value as llama_gretype, rest)) - } - - fn parse_char(rest: &str) -> Result<(llama_gretype, &str), GrammarParseError> { - if let Some(rest) = rest.strip_prefix('\\') { - let Some(escaped) = rest.chars().next() else { - return Err(GrammarParseError::UnexpectedEndOfInput { - parse_stage: "escape char", - }); - }; - let rest = &rest[escaped.len_utf8()..]; - match escaped { - 'x' => Self::parse_hex(rest, 2), - 'u' => Self::parse_hex(rest, 4), - 'U' => Self::parse_hex(rest, 8), - 't' => Ok((u32::from('\t') as llama_gretype, rest)), - 'r' => Ok((u32::from('\r') as llama_gretype, rest)), - 'n' => Ok((u32::from('\n') as llama_gretype, rest)), - '\\' => Ok((u32::from('\\') as llama_gretype, rest)), - '"' => Ok((u32::from('"') as llama_gretype, rest)), - '[' => Ok((u32::from('[') as llama_gretype, rest)), - ']' => Ok((u32::from(']') as llama_gretype, rest)), - c => Err(GrammarParseError::UnknownEscape { escape: c }), - } - } else if let Some(c) = rest.chars().next() { - Ok((u32::from(c) as llama_gretype, &rest[c.len_utf8()..])) - } else { - Err(GrammarParseError::UnexpectedEndOfInput { - parse_stage: "char", - }) - } - } - - fn parse_name(rest: &str) -> Result<(&str, &str), GrammarParseError> { - let name_end = rest - .find(|c: char| !c.is_alphanumeric() && c != '-' && c != '_') - .ok_or(GrammarParseError::MissingSpaceAfterName { - rest: rest.to_string(), - })?; - let name = &rest[..name_end]; - let rest = &rest[name_end..]; - Ok((name, rest)) - } -} - -/// An error that can occur creating a grammar from a string. -#[derive(thiserror::Error, Debug)] -pub enum LlamaGrammarFromStrError { - /// There was an error parsing the grammar. - #[error("Failed to parse grammar {0}")] - ParseError(#[from] GrammarParseError), - /// Llama-cpp returned null - this can occur for many reasons, but should ideally be caught on - /// the rust side beforehand. 
- #[error("llama-cpp returned null")] - LlamaCppNullError, -} - -impl FromStr for ParseState { - type Err = GrammarParseError; - - fn from_str(s: &str) -> Result { - let mut parse_state = ParseState::new(); - let mut remaining = Some(s); - while let Some(str) = remaining { - remaining = parse_state.parse_rule(str)?; - } - Ok(parse_state) - } -} - -impl FromStr for LlamaGrammar { - type Err = LlamaGrammarFromStrError; - - fn from_str(s: &str) -> Result { - let mut parse_state = ParseState::from_str(s)?; - - let n_rules = parse_state.rules.len(); - let root_id = parse_state.get_symbol_id("root"); - let mut vec = parse_state - .rules - .iter_mut() - .map(|v| v.as_ptr()) - .collect::>(); - let rules = vec.as_mut_ptr(); - - let grammar = - unsafe { llama_cpp_sys_2::llama_grammar_init(rules, n_rules, root_id as usize) }; - - Ok(Self { - parse: parse_state, - grammar: NonNull::new(grammar).ok_or(LlamaGrammarFromStrError::LlamaCppNullError)?, - }) - } -} - -impl Drop for LlamaGrammar { - fn drop(&mut self) { - unsafe { llama_cpp_sys_2::llama_grammar_free(self.grammar.as_ptr()) } - } -} - -#[cfg(test)] -mod tests; diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 2717c845..715b2f49 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -23,11 +23,9 @@ use std::path::PathBuf; use std::string::FromUtf8Error; pub mod context; -pub mod grammar; pub mod llama_backend; pub mod llama_batch; pub mod model; -pub mod timing; pub mod token; pub mod token_type; diff --git a/llama-cpp-2/src/timing.rs b/llama-cpp-2/src/timing.rs deleted file mode 100644 index 51cf682a..00000000 --- a/llama-cpp-2/src/timing.rs +++ /dev/null @@ -1,183 +0,0 @@ -//! Safe wrapper around `llama_timings`. -use std::fmt::{Debug, Display, Formatter}; - -/// A wrapper around `llama_timings`. -#[derive(Clone, Copy, Debug)] -pub struct LlamaTimings { - pub(crate) timings: llama_cpp_sys_2::llama_timings, -} - -impl LlamaTimings { - /// Create a new `LlamaTimings`. - /// ``` - /// # use llama_cpp_2::timing::LlamaTimings; - /// let timings = LlamaTimings::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7, 8, 9); - /// let timings_str = "load time = 3.00 ms - /// sample time = 4.00 ms / 7 runs (0.57 ms per token, 1750.00 tokens per second) - /// prompt eval time = 5.00 ms / 8 tokens (0.62 ms per token, 1600.00 tokens per second) - /// eval time = 6.00 ms / 9 runs (0.67 ms per token, 1500.00 tokens per second) - /// total time = 1.00 ms"; - /// assert_eq!(timings_str, format!("{}", timings)); - /// ``` - #[allow(clippy::too_many_arguments)] - #[must_use] - pub fn new( - t_start_ms: f64, - t_end_ms: f64, - t_load_ms: f64, - t_sample_ms: f64, - t_p_eval_ms: f64, - t_eval_ms: f64, - n_sample: i32, - n_p_eval: i32, - n_eval: i32, - ) -> Self { - Self { - timings: llama_cpp_sys_2::llama_timings { - t_start_ms, - t_end_ms, - t_load_ms, - t_sample_ms, - t_p_eval_ms, - t_eval_ms, - n_sample, - n_p_eval, - n_eval, - }, - } - } - - /// Get the start time in milliseconds. - #[must_use] - pub fn t_start_ms(&self) -> f64 { - self.timings.t_start_ms - } - - /// Get the end time in milliseconds. - #[must_use] - pub fn t_end_ms(&self) -> f64 { - self.timings.t_end_ms - } - - /// Get the load time in milliseconds. - #[must_use] - pub fn t_load_ms(&self) -> f64 { - self.timings.t_load_ms - } - - /// Get the sample time in milliseconds. - #[must_use] - pub fn t_sample_ms(&self) -> f64 { - self.timings.t_sample_ms - } - - /// Get the prompt evaluation time in milliseconds. 
- #[must_use] - pub fn t_p_eval_ms(&self) -> f64 { - self.timings.t_p_eval_ms - } - - /// Get the evaluation time in milliseconds. - #[must_use] - pub fn t_eval_ms(&self) -> f64 { - self.timings.t_eval_ms - } - - /// Get the number of samples. - #[must_use] - pub fn n_sample(&self) -> i32 { - self.timings.n_sample - } - - /// Get the number of prompt evaluations. - #[must_use] - pub fn n_p_eval(&self) -> i32 { - self.timings.n_p_eval - } - - /// Get the number of evaluations. - #[must_use] - pub fn n_eval(&self) -> i32 { - self.timings.n_eval - } - - /// Set the start time in milliseconds. - pub fn set_t_start_ms(&mut self, t_start_ms: f64) { - self.timings.t_start_ms = t_start_ms; - } - - /// Set the end time in milliseconds. - pub fn set_t_end_ms(&mut self, t_end_ms: f64) { - self.timings.t_end_ms = t_end_ms; - } - - /// Set the load time in milliseconds. - pub fn set_t_load_ms(&mut self, t_load_ms: f64) { - self.timings.t_load_ms = t_load_ms; - } - - /// Set the sample time in milliseconds. - pub fn set_t_sample_ms(&mut self, t_sample_ms: f64) { - self.timings.t_sample_ms = t_sample_ms; - } - - /// Set the prompt evaluation time in milliseconds. - pub fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) { - self.timings.t_p_eval_ms = t_p_eval_ms; - } - - /// Set the evaluation time in milliseconds. - pub fn set_t_eval_ms(&mut self, t_eval_ms: f64) { - self.timings.t_eval_ms = t_eval_ms; - } - - /// Set the number of samples. - pub fn set_n_sample(&mut self, n_sample: i32) { - self.timings.n_sample = n_sample; - } - - /// Set the number of prompt evaluations. - pub fn set_n_p_eval(&mut self, n_p_eval: i32) { - self.timings.n_p_eval = n_p_eval; - } - - /// Set the number of evaluations. - pub fn set_n_eval(&mut self, n_eval: i32) { - self.timings.n_eval = n_eval; - } -} - -impl Display for LlamaTimings { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - writeln!(f, "load time = {:.2} ms", self.t_load_ms())?; - writeln!( - f, - "sample time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)", - self.t_sample_ms(), - self.n_sample(), - self.t_sample_ms() / f64::from(self.n_sample()), - 1e3 / self.t_sample_ms() * f64::from(self.n_sample()) - )?; - writeln!( - f, - "prompt eval time = {:.2} ms / {} tokens ({:.2} ms per token, {:.2} tokens per second)", - self.t_p_eval_ms(), - self.n_p_eval(), - self.t_p_eval_ms() / f64::from(self.n_p_eval()), - 1e3 / self.t_p_eval_ms() * f64::from(self.n_p_eval()) - )?; - writeln!( - f, - "eval time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)", - self.t_eval_ms(), - self.n_eval(), - self.t_eval_ms() / f64::from(self.n_eval()), - 1e3 / self.t_eval_ms() * f64::from(self.n_eval()) - )?; - write!( - f, - "total time = {:.2} ms", - self.t_end_ms() - self.t_start_ms() - ) - } -} diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index e81ab336..f1fa1a2d 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -1,10 +1,5 @@ //! an rusty equivalent of `llama_token_data`. -use crate::context::LlamaContext; use crate::token::data::LlamaTokenData; -use crate::token::LlamaToken; -use llama_cpp_sys_2::llama_token; -use std::cmp::min; -use std::ptr; /// a safe wrapper around `llama_token_data_array`. #[derive(Debug, Clone, PartialEq)] @@ -15,396 +10,3 @@ pub struct LlamaTokenDataArray { /// is the data sorted? pub sorted: bool, } - -impl LlamaTokenDataArray { - /// Create a new `LlamaTokenDataArray` from a vector and weather or not the data is sorted. 
- /// - /// ``` - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// let array = LlamaTokenDataArray::new(vec![ - /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), - /// LlamaTokenData::new(LlamaToken(1), 0.1, 0.1) - /// ], false); - /// assert_eq!(array.data.len(), 2); - /// assert_eq!(array.sorted, false); - /// ``` - #[must_use] - pub fn new(data: Vec, sorted: bool) -> Self { - Self { data, sorted } - } - - /// Create a new `LlamaTokenDataArray` from an iterator and weather or not the data is sorted. - /// ``` - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// let array = LlamaTokenDataArray::from_iter([ - /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), - /// LlamaTokenData::new(LlamaToken(1), 0.1, 0.1) - /// ], false); - /// assert_eq!(array.data.len(), 2); - /// assert_eq!(array.sorted, false); - pub fn from_iter(data: T, sorted: bool) -> LlamaTokenDataArray - where - T: IntoIterator, - { - Self::new(data.into_iter().collect(), sorted) - } -} - -impl LlamaTokenDataArray { - /// Modify the underlying data as a `llama_token_data_array`. and reconstruct the `LlamaTokenDataArray`. - /// - /// # Panics - /// - /// Panics if some of the safety conditions are not met. (we cannot check all of them at runtime so breaking them is UB) - /// - /// SAFETY: - /// [modify] cannot change the data pointer. - /// if the data is not sorted, sorted must be false. - /// the size of the data can only decrease (i.e you cannot add new elements). - pub(crate) unsafe fn modify_as_c_llama_token_data_array( - &mut self, - modify: impl FnOnce(&mut llama_cpp_sys_2::llama_token_data_array) -> T, - ) -> T { - let size = self.data.len(); - let data = self.data.as_mut_ptr().cast(); - let mut c_llama_token_data_array = llama_cpp_sys_2::llama_token_data_array { - data, - size, - sorted: self.sorted, - }; - let result = modify(&mut c_llama_token_data_array); - assert!( - ptr::eq(data, c_llama_token_data_array.data), - "data pointer changed" - ); - assert!(c_llama_token_data_array.size <= size, "size increased"); - self.data.set_len(c_llama_token_data_array.size); - self.sorted = c_llama_token_data_array.sorted; - result - } - - /// Repetition penalty described in [CTRL academic paper](https://arxiv.org/abs/1909.05858), with negative logit fix. - /// Frequency and presence penalties described in [OpenAI API](https://platform.openai.com/docs/api-reference/parameter-details). - /// - /// # Parameters - /// - /// * `ctx` - the context to use. May be `None` if you do not care to record the sample timings. - /// * `last_tokens` - the last tokens in the context. - /// - /// * `penalty_last_n` - the number of tokens back to consider for the repetition penalty. (0 for no penalty) - /// * `penalty_repeat` - the repetition penalty. (1.0 for no penalty) - /// * `penalty_freq` - the frequency penalty. (0.0 for no penalty) - /// * `penalty_present` - the presence penalty. 
(0.0 for no penalty) - /// - /// # Example - /// - /// ```rust - /// # use std::collections::BTreeMap; - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// let history = vec![ - /// LlamaToken::new(2), - /// LlamaToken::new(1), - /// LlamaToken::new(0), - /// ]; - /// - /// let candidates = vec![ - /// LlamaToken::new(0), - /// LlamaToken::new(1), - /// LlamaToken::new(2), - /// LlamaToken::new(3), - /// ]; - /// - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates.iter().map(|&token| LlamaTokenData::new(token, 0.0, 0.0)), false); - /// - /// candidates.sample_repetition_penalty(None, &history, 2, 1.1, 0.1, 0.1); - /// - /// let token_logits = candidates.data.into_iter().map(|token_data| (token_data.id(), token_data.logit())).collect::>(); - /// assert_eq!(token_logits[&LlamaToken(0)], 0.0, "expected no penalty as it is out of `penalty_last_n`"); - /// assert!(token_logits[&LlamaToken(1)] < 0.0, "expected penalty as it is in `penalty_last_n`"); - /// assert!(token_logits[&LlamaToken(2)] < 0.0, "expected penalty as it is in `penalty_last_n`"); - /// assert_eq!(token_logits[&LlamaToken(3)], 0.0, "expected no penalty as it is not in `history`"); - /// ``` - pub fn sample_repetition_penalty( - &mut self, - ctx: Option<&mut LlamaContext>, - last_tokens: &[LlamaToken], - penalty_last_n: usize, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - ) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - let penalty_last_n = min(penalty_last_n, last_tokens.len().saturating_sub(1)); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_repetition_penalties( - ctx, - c_llama_token_data_array, - // safe cast as LlamaToken is repr(transparent) - last_tokens.as_ptr().cast::(), - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - ); - }); - } - } - - /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
- /// - /// # Example - /// - /// ```rust - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let lowest = LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0); - /// let middle = LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0); - /// let highest = LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0); - /// - /// let candidates = vec![lowest, middle, highest]; - /// - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_softmax(None); - /// - /// assert!(candidates.sorted); - /// assert_eq!(candidates.data[0].id(), highest.id()); - /// assert_eq!(candidates.data[0].logit(), highest.logit()); - /// assert!(candidates.data[0].p() > candidates.data[1].p()); - /// assert_eq!(candidates.data[1].id(), middle.id()); - /// assert_eq!(candidates.data[1].logit(), middle.logit()); - /// assert!(candidates.data[1].p() > candidates.data[2].p()); - /// assert_eq!(candidates.data[2].id(), lowest.id()); - /// assert_eq!(candidates.data[2].logit(), lowest.logit()); - /// ``` - pub fn sample_softmax(&mut self, ctx: Option<&mut LlamaContext>) { - unsafe { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_softmax(ctx, c_llama_token_data_array); - }); - } - } - - /// Modify the logits of [`Self`] in place using temperature sampling. - /// - /// # Example - /// - /// ```rust - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0) - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// - /// candidates.sample_temp(None, 0.5); - /// - /// assert_ne!(candidates.data[0].logit(), 0.1); - /// assert_ne!(candidates.data[1].logit(), 0.2); - /// assert_ne!(candidates.data[2].logit(), 0.7); - /// ``` - pub fn sample_temp(&mut self, ctx: Option<&mut LlamaContext>, temperature: f32) { - if temperature == 0.0 { - return; - } - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_temp(ctx, c_llama_token_data_array, temperature); - }); - } - } - - /// Randomly selects a token from the candidates based on their probabilities. 
- pub fn sample_token(&mut self, ctx: &mut LlamaContext) -> LlamaToken { - let llama_token = unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_token(ctx.context.as_ptr(), c_llama_token_data_array) - }) - }; - LlamaToken(llama_token) - } - - /// Top-K sampling described in academic paper [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) - pub fn sample_top_k(&mut self, ctx: Option<&mut LlamaContext>, k: i32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_top_k(ctx, c_llama_token_data_array, k, min_keep); - }); - } - } - - /// Tail Free Sampling described in [Tail-Free-Sampling](https://www.trentonbricken.com/Tail-Free-Sampling/). - pub fn sample_tail_free(&mut self, ctx: Option<&mut LlamaContext>, z: f32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_tail_free(ctx, c_llama_token_data_array, z, min_keep); - }); - } - } - - /// Locally Typical Sampling implementation described in the [paper](https://arxiv.org/abs/2202.00666). - /// - /// # Example - /// - /// ```rust - /// - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_typical(None, 0.5, 1); - /// - /// ``` - pub fn sample_typical(&mut self, ctx: Option<&mut LlamaContext>, p: f32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_typical(ctx, c_llama_token_data_array, p, min_keep); - }); - } - } - - /// Nucleus sampling described in academic paper [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) - /// - /// # Example - /// - /// ```rust - /// - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_top_p(None, 0.5, 1); - /// - /// assert_eq!(candidates.data.len(), 2); - /// assert_eq!(candidates.data[0].id(), LlamaToken::new(2)); - /// assert_eq!(candidates.data[1].id(), LlamaToken::new(1)); - /// ``` - pub fn sample_top_p(&mut self, ctx: Option<&mut LlamaContext>, p: f32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_top_p(ctx, c_llama_token_data_array, p, min_keep); - }); - } - } - - /// Minimum P sampling as described in 
[#3841](https://github.com/ggerganov/llama.cpp/pull/3841) - /// - /// # Example - /// - /// ``` - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(4), 0.0001, 0.0), - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_min_p(None, 0.05, 1); - /// ``` - pub fn sample_min_p(&mut self, ctx: Option<&mut LlamaContext>, p: f32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_min_p(ctx, c_llama_token_data_array, p, min_keep); - }); - } - } - - /// Mirostat 2.0 algorithm described in the [paper](https://arxiv.org/abs/2007.14966). Uses tokens instead of words. - /// - /// # Parameters - /// - /// * `tau` The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// * `eta` The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// * `mu` Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - pub fn sample_token_mirostat_v2( - &mut self, - ctx: &mut LlamaContext, - tau: f32, - eta: f32, - mu: &mut f32, - ) -> LlamaToken { - let mu_ptr = ptr::from_mut(mu); - let token = unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_token_mirostat_v2( - ctx.context.as_ptr(), - c_llama_token_data_array, - tau, - eta, - mu_ptr, - ) - }) - }; - *mu = unsafe { *mu_ptr }; - LlamaToken(token) - } - - /// Mirostat 1.0 algorithm described in the [paper](https://arxiv.org/abs/2007.14966). Uses tokens instead of words. - /// - /// # Parameters - /// - /// * `tau` The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// * `eta` The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// * `m` The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - /// * `mu` Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
- pub fn sample_token_mirostat_v1( - &mut self, - ctx: &mut LlamaContext, - tau: f32, - eta: f32, - m: i32, - mu: &mut f32, - ) -> LlamaToken { - let mu_ptr = ptr::from_mut(mu); - let token = unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_token_mirostat( - ctx.context.as_ptr(), - c_llama_token_data_array, - tau, - eta, - m, - mu_ptr, - ) - }) - }; - *mu = unsafe { *mu_ptr }; - LlamaToken(token) - } -} From 4fd0362e79d07c55c58630ead0ae7ac0de4d4193 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Tue, 26 Nov 2024 14:45:34 +0100 Subject: [PATCH 014/193] remove old/deprecated code --- llama-cpp-2/src/context.rs | 13 - llama-cpp-2/src/context/params.rs | 31 -- llama-cpp-2/src/context/sample.rs | 141 -------- llama-cpp-2/src/grammar.rs | 491 ---------------------------- llama-cpp-2/src/lib.rs | 2 - llama-cpp-2/src/timing.rs | 183 ----------- llama-cpp-2/src/token/data_array.rs | 398 ---------------------- 7 files changed, 1259 deletions(-) delete mode 100644 llama-cpp-2/src/context/sample.rs delete mode 100644 llama-cpp-2/src/grammar.rs delete mode 100644 llama-cpp-2/src/timing.rs diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index 80ee8f75..91b7926c 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -7,7 +7,6 @@ use std::slice; use crate::llama_batch::LlamaBatch; use crate::model::{LlamaLoraAdapter, LlamaModel}; -use crate::timing::LlamaTimings; use crate::token::data::LlamaTokenData; use crate::token::LlamaToken; use crate::{ @@ -17,7 +16,6 @@ use crate::{ pub mod kv_cache; pub mod params; -pub mod sample; pub mod session; /// Safe wrapper around `llama_context`. @@ -265,17 +263,6 @@ impl<'model> LlamaContext<'model> { unsafe { slice::from_raw_parts(data, len) } } - /// Reset the timings for the context. - pub fn reset_timings(&mut self) { - unsafe { llama_cpp_sys_2::llama_reset_timings(self.context.as_ptr()) } - } - - /// Returns the timings for the context. - pub fn timings(&mut self) -> LlamaTimings { - let timings = unsafe { llama_cpp_sys_2::llama_get_timings(self.context.as_ptr()) }; - LlamaTimings { timings } - } - /// Sets a lora adapter. /// /// # Errors diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs index 14eca8b0..3de7ba53 100644 --- a/llama-cpp-2/src/context/params.rs +++ b/llama-cpp-2/src/context/params.rs @@ -116,37 +116,6 @@ unsafe impl Send for LlamaContextParams {} unsafe impl Sync for LlamaContextParams {} impl LlamaContextParams { - /// Set the seed of the context - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_2::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default(); - /// let params = params.with_seed(1234); - /// assert_eq!(params.seed(), 1234); - /// ``` - #[must_use] - pub fn with_seed(mut self, seed: u32) -> Self { - self.context_params.seed = seed; - self - } - - /// Get the seed of the context - /// - /// # Examples - /// - /// ```rust - /// use llama_cpp_2::context::params::LlamaContextParams; - /// let params = LlamaContextParams::default() - /// .with_seed(1234); - /// assert_eq!(params.seed(), 1234); - /// ``` - #[must_use] - pub fn seed(&self) -> u32 { - self.context_params.seed - } - /// Set the side of the context /// /// # Examples diff --git a/llama-cpp-2/src/context/sample.rs b/llama-cpp-2/src/context/sample.rs deleted file mode 100644 index cc0f85ee..00000000 --- a/llama-cpp-2/src/context/sample.rs +++ /dev/null @@ -1,141 +0,0 @@ -//! 
Sampling functions for the context. - -use crate::context::LlamaContext; -use crate::grammar::LlamaGrammar; -use crate::token::data_array::LlamaTokenDataArray; -use crate::token::LlamaToken; - -#[cfg(feature = "sampler")] -pub mod sampler; - -impl LlamaContext<'_> { - /// Accept a token into the grammar. - pub fn grammar_accept_token(&mut self, grammar: &mut LlamaGrammar, token: LlamaToken) { - unsafe { - llama_cpp_sys_2::llama_grammar_accept_token( - grammar.grammar.as_ptr(), - self.context.as_ptr(), - token.0, - ); - } - } - - /// Perform grammar sampling. - pub fn sample_grammar( - &mut self, - llama_token_data_array: &mut LlamaTokenDataArray, - llama_grammar: &LlamaGrammar, - ) { - unsafe { - llama_token_data_array.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_grammar( - self.context.as_ptr(), - c_llama_token_data_array, - llama_grammar.grammar.as_ptr(), - ); - }); - } - } - - /// See [`LlamaTokenDataArray::sample_temp`] - pub fn sample_temp(&mut self, token_data: &mut LlamaTokenDataArray, temperature: f32) { - token_data.sample_temp(Some(self), temperature); - } - - /// Sample a token greedily. Note that this *does not* take into account anything that has modified the probabilities - it only looks at logits. - /// - /// Most of the time [`LlamaTokenDataArray::sample_softmax`] or [`LlamaTokenDataArray::sample_token`] should be used instead. - /// - /// # Panics - /// - /// - if `token_data` is empty - #[must_use] - pub fn sample_token_greedy(&mut self, mut token_data: LlamaTokenDataArray) -> LlamaToken { - assert!(!token_data.data.is_empty(), "no tokens"); - let mut data_arr = llama_cpp_sys_2::llama_token_data_array { - data: token_data - .data - .as_mut_ptr() - .cast::(), - size: token_data.data.len(), - sorted: token_data.sorted, - }; - let token = unsafe { - llama_cpp_sys_2::llama_sample_token_greedy( - self.context.as_ptr(), - std::ptr::addr_of_mut!(data_arr), - ) - }; - LlamaToken(token) - } - - /// See [`LlamaTokenDataArray::sample_tail_free`] - pub fn sample_tail_free( - &mut self, - token_data: &mut LlamaTokenDataArray, - z: f32, - min_keep: usize, - ) { - token_data.sample_tail_free(Some(self), z, min_keep); - } - - /// See [`LlamaTokenDataArray::sample_typical`] - pub fn sample_typical( - &mut self, - token_data: &mut LlamaTokenDataArray, - p: f32, - min_keep: usize, - ) { - token_data.sample_typical(Some(self), p, min_keep); - } - - /// See [`LlamaTokenDataArray::sample_top_p`] - pub fn sample_top_p(&mut self, token_data: &mut LlamaTokenDataArray, p: f32, min_keep: usize) { - token_data.sample_top_p(Some(self), p, min_keep); - } - - /// Minimum P sampling as described in [#3841](https://github.com/ggerganov/llama.cpp/pull/3841) - pub fn sample_min_p( - &mut self, - llama_token_data: &mut LlamaTokenDataArray, - p: f32, - min_keep: usize, - ) { - let ctx = self.context.as_ptr(); - unsafe { - llama_token_data.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_min_p(ctx, c_llama_token_data_array, p, min_keep); - }); - } - } - - /// See [`LlamaTokenDataArray::sample_top_k`] - pub fn sample_top_k(&mut self, token_data: &mut LlamaTokenDataArray, k: i32, min_keep: usize) { - token_data.sample_top_k(Some(self), k, min_keep); - } - - /// See [`LlamaTokenDataArray::sample_softmax`] - pub fn sample_token_softmax(&mut self, token_data: &mut LlamaTokenDataArray) { - token_data.sample_softmax(Some(self)); - } - - /// See [`LlamaTokenDataArray::sample_repetition_penalty`] - pub fn 
sample_repetition_penalty( - &mut self, - token_data: &mut LlamaTokenDataArray, - last_tokens: &[LlamaToken], - penalty_last_n: usize, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - ) { - token_data.sample_repetition_penalty( - Some(self), - last_tokens, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - ); - } -} diff --git a/llama-cpp-2/src/grammar.rs b/llama-cpp-2/src/grammar.rs deleted file mode 100644 index 667a870b..00000000 --- a/llama-cpp-2/src/grammar.rs +++ /dev/null @@ -1,491 +0,0 @@ -//! The grammar module contains the grammar parser and the grammar struct. -//! -//! This allows creating a llama-cpp grammar. This is essentially a translation of the parser in -//! `common` to rust - -use std::collections::BTreeMap; -use std::fmt::{Debug, Formatter}; - -use llama_cpp_sys_2::{llama_grammar, llama_grammar_element, llama_gretype}; -use std::ptr::NonNull; -use std::str::FromStr; -use tracing::error; - -/// Details of extraneous characters after a rule error. -#[derive(thiserror::Error, Debug)] -#[error("Extraneous chars after rule {name:?}: {chars:?}")] -pub struct ExtraneousCharsAfterRule { - /// The name of the rule being parsed - pub name: String, - /// the extraneous characters - pub chars: String, - /// the rest of the input, this is still to be parsed. - pub rest: String, -} - -/// There was an error parsing the grammar. -#[derive(thiserror::Error, Debug)] -#[allow(clippy::module_name_repetitions)] -pub enum GrammarParseError { - /// There was an unexpected end of input. - #[error("Unexpected end of input")] - UnexpectedEndOfInput { - /// the stage of parsing that was being performed when we ran out of input. - parse_stage: &'static str, - }, - /// There was unexpected characters after a rule name but before "::=". There can only be whitespace. - #[error("Unexpected Chars after name {name:?} and before \"::=\": {chars}")] - UnexpectedCharsAfterName { - /// the name of the rule being parsed - name: String, - /// the unexpected characters - chars: String, - }, - /// There was no "::=" after a rule name. - #[error("Expected ::= after name {name:?}")] - ExpectedEqualsAfterName { - /// the name of the rule being parsed - name: String, - }, - /// There was no closing bracket in a nested rule. - #[error("Expected closing bracket in nested rule {name:?}")] - MissingClosingBracketInNestedRule { - /// the name of the rule being parsed - name: String, - }, - /// There was no rule before a postfix operator. - #[error("Missing rule before postfix operator in {name:?}")] - ExpectedRuleBeforePostfixOperator { - /// the name of the rule being parsed - name: String, - }, - /// There was an incorrect hex size. - #[error("Expected hex number with size {expected_size}, but number was {actual:?}")] - IncorrectHexSize { - /// the expected size of the hex number - expected_size: usize, - /// the actual hex number - actual: String, - }, - /// An unknown escape character was found. - #[error("Unknown escape {escape:?}")] - UnknownEscape { - /// the unknown character - escape: char, - }, - /// Failed to parse hex from a string. - #[error("Failed to parse hex from {string}: {error}")] - ParseHexError { - /// the error that occurred when parsing the hex - #[source] - error: std::num::ParseIntError, - /// the string that was being parsed - string: String, - }, - /// there was not space after the name - // todo: is this actually an error? 
- #[error("Missing space after name in {rest:?}")] - MissingSpaceAfterName { - /// the rest of the input, this is still to be parsed. - rest: String, - }, - /// There was unexpected characters after the rule. - #[error("{0}")] - ExtraneousCharsAfterRule(ExtraneousCharsAfterRule), -} - -/// A grammar for llama-cpp. -#[allow(clippy::module_name_repetitions)] -pub struct LlamaGrammar { - parse: ParseState, - pub(crate) grammar: NonNull, -} - -impl Clone for LlamaGrammar { - fn clone(&self) -> Self { - let grammar = unsafe { llama_cpp_sys_2::llama_grammar_copy(self.grammar.as_ptr()) }; - Self { - parse: self.parse.clone(), - grammar: NonNull::new(grammar).expect("copied grammar should never be null"), - } - } -} - -unsafe impl Send for LlamaGrammar {} - -unsafe impl Sync for LlamaGrammar {} - -#[allow(clippy::module_name_repetitions)] -impl Debug for LlamaGrammar { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LlamaGrammar") - .field("grammar", &self.grammar) - .field("parse", &self.parse) - .finish() - } -} - -#[derive(Debug, Clone, PartialEq)] -struct ParseState { - symbol_ids: BTreeMap, - rules: Vec>, -} - -impl ParseState { - fn new() -> Self { - Self { - symbol_ids: BTreeMap::new(), - rules: Vec::new(), - } - } - - fn get_symbol_id(&mut self, name: &str) -> u32 { - let next_id = - u32::try_from(self.symbol_ids.len()).expect("too many rules (must fit into u32)"); - let result = self.symbol_ids.entry(name.to_string()).or_insert(next_id); - *result - } - - fn generate_symbol_id(&mut self, name: &str) -> u32 { - let next_id = - u32::try_from(self.symbol_ids.len()).expect("too many rules (must fit into u32)"); - let generated_name = format!("{name}_{next_id}"); - let None = self.symbol_ids.insert(generated_name, next_id) else { - panic!("Failed to create unique name for {name}"); - }; - next_id - } - - fn parse_rule<'a>(&mut self, rest: &'a str) -> Result, GrammarParseError> { - let rest = Self::consume_whitespace_and_comments(rest, true); - if rest.is_empty() { - return Ok(None); - } - let (name, rest) = Self::parse_name(rest)?; - let rest = rest.trim_start(); - let rule_id = self.get_symbol_id(name); - - let (after_name, rest) = - rest.split_once("::=") - .ok_or_else(|| GrammarParseError::ExpectedEqualsAfterName { - name: name.to_string(), - })?; - - if !after_name.is_empty() { - return Err(GrammarParseError::UnexpectedCharsAfterName { - name: name.to_string(), - chars: after_name.to_string(), - }); - } - - let rest = self.parse_alternatives(name, rule_id, rest, false)?; - - let Some((after_rule, rest)) = rest.split_once('\n') else { - return Ok(None); - }; - - if !after_rule.chars().all(char::is_whitespace) { - return Err(GrammarParseError::ExtraneousCharsAfterRule( - ExtraneousCharsAfterRule { - name: name.to_string(), - chars: after_rule.to_string(), - rest: rest.to_string(), - }, - )); - } - - Ok(Some(rest)) - } - - fn consume_whitespace_and_comments(mut rest: &str, allow_newlines: bool) -> &str { - loop { - rest = rest.trim_start_matches( - |c: char| if allow_newlines { true } else { c != '\n' } && c.is_whitespace(), - ); - if rest.starts_with('#') { - rest = rest.split_once('\n').map_or("", |(_comment, rest)| rest); - } else { - break; - } - } - rest - } - - fn parse_alternatives<'a>( - &mut self, - name: &str, - id: u32, - rest: &'a str, - nested: bool, - ) -> Result<&'a str, GrammarParseError> { - let mut rule = Vec::new(); - let rest = self.parse_sequence(rest.trim_start(), name, &mut rule, nested)?; - let mut rest = 
Self::consume_whitespace_and_comments(rest, nested); - while rest.starts_with('|') { - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_ALT, - value: 0, - }); - rest = Self::consume_whitespace_and_comments(&rest[1..], true); - rest = self.parse_sequence(rest, name, &mut rule, nested)?; - } - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_END, - value: 0, - }); - self.add_rule(id, rule); - Ok(rest) - } - - fn add_rule(&mut self, id: u32, rule: Vec) { - let id = id as usize; - if self.rules.len() <= id { - self.rules.resize(id + 1, Vec::new()); - } - self.rules[id] = rule; - } - - #[allow(clippy::too_many_lines)] - fn parse_sequence<'a>( - &mut self, - mut rest: &'a str, - name: &str, - rule: &mut Vec, - nested: bool, - ) -> Result<&'a str, GrammarParseError> { - let mut last_sym_start = rule.len(); - while !rest.is_empty() { - let first_char = - rest.chars() - .next() - .ok_or(GrammarParseError::UnexpectedEndOfInput { - parse_stage: "sequence", - })?; - if first_char == '"' { - rest = &rest[1..]; - last_sym_start = rule.len(); - while !rest.starts_with('"') { - let (c, r) = Self::parse_char(rest)?; - rest = r; - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_CHAR, - value: c as _, - }); - } - rest = Self::consume_whitespace_and_comments(&rest[1..], nested); - } else if first_char == '[' { - rest = &rest[1..]; - let start_type = if rest.starts_with('^') { - rest = &rest[1..]; - llama_cpp_sys_2::LLAMA_GRETYPE_CHAR_NOT - } else { - llama_cpp_sys_2::LLAMA_GRETYPE_CHAR - }; - last_sym_start = rule.len(); - while !rest.starts_with(']') { - let (c, r) = Self::parse_char(rest)?; - rest = r; - let gre_type = if last_sym_start < rule.len() { - llama_cpp_sys_2::LLAMA_GRETYPE_CHAR_ALT - } else { - start_type - }; - rule.push(llama_grammar_element { - type_: gre_type, - value: c as _, - }); - if rest.starts_with('-') && rest.get(1..).is_some_and(|r| !r.starts_with(']')) { - let (c, r) = Self::parse_char(&rest[1..])?; - rest = r; - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_CHAR_RNG_UPPER, - value: c as _, - }); - } - } - rest = Self::consume_whitespace_and_comments(&rest[1..], nested); - } else if first_char.is_alphabetic() { - let (name, r) = Self::parse_name(rest)?; - rest = Self::consume_whitespace_and_comments(r, nested); - let ref_rule_id = self.get_symbol_id(name); - last_sym_start = rule.len(); - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_RULE_REF, - value: ref_rule_id, - }); - } else if first_char == '(' { - rest = rest[1..].trim_start(); - let sub_rule_id = self.generate_symbol_id(name); - rest = self.parse_alternatives(name, sub_rule_id, rest, true)?; - last_sym_start = rule.len(); - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_RULE_REF, - value: sub_rule_id, - }); - if !rest.starts_with(')') { - return Err(GrammarParseError::MissingClosingBracketInNestedRule { - name: name.to_string(), - }); - } - rest = Self::consume_whitespace_and_comments(&rest[1..], nested); - } else if first_char == '*' || first_char == '+' || first_char == '?' 
{ - if last_sym_start == rule.len() { - return Err(GrammarParseError::ExpectedRuleBeforePostfixOperator { - name: name.to_string(), - }); - } - let sub_rule_id = self.generate_symbol_id(name); - let mut sub_rule: Vec = - rule.iter().skip(last_sym_start).copied().collect(); - if rest.starts_with(['*', '+']) { - sub_rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_RULE_REF, - value: sub_rule_id, - }); - } - sub_rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_ALT, - value: 0, - }); - if rest.starts_with('+') { - sub_rule.extend(rule.iter().skip(last_sym_start).copied()); - } - sub_rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_END, - value: 0, - }); - self.add_rule(sub_rule_id, sub_rule); - - rule.truncate(last_sym_start); - rule.push(llama_grammar_element { - type_: llama_cpp_sys_2::LLAMA_GRETYPE_RULE_REF, - value: sub_rule_id, - }); - - rest = Self::consume_whitespace_and_comments(&rest[1..], nested); - } else { - break; - } - } - - Ok(rest) - } - - fn parse_hex(rest: &str, size: usize) -> Result<(llama_gretype, &str), GrammarParseError> { - if rest.len() < size { - return Err(GrammarParseError::IncorrectHexSize { - expected_size: size, - actual: rest.to_string(), - }); - } - - let (hex, rest) = rest.split_at(size); - let value = - u32::from_str_radix(hex, 16).map_err(|error| GrammarParseError::ParseHexError { - string: hex.to_string(), - error, - })?; - - Ok((value as llama_gretype, rest)) - } - - fn parse_char(rest: &str) -> Result<(llama_gretype, &str), GrammarParseError> { - if let Some(rest) = rest.strip_prefix('\\') { - let Some(escaped) = rest.chars().next() else { - return Err(GrammarParseError::UnexpectedEndOfInput { - parse_stage: "escape char", - }); - }; - let rest = &rest[escaped.len_utf8()..]; - match escaped { - 'x' => Self::parse_hex(rest, 2), - 'u' => Self::parse_hex(rest, 4), - 'U' => Self::parse_hex(rest, 8), - 't' => Ok((u32::from('\t') as llama_gretype, rest)), - 'r' => Ok((u32::from('\r') as llama_gretype, rest)), - 'n' => Ok((u32::from('\n') as llama_gretype, rest)), - '\\' => Ok((u32::from('\\') as llama_gretype, rest)), - '"' => Ok((u32::from('"') as llama_gretype, rest)), - '[' => Ok((u32::from('[') as llama_gretype, rest)), - ']' => Ok((u32::from(']') as llama_gretype, rest)), - c => Err(GrammarParseError::UnknownEscape { escape: c }), - } - } else if let Some(c) = rest.chars().next() { - Ok((u32::from(c) as llama_gretype, &rest[c.len_utf8()..])) - } else { - Err(GrammarParseError::UnexpectedEndOfInput { - parse_stage: "char", - }) - } - } - - fn parse_name(rest: &str) -> Result<(&str, &str), GrammarParseError> { - let name_end = rest - .find(|c: char| !c.is_alphanumeric() && c != '-' && c != '_') - .ok_or(GrammarParseError::MissingSpaceAfterName { - rest: rest.to_string(), - })?; - let name = &rest[..name_end]; - let rest = &rest[name_end..]; - Ok((name, rest)) - } -} - -/// An error that can occur creating a grammar from a string. -#[derive(thiserror::Error, Debug)] -pub enum LlamaGrammarFromStrError { - /// There was an error parsing the grammar. - #[error("Failed to parse grammar {0}")] - ParseError(#[from] GrammarParseError), - /// Llama-cpp returned null - this can occur for many reasons, but should ideally be caught on - /// the rust side beforehand. 
- #[error("llama-cpp returned null")] - LlamaCppNullError, -} - -impl FromStr for ParseState { - type Err = GrammarParseError; - - fn from_str(s: &str) -> Result { - let mut parse_state = ParseState::new(); - let mut remaining = Some(s); - while let Some(str) = remaining { - remaining = parse_state.parse_rule(str)?; - } - Ok(parse_state) - } -} - -impl FromStr for LlamaGrammar { - type Err = LlamaGrammarFromStrError; - - fn from_str(s: &str) -> Result { - let mut parse_state = ParseState::from_str(s)?; - - let n_rules = parse_state.rules.len(); - let root_id = parse_state.get_symbol_id("root"); - let mut vec = parse_state - .rules - .iter_mut() - .map(|v| v.as_ptr()) - .collect::>(); - let rules = vec.as_mut_ptr(); - - let grammar = - unsafe { llama_cpp_sys_2::llama_grammar_init(rules, n_rules, root_id as usize) }; - - Ok(Self { - parse: parse_state, - grammar: NonNull::new(grammar).ok_or(LlamaGrammarFromStrError::LlamaCppNullError)?, - }) - } -} - -impl Drop for LlamaGrammar { - fn drop(&mut self) { - unsafe { llama_cpp_sys_2::llama_grammar_free(self.grammar.as_ptr()) } - } -} - -#[cfg(test)] -mod tests; diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 2717c845..715b2f49 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -23,11 +23,9 @@ use std::path::PathBuf; use std::string::FromUtf8Error; pub mod context; -pub mod grammar; pub mod llama_backend; pub mod llama_batch; pub mod model; -pub mod timing; pub mod token; pub mod token_type; diff --git a/llama-cpp-2/src/timing.rs b/llama-cpp-2/src/timing.rs deleted file mode 100644 index 51cf682a..00000000 --- a/llama-cpp-2/src/timing.rs +++ /dev/null @@ -1,183 +0,0 @@ -//! Safe wrapper around `llama_timings`. -use std::fmt::{Debug, Display, Formatter}; - -/// A wrapper around `llama_timings`. -#[derive(Clone, Copy, Debug)] -pub struct LlamaTimings { - pub(crate) timings: llama_cpp_sys_2::llama_timings, -} - -impl LlamaTimings { - /// Create a new `LlamaTimings`. - /// ``` - /// # use llama_cpp_2::timing::LlamaTimings; - /// let timings = LlamaTimings::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7, 8, 9); - /// let timings_str = "load time = 3.00 ms - /// sample time = 4.00 ms / 7 runs (0.57 ms per token, 1750.00 tokens per second) - /// prompt eval time = 5.00 ms / 8 tokens (0.62 ms per token, 1600.00 tokens per second) - /// eval time = 6.00 ms / 9 runs (0.67 ms per token, 1500.00 tokens per second) - /// total time = 1.00 ms"; - /// assert_eq!(timings_str, format!("{}", timings)); - /// ``` - #[allow(clippy::too_many_arguments)] - #[must_use] - pub fn new( - t_start_ms: f64, - t_end_ms: f64, - t_load_ms: f64, - t_sample_ms: f64, - t_p_eval_ms: f64, - t_eval_ms: f64, - n_sample: i32, - n_p_eval: i32, - n_eval: i32, - ) -> Self { - Self { - timings: llama_cpp_sys_2::llama_timings { - t_start_ms, - t_end_ms, - t_load_ms, - t_sample_ms, - t_p_eval_ms, - t_eval_ms, - n_sample, - n_p_eval, - n_eval, - }, - } - } - - /// Get the start time in milliseconds. - #[must_use] - pub fn t_start_ms(&self) -> f64 { - self.timings.t_start_ms - } - - /// Get the end time in milliseconds. - #[must_use] - pub fn t_end_ms(&self) -> f64 { - self.timings.t_end_ms - } - - /// Get the load time in milliseconds. - #[must_use] - pub fn t_load_ms(&self) -> f64 { - self.timings.t_load_ms - } - - /// Get the sample time in milliseconds. - #[must_use] - pub fn t_sample_ms(&self) -> f64 { - self.timings.t_sample_ms - } - - /// Get the prompt evaluation time in milliseconds. 
- #[must_use] - pub fn t_p_eval_ms(&self) -> f64 { - self.timings.t_p_eval_ms - } - - /// Get the evaluation time in milliseconds. - #[must_use] - pub fn t_eval_ms(&self) -> f64 { - self.timings.t_eval_ms - } - - /// Get the number of samples. - #[must_use] - pub fn n_sample(&self) -> i32 { - self.timings.n_sample - } - - /// Get the number of prompt evaluations. - #[must_use] - pub fn n_p_eval(&self) -> i32 { - self.timings.n_p_eval - } - - /// Get the number of evaluations. - #[must_use] - pub fn n_eval(&self) -> i32 { - self.timings.n_eval - } - - /// Set the start time in milliseconds. - pub fn set_t_start_ms(&mut self, t_start_ms: f64) { - self.timings.t_start_ms = t_start_ms; - } - - /// Set the end time in milliseconds. - pub fn set_t_end_ms(&mut self, t_end_ms: f64) { - self.timings.t_end_ms = t_end_ms; - } - - /// Set the load time in milliseconds. - pub fn set_t_load_ms(&mut self, t_load_ms: f64) { - self.timings.t_load_ms = t_load_ms; - } - - /// Set the sample time in milliseconds. - pub fn set_t_sample_ms(&mut self, t_sample_ms: f64) { - self.timings.t_sample_ms = t_sample_ms; - } - - /// Set the prompt evaluation time in milliseconds. - pub fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) { - self.timings.t_p_eval_ms = t_p_eval_ms; - } - - /// Set the evaluation time in milliseconds. - pub fn set_t_eval_ms(&mut self, t_eval_ms: f64) { - self.timings.t_eval_ms = t_eval_ms; - } - - /// Set the number of samples. - pub fn set_n_sample(&mut self, n_sample: i32) { - self.timings.n_sample = n_sample; - } - - /// Set the number of prompt evaluations. - pub fn set_n_p_eval(&mut self, n_p_eval: i32) { - self.timings.n_p_eval = n_p_eval; - } - - /// Set the number of evaluations. - pub fn set_n_eval(&mut self, n_eval: i32) { - self.timings.n_eval = n_eval; - } -} - -impl Display for LlamaTimings { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - writeln!(f, "load time = {:.2} ms", self.t_load_ms())?; - writeln!( - f, - "sample time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)", - self.t_sample_ms(), - self.n_sample(), - self.t_sample_ms() / f64::from(self.n_sample()), - 1e3 / self.t_sample_ms() * f64::from(self.n_sample()) - )?; - writeln!( - f, - "prompt eval time = {:.2} ms / {} tokens ({:.2} ms per token, {:.2} tokens per second)", - self.t_p_eval_ms(), - self.n_p_eval(), - self.t_p_eval_ms() / f64::from(self.n_p_eval()), - 1e3 / self.t_p_eval_ms() * f64::from(self.n_p_eval()) - )?; - writeln!( - f, - "eval time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)", - self.t_eval_ms(), - self.n_eval(), - self.t_eval_ms() / f64::from(self.n_eval()), - 1e3 / self.t_eval_ms() * f64::from(self.n_eval()) - )?; - write!( - f, - "total time = {:.2} ms", - self.t_end_ms() - self.t_start_ms() - ) - } -} diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index e81ab336..f1fa1a2d 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -1,10 +1,5 @@ //! an rusty equivalent of `llama_token_data`. -use crate::context::LlamaContext; use crate::token::data::LlamaTokenData; -use crate::token::LlamaToken; -use llama_cpp_sys_2::llama_token; -use std::cmp::min; -use std::ptr; /// a safe wrapper around `llama_token_data_array`. #[derive(Debug, Clone, PartialEq)] @@ -15,396 +10,3 @@ pub struct LlamaTokenDataArray { /// is the data sorted? pub sorted: bool, } - -impl LlamaTokenDataArray { - /// Create a new `LlamaTokenDataArray` from a vector and weather or not the data is sorted. 
- /// - /// ``` - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// let array = LlamaTokenDataArray::new(vec![ - /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), - /// LlamaTokenData::new(LlamaToken(1), 0.1, 0.1) - /// ], false); - /// assert_eq!(array.data.len(), 2); - /// assert_eq!(array.sorted, false); - /// ``` - #[must_use] - pub fn new(data: Vec, sorted: bool) -> Self { - Self { data, sorted } - } - - /// Create a new `LlamaTokenDataArray` from an iterator and weather or not the data is sorted. - /// ``` - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// let array = LlamaTokenDataArray::from_iter([ - /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), - /// LlamaTokenData::new(LlamaToken(1), 0.1, 0.1) - /// ], false); - /// assert_eq!(array.data.len(), 2); - /// assert_eq!(array.sorted, false); - pub fn from_iter(data: T, sorted: bool) -> LlamaTokenDataArray - where - T: IntoIterator, - { - Self::new(data.into_iter().collect(), sorted) - } -} - -impl LlamaTokenDataArray { - /// Modify the underlying data as a `llama_token_data_array`. and reconstruct the `LlamaTokenDataArray`. - /// - /// # Panics - /// - /// Panics if some of the safety conditions are not met. (we cannot check all of them at runtime so breaking them is UB) - /// - /// SAFETY: - /// [modify] cannot change the data pointer. - /// if the data is not sorted, sorted must be false. - /// the size of the data can only decrease (i.e you cannot add new elements). - pub(crate) unsafe fn modify_as_c_llama_token_data_array( - &mut self, - modify: impl FnOnce(&mut llama_cpp_sys_2::llama_token_data_array) -> T, - ) -> T { - let size = self.data.len(); - let data = self.data.as_mut_ptr().cast(); - let mut c_llama_token_data_array = llama_cpp_sys_2::llama_token_data_array { - data, - size, - sorted: self.sorted, - }; - let result = modify(&mut c_llama_token_data_array); - assert!( - ptr::eq(data, c_llama_token_data_array.data), - "data pointer changed" - ); - assert!(c_llama_token_data_array.size <= size, "size increased"); - self.data.set_len(c_llama_token_data_array.size); - self.sorted = c_llama_token_data_array.sorted; - result - } - - /// Repetition penalty described in [CTRL academic paper](https://arxiv.org/abs/1909.05858), with negative logit fix. - /// Frequency and presence penalties described in [OpenAI API](https://platform.openai.com/docs/api-reference/parameter-details). - /// - /// # Parameters - /// - /// * `ctx` - the context to use. May be `None` if you do not care to record the sample timings. - /// * `last_tokens` - the last tokens in the context. - /// - /// * `penalty_last_n` - the number of tokens back to consider for the repetition penalty. (0 for no penalty) - /// * `penalty_repeat` - the repetition penalty. (1.0 for no penalty) - /// * `penalty_freq` - the frequency penalty. (0.0 for no penalty) - /// * `penalty_present` - the presence penalty. 
(0.0 for no penalty) - /// - /// # Example - /// - /// ```rust - /// # use std::collections::BTreeMap; - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// let history = vec![ - /// LlamaToken::new(2), - /// LlamaToken::new(1), - /// LlamaToken::new(0), - /// ]; - /// - /// let candidates = vec![ - /// LlamaToken::new(0), - /// LlamaToken::new(1), - /// LlamaToken::new(2), - /// LlamaToken::new(3), - /// ]; - /// - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates.iter().map(|&token| LlamaTokenData::new(token, 0.0, 0.0)), false); - /// - /// candidates.sample_repetition_penalty(None, &history, 2, 1.1, 0.1, 0.1); - /// - /// let token_logits = candidates.data.into_iter().map(|token_data| (token_data.id(), token_data.logit())).collect::>(); - /// assert_eq!(token_logits[&LlamaToken(0)], 0.0, "expected no penalty as it is out of `penalty_last_n`"); - /// assert!(token_logits[&LlamaToken(1)] < 0.0, "expected penalty as it is in `penalty_last_n`"); - /// assert!(token_logits[&LlamaToken(2)] < 0.0, "expected penalty as it is in `penalty_last_n`"); - /// assert_eq!(token_logits[&LlamaToken(3)], 0.0, "expected no penalty as it is not in `history`"); - /// ``` - pub fn sample_repetition_penalty( - &mut self, - ctx: Option<&mut LlamaContext>, - last_tokens: &[LlamaToken], - penalty_last_n: usize, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - ) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - let penalty_last_n = min(penalty_last_n, last_tokens.len().saturating_sub(1)); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_repetition_penalties( - ctx, - c_llama_token_data_array, - // safe cast as LlamaToken is repr(transparent) - last_tokens.as_ptr().cast::(), - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - ); - }); - } - } - - /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
- /// - /// # Example - /// - /// ```rust - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let lowest = LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0); - /// let middle = LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0); - /// let highest = LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0); - /// - /// let candidates = vec![lowest, middle, highest]; - /// - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_softmax(None); - /// - /// assert!(candidates.sorted); - /// assert_eq!(candidates.data[0].id(), highest.id()); - /// assert_eq!(candidates.data[0].logit(), highest.logit()); - /// assert!(candidates.data[0].p() > candidates.data[1].p()); - /// assert_eq!(candidates.data[1].id(), middle.id()); - /// assert_eq!(candidates.data[1].logit(), middle.logit()); - /// assert!(candidates.data[1].p() > candidates.data[2].p()); - /// assert_eq!(candidates.data[2].id(), lowest.id()); - /// assert_eq!(candidates.data[2].logit(), lowest.logit()); - /// ``` - pub fn sample_softmax(&mut self, ctx: Option<&mut LlamaContext>) { - unsafe { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_softmax(ctx, c_llama_token_data_array); - }); - } - } - - /// Modify the logits of [`Self`] in place using temperature sampling. - /// - /// # Example - /// - /// ```rust - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0) - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// - /// candidates.sample_temp(None, 0.5); - /// - /// assert_ne!(candidates.data[0].logit(), 0.1); - /// assert_ne!(candidates.data[1].logit(), 0.2); - /// assert_ne!(candidates.data[2].logit(), 0.7); - /// ``` - pub fn sample_temp(&mut self, ctx: Option<&mut LlamaContext>, temperature: f32) { - if temperature == 0.0 { - return; - } - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_temp(ctx, c_llama_token_data_array, temperature); - }); - } - } - - /// Randomly selects a token from the candidates based on their probabilities. 
- pub fn sample_token(&mut self, ctx: &mut LlamaContext) -> LlamaToken { - let llama_token = unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_token(ctx.context.as_ptr(), c_llama_token_data_array) - }) - }; - LlamaToken(llama_token) - } - - /// Top-K sampling described in academic paper [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) - pub fn sample_top_k(&mut self, ctx: Option<&mut LlamaContext>, k: i32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_top_k(ctx, c_llama_token_data_array, k, min_keep); - }); - } - } - - /// Tail Free Sampling described in [Tail-Free-Sampling](https://www.trentonbricken.com/Tail-Free-Sampling/). - pub fn sample_tail_free(&mut self, ctx: Option<&mut LlamaContext>, z: f32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_tail_free(ctx, c_llama_token_data_array, z, min_keep); - }); - } - } - - /// Locally Typical Sampling implementation described in the [paper](https://arxiv.org/abs/2202.00666). - /// - /// # Example - /// - /// ```rust - /// - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_typical(None, 0.5, 1); - /// - /// ``` - pub fn sample_typical(&mut self, ctx: Option<&mut LlamaContext>, p: f32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_typical(ctx, c_llama_token_data_array, p, min_keep); - }); - } - } - - /// Nucleus sampling described in academic paper [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) - /// - /// # Example - /// - /// ```rust - /// - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_top_p(None, 0.5, 1); - /// - /// assert_eq!(candidates.data.len(), 2); - /// assert_eq!(candidates.data[0].id(), LlamaToken::new(2)); - /// assert_eq!(candidates.data[1].id(), LlamaToken::new(1)); - /// ``` - pub fn sample_top_p(&mut self, ctx: Option<&mut LlamaContext>, p: f32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_top_p(ctx, c_llama_token_data_array, p, min_keep); - }); - } - } - - /// Minimum P sampling as described in 
[#3841](https://github.com/ggerganov/llama.cpp/pull/3841) - /// - /// # Example - /// - /// ``` - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(4), 0.0001, 0.0), - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_min_p(None, 0.05, 1); - /// ``` - pub fn sample_min_p(&mut self, ctx: Option<&mut LlamaContext>, p: f32, min_keep: usize) { - let ctx = ctx.map_or(ptr::null_mut(), |ctx| ctx.context.as_ptr()); - unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_min_p(ctx, c_llama_token_data_array, p, min_keep); - }); - } - } - - /// Mirostat 2.0 algorithm described in the [paper](https://arxiv.org/abs/2007.14966). Uses tokens instead of words. - /// - /// # Parameters - /// - /// * `tau` The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// * `eta` The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// * `mu` Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - pub fn sample_token_mirostat_v2( - &mut self, - ctx: &mut LlamaContext, - tau: f32, - eta: f32, - mu: &mut f32, - ) -> LlamaToken { - let mu_ptr = ptr::from_mut(mu); - let token = unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_token_mirostat_v2( - ctx.context.as_ptr(), - c_llama_token_data_array, - tau, - eta, - mu_ptr, - ) - }) - }; - *mu = unsafe { *mu_ptr }; - LlamaToken(token) - } - - /// Mirostat 1.0 algorithm described in the [paper](https://arxiv.org/abs/2007.14966). Uses tokens instead of words. - /// - /// # Parameters - /// - /// * `tau` The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// * `eta` The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// * `m` The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - /// * `mu` Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
- pub fn sample_token_mirostat_v1( - &mut self, - ctx: &mut LlamaContext, - tau: f32, - eta: f32, - m: i32, - mu: &mut f32, - ) -> LlamaToken { - let mu_ptr = ptr::from_mut(mu); - let token = unsafe { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sample_token_mirostat( - ctx.context.as_ptr(), - c_llama_token_data_array, - tau, - eta, - m, - mu_ptr, - ) - }) - }; - *mu = unsafe { *mu_ptr }; - LlamaToken(token) - } -} From b888e98296e64d08002035703f8e4ba607c66f8c Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Tue, 26 Nov 2024 15:14:01 +0100 Subject: [PATCH 015/193] update to b3750, implementing new llama_perf api --- llama-cpp-2/src/context.rs | 12 ++++ llama-cpp-2/src/lib.rs | 1 + llama-cpp-2/src/timing.rs | 130 +++++++++++++++++++++++++++++++++++++ llama-cpp-sys-2/llama.cpp | 2 +- 4 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 llama-cpp-2/src/timing.rs diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index 91b7926c..cdebb88a 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -7,6 +7,7 @@ use std::slice; use crate::llama_batch::LlamaBatch; use crate::model::{LlamaLoraAdapter, LlamaModel}; +use crate::timing::LlamaTimings; use crate::token::data::LlamaTokenData; use crate::token::LlamaToken; use crate::{ @@ -263,6 +264,17 @@ impl<'model> LlamaContext<'model> { unsafe { slice::from_raw_parts(data, len) } } + /// Reset the timings for the context. + pub fn reset_timings(&mut self) { + unsafe { llama_cpp_sys_2::llama_perf_context_reset(self.context.as_ptr()) } + } + + /// Returns the timings for the context. + pub fn timings(&mut self) -> LlamaTimings { + let timings = unsafe { llama_cpp_sys_2::llama_perf_context(self.context.as_ptr()) }; + LlamaTimings { timings } + } + /// Sets a lora adapter. /// /// # Errors diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 715b2f49..22519ae9 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -26,6 +26,7 @@ pub mod context; pub mod llama_backend; pub mod llama_batch; pub mod model; +pub mod timing; pub mod token; pub mod token_type; diff --git a/llama-cpp-2/src/timing.rs b/llama-cpp-2/src/timing.rs new file mode 100644 index 00000000..b45d9318 --- /dev/null +++ b/llama-cpp-2/src/timing.rs @@ -0,0 +1,130 @@ +//! Safe wrapper around `llama_timings`. +use std::fmt::{Debug, Display, Formatter}; + +/// A wrapper around `llama_timings`. +#[derive(Clone, Copy, Debug)] +pub struct LlamaTimings { + pub(crate) timings: llama_cpp_sys_2::llama_perf_context_data, +} + +impl LlamaTimings { + /// Create a new `LlamaTimings`. + /// ``` + /// # use llama_cpp_2::timing::LlamaTimings; + /// let timings = LlamaTimings::new(1.0, 2.0, 3.0, 4.0, 5, 6); + /// let timings_str = "load time = 2.00 ms + /// prompt eval time = 3.00 ms / 5 tokens (0.60 ms per token, 1666.67 tokens per second) + /// eval time = 4.00 ms / 6 runs (0.67 ms per token, 1500.00 tokens per second)\n"; + /// assert_eq!(timings_str, format!("{}", timings)); + /// ``` + #[allow(clippy::too_many_arguments)] + #[must_use] + pub fn new( + t_start_ms: f64, + t_load_ms: f64, + t_p_eval_ms: f64, + t_eval_ms: f64, + n_p_eval: i32, + n_eval: i32, + ) -> Self { + Self { + timings: llama_cpp_sys_2::llama_perf_context_data { + t_start_ms, + t_load_ms, + t_p_eval_ms, + t_eval_ms, + n_p_eval, + n_eval, + }, + } + } + + /// Get the start time in milliseconds. 
+ #[must_use] + pub fn t_start_ms(&self) -> f64 { + self.timings.t_start_ms + } + + /// Get the load time in milliseconds. + #[must_use] + pub fn t_load_ms(&self) -> f64 { + self.timings.t_load_ms + } + + /// Get the prompt evaluation time in milliseconds. + #[must_use] + pub fn t_p_eval_ms(&self) -> f64 { + self.timings.t_p_eval_ms + } + + /// Get the evaluation time in milliseconds. + #[must_use] + pub fn t_eval_ms(&self) -> f64 { + self.timings.t_eval_ms + } + + /// Get the number of prompt evaluations. + #[must_use] + pub fn n_p_eval(&self) -> i32 { + self.timings.n_p_eval + } + + /// Get the number of evaluations. + #[must_use] + pub fn n_eval(&self) -> i32 { + self.timings.n_eval + } + + /// Set the start time in milliseconds. + pub fn set_t_start_ms(&mut self, t_start_ms: f64) { + self.timings.t_start_ms = t_start_ms; + } + + /// Set the load time in milliseconds. + pub fn set_t_load_ms(&mut self, t_load_ms: f64) { + self.timings.t_load_ms = t_load_ms; + } + + /// Set the prompt evaluation time in milliseconds. + pub fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) { + self.timings.t_p_eval_ms = t_p_eval_ms; + } + + /// Set the evaluation time in milliseconds. + pub fn set_t_eval_ms(&mut self, t_eval_ms: f64) { + self.timings.t_eval_ms = t_eval_ms; + } + + /// Set the number of prompt evaluations. + pub fn set_n_p_eval(&mut self, n_p_eval: i32) { + self.timings.n_p_eval = n_p_eval; + } + + /// Set the number of evaluations. + pub fn set_n_eval(&mut self, n_eval: i32) { + self.timings.n_eval = n_eval; + } +} + +impl Display for LlamaTimings { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + writeln!(f, "load time = {:.2} ms", self.t_load_ms())?; + writeln!( + f, + "prompt eval time = {:.2} ms / {} tokens ({:.2} ms per token, {:.2} tokens per second)", + self.t_p_eval_ms(), + self.n_p_eval(), + self.t_p_eval_ms() / f64::from(self.n_p_eval()), + 1e3 / self.t_p_eval_ms() * f64::from(self.n_p_eval()) + )?; + writeln!( + f, + "eval time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)", + self.t_eval_ms(), + self.n_eval(), + self.t_eval_ms() / f64::from(self.n_eval()), + 1e3 / self.t_eval_ms() * f64::from(self.n_eval()) + )?; + Ok(()) + } +} diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index df270ef7..0abc6a2c 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit df270ef74596da8f1178f08991f4c51f18c9ee82 +Subproject commit 0abc6a2c25272d5cf01384dda8ee8bfec4ba8745 From baea38c47b458c968bfc81aae5361cd955abc8af Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Tue, 26 Nov 2024 15:25:47 +0100 Subject: [PATCH 016/193] reintroduce LlamaTokenDataArray::new and ::from_iter --- llama-cpp-2/src/token/data_array.rs | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index f1fa1a2d..d9693049 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -10,3 +10,41 @@ pub struct LlamaTokenDataArray { /// is the data sorted? pub sorted: bool, } + +impl LlamaTokenDataArray { + /// Create a new `LlamaTokenDataArray` from a vector and weather or not the data is sorted. 
+ /// + /// ``` + /// # use llama_cpp_2::token::data::LlamaTokenData; + /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; + /// # use llama_cpp_2::token::LlamaToken; + /// let array = LlamaTokenDataArray::new(vec![ + /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), + /// LlamaTokenData::new(LlamaToken(1), 0.1, 0.1) + /// ], false); + /// assert_eq!(array.data.len(), 2); + /// assert_eq!(array.sorted, false); + /// ``` + #[must_use] + pub fn new(data: Vec, sorted: bool) -> Self { + Self { data, sorted } + } + + /// Create a new `LlamaTokenDataArray` from an iterator and weather or not the data is sorted. + /// ``` + /// # use llama_cpp_2::token::data::LlamaTokenData; + /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; + /// # use llama_cpp_2::token::LlamaToken; + /// let array = LlamaTokenDataArray::from_iter([ + /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), + /// LlamaTokenData::new(LlamaToken(1), 0.1, 0.1) + /// ], false); + /// assert_eq!(array.data.len(), 2); + /// assert_eq!(array.sorted, false); + pub fn from_iter(data: T, sorted: bool) -> LlamaTokenDataArray + where + T: IntoIterator, + { + Self::new(data.into_iter().collect(), sorted) + } +} From 5c2700963909f9a9ec5df8961aec0ff399729b8d Mon Sep 17 00:00:00 2001 From: volesen Date: Tue, 26 Nov 2024 15:47:05 +0100 Subject: [PATCH 017/193] Expose safe wrappers around the new sampling API types --- llama-cpp-2/src/lib.rs | 10 ++++++++ llama-cpp-2/src/sampling.rs | 39 ++++++++++++++++++++++++++++++ llama-cpp-2/src/sampling/params.rs | 36 +++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 llama-cpp-2/src/sampling.rs create mode 100644 llama-cpp-2/src/sampling/params.rs diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 22519ae9..424572bd 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -26,6 +26,7 @@ pub mod context; pub mod llama_backend; pub mod llama_batch; pub mod model; +pub mod sampling; pub mod timing; pub mod token; pub mod token_type; @@ -61,6 +62,7 @@ pub enum LLamaCppError { /// see [`EmbeddingsError`] #[error(transparent)] EmbeddingError(#[from] EmbeddingsError), + // See [`LlamaSamplerError`] } /// There was an error while getting the chat template from a model. @@ -193,6 +195,14 @@ pub enum LlamaLoraAdapterRemoveError { ErrorResult(i32), } +/// An error that can occur when initializing a sampler. +#[derive(Debug, Eq, PartialEq, thiserror::Error)] +pub enum LlamaSamplerError { + /// llama.cpp returned null + #[error("null reference from llama.cpp")] + NullReturn, +} + /// get the time (in microseconds) according to llama.cpp /// ``` /// # use llama_cpp_2::llama_time_us; diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs new file mode 100644 index 00000000..9242ef08 --- /dev/null +++ b/llama-cpp-2/src/sampling.rs @@ -0,0 +1,39 @@ +//! Safe wrapper around `llama_sampler`. +pub mod params; + +use std::fmt::{Debug, Formatter}; +use std::ptr::NonNull; + +use crate::LlamaSamplerError; + +/// A safe wrapper around `llama_sampler`. 
+pub struct LlamaSampler { + pub(crate) sampler: NonNull, +} + +impl Debug for LlamaSampler { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LlamaSamplerChain").finish() + } +} + +impl LlamaSampler { + pub fn new(params: params::LlamaSamplerChainParams) -> Result { + let sampler = unsafe { + NonNull::new(llama_cpp_sys_2::llama_sampler_chain_init( + params.sampler_chain_params, + )) + .ok_or(LlamaSamplerError::NullReturn) + }?; + + Ok(Self { sampler }) + } +} + +impl Drop for LlamaSampler { + fn drop(&mut self) { + unsafe { + llama_cpp_sys_2::llama_sampler_free(self.sampler.as_ptr()); + } + } +} diff --git a/llama-cpp-2/src/sampling/params.rs b/llama-cpp-2/src/sampling/params.rs new file mode 100644 index 00000000..972df840 --- /dev/null +++ b/llama-cpp-2/src/sampling/params.rs @@ -0,0 +1,36 @@ +//! Safe wrapper around `llama_sampler`. + +use std::fmt::{Debug, Formatter}; +use std::ptr::NonNull; + +/// A safe wrapper around `llama_sampler`. +pub struct LlamaSamplerChainParams { + pub(crate) sampler_chain_params: llama_cpp_sys_2::llama_sampler_chain_params, +} + +impl Debug for LlamaSamplerChainParams { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LlamaSamplerChainParams").finish() + } +} + +impl Default for LlamaSamplerChainParams { + fn default() -> Self { + let sampler_chain_params = unsafe { llama_cpp_sys_2::llama_sampler_chain_default_params() }; + + Self { + sampler_chain_params, + } + } +} + +impl LlamaSamplerChainParams { + pub fn with_no_perf(&mut self, no_perf: bool) -> &mut Self { + self.sampler_chain_params.no_perf = no_perf; + self + } + + pub fn no_perf(&self) -> bool { + self.sampler_chain_params.no_perf + } +} From 1f41e6e60c19e9e41ee4592d863993dbc1e3a4e6 Mon Sep 17 00:00:00 2001 From: volesen Date: Tue, 26 Nov 2024 18:06:20 +0100 Subject: [PATCH 018/193] Implement new sampler API This commit implements the new sampler API from `llama.cpp` introduced in b3680 and removes the custom sampling logic. The new sampling API is exposes through a builder pattern. Made tests pass. 
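
As a rough sketch of the new call pattern (assuming a `ctx` and `batch` are
already set up as in the simple example, and using the example's default seed),
sampling now goes through a sampler chain:

```rust
use llama_cpp_2::sampling::params::LlamaSamplerChainParams;
use llama_cpp_2::sampling::LlamaSampler;

// Build a sampler chain; `add_dist(seed)` samples from the token distribution,
// while `add_greedy()` would always pick the most likely token instead.
let sampler_params = LlamaSamplerChainParams::default();
let mut sampler = LlamaSampler::new(sampler_params)?.add_dist(1234);

// Inside the decode loop: sample from the logits of the last token in the
// batch, then feed the chosen token back into the sampler's state.
let token = sampler.sample(&ctx, batch.n_tokens() - 1);
sampler.accept(token);
```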
--- examples/simple/src/main.rs | 26 +-- examples/usage.rs | 22 ++- llama-cpp-2/src/context/params.rs | 6 +- llama-cpp-2/src/context/sample/sampler.rs | 112 ----------- llama-cpp-2/src/sampling.rs | 217 ++++++++++++++++++++++ llama-cpp-2/src/sampling/params.rs | 9 +- 6 files changed, 251 insertions(+), 141 deletions(-) delete mode 100644 llama-cpp-2/src/context/sample/sampler.rs diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index 267d6864..ec8f87d4 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -17,7 +17,9 @@ use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; use llama_cpp_2::model::params::LlamaModelParams; use llama_cpp_2::model::LlamaModel; use llama_cpp_2::model::{AddBos, Special}; -use llama_cpp_2::token::data_array::LlamaTokenDataArray; +use llama_cpp_2::sampling::params::LlamaSamplerChainParams; +use llama_cpp_2::sampling::LlamaSampler; + use std::ffi::CString; use std::io::Write; use std::num::NonZeroU32; @@ -174,9 +176,9 @@ fn main() -> Result<()> { .with_context(|| "unable to load model")?; // initialize the context - let mut ctx_params = LlamaContextParams::default() - .with_n_ctx(ctx_size.or(Some(NonZeroU32::new(2048).unwrap()))) - .with_seed(seed.unwrap_or(1234)); + let mut ctx_params = + LlamaContextParams::default().with_n_ctx(ctx_size.or(Some(NonZeroU32::new(2048).unwrap()))); + if let Some(threads) = threads { ctx_params = ctx_params.with_n_threads(threads); } @@ -244,23 +246,23 @@ either reduce n_len or increase n_ctx" // The `Decoder` let mut decoder = encoding_rs::UTF_8.new_decoder(); + let sampler_params = LlamaSamplerChainParams::default(); + let mut sampler = LlamaSampler::new(sampler_params)?.add_dist(seed.unwrap_or(1234)); + while n_cur <= n_len { // sample the next token { - let candidates = ctx.candidates(); - - let candidates_p = LlamaTokenDataArray::from_iter(candidates, false); + let token = sampler.sample(&ctx, batch.n_tokens() - 1); - // sample the most likely token - let new_token_id = ctx.sample_token_greedy(candidates_p); + sampler.accept(token); // is it an end of stream? 
- if model.is_eog_token(new_token_id) { + if model.is_eog_token(token) { eprintln!(); break; } - let output_bytes = model.token_to_bytes(new_token_id, Special::Tokenize)?; + let output_bytes = model.token_to_bytes(token, Special::Tokenize)?; // use `Decoder.decode_to_string()` to avoid the intermediate buffer let mut output_string = String::with_capacity(32); let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false); @@ -268,7 +270,7 @@ either reduce n_len or increase n_ctx" std::io::stdout().flush()?; batch.clear(); - batch.add(new_token_id, n_cur, &[0], true)?; + batch.add(token, n_cur, &[0], true)?; } n_cur += 1; diff --git a/examples/usage.rs b/examples/usage.rs index 1b7d1f5d..2b7f1915 100644 --- a/examples/usage.rs +++ b/examples/usage.rs @@ -14,6 +14,8 @@ use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::LlamaModelParams; use llama_cpp_2::model::LlamaModel; use llama_cpp_2::model::{AddBos, Special}; +use llama_cpp_2::sampling::params::LlamaSamplerChainParams; +use llama_cpp_2::sampling::LlamaSampler; use llama_cpp_2::token::data_array::LlamaTokenDataArray; use std::io::Write; @@ -54,25 +56,25 @@ fn main() { // The `Decoder` let mut decoder = encoding_rs::UTF_8.new_decoder(); + let sampler_params = LlamaSamplerChainParams::default(); + let mut sampler = LlamaSampler::new(sampler_params) + .expect("Failed to create sampler") + .add_greedy(); + while n_cur <= n_len { // sample the next token { - let candidates = ctx.candidates_ith(batch.n_tokens() - 1); - - let candidates_p = LlamaTokenDataArray::from_iter(candidates, false); + let token = sampler.sample(&ctx, batch.n_tokens() - 1); - // sample the most likely token - let new_token_id = ctx.sample_token_greedy(candidates_p); + sampler.accept(token); // is it an end of stream? - if new_token_id == model.token_eos() { + if token == model.token_eos() { eprintln!(); break; } - let output_bytes = model - .token_to_bytes(new_token_id, Special::Tokenize) - .unwrap(); + let output_bytes = model.token_to_bytes(token, Special::Tokenize).unwrap(); // use `Decoder.decode_to_string()` to avoid the intermediate buffer let mut output_string = String::with_capacity(32); let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false); @@ -80,7 +82,7 @@ fn main() { std::io::stdout().flush().unwrap(); batch.clear(); - batch.add(new_token_id, n_cur, &[0], true).unwrap(); + batch.add(token, n_cur, &[0], true).unwrap(); } n_cur += 1; diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs index 3de7ba53..cfaf967b 100644 --- a/llama-cpp-2/src/context/params.rs +++ b/llama-cpp-2/src/context/params.rs @@ -47,7 +47,7 @@ impl From for i32 { pub enum LlamaPoolingType { /// The pooling type is unspecified Unspecified = -1, - /// No pooling + /// No pooling None = 0, /// Mean pooling Mean = 1, @@ -95,10 +95,8 @@ impl From for i32 { /// use llama_cpp_2::context::params::LlamaContextParams; /// ///let ctx_params = LlamaContextParams::default() -/// .with_n_ctx(NonZeroU32::new(2048)) -/// .with_seed(1234); +/// .with_n_ctx(NonZeroU32::new(2048)); /// -/// assert_eq!(ctx_params.seed(), 1234); /// assert_eq!(ctx_params.n_ctx(), NonZeroU32::new(2048)); /// ``` #[derive(Debug, Clone)] diff --git a/llama-cpp-2/src/context/sample/sampler.rs b/llama-cpp-2/src/context/sample/sampler.rs deleted file mode 100644 index 948a1aa5..00000000 --- a/llama-cpp-2/src/context/sample/sampler.rs +++ /dev/null @@ -1,112 +0,0 @@ -//! 
Create a sampler struct to encapsulate the sampling process. This allows passing all the possible -//! sampling parameters around as a single struct, and also allow late binding of expensive context -//! like [`crate::context::LlamaContext`] or token history to the sampler. -//! -//! # Example -//! -//! **Llama.cpp default sampler** -//! -//! ```rust -//! use llama_cpp_2::context::sample::sampler::{Sampler, SampleStep}; -//! use llama_cpp_2::token::data::LlamaTokenData; -//! use llama_cpp_2::token::data_array::LlamaTokenDataArray; -//! use llama_cpp_2::token::LlamaToken; -//! -//! // Sample a token greedily and add to the history. -//! let mut finalizer = &|mut canidates: LlamaTokenDataArray, history: &mut Vec| { -//! canidates.sample_softmax(None); -//! let token = canidates.data[0]; -//! history.push(token.id()); -//! vec![token] -//! }; -//! -//! let mut history = vec![]; -//! let mut sampler = Sampler::new(finalizer); -//! -//! sampler.push_step(&|c, history| c.sample_repetition_penalty(None, history, 64, 1.1, 0.0, 0.0)); -//! sampler.push_step(&|c, _| c.sample_top_k(None, 40, 1)); -//! sampler.push_step(&|c, _| c.sample_tail_free(None, 1.0, 1)); -//! sampler.push_step(&|c, _| c.sample_typical(None, 1.0, 1)); -//! sampler.push_step(&|c, _| c.sample_top_p(None, 0.95, 1)); -//! sampler.push_step(&|c, _| c.sample_min_p(None, 0.05, 1)); -//! sampler.push_step(&|c, _| c.sample_temp(None, 0.5)); -//! -//! // random candidates -//! let candidates = LlamaTokenDataArray::from_iter((0..4).map(|i| LlamaTokenData::new(LlamaToken::new(i), i as f32 / 6.0, 0.0)), false); -//! -//! for _ in 0..10 { -//! let tokens = sampler.sample(&mut history, candidates.clone()); -//! assert_eq!(tokens.len(), 1); -//! } -//! -//! assert_eq!(history.len(), 10); -//! ``` - -use crate::token::data::LlamaTokenData; -use crate::token::data_array::LlamaTokenDataArray; -use std::fmt::{Debug, Formatter}; - -/// A single step to sample tokens from the remaining candidates. -pub type SampleStep = dyn Fn(&mut LlamaTokenDataArray, &mut C); - -/// The final step to select tokens from the remaining candidates. -pub type SampleFinalizer = dyn Fn(LlamaTokenDataArray, &mut C) -> Vec; - -/// A series of sampling steps that will produce a vector of token data. -/// -/// `C` is dynamic context that will be passed to the sampling functions. Some sampling steps may -/// require state to be maintained across multiple samples, and this context can be used to store -/// that state. For example, [`LlamaTokenDataArray::sample_token_mirostat_v2`] requires a `mu` to be -/// shared across multiple samples. -pub struct Sampler<'a, C> { - /// The steps to take when sampling. - pub steps: Vec<&'a SampleStep>, - /// The final step to select one or more tokens from the remaining candidates. - pub finalizer: &'a SampleFinalizer, -} - -impl Debug for Sampler<'_, T> { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Sampler") - .field( - "steps", - &format!( - "{} steps of Box ()>", - &self.steps.len() - ), - ) - .field( - "finalizer", - &"Box Vec>", - ) - .finish() - } -} - -impl<'a, T> Sampler<'a, T> { - /// Create a new sampler with a given finalizer. - pub fn new(finalizer: &'a SampleFinalizer) -> Self { - Self { - steps: vec![], - finalizer, - } - } - - /// Adds a step to the sampler. - pub fn push_step(&mut self, step: &'a SampleStep) { - self.steps.push(step); - } - - /// Sample a token from the given candidates. 
- #[must_use] - pub fn sample( - &mut self, - context: &mut T, - mut candidates: LlamaTokenDataArray, - ) -> Vec { - for step in &self.steps { - step(&mut candidates, context); - } - (self.finalizer)(candidates, context) - } -} diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 9242ef08..7181e149 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -1,9 +1,13 @@ //! Safe wrapper around `llama_sampler`. pub mod params; +use std::ffi::CString; use std::fmt::{Debug, Formatter}; use std::ptr::NonNull; +use crate::context::LlamaContext; +use crate::model::LlamaModel; +use crate::token::LlamaToken; use crate::LlamaSamplerError; /// A safe wrapper around `llama_sampler`. @@ -18,6 +22,9 @@ impl Debug for LlamaSampler { } impl LlamaSampler { + /// Create a new `LlamaSampler` from the given parameters. + /// # Errors + /// Returns an error if the underlying C++ code returns a null pointer. pub fn new(params: params::LlamaSamplerChainParams) -> Result { let sampler = unsafe { NonNull::new(llama_cpp_sys_2::llama_sampler_chain_init( @@ -28,6 +35,216 @@ impl LlamaSampler { Ok(Self { sampler }) } + + /// Samples the token with the largest probability. + #[must_use] + #[allow(unused_mut)] + pub fn add_greedy(mut self) -> Self { + unsafe { + let greedy_sampler = llama_cpp_sys_2::llama_sampler_init_greedy(); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), greedy_sampler); + } + + self + } + + /// Samples according to the probability distribution of the tokens. + #[must_use] + #[allow(unused_mut)] + pub fn add_dist(mut self, seed: u32) -> Self { + unsafe { + let dist_sampler = llama_cpp_sys_2::llama_sampler_init_dist(seed); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), dist_sampler); + } + + self + } + + /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" + #[must_use] + #[allow(unused_mut)] + pub fn add_top_k(mut self, k: i32) -> Self { + unsafe { + let top_k_sampler = llama_cpp_sys_2::llama_sampler_init_top_k(k); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), top_k_sampler); + } + + self + } + + /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" + #[must_use] + #[allow(unused_mut)] + pub fn add_top_p(mut self, p: f32, min_keep: usize) -> Self { + unsafe { + let top_p_sampler = llama_cpp_sys_2::llama_sampler_init_top_p(p, min_keep); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), top_p_sampler); + } + + self + } + + /// Minimum P sampling as described in + #[must_use] + #[allow(unused_mut)] + pub fn add_min_p(mut self, p: f32, min_keep: usize) -> Self { + unsafe { + let min_p_sampler = llama_cpp_sys_2::llama_sampler_init_min_p(p, min_keep); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), min_p_sampler); + } + + self + } + + /// Locally Typical Sampling implementation described in the paper . + #[must_use] + #[allow(unused_mut)] + pub fn add_typical(mut self, p: f32, min_keep: usize) -> Self { + unsafe { + let typical_sampler = llama_cpp_sys_2::llama_sampler_init_typical(p, min_keep); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), typical_sampler); + } + + self + } + + /// Updates the logits l_i` = l_i/t. 
When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf + #[must_use] + #[allow(unused_mut)] + pub fn add_temp(mut self, t: f32) -> Self { + unsafe { + let temp_sampler = llama_cpp_sys_2::llama_sampler_init_temp(t); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_sampler); + } + + self + } + + /// Dynamic temperature implementation (a.k.a. entropy) described in the paper . + #[must_use] + #[allow(unused_mut)] + pub fn add_temp_ext(mut self, t: f32, delta: f32, exponent: f32) -> Self { + unsafe { + let temp_ext_sampler = llama_cpp_sys_2::llama_sampler_init_temp_ext(t, delta, exponent); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_ext_sampler); + } + + self + } + + /// Mirostat 1.0 algorithm described in the paper . Uses tokens instead of words. + /// + /// # Arguments + /// + /// * `candidates` - A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// * `tau` - The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// * `eta` - The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// * `m` - The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. + /// * `mu` - Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + #[must_use] + #[allow(unused_mut)] + pub fn add_mirostat(mut self, n_vocab: i32, seed: u32, tau: f32, eta: f32, m: i32) -> Self { + unsafe { + let temp_ext_sampler = + llama_cpp_sys_2::llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_ext_sampler); + } + + self + } + + /// Mirostat 2.0 algorithm described in the paper . Uses tokens instead of words. + /// + /// # Arguments + /// + /// * `candidates` - A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// * `tau` - The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// * `eta` - The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// * `mu` - Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
+ #[must_use] + #[allow(unused_mut)] + pub fn add_mirostat_v2(mut self, seed: u32, tau: f32, eta: f32) -> Self { + unsafe { + let temp_ext_sampler = llama_cpp_sys_2::llama_sampler_init_mirostat_v2(seed, tau, eta); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_ext_sampler); + } + + self + } + + /// Samples constrained by a context-free grammar in the GGML BNF (GBNF) format. + /// + /// # Panics + /// Panics if a provided string contains a null byte. + #[must_use] + #[allow(unused_mut)] + pub fn add_grammar( + mut self, + model: &LlamaModel, + grammar_str: &str, + grammar_root: &str, + ) -> Self { + unsafe { + let grammar_str = CString::new(grammar_str).unwrap(); + let grammar_root = CString::new(grammar_root).unwrap(); + let grammar_sampler = llama_cpp_sys_2::llama_sampler_init_grammar( + model.model.as_ptr(), + grammar_str.as_ptr(), + grammar_root.as_ptr(), + ); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), grammar_sampler); + } + + self + } + + /// Adds penalties to the sampler. This can be used to penalize certain patterns in the generated text, such as repeating the same token multiple times or using the same token too frequently. + #[allow(unused_mut, clippy::too_many_arguments)] + #[must_use] + pub fn add_penalties( + mut self, + n_vocab: i32, + special_eos_id: i32, + linefeed_id: i32, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + penalize_nl: bool, + ignore_eos: bool, + ) -> Self { + unsafe { + let temp_ext_sampler = llama_cpp_sys_2::llama_sampler_init_penalties( + n_vocab, + special_eos_id, + linefeed_id, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + penalize_nl, + ignore_eos, + ); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_ext_sampler); + } + + self + } + + /// Sample and accept a token from the idx-th output of the last evaluation + #[must_use] + pub fn sample(&self, ctx: &LlamaContext, idx: i32) -> LlamaToken { + let token = unsafe { + llama_cpp_sys_2::llama_sampler_sample(self.sampler.as_ptr(), ctx.context.as_ptr(), idx) + }; + + LlamaToken(token) + } + + /// Accepts a token from the sampler, possibly updating the internal state of certain samplers (e.g. grammar, repetition, etc.) + pub fn accept(&mut self, token: LlamaToken) { + unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler.as_ptr(), token.0) } + } } impl Drop for LlamaSampler { diff --git a/llama-cpp-2/src/sampling/params.rs b/llama-cpp-2/src/sampling/params.rs index 972df840..0e67c1fa 100644 --- a/llama-cpp-2/src/sampling/params.rs +++ b/llama-cpp-2/src/sampling/params.rs @@ -1,7 +1,6 @@ -//! Safe wrapper around `llama_sampler`. +//! Safe wrapper around `llama_sampler_chain_params`. use std::fmt::{Debug, Formatter}; -use std::ptr::NonNull; /// A safe wrapper around `llama_sampler`. 
pub struct LlamaSamplerChainParams { @@ -25,11 +24,15 @@ impl Default for LlamaSamplerChainParams { } impl LlamaSamplerChainParams { - pub fn with_no_perf(&mut self, no_perf: bool) -> &mut Self { + /// Set whether to measure performance timings + #[must_use] + pub fn with_no_perf(mut self, no_perf: bool) -> Self { self.sampler_chain_params.no_perf = no_perf; self } + /// Get whether to measure performance timings + #[must_use] pub fn no_perf(&self) -> bool { self.sampler_chain_params.no_perf } From 5f639d9b331f79bfeaab753aa81632976be4fba5 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Wed, 27 Nov 2024 20:58:14 +0000 Subject: [PATCH 019/193] Bump version to 0.1.85 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8f340449..7b0d1557 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.84" +version = "0.1.85" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.84" +version = "0.1.85" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.84" +version = "0.1.85" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.84" +version = "0.1.85" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 44f9c1ba..5ce1f06b 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.84" +version = "0.1.85" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 1d27ef3f..a1f8ac52 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.84" +version = "0.1.85" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 6f2c5f5e..28f36877 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.84" +version = "0.1.85" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index ec7285f0..eb0713a7 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.84" +version = "0.1.85" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 2d12c198e9e1ca3b79d5213956f06eecf4c9616a Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Thu, 28 Nov 2024 14:39:02 +0100 Subject: [PATCH 020/193] bump llama.cpp to b4206 --- llama-cpp-2/src/llama_batch.rs | 8 ++------ llama-cpp-sys-2/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git 
a/llama-cpp-2/src/llama_batch.rs b/llama-cpp-2/src/llama_batch.rs index 8a2fd376..153f5d52 100644 --- a/llama-cpp-2/src/llama_batch.rs +++ b/llama-cpp-2/src/llama_batch.rs @@ -157,17 +157,13 @@ impl LlamaBatch { /// /// NOTE: this is a helper function to facilitate transition to the new batch API /// - pub fn get_one( - tokens: &[LlamaToken], - pos_0: llama_pos, - seq_id: llama_seq_id, - ) -> Result { + pub fn get_one(tokens: &[LlamaToken]) -> Result { if tokens.is_empty() { return Err(BatchAddError::EmptyBuffer); } let batch = unsafe { let ptr = tokens.as_ptr() as *mut i32; - llama_cpp_sys_2::llama_batch_get_one(ptr, tokens.len() as i32, pos_0, seq_id) + llama_cpp_sys_2::llama_batch_get_one(ptr, tokens.len() as i32) }; let batch = Self { allocated: 0, diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 0abc6a2c..2025fa67 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 0abc6a2c25272d5cf01384dda8ee8bfec4ba8745 +Subproject commit 2025fa67e94358deda4740a74fe9803916cb2f60 From b18a86cd1f06abf3eec5277e7969b5022304402f Mon Sep 17 00:00:00 2001 From: Jonathan Avila Date: Mon, 2 Dec 2024 00:02:53 -0700 Subject: [PATCH 021/193] Add greedy decoding to example 'simple' --- examples/simple/src/main.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index ec8f87d4..73932d37 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -247,7 +247,9 @@ either reduce n_len or increase n_ctx" let mut decoder = encoding_rs::UTF_8.new_decoder(); let sampler_params = LlamaSamplerChainParams::default(); - let mut sampler = LlamaSampler::new(sampler_params)?.add_dist(seed.unwrap_or(1234)); + let mut sampler = LlamaSampler::new(sampler_params)? + .add_dist(seed.unwrap_or(1234)) + .add_greedy(); while n_cur <= n_len { // sample the next token From 3558a203dd1b2a56b57ae9fb6055d0ff1f68ec37 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Mon, 2 Dec 2024 16:05:01 +0100 Subject: [PATCH 022/193] bump llama.cpp to b4240 --- llama-cpp-sys-2/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 2025fa67..64ed2091 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 2025fa67e94358deda4740a74fe9803916cb2f60 +Subproject commit 64ed2091b24b2f9747148fdf49a34ed5938762c3 From f7e185a062d90e45d29472485864e6439334913d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:37:10 +0000 Subject: [PATCH 023/193] chore(deps): bump thiserror from 1.0.64 to 1.0.69 Bumps [thiserror](https://github.com/dtolnay/thiserror) from 1.0.64 to 1.0.69. - [Release notes](https://github.com/dtolnay/thiserror/releases) - [Commits](https://github.com/dtolnay/thiserror/compare/1.0.64...1.0.69) --- updated-dependencies: - dependency-name: thiserror dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7b0d1557..660b06ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1099,9 +1099,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" -version = "2.0.66" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -1133,18 +1133,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", From 4ead0c7574b75b6926a96ea20debfd2abb0ce9d4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:37:12 +0000 Subject: [PATCH 024/193] chore(deps): bump cmake from 0.1.51 to 0.1.52 Bumps [cmake](https://github.com/rust-lang/cmake-rs) from 0.1.51 to 0.1.52. - [Release notes](https://github.com/rust-lang/cmake-rs/releases) - [Changelog](https://github.com/rust-lang/cmake-rs/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cmake-rs/compare/v0.1.51...v0.1.52) --- updated-dependencies: - dependency-name: cmake dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7b0d1557..b13b98ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -186,9 +186,9 @@ checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" [[package]] name = "cmake" -version = "0.1.51" +version = "0.1.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +checksum = "c682c223677e0e5b6b7f63a64b9351844c3f1b1678a68b7ee617e30fb082620e" dependencies = [ "cc", ] From 2d71eedb52da831ca67903f018b9a89bef39bdb0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:37:14 +0000 Subject: [PATCH 025/193] chore(deps): bump cc from 1.1.28 to 1.2.2 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.1.28 to 1.2.2. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.1.28...cc-v1.2.2) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7b0d1557..c42a7cca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.1.28" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" +checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 505bec73..b569df8e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.1.28" +cc = "1.2.2" anyhow = "1.0.93" clap = "4.5.19" encoding_rs = "0.8.34" From 9c38aa7ed92d9320f4b533ec0b865bbf8277473c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:37:15 +0000 Subject: [PATCH 026/193] chore(deps): bump tracing from 0.1.40 to 0.1.41 Bumps [tracing](https://github.com/tokio-rs/tracing) from 0.1.40 to 0.1.41. - [Release notes](https://github.com/tokio-rs/tracing/releases) - [Commits](https://github.com/tokio-rs/tracing/compare/tracing-0.1.40...tracing-0.1.41) --- updated-dependencies: - dependency-name: tracing dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7b0d1557..8bed0b5d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1163,9 +1163,9 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -1174,9 +1174,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", @@ -1185,9 +1185,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", ] From d8ea6badc86b911d73d96c73512596a09415c8ba Mon Sep 17 00:00:00 2001 From: J / Jacob Babich Date: Tue, 3 Dec 2024 13:18:07 -0500 Subject: [PATCH 027/193] update the `include`d files in `llama-cpp-sys-2` so that the crate can be published again --- llama-cpp-sys-2/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index eb0713a7..def35e53 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -31,10 +31,11 @@ include = [ "/llama.cpp/ggml/src/ggml-metal.metal", "/llama.cpp/include/llama.h", + 
"/llama.cpp/include/llama-cpp.h", + "/llama.cpp/ggml/src/ggml-cpu/**/*", "/llama.cpp/ggml/src/ggml-cuda/**/*", - - "/llama.cpp/ggml/src/vulkan-shaders/**/*", + "/llama.cpp/ggml/src/ggml-vulkan/**/*", "/llama.cpp/ggml/src/llamafile/sgemm.h", "/llama.cpp/ggml/src/llamafile/sgemm.cpp", @@ -45,7 +46,6 @@ include = [ "/llama.cpp/common/CMakeLists.txt", "/llama.cpp/ggml/CMakeLists.txt", "/llama.cpp/ggml/src/CMakeLists.txt", - "/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt", "/llama.cpp/src/CMakeLists.txt", "/llama.cpp/cmake", From fc3c3b5fdd53eefc8970e256d9a42ed2bfc324a3 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Wed, 4 Dec 2024 10:28:09 -0600 Subject: [PATCH 028/193] Add sampling API back to LlamaTokenDataArray --- llama-cpp-2/src/sampling.rs | 87 +++++++- llama-cpp-2/src/token/data_array.rs | 334 +++++++++++++++++++++++++++- llama-cpp-sys-2/build.rs | 13 +- 3 files changed, 422 insertions(+), 12 deletions(-) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 7181e149..89b9bc1c 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -7,6 +7,7 @@ use std::ptr::NonNull; use crate::context::LlamaContext; use crate::model::LlamaModel; +use crate::token::data_array::LlamaTokenDataArray; use crate::token::LlamaToken; use crate::LlamaSamplerError; @@ -132,11 +133,22 @@ impl LlamaSampler { self } + /// XTC sampling as described in . + #[must_use] + #[allow(unused_mut)] + pub fn add_xtc(mut self, p: f32, t: f32, min_keep: usize, seed: u32) -> Self { + unsafe { + let xtc_sampler = llama_cpp_sys_2::llama_sampler_init_xtc(p, t, min_keep, seed); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), xtc_sampler); + } + + self + } + /// Mirostat 1.0 algorithm described in the paper . Uses tokens instead of words. /// /// # Arguments /// - /// * `candidates` - A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// * `tau` - The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// * `eta` - The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// * `m` - The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. @@ -157,7 +169,6 @@ impl LlamaSampler { /// /// # Arguments /// - /// * `candidates` - A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// * `tau` - The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// * `eta` - The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. 
A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// * `mu` - Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. @@ -231,6 +242,74 @@ impl LlamaSampler { self } + /// Adds penalties to the sampler. This can be used to penalize certain patterns in the generated text, such as repeating the same token multiple times or using the same token too frequently. + #[allow(unused_mut)] + #[must_use] + pub fn add_penalties_simple( + mut self, + model: &LlamaModel, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + ) -> Self { + self.add_penalties( + model.n_vocab(), + model.token_eos().0, + model.token_nl().0, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + false, + true, + ) + } + + /// Adds DRY repetition penalty to the sampler. + /// + /// DRY sampler, designed by p-e-w, as described in: , porting Koboldcpp implementation authored by pi6am: + #[allow(unused_mut)] + #[must_use] + pub fn add_dry( + mut self, + model: &LlamaModel, + dry_multiplier: f32, + dry_base: f32, + dry_allowed_length: i32, + dry_penalty_last_n: i32, + seq_breakers: &[impl AsRef<[u8]>], + ) -> Self { + let seq_breakers: Vec = seq_breakers + .iter() + .map(|s| { + let bytes = s.as_ref(); + let null_byte = bytes.iter().position(|b| *b == 0).unwrap_or(bytes.len()); + CString::new(&bytes[..null_byte]).expect("Failed to slice away null bytes!") + }) + .collect(); + + let mut seq_breaker_pointers: Vec<*const i8> = + seq_breakers.iter().map(|s| s.as_ptr()).collect(); + + unsafe { + // Memory safety: llama_sampler_init_dry does not hold a reference to + // seq_breaker_pointers, so this will not UAF in future operations. + let dry_sampler = llama_cpp_sys_2::llama_sampler_init_dry( + model.model.as_ptr(), + dry_multiplier, + dry_base, + dry_allowed_length, + dry_penalty_last_n, + seq_breaker_pointers.as_mut_ptr(), + seq_breaker_pointers.len(), + ); + llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), dry_sampler); + } + + self + } + /// Sample and accept a token from the idx-th output of the last evaluation #[must_use] pub fn sample(&self, ctx: &LlamaContext, idx: i32) -> LlamaToken { @@ -241,6 +320,10 @@ impl LlamaSampler { LlamaToken(token) } + pub fn apply(&mut self, data_array: &mut LlamaTokenDataArray) { + unsafe { data_array.apply_sampler(self.sampler.as_ptr()) } + } + /// Accepts a token from the sampler, possibly updating the internal state of certain samplers (e.g. grammar, repetition, etc.) pub fn accept(&mut self, token: LlamaToken) { unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler.as_ptr(), token.0) } diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index d9693049..01b24329 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -1,5 +1,9 @@ //! an rusty equivalent of `llama_token_data`. -use crate::token::data::LlamaTokenData; +use std::{ffi::CString, ptr}; + +use crate::{model::LlamaModel, token::data::LlamaTokenData}; + +use super::LlamaToken; /// a safe wrapper around `llama_token_data_array`. #[derive(Debug, Clone, PartialEq)] @@ -7,12 +11,14 @@ use crate::token::data::LlamaTokenData; pub struct LlamaTokenDataArray { /// the underlying data pub data: Vec, + /// the selected token + pub selected: i64, /// is the data sorted? 
pub sorted: bool, } impl LlamaTokenDataArray { - /// Create a new `LlamaTokenDataArray` from a vector and weather or not the data is sorted. + /// Create a new `LlamaTokenDataArray` from a vector and whether or not the data is sorted. /// /// ``` /// # use llama_cpp_2::token::data::LlamaTokenData; @@ -27,10 +33,14 @@ impl LlamaTokenDataArray { /// ``` #[must_use] pub fn new(data: Vec, sorted: bool) -> Self { - Self { data, sorted } + Self { + data, + selected: -1, + sorted, + } } - /// Create a new `LlamaTokenDataArray` from an iterator and weather or not the data is sorted. + /// Create a new `LlamaTokenDataArray` from an iterator and whether or not the data is sorted. /// ``` /// # use llama_cpp_2::token::data::LlamaTokenData; /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; @@ -47,4 +57,320 @@ impl LlamaTokenDataArray { { Self::new(data.into_iter().collect(), sorted) } + + #[must_use] + pub fn selected_token(&self) -> Option { + self.data + .get(usize::try_from(self.selected).ok()?) + .map(LlamaTokenData::id) + } +} + +impl LlamaTokenDataArray { + /// Modify the underlying data as a `llama_token_data_array`. and reconstruct the `LlamaTokenDataArray`. + /// + /// # Panics + /// + /// Panics if some of the safety conditions are not met. (we cannot check all of them at runtime so breaking them is UB) + /// + /// SAFETY: + /// [modify] cannot change the data pointer. + /// if the data is not sorted, sorted must be false. + /// the size of the data can only decrease (i.e you cannot add new elements). + pub(crate) unsafe fn modify_as_c_llama_token_data_array( + &mut self, + modify: impl FnOnce(&mut llama_cpp_sys_2::llama_token_data_array) -> T, + ) -> T { + let size = self.data.len(); + let data = self.data.as_mut_ptr().cast(); + let mut c_llama_token_data_array = llama_cpp_sys_2::llama_token_data_array { + data, + size, + selected: self.selected, + sorted: self.sorted, + }; + let result = modify(&mut c_llama_token_data_array); + assert!( + ptr::eq(data, c_llama_token_data_array.data), + "data pointer changed" + ); + assert!(c_llama_token_data_array.size <= size, "size increased"); + self.data.set_len(c_llama_token_data_array.size); + self.sorted = c_llama_token_data_array.sorted; + self.selected = c_llama_token_data_array.selected; + result + } + + pub(crate) unsafe fn apply_sampler(&mut self, sampler: *mut llama_cpp_sys_2::llama_sampler) { + self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { + llama_cpp_sys_2::llama_sampler_apply(sampler, c_llama_token_data_array); + }) + } + + pub(crate) unsafe fn apply_and_free_sampler( + &mut self, + sampler_fn: impl FnOnce() -> *mut llama_cpp_sys_2::llama_sampler, + ) { + let sampler = sampler_fn(); + self.apply_sampler(sampler); + llama_cpp_sys_2::llama_sampler_free(sampler); + } + + /// Modify the logits of [`Self`] in place using temperature sampling. 
+ /// + /// # Example + /// + /// ```rust + /// # use llama_cpp_2::token::data::LlamaTokenData; + /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; + /// # use llama_cpp_2::token::LlamaToken; + /// + /// let candidates = vec![ + /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), + /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), + /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0) + /// ]; + /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); + /// + /// candidates.sample_temp(0.5); + /// + /// assert_eq!(candidates.data[0].logit(), 0.2); + /// assert_eq!(candidates.data[1].logit(), 0.4); + /// assert_eq!(candidates.data[2].logit(), 1.4); + /// ``` + pub fn sample_temp(&mut self, temperature: f32) { + unsafe { + self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_temp(temperature)); + } + } + + /// Dynamic temperature implementation (a.k.a. entropy) described in the paper . + pub fn sample_temp_ext(&mut self, t: f32, delta: f32, exponent: f32) { + unsafe { + self.apply_and_free_sampler(|| { + llama_cpp_sys_2::llama_sampler_init_temp_ext(t, delta, exponent) + }); + } + } + + /// Top-K sampling described in academic paper [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) + pub fn sample_top_k(&mut self, k: i32) { + unsafe { + self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_top_k(k)); + } + } + + /// Locally Typical Sampling implementation described in the [paper](https://arxiv.org/abs/2202.00666). + /// + /// # Example + /// + /// ```rust + /// # use llama_cpp_2::token::data::LlamaTokenData; + /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; + /// # use llama_cpp_2::token::LlamaToken; + /// + /// let candidates = vec![ + /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), + /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), + /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), + /// ]; + /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); + /// candidates.sample_typical(0.5, 1); + /// ``` + pub fn sample_typical(&mut self, p: f32, min_keep: usize) { + unsafe { + self.apply_and_free_sampler(|| { + llama_cpp_sys_2::llama_sampler_init_typical(p, min_keep) + }); + } + } + + /// Nucleus sampling described in academic paper [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) + /// + /// # Example + /// + /// ```rust + /// + /// # use llama_cpp_2::token::data::LlamaTokenData; + /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; + /// # use llama_cpp_2::token::LlamaToken; + /// + /// let candidates = vec![ + /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), + /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), + /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), + /// ]; + /// + /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); + /// candidates.sample_top_p(0.5, 1); + /// + /// assert_eq!(candidates.data.len(), 2); + /// assert_eq!(candidates.data[0].id(), LlamaToken::new(2)); + /// assert_eq!(candidates.data[1].id(), LlamaToken::new(1)); + /// ``` + pub fn sample_top_p(&mut self, p: f32, min_keep: usize) { + unsafe { + self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_top_p(p, min_keep)); + } + } + + /// Minimum P sampling as described in [#3841](https://github.com/ggerganov/llama.cpp/pull/3841) + /// + /// # Example + /// + /// ``` + /// # use llama_cpp_2::token::data::LlamaTokenData; + /// # use 
llama_cpp_2::token::data_array::LlamaTokenDataArray; + /// # use llama_cpp_2::token::LlamaToken; + /// + /// let candidates = vec![ + /// LlamaTokenData::new(LlamaToken::new(4), -2., 0.0), + /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), + /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), + /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), + /// ]; + /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); + /// candidates.sample_min_p(0.1, 1); + /// + /// assert_eq!(candidates.data.len(), 3); + /// ``` + pub fn sample_min_p(&mut self, p: f32, min_keep: usize) { + unsafe { + self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_min_p(p, min_keep)); + } + } + + /// XTC sampling as described in . + pub fn sample_xtc(&mut self, p: f32, t: f32, min_keep: usize, seed: u32) { + unsafe { + self.apply_and_free_sampler(|| { + llama_cpp_sys_2::llama_sampler_init_xtc(p, t, min_keep, seed) + }); + } + } + + /// This can be used to penalize certain patterns in the generated text, such as repeating the same token multiple times or using the same token too frequently. + #[allow(clippy::too_many_arguments)] + pub fn sample_penalties( + &mut self, + tokens: &[LlamaToken], + n_vocab: i32, + special_eos_id: i32, + linefeed_id: i32, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + penalize_nl: bool, + ignore_eos: bool, + ) { + unsafe { + self.apply_and_free_sampler(|| { + let sampler = llama_cpp_sys_2::llama_sampler_init_penalties( + n_vocab, + special_eos_id, + linefeed_id, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + penalize_nl, + ignore_eos, + ); + + for token in tokens { + llama_cpp_sys_2::llama_sampler_accept(sampler, token.0); + } + + sampler + }); + } + } + + /// This can be used to penalize certain patterns in the generated text, such as repeating the same token multiple times or using the same token too frequently. + pub fn sample_penalties_simple( + &mut self, + tokens: &[LlamaToken], + model: &LlamaModel, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + ) { + self.sample_penalties( + tokens, + model.n_vocab(), + model.token_eos().0, + model.token_nl().0, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + false, + true, + ); + } + + /// DRY sampler, designed by p-e-w, as described in: , porting Koboldcpp implementation authored by pi6am: + #[allow(clippy::too_many_arguments)] + pub fn sample_dry( + &mut self, + tokens: &[LlamaToken], + model: &LlamaModel, + dry_multiplier: f32, + dry_base: f32, + dry_allowed_length: i32, + dry_penalty_last_n: i32, + seq_breakers: &[impl AsRef<[u8]>], + ) { + let seq_breakers: Vec = seq_breakers + .iter() + .map(|s| { + let bytes = s.as_ref(); + let null_byte = bytes.iter().position(|b| *b == 0).unwrap_or(bytes.len()); + CString::new(&bytes[..null_byte]).expect("Failed to slice away null bytes!") + }) + .collect(); + + let mut seq_breaker_pointers: Vec<*const i8> = + seq_breakers.iter().map(|s| s.as_ptr()).collect(); + + unsafe { + self.apply_and_free_sampler(|| { + let sampler = llama_cpp_sys_2::llama_sampler_init_dry( + model.model.as_ptr(), + dry_multiplier, + dry_base, + dry_allowed_length, + dry_penalty_last_n, + seq_breaker_pointers.as_mut_ptr(), + seq_breaker_pointers.len(), + ); + + for token in tokens { + llama_cpp_sys_2::llama_sampler_accept(sampler, token.0); + } + + sampler + }); + } + } + + /// Randomly selects a token from the candidates based on their probabilities. 
+ pub fn sample_token(&mut self, seed: u32) -> LlamaToken { + unsafe { + self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_dist(seed)); + } + self.selected_token() + .expect("Dist sampler failed to select a token!") + } + + /// Selects the token with the highest probability. + pub fn sample_token_greedy(&mut self) -> LlamaToken { + unsafe { + self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_greedy()); + } + self.selected_token() + .expect("Greedy sampler failed to select a token!") + } } diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index fe3d8aa9..f5473769 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -143,7 +143,6 @@ fn macos_link_search_path() -> Option { } fn main() { - let target = env::var("TARGET").unwrap(); let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); @@ -196,7 +195,6 @@ fn main() { .generate() .expect("Failed to generate bindings"); - // Write the generated bindings to an output file let bindings_path = out_dir.join("bindings.rs"); bindings @@ -231,12 +229,12 @@ fn main() { if cfg!(windows) { config.static_crt(static_crt); } - if cfg!(feature = "vulkan") { config.define("GGML_VULKAN", "ON"); if cfg!(windows) { - let vulkan_path = env::var("VULKAN_SDK").expect("Please install Vulkan SDK and ensure that VULKAN_SDK env variable is set"); + let vulkan_path = env::var("VULKAN_SDK") + .expect("Please install Vulkan SDK and ensure that VULKAN_SDK env variable is set"); let vulkan_lib_path = Path::new(&vulkan_path).join("Lib"); println!("cargo:rustc-link-search={}", vulkan_lib_path.display()); println!("cargo:rustc-link-lib=vulkan-1"); @@ -265,7 +263,10 @@ fn main() { // Search paths println!("cargo:rustc-link-search={}", out_dir.join("lib").display()); - println!("cargo:rustc-link-search={}", out_dir.join("lib64").display()); + println!( + "cargo:rustc-link-search={}", + out_dir.join("lib64").display() + ); println!("cargo:rustc-link-search={}", build_dir.display()); // Link libraries @@ -332,7 +333,7 @@ fn main() { debug_log!("HARD LINK {} TO {}", asset.display(), dst.display()); if !dst.exists() { std::fs::hard_link(asset.clone(), dst).unwrap(); - } + } // Copy DLLs to examples as well if target_dir.join("examples").exists() { From 25c8e1d0ca94b1c5fbdfd5afefbdc6ae8fc14c92 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Wed, 4 Dec 2024 10:45:28 -0600 Subject: [PATCH 029/193] Add convience methods for getting LlamaTokenDataArrays from LlamaContexts --- llama-cpp-2/src/context.rs | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index cdebb88a..549a1559 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -9,6 +9,7 @@ use crate::llama_batch::LlamaBatch; use crate::model::{LlamaLoraAdapter, LlamaModel}; use crate::timing::LlamaTimings; use crate::token::data::LlamaTokenData; +use crate::token::data_array::LlamaTokenDataArray; use crate::token::LlamaToken; use crate::{ DecodeError, EmbeddingsError, EncodeError, LlamaLoraAdapterRemoveError, @@ -202,6 +203,27 @@ impl<'model> LlamaContext<'model> { }) } + /// Get the token data array for the last token in the context. 
+ /// + /// This is a convience method that implements: + /// ```no_run + /// LlamaTokenDataArray::from_iter( + /// self.candidates(), + /// false, + /// ) + /// ``` + /// + /// # Panics + /// + /// - underlying logits data is null + #[must_use] + pub fn token_data_array(&self) -> LlamaTokenDataArray { + LlamaTokenDataArray::from_iter( + self.candidates(), + false, + ) + } + /// Token logits obtained from the last call to `decode()`. /// The logits for which `batch.logits[i] != 0` are stored contiguously /// in the order they have appeared in the batch. @@ -217,6 +239,7 @@ impl<'model> LlamaContext<'model> { /// /// - `n_vocab` does not fit into a usize /// - token data returned is null + #[must_use] pub fn get_logits(&self) -> &[f32] { let data = unsafe { llama_cpp_sys_2::llama_get_logits(self.context.as_ptr()) }; assert!(!data.is_null(), "logits data for last token is null"); @@ -237,6 +260,27 @@ impl<'model> LlamaContext<'model> { }) } + /// Get the token data array for the ith token in the context. + /// + /// This is a convience method that implements: + /// ```no_run + /// LlamaTokenDataArray::from_iter( + /// self.candidates_ith(i), + /// false, + /// ) + /// ``` + /// + /// # Panics + /// + /// - logit `i` is not initialized. + #[must_use] + pub fn token_data_array_ith(&self, i: i32) -> LlamaTokenDataArray { + LlamaTokenDataArray::from_iter( + self.candidates_ith(i), + false, + ) + } + /// Get the logits for the ith token in the context. /// /// # Panics From d61858a0e41795edd43ee5192d4aede6d78505d4 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Wed, 4 Dec 2024 10:46:50 -0600 Subject: [PATCH 030/193] Run cargo fmt --- llama-cpp-2/src/context.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index 549a1559..fd8b5feb 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -218,10 +218,7 @@ impl<'model> LlamaContext<'model> { /// - underlying logits data is null #[must_use] pub fn token_data_array(&self) -> LlamaTokenDataArray { - LlamaTokenDataArray::from_iter( - self.candidates(), - false, - ) + LlamaTokenDataArray::from_iter(self.candidates(), false) } /// Token logits obtained from the last call to `decode()`. @@ -275,10 +272,7 @@ impl<'model> LlamaContext<'model> { /// - logit `i` is not initialized. #[must_use] pub fn token_data_array_ith(&self, i: i32) -> LlamaTokenDataArray { - LlamaTokenDataArray::from_iter( - self.candidates_ith(i), - false, - ) + LlamaTokenDataArray::from_iter(self.candidates_ith(i), false) } /// Get the logits for the ith token in the context. From 98733b89e20f249c333aefe8349576475025a609 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 5 Dec 2024 00:32:03 +0000 Subject: [PATCH 031/193] Bump version to 0.1.86 [skip ci] --- Cargo.lock | 10 +++++----- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0265ee13..b2fdcdd8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "adler" @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.85" +version = "0.1.86" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.85" +version = "0.1.86" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.85" +version = "0.1.86" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.85" +version = "0.1.86" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 5ce1f06b..64cd7814 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.85" +version = "0.1.86" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index a1f8ac52..4ec54baf 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.85" +version = "0.1.86" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 28f36877..1a583910 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.85" +version = "0.1.86" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index def35e53..ed3f131d 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.85" +version = "0.1.86" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From f373cf4de3433bd302b92ced01ef64ac59280603 Mon Sep 17 00:00:00 2001 From: J / Jacob Babich Date: Wed, 4 Dec 2024 20:46:19 -0500 Subject: [PATCH 032/193] Include `llama.cpp/ggml/src/ggml-metal` when publishing `llama-cpp-sys-2` --- llama-cpp-sys-2/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index def35e53..2aef4233 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -35,6 +35,7 @@ include = [ "/llama.cpp/ggml/src/ggml-cpu/**/*", "/llama.cpp/ggml/src/ggml-cuda/**/*", + "/llama.cpp/ggml/src/ggml-metal/**/*", "/llama.cpp/ggml/src/ggml-vulkan/**/*", "/llama.cpp/ggml/src/llamafile/sgemm.h", From ff7bddca400719a1a6f7362e5e7c45d28bc6d130 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 5 Dec 2024 14:53:55 +0000 Subject: [PATCH 033/193] chore(deps): bump clap from 4.5.19 to 4.5.22 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.19 to 4.5.22. 
- [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.19...clap_complete-v4.5.22) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b2fdcdd8..27a0f42c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.19" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" +checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.19" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" +checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index b569df8e..77b18a8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.2" anyhow = "1.0.93" -clap = "4.5.19" +clap = "4.5.22" encoding_rs = "0.8.34" [workspace.lints.rust] From 6d3dec96e58994ec10144d2a02e42cb4a91919f8 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Thu, 5 Dec 2024 12:47:41 -0600 Subject: [PATCH 034/193] Small documentation improvement --- llama-cpp-2/src/context.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index fd8b5feb..d7078dd7 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -207,10 +207,7 @@ impl<'model> LlamaContext<'model> { /// /// This is a convience method that implements: /// ```no_run - /// LlamaTokenDataArray::from_iter( - /// self.candidates(), - /// false, - /// ) + /// LlamaTokenDataArray::from_iter(ctx.candidates(), false) /// ``` /// /// # Panics @@ -261,10 +258,7 @@ impl<'model> LlamaContext<'model> { /// /// This is a convience method that implements: /// ```no_run - /// LlamaTokenDataArray::from_iter( - /// self.candidates_ith(i), - /// false, - /// ) + /// LlamaTokenDataArray::from_iter(ctx.candidates_ith(i), false) /// ``` /// /// # Panics From ca071709f4c77b911328b1d3509cd3d8026e4d09 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 7 Dec 2024 10:39:15 -0600 Subject: [PATCH 035/193] Make LlamaTokenDataArray::selected an Option --- llama-cpp-2/src/context.rs | 4 ++-- llama-cpp-2/src/token/data_array.rs | 31 +++++++++++++++++++---------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index d7078dd7..8946da2b 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -206,7 +206,7 @@ impl<'model> LlamaContext<'model> { /// Get the token data array for the last token in the context. 
/// /// This is a convience method that implements: - /// ```no_run + /// ```ignore /// LlamaTokenDataArray::from_iter(ctx.candidates(), false) /// ``` /// @@ -257,7 +257,7 @@ impl<'model> LlamaContext<'model> { /// Get the token data array for the ith token in the context. /// /// This is a convience method that implements: - /// ```no_run + /// ```ignore /// LlamaTokenDataArray::from_iter(ctx.candidates_ith(i), false) /// ``` /// diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index 01b24329..090c866e 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -1,4 +1,4 @@ -//! an rusty equivalent of `llama_token_data`. +//! an rusty equivalent of `llama_token_data_array`. use std::{ffi::CString, ptr}; use crate::{model::LlamaModel, token::data::LlamaTokenData}; @@ -11,8 +11,8 @@ use super::LlamaToken; pub struct LlamaTokenDataArray { /// the underlying data pub data: Vec, - /// the selected token - pub selected: i64, + /// the index of the selected token in ``data`` + pub selected: Option, /// is the data sorted? pub sorted: bool, } @@ -35,7 +35,7 @@ impl LlamaTokenDataArray { pub fn new(data: Vec, sorted: bool) -> Self { Self { data, - selected: -1, + selected: None, sorted, } } @@ -60,9 +60,7 @@ impl LlamaTokenDataArray { #[must_use] pub fn selected_token(&self) -> Option { - self.data - .get(usize::try_from(self.selected).ok()?) - .map(LlamaTokenData::id) + self.data.get(self.selected?).map(LlamaTokenData::id) } } @@ -82,29 +80,40 @@ impl LlamaTokenDataArray { modify: impl FnOnce(&mut llama_cpp_sys_2::llama_token_data_array) -> T, ) -> T { let size = self.data.len(); - let data = self.data.as_mut_ptr().cast(); + let data = self + .data + .as_mut_ptr() + .cast::(); + let mut c_llama_token_data_array = llama_cpp_sys_2::llama_token_data_array { data, size, - selected: self.selected, + selected: self.selected.and_then(|s| s.try_into().ok()).unwrap_or(-1), sorted: self.sorted, }; + let result = modify(&mut c_llama_token_data_array); assert!( ptr::eq(data, c_llama_token_data_array.data), "data pointer changed" ); assert!(c_llama_token_data_array.size <= size, "size increased"); + self.data.set_len(c_llama_token_data_array.size); self.sorted = c_llama_token_data_array.sorted; - self.selected = c_llama_token_data_array.selected; + self.selected = c_llama_token_data_array + .selected + .try_into() + .ok() + .filter(|&s| s < self.data.len()); + result } pub(crate) unsafe fn apply_sampler(&mut self, sampler: *mut llama_cpp_sys_2::llama_sampler) { self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { llama_cpp_sys_2::llama_sampler_apply(sampler, c_llama_token_data_array); - }) + }); } pub(crate) unsafe fn apply_and_free_sampler( From 27ebd829b946f2f4b08046287a4ea0524b38f6ad Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 7 Dec 2024 13:16:34 -0600 Subject: [PATCH 036/193] Overhaul sampling API --- examples/simple/src/main.rs | 10 +- llama-cpp-2/src/lib.rs | 8 - llama-cpp-2/src/sampling.rs | 374 ++++++++-------------------- llama-cpp-2/src/sampling/params.rs | 180 +++++++++++-- llama-cpp-2/src/token/data_array.rs | 271 ++------------------ 5 files changed, 280 insertions(+), 563 deletions(-) diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index 73932d37..f31a83c4 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -17,7 +17,7 @@ use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; use llama_cpp_2::model::params::LlamaModelParams; use 
llama_cpp_2::model::LlamaModel; use llama_cpp_2::model::{AddBos, Special}; -use llama_cpp_2::sampling::params::LlamaSamplerChainParams; +use llama_cpp_2::sampling::params::LlamaSamplerParams; use llama_cpp_2::sampling::LlamaSampler; use std::ffi::CString; @@ -246,10 +246,10 @@ either reduce n_len or increase n_ctx" // The `Decoder` let mut decoder = encoding_rs::UTF_8.new_decoder(); - let sampler_params = LlamaSamplerChainParams::default(); - let mut sampler = LlamaSampler::new(sampler_params)? - .add_dist(seed.unwrap_or(1234)) - .add_greedy(); + let mut sampler = LlamaSampler::new(LlamaSamplerParams::chain(&[ + LlamaSamplerParams::Dist { seed: seed.unwrap_or(1234) }, + LlamaSamplerParams::Greedy, + ])); while n_cur <= n_len { // sample the next token diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 424572bd..8e09608f 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -195,14 +195,6 @@ pub enum LlamaLoraAdapterRemoveError { ErrorResult(i32), } -/// An error that can occur when initializing a sampler. -#[derive(Debug, Eq, PartialEq, thiserror::Error)] -pub enum LlamaSamplerError { - /// llama.cpp returned null - #[error("null reference from llama.cpp")] - NullReturn, -} - /// get the time (in microseconds) according to llama.cpp /// ``` /// # use llama_cpp_2::llama_time_us; diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 89b9bc1c..e0313f4e 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -3,17 +3,16 @@ pub mod params; use std::ffi::CString; use std::fmt::{Debug, Formatter}; -use std::ptr::NonNull; use crate::context::LlamaContext; -use crate::model::LlamaModel; use crate::token::data_array::LlamaTokenDataArray; use crate::token::LlamaToken; -use crate::LlamaSamplerError; + +use params::LlamaSamplerParams; /// A safe wrapper around `llama_sampler`. pub struct LlamaSampler { - pub(crate) sampler: NonNull, + pub(crate) sampler: *mut llama_cpp_sys_2::llama_sampler, } impl Debug for LlamaSampler { @@ -22,318 +21,145 @@ impl Debug for LlamaSampler { } } -impl LlamaSampler { - /// Create a new `LlamaSampler` from the given parameters. - /// # Errors - /// Returns an error if the underlying C++ code returns a null pointer. - pub fn new(params: params::LlamaSamplerChainParams) -> Result { - let sampler = unsafe { - NonNull::new(llama_cpp_sys_2::llama_sampler_chain_init( - params.sampler_chain_params, - )) - .ok_or(LlamaSamplerError::NullReturn) - }?; - - Ok(Self { sampler }) - } - - /// Samples the token with the largest probability. - #[must_use] - #[allow(unused_mut)] - pub fn add_greedy(mut self) -> Self { - unsafe { - let greedy_sampler = llama_cpp_sys_2::llama_sampler_init_greedy(); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), greedy_sampler); - } - - self - } - - /// Samples according to the probability distribution of the tokens. 
- #[must_use] - #[allow(unused_mut)] - pub fn add_dist(mut self, seed: u32) -> Self { - unsafe { - let dist_sampler = llama_cpp_sys_2::llama_sampler_init_dist(seed); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), dist_sampler); - } - - self - } - - /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" - #[must_use] - #[allow(unused_mut)] - pub fn add_top_k(mut self, k: i32) -> Self { - unsafe { - let top_k_sampler = llama_cpp_sys_2::llama_sampler_init_top_k(k); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), top_k_sampler); - } - - self - } - - /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" - #[must_use] - #[allow(unused_mut)] - pub fn add_top_p(mut self, p: f32, min_keep: usize) -> Self { - unsafe { - let top_p_sampler = llama_cpp_sys_2::llama_sampler_init_top_p(p, min_keep); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), top_p_sampler); - } +unsafe fn new_inner(params: LlamaSamplerParams) -> *mut llama_cpp_sys_2::llama_sampler { + match params { + LlamaSamplerParams::Chain { no_perf, stages } => { + let chain = llama_cpp_sys_2::llama_sampler_chain_init( + llama_cpp_sys_2::llama_sampler_chain_params { no_perf }, + ); - self - } + for stage in stages { + llama_cpp_sys_2::llama_sampler_chain_add(chain, new_inner(*stage)); + } - /// Minimum P sampling as described in - #[must_use] - #[allow(unused_mut)] - pub fn add_min_p(mut self, p: f32, min_keep: usize) -> Self { - unsafe { - let min_p_sampler = llama_cpp_sys_2::llama_sampler_init_min_p(p, min_keep); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), min_p_sampler); + chain } - - self - } - - /// Locally Typical Sampling implementation described in the paper . - #[must_use] - #[allow(unused_mut)] - pub fn add_typical(mut self, p: f32, min_keep: usize) -> Self { - unsafe { - let typical_sampler = llama_cpp_sys_2::llama_sampler_init_typical(p, min_keep); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), typical_sampler); + LlamaSamplerParams::Temp(p) => llama_cpp_sys_2::llama_sampler_init_temp(p), + LlamaSamplerParams::TempExt { t, delta, exponent } => { + llama_cpp_sys_2::llama_sampler_init_temp_ext(t, delta, exponent) } - - self - } - - /// Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf - #[must_use] - #[allow(unused_mut)] - pub fn add_temp(mut self, t: f32) -> Self { - unsafe { - let temp_sampler = llama_cpp_sys_2::llama_sampler_init_temp(t); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_sampler); + LlamaSamplerParams::TopK(k) => llama_cpp_sys_2::llama_sampler_init_top_k(k), + LlamaSamplerParams::Typical { p, min_keep } => { + llama_cpp_sys_2::llama_sampler_init_typical(p, min_keep) } - - self - } - - /// Dynamic temperature implementation (a.k.a. entropy) described in the paper . - #[must_use] - #[allow(unused_mut)] - pub fn add_temp_ext(mut self, t: f32, delta: f32, exponent: f32) -> Self { - unsafe { - let temp_ext_sampler = llama_cpp_sys_2::llama_sampler_init_temp_ext(t, delta, exponent); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_ext_sampler); + LlamaSamplerParams::TopP { p, min_keep } => { + llama_cpp_sys_2::llama_sampler_init_top_p(p, min_keep) } - - self - } - - /// XTC sampling as described in . 
- #[must_use] - #[allow(unused_mut)] - pub fn add_xtc(mut self, p: f32, t: f32, min_keep: usize, seed: u32) -> Self { - unsafe { - let xtc_sampler = llama_cpp_sys_2::llama_sampler_init_xtc(p, t, min_keep, seed); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), xtc_sampler); + LlamaSamplerParams::MinP { p, min_keep } => { + llama_cpp_sys_2::llama_sampler_init_min_p(p, min_keep) } - - self - } - - /// Mirostat 1.0 algorithm described in the paper . Uses tokens instead of words. - /// - /// # Arguments - /// - /// * `tau` - The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// * `eta` - The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// * `m` - The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - /// * `mu` - Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - #[must_use] - #[allow(unused_mut)] - pub fn add_mirostat(mut self, n_vocab: i32, seed: u32, tau: f32, eta: f32, m: i32) -> Self { - unsafe { - let temp_ext_sampler = - llama_cpp_sys_2::llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_ext_sampler); - } - - self - } - - /// Mirostat 2.0 algorithm described in the paper . Uses tokens instead of words. - /// - /// # Arguments - /// - /// * `tau` - The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - /// * `eta` - The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - /// * `mu` - Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - #[must_use] - #[allow(unused_mut)] - pub fn add_mirostat_v2(mut self, seed: u32, tau: f32, eta: f32) -> Self { - unsafe { - let temp_ext_sampler = llama_cpp_sys_2::llama_sampler_init_mirostat_v2(seed, tau, eta); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_ext_sampler); - } - - self - } - - /// Samples constrained by a context-free grammar in the GGML BNF (GBNF) format. - /// - /// # Panics - /// Panics if a provided string contains a null byte. 
- #[must_use] - #[allow(unused_mut)] - pub fn add_grammar( - mut self, - model: &LlamaModel, - grammar_str: &str, - grammar_root: &str, - ) -> Self { - unsafe { - let grammar_str = CString::new(grammar_str).unwrap(); - let grammar_root = CString::new(grammar_root).unwrap(); - let grammar_sampler = llama_cpp_sys_2::llama_sampler_init_grammar( + LlamaSamplerParams::Xtc { + p, + t, + min_keep, + seed, + } => llama_cpp_sys_2::llama_sampler_init_xtc(p, t, min_keep, seed), + LlamaSamplerParams::Grammar { + model, + string, + root, + } => { + let grammar_str = CString::new(string).unwrap(); + let grammar_root = CString::new(root).unwrap(); + llama_cpp_sys_2::llama_sampler_init_grammar( model.model.as_ptr(), grammar_str.as_ptr(), grammar_root.as_ptr(), - ); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), grammar_sampler); + ) } - - self - } - - /// Adds penalties to the sampler. This can be used to penalize certain patterns in the generated text, such as repeating the same token multiple times or using the same token too frequently. - #[allow(unused_mut, clippy::too_many_arguments)] - #[must_use] - pub fn add_penalties( - mut self, - n_vocab: i32, - special_eos_id: i32, - linefeed_id: i32, - penalty_last_n: i32, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - penalize_nl: bool, - ignore_eos: bool, - ) -> Self { - unsafe { - let temp_ext_sampler = llama_cpp_sys_2::llama_sampler_init_penalties( - n_vocab, - special_eos_id, - linefeed_id, + LlamaSamplerParams::Dry { + model, + multiplier, + base, + allowed_length, + penalty_last_n, + seq_breakers, + } => { + let seq_breakers: Vec = seq_breakers + .iter() + .map(|s| CString::new(*s).unwrap()) + .collect(); + let mut seq_breaker_pointers: Vec<*const i8> = + seq_breakers.iter().map(|s| s.as_ptr()).collect(); + llama_cpp_sys_2::llama_sampler_init_dry( + model.model.as_ptr(), + multiplier, + base, + allowed_length, penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - penalize_nl, - ignore_eos, - ); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), temp_ext_sampler); + seq_breaker_pointers.as_mut_ptr(), + seq_breaker_pointers.len(), + ) } - - self - } - - /// Adds penalties to the sampler. This can be used to penalize certain patterns in the generated text, such as repeating the same token multiple times or using the same token too frequently. - #[allow(unused_mut)] - #[must_use] - pub fn add_penalties_simple( - mut self, - model: &LlamaModel, - penalty_last_n: i32, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - ) -> Self { - self.add_penalties( - model.n_vocab(), - model.token_eos().0, - model.token_nl().0, + LlamaSamplerParams::Penalties { + n_vocab, + special_eos_id, + linefeed_id, penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - false, - true, - ) + penalize_nl, + ignore_eos, + } => llama_cpp_sys_2::llama_sampler_init_penalties( + n_vocab, + special_eos_id, + linefeed_id, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + penalize_nl, + ignore_eos, + ), + LlamaSamplerParams::Dist { seed } => llama_cpp_sys_2::llama_sampler_init_dist(seed), + LlamaSamplerParams::Greedy => llama_cpp_sys_2::llama_sampler_init_greedy(), } +} - /// Adds DRY repetition penalty to the sampler. - /// - /// DRY sampler, designed by p-e-w, as described in: , porting Koboldcpp implementation authored by pi6am: - #[allow(unused_mut)] +impl LlamaSampler { + /// Create a new `LlamaSampler` from the given parameters. 
#[must_use] - pub fn add_dry( - mut self, - model: &LlamaModel, - dry_multiplier: f32, - dry_base: f32, - dry_allowed_length: i32, - dry_penalty_last_n: i32, - seq_breakers: &[impl AsRef<[u8]>], - ) -> Self { - let seq_breakers: Vec = seq_breakers - .iter() - .map(|s| { - let bytes = s.as_ref(); - let null_byte = bytes.iter().position(|b| *b == 0).unwrap_or(bytes.len()); - CString::new(&bytes[..null_byte]).expect("Failed to slice away null bytes!") - }) - .collect(); - - let mut seq_breaker_pointers: Vec<*const i8> = - seq_breakers.iter().map(|s| s.as_ptr()).collect(); - - unsafe { - // Memory safety: llama_sampler_init_dry does not hold a reference to - // seq_breaker_pointers, so this will not UAF in future operations. - let dry_sampler = llama_cpp_sys_2::llama_sampler_init_dry( - model.model.as_ptr(), - dry_multiplier, - dry_base, - dry_allowed_length, - dry_penalty_last_n, - seq_breaker_pointers.as_mut_ptr(), - seq_breaker_pointers.len(), - ); - llama_cpp_sys_2::llama_sampler_chain_add(self.sampler.as_ptr(), dry_sampler); + pub fn new(params: LlamaSamplerParams) -> Self { + Self { + sampler: unsafe { new_inner(params) }, } - - self } /// Sample and accept a token from the idx-th output of the last evaluation #[must_use] pub fn sample(&self, ctx: &LlamaContext, idx: i32) -> LlamaToken { let token = unsafe { - llama_cpp_sys_2::llama_sampler_sample(self.sampler.as_ptr(), ctx.context.as_ptr(), idx) + llama_cpp_sys_2::llama_sampler_sample(self.sampler, ctx.context.as_ptr(), idx) }; LlamaToken(token) } + /// Applies this sampler to a [`LlamaTokenDataArray`]. pub fn apply(&mut self, data_array: &mut LlamaTokenDataArray) { - unsafe { data_array.apply_sampler(self.sampler.as_ptr()) } + data_array.apply_sampler(self); } - /// Accepts a token from the sampler, possibly updating the internal state of certain samplers (e.g. grammar, repetition, etc.) + /// Accepts a token from the sampler, possibly updating the internal state of certain samplers + /// (e.g. grammar, repetition, etc.) pub fn accept(&mut self, token: LlamaToken) { - unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler.as_ptr(), token.0) } + unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.0) } + } + + /// Accepts several tokens from the sampler or context, possibly updating the internal state of + /// certain samplers (e.g. grammar, repetition, etc.) + pub fn accept_many(&mut self, tokens: &[LlamaToken]) { + for token in tokens { + unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.0) } + } } } impl Drop for LlamaSampler { fn drop(&mut self) { unsafe { - llama_cpp_sys_2::llama_sampler_free(self.sampler.as_ptr()); + llama_cpp_sys_2::llama_sampler_free(self.sampler); } } } diff --git a/llama-cpp-2/src/sampling/params.rs b/llama-cpp-2/src/sampling/params.rs index 0e67c1fa..fe5d23e2 100644 --- a/llama-cpp-2/src/sampling/params.rs +++ b/llama-cpp-2/src/sampling/params.rs @@ -1,39 +1,171 @@ -//! Safe wrapper around `llama_sampler_chain_params`. +//! Safe parameters used to construct [`super::LlamaSampler`] -use std::fmt::{Debug, Formatter}; +/// Safe parameters used to construct [`super::LlamaSampler`] +#[derive(Debug, Clone, Copy)] +pub enum LlamaSamplerParams<'a> { + /// A chain of samplers, applied one after another + #[allow(missing_docs)] + Chain { + no_perf: bool, + stages: &'a [LlamaSamplerParams<'a>], + }, -/// A safe wrapper around `llama_sampler`. 
-pub struct LlamaSamplerChainParams { - pub(crate) sampler_chain_params: llama_cpp_sys_2::llama_sampler_chain_params, -} + /// Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original + /// value, the rest are set to -inf + Temp(f32), -impl Debug for LlamaSamplerChainParams { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LlamaSamplerChainParams").finish() - } + /// Dynamic temperature implementation (a.k.a. entropy) described in the paper . + #[allow(missing_docs)] + TempExt { t: f32, delta: f32, exponent: f32 }, + /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" + /// + TopK(i32), + /// Locally Typical Sampling implementation described in the paper . + #[allow(missing_docs)] + Typical { p: f32, min_keep: usize }, + /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" + /// + #[allow(missing_docs)] + TopP { p: f32, min_keep: usize }, + /// Minimum P sampling as described in + #[allow(missing_docs)] + MinP { p: f32, min_keep: usize }, + + /// XTC sampler as described in + #[allow(missing_docs)] + Xtc { + /// The probability of this sampler being applied. + p: f32, + t: f32, + min_keep: usize, + /// Seed to use when selecting whether to apply this sampler or not + seed: u32, + }, + + /// Grammar sampler + #[allow(missing_docs)] + Grammar { + model: &'a crate::model::LlamaModel, + string: &'a str, + root: &'a str, + }, + + /// @details DRY sampler, designed by p-e-w, as described in: + /// , porting Koboldcpp + /// implementation authored by pi6am: + #[allow(missing_docs)] + Dry { + model: &'a crate::model::LlamaModel, + multiplier: f32, + base: f32, + allowed_length: i32, + penalty_last_n: i32, + seq_breakers: &'a [&'a str], + }, + + /// Penalizes tokens for being present in the context. + Penalties { + /// ``model.n_vocab()`` + n_vocab: i32, + /// ``model.token_eos()`` + special_eos_id: i32, + /// ``model.token_nl()`` + linefeed_id: i32, + /// last n tokens to penalize (0 = disable penalty, -1 = context size) + penalty_last_n: i32, + /// 1.0 = disabled + penalty_repeat: f32, + /// 0.0 = disabled + penalty_freq: f32, + /// 0.0 = disabled + penalty_present: f32, + /// consider newlines as a repeatable token + penalize_nl: bool, + /// ignore the end-of-sequence token + ignore_eos: bool, + }, + + /// Select a token at random based on each token's probabilities + Dist { + /// Seed to initialize random generation with + seed: u32, + }, + + /// Select the most likely token + Greedy, } -impl Default for LlamaSamplerChainParams { - fn default() -> Self { - let sampler_chain_params = unsafe { llama_cpp_sys_2::llama_sampler_chain_default_params() }; +impl<'a> LlamaSamplerParams<'a> { + /// Easily create a chain of samplers with performance metrics enabled. + #[must_use] + pub fn chain(stages: &'a [Self]) -> Self { + LlamaSamplerParams::Chain { + no_perf: false, + stages, + } + } - Self { - sampler_chain_params, + /// Easily create a [`LlamaSamplerParams::Penalties`] sampler using a model. This sets + /// `penalize_nl` to false and `ignore_eos` to true as reasonable defaults. 
+ #[must_use] + pub fn penalties( + model: &'a crate::model::LlamaModel, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + ) -> Self { + Self::Penalties { + n_vocab: model.n_vocab(), + special_eos_id: model.token_eos().0, + linefeed_id: model.token_nl().0, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + penalize_nl: false, + ignore_eos: true, } } -} -impl LlamaSamplerChainParams { - /// Set whether to measure performance timings + /// Easily define a [`LlamaSamplerParams::Typical`] with `min_keep == 1` + #[must_use] + pub fn typical(p: f32) -> Self { + Self::Typical { p, min_keep: 1 } + } + + /// Easily define a [`LlamaSamplerParams::TopP`] with `min_keep == 1` #[must_use] - pub fn with_no_perf(mut self, no_perf: bool) -> Self { - self.sampler_chain_params.no_perf = no_perf; - self + pub fn top_p(p: f32) -> Self { + Self::TopP { p, min_keep: 1 } } - /// Get whether to measure performance timings + /// Easily define a [`LlamaSamplerParams::MinP`] with `min_keep == 1` #[must_use] - pub fn no_perf(&self) -> bool { - self.sampler_chain_params.no_perf + pub fn min_p(p: f32) -> Self { + Self::MinP { p, min_keep: 1 } + } + + /// Whether this sampler's outputs are dependent on the tokens in the model's context. + pub(crate) fn uses_context_tokens(&self) -> bool { + match self { + LlamaSamplerParams::Chain { stages, .. } => { + stages.iter().any(LlamaSamplerParams::uses_context_tokens) + } + + LlamaSamplerParams::Grammar { .. } + | LlamaSamplerParams::Penalties { .. } + | LlamaSamplerParams::Dry { .. } => true, + + LlamaSamplerParams::Temp(_) + | LlamaSamplerParams::TempExt { .. } + | LlamaSamplerParams::TopK(_) + | LlamaSamplerParams::Typical { .. } + | LlamaSamplerParams::TopP { .. } + | LlamaSamplerParams::MinP { .. } + | LlamaSamplerParams::Xtc { .. } + | LlamaSamplerParams::Dist { .. } + | LlamaSamplerParams::Greedy => false, + } } } diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index 090c866e..97a1a600 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -1,7 +1,11 @@ //! an rusty equivalent of `llama_token_data_array`. use std::{ffi::CString, ptr}; -use crate::{model::LlamaModel, token::data::LlamaTokenData}; +use crate::{ + model::LlamaModel, + sampling::{params::LlamaSamplerParams, LlamaSampler}, + token::data::LlamaTokenData, +}; use super::LlamaToken; @@ -58,6 +62,7 @@ impl LlamaTokenDataArray { Self::new(data.into_iter().collect(), sorted) } + /// Returns the current selected token, if one exists. #[must_use] pub fn selected_token(&self) -> Option { self.data.get(self.selected?).map(LlamaTokenData::id) @@ -110,275 +115,37 @@ impl LlamaTokenDataArray { result } - pub(crate) unsafe fn apply_sampler(&mut self, sampler: *mut llama_cpp_sys_2::llama_sampler) { - self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { - llama_cpp_sys_2::llama_sampler_apply(sampler, c_llama_token_data_array); - }); - } - - pub(crate) unsafe fn apply_and_free_sampler( - &mut self, - sampler_fn: impl FnOnce() -> *mut llama_cpp_sys_2::llama_sampler, - ) { - let sampler = sampler_fn(); - self.apply_sampler(sampler); - llama_cpp_sys_2::llama_sampler_free(sampler); - } - - /// Modify the logits of [`Self`] in place using temperature sampling. 
- /// - /// # Example - /// - /// ```rust - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0) - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// - /// candidates.sample_temp(0.5); - /// - /// assert_eq!(candidates.data[0].logit(), 0.2); - /// assert_eq!(candidates.data[1].logit(), 0.4); - /// assert_eq!(candidates.data[2].logit(), 1.4); - /// ``` - pub fn sample_temp(&mut self, temperature: f32) { - unsafe { - self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_temp(temperature)); - } - } + /// Applies a sampler constructed from [`LlamaSamplerParams`]. This will call + /// [`LlamaSampler::accept_many`] on the provided tokens if the sampler uses tokens. + pub fn apply_sampler_from_params(&mut self, params: LlamaSamplerParams, tokens: &[LlamaToken]) { + let mut sampler = LlamaSampler::new(params); - /// Dynamic temperature implementation (a.k.a. entropy) described in the paper . - pub fn sample_temp_ext(&mut self, t: f32, delta: f32, exponent: f32) { - unsafe { - self.apply_and_free_sampler(|| { - llama_cpp_sys_2::llama_sampler_init_temp_ext(t, delta, exponent) - }); + if params.uses_context_tokens() { + sampler.accept_many(tokens); } - } - /// Top-K sampling described in academic paper [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) - pub fn sample_top_k(&mut self, k: i32) { - unsafe { - self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_top_k(k)); - } + self.apply_sampler(&mut sampler); } - /// Locally Typical Sampling implementation described in the [paper](https://arxiv.org/abs/2202.00666). 
- /// - /// # Example - /// - /// ```rust - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_typical(0.5, 1); - /// ``` - pub fn sample_typical(&mut self, p: f32, min_keep: usize) { + /// Modifies the data array by applying a sampler to it + pub fn apply_sampler(&mut self, sampler: &mut LlamaSampler) { unsafe { - self.apply_and_free_sampler(|| { - llama_cpp_sys_2::llama_sampler_init_typical(p, min_keep) - }); - } - } - - /// Nucleus sampling described in academic paper [The Curious Case of Neural Text Degeneration](https://arxiv.org/abs/1904.09751) - /// - /// # Example - /// - /// ```rust - /// - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_top_p(0.5, 1); - /// - /// assert_eq!(candidates.data.len(), 2); - /// assert_eq!(candidates.data[0].id(), LlamaToken::new(2)); - /// assert_eq!(candidates.data[1].id(), LlamaToken::new(1)); - /// ``` - pub fn sample_top_p(&mut self, p: f32, min_keep: usize) { - unsafe { - self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_top_p(p, min_keep)); - } - } - - /// Minimum P sampling as described in [#3841](https://github.com/ggerganov/llama.cpp/pull/3841) - /// - /// # Example - /// - /// ``` - /// # use llama_cpp_2::token::data::LlamaTokenData; - /// # use llama_cpp_2::token::data_array::LlamaTokenDataArray; - /// # use llama_cpp_2::token::LlamaToken; - /// - /// let candidates = vec![ - /// LlamaTokenData::new(LlamaToken::new(4), -2., 0.0), - /// LlamaTokenData::new(LlamaToken::new(0), 0.1, 0.0), - /// LlamaTokenData::new(LlamaToken::new(1), 0.2, 0.0), - /// LlamaTokenData::new(LlamaToken::new(2), 0.7, 0.0), - /// ]; - /// let mut candidates = LlamaTokenDataArray::from_iter(candidates, false); - /// candidates.sample_min_p(0.1, 1); - /// - /// assert_eq!(candidates.data.len(), 3); - /// ``` - pub fn sample_min_p(&mut self, p: f32, min_keep: usize) { - unsafe { - self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_min_p(p, min_keep)); - } - } - - /// XTC sampling as described in . - pub fn sample_xtc(&mut self, p: f32, t: f32, min_keep: usize, seed: u32) { - unsafe { - self.apply_and_free_sampler(|| { - llama_cpp_sys_2::llama_sampler_init_xtc(p, t, min_keep, seed) - }); - } - } - - /// This can be used to penalize certain patterns in the generated text, such as repeating the same token multiple times or using the same token too frequently. 
- #[allow(clippy::too_many_arguments)] - pub fn sample_penalties( - &mut self, - tokens: &[LlamaToken], - n_vocab: i32, - special_eos_id: i32, - linefeed_id: i32, - penalty_last_n: i32, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - penalize_nl: bool, - ignore_eos: bool, - ) { - unsafe { - self.apply_and_free_sampler(|| { - let sampler = llama_cpp_sys_2::llama_sampler_init_penalties( - n_vocab, - special_eos_id, - linefeed_id, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - penalize_nl, - ignore_eos, - ); - - for token in tokens { - llama_cpp_sys_2::llama_sampler_accept(sampler, token.0); - } - - sampler - }); - } - } - - /// This can be used to penalize certain patterns in the generated text, such as repeating the same token multiple times or using the same token too frequently. - pub fn sample_penalties_simple( - &mut self, - tokens: &[LlamaToken], - model: &LlamaModel, - penalty_last_n: i32, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - ) { - self.sample_penalties( - tokens, - model.n_vocab(), - model.token_eos().0, - model.token_nl().0, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - false, - true, - ); - } - - /// DRY sampler, designed by p-e-w, as described in: , porting Koboldcpp implementation authored by pi6am: - #[allow(clippy::too_many_arguments)] - pub fn sample_dry( - &mut self, - tokens: &[LlamaToken], - model: &LlamaModel, - dry_multiplier: f32, - dry_base: f32, - dry_allowed_length: i32, - dry_penalty_last_n: i32, - seq_breakers: &[impl AsRef<[u8]>], - ) { - let seq_breakers: Vec = seq_breakers - .iter() - .map(|s| { - let bytes = s.as_ref(); - let null_byte = bytes.iter().position(|b| *b == 0).unwrap_or(bytes.len()); - CString::new(&bytes[..null_byte]).expect("Failed to slice away null bytes!") - }) - .collect(); - - let mut seq_breaker_pointers: Vec<*const i8> = - seq_breakers.iter().map(|s| s.as_ptr()).collect(); - - unsafe { - self.apply_and_free_sampler(|| { - let sampler = llama_cpp_sys_2::llama_sampler_init_dry( - model.model.as_ptr(), - dry_multiplier, - dry_base, - dry_allowed_length, - dry_penalty_last_n, - seq_breaker_pointers.as_mut_ptr(), - seq_breaker_pointers.len(), - ); - - for token in tokens { - llama_cpp_sys_2::llama_sampler_accept(sampler, token.0); - } - - sampler + self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { + llama_cpp_sys_2::llama_sampler_apply(sampler.sampler, c_llama_token_data_array); }); } } /// Randomly selects a token from the candidates based on their probabilities. pub fn sample_token(&mut self, seed: u32) -> LlamaToken { - unsafe { - self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_dist(seed)); - } + self.apply_sampler_from_params(LlamaSamplerParams::Dist { seed }, &[]); self.selected_token() .expect("Dist sampler failed to select a token!") } /// Selects the token with the highest probability. 
pub fn sample_token_greedy(&mut self) -> LlamaToken { - unsafe { - self.apply_and_free_sampler(|| llama_cpp_sys_2::llama_sampler_init_greedy()); - } + self.apply_sampler_from_params(LlamaSamplerParams::Greedy, &[]); self.selected_token() .expect("Greedy sampler failed to select a token!") } From d44deef3d9c5e50aaf86ca9a0e707c50389ccd2d Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 7 Dec 2024 13:25:33 -0600 Subject: [PATCH 037/193] Add Mirostat to new API --- examples/simple/src/main.rs | 4 +++- llama-cpp-2/src/sampling.rs | 10 ++++++++++ llama-cpp-2/src/sampling/params.rs | 28 +++++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index f31a83c4..e13274f1 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -247,7 +247,9 @@ either reduce n_len or increase n_ctx" let mut decoder = encoding_rs::UTF_8.new_decoder(); let mut sampler = LlamaSampler::new(LlamaSamplerParams::chain(&[ - LlamaSamplerParams::Dist { seed: seed.unwrap_or(1234) }, + LlamaSamplerParams::Dist { + seed: seed.unwrap_or(1234), + }, LlamaSamplerParams::Greedy, ])); diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index e0313f4e..2c945326 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -112,6 +112,16 @@ unsafe fn new_inner(params: LlamaSamplerParams) -> *mut llama_cpp_sys_2::llama_s penalize_nl, ignore_eos, ), + LlamaSamplerParams::Mirostat { + n_vocab, + tau, + eta, + m, + seed, + } => llama_cpp_sys_2::llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m), + LlamaSamplerParams::MirostatV2 { tau, eta, seed } => { + llama_cpp_sys_2::llama_sampler_init_mirostat_v2(seed, tau, eta) + } LlamaSamplerParams::Dist { seed } => llama_cpp_sys_2::llama_sampler_init_dist(seed), LlamaSamplerParams::Greedy => llama_cpp_sys_2::llama_sampler_init_greedy(), } diff --git a/llama-cpp-2/src/sampling/params.rs b/llama-cpp-2/src/sampling/params.rs index fe5d23e2..84cdbc3f 100644 --- a/llama-cpp-2/src/sampling/params.rs +++ b/llama-cpp-2/src/sampling/params.rs @@ -85,6 +85,30 @@ pub enum LlamaSamplerParams<'a> { ignore_eos: bool, }, + /// Mirostat 1.0 algorithm described in the paper . Uses tokens instead of words. + Mirostat { + /// ``model.n_vocab()`` + n_vocab: i32, + /// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + tau: f32, + /// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + eta: f32, + /// The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. + m: i32, + /// Seed to initialize random generation with + seed: u32, + }, + + /// Mirostat 2.0 algorithm described in the paper . Uses tokens instead of words. + MirostatV2 { + /// The target cross-entropy (or surprise) value you want to achieve for the generated text. 
A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + tau: f32, + /// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + eta: f32, + /// Seed to initialize random generation with + seed: u32, + }, + /// Select a token at random based on each token's probabilities Dist { /// Seed to initialize random generation with @@ -146,7 +170,7 @@ impl<'a> LlamaSamplerParams<'a> { Self::MinP { p, min_keep: 1 } } - /// Whether this sampler's outputs are dependent on the tokens in the model's context. + /// Whether this sampler's outputs are dependent on the tokens in the model's context. pub(crate) fn uses_context_tokens(&self) -> bool { match self { LlamaSamplerParams::Chain { stages, .. } => { @@ -164,6 +188,8 @@ impl<'a> LlamaSamplerParams<'a> { | LlamaSamplerParams::TopP { .. } | LlamaSamplerParams::MinP { .. } | LlamaSamplerParams::Xtc { .. } + | LlamaSamplerParams::Mirostat { .. } + | LlamaSamplerParams::MirostatV2 { .. } | LlamaSamplerParams::Dist { .. } | LlamaSamplerParams::Greedy => false, } From 4a334a44f5c43dd27d5bf5832d6e176222c5f57f Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 7 Dec 2024 14:19:54 -0600 Subject: [PATCH 038/193] Remove unused imports --- llama-cpp-2/src/token/data_array.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index 97a1a600..3e34dc69 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -1,8 +1,7 @@ //! an rusty equivalent of `llama_token_data_array`. -use std::{ffi::CString, ptr}; +use std::ptr; use crate::{ - model::LlamaModel, sampling::{params::LlamaSamplerParams, LlamaSampler}, token::data::LlamaTokenData, }; From 32cadf765e1125aa6f67bfff76f6f9a333571b89 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sat, 7 Dec 2024 16:13:19 -0600 Subject: [PATCH 039/193] Fix crash when running XTC sampler --- llama-cpp-2/src/token/data_array.rs | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index 3e34dc69..8d3266a0 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -73,12 +73,14 @@ impl LlamaTokenDataArray { /// /// # Panics /// - /// Panics if some of the safety conditions are not met. (we cannot check all of them at runtime so breaking them is UB) + /// Panics if some of the safety conditions are not met. (we cannot check all of them at + /// runtime so breaking them is UB) /// /// SAFETY: - /// [modify] cannot change the data pointer. + /// The returned array formed by the data pointer and the length must entirely consist of + /// initialized token data and the length must be less than the capacity of this array's data + /// buffer. /// if the data is not sorted, sorted must be false. - /// the size of the data can only decrease (i.e you cannot add new elements). 
pub(crate) unsafe fn modify_as_c_llama_token_data_array( &mut self, modify: impl FnOnce(&mut llama_cpp_sys_2::llama_token_data_array) -> T, @@ -97,13 +99,20 @@ impl LlamaTokenDataArray { }; let result = modify(&mut c_llama_token_data_array); + assert!( - ptr::eq(data, c_llama_token_data_array.data), - "data pointer changed" + c_llama_token_data_array.size <= self.data.capacity(), + "Size of the returned array exceeds the data buffer's capacity!" ); - assert!(c_llama_token_data_array.size <= size, "size increased"); - + if !ptr::eq(c_llama_token_data_array.data, data) { + ptr::copy( + c_llama_token_data_array.data, + data, + c_llama_token_data_array.size, + ); + } self.data.set_len(c_llama_token_data_array.size); + self.sorted = c_llama_token_data_array.sorted; self.selected = c_llama_token_data_array .selected From 73ef067bc0cf73379cd38592cfbf768f06b4ce14 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 8 Dec 2024 13:51:40 -0600 Subject: [PATCH 040/193] Yet another API overhaul --- examples/simple/src/main.rs | 11 +- examples/usage.rs | 8 +- llama-cpp-2/src/sampling.rs | 287 ++++++++++++++++++---------- llama-cpp-2/src/sampling/params.rs | 197 ------------------- llama-cpp-2/src/token/data_array.rs | 18 +- 5 files changed, 190 insertions(+), 331 deletions(-) delete mode 100644 llama-cpp-2/src/sampling/params.rs diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index e13274f1..f67a5309 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -17,7 +17,6 @@ use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; use llama_cpp_2::model::params::LlamaModelParams; use llama_cpp_2::model::LlamaModel; use llama_cpp_2::model::{AddBos, Special}; -use llama_cpp_2::sampling::params::LlamaSamplerParams; use llama_cpp_2::sampling::LlamaSampler; use std::ffi::CString; @@ -246,12 +245,10 @@ either reduce n_len or increase n_ctx" // The `Decoder` let mut decoder = encoding_rs::UTF_8.new_decoder(); - let mut sampler = LlamaSampler::new(LlamaSamplerParams::chain(&[ - LlamaSamplerParams::Dist { - seed: seed.unwrap_or(1234), - }, - LlamaSamplerParams::Greedy, - ])); + let mut sampler = LlamaSampler::chain(vec![ + LlamaSampler::dist(seed.unwrap_or(1234)), + LlamaSampler::greedy(), + ]); while n_cur <= n_len { // sample the next token diff --git a/examples/usage.rs b/examples/usage.rs index 2b7f1915..323ad6c2 100644 --- a/examples/usage.rs +++ b/examples/usage.rs @@ -14,9 +14,7 @@ use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::LlamaModelParams; use llama_cpp_2::model::LlamaModel; use llama_cpp_2::model::{AddBos, Special}; -use llama_cpp_2::sampling::params::LlamaSamplerChainParams; use llama_cpp_2::sampling::LlamaSampler; -use llama_cpp_2::token::data_array::LlamaTokenDataArray; use std::io::Write; #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] @@ -55,11 +53,7 @@ fn main() { // The `Decoder` let mut decoder = encoding_rs::UTF_8.new_decoder(); - - let sampler_params = LlamaSamplerChainParams::default(); - let mut sampler = LlamaSampler::new(sampler_params) - .expect("Failed to create sampler") - .add_greedy(); + let mut sampler = LlamaSampler::greedy(); while n_cur <= n_len { // sample the next token diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 2c945326..f52ea1a8 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -1,15 +1,13 @@ //! Safe wrapper around `llama_sampler`. 
-pub mod params; use std::ffi::CString; use std::fmt::{Debug, Formatter}; use crate::context::LlamaContext; +use crate::model::LlamaModel; use crate::token::data_array::LlamaTokenDataArray; use crate::token::LlamaToken; -use params::LlamaSamplerParams; - /// A safe wrapper around `llama_sampler`. pub struct LlamaSampler { pub(crate) sampler: *mut llama_cpp_sys_2::llama_sampler, @@ -21,63 +19,138 @@ impl Debug for LlamaSampler { } } -unsafe fn new_inner(params: LlamaSamplerParams) -> *mut llama_cpp_sys_2::llama_sampler { - match params { - LlamaSamplerParams::Chain { no_perf, stages } => { +impl LlamaSampler { + /// Sample and accept a token from the idx-th output of the last evaluation + #[must_use] + pub fn sample(&self, ctx: &LlamaContext, idx: i32) -> LlamaToken { + let token = unsafe { + llama_cpp_sys_2::llama_sampler_sample(self.sampler, ctx.context.as_ptr(), idx) + }; + + LlamaToken(token) + } + + /// Applies this sampler to a [`LlamaTokenDataArray`]. + pub fn apply(&mut self, data_array: &mut LlamaTokenDataArray) { + data_array.apply_sampler(self); + } + + /// Accepts a token from the sampler, possibly updating the internal state of certain samplers + /// (e.g. grammar, repetition, etc.) + pub fn accept(&mut self, token: LlamaToken) { + unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.0) } + } + + /// Accepts several tokens from the sampler or context, possibly updating the internal state of + /// certain samplers (e.g. grammar, repetition, etc.) + pub fn accept_many(&mut self, tokens: &[LlamaToken]) { + for token in tokens { + unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.0) } + } + } + + /// Accepts several tokens from the sampler or context, possibly updating the internal state of + /// certain samplers (e.g. grammar, repetition, etc.) 
+ #[must_use] + pub fn with_tokens(mut self, tokens: &[LlamaToken]) -> Self { + self.accept_many(tokens); + self + } + + #[must_use] + pub fn chain_with_no_perf(samplers: Vec, no_perf: bool) -> Self { + unsafe { let chain = llama_cpp_sys_2::llama_sampler_chain_init( llama_cpp_sys_2::llama_sampler_chain_params { no_perf }, ); - for stage in stages { - llama_cpp_sys_2::llama_sampler_chain_add(chain, new_inner(*stage)); + for sampler in samplers { + llama_cpp_sys_2::llama_sampler_chain_add(chain, sampler.sampler); + + // Do not call `llama_sampler_free` on the sampler, as the internal sampler is now + // owned by the chain + std::mem::forget(sampler); } - chain - } - LlamaSamplerParams::Temp(p) => llama_cpp_sys_2::llama_sampler_init_temp(p), - LlamaSamplerParams::TempExt { t, delta, exponent } => { - llama_cpp_sys_2::llama_sampler_init_temp_ext(t, delta, exponent) - } - LlamaSamplerParams::TopK(k) => llama_cpp_sys_2::llama_sampler_init_top_k(k), - LlamaSamplerParams::Typical { p, min_keep } => { - llama_cpp_sys_2::llama_sampler_init_typical(p, min_keep) - } - LlamaSamplerParams::TopP { p, min_keep } => { - llama_cpp_sys_2::llama_sampler_init_top_p(p, min_keep) + Self { sampler: chain } } - LlamaSamplerParams::MinP { p, min_keep } => { - llama_cpp_sys_2::llama_sampler_init_min_p(p, min_keep) - } - LlamaSamplerParams::Xtc { - p, - t, - min_keep, - seed, - } => llama_cpp_sys_2::llama_sampler_init_xtc(p, t, min_keep, seed), - LlamaSamplerParams::Grammar { - model, - string, - root, - } => { - let grammar_str = CString::new(string).unwrap(); - let grammar_root = CString::new(root).unwrap(); + } + + #[must_use] + pub fn chain(samplers: Vec) -> Self { + Self::chain_with_no_perf(samplers, false) + } + + #[must_use] + pub fn temp(t: f32) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_temp(t) }; + Self { sampler } + } + + #[must_use] + pub fn temp_ext(t: f32, delta: f32, exponent: f32) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_temp_ext(t, delta, exponent) }; + Self { sampler } + } + + #[must_use] + pub fn top_k(k: i32) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_top_k(k) }; + Self { sampler } + } + + #[must_use] + pub fn typical(p: f32, min_keep: usize) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_typical(p, min_keep) }; + Self { sampler } + } + + #[must_use] + pub fn top_p(p: f32, min_keep: usize) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_top_p(p, min_keep) }; + Self { sampler } + } + + #[must_use] + pub fn min_p(p: f32, min_keep: usize) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_min_p(p, min_keep) }; + Self { sampler } + } + + #[must_use] + pub fn xtc(p: f32, t: f32, min_keep: usize, seed: u32) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_xtc(p, t, min_keep, seed) }; + Self { sampler } + } + + #[must_use] + pub fn grammar(model: &LlamaModel, grammar_str: &str, grammar_root: &str) -> Self { + let grammar_str = CString::new(grammar_str).unwrap(); + let grammar_root = CString::new(grammar_root).unwrap(); + + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_grammar( model.model.as_ptr(), grammar_str.as_ptr(), grammar_root.as_ptr(), ) - } - LlamaSamplerParams::Dry { - model, - multiplier, - base, - allowed_length, - penalty_last_n, - seq_breakers, - } => { + }; + Self { sampler } + } + + #[must_use] + pub fn dry( + model: &LlamaModel, + multiplier: f32, + base: f32, + allowed_length: i32, + penalty_last_n: 
i32, + seq_breakers: &[impl AsRef<[u8]>], + ) -> Self { + let sampler = unsafe { let seq_breakers: Vec = seq_breakers .iter() - .map(|s| CString::new(*s).unwrap()) + .map(|s| CString::new(s.as_ref()).unwrap()) .collect(); let mut seq_breaker_pointers: Vec<*const i8> = seq_breakers.iter().map(|s| s.as_ptr()).collect(); @@ -90,79 +163,83 @@ unsafe fn new_inner(params: LlamaSamplerParams) -> *mut llama_cpp_sys_2::llama_s seq_breaker_pointers.as_mut_ptr(), seq_breaker_pointers.len(), ) - } - LlamaSamplerParams::Penalties { - n_vocab, - special_eos_id, - linefeed_id, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - penalize_nl, - ignore_eos, - } => llama_cpp_sys_2::llama_sampler_init_penalties( - n_vocab, - special_eos_id, - linefeed_id, + }; + Self { sampler } + } + + #[allow(clippy::too_many_arguments)] + #[must_use] + pub fn penalties( + n_vocab: i32, + special_eos_id: i32, + linefeed_id: i32, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + penalize_nl: bool, + ignore_eos: bool, + ) -> Self { + let sampler = unsafe { + llama_cpp_sys_2::llama_sampler_init_penalties( + n_vocab, + special_eos_id, + linefeed_id, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + penalize_nl, + ignore_eos, + ) + }; + Self { sampler } + } + + #[must_use] + pub fn penalties_simple( + model: &LlamaModel, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + ) -> Self { + Self::penalties( + model.n_vocab(), + model.token_eos().0, + model.token_nl().0, penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - penalize_nl, - ignore_eos, - ), - LlamaSamplerParams::Mirostat { - n_vocab, - tau, - eta, - m, - seed, - } => llama_cpp_sys_2::llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m), - LlamaSamplerParams::MirostatV2 { tau, eta, seed } => { - llama_cpp_sys_2::llama_sampler_init_mirostat_v2(seed, tau, eta) - } - LlamaSamplerParams::Dist { seed } => llama_cpp_sys_2::llama_sampler_init_dist(seed), - LlamaSamplerParams::Greedy => llama_cpp_sys_2::llama_sampler_init_greedy(), + false, + true, + ) } -} -impl LlamaSampler { - /// Create a new `LlamaSampler` from the given parameters. #[must_use] - pub fn new(params: LlamaSamplerParams) -> Self { - Self { - sampler: unsafe { new_inner(params) }, - } + pub fn mirostat(n_vocab: i32, seed: u32, tau: f32, eta: f32, m: i32) -> Self { + let sampler = + unsafe { llama_cpp_sys_2::llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) }; + Self { sampler } } - /// Sample and accept a token from the idx-th output of the last evaluation #[must_use] - pub fn sample(&self, ctx: &LlamaContext, idx: i32) -> LlamaToken { - let token = unsafe { - llama_cpp_sys_2::llama_sampler_sample(self.sampler, ctx.context.as_ptr(), idx) - }; - - LlamaToken(token) - } - - /// Applies this sampler to a [`LlamaTokenDataArray`]. - pub fn apply(&mut self, data_array: &mut LlamaTokenDataArray) { - data_array.apply_sampler(self); + pub fn mirostat_v2(seed: u32, tau: f32, eta: f32) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_mirostat_v2(seed, tau, eta) }; + Self { sampler } } - /// Accepts a token from the sampler, possibly updating the internal state of certain samplers - /// (e.g. grammar, repetition, etc.) 
- pub fn accept(&mut self, token: LlamaToken) { - unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.0) } + #[must_use] + pub fn dist(seed: u32) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_dist(seed) }; + Self { sampler } } - /// Accepts several tokens from the sampler or context, possibly updating the internal state of - /// certain samplers (e.g. grammar, repetition, etc.) - pub fn accept_many(&mut self, tokens: &[LlamaToken]) { - for token in tokens { - unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.0) } - } + #[must_use] + pub fn greedy() -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_greedy() }; + Self { sampler } } } diff --git a/llama-cpp-2/src/sampling/params.rs b/llama-cpp-2/src/sampling/params.rs deleted file mode 100644 index 84cdbc3f..00000000 --- a/llama-cpp-2/src/sampling/params.rs +++ /dev/null @@ -1,197 +0,0 @@ -//! Safe parameters used to construct [`super::LlamaSampler`] - -/// Safe parameters used to construct [`super::LlamaSampler`] -#[derive(Debug, Clone, Copy)] -pub enum LlamaSamplerParams<'a> { - /// A chain of samplers, applied one after another - #[allow(missing_docs)] - Chain { - no_perf: bool, - stages: &'a [LlamaSamplerParams<'a>], - }, - - /// Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original - /// value, the rest are set to -inf - Temp(f32), - - /// Dynamic temperature implementation (a.k.a. entropy) described in the paper . - #[allow(missing_docs)] - TempExt { t: f32, delta: f32, exponent: f32 }, - /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" - /// - TopK(i32), - /// Locally Typical Sampling implementation described in the paper . - #[allow(missing_docs)] - Typical { p: f32, min_keep: usize }, - /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" - /// - #[allow(missing_docs)] - TopP { p: f32, min_keep: usize }, - /// Minimum P sampling as described in - #[allow(missing_docs)] - MinP { p: f32, min_keep: usize }, - - /// XTC sampler as described in - #[allow(missing_docs)] - Xtc { - /// The probability of this sampler being applied. - p: f32, - t: f32, - min_keep: usize, - /// Seed to use when selecting whether to apply this sampler or not - seed: u32, - }, - - /// Grammar sampler - #[allow(missing_docs)] - Grammar { - model: &'a crate::model::LlamaModel, - string: &'a str, - root: &'a str, - }, - - /// @details DRY sampler, designed by p-e-w, as described in: - /// , porting Koboldcpp - /// implementation authored by pi6am: - #[allow(missing_docs)] - Dry { - model: &'a crate::model::LlamaModel, - multiplier: f32, - base: f32, - allowed_length: i32, - penalty_last_n: i32, - seq_breakers: &'a [&'a str], - }, - - /// Penalizes tokens for being present in the context. - Penalties { - /// ``model.n_vocab()`` - n_vocab: i32, - /// ``model.token_eos()`` - special_eos_id: i32, - /// ``model.token_nl()`` - linefeed_id: i32, - /// last n tokens to penalize (0 = disable penalty, -1 = context size) - penalty_last_n: i32, - /// 1.0 = disabled - penalty_repeat: f32, - /// 0.0 = disabled - penalty_freq: f32, - /// 0.0 = disabled - penalty_present: f32, - /// consider newlines as a repeatable token - penalize_nl: bool, - /// ignore the end-of-sequence token - ignore_eos: bool, - }, - - /// Mirostat 1.0 algorithm described in the paper . Uses tokens instead of words. 
- Mirostat { - /// ``model.n_vocab()`` - n_vocab: i32, - /// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - tau: f32, - /// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - eta: f32, - /// The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - m: i32, - /// Seed to initialize random generation with - seed: u32, - }, - - /// Mirostat 2.0 algorithm described in the paper . Uses tokens instead of words. - MirostatV2 { - /// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - tau: f32, - /// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - eta: f32, - /// Seed to initialize random generation with - seed: u32, - }, - - /// Select a token at random based on each token's probabilities - Dist { - /// Seed to initialize random generation with - seed: u32, - }, - - /// Select the most likely token - Greedy, -} - -impl<'a> LlamaSamplerParams<'a> { - /// Easily create a chain of samplers with performance metrics enabled. - #[must_use] - pub fn chain(stages: &'a [Self]) -> Self { - LlamaSamplerParams::Chain { - no_perf: false, - stages, - } - } - - /// Easily create a [`LlamaSamplerParams::Penalties`] sampler using a model. This sets - /// `penalize_nl` to false and `ignore_eos` to true as reasonable defaults. - #[must_use] - pub fn penalties( - model: &'a crate::model::LlamaModel, - penalty_last_n: i32, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - ) -> Self { - Self::Penalties { - n_vocab: model.n_vocab(), - special_eos_id: model.token_eos().0, - linefeed_id: model.token_nl().0, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - penalize_nl: false, - ignore_eos: true, - } - } - - /// Easily define a [`LlamaSamplerParams::Typical`] with `min_keep == 1` - #[must_use] - pub fn typical(p: f32) -> Self { - Self::Typical { p, min_keep: 1 } - } - - /// Easily define a [`LlamaSamplerParams::TopP`] with `min_keep == 1` - #[must_use] - pub fn top_p(p: f32) -> Self { - Self::TopP { p, min_keep: 1 } - } - - /// Easily define a [`LlamaSamplerParams::MinP`] with `min_keep == 1` - #[must_use] - pub fn min_p(p: f32) -> Self { - Self::MinP { p, min_keep: 1 } - } - - /// Whether this sampler's outputs are dependent on the tokens in the model's context. - pub(crate) fn uses_context_tokens(&self) -> bool { - match self { - LlamaSamplerParams::Chain { stages, .. } => { - stages.iter().any(LlamaSamplerParams::uses_context_tokens) - } - - LlamaSamplerParams::Grammar { .. } - | LlamaSamplerParams::Penalties { .. } - | LlamaSamplerParams::Dry { .. 
} => true, - - LlamaSamplerParams::Temp(_) - | LlamaSamplerParams::TempExt { .. } - | LlamaSamplerParams::TopK(_) - | LlamaSamplerParams::Typical { .. } - | LlamaSamplerParams::TopP { .. } - | LlamaSamplerParams::MinP { .. } - | LlamaSamplerParams::Xtc { .. } - | LlamaSamplerParams::Mirostat { .. } - | LlamaSamplerParams::MirostatV2 { .. } - | LlamaSamplerParams::Dist { .. } - | LlamaSamplerParams::Greedy => false, - } - } -} diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index 8d3266a0..0912af8a 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -2,7 +2,7 @@ use std::ptr; use crate::{ - sampling::{params::LlamaSamplerParams, LlamaSampler}, + sampling::LlamaSampler, token::data::LlamaTokenData, }; @@ -123,18 +123,6 @@ impl LlamaTokenDataArray { result } - /// Applies a sampler constructed from [`LlamaSamplerParams`]. This will call - /// [`LlamaSampler::accept_many`] on the provided tokens if the sampler uses tokens. - pub fn apply_sampler_from_params(&mut self, params: LlamaSamplerParams, tokens: &[LlamaToken]) { - let mut sampler = LlamaSampler::new(params); - - if params.uses_context_tokens() { - sampler.accept_many(tokens); - } - - self.apply_sampler(&mut sampler); - } - /// Modifies the data array by applying a sampler to it pub fn apply_sampler(&mut self, sampler: &mut LlamaSampler) { unsafe { @@ -146,14 +134,14 @@ impl LlamaTokenDataArray { /// Randomly selects a token from the candidates based on their probabilities. pub fn sample_token(&mut self, seed: u32) -> LlamaToken { - self.apply_sampler_from_params(LlamaSamplerParams::Dist { seed }, &[]); + self.apply_sampler(&mut LlamaSampler::dist(seed)); self.selected_token() .expect("Dist sampler failed to select a token!") } /// Selects the token with the highest probability. pub fn sample_token_greedy(&mut self) -> LlamaToken { - self.apply_sampler_from_params(LlamaSamplerParams::Greedy, &[]); + self.apply_sampler(&mut LlamaSampler::greedy()); self.selected_token() .expect("Greedy sampler failed to select a token!") } From bacad6574ec0836b6c35e6c59c147a219265e69f Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 8 Dec 2024 14:00:15 -0600 Subject: [PATCH 041/193] Small tweaks --- llama-cpp-2/src/sampling.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index f52ea1a8..67a2b57b 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -58,7 +58,7 @@ impl LlamaSampler { } #[must_use] - pub fn chain_with_no_perf(samplers: Vec, no_perf: bool) -> Self { + pub fn chain(samplers: Vec, no_perf: bool) -> Self { unsafe { let chain = llama_cpp_sys_2::llama_sampler_chain_init( llama_cpp_sys_2::llama_sampler_chain_params { no_perf }, @@ -76,9 +76,10 @@ impl LlamaSampler { } } + /// Same as [`Self::chain`] with `no_perf = false`. #[must_use] - pub fn chain(samplers: Vec) -> Self { - Self::chain_with_no_perf(samplers, false) + pub fn chain_simple(samplers: Vec) -> Self { + Self::chain(samplers, false) } #[must_use] @@ -196,6 +197,8 @@ impl LlamaSampler { Self { sampler } } + /// Same as [`Self::penalties`], but with `n_vocab`, `special_eos_id`, and `linefeed_id` + /// initialized from `model`, `penalize_nl = false`, and `ignore_eos = true`. 
#[must_use] pub fn penalties_simple( model: &LlamaModel, From 95c2c87a33dcfa87d5470340f48526be41c70bce Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 8 Dec 2024 14:03:50 -0600 Subject: [PATCH 042/193] Generalize the arguments to LlamaSampler::chain --- examples/simple/src/main.rs | 2 +- llama-cpp-2/src/sampling.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index f67a5309..f276ac24 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -245,7 +245,7 @@ either reduce n_len or increase n_ctx" // The `Decoder` let mut decoder = encoding_rs::UTF_8.new_decoder(); - let mut sampler = LlamaSampler::chain(vec![ + let mut sampler = LlamaSampler::chain_simple([ LlamaSampler::dist(seed.unwrap_or(1234)), LlamaSampler::greedy(), ]); diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 67a2b57b..4ca1db89 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -58,7 +58,7 @@ impl LlamaSampler { } #[must_use] - pub fn chain(samplers: Vec, no_perf: bool) -> Self { + pub fn chain(samplers: impl IntoIterator, no_perf: bool) -> Self { unsafe { let chain = llama_cpp_sys_2::llama_sampler_chain_init( llama_cpp_sys_2::llama_sampler_chain_params { no_perf }, @@ -78,7 +78,7 @@ impl LlamaSampler { /// Same as [`Self::chain`] with `no_perf = false`. #[must_use] - pub fn chain_simple(samplers: Vec) -> Self { + pub fn chain_simple(samplers: impl IntoIterator) -> Self { Self::chain(samplers, false) } From 76fd77647174129c94593fe3513caa0d6aa0b801 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 8 Dec 2024 19:53:03 -0600 Subject: [PATCH 043/193] Generalize the arguments to accept_many and with_tokens --- llama-cpp-2/src/sampling.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 4ca1db89..7a64991f 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -43,16 +43,16 @@ impl LlamaSampler { /// Accepts several tokens from the sampler or context, possibly updating the internal state of /// certain samplers (e.g. grammar, repetition, etc.) - pub fn accept_many(&mut self, tokens: &[LlamaToken]) { + pub fn accept_many(&mut self, tokens: impl IntoIterator>) { for token in tokens { - unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.0) } + unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.as_ref().0) } } } /// Accepts several tokens from the sampler or context, possibly updating the internal state of /// certain samplers (e.g. grammar, repetition, etc.) #[must_use] - pub fn with_tokens(mut self, tokens: &[LlamaToken]) -> Self { + pub fn with_tokens(mut self, tokens: impl IntoIterator>) -> Self { self.accept_many(tokens); self } From 8096e79b8ad449393d8db6ed826d6dc9aa32d3b4 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 8 Dec 2024 20:35:43 -0600 Subject: [PATCH 044/193] Documentation for sampler creation methods --- llama-cpp-2/src/sampling.rs | 156 ++++++++++++++++++++++++++++++++++-- 1 file changed, 148 insertions(+), 8 deletions(-) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 7a64991f..abe67352 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -57,6 +57,12 @@ impl LlamaSampler { self } + /// Combines a list of samplers into a single sampler that applies each component sampler one + /// after another. 
+ /// + /// If you are using a chain to select a token, the chain should always end with one of + /// [`LlamaSampler::greedy`], [`LlamaSampler::dist`], [`LlamaSampler::mirostat`], and + /// [`LlamaSampler::mirostat_v2`]. #[must_use] pub fn chain(samplers: impl IntoIterator, no_perf: bool) -> Self { unsafe { @@ -82,48 +88,108 @@ impl LlamaSampler { Self::chain(samplers, false) } + /// Updates the logits l_i' = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original + /// value, the rest are set to -inf + /// + /// # Example: + /// ```rust + /// use llama_cpp_2::token::{ + /// LlamaToken, + /// data::LlamaTokenData, + /// data_array::LlamaTokenDataArray + /// }; + /// use llama_cpp_2::sampling::LlamaSampler; + /// + /// let mut data_array = LlamaTokenDataArray::new(vec![ + /// LlamaTokenData::new(LlamaToken(0), 0., 0.), + /// LlamaTokenData::new(LlamaToken(1), 1., 0.), + /// LlamaTokenData::new(LlamaToken(2), 2., 0.), + /// ], false); + /// + /// data_array.apply_sampler(&mut LlamaSampler::temp(0.5)); + /// + /// assert_eq!(data_array.data[0].logit(), 0.); + /// assert_eq!(data_array.data[1].logit(), 2.); + /// assert_eq!(data_array.data[2].logit(), 4.); + /// ``` #[must_use] pub fn temp(t: f32) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_temp(t) }; Self { sampler } } + /// Dynamic temperature implementation (a.k.a. entropy) described in the paper + /// . #[must_use] pub fn temp_ext(t: f32, delta: f32, exponent: f32) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_temp_ext(t, delta, exponent) }; Self { sampler } } + /// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" + /// + /// + /// # Example: + /// ```rust + /// use llama_cpp_2::token::{ + /// LlamaToken, + /// data::LlamaTokenData, + /// data_array::LlamaTokenDataArray + /// }; + /// use llama_cpp_2::sampling::LlamaSampler; + /// + /// let mut data_array = LlamaTokenDataArray::new(vec![ + /// LlamaTokenData::new(LlamaToken(0), 0., 0.), + /// LlamaTokenData::new(LlamaToken(1), 1., 0.), + /// LlamaTokenData::new(LlamaToken(2), 2., 0.), + /// LlamaTokenData::new(LlamaToken(3), 3., 0.), + /// ], false); + /// + /// data_array.apply_sampler(&mut LlamaSampler::top_k(2)); + /// + /// assert_eq!(data_array.data.len(), 2); + /// assert_eq!(data_array.data[0].id(), LlamaToken(3)); + /// assert_eq!(data_array.data[1].id(), LlamaToken(2)); + /// ``` #[must_use] pub fn top_k(k: i32) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_top_k(k) }; Self { sampler } } + /// Locally Typical Sampling implementation described in the paper . 
#[must_use] pub fn typical(p: f32, min_keep: usize) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_typical(p, min_keep) }; Self { sampler } } + /// Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" + /// #[must_use] pub fn top_p(p: f32, min_keep: usize) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_top_p(p, min_keep) }; Self { sampler } } + /// Minimum P sampling as described in #[must_use] pub fn min_p(p: f32, min_keep: usize) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_min_p(p, min_keep) }; Self { sampler } } + /// XTC sampler as described in #[must_use] pub fn xtc(p: f32, t: f32, min_keep: usize, seed: u32) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_xtc(p, t, min_keep, seed) }; Self { sampler } } + /// Grammar sampler + /// + /// # Panics + /// If either of ``grammar_str`` or ``grammar_root`` contain null bytes. #[must_use] pub fn grammar(model: &LlamaModel, grammar_str: &str, grammar_root: &str) -> Self { let grammar_str = CString::new(grammar_str).unwrap(); @@ -139,6 +205,13 @@ impl LlamaSampler { Self { sampler } } + /// DRY sampler, designed by p-e-w, as described in: + /// , porting Koboldcpp + /// implementation authored by pi6am: + /// + /// # Panics + /// If any string in ``seq_breakers`` contains null bytes. + #[allow(missing_docs)] #[must_use] pub fn dry( model: &LlamaModel, @@ -146,15 +219,16 @@ impl LlamaSampler { base: f32, allowed_length: i32, penalty_last_n: i32, - seq_breakers: &[impl AsRef<[u8]>], + seq_breakers: impl IntoIterator>, ) -> Self { + let seq_breakers: Vec = seq_breakers + .into_iter() + .map(|s| CString::new(s.as_ref()).unwrap()) + .collect(); + let mut seq_breaker_pointers: Vec<*const i8> = + seq_breakers.iter().map(|s| s.as_ptr()).collect(); + let sampler = unsafe { - let seq_breakers: Vec = seq_breakers - .iter() - .map(|s| CString::new(s.as_ref()).unwrap()) - .collect(); - let mut seq_breaker_pointers: Vec<*const i8> = - seq_breakers.iter().map(|s| s.as_ptr()).collect(); llama_cpp_sys_2::llama_sampler_init_dry( model.model.as_ptr(), multiplier, @@ -168,6 +242,18 @@ impl LlamaSampler { Self { sampler } } + /// Penalizes tokens for being present in the context. + /// + /// Parameters: + /// - ``n_vocab``: [`LlamaModel::n_vocab`] + /// - ``special_eos)id``: [`LlamaModel::token_eos`] + /// - ``linefeed_id``: [`LlamaModel::token_nl`] + /// - ``penalty_last_n``: last n tokens to penalize (0 = disable penalty, -1 = context size) + /// - ``penalty_repeat``: 1.0 = disabled + /// - ``penalty_freq``: 0.0 = disabled + /// - ``penalty_present``: 0.0 = disabled + /// - ``penalize_nl``: consider newlines as a repeatable token + /// - ``ignore_eos``: ignore the end-of-sequence token #[allow(clippy::too_many_arguments)] #[must_use] pub fn penalties( @@ -199,6 +285,13 @@ impl LlamaSampler { /// Same as [`Self::penalties`], but with `n_vocab`, `special_eos_id`, and `linefeed_id` /// initialized from `model`, `penalize_nl = false`, and `ignore_eos = true`. + /// + /// Parameters: + /// - ``model``: The model's tokenizer to use to initialize the sampler. + /// - ``penalty_last_n``: last n tokens to penalize (0 = disable penalty, -1 = context size) + /// - ``penalty_repeat``: 1.0 = disabled + /// - ``penalty_freq``: 0.0 = disabled + /// - ``penalty_present``: 0.0 = disabled #[must_use] pub fn penalties_simple( model: &LlamaModel, @@ -220,6 +313,21 @@ impl LlamaSampler { ) } + /// Mirostat 1.0 algorithm described in the paper . 
Uses tokens instead of words. + /// + /// # Parameters: + /// - ``n_vocab``: [`LlamaModel::n_vocab`] + /// - ``seed``: Seed to initialize random generation with. + /// - ``tau``: The target cross-entropy (or surprise) value you want to achieve for the + /// generated text. A higher value corresponds to more surprising or less predictable text, + /// while a lower value corresponds to less surprising or more predictable text. + /// - ``eta``: The learning rate used to update `mu` based on the error between the target and + /// observed surprisal of the sampled word. A larger learning rate will cause `mu` to be + /// updated more quickly, while a smaller learning rate will result in slower updates. + /// - ``m``: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary + /// value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. + /// In the paper, they use `m = 100`, but you can experiment with different values to see how + /// it affects the performance of the algorithm. #[must_use] pub fn mirostat(n_vocab: i32, seed: u32, tau: f32, eta: f32, m: i32) -> Self { let sampler = @@ -227,18 +335,50 @@ impl LlamaSampler { Self { sampler } } + /// Mirostat 2.0 algorithm described in the paper . Uses tokens instead of words. + /// + /// # Parameters: + /// - ``seed``: Seed to initialize random generation with. + /// - ``tau``: The target cross-entropy (or surprise) value you want to achieve for the + /// generated text. A higher value corresponds to more surprising or less predictable text, + /// while a lower value corresponds to less surprising or more predictable text. + /// - ``eta``: The learning rate used to update `mu` based on the error between the target and + /// observed surprisal of the sampled word. A larger learning rate will cause `mu` to be + /// updated more quickly, while a smaller learning rate will result in slower updates. 
#[must_use] pub fn mirostat_v2(seed: u32, tau: f32, eta: f32) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_mirostat_v2(seed, tau, eta) }; Self { sampler } } + /// Selects a token at random based on each token's probabilities #[must_use] pub fn dist(seed: u32) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_dist(seed) }; Self { sampler } } - + + /// Selects the most likely token + /// + /// # Example: + /// ```rust + /// use llama_cpp_2::token::{ + /// LlamaToken, + /// data::LlamaTokenData, + /// data_array::LlamaTokenDataArray + /// }; + /// use llama_cpp_2::sampling::LlamaSampler; + /// + /// let mut data_array = LlamaTokenDataArray::new(vec![ + /// LlamaTokenData::new(LlamaToken(0), 0., 0.), + /// LlamaTokenData::new(LlamaToken(1), 1., 0.), + /// ], false); + /// + /// data_array.apply_sampler(&mut LlamaSampler::greedy()); + /// + /// assert_eq!(data_array.data.len(), 2); + /// assert_eq!(data_array.selected_token(), Some(LlamaToken(1))); + /// ``` #[must_use] pub fn greedy() -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_greedy() }; From aeb76dceb622eea90764f9a9ad89b666c8583274 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 8 Dec 2024 20:49:51 -0600 Subject: [PATCH 045/193] Add LlamaTokenDataArray::with_sampler; use Borrow instead of AsRef for LlamaToken --- llama-cpp-2/src/sampling.rs | 7 ++++--- llama-cpp-2/src/token/data_array.rs | 13 +++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index abe67352..8781ff48 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -1,5 +1,6 @@ //! Safe wrapper around `llama_sampler`. +use std::borrow::Borrow; use std::ffi::CString; use std::fmt::{Debug, Formatter}; @@ -43,16 +44,16 @@ impl LlamaSampler { /// Accepts several tokens from the sampler or context, possibly updating the internal state of /// certain samplers (e.g. grammar, repetition, etc.) - pub fn accept_many(&mut self, tokens: impl IntoIterator>) { + pub fn accept_many(&mut self, tokens: impl IntoIterator>) { for token in tokens { - unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.as_ref().0) } + unsafe { llama_cpp_sys_2::llama_sampler_accept(self.sampler, token.borrow().0) } } } /// Accepts several tokens from the sampler or context, possibly updating the internal state of /// certain samplers (e.g. grammar, repetition, etc.) #[must_use] - pub fn with_tokens(mut self, tokens: impl IntoIterator>) -> Self { + pub fn with_tokens(mut self, tokens: impl IntoIterator>) -> Self { self.accept_many(tokens); self } diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index 0912af8a..3f75ee8f 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -132,7 +132,17 @@ impl LlamaTokenDataArray { } } + /// Modifies the data array by applying a sampler to it + #[must_use] + pub fn with_sampler(mut self, sampler: &mut LlamaSampler) -> Self { + self.apply_sampler(sampler); + self + } + /// Randomly selects a token from the candidates based on their probabilities. + /// + /// # Panics + /// If the internal llama.cpp sampler fails to select a token. pub fn sample_token(&mut self, seed: u32) -> LlamaToken { self.apply_sampler(&mut LlamaSampler::dist(seed)); self.selected_token() @@ -140,6 +150,9 @@ impl LlamaTokenDataArray { } /// Selects the token with the highest probability. 
+ /// + /// # Panics + /// If the internal llama.cpp sampler fails to select a token. pub fn sample_token_greedy(&mut self) -> LlamaToken { self.apply_sampler(&mut LlamaSampler::greedy()); self.selected_token() From 67ea6889c8ce4ed03423e9cbd4684d697a26fa0f Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Sun, 8 Dec 2024 22:15:42 -0600 Subject: [PATCH 046/193] Test for LlamaSampler::chain_simple --- llama-cpp-2/src/sampling.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 8781ff48..69a8554f 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -84,6 +84,34 @@ impl LlamaSampler { } /// Same as [`Self::chain`] with `no_perf = false`. + /// + /// # Example + /// ```rust + /// use llama_cpp_2::token::{ + /// LlamaToken, + /// data::LlamaTokenData, + /// data_array::LlamaTokenDataArray + /// }; + /// use llama_cpp_2::sampling::LlamaSampler; + /// + /// let mut data_array = LlamaTokenDataArray::new(vec![ + /// LlamaTokenData::new(LlamaToken(0), 0., 0.), + /// LlamaTokenData::new(LlamaToken(1), 1., 0.), + /// LlamaTokenData::new(LlamaToken(2), 2., 0.), + /// ], false); + /// + /// data_array.apply_sampler(&mut LlamaSampler::chain_simple([ + /// LlamaSampler::temp(0.5), + /// LlamaSampler::greedy(), + /// ])); + /// + /// assert_eq!(data_array.data[0].logit(), 0.); + /// assert_eq!(data_array.data[1].logit(), 2.); + /// assert_eq!(data_array.data[2].logit(), 4.); + /// + /// assert_eq!(data_array.data.len(), 3); + /// assert_eq!(data_array.selected_token(), Some(LlamaToken(2))); + /// ``` #[must_use] pub fn chain_simple(samplers: impl IntoIterator) -> Self { Self::chain(samplers, false) From df599194e05c3ed424d72eabbd21f9efbbaa98e2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 05:46:22 +0000 Subject: [PATCH 047/193] chore(deps): bump anyhow from 1.0.93 to 1.0.94 Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.93 to 1.0.94. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.93...1.0.94) --- updated-dependencies: - dependency-name: anyhow dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 27a0f42c..fd63304c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,9 +68,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" [[package]] name = "base64" diff --git a/Cargo.toml b/Cargo.toml index 77b18a8a..07b44493 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.2" -anyhow = "1.0.93" +anyhow = "1.0.94" clap = "4.5.22" encoding_rs = "0.8.34" From c755cad903a07c6b16a2a699ea71de154b1d3703 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 05:46:28 +0000 Subject: [PATCH 048/193] chore(deps): bump encoding_rs from 0.8.34 to 0.8.35 Bumps [encoding_rs](https://github.com/hsivonen/encoding_rs) from 0.8.34 to 0.8.35. 
- [Commits](https://github.com/hsivonen/encoding_rs/compare/v0.8.34...v0.8.35) --- updated-dependencies: - dependency-name: encoding_rs dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 27a0f42c..64c43c4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -293,9 +293,9 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" [[package]] name = "encoding_rs" -version = "0.8.34" +version = "0.8.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ "cfg-if", ] diff --git a/Cargo.toml b/Cargo.toml index 77b18a8a..c33ce1db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ bindgen = "0.69.5" cc = "1.2.2" anyhow = "1.0.93" clap = "4.5.22" -encoding_rs = "0.8.34" +encoding_rs = "0.8.35" [workspace.lints.rust] missing_docs = { level = "warn" } From 6a493c64702f52426ce1a744a07533e695f42da2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 06:57:01 +0000 Subject: [PATCH 049/193] chore(deps): bump cc from 1.2.2 to 1.2.3 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.2 to 1.2.3. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.2...cc-v1.2.3) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20af37d5..23285bcd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.2" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" +checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 6033295c..9ed420d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.2" +cc = "1.2.3" anyhow = "1.0.94" clap = "4.5.22" encoding_rs = "0.8.35" From c773e61b4c6c0bfd1d04aa7108a508a7b3dcfdc3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 06:57:08 +0000 Subject: [PATCH 050/193] chore(deps): bump clap from 4.5.22 to 4.5.23 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.22 to 4.5.23. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.22...clap_complete-v4.5.23) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 12 ++++++------ Cargo.toml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20af37d5..ec9692ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.22" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.22" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstream", "anstyle", @@ -180,9 +180,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.1" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "cmake" diff --git a/Cargo.toml b/Cargo.toml index 6033295c..18ee3e58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.2" anyhow = "1.0.94" -clap = "4.5.22" +clap = "4.5.23" encoding_rs = "0.8.35" [workspace.lints.rust] From 46dc565a4a755ffede2d686a85113d41b7c5afd5 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 18 Dec 2024 17:31:54 +0100 Subject: [PATCH 051/193] build.rs updated to support android --- llama-cpp-sys-2/build.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index f5473769..8ade7a55 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -230,6 +230,25 @@ fn main() { config.static_crt(static_crt); } + if target.contains("android") && target.contains("aarch64") { + // build flags for android taken from this doc + // https://github.com/ggerganov/llama.cpp/blob/master/docs/android.md + let android_ndk = env::var("ANDROID_NDK") + .expect("Please install Android NDK and ensure that ANDROID_NDK env variable is set"); + config.define("LLAMA_CURL", "OFF"); + config.define( + "CMAKE_TOOLCHAIN_FILE", + format!("{android_ndk}/build/cmake/android.toolchain.cmake"), + ); + config.define("ANDROID_ABI", "arm64-v8a"); + config.define("ANDROID_PLATFORM", "android-28"); + config.define("CMAKE_SYSTEM_PROCESSOR", "arm64"); + config.define("CMAKE_C_FLAGS", "-march=armv8.7a"); + config.define("CMAKE_CXX_FLAGS", "-march=armv8.7a"); + config.define("GGML_OPENMP", "OFF"); + config.define("GGML_LLAMAFILE", "OFF"); + } + if cfg!(feature = "vulkan") { config.define("GGML_VULKAN", "ON"); if cfg!(windows) { From 3103336eaf905671024cb37cf6a85213be4f64dc Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 18 Dec 2024 18:23:07 +0100 Subject: [PATCH 052/193] use different type in dry sampler when building for android --- llama-cpp-2/src/sampling.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 69a8554f..e3237074 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -20,6 +20,15 @@ impl Debug for LlamaSampler { } } +// this is 
needed for the dry sampler to typecheck on android +// ...because what is normally an i8, is an u8 +#[cfg(target_os = "android")] +type CChar = u8; + +#[cfg(not(target_os = "android"))] +type CChar = i8; + + impl LlamaSampler { /// Sample and accept a token from the idx-th output of the last evaluation #[must_use] @@ -254,7 +263,7 @@ impl LlamaSampler { .into_iter() .map(|s| CString::new(s.as_ref()).unwrap()) .collect(); - let mut seq_breaker_pointers: Vec<*const i8> = + let mut seq_breaker_pointers: Vec<*const CChar> = seq_breakers.iter().map(|s| s.as_ptr()).collect(); let sampler = unsafe { From 55059ad42ce90f3c3f9070f982984b8d496bfc10 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Thu, 19 Dec 2024 12:11:15 +0100 Subject: [PATCH 053/193] remove unnecessary LLAMA_CURL excemption --- llama-cpp-sys-2/build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 8ade7a55..ed57895d 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -235,7 +235,6 @@ fn main() { // https://github.com/ggerganov/llama.cpp/blob/master/docs/android.md let android_ndk = env::var("ANDROID_NDK") .expect("Please install Android NDK and ensure that ANDROID_NDK env variable is set"); - config.define("LLAMA_CURL", "OFF"); config.define( "CMAKE_TOOLCHAIN_FILE", format!("{android_ndk}/build/cmake/android.toolchain.cmake"), From 5430d5a8b91e120dc7da9a0e350ea49f570de6b2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 05:12:09 +0000 Subject: [PATCH 054/193] chore(deps): bump anyhow from 1.0.94 to 1.0.95 Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.94 to 1.0.95. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.94...1.0.95) --- updated-dependencies: - dependency-name: anyhow dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 39f873e6..0def8ca9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,9 +68,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.94" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" [[package]] name = "base64" diff --git a/Cargo.toml b/Cargo.toml index de4e69ae..834ccbad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.3" -anyhow = "1.0.94" +anyhow = "1.0.95" clap = "4.5.23" encoding_rs = "0.8.35" From b11b90148f138f2b059c2e036d61a67475339e7f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 05:15:53 +0000 Subject: [PATCH 055/193] chore(deps): bump docker/setup-buildx-action from 3.7.1 to 3.8.0 Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3.7.1 to 3.8.0. 
- [Release notes](https://github.com/docker/setup-buildx-action/releases) - [Commits](https://github.com/docker/setup-buildx-action/compare/c47758b77c9736f4b2ef4073d4d51994fabfe349...6524bf65af31da8d45b59e8c27de4bd072b392f5) --- updated-dependencies: - dependency-name: docker/setup-buildx-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/llama-cpp-rs-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index 9ce9ada5..4933e395 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -49,7 +49,7 @@ jobs: with: platforms: arm64,amd64 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@c47758b77c9736f4b2ef4073d4d51994fabfe349 + uses: docker/setup-buildx-action@6524bf65af31da8d45b59e8c27de4bd072b392f5 - name: Build uses: docker/build-push-action@v6 with: From 6c1b97e4aac5247bccffda6351b09129095977ca Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Dec 2024 05:10:40 +0000 Subject: [PATCH 056/193] chore(deps): bump glob from 0.3.1 to 0.3.2 Bumps [glob](https://github.com/rust-lang/glob) from 0.3.1 to 0.3.2. - [Release notes](https://github.com/rust-lang/glob/releases) - [Changelog](https://github.com/rust-lang/glob/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/glob/compare/0.3.1...v0.3.2) --- updated-dependencies: - dependency-name: glob dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- llama-cpp-sys-2/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 39f873e6..e9c38594 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -383,9 +383,9 @@ dependencies = [ [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "heck" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index ce49aba9..5ec00959 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -62,7 +62,7 @@ include = [ bindgen = { workspace = true } cc = { workspace = true, features = ["parallel"] } cmake = "0.1" -glob = "0.3.1" +glob = "0.3.2" [features] cuda = [] From ab288bd030c251bc2342e65fc58d1c638f0047c7 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Mon, 30 Dec 2024 10:41:02 -0600 Subject: [PATCH 057/193] Fix mutability for some sampling methods --- llama-cpp-2/src/sampling.rs | 15 +++++++++------ llama-cpp-2/src/token/data_array.rs | 7 ++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 69a8554f..3102c491 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -23,7 +23,7 @@ impl Debug for LlamaSampler { impl LlamaSampler { /// Sample and accept a token from the idx-th output of the last evaluation #[must_use] - pub fn sample(&self, ctx: &LlamaContext, idx: i32) -> LlamaToken { + pub fn sample(&mut self, ctx: &LlamaContext, idx: i32) -> LlamaToken { let token = unsafe { llama_cpp_sys_2::llama_sampler_sample(self.sampler, ctx.context.as_ptr(), idx) }; @@ -32,7 +32,7 @@ impl 
LlamaSampler { } /// Applies this sampler to a [`LlamaTokenDataArray`]. - pub fn apply(&mut self, data_array: &mut LlamaTokenDataArray) { + pub fn apply(&self, data_array: &mut LlamaTokenDataArray) { data_array.apply_sampler(self); } @@ -53,7 +53,10 @@ impl LlamaSampler { /// Accepts several tokens from the sampler or context, possibly updating the internal state of /// certain samplers (e.g. grammar, repetition, etc.) #[must_use] - pub fn with_tokens(mut self, tokens: impl IntoIterator>) -> Self { + pub fn with_tokens( + mut self, + tokens: impl IntoIterator>, + ) -> Self { self.accept_many(tokens); self } @@ -215,7 +218,7 @@ impl LlamaSampler { Self { sampler } } - /// Grammar sampler + /// Grammar sampler /// /// # Panics /// If either of ``grammar_str`` or ``grammar_root`` contain null bytes. @@ -386,10 +389,10 @@ impl LlamaSampler { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_dist(seed) }; Self { sampler } } - + /// Selects the most likely token /// - /// # Example: + /// # Example: /// ```rust /// use llama_cpp_2::token::{ /// LlamaToken, diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index 3f75ee8f..7f583064 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -1,10 +1,7 @@ //! an rusty equivalent of `llama_token_data_array`. use std::ptr; -use crate::{ - sampling::LlamaSampler, - token::data::LlamaTokenData, -}; +use crate::{sampling::LlamaSampler, token::data::LlamaTokenData}; use super::LlamaToken; @@ -124,7 +121,7 @@ impl LlamaTokenDataArray { } /// Modifies the data array by applying a sampler to it - pub fn apply_sampler(&mut self, sampler: &mut LlamaSampler) { + pub fn apply_sampler(&mut self, sampler: &LlamaSampler) { unsafe { self.modify_as_c_llama_token_data_array(|c_llama_token_data_array| { llama_cpp_sys_2::llama_sampler_apply(sampler.sampler, c_llama_token_data_array); From 0c91d64cdd1dcf22e429b7a6d3c65d402623c9c9 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Mon, 30 Dec 2024 10:41:39 -0600 Subject: [PATCH 058/193] Make chat_template and detokenization methods more reliable --- llama-cpp-2/src/lib.rs | 3 -- llama-cpp-2/src/model.rs | 79 +++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 8e09608f..862f64b2 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -280,9 +280,6 @@ pub enum NewLlamaChatMessageError { /// Failed to apply model chat template. #[derive(Debug, thiserror::Error)] pub enum ApplyChatTemplateError { - /// the buffer was too small. - #[error("The buffer was too small. Please contact a maintainer and we will update it.")] - BuffSizeError, /// the string contained a null byte and thus could not be converted to a c string. #[error("{0}")] NulError(#[from] NulError), diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 54c82bd5..90c5e406 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -1,5 +1,4 @@ //! A safe wrapper around `llama_model`. -use std::ffi::CStr; use std::ffi::CString; use std::num::NonZeroU16; use std::os::raw::c_int; @@ -124,7 +123,7 @@ impl LlamaModel { unsafe { llama_cpp_sys_2::llama_token_is_eog(self.model.as_ptr(), token.0) } } - /// Get the decoder start token token. + /// Get the decoder start token. 
#[must_use] pub fn decode_start_token(&self) -> LlamaToken { let token = @@ -142,7 +141,8 @@ impl LlamaModel { token: LlamaToken, special: Special, ) -> Result { - self.token_to_str_with_size(token, 32, special) + let bytes = self.token_to_bytes(token, special)?; + Ok(String::from_utf8(bytes)?) } /// Convert single token to bytes. @@ -155,7 +155,12 @@ impl LlamaModel { token: LlamaToken, special: Special, ) -> Result, TokenToStringError> { - self.token_to_bytes_with_size(token, 32, special, None) + match self.token_to_bytes_with_size(token, 8, special, None) { + Err(TokenToStringError::InsufficientBufferSpace(i)) => { + self.token_to_bytes_with_size(token, -i as usize, special, None) + } + x => x, + } } /// Convert a vector of tokens to a single string. @@ -168,15 +173,15 @@ impl LlamaModel { tokens: &[LlamaToken], special: Special, ) -> Result { - let mut builder = String::with_capacity(tokens.len() * 4); - for str in tokens + let mut builder: Vec = Vec::with_capacity(tokens.len() * 4); + for piece in tokens .iter() .copied() - .map(|t| self.token_to_str(t, special)) + .map(|t| self.token_to_bytes(t, special)) { - builder += &str?; + builder.extend_from_slice(&piece?); } - Ok(builder) + Ok(String::from_utf8(builder)?) } /// Convert a string to a Vector of tokens. @@ -212,7 +217,7 @@ impl LlamaModel { }; let tokens_estimation = std::cmp::max(8, (str.len() / 2) + usize::from(add_bos)); - let mut buffer = Vec::with_capacity(tokens_estimation); + let mut buffer: Vec = Vec::with_capacity(tokens_estimation); let c_string = CString::new(str)?; let buffer_capacity = @@ -223,7 +228,7 @@ impl LlamaModel { self.model.as_ptr(), c_string.as_ptr(), c_int::try_from(c_string.as_bytes().len())?, - buffer.as_mut_ptr(), + buffer.as_mut_ptr() as *mut llama_cpp_sys_2::llama_token, buffer_capacity, add_bos, true, @@ -239,7 +244,7 @@ impl LlamaModel { self.model.as_ptr(), c_string.as_ptr(), c_int::try_from(c_string.as_bytes().len())?, - buffer.as_mut_ptr(), + buffer.as_mut_ptr() as *mut llama_cpp_sys_2::llama_token, -size, add_bos, true, @@ -253,7 +258,7 @@ impl LlamaModel { // Safety: `size` < `capacity` and llama-cpp has initialized elements up to `size` unsafe { buffer.set_len(size) } - Ok(buffer.into_iter().map(LlamaToken).collect()) + Ok(buffer) } /// Get the type of a token. @@ -269,8 +274,8 @@ impl LlamaModel { /// Convert a token to a string with a specified buffer size. /// - /// Generally you should use [`LlamaModel::token_to_str`] instead as 8 bytes is enough for most words and - /// the extra bytes do not really matter. + /// Generally you should use [`LlamaModel::token_to_str`] as it is able to decode tokens with + /// any length. /// /// # Errors /// @@ -294,8 +299,8 @@ impl LlamaModel { /// Convert a token to bytes with a specified buffer size. /// - /// Generally you should use [`LlamaModel::token_to_bytes`] instead as 8 bytes is enough for most words and - /// the extra bytes do not really matter. + /// Generally you should use [`LlamaModel::token_to_bytes`] as it is able to handle tokens of + /// any length. 
/// /// # Errors /// @@ -523,7 +528,7 @@ impl LlamaModel { let message_length = chat.iter().fold(0, |acc, c| { acc + c.role.to_bytes().len() + c.content.to_bytes().len() }); - let mut buff = vec![0; message_length * 4]; + let mut buff: Vec = vec![0; message_length * 2]; // Build our llama_cpp_sys_2 chat messages let chat: Vec = chat @@ -541,28 +546,36 @@ impl LlamaModel { None => std::ptr::null(), }; - let formatted_chat = unsafe { - let res = llama_cpp_sys_2::llama_chat_apply_template( + let mut res = unsafe { + llama_cpp_sys_2::llama_chat_apply_template( self.model.as_ptr(), tmpl_ptr, chat.as_ptr(), chat.len(), add_ass, - buff.as_mut_ptr(), + buff.as_mut_ptr().cast::(), buff.len() as i32, - ); - // A buffer twice the size should be sufficient for all models, if this is not the case for a new model, we can increase it - // The error message informs the user to contact a maintainer - if res > buff.len() as i32 { - return Err(ApplyChatTemplateError::BuffSizeError); - } - Ok::( - CStr::from_ptr(buff.as_mut_ptr()) - .to_string_lossy() - .to_string(), ) - }?; - Ok(formatted_chat) + }; + + if res > buff.len() as i32 { + buff.resize(res as usize, 0); + + res = unsafe { + llama_cpp_sys_2::llama_chat_apply_template( + self.model.as_ptr(), + tmpl_ptr, + chat.as_ptr(), + chat.len(), + add_ass, + buff.as_mut_ptr().cast::(), + buff.len() as i32, + ) + }; + assert!(res > buff.len() as i32); + } + buff.truncate(res as usize); + Ok(String::from_utf8(buff)?) } } From f8b3f8d2e389d5262c0a2adf22b74e82639db854 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Mon, 30 Dec 2024 10:50:49 -0600 Subject: [PATCH 059/193] Fix assertion --- llama-cpp-2/src/model.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 90c5e406..be828ef1 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -572,7 +572,7 @@ impl LlamaModel { buff.len() as i32, ) }; - assert!(res > buff.len() as i32); + assert_eq!(res, buff.len() as i32); } buff.truncate(res as usize); Ok(String::from_utf8(buff)?) From 6b1042966fa9bcf16c857ed6031bbd754afe8e8c Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Mon, 30 Dec 2024 18:08:35 -0600 Subject: [PATCH 060/193] Small tweak --- llama-cpp-2/src/model.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index be828ef1..127ff6b7 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -546,7 +546,7 @@ impl LlamaModel { None => std::ptr::null(), }; - let mut res = unsafe { + let res = unsafe { llama_cpp_sys_2::llama_chat_apply_template( self.model.as_ptr(), tmpl_ptr, @@ -561,7 +561,7 @@ impl LlamaModel { if res > buff.len() as i32 { buff.resize(res as usize, 0); - res = unsafe { + let res = unsafe { llama_cpp_sys_2::llama_chat_apply_template( self.model.as_ptr(), tmpl_ptr, From 264b32deb915616d4733abb5139f87c4dbbb5026 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 23:55:26 +0000 Subject: [PATCH 061/193] chore(deps): bump cc from 1.2.3 to 1.2.6 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.3 to 1.2.6. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.3...cc-v1.2.6) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8efd45bc..a6f92428 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.3" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d" +checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 834ccbad..e9504ecb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.3" +cc = "1.2.6" anyhow = "1.0.95" clap = "4.5.23" encoding_rs = "0.8.35" From 93a47234ac0476e4c80f53a7816f64ea6cf3014c Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 3 Jan 2025 00:00:32 +0000 Subject: [PATCH 062/193] Bump version to 0.1.87 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a6f92428..0022d77c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.86" +version = "0.1.87" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.86" +version = "0.1.87" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.86" +version = "0.1.87" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.86" +version = "0.1.87" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 64cd7814..e02aa459 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.86" +version = "0.1.87" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 4ec54baf..5e9d0b9a 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.86" +version = "0.1.87" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 1a583910..3e614d17 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.86" +version = "0.1.87" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 5ec00959..38bb3989 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.86" 
+version = "0.1.87" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 38bb1c15f47b2ec570a3ffe312f25358ea00e6e4 Mon Sep 17 00:00:00 2001 From: xutianyi Date: Tue, 7 Jan 2025 18:23:38 +0800 Subject: [PATCH 063/193] fix shared libs not found on windows #583 --- llama-cpp-sys-2/build.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ed57895d..cdec57e1 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -101,7 +101,8 @@ fn extract_lib_assets(out_dir: &Path) -> Vec { "*.so" }; - let libs_dir = out_dir.join("lib"); + let shared_libs_dir = if cfg!(windows) { "bin" } else { "lib" }; + let libs_dir = out_dir.join(shared_libs_dir); let pattern = libs_dir.join(shared_lib_pattern); debug_log!("Extract lib assets {}", pattern.display()); let mut files = Vec::new(); From c3ad99b8cf815928b33d382fc7a9943e25f69bc7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 05:19:54 +0000 Subject: [PATCH 064/193] chore(deps): bump docker/setup-qemu-action from 3.2.0 to 3.3.0 Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3.2.0 to 3.3.0. - [Release notes](https://github.com/docker/setup-qemu-action/releases) - [Commits](https://github.com/docker/setup-qemu-action/compare/49b3bc8e6bdd4a60e6116a5414239cba5943d3cf...53851d14592bedcffcf25ea515637cff71ef929a) --- updated-dependencies: - dependency-name: docker/setup-qemu-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/llama-cpp-rs-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index 4933e395..f2300ea1 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -45,7 +45,7 @@ jobs: - name: checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - name: Setup QEMU - uses: docker/setup-qemu-action@49b3bc8e6bdd4a60e6116a5414239cba5943d3cf + uses: docker/setup-qemu-action@53851d14592bedcffcf25ea515637cff71ef929a with: platforms: arm64,amd64 - name: Set up Docker Buildx From 09ddb60c2f8e762347e52fb8e07e7935bcab0fdf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 05:41:00 +0000 Subject: [PATCH 065/193] chore(deps): bump clap from 4.5.23 to 4.5.26 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.23 to 4.5.26. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.23...clap_complete-v4.5.26) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 12 ++++++------ Cargo.toml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0022d77c..72b08089 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.23" +version = "4.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" +checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.23" +version = "4.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" +checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121" dependencies = [ "anstream", "anstyle", @@ -168,9 +168,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.18" +version = "4.5.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" dependencies = [ "heck", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index e9504ecb..2d4b0565 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.6" anyhow = "1.0.95" -clap = "4.5.23" +clap = "4.5.26" encoding_rs = "0.8.35" [workspace.lints.rust] From e1f49db8b880ab8371cea16fd3e7aad9b2c5a8e7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 05:41:08 +0000 Subject: [PATCH 066/193] chore(deps): bump cc from 1.2.6 to 1.2.9 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.6 to 1.2.9. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.6...cc-v1.2.9) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0022d77c..5785eca8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.6" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333" +checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index e9504ecb..55df1ebe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.6" +cc = "1.2.9" anyhow = "1.0.95" clap = "4.5.23" encoding_rs = "0.8.35" From 276d165092ea4707d61d07739de04f84a6003034 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Wed, 15 Jan 2025 01:17:34 +0000 Subject: [PATCH 067/193] Bump version to 0.1.88 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f55cc0b7..985d853b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.87" +version = "0.1.88" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.87" +version = "0.1.88" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.87" +version = "0.1.88" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.87" +version = "0.1.88" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index e02aa459..117fe641 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.87" +version = "0.1.88" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 5e9d0b9a..95e200a5 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.87" +version = "0.1.88" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 3e614d17..543a1656 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.87" +version = "0.1.88" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 38bb3989..b1d8d1cd 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.87" 
+version = "0.1.88" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 6cfcc226e074aa04a52bf351c4008e49a86a7aa0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 05:46:54 +0000 Subject: [PATCH 068/193] chore(deps): bump enumflags2 from 0.7.10 to 0.7.11 Bumps [enumflags2](https://github.com/meithecatte/enumflags2) from 0.7.10 to 0.7.11. - [Release notes](https://github.com/meithecatte/enumflags2/releases) - [Commits](https://github.com/meithecatte/enumflags2/compare/v0.7.10...v0.7.11) --- updated-dependencies: - dependency-name: enumflags2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- llama-cpp-2/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 985d853b..bee2b0ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -302,18 +302,18 @@ dependencies = [ [[package]] name = "enumflags2" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d232db7f5956f3f14313dc2f87985c58bd2c695ce124c8cdd984e08e15ac133d" +checksum = "ba2f4b465f5318854c6f8dd686ede6c0a9dc67d4b1ac241cf0eb51521a309147" dependencies = [ "enumflags2_derive", ] [[package]] name = "enumflags2_derive" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0d48a183585823424a4ce1aa132d174a6a81bd540895822eb4c8373a8e49e8" +checksum = "fc4caf64a58d7a6d65ab00639b046ff54399a39f5f2554728895ace4b297cd79" dependencies = [ "proc-macro2", "quote", diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 543a1656..bf806af0 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -9,7 +9,7 @@ repository = "https://github.com/utilityai/llama-cpp-rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -enumflags2 = "0.7.10" +enumflags2 = "0.7.11" llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.69" } thiserror = { workspace = true } tracing = { workspace = true } From badfe32c842635d7491d52d7f46adec1209b4cfa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 05:47:06 +0000 Subject: [PATCH 069/193] chore(deps): bump cc from 1.2.9 to 1.2.10 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.9 to 1.2.10. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.9...cc-v1.2.10) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 985d853b..e8d180c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.9" +version = "1.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" +checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index cf9976cd..20b75e64 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.9" +cc = "1.2.10" anyhow = "1.0.95" clap = "4.5.26" encoding_rs = "0.8.35" From 29ae7fe20c22eb1a32b83b15a80564d3d665159f Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 20 Jan 2025 16:23:41 +0000 Subject: [PATCH 070/193] Bump version to 0.1.89 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a83dfae3..5c4b4068 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.88" +version = "0.1.89" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.88" +version = "0.1.89" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.88" +version = "0.1.89" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.88" +version = "0.1.89" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 117fe641..95e753a1 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.88" +version = "0.1.89" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 95e200a5..cd5f91aa 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.88" +version = "0.1.89" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index bf806af0..05b31476 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.88" +version = "0.1.89" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index b1d8d1cd..e380414a 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.88" 
+version = "0.1.89" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 6942e137ac52cee5abefa279da9140f4bf5dbf84 Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Tue, 21 Jan 2025 11:16:08 -0600 Subject: [PATCH 071/193] Update llama.cpp to latest version --- llama-cpp-2/src/context.rs | 4 +-- llama-cpp-2/src/model.rs | 31 ++++++++++++---------- llama-cpp-2/src/sampling.rs | 52 +++---------------------------------- llama-cpp-sys-2/llama.cpp | 2 +- 4 files changed, 24 insertions(+), 65 deletions(-) diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index 8946da2b..94e08b10 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -318,7 +318,7 @@ impl<'model> LlamaContext<'model> { scale: f32, ) -> Result<(), LlamaLoraAdapterSetError> { let err_code = unsafe { - llama_cpp_sys_2::llama_lora_adapter_set( + llama_cpp_sys_2::llama_set_adapter_lora( self.context.as_ptr(), adapter.lora_adapter.as_ptr(), scale, @@ -342,7 +342,7 @@ impl<'model> LlamaContext<'model> { adapter: &mut LlamaLoraAdapter, ) -> Result<(), LlamaLoraAdapterRemoveError> { let err_code = unsafe { - llama_cpp_sys_2::llama_lora_adapter_remove( + llama_cpp_sys_2::llama_rm_adapter_lora( self.context.as_ptr(), adapter.lora_adapter.as_ptr(), ) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 127ff6b7..deefaf0d 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -31,7 +31,7 @@ pub struct LlamaModel { #[repr(transparent)] #[allow(clippy::module_name_repetitions)] pub struct LlamaLoraAdapter { - pub(crate) lora_adapter: NonNull, + pub(crate) lora_adapter: NonNull, } /// A Safe wrapper around `llama_chat_message` @@ -74,6 +74,10 @@ unsafe impl Send for LlamaModel {} unsafe impl Sync for LlamaModel {} impl LlamaModel { + pub(crate) fn vocab_ptr(&self) -> *const llama_cpp_sys_2::llama_vocab { + unsafe { llama_cpp_sys_2::llama_model_get_vocab(self.model.as_ptr()) } + } + /// get the number of tokens the model was trained on /// /// # Panics @@ -99,28 +103,28 @@ impl LlamaModel { /// Get the beginning of stream token. #[must_use] pub fn token_bos(&self) -> LlamaToken { - let token = unsafe { llama_cpp_sys_2::llama_token_bos(self.model.as_ptr()) }; + let token = unsafe { llama_cpp_sys_2::llama_token_bos(self.vocab_ptr()) }; LlamaToken(token) } /// Get the end of stream token. #[must_use] pub fn token_eos(&self) -> LlamaToken { - let token = unsafe { llama_cpp_sys_2::llama_token_eos(self.model.as_ptr()) }; + let token = unsafe { llama_cpp_sys_2::llama_token_eos(self.vocab_ptr()) }; LlamaToken(token) } /// Get the newline token. #[must_use] pub fn token_nl(&self) -> LlamaToken { - let token = unsafe { llama_cpp_sys_2::llama_token_nl(self.model.as_ptr()) }; + let token = unsafe { llama_cpp_sys_2::llama_token_nl(self.vocab_ptr()) }; LlamaToken(token) } /// Check if a token represents the end of generation (end of turn, end of sequence, etc.) #[must_use] pub fn is_eog_token(&self, token: LlamaToken) -> bool { - unsafe { llama_cpp_sys_2::llama_token_is_eog(self.model.as_ptr(), token.0) } + unsafe { llama_cpp_sys_2::llama_token_is_eog(self.vocab_ptr(), token.0) } } /// Get the decoder start token. 
@@ -225,7 +229,7 @@ impl LlamaModel { let size = unsafe { llama_cpp_sys_2::llama_tokenize( - self.model.as_ptr(), + self.vocab_ptr(), c_string.as_ptr(), c_int::try_from(c_string.as_bytes().len())?, buffer.as_mut_ptr() as *mut llama_cpp_sys_2::llama_token, @@ -241,7 +245,7 @@ impl LlamaModel { buffer.reserve_exact(usize::try_from(-size).expect("usize's are larger ")); unsafe { llama_cpp_sys_2::llama_tokenize( - self.model.as_ptr(), + self.vocab_ptr(), c_string.as_ptr(), c_int::try_from(c_string.as_bytes().len())?, buffer.as_mut_ptr() as *mut llama_cpp_sys_2::llama_token, @@ -268,7 +272,7 @@ impl LlamaModel { /// If the token type is not known to this library. #[must_use] pub fn token_attr(&self, LlamaToken(id): LlamaToken) -> LlamaTokenAttrs { - let token_type = unsafe { llama_cpp_sys_2::llama_token_get_attr(self.model.as_ptr(), id) }; + let token_type = unsafe { llama_cpp_sys_2::llama_token_get_attr(self.vocab_ptr(), id) }; LlamaTokenAttrs::try_from(token_type).expect("token type is valid") } @@ -347,7 +351,7 @@ impl LlamaModel { let lstrip = lstrip.map_or(0, |it| i32::from(it.get())); let size = unsafe { llama_cpp_sys_2::llama_token_to_piece( - self.model.as_ptr(), + self.vocab_ptr(), token.0, buf, len, @@ -374,7 +378,7 @@ impl LlamaModel { /// without issue. #[must_use] pub fn n_vocab(&self) -> i32 { - unsafe { llama_cpp_sys_2::llama_n_vocab(self.model.as_ptr()) } + unsafe { llama_cpp_sys_2::llama_n_vocab(self.vocab_ptr()) } } /// The type of vocab the model was trained on. @@ -384,7 +388,8 @@ impl LlamaModel { /// If llama-cpp emits a vocab type that is not known to this library. #[must_use] pub fn vocab_type(&self) -> VocabType { - let vocab_type = unsafe { llama_cpp_sys_2::llama_vocab_type(self.model.as_ptr()) }; + // llama_cpp_sys_2::llama_model_get_vocab + let vocab_type = unsafe { llama_cpp_sys_2::llama_vocab_type(self.vocab_ptr()) }; VocabType::try_from(vocab_type).expect("invalid vocab type") } @@ -479,7 +484,7 @@ impl LlamaModel { let cstr = CString::new(path)?; let adapter = - unsafe { llama_cpp_sys_2::llama_lora_adapter_init(self.model.as_ptr(), cstr.as_ptr()) }; + unsafe { llama_cpp_sys_2::llama_adapter_lora_init(self.model.as_ptr(), cstr.as_ptr()) }; let adapter = NonNull::new(adapter).ok_or(LlamaLoraAdapterInitError::NullResult)?; @@ -548,7 +553,6 @@ impl LlamaModel { let res = unsafe { llama_cpp_sys_2::llama_chat_apply_template( - self.model.as_ptr(), tmpl_ptr, chat.as_ptr(), chat.len(), @@ -563,7 +567,6 @@ impl LlamaModel { let res = unsafe { llama_cpp_sys_2::llama_chat_apply_template( - self.model.as_ptr(), tmpl_ptr, chat.as_ptr(), chat.len(), diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 88c4ee5d..d33f92c8 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -238,7 +238,7 @@ impl LlamaSampler { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_grammar( - model.model.as_ptr(), + model.vocab_ptr(), grammar_str.as_ptr(), grammar_root.as_ptr(), ) @@ -264,14 +264,15 @@ impl LlamaSampler { ) -> Self { let seq_breakers: Vec = seq_breakers .into_iter() - .map(|s| CString::new(s.as_ref()).unwrap()) + .map(|s| CString::new(s.as_ref()).expect("A sequence breaker contains null bytes")) .collect(); let mut seq_breaker_pointers: Vec<*const CChar> = seq_breakers.iter().map(|s| s.as_ptr()).collect(); let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_dry( - model.model.as_ptr(), + model.vocab_ptr(), + model.n_ctx_train().try_into().expect("n_ctx_train is greater than two billion"), multiplier, base, 
allowed_length, @@ -286,74 +287,29 @@ impl LlamaSampler { /// Penalizes tokens for being present in the context. /// /// Parameters: - /// - ``n_vocab``: [`LlamaModel::n_vocab`] - /// - ``special_eos)id``: [`LlamaModel::token_eos`] - /// - ``linefeed_id``: [`LlamaModel::token_nl`] /// - ``penalty_last_n``: last n tokens to penalize (0 = disable penalty, -1 = context size) /// - ``penalty_repeat``: 1.0 = disabled /// - ``penalty_freq``: 0.0 = disabled /// - ``penalty_present``: 0.0 = disabled - /// - ``penalize_nl``: consider newlines as a repeatable token - /// - ``ignore_eos``: ignore the end-of-sequence token #[allow(clippy::too_many_arguments)] #[must_use] pub fn penalties( - n_vocab: i32, - special_eos_id: i32, - linefeed_id: i32, penalty_last_n: i32, penalty_repeat: f32, penalty_freq: f32, penalty_present: f32, - penalize_nl: bool, - ignore_eos: bool, ) -> Self { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_penalties( - n_vocab, - special_eos_id, - linefeed_id, penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - penalize_nl, - ignore_eos, ) }; Self { sampler } } - /// Same as [`Self::penalties`], but with `n_vocab`, `special_eos_id`, and `linefeed_id` - /// initialized from `model`, `penalize_nl = false`, and `ignore_eos = true`. - /// - /// Parameters: - /// - ``model``: The model's tokenizer to use to initialize the sampler. - /// - ``penalty_last_n``: last n tokens to penalize (0 = disable penalty, -1 = context size) - /// - ``penalty_repeat``: 1.0 = disabled - /// - ``penalty_freq``: 0.0 = disabled - /// - ``penalty_present``: 0.0 = disabled - #[must_use] - pub fn penalties_simple( - model: &LlamaModel, - penalty_last_n: i32, - penalty_repeat: f32, - penalty_freq: f32, - penalty_present: f32, - ) -> Self { - Self::penalties( - model.n_vocab(), - model.token_eos().0, - model.token_nl().0, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - false, - true, - ) - } - /// Mirostat 1.0 algorithm described in the paper . Uses tokens instead of words. /// /// # Parameters: diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 64ed2091..6171c9d2 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 64ed2091b24b2f9747148fdf49a34ed5938762c3 +Subproject commit 6171c9d25820ccf676b243c172868819d882848f From 5dacedffe3ff607ed1402a718eeb5dce3ab6276e Mon Sep 17 00:00:00 2001 From: Nathan Koppel Date: Tue, 21 Jan 2025 11:54:27 -0600 Subject: [PATCH 072/193] Resolve all clippy lints --- llama-cpp-2/src/context.rs | 4 +- llama-cpp-2/src/context/kv_cache.rs | 46 ++++++++++++-------- llama-cpp-2/src/llama_batch.rs | 24 +++++++--- llama-cpp-2/src/model.rs | 44 +++++++++++-------- llama-cpp-2/src/model/params/kv_overrides.rs | 2 +- llama-cpp-2/src/sampling.rs | 9 ++-- llama-cpp-2/src/token/data_array.rs | 4 +- llama-cpp-sys-2/build.rs | 25 ++++------- 8 files changed, 91 insertions(+), 67 deletions(-) diff --git a/llama-cpp-2/src/context.rs b/llama-cpp-2/src/context.rs index 94e08b10..10f2d7eb 100644 --- a/llama-cpp-2/src/context.rs +++ b/llama-cpp-2/src/context.rs @@ -52,13 +52,13 @@ impl<'model> LlamaContext<'model> { } } - /// Gets the max number of logical tokens that can be submitted to decode. Must be greater than or equal to n_ubatch. + /// Gets the max number of logical tokens that can be submitted to decode. Must be greater than or equal to [`Self::n_ubatch`]. 
#[must_use] pub fn n_batch(&self) -> u32 { unsafe { llama_cpp_sys_2::llama_n_batch(self.context.as_ptr()) } } - /// Gets the max number of physical tokens (hardware level) to decode in batch. Must be less than or equal to n_batch. + /// Gets the max number of physical tokens (hardware level) to decode in batch. Must be less than or equal to [`Self::n_batch`]. #[must_use] pub fn n_ubatch(&self) -> u32 { unsafe { llama_cpp_sys_2::llama_n_ubatch(self.context.as_ptr()) } diff --git a/llama-cpp-2/src/context/kv_cache.rs b/llama-cpp-2/src/context/kv_cache.rs index d5a8ed65..d90a6b8a 100644 --- a/llama-cpp-2/src/context/kv_cache.rs +++ b/llama-cpp-2/src/context/kv_cache.rs @@ -6,6 +6,7 @@ use std::num::{NonZeroU8, TryFromIntError}; /// Errors that can occur when attempting to prepare values for the kv cache #[derive(Debug, Eq, PartialEq, thiserror::Error)] +#[allow(clippy::module_name_repetitions)] pub enum KvCacheConversionError { /// Sequence id conversion to i32 failed #[error("Provided sequence id is too large for a i32")] @@ -33,15 +34,16 @@ impl LlamaContext<'_> { /// Copy the cache from one sequence to another. /// /// # Returns - /// A `Result` indicating whether the operation was successful. If the either position exceeds - /// the maximum i32 value, no copy is attempted and an `Err` is returned. + /// A `Result` indicating whether the operation was successful. /// /// # Parameters - /// /// * `src` - The sequence id to copy the cache from. /// * `dest` - The sequence id to copy the cache to. /// * `p0` - The start position of the cache to clear. If `None`, the entire cache is copied up to `p1`. /// * `p1` - The end position of the cache to clear. If `None`, the entire cache is copied starting from `p0`. + /// + /// # Errors + /// If either position exceeds [`i32::MAX`]. pub fn copy_kv_cache_seq( &mut self, src: i32, @@ -51,10 +53,10 @@ impl LlamaContext<'_> { ) -> Result<(), KvCacheConversionError> { let p0 = p0 .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::P0TooLarge(e))?; + .map_err(KvCacheConversionError::P0TooLarge)?; let p1 = p1 .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::P1TooLarge(e))?; + .map_err(KvCacheConversionError::P1TooLarge)?; unsafe { llama_cpp_sys_2::llama_kv_cache_seq_cp(self.context.as_ptr(), src, dest, p0, p1); } @@ -69,10 +71,12 @@ impl LlamaContext<'_> { /// either position exceeds the maximum i32 value, no removal is attempted and an `Err` is returned. /// /// # Parameters - /// /// * `src` - The sequence id to clear the cache for. If `None`, matches all sequences /// * `p0` - The start position of the cache to clear. If `None`, the entire cache is cleared up to `p1`. /// * `p1` - The end position of the cache to clear. If `None`, the entire cache is cleared from `p0`. + /// + /// # Errors + /// If the sequence id or either position exceeds [`i32::MAX`]. 
pub fn clear_kv_cache_seq( &mut self, src: Option, @@ -81,13 +85,13 @@ impl LlamaContext<'_> { ) -> Result { let src = src .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::SeqIdTooLarge(e))?; + .map_err(KvCacheConversionError::SeqIdTooLarge)?; let p0 = p0 .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::P0TooLarge(e))?; + .map_err(KvCacheConversionError::P0TooLarge)?; let p1 = p1 .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::P1TooLarge(e))?; + .map_err(KvCacheConversionError::P1TooLarge)?; Ok(unsafe { llama_cpp_sys_2::llama_kv_cache_seq_rm(self.context.as_ptr(), src, p0, p1) }) } @@ -118,8 +122,7 @@ impl LlamaContext<'_> { /// - explicitly with [`Self::kv_cache_update`] /// /// # Returns - /// A `Result` indicating whether the operation was successful. If either position - /// exceeds the maximum i32 value, no update is attempted and an `Err` is returned. + /// A `Result` indicating whether the operation was successful. /// /// # Parameters /// @@ -127,6 +130,9 @@ impl LlamaContext<'_> { /// * `p0` - The start position of the cache to update. If `None`, the entire cache is updated up to `p1`. /// * `p1` - The end position of the cache to update. If `None`, the entire cache is updated starting from `p0`. /// * `delta` - The relative position to add to the tokens + /// + /// # Errors + /// If either position exceeds [`i32::MAX`]. pub fn kv_cache_seq_add( &mut self, seq_id: i32, @@ -136,10 +142,10 @@ impl LlamaContext<'_> { ) -> Result<(), KvCacheConversionError> { let p0 = p0 .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::P0TooLarge(e))?; + .map_err(KvCacheConversionError::P0TooLarge)?; let p1 = p1 .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::P1TooLarge(e))?; + .map_err(KvCacheConversionError::P1TooLarge)?; unsafe { llama_cpp_sys_2::llama_kv_cache_seq_add(self.context.as_ptr(), seq_id, p0, p1, delta); } @@ -152,8 +158,7 @@ impl LlamaContext<'_> { /// - explicitly with [`Self::kv_cache_update`] /// /// # Returns - /// A `Result` indicating whether the operation was successful. If either position - /// exceeds the maximum i32 value, no update is attempted and an `Err` is returned. + /// A `Result` indicating whether the operation was successful. /// /// # Parameters /// @@ -161,6 +166,9 @@ impl LlamaContext<'_> { /// * `p0` - The start position of the cache to update. If `None`, the entire cache is updated up to `p1`. /// * `p1` - The end position of the cache to update. If `None`, the entire cache is updated starting from `p0`. /// * `d` - The factor to divide the positions by + /// + /// # Errors + /// If either position exceeds [`i32::MAX`]. pub fn kv_cache_seq_div( &mut self, seq_id: i32, @@ -170,10 +178,10 @@ impl LlamaContext<'_> { ) -> Result<(), KvCacheConversionError> { let p0 = p0 .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::P0TooLarge(e))?; + .map_err(KvCacheConversionError::P0TooLarge)?; let p1 = p1 .map_or(Ok(-1), i32::try_from) - .map_err(|e| KvCacheConversionError::P1TooLarge(e))?; + .map_err(KvCacheConversionError::P1TooLarge)?; let d = c_int::from(d.get()); unsafe { llama_cpp_sys_2::llama_kv_cache_seq_div(self.context.as_ptr(), seq_id, p0, p1, d) } Ok(()) @@ -239,7 +247,7 @@ pub struct KVCacheView<'a> { view: llama_cpp_sys_2::llama_kv_cache_view, } -impl<'a> KVCacheView<'a> { +impl KVCacheView<'_> { /// Update the KV cache view structure with the current state of the KV cache. 
(use only for debugging purposes) pub fn update(&mut self) { unsafe { @@ -314,7 +322,7 @@ impl<'a> KVCacheView<'a> { } } -impl<'a> Drop for KVCacheView<'a> { +impl Drop for KVCacheView<'_> { fn drop(&mut self) { unsafe { llama_cpp_sys_2::llama_kv_cache_view_free(&mut self.view); diff --git a/llama-cpp-2/src/llama_batch.rs b/llama-cpp-2/src/llama_batch.rs index 153f5d52..b96588c7 100644 --- a/llama-cpp-2/src/llama_batch.rs +++ b/llama-cpp-2/src/llama_batch.rs @@ -10,6 +10,7 @@ pub struct LlamaBatch { allocated: usize, /// The logits that are initialized. Used by [`LlamaContext`] to ensure that only initialized logits are accessed. pub(crate) initialized_logits: Vec, + #[allow(clippy::doc_markdown)] /// The llama_cpp batch. always initialize by `llama_cpp_sys_2::llama_batch_init(allocated, , )` pub(crate) llama_batch: llama_batch, } @@ -20,7 +21,7 @@ pub enum BatchAddError { /// There was not enough space in the batch to add the token. #[error("Insufficient Space of {0}")] InsufficientSpace(usize), - /// Empty buffer is provided for get_one + /// Empty buffer is provided for [`LlamaBatch::get_one`] #[error("Empty buffer")] EmptyBuffer, } @@ -152,22 +153,35 @@ impl LlamaBatch { } } - /// llama_batch_get_one - /// Return batch for single sequence of tokens starting at pos_0 + /// ``llama_batch_get_one`` + /// Return batch for single sequence of tokens /// /// NOTE: this is a helper function to facilitate transition to the new batch API /// + /// # Errors + /// If the provided token buffer is empty. + /// + /// # Panics + /// If the number of tokens in ``tokens`` exceeds [`i32::MAX`]. pub fn get_one(tokens: &[LlamaToken]) -> Result { if tokens.is_empty() { return Err(BatchAddError::EmptyBuffer); } let batch = unsafe { let ptr = tokens.as_ptr() as *mut i32; - llama_cpp_sys_2::llama_batch_get_one(ptr, tokens.len() as i32) + llama_cpp_sys_2::llama_batch_get_one( + ptr, + tokens + .len() + .try_into() + .expect("number of tokens exceeds i32::MAX"), + ) }; let batch = Self { allocated: 0, - initialized_logits: vec![(tokens.len() - 1) as i32], + initialized_logits: vec![(tokens.len() - 1) + .try_into() + .expect("number of tokens exceeds i32::MAX + 1")], llama_batch: batch, }; Ok(batch) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index deefaf0d..85927ec6 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -43,6 +43,9 @@ pub struct LlamaChatMessage { impl LlamaChatMessage { /// Create a new `LlamaChatMessage` + /// + /// # Errors + /// If either of ``role`` or ``content`` contain null bytes. pub fn new(role: String, content: String) -> Result { Ok(Self { role: CString::new(role)?, @@ -152,17 +155,24 @@ impl LlamaModel { /// Convert single token to bytes. /// /// # Errors - /// /// See [`TokenToStringError`] for more information. + /// + /// # Panics + /// If a [`TokenToStringError::InsufficientBufferSpace`] error returned by + /// [`Self::token_to_bytes_with_size`] contains a positive nonzero value. This should never + /// happen. 
pub fn token_to_bytes( &self, token: LlamaToken, special: Special, ) -> Result, TokenToStringError> { match self.token_to_bytes_with_size(token, 8, special, None) { - Err(TokenToStringError::InsufficientBufferSpace(i)) => { - self.token_to_bytes_with_size(token, -i as usize, special, None) - } + Err(TokenToStringError::InsufficientBufferSpace(i)) => self.token_to_bytes_with_size( + token, + (-i).try_into().expect("Error buffer size is positive"), + special, + None, + ), x => x, } } @@ -232,7 +242,7 @@ impl LlamaModel { self.vocab_ptr(), c_string.as_ptr(), c_int::try_from(c_string.as_bytes().len())?, - buffer.as_mut_ptr() as *mut llama_cpp_sys_2::llama_token, + buffer.as_mut_ptr().cast::(), buffer_capacity, add_bos, true, @@ -248,7 +258,7 @@ impl LlamaModel { self.vocab_ptr(), c_string.as_ptr(), c_int::try_from(c_string.as_bytes().len())?, - buffer.as_mut_ptr() as *mut llama_cpp_sys_2::llama_token, + buffer.as_mut_ptr().cast::(), -size, add_bos, true, @@ -323,18 +333,16 @@ impl LlamaModel { lstrip: Option, ) -> Result, TokenToStringError> { if token == self.token_nl() { - return Ok(String::from("\n").into_bytes()); + return Ok(b"\n".to_vec()); } // unsure what to do with this in the face of the 'special' arg + attr changes let attrs = self.token_attr(token); - if attrs.contains(LlamaTokenAttr::Control) - && (token == self.token_bos() || token == self.token_eos()) - { - return Ok(Vec::new()); - } else if attrs.is_empty() + if attrs.is_empty() || attrs .intersects(LlamaTokenAttr::Unknown | LlamaTokenAttr::Byte | LlamaTokenAttr::Unused) + || attrs.contains(LlamaTokenAttr::Control) + && (token == self.token_bos() || token == self.token_eos()) { return Ok(Vec::new()); } @@ -558,12 +566,12 @@ impl LlamaModel { chat.len(), add_ass, buff.as_mut_ptr().cast::(), - buff.len() as i32, + buff.len().try_into().expect("Buffer size exceeds i32::MAX"), ) }; - if res > buff.len() as i32 { - buff.resize(res as usize, 0); + if res > buff.len().try_into().expect("Buffer size exceeds i32::MAX") { + buff.resize(res.try_into().expect("res is negative"), 0); let res = unsafe { llama_cpp_sys_2::llama_chat_apply_template( @@ -572,12 +580,12 @@ impl LlamaModel { chat.len(), add_ass, buff.as_mut_ptr().cast::(), - buff.len() as i32, + buff.len().try_into().expect("Buffer size exceeds i32::MAX"), ) }; - assert_eq!(res, buff.len() as i32); + assert_eq!(Ok(res), buff.len().try_into()); } - buff.truncate(res as usize); + buff.truncate(res.try_into().expect("res is negative")); Ok(String::from_utf8(buff)?) 
} } diff --git a/llama-cpp-2/src/model/params/kv_overrides.rs b/llama-cpp-2/src/model/params/kv_overrides.rs index 8bbcbdd4..b17516a1 100644 --- a/llama-cpp-2/src/model/params/kv_overrides.rs +++ b/llama-cpp-2/src/model/params/kv_overrides.rs @@ -104,7 +104,7 @@ pub struct KvOverrideValueIterator<'a> { current: usize, } -impl<'a> Iterator for KvOverrideValueIterator<'a> { +impl Iterator for KvOverrideValueIterator<'_> { type Item = (CString, ParamOverrideValue); fn next(&mut self) -> Option { diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index d33f92c8..b3a2cf4f 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -21,14 +21,13 @@ impl Debug for LlamaSampler { } // this is needed for the dry sampler to typecheck on android -// ...because what is normally an i8, is an u8 +// ...because what is normally an i8, is an u8 #[cfg(target_os = "android")] type CChar = u8; #[cfg(not(target_os = "android"))] type CChar = i8; - impl LlamaSampler { /// Sample and accept a token from the idx-th output of the last evaluation #[must_use] @@ -129,6 +128,7 @@ impl LlamaSampler { Self::chain(samplers, false) } + #[allow(clippy::doc_markdown)] /// Updates the logits l_i' = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original /// value, the rest are set to -inf /// @@ -272,7 +272,10 @@ impl LlamaSampler { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_dry( model.vocab_ptr(), - model.n_ctx_train().try_into().expect("n_ctx_train is greater than two billion"), + model + .n_ctx_train() + .try_into() + .expect("n_ctx_train exceeds i32::MAX"), multiplier, base, allowed_length, diff --git a/llama-cpp-2/src/token/data_array.rs b/llama-cpp-2/src/token/data_array.rs index 7f583064..448864b9 100644 --- a/llama-cpp-2/src/token/data_array.rs +++ b/llama-cpp-2/src/token/data_array.rs @@ -141,7 +141,7 @@ impl LlamaTokenDataArray { /// # Panics /// If the internal llama.cpp sampler fails to select a token. pub fn sample_token(&mut self, seed: u32) -> LlamaToken { - self.apply_sampler(&mut LlamaSampler::dist(seed)); + self.apply_sampler(&LlamaSampler::dist(seed)); self.selected_token() .expect("Dist sampler failed to select a token!") } @@ -151,7 +151,7 @@ impl LlamaTokenDataArray { /// # Panics /// If the internal llama.cpp sampler fails to select a token. 
pub fn sample_token_greedy(&mut self) -> LlamaToken { - self.apply_sampler(&mut LlamaSampler::greedy()); + self.apply_sampler(&LlamaSampler::greedy()); self.selected_token() .expect("Greedy sampler failed to select a token!") } diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index cdec57e1..7fff6bba 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -58,12 +58,10 @@ fn extract_lib_names(out_dir: &Path, build_shared_libs: bool) -> Vec { } else { "*.a" } + } else if build_shared_libs { + "*.so" } else { - if build_shared_libs { - "*.so" - } else { - "*.a" - } + "*.a" }; let libs_dir = out_dir.join("lib*"); let pattern = libs_dir.join(lib_pattern); @@ -294,21 +292,14 @@ fn main() { assert_ne!(llama_libs.len(), 0); for lib in llama_libs { - debug_log!( - "LINK {}", - format!("cargo:rustc-link-lib={}={}", llama_libs_kind, lib) - ); - println!( - "{}", - format!("cargo:rustc-link-lib={}={}", llama_libs_kind, lib) - ); + let link = format!("cargo:rustc-link-lib={}={}", llama_libs_kind, lib); + debug_log!("LINK {link}",); + println!("{link}",); } // OpenMP - if cfg!(feature = "openmp") { - if target.contains("gnu") { - println!("cargo:rustc-link-lib=gomp"); - } + if cfg!(feature = "openmp") && target.contains("gnu") { + println!("cargo:rustc-link-lib=gomp"); } // Windows debug From 7914de36b451475d42b8cd1ac4cb4f1768e74717 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Wed, 22 Jan 2025 18:34:34 +0000 Subject: [PATCH 073/193] Bump version to 0.1.90 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c4b4068..f7829fd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.89" +version = "0.1.90" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.89" +version = "0.1.90" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.89" +version = "0.1.90" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.89" +version = "0.1.90" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 95e753a1..c8b5a6e2 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.89" +version = "0.1.90" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index cd5f91aa..5d572780 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.89" +version = "0.1.90" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 05b31476..fee8b552 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.89" +version = "0.1.90" 
edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index e380414a..8fc5ee39 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.89" +version = "0.1.90" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 02e8cb6f7b50ffc84e187377c7cc8d35234e76c3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 05:34:44 +0000 Subject: [PATCH 074/193] chore(deps): bump clap from 4.5.26 to 4.5.27 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.26 to 4.5.27. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.26...clap_complete-v4.5.27) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f7829fd4..5e84467e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.26" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783" +checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.26" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121" +checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 20b75e64..187f08e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.10" anyhow = "1.0.95" -clap = "4.5.26" +clap = "4.5.27" encoding_rs = "0.8.35" [workspace.lints.rust] From 379b7bb22f943e944ba1d2189a1352c14dfd8da4 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 27 Jan 2025 17:32:23 +0000 Subject: [PATCH 075/193] Bump version to 0.1.91 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5e84467e..9b3bb2c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.90" +version = "0.1.91" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.90" +version = "0.1.91" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.90" +version = "0.1.91" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = 
"0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.90" +version = "0.1.91" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index c8b5a6e2..51fd0377 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.90" +version = "0.1.91" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 5d572780..c6e88e95 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.90" +version = "0.1.91" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index fee8b552..a927efc4 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.90" +version = "0.1.91" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 8fc5ee39..42b6f026 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.90" +version = "0.1.91" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 69d1fcfa0724db1bcbe2716e7aa6574aea935b36 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Wed, 29 Jan 2025 08:04:01 -0800 Subject: [PATCH 076/193] Use FFI types directly instead of maintaining own c_char definition The standard library predefines c_char exactly for this purpose to get rid of manual CChar type aliases. While the alias in theory looks fine it was making cargo-ndk unhappy for some reason whereas using the natural stdlib c_char type works perfectly. --- llama-cpp-2/src/model.rs | 6 +++--- llama-cpp-2/src/sampling.rs | 12 ++---------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 85927ec6..3dc02ee9 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -1,5 +1,5 @@ //! A safe wrapper around `llama_model`. -use std::ffi::CString; +use std::ffi::{c_char, CString}; use std::num::NonZeroU16; use std::os::raw::c_int; use std::path::Path; @@ -565,7 +565,7 @@ impl LlamaModel { chat.as_ptr(), chat.len(), add_ass, - buff.as_mut_ptr().cast::(), + buff.as_mut_ptr().cast::(), buff.len().try_into().expect("Buffer size exceeds i32::MAX"), ) }; @@ -579,7 +579,7 @@ impl LlamaModel { chat.as_ptr(), chat.len(), add_ass, - buff.as_mut_ptr().cast::(), + buff.as_mut_ptr().cast::(), buff.len().try_into().expect("Buffer size exceeds i32::MAX"), ) }; diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index b3a2cf4f..d79f351b 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -1,7 +1,7 @@ //! Safe wrapper around `llama_sampler`. 
use std::borrow::Borrow; -use std::ffi::CString; +use std::ffi::{c_char, CString}; use std::fmt::{Debug, Formatter}; use crate::context::LlamaContext; @@ -20,14 +20,6 @@ impl Debug for LlamaSampler { } } -// this is needed for the dry sampler to typecheck on android -// ...because what is normally an i8, is an u8 -#[cfg(target_os = "android")] -type CChar = u8; - -#[cfg(not(target_os = "android"))] -type CChar = i8; - impl LlamaSampler { /// Sample and accept a token from the idx-th output of the last evaluation #[must_use] @@ -266,7 +258,7 @@ impl LlamaSampler { .into_iter() .map(|s| CString::new(s.as_ref()).expect("A sequence breaker contains null bytes")) .collect(); - let mut seq_breaker_pointers: Vec<*const CChar> = + let mut seq_breaker_pointers: Vec<*const c_char> = seq_breakers.iter().map(|s| s.as_ptr()).collect(); let sampler = unsafe { From e93e6be0d513a52db341be27443a5e9e313db799 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Wed, 29 Jan 2025 09:07:17 -0800 Subject: [PATCH 077/193] Disable OpenMP on Android builds AFAICT trying to build with OpenMP support just results in a bunch of linker errors about missing OpenMP symbols. Disable for now. --- llama-cpp-sys-2/build.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 7fff6bba..95259276 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -266,8 +266,13 @@ fn main() { config.define("GGML_CUDA", "ON"); } - if cfg!(feature = "openmp") { + // Android doesn't have OpenMP support AFAICT and openmp is a default feature. Do this here + // rather than modifying the defaults in Cargo.toml just in case someone enables the OpenMP feature + // and tries to build for Android anyway. + if cfg!(feature = "openmp") && !target.contains("android") { config.define("GGML_OPENMP", "ON"); + } else { + config.define("GGML_OPENMP", "OFF"); } // General From 73b2e070f2c58844bdf24148c173389a7ae72934 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Wed, 29 Jan 2025 09:08:22 -0800 Subject: [PATCH 078/193] Fix support for non-aarch64 systems Android can target x86_64 systems too. Add support for all the triples cargo-ndk lists as possibilities. 
--- llama-cpp-sys-2/build.rs | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 95259276..a289f1ed 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -229,21 +229,39 @@ fn main() { config.static_crt(static_crt); } - if target.contains("android") && target.contains("aarch64") { + if target.contains("android") { // build flags for android taken from this doc // https://github.com/ggerganov/llama.cpp/blob/master/docs/android.md let android_ndk = env::var("ANDROID_NDK") .expect("Please install Android NDK and ensure that ANDROID_NDK env variable is set"); + + println!("cargo::rerun-if-env-changed=ANDROID_NDK"); + config.define( "CMAKE_TOOLCHAIN_FILE", format!("{android_ndk}/build/cmake/android.toolchain.cmake"), ); - config.define("ANDROID_ABI", "arm64-v8a"); - config.define("ANDROID_PLATFORM", "android-28"); - config.define("CMAKE_SYSTEM_PROCESSOR", "arm64"); - config.define("CMAKE_C_FLAGS", "-march=armv8.7a"); - config.define("CMAKE_CXX_FLAGS", "-march=armv8.7a"); - config.define("GGML_OPENMP", "OFF"); + if env::var("ANDROID_PLATFORM").is_ok() { + println!("cargo::rerun-if-env-changed=ANDROID_PLATFORM"); + } else { + config.define("ANDROID_PLATFORM", "android-28"); + } + if target.contains("aarch64") { + config.cflag("-march=armv8.7a"); + config.cxxflag("-march=armv8.7a"); + } else if target.contains("armv7") { + config.cflag("-march=armv8.7a"); + config.cxxflag("-march=armv8.7a"); + } else if target.contains("x86_64") { + config.cflag("-march=x86-64"); + config.cxxflag("-march=x86-64"); + } else if target.contains("i686") { + config.cflag("-march=i686"); + config.cxxflag("-march=i686"); + } else { + // Rather than guessing just fail. + panic!("Unsupported Android target {target}"); + } config.define("GGML_LLAMAFILE", "OFF"); } From 093115e2822096af14876c52321b74babb821b3d Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Wed, 29 Jan 2025 09:09:28 -0800 Subject: [PATCH 079/193] Link against dynamic C++ stdlib on Android by default. It's a configurable feature that can be disabled if needed, but I got a bunch of linker errors about unavailable C++ symbols from the static c++.a for some reason. --- llama-cpp-2/Cargo.toml | 4 +++- llama-cpp-sys-2/Cargo.toml | 2 ++ llama-cpp-sys-2/build.rs | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index a927efc4..9cd9e1ff 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -18,7 +18,7 @@ tracing = { workspace = true } encoding_rs = { workspace = true } [features] -default = ["openmp"] +default = ["openmp", "android-shared-stdcxx"] cuda = ["llama-cpp-sys-2/cuda"] metal = ["llama-cpp-sys-2/metal"] dynamic-link = ["llama-cpp-sys-2/dynamic-link"] @@ -26,6 +26,8 @@ vulkan = ["llama-cpp-sys-2/vulkan"] native = ["llama-cpp-sys-2/native"] openmp = ["llama-cpp-sys-2/openmp"] sampler = [] +# Only has an impact on Android. +android-shared-stdcxx = ["llama-cpp-sys-2/shared-stdcxx"] [target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies] diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 42b6f026..efcb7099 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -71,3 +71,5 @@ dynamic-link = [] vulkan = [] native = [] openmp = [] +# Only has an impact on Android. 
+shared-stdcxx = [] \ No newline at end of file diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index a289f1ed..2d8e9630 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -263,6 +263,10 @@ fn main() { panic!("Unsupported Android target {target}"); } config.define("GGML_LLAMAFILE", "OFF"); + if cfg!(feature = "shared-stdcxx") { + println!("cargo:rustc-link-lib=dylib=stdc++"); + println!("cargo:rustc-link-lib=c++_shared"); + } } if cfg!(feature = "vulkan") { From 6edd962765d556c4ad3d15c58338789c97452e16 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 30 Jan 2025 18:28:56 +0000 Subject: [PATCH 080/193] Bump version to 0.1.92 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9b3bb2c8..26e02bfe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.91" +version = "0.1.92" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.91" +version = "0.1.92" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.91" +version = "0.1.92" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.91" +version = "0.1.92" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 51fd0377..59688842 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.91" +version = "0.1.92" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index c6e88e95..f3cfd7f8 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.91" +version = "0.1.92" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 9cd9e1ff..4ab88391 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.91" +version = "0.1.92" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index efcb7099..48dfeb6f 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.91" +version = "0.1.92" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 32628f122d55c67bb4e2ff7f04dfa54184665160 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Fri, 31 Jan 2025 10:08:04 -0800 Subject: [PATCH 081/193] Weird hack --- llama-cpp-sys-2/build.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 
2d8e9630..76c548cd 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -80,6 +80,13 @@ fn extract_lib_names(out_dir: &Path, build_shared_libs: bool) -> Vec { let lib_name = if stem_str.starts_with("lib") { stem_str.strip_prefix("lib").unwrap_or(stem_str) } else { + if path.extension() == Some(std::ffi::OsStr::new("a")) { + // panic!("renaming {:?} to {:?}", &path, path.join(format!("lib{}.a", stem_str))); + let target = path.parent().unwrap().join(format!("lib{}.a", stem_str)); + std::fs::rename(&path, &target).unwrap_or_else(|e| { + panic!("Failed to rename {path:?} to {target:?}: {e:?}"); + }) + } stem_str }; lib_names.push(lib_name.to_string()); From 90138e3e2500e38eed3d9b503cee91bb00a7f934 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 05:49:51 +0000 Subject: [PATCH 082/193] chore(deps): bump cc from 1.2.10 to 1.2.11 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.10 to 1.2.11. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.10...cc-v1.2.11) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 26e02bfe..4283a8f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.10" +version = "1.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" +checksum = "e4730490333d58093109dc02c23174c3f4d490998c3fed3cc8e82d57afedb9cf" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 187f08e6..d0a01b42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.10" +cc = "1.2.11" anyhow = "1.0.95" clap = "4.5.27" encoding_rs = "0.8.35" From 9e08188503530b31b9330986de65f1205cf56a74 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 05:49:56 +0000 Subject: [PATCH 083/193] chore(deps): bump cmake from 0.1.52 to 0.1.53 Bumps [cmake](https://github.com/rust-lang/cmake-rs) from 0.1.52 to 0.1.53. - [Release notes](https://github.com/rust-lang/cmake-rs/releases) - [Changelog](https://github.com/rust-lang/cmake-rs/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cmake-rs/compare/v0.1.52...v0.1.53) --- updated-dependencies: - dependency-name: cmake dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 26e02bfe..474ed644 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -186,9 +186,9 @@ checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "cmake" -version = "0.1.52" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c682c223677e0e5b6b7f63a64b9351844c3f1b1678a68b7ee617e30fb082620e" +checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" dependencies = [ "cc", ] From 3c349430a01948069ea8104b2bb5698e2e5b0f7b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 18:47:10 +0000 Subject: [PATCH 084/193] chore(deps): bump openssl from 0.10.66 to 0.10.70 Bumps [openssl](https://github.com/sfackler/rust-openssl) from 0.10.66 to 0.10.70. - [Release notes](https://github.com/sfackler/rust-openssl/releases) - [Commits](https://github.com/sfackler/rust-openssl/compare/openssl-v0.10.66...openssl-v0.10.70) --- updated-dependencies: - dependency-name: openssl dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 26e02bfe..b8d2eb99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -740,9 +740,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.66" +version = "0.10.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" dependencies = [ "bitflags", "cfg-if", @@ -772,9 +772,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.103" +version = "0.9.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" dependencies = [ "cc", "libc", From adab86a79cd5734d89384b552bf7e14cbd7576db Mon Sep 17 00:00:00 2001 From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Date: Mon, 3 Feb 2025 14:24:59 -0800 Subject: [PATCH 085/193] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a4bd84e1..ea57dbc9 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ [readme]: https://github.com/utilityai/llama-cpp-rs/tree/main/llama-cpp-2 -This is the home for [llama-cpp-2][crates.io]. It also contains the [llama-cpp-sys] bindings which are updated regularly +This is the home for [llama-cpp-2][crates.io]. It also contains the [llama-cpp-sys] bindings which are updated semi-regularly and in sync with [llama-cpp-2][crates.io]. 
This project was created with the explict goal of staying as up to date as possible with llama.cpp, as a result it is From 774626e0bda0870264282af216ca1cab9af3749b Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 3 Feb 2025 22:29:25 +0000 Subject: [PATCH 086/193] Bump version to 0.1.93 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b8550c5c..badaffad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.92" +version = "0.1.93" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.92" +version = "0.1.93" dependencies = [ "encoding_rs", "enumflags2", @@ -664,7 +664,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.92" +version = "0.1.93" dependencies = [ "bindgen", "cc", @@ -1058,7 +1058,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.92" +version = "0.1.93" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 59688842..4bde6769 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.92" +version = "0.1.93" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index f3cfd7f8..e05cdaa6 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.92" +version = "0.1.93" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 4ab88391..4e444e6b 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.92" +version = "0.1.93" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 48dfeb6f..aaa4761c 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.92" +version = "0.1.93" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 2590bb66e5b4dd5448301ab4a7b9b21ffdce97b2 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Tue, 4 Feb 2025 15:21:58 -0800 Subject: [PATCH 087/193] Rebuild if llama.cpp build environment changes If the user requests different properties for how llama.cpp is built, trigger a rebuild. 
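
For reference, the rebuild trigger used here is Cargo's `rerun-if-env-changed` build-script directive: emitting it from build.rs tells Cargo to re-run the script (and therefore the llama.cpp CMake configure/build) whenever the named environment variable changes. A minimal sketch of the pattern — the variable names match the ones added in the diff below, while the loop form is purely illustrative:

```rust
// build.rs sketch: ask Cargo to re-run this script when any of the
// environment variables that shape the llama.cpp build change.
fn main() {
    for var in ["LLAMA_LIB_PROFILE", "LLAMA_BUILD_SHARED_LIBS", "LLAMA_STATIC_CRT"] {
        println!("cargo:rerun-if-env-changed={var}");
    }
}
```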
--- llama-cpp-sys-2/build.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 76c548cd..db87b4c0 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -166,6 +166,10 @@ fn main() { .map(|v| v == "1") .unwrap_or(false); + println!("cargo:rerun-if-env-changed=LLAMA_LIB_PROFILE"); + println!("cargo:rerun-if-env-changed=LLAMA_BUILD_SHARED_LIBS"); + println!("cargo:rerun-if-env-changed=LLAMA_STATIC_CRT"); + debug_log!("TARGET: {}", target); debug_log!("CARGO_MANIFEST_DIR: {}", manifest_dir); debug_log!("TARGET_DIR: {}", target_dir.display()); From 6c2df25936d9ea7cc3b5b87ad8e035559e08991f Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Tue, 4 Feb 2025 15:05:46 -0800 Subject: [PATCH 088/193] Rebuild if llama.cpp source changes / minor build speedup Instead of cp -r / robocopy, build from the source directory. This mildly speeds up the build although probably not noticeable on NVME drives. The cargo-cmake crate will automatically place output in the out/ folder for us. Additionally, walk the source tree to tell cargo that a rebuild is necessary if anything changes from the source. This ensures that changes in the llama.cpp code trigger a rebuild which makes hacking on things a bit easier. Looks like this copying logic was copied from sherpa-onnx given the comments seem to be copy-pasted so remove those references. --- Cargo.lock | 29 +++++++++++++++++++++ llama-cpp-sys-2/Cargo.toml | 1 + llama-cpp-sys-2/build.rs | 53 +++++++++++++++++--------------------- 3 files changed, 53 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index badaffad..42a0b018 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -670,6 +670,7 @@ dependencies = [ "cc", "cmake", "glob", + "walkdir", ] [[package]] @@ -987,6 +988,15 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.23" @@ -1265,6 +1275,16 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -1292,6 +1312,15 @@ dependencies = [ "rustix", ] +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "windows-sys" version = "0.48.0" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index aaa4761c..e0abdbd7 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -63,6 +63,7 @@ bindgen = { workspace = true } cc = { workspace = true, features = ["parallel"] } cmake = "0.1" glob = "0.3.2" +walkdir = "2" [features] cuda = [] diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index db87b4c0..be809fe1 100644 --- 
a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,5 +1,6 @@ use cmake::Config; use glob::glob; +use walkdir::DirEntry; use std::env; use std::path::{Path, PathBuf}; use std::process::Command; @@ -28,27 +29,6 @@ fn get_cargo_target_dir() -> Result Vec { let lib_pattern = if cfg!(windows) { "*.lib" @@ -148,12 +128,17 @@ fn macos_link_search_path() -> Option { None } +fn is_hidden(e: &DirEntry) -> bool { + e.file_name().to_str().map(|s| s.starts_with('.')).unwrap_or_default() +} + fn main() { + println!("cargo:rerun-if-changed=build.rs"); + let target = env::var("TARGET").unwrap(); let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); let target_dir = get_cargo_target_dir().unwrap(); - let llama_dst = out_dir.join("llama.cpp"); let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get CARGO_MANIFEST_DIR"); let llama_src = Path::new(&manifest_dir).join("llama.cpp"); let build_shared_libs = cfg!(feature = "cuda") || cfg!(feature = "dynamic-link"); @@ -176,11 +161,20 @@ fn main() { debug_log!("OUT_DIR: {}", out_dir.display()); debug_log!("BUILD_SHARED: {}", build_shared_libs); - // Prepare sherpa-onnx source - if !llama_dst.exists() { - debug_log!("Copy {} to {}", llama_src.display(), llama_dst.display()); - copy_folder(&llama_src, &llama_dst); + // Make sure that changes to the llama.cpp project trigger a rebuild. + let rebuild_on_children_of = [ + llama_src.join("src"), + llama_src.join("ggml/src"), + llama_src.join("common"), + ]; + for entry in walkdir::WalkDir::new(&llama_src).into_iter().filter_entry(|e| !is_hidden(e)) { + let entry = entry.expect("Failed to obtain entry"); + let rebuild = entry.file_name().to_str().map(|f| f.starts_with("CMake")).unwrap_or_default() || rebuild_on_children_of.iter().any(|src_folder| entry.path().starts_with(src_folder)); + if rebuild { + println!("cargo:rerun-if-changed={}", entry.path().display()); + } } + // Speed up build env::set_var( "CMAKE_BUILD_PARALLEL_LEVEL", @@ -193,8 +187,8 @@ fn main() { // Bindings let bindings = bindgen::Builder::default() .header("wrapper.h") - .clang_arg(format!("-I{}", llama_dst.join("include").display())) - .clang_arg(format!("-I{}", llama_dst.join("ggml/include").display())) + .clang_arg(format!("-I{}", llama_src.join("include").display())) + .clang_arg(format!("-I{}", llama_src.join("ggml/include").display())) .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .derive_partialeq(true) .allowlist_function("ggml_.*") @@ -212,13 +206,12 @@ fn main() { .expect("Failed to write bindings"); println!("cargo:rerun-if-changed=wrapper.h"); - println!("cargo:rerun-if-changed=./sherpa-onnx"); debug_log!("Bindings Created"); // Build with Cmake - let mut config = Config::new(&llama_dst); + let mut config = Config::new(&llama_src); // Would require extra source files to pointlessly // be included in what's uploaded to and downloaded from From 373f8c674bda7a2e66138dcf7dc325afc6f1e1a8 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Tue, 4 Feb 2025 15:22:47 -0800 Subject: [PATCH 089/193] Redirect llama.cpp logs into tracing module. The simple example now needs a --verbose argument to be passed to have the llama.cpp logs printed to the screen. 
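
In practice a consumer opts in the way the simple example now does: install a `tracing` subscriber, then route llama.cpp/ggml logging through it. A minimal sketch of that wiring, mirroring the calls added in the diff below (the helper function itself is only for illustration):

```rust
use llama_cpp_2::{send_logs_to_tracing, LogOptions};

// Sketch: enable a tracing subscriber when verbose, then forward (or
// suppress) llama.cpp / ggml logs through tracing accordingly.
fn init_logging(verbose: bool) {
    if verbose {
        tracing_subscriber::fmt().init();
    }
    send_logs_to_tracing(LogOptions::default().with_logs_enabled(verbose));
}
```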
--- Cargo.lock | 104 +++++++++++++++ Cargo.toml | 2 + examples/simple/Cargo.toml | 1 + examples/simple/src/main.rs | 14 +- llama-cpp-2/Cargo.toml | 1 + llama-cpp-2/src/lib.rs | 74 +++++++++++ llama-cpp-2/src/log.rs | 256 ++++++++++++++++++++++++++++++++++++ 7 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 llama-cpp-2/src/log.rs diff --git a/Cargo.lock b/Cargo.lock index 42a0b018..1994a720 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -660,6 +660,7 @@ dependencies = [ "llama-cpp-sys-2", "thiserror", "tracing", + "tracing-core", ] [[package]] @@ -727,6 +728,16 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "number_prefix" version = "0.4.0" @@ -789,6 +800,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "percent-encoding" version = "2.3.1" @@ -1060,6 +1077,15 @@ dependencies = [ "serde", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1075,6 +1101,7 @@ dependencies = [ "encoding_rs", "hf-hub", "llama-cpp-2", + "tracing-subscriber", ] [[package]] @@ -1161,6 +1188,16 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", +] + [[package]] name = "tinystr" version = "0.7.6" @@ -1200,6 +1237,45 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" +dependencies = [ + "nu-ansi-term", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing-core", + "tracing-log", + "tracing-serde", ] [[package]] @@ -1269,6 +1345,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "valuable" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1312,6 +1394,22 @@ dependencies = [ "rustix", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.9" @@ -1321,6 +1419,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-sys" version = "0.48.0" diff --git a/Cargo.toml b/Cargo.toml index d0a01b42..1750d6ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ members = [ # core library deps thiserror = "1" tracing = "0.1" +tracing-core = "0.1" # examples and benchmarks hf-hub = { version = "0.3.2" } @@ -21,6 +22,7 @@ cc = "1.2.11" anyhow = "1.0.95" clap = "4.5.27" encoding_rs = "0.8.35" +tracing-subscriber = { version = "0.3", features = ["json"] } [workspace.lints.rust] missing_docs = { level = "warn" } diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index e05cdaa6..8e3d5062 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -11,6 +11,7 @@ hf-hub = { workspace = true } clap = { workspace = true , features = ["derive"] } anyhow = { workspace = true } encoding_rs = { workspace = true } +tracing-subscriber = { workspace = true } [features] cuda = ["llama-cpp-2/cuda"] diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index f276ac24..2cea197d 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -10,7 +10,7 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::Parser; use hf_hub::api::sync::ApiBuilder; use llama_cpp_2::context::params::LlamaContextParams; -use llama_cpp_2::ggml_time_us; +use llama_cpp_2::{ggml_time_us, send_logs_to_tracing, LogOptions}; use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; @@ -67,6 +67,12 @@ struct Args { help = "size of the prompt context (default: loaded from themodel)" )] ctx_size: Option, + #[arg( + short = 'v', + long, + help = "enable verbose llama.cpp logs", + )] + verbose: bool, } /// Parse a single key-value pair @@ -132,8 +138,14 @@ fn main() -> Result<()> { threads, threads_batch, ctx_size, + verbose, } = Args::parse(); + if verbose { + tracing_subscriber::fmt().init(); + } + send_logs_to_tracing(LogOptions::default().with_logs_enabled(verbose)); + // init LLM let backend = LlamaBackend::init()?; diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 4e444e6b..72aae442 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -13,6 +13,7 @@ enumflags2 = "0.7.11" llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.69" } thiserror = { workspace = true } tracing = { workspace = true } +tracing-core = { workspace = true } [dev-dependencies] encoding_rs = { workspace = 
true } diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 862f64b2..61de5a65 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -25,6 +25,7 @@ use std::string::FromUtf8Error; pub mod context; pub mod llama_backend; pub mod llama_batch; +mod log; pub mod model; pub mod sampling; pub mod timing; @@ -323,3 +324,76 @@ pub fn ggml_time_us() -> i64 { pub fn llama_supports_mlock() -> bool { unsafe { llama_cpp_sys_2::llama_supports_mlock() } } + +/// Options to configure how llama.cpp logs are intercepted. +#[derive(Default, Debug, Clone)] +pub struct LogOptions { + disabled: bool, +} + +impl LogOptions { + /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for + /// logs to be sent to tracing. + pub fn with_logs_enabled(mut self, enabled: bool) -> Self { + self.disabled = !enabled; + self + } +} + +extern "C" fn logs_to_trace( + level: llama_cpp_sys_2::ggml_log_level, + text: *const ::std::os::raw::c_char, + data: *mut ::std::os::raw::c_void, +) { + // In the "fast-path" (i.e. the vast majority of logs) we want to avoid needing to take the log state + // lock at all. Similarly, we try to avoid any heap allocations within this function. This is accomplished + // by being a dummy pass-through to tracing in the normal case of DEBUG/INFO/WARN/ERROR logs that are + // newline terminated and limiting the slow-path of locks and/or heap allocations for other cases. + use std::borrow::Borrow; + + let log_state = unsafe { &*(data as *const log::State) }; + + let text = unsafe { std::ffi::CStr::from_ptr(text) }; + let text = text.to_string_lossy(); + let text: &str = text.borrow(); + + if log_state.options.disabled { + return; + } + + // As best I can tell llama.cpp / ggml require all log format strings at call sites to have the '\n'. + // If it's missing, it means that you expect more logs via CONT (or there's a typo in the codebase). To + // distinguish typo from intentional support for CONT, we have to buffer until the next message comes in + // to know how to flush it. + + if level == llama_cpp_sys_2::GGML_LOG_LEVEL_CONT { + log_state.cont_buffered_log(text); + } else if text.ends_with('\n') { + log_state.emit_non_cont_line(level, text); + } else { + log_state.buffer_non_cont(level, text); + } +} + +/// Redirect llama.cpp logs into tracing. +pub fn send_logs_to_tracing(options: LogOptions) { + // TODO: Reinitialize the state to support calling send_logs_to_tracing multiple times. + + // We set up separate log states for llama.cpp and ggml to make sure that CONT logs between the two + // can't possibly interfere with each other. In other words, if llama.cpp emits a log without a trailing + // newline and calls a GGML function, the logs won't be weirdly intermixed and instead we'll llama.cpp logs + // will CONT previous llama.cpp logs and GGML logs will CONT previous ggml logs. + let llama_heap_state = Box::as_ref( + log::LLAMA_STATE + .get_or_init(|| Box::new(log::State::new(log::Module::LlamaCpp, options.clone()))), + ) as *const _; + let ggml_heap_state = Box::as_ref( + log::GGML_STATE.get_or_init(|| Box::new(log::State::new(log::Module::GGML, options))), + ) as *const _; + + unsafe { + // GGML has to be set after llama since setting llama sets ggml as well. 
+ llama_cpp_sys_2::llama_log_set(Some(logs_to_trace), llama_heap_state as *mut _); + llama_cpp_sys_2::ggml_log_set(Some(logs_to_trace), ggml_heap_state as *mut _); + } +} diff --git a/llama-cpp-2/src/log.rs b/llama-cpp-2/src/log.rs new file mode 100644 index 00000000..db6ff653 --- /dev/null +++ b/llama-cpp-2/src/log.rs @@ -0,0 +1,256 @@ +use super::LogOptions; +use std::sync::OnceLock; +use tracing_core::{callsite, field, identify_callsite, Interest, Kind, Metadata}; + +static FIELD_NAMES: &[&str] = &["message", "module"]; + +struct OverridableFields { + message: tracing::field::Field, + target: tracing::field::Field, +} + +macro_rules! log_cs { + ($level:expr, $cs:ident, $meta:ident, $fields:ident, $ty:ident) => { + struct $ty; + static $cs: $ty = $ty; + static $meta: Metadata<'static> = Metadata::new( + "log event", + "llama-cpp-2", + $level, + ::core::option::Option::None, + ::core::option::Option::None, + ::core::option::Option::None, + field::FieldSet::new(FIELD_NAMES, identify_callsite!(&$cs)), + Kind::EVENT, + ); + static $fields: std::sync::LazyLock = std::sync::LazyLock::new(|| { + let fields = $meta.fields(); + OverridableFields { + message: fields.field("message").unwrap(), + target: fields.field("module").unwrap(), + } + }); + + impl callsite::Callsite for $ty { + fn set_interest(&self, _: Interest) {} + fn metadata(&self) -> &'static Metadata<'static> { + &$meta + } + } + }; +} +log_cs!( + tracing_core::Level::DEBUG, + DEBUG_CS, + DEBUG_META, + DEBUG_FIELDS, + DebugCallsite +); +log_cs!( + tracing_core::Level::INFO, + INFO_CS, + INFO_META, + INFO_FIELDS, + InfoCallsite +); +log_cs!( + tracing_core::Level::WARN, + WARN_CS, + WARN_META, + WARN_FIELDS, + WarnCallsite +); +log_cs!( + tracing_core::Level::ERROR, + ERROR_CS, + ERROR_META, + ERROR_FIELDS, + ErrorCallsite +); + +#[derive(Clone, Copy)] +pub(super) enum Module { + GGML, + LlamaCpp, +} + +impl Module { + const fn name(&self) -> &'static str { + match self { + Module::GGML => "ggml", + Module::LlamaCpp => "llama.cpp", + } + } +} + +fn meta_for_level( + level: llama_cpp_sys_2::ggml_log_level, +) -> (&'static Metadata<'static>, &'static OverridableFields) { + match level { + llama_cpp_sys_2::GGML_LOG_LEVEL_DEBUG => (&DEBUG_META, &DEBUG_FIELDS), + llama_cpp_sys_2::GGML_LOG_LEVEL_INFO => (&INFO_META, &INFO_FIELDS), + llama_cpp_sys_2::GGML_LOG_LEVEL_WARN => (&WARN_META, &WARN_FIELDS), + llama_cpp_sys_2::GGML_LOG_LEVEL_ERROR => (&ERROR_META, &ERROR_FIELDS), + _ => { + unreachable!("Illegal log level to be called here") + } + } +} + +pub(super) struct State { + pub(super) options: LogOptions, + module: Module, + buffered: std::sync::Mutex>, + previous_level: std::sync::atomic::AtomicI32, + is_buffering: std::sync::atomic::AtomicBool, +} + +impl State { + pub(super) fn new(module: Module, options: LogOptions) -> Self { + Self { + options, + module, + buffered: Default::default(), + previous_level: Default::default(), + is_buffering: Default::default(), + } + } + + fn generate_log(target: Module, level: llama_cpp_sys_2::ggml_log_level, text: &str) { + // Annoying but tracing requires that the provided target name is a string literal and + // even &'static str isn't enough so we have to duplicate the generation AND we can't even + // extract the interrior module within llama.cpp/ggml to be able to propagate it forward. + // This happens because the target is part of a static variable injected by the macro that's + // initialized with said target. 
+ + let (module, text) = text + .char_indices() + .take_while(|(_, c)| c.is_ascii_lowercase() || *c == '_') + .last() + .and_then(|(pos, _)| { + let next_two = text.get(pos + 1..pos + 3); + if next_two == Some(": ") { + let (sub_module, text) = text.split_at(pos + 1); + let text = text.split_at(2).1; + Some((Some(format!("{}::{sub_module}", target.name())), text)) + } else { + None + } + }) + .unwrap_or((None, text)); + + let (meta, fields) = meta_for_level(level); + + tracing::dispatcher::get_default(|dispatcher| { + dispatcher.event(&tracing::Event::new( + meta, + &meta.fields().value_set(&[ + (&fields.message, Some(&text as &dyn tracing::field::Value)), + ( + &fields.target, + module.as_ref().map(|s| s as &dyn tracing::field::Value), + ), + ]), + )); + }); + } + + /// Append more text to the previously buffered log. The text may or may not end with a newline. + pub(super) fn cont_buffered_log(&self, text: &str) { + let mut lock = self.buffered.lock().unwrap(); + + if let Some((previous_log_level, mut buffer)) = lock.take() { + buffer.push_str(text); + if buffer.ends_with('\n') { + self.is_buffering + .store(false, std::sync::atomic::Ordering::Release); + Self::generate_log(self.module, previous_log_level, buffer.as_str()); + } else { + *lock = Some((previous_log_level, buffer)); + } + } else { + let level = self + .previous_level + .load(std::sync::atomic::Ordering::Acquire) as llama_cpp_sys_2::ggml_log_level; + tracing::warn!( + inferred_level = level, + text = text, + origin = "crate", + "llma.cpp sent out a CONT log without any previously buffered message" + ); + *lock = Some((level, text.to_string())); + } + } + + /// Start buffering a message. Not the CONT log level and text is missing a newline. + pub(super) fn buffer_non_cont(&self, level: llama_cpp_sys_2::ggml_log_level, text: &str) { + debug_assert!(!text.ends_with('\n')); + debug_assert_ne!(level, llama_cpp_sys_2::GGML_LOG_LEVEL_CONT); + + if let Some((previous_log_level, buffer)) = self + .buffered + .lock() + .unwrap() + .replace((level, text.to_string())) + { + tracing::warn!( + level = previous_log_level, + text = &buffer, + origin = "crate", + "Message buffered unnnecessarily due to missing newline and not followed by a CONT" + ); + Self::generate_log(self.module, previous_log_level, buffer.as_str()) + } + + self.is_buffering + .store(true, std::sync::atomic::Ordering::Release); + self.previous_level + .store(level as i32, std::sync::atomic::Ordering::Release); + } + + // Emit a normal unbuffered log message (not the CONT log level and the text ends with a newline). + pub(super) fn emit_non_cont_line(&self, level: llama_cpp_sys_2::ggml_log_level, text: &str) { + debug_assert!(text.ends_with('\n')); + debug_assert_ne!(level, llama_cpp_sys_2::GGML_LOG_LEVEL_CONT); + + if self + .is_buffering + .swap(false, std::sync::atomic::Ordering::Acquire) + { + if let Some((buf_level, buf_text)) = self.buffered.lock().unwrap().take() { + // This warning indicates a bug within llama.cpp + tracing::warn!(level = buf_level, text = buf_text, origin = "crate", "llama.cpp message buffered spuriously due to missing \\n and being followed by a non-CONT message!"); + Self::generate_log(self.module, buf_level, buf_text.as_str()); + } + } + + self.previous_level + .store(level as i32, std::sync::atomic::Ordering::Release); + + let (text, newline) = text.split_at(text.len() - 1); + debug_assert_eq!(newline, "\n"); + + match level { + llama_cpp_sys_2::GGML_LOG_LEVEL_NONE => { + // TODO: Support logging this to stdout directly via options? 
+ tracing::info!(no_log_level = true, text); + } + llama_cpp_sys_2::GGML_LOG_LEVEL_DEBUG + | llama_cpp_sys_2::GGML_LOG_LEVEL_INFO + | llama_cpp_sys_2::GGML_LOG_LEVEL_WARN + | llama_cpp_sys_2::GGML_LOG_LEVEL_ERROR => Self::generate_log(self.module, level, text), + llama_cpp_sys_2::GGML_LOG_LEVEL_CONT => unreachable!(), + _ => { + tracing::warn!( + level = level, + text = text, + origin = "crate", + "Unknown llama.cpp log level" + ) + } + } + } +} + +pub(super) static LLAMA_STATE: OnceLock> = OnceLock::new(); +pub(super) static GGML_STATE: OnceLock> = OnceLock::new(); From d789cace6f125c9e78faf6fd47e8c30ace609f97 Mon Sep 17 00:00:00 2001 From: srv1n Date: Thu, 6 Feb 2025 18:06:30 +0530 Subject: [PATCH 090/193] undid making initialized_logits public --- Cargo.lock | 11 + Cargo.toml | 2 +- examples/reranker/Cargo.toml | 20 ++ examples/reranker/README.md | 75 +++++++ examples/reranker/src/main.rs | 340 ++++++++++++++++++++++++++++++ llama-cpp-2/src/context/params.rs | 4 + 6 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 examples/reranker/Cargo.toml create mode 100644 examples/reranker/README.md create mode 100644 examples/reranker/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 1994a720..09445542 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -934,6 +934,17 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +[[package]] +name = "reranker" +version = "0.1.86" +dependencies = [ + "anyhow", + "clap", + "encoding_rs", + "hf-hub", + "llama-cpp-2", +] + [[package]] name = "ring" version = "0.17.8" diff --git a/Cargo.toml b/Cargo.toml index 1750d6ff..903bdfab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ members = [ "llama-cpp-sys-2", "llama-cpp-2", "examples/embeddings", - "examples/simple", + "examples/simple", "examples/reranker", ] [workspace.dependencies] diff --git a/examples/reranker/Cargo.toml b/examples/reranker/Cargo.toml new file mode 100644 index 00000000..fa32c2d3 --- /dev/null +++ b/examples/reranker/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "reranker" +version = "0.1.86" +edition = "2021" + +[dependencies] +llama-cpp-2 = { path = "../../llama-cpp-2", version = "0.1.86" } +hf-hub = { workspace = true } +clap = { workspace = true, features = ["derive"] } +anyhow = { workspace = true } +encoding_rs = { workspace = true } + +[features] +cuda = ["llama-cpp-2/cuda"] +metal = ["llama-cpp-2/metal"] +native = ["llama-cpp-2/native"] +vulkan = ["llama-cpp-2/vulkan"] + +[lints] +workspace = true \ No newline at end of file diff --git a/examples/reranker/README.md b/examples/reranker/README.md new file mode 100644 index 00000000..935c37ca --- /dev/null +++ b/examples/reranker/README.md @@ -0,0 +1,75 @@ +# Rust Reranker Implementation + +A Rust implementation of cross-encoder based reranking using llama-cpp-2. Cross-encoder reranking is a more accurate way to determine similarity between queries and documents compared to traditional embedding-based approaches. + +## Overview + +This implementation adds a new pooling type `LLAMA_POOLING_TYPE_RANK` which enables cross-encoder based reranking. Unlike traditional embedding approaches that encode query and document separately, this method: + +- Processes query and document pairs together in a single pass +- Directly evaluates semantic relationships between the pairs +- Outputs raw similarity scores indicating relevance + +## Installation + +```bash +# Follow instructions to clone repo. 
+# Navigate to examples reranker +cd examples/reranker + +# Build the project +cargo build --release +``` + +## Usage + +### Command Line Interface + +```bash +cargo run --release -- \  ✔ │ 5s │ 12:48:35 + --model-path "models/bge-reranker-v2-m3.gguf" \ + --query "what is panda?" \ + --documents "hi" \ + --documents "it's a bear" \ + --documents "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." \ + --pooling rank +``` +Should output(with bge-reranker-v2-m3-Q5_0): +rerank score 0: -6.551 +rerank score 1: -3.802 +rerank score 2: 4.522 + +### CLI Arguments + +- `--model-path`: Path to the GGUF model file +- `--query`: The search query +- `--documents`: One or more documents to rank against the query +- `--pooling`: Pooling type (options: none, mean, rank) + +### Pooling Types + +- `rank`: Performs cross-encoder reranking + + +Note: The raw scores are not normalized through a sigmoid function. If you need scores between 0-1, you'll need to implement sigmoid normalization in your application code. + +# Additional notes + +- Query and documents are concatenated using the format queryanswer + +## Supported Models + +Some tested models: + +- [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) +- [jinaai/jina-reranker-v1-tiny-en](https://huggingface.co/jinaai/jina-reranker-v1-tiny-en) + +Not tested others, but anything supported by llama.cpp should work. + +## Implementation Details + +This is a close Rust implementation of the reranker implementation discussed in [llama.cpp PR #9510](https://github.com/ggerganov/llama.cpp/pull/9510). + +## Potential issues + +The bos, eos, sep tokens are being hardcoded. We need to ideally get it from the model and build out the prompts based on each specific model. \ No newline at end of file diff --git a/examples/reranker/src/main.rs b/examples/reranker/src/main.rs new file mode 100644 index 00000000..5a6109ef --- /dev/null +++ b/examples/reranker/src/main.rs @@ -0,0 +1,340 @@ +//! This is a translation of embedding.cpp in llama.cpp using llama-cpp-2. 
+#![allow( + clippy::cast_possible_wrap, + clippy::cast_possible_truncation, + clippy::cast_precision_loss, + clippy::cast_sign_loss +)] + +use std::io::Write; +use std::path::PathBuf; +use std::time::Duration; + +use anyhow::{bail, Context, Result}; +use clap::Parser; +use hf_hub::api::sync::ApiBuilder; + +use llama_cpp_2::context::params::{LlamaContextParams, LlamaPoolingType}; +use llama_cpp_2::context::LlamaContext; +use llama_cpp_2::ggml_time_us; +use llama_cpp_2::llama_backend::LlamaBackend; +use llama_cpp_2::llama_batch::LlamaBatch; +use llama_cpp_2::model::params::LlamaModelParams; +use llama_cpp_2::model::LlamaModel; +use llama_cpp_2::model::{AddBos, Special}; + +#[derive(clap::Parser, Debug, Clone)] +#[command(author, version, about, long_about = None)] +struct Args { + /// Path to the model file + #[clap(long)] + model_path: PathBuf, + + /// The query to embed + #[clap(long)] + query: String, + + /// The documents to embed and compare against + #[clap(long, num_args = 1..)] + documents: Vec, + + /// Pooling type (none, mean, or rank) + #[clap(long, default_value = "none")] + pooling: String, + + /// Whether to normalise the produced embeddings + #[clap(long, default_value_t = true)] + normalise: bool, +} + +fn main() -> Result<()> { + let Args { + model_path, + query, + documents, + pooling, + normalise, + } = Args::parse(); + + // init LLM + let backend = LlamaBackend::init()?; + + // offload all layers to the gpu + let model_params = { + #[cfg(any(feature = "cuda", feature = "vulkan"))] + if !disable_gpu { + LlamaModelParams::default().with_n_gpu_layers(1000) + } else { + LlamaModelParams::default() + } + #[cfg(not(any(feature = "cuda", feature = "vulkan")))] + LlamaModelParams::default() + }; + + let model = LlamaModel::load_from_file(&backend, model_path, &model_params) + .with_context(|| "unable to load model")?; + // println!("pooling: {}", pooling); + let pooling_type = match pooling.as_str() { + "mean" => LlamaPoolingType::Mean, + "none" => LlamaPoolingType::None, + "rank" => LlamaPoolingType::Rank, + _ => LlamaPoolingType::Unspecified, + }; + + let ctx_params = LlamaContextParams::default() + .with_n_threads_batch(std::thread::available_parallelism()?.get().try_into()?) + .with_embeddings(true) + .with_pooling_type(pooling_type); + println!("ctx_params: {:?}", ctx_params); + let mut ctx = model + .new_context(&backend, ctx_params) + .with_context(|| "unable to create the llama_context")?; + + let n_embd = model.n_embd(); + + let prompt_lines = { + let mut lines = Vec::new(); + for doc in documents { + // Todo! 
update to get eos and sep from model instead of hardcoding + lines.push(format!("{query}{eos}{sep}{doc}", sep = "", eos = "")); + } + lines + }; + + println!("prompt_lines: {:?}", prompt_lines); + // tokenize the prompt + let tokens_lines_list = prompt_lines + .iter() + .map(|line| model.str_to_token(line, AddBos::Always)) + .collect::, _>>() + .with_context(|| format!("failed to tokenize {:?}", prompt_lines))?; + + let n_ctx = ctx.n_ctx() as usize; + let n_ctx_train = model.n_ctx_train(); + + eprintln!("n_ctx = {n_ctx}, n_ctx_train = {n_ctx_train}"); + + if tokens_lines_list.iter().any(|tok| n_ctx < tok.len()) { + bail!("One of the provided prompts exceeds the size of the context window"); + } + + // print the prompt token-by-token + eprintln!(); + + for (i, token_line) in tokens_lines_list.iter().enumerate() { + eprintln!("Prompt {i} --> {}", prompt_lines[i]); + eprintln!("Number of tokens: {}", token_line.len()); + for token in token_line { + // Attempt to convert token to string and print it; if it fails, print the token instead + match model.token_to_str(*token, Special::Tokenize) { + Ok(token_str) => eprintln!("{token} --> {token_str}"), + Err(e) => { + eprintln!("Failed to convert token to string, error: {e}"); + eprintln!("Token value: {token}"); + } + } + } + eprintln!(); + } + + std::io::stderr().flush()?; + + // create a llama_batch with the size of the context + // we use this object to submit token data for decoding + let mut batch = LlamaBatch::new(2048, 1); + + // Todo! update to get n_embd to init vector size for better memory management + // let mut n_embd_count = if pooling == "none" { + // tokens_lines_list.iter().map(|tokens| tokens.len()).sum() + // } else { + // tokens_lines_list.len() + // }; + let mut embeddings_stored = 0; + let mut max_seq_id_batch = 0; + let mut output = Vec::with_capacity(tokens_lines_list.len()); + + let t_main_start = ggml_time_us(); + + for tokens in &tokens_lines_list { + // Flush the batch if the next prompt would exceed our batch size + if (batch.n_tokens() as usize + tokens.len()) > 2048 { + batch_decode( + &mut ctx, + &mut batch, + max_seq_id_batch, + n_embd, + &mut output, + normalise, + pooling.clone(), + )?; + embeddings_stored += if pooling == "none" { + batch.n_tokens() + } else { + max_seq_id_batch + }; + max_seq_id_batch = 0; + batch.clear(); + } + + batch.add_sequence(tokens, max_seq_id_batch, false)?; + max_seq_id_batch += 1; + } + // Handle final batch + batch_decode( + &mut ctx, + &mut batch, + max_seq_id_batch, + n_embd, + &mut output, + normalise, + pooling.clone(), + )?; + + let t_main_end = ggml_time_us(); + + for (j, embeddings) in output.iter().enumerate() { + if pooling == "none" { + eprintln!("embedding {j}: "); + for i in 0..n_embd as usize { + if !normalise { + eprint!("{:6.5} ", embeddings[i]); + } else { + eprint!("{:9.6} ", embeddings[i]); + } + } + eprintln!(); + } else if pooling == "rank" { + eprintln!("rerank score {j}: {:8.3}", embeddings[0]); + } else { + eprintln!("embedding {j}: "); + for i in 0..n_embd as usize { + if !normalise { + eprint!("{:6.5} ", embeddings[i]); + } else { + eprint!("{:9.6} ", embeddings[i]); + } + } + eprintln!(); + } + } + + let duration = Duration::from_micros((t_main_end - t_main_start) as u64); + let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum(); + eprintln!( + "Created embeddings for {} tokens in {:.2} s, speed {:.2} t/s\n", + total_tokens, + duration.as_secs_f32(), + total_tokens as f32 / duration.as_secs_f32() + ); + + println!("{}", ctx.timings()); + + 
Ok(()) +} + +fn batch_decode( + ctx: &mut LlamaContext, + batch: &mut LlamaBatch, + s_batch: i32, + n_embd: i32, + output: &mut Vec>, + normalise: bool, + pooling: String, +) -> Result<()> { + eprintln!( + "{}: n_tokens = {}, n_seq = {}", + stringify!(batch_decode), + batch.n_tokens(), + s_batch + ); + + // Clear previous kv_cache values + ctx.clear_kv_cache(); + + ctx.decode(batch).with_context(|| "llama_decode() failed")?; + + for i in 0..s_batch { + let embeddings = ctx + .embeddings_seq_ith(i) + .with_context(|| "Failed to get sequence embeddings")?; + let normalized = if normalise { + if pooling == "rank" { + normalize_embeddings(&embeddings, -1) + } else { + normalize_embeddings(&embeddings, 2) + } + } else { + embeddings.to_vec() + }; + output.push(normalized); + } + + batch.clear(); + + Ok(()) +} + +/// Normalizes embeddings based on different normalization strategies +fn normalize_embeddings(input: &[f32], embd_norm: i32) -> Vec { + let n = input.len(); + let mut output = vec![0.0; n]; + + let sum = match embd_norm { + -1 => 1.0, // no normalization + 0 => { + // max absolute + let max_abs = input.iter().map(|x| x.abs()).fold(0.0f32, f32::max) / 32760.0; + max_abs as f64 + } + 2 => { + // euclidean norm + input + .iter() + .map(|x| (*x as f64).powi(2)) + .sum::() + .sqrt() + } + p => { + // p-norm + let sum = input.iter().map(|x| (x.abs() as f64).powi(p)).sum::(); + sum.powf(1.0 / p as f64) + } + }; + + let norm = if sum > 0.0 { 1.0 / sum } else { 0.0 }; + + for i in 0..n { + output[i] = (input[i] as f64 * norm) as f32; + } + + output +} + +// /// Calculates cosine similarity between two embedding vectors +// fn embedding_similarity_cos(embd1: &[f32], embd2: &[f32]) -> f32 { +// assert_eq!(embd1.len(), embd2.len(), "Embedding vectors must be the same length"); + +// let (sum, sum1, sum2) = embd1.iter().zip(embd2.iter()).fold( +// (0.0f64, 0.0f64, 0.0f64), +// |(sum, sum1, sum2), (e1, e2)| { +// let e1 = *e1 as f64; +// let e2 = *e2 as f64; +// ( +// sum + e1 * e2, +// sum1 + e1 * e1, +// sum2 + e2 * e2 +// ) +// } +// ); + +// // Handle zero vectors +// if sum1 == 0.0 || sum2 == 0.0 { +// return if sum1 == 0.0 && sum2 == 0.0 { +// 1.0 // two zero vectors are similar +// } else { +// 0.0 +// }; +// } + +// (sum / (sum1.sqrt() * sum2.sqrt())) as f32 +// } diff --git a/llama-cpp-2/src/context/params.rs b/llama-cpp-2/src/context/params.rs index cfaf967b..892dc8dc 100644 --- a/llama-cpp-2/src/context/params.rs +++ b/llama-cpp-2/src/context/params.rs @@ -55,6 +55,8 @@ pub enum LlamaPoolingType { Cls = 2, /// Last pooling Last = 3, + /// Rank pooling + Rank = 4, } /// Create a `LlamaPoolingType` from a `c_int` - returns `LlamaPoolingType::Unspecified` if @@ -66,6 +68,7 @@ impl From for LlamaPoolingType { 1 => Self::Mean, 2 => Self::Cls, 3 => Self::Last, + 4 => Self::Rank, _ => Self::Unspecified, } } @@ -79,6 +82,7 @@ impl From for i32 { LlamaPoolingType::Mean => 1, LlamaPoolingType::Cls => 2, LlamaPoolingType::Last => 3, + LlamaPoolingType::Rank => 4, LlamaPoolingType::Unspecified => -1, } } From fab2b9e2052823c2381688d595cbc0621704bc74 Mon Sep 17 00:00:00 2001 From: Peter Willemsen Date: Fri, 7 Feb 2025 20:24:05 +0100 Subject: [PATCH 091/193] more reliable target directory estimation --- Cargo.toml | 9 +++++---- examples/simple/Cargo.toml | 4 ++-- llama-cpp-sys-2/build.rs | 41 ++++++++++++++++++++++---------------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 903bdfab..ca634345 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,11 @@ 
[workspace] resolver = "2" members = [ - "llama-cpp-sys-2", - "llama-cpp-2", - "examples/embeddings", - "examples/simple", "examples/reranker", + "llama-cpp-sys-2", + "llama-cpp-2", + "examples/embeddings", + "examples/simple", + "examples/reranker", ] [workspace.dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 8e3d5062..e473bb28 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -8,14 +8,14 @@ edition = "2021" [dependencies] llama-cpp-2 = { path = "../../llama-cpp-2", version = "0.1.69" } hf-hub = { workspace = true } -clap = { workspace = true , features = ["derive"] } +clap = { workspace = true, features = ["derive"] } anyhow = { workspace = true } encoding_rs = { workspace = true } tracing-subscriber = { workspace = true } [features] cuda = ["llama-cpp-2/cuda"] -metal = ["llama-cpp-2/metal"] +metal = ["llama-cpp-2/metal"] native = ["llama-cpp-2/native"] vulkan = ["llama-cpp-2/vulkan"] diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index be809fe1..f4b6a6df 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -1,9 +1,9 @@ use cmake::Config; use glob::glob; -use walkdir::DirEntry; use std::env; use std::path::{Path, PathBuf}; use std::process::Command; +use walkdir::DirEntry; macro_rules! debug_log { ($($arg:tt)*) => { @@ -13,19 +13,13 @@ macro_rules! debug_log { }; } -fn get_cargo_target_dir() -> Result> { - let out_dir = std::path::PathBuf::from(std::env::var("OUT_DIR")?); - let profile = std::env::var("PROFILE")?; - let mut target_dir = None; - let mut sub_path = out_dir.as_path(); - while let Some(parent) = sub_path.parent() { - if parent.ends_with(&profile) { - target_dir = Some(parent); - break; - } - sub_path = parent; - } - let target_dir = target_dir.ok_or("not found")?; +fn get_cargo_target_dir() -> Result> { + let out_dir = env::var("OUT_DIR")?; + let path = PathBuf::from(out_dir); + let target_dir = path + .ancestors() + .nth(3) + .ok_or("OUT_DIR is not deep enough")?; Ok(target_dir.to_path_buf()) } @@ -129,7 +123,10 @@ fn macos_link_search_path() -> Option { } fn is_hidden(e: &DirEntry) -> bool { - e.file_name().to_str().map(|s| s.starts_with('.')).unwrap_or_default() + e.file_name() + .to_str() + .map(|s| s.starts_with('.')) + .unwrap_or_default() } fn main() { @@ -167,9 +164,19 @@ fn main() { llama_src.join("ggml/src"), llama_src.join("common"), ]; - for entry in walkdir::WalkDir::new(&llama_src).into_iter().filter_entry(|e| !is_hidden(e)) { + for entry in walkdir::WalkDir::new(&llama_src) + .into_iter() + .filter_entry(|e| !is_hidden(e)) + { let entry = entry.expect("Failed to obtain entry"); - let rebuild = entry.file_name().to_str().map(|f| f.starts_with("CMake")).unwrap_or_default() || rebuild_on_children_of.iter().any(|src_folder| entry.path().starts_with(src_folder)); + let rebuild = entry + .file_name() + .to_str() + .map(|f| f.starts_with("CMake")) + .unwrap_or_default() + || rebuild_on_children_of + .iter() + .any(|src_folder| entry.path().starts_with(src_folder)); if rebuild { println!("cargo:rerun-if-changed={}", entry.path().display()); } From e59301b14f0f335dbbf08de138538f4120ef5a2c Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Sun, 9 Feb 2025 17:30:41 -0800 Subject: [PATCH 092/193] Exclude generated build.cpp --- llama-cpp-sys-2/Cargo.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index e0abdbd7..40ace7d9 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ 
b/llama-cpp-sys-2/Cargo.toml @@ -24,6 +24,7 @@ include = [ "/llama.cpp/convert_hf_to_gguf.py", # Yes, it's required + "!/llama.cpp/common/build-info.cpp", "/llama.cpp/common/build-info.cpp.in", "/llama.cpp/ggml/src/ggml-cuda.cu", @@ -73,4 +74,4 @@ vulkan = [] native = [] openmp = [] # Only has an impact on Android. -shared-stdcxx = [] \ No newline at end of file +shared-stdcxx = [] From a52729710f48a922533883dff2a26bbce3926b9e Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Sun, 9 Feb 2025 17:33:30 -0800 Subject: [PATCH 093/193] Fix packaging problem Comment explains why this broke & what would need to change upstream to remove the workaround. `cargo publish --dry-run -p llama-cpp-sys-2` now succeeds. --- llama-cpp-sys-2/Cargo.toml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 40ace7d9..3d48b036 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -24,6 +24,17 @@ include = [ "/llama.cpp/convert_hf_to_gguf.py", # Yes, it's required + # Erroneously the llama.cpp code currently generates the build-info.cpp + # into the source directory of the build instead of into the target directory + # as it should. Will try submitting something upstream to clean this up as + # well but for now explictly exclude this from the build. Previously this was + # implicitly excluded because the llama.cpp code was copied wholesale into the + # target directory for building which is why this problem wasn't visible before + # (i.e. we'd package the llama.cpp source from the submodule & thus this build-info.cpp + # generated file would still be ignored because it would only exist in the separate + # copy within the target directory. An alternative, if we do want to capture build-info.cpp + # within the package would be to change the CI task to add `--allow-dirty` to the package + # command. "!/llama.cpp/common/build-info.cpp", "/llama.cpp/common/build-info.cpp.in", From d89c4f277a85ec06442ed9a7f862ce8276daee9c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 05:41:45 +0000 Subject: [PATCH 094/193] chore(deps): bump docker/setup-buildx-action from 3.8.0 to 3.9.0 Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3.8.0 to 3.9.0. - [Release notes](https://github.com/docker/setup-buildx-action/releases) - [Commits](https://github.com/docker/setup-buildx-action/compare/6524bf65af31da8d45b59e8c27de4bd072b392f5...f7ce87c1d6bead3e36075b2ce75da1f6cc28aaca) --- updated-dependencies: - dependency-name: docker/setup-buildx-action dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/llama-cpp-rs-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index f2300ea1..adc47835 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -49,7 +49,7 @@ jobs: with: platforms: arm64,amd64 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@6524bf65af31da8d45b59e8c27de4bd072b392f5 + uses: docker/setup-buildx-action@f7ce87c1d6bead3e36075b2ce75da1f6cc28aaca - name: Build uses: docker/build-push-action@v6 with: From 201f67af44976e834672ea6159260b5eb79fec1a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 05:41:47 +0000 Subject: [PATCH 095/193] chore(deps): bump docker/setup-qemu-action from 3.3.0 to 3.4.0 Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3.3.0 to 3.4.0. - [Release notes](https://github.com/docker/setup-qemu-action/releases) - [Commits](https://github.com/docker/setup-qemu-action/compare/53851d14592bedcffcf25ea515637cff71ef929a...4574d27a4764455b42196d70a065bc6853246a25) --- updated-dependencies: - dependency-name: docker/setup-qemu-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/llama-cpp-rs-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index f2300ea1..317719b8 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -45,7 +45,7 @@ jobs: - name: checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - name: Setup QEMU - uses: docker/setup-qemu-action@53851d14592bedcffcf25ea515637cff71ef929a + uses: docker/setup-qemu-action@4574d27a4764455b42196d70a065bc6853246a25 with: platforms: arm64,amd64 - name: Set up Docker Buildx From 2e8265a455c542491769840ccd5e98451b21ac8a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 05:58:23 +0000 Subject: [PATCH 096/193] chore(deps): bump cmake from 0.1.53 to 0.1.54 Bumps [cmake](https://github.com/rust-lang/cmake-rs) from 0.1.53 to 0.1.54. - [Release notes](https://github.com/rust-lang/cmake-rs/releases) - [Changelog](https://github.com/rust-lang/cmake-rs/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cmake-rs/compare/v0.1.53...v0.1.54) --- updated-dependencies: - dependency-name: cmake dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 09445542..43c4e90e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -186,9 +186,9 @@ checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "cmake" -version = "0.1.53" +version = "0.1.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" dependencies = [ "cc", ] From 5f7f14e6158dc69eeee760722d89462532468b28 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 05:58:38 +0000 Subject: [PATCH 097/193] chore(deps): bump cc from 1.2.11 to 1.2.13 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.11 to 1.2.13. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.11...cc-v1.2.13) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 09445542..5558a87b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.11" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4730490333d58093109dc02c23174c3f4d490998c3fed3cc8e82d57afedb9cf" +checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index ca634345..84a83f87 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.11" +cc = "1.2.13" anyhow = "1.0.95" clap = "4.5.27" encoding_rs = "0.8.35" From b866663717959cbe633f5f7481623d462664078d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 16:43:55 +0000 Subject: [PATCH 098/193] chore(deps): bump clap from 4.5.27 to 4.5.28 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.27 to 4.5.28. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.27...clap_complete-v4.5.28) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 09445542..9f91129c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.27" +version = "4.5.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" +checksum = "3e77c3243bd94243c03672cb5154667347c457ca271254724f9f393aee1c05ff" dependencies = [ "clap_builder", "clap_derive", @@ -168,9 +168,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.24" +version = "4.5.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" +checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" dependencies = [ "heck", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index ca634345..202e1693 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.11" anyhow = "1.0.95" -clap = "4.5.27" +clap = "4.5.28" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From f600bf882eae65e83f62730c7208a0769bf18c7f Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Mon, 10 Feb 2025 10:58:08 -0800 Subject: [PATCH 099/193] Fix llama.cpp build dirtying src and breaking packaging Move the build-info.cpp to the output target after building to avoid polluting during packaging and causing verification to fail. --- llama-cpp-sys-2/build.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index f4b6a6df..4f0f058e 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -315,6 +315,11 @@ fn main() { .always_configure(false); let build_dir = config.build(); + std::fs::rename( + llama_src.join("common/build-info.cpp"), + build_dir.join("build-info.cpp"), + ) + .unwrap(); // Search paths println!("cargo:rustc-link-search={}", out_dir.join("lib").display()); From 7326d327f6314ad81c9c652294ebf9edce386e1e Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 10 Feb 2025 21:01:57 +0000 Subject: [PATCH 100/193] Bump version to 0.1.94 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4922b3a2..9f34972e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.93" +version = "0.1.94" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.93" +version = "0.1.94" dependencies = [ "encoding_rs", "enumflags2", @@ -665,7 +665,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.93" +version = "0.1.94" dependencies = [ "bindgen", "cc", @@ -1105,7 +1105,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.93" +version = "0.1.94" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 4bde6769..874753c3 100644 --- 
a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.93" +version = "0.1.94" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index e473bb28..e15e7334 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.93" +version = "0.1.94" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 72aae442..bdca9c4e 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.93" +version = "0.1.94" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 3d48b036..5af8ff38 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.93" +version = "0.1.94" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 7a29ac49b05d06c72176d914a1fa8794b1a80543 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Thu, 13 Feb 2025 12:02:08 -0800 Subject: [PATCH 101/193] Cleanup chat template API 1. Make the template not an optional for apply_chat_template. This ensures you don't accidentally use the chatml template. 2. Improve performance for the expected case of using get_chat_template by returning a new LlamaChatTemplate struct that internally stores the string as a CString. Unless you try to explicitly create a copy or print, there's no extra copy into a Rust string that's created. Similarly, get_chat_template -> apply_chat_template no longer copies the template string. 3. Improve documentation including documentating what the add_ass parameter does and suggestions on what values you probably want to use. Additionally, I've made get_chat_template and apply_chat_template docs refer to one another to make it easier to discover how to use this. --- llama-cpp-2/src/lib.rs | 9 ++- llama-cpp-2/src/model.rs | 150 ++++++++++++++++++++++++++++++--------- 2 files changed, 124 insertions(+), 35 deletions(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 61de5a65..3d79337f 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -69,9 +69,6 @@ pub enum LLamaCppError { /// There was an error while getting the chat template from a model. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum ChatTemplateError { - /// the buffer was too small. - #[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")] - BuffSizeError(usize), /// gguf has no chat template #[error("the model has no meta val - returned code {0}")] MissingTemplate(i32), @@ -80,6 +77,12 @@ pub enum ChatTemplateError { Utf8Error(#[from] std::str::Utf8Error), } +enum InternalChatTemplateError { + Permanent(ChatTemplateError), + /// the buffer was too small. 
+ RetryWithLargerBuffer(usize), +} + /// Failed to Load context #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum LlamaContextLoadError { diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 3dc02ee9..8b19c4bb 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -1,9 +1,10 @@ //! A safe wrapper around `llama_model`. -use std::ffi::{c_char, CString}; +use std::ffi::{c_char, CStr, CString}; use std::num::NonZeroU16; use std::os::raw::c_int; use std::path::Path; use std::ptr::NonNull; +use std::str::{FromStr, Utf8Error}; use crate::context::params::LlamaContextParams; use crate::context::LlamaContext; @@ -12,8 +13,9 @@ use crate::model::params::LlamaModelParams; use crate::token::LlamaToken; use crate::token_type::{LlamaTokenAttr, LlamaTokenAttrs}; use crate::{ - ApplyChatTemplateError, ChatTemplateError, LlamaContextLoadError, LlamaLoraAdapterInitError, - LlamaModelLoadError, NewLlamaChatMessageError, StringToTokenError, TokenToStringError, + ApplyChatTemplateError, ChatTemplateError, InternalChatTemplateError, LlamaContextLoadError, + LlamaLoraAdapterInitError, LlamaModelLoadError, NewLlamaChatMessageError, StringToTokenError, + TokenToStringError, }; pub mod params; @@ -34,6 +36,42 @@ pub struct LlamaLoraAdapter { pub(crate) lora_adapter: NonNull, } +/// A performance-friendly wrapper around [LlamaModel::get_chat_template] which is then +/// fed into [LlamaModel::apply_chat_template] to convert a list of messages into an LLM +/// prompt. Internally the template is stored as a CString to avoid round-trip conversions +/// within the FFI. +#[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash)] +pub struct LlamaChatTemplate(CString); + +impl LlamaChatTemplate { + /// Create a new template from a string. This can either be the name of a llama.cpp [chat template](https://github.com/ggerganov/llama.cpp/blob/8a8c4ceb6050bd9392609114ca56ae6d26f5b8f5/src/llama-chat.cpp#L27-L61) + /// like "chatml" or "llama3" or an actual Jinja template for llama.cpp to interpret. + pub fn new(template: &str) -> Result { + Ok(Self(CString::from_str(template)?)) + } + + /// Accesses the template as a c string reference. + pub fn as_c_str(&self) -> &CStr { + &self.0 + } + + /// Attempts to convert the CString into a Rust str reference. + pub fn to_str(&self) -> Result<&str, Utf8Error> { + self.0.to_str() + } + + /// Convenience method to create an owned String. + pub fn to_string(&self) -> Result { + self.to_str().map(str::to_string) + } +} + +impl std::fmt::Debug for LlamaChatTemplate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + /// A Safe wrapper around `llama_chat_message` #[derive(Debug, Eq, PartialEq, Clone)] pub struct LlamaChatMessage { @@ -408,41 +446,84 @@ impl LlamaModel { unsafe { llama_cpp_sys_2::llama_n_embd(self.model.as_ptr()) } } - /// Get chat template from model. - /// - /// # Errors - /// - /// * If the model has no chat template - /// * If the chat template is not a valid [`CString`]. 
- #[allow(clippy::missing_panics_doc)] // we statically know this will not panic as - pub fn get_chat_template(&self, buf_size: usize) -> Result { + fn get_chat_template_impl( + &self, + capacity: usize, + ) -> Result { // longest known template is about 1200 bytes from llama.cpp - let chat_temp = CString::new(vec![b'*'; buf_size]).expect("no null"); - let chat_ptr = chat_temp.into_raw(); - let chat_name = CString::new("tokenizer.chat_template").expect("no null bytes"); + // TODO: Once MaybeUninit support is better, this can be converted to use that instead of dummy initializing such a large array. + let mut chat_temp = vec![b'*' as u8; capacity]; + let chat_name = + CStr::from_bytes_with_nul(b"tokenizer.chat_template\0").expect("should have null byte"); let ret = unsafe { llama_cpp_sys_2::llama_model_meta_val_str( self.model.as_ptr(), chat_name.as_ptr(), - chat_ptr, - buf_size, + chat_temp.as_mut_ptr() as *mut c_char, + chat_temp.len(), ) }; if ret < 0 { - return Err(ChatTemplateError::MissingTemplate(ret)); + return Err(InternalChatTemplateError::Permanent( + ChatTemplateError::MissingTemplate(ret), + )); } - let template_c = unsafe { CString::from_raw(chat_ptr) }; - let template = template_c.to_str()?; + let returned_len = ret as usize; - let ret: usize = ret.try_into().unwrap(); - if template.len() < ret { - return Err(ChatTemplateError::BuffSizeError(ret + 1)); + if ret as usize >= capacity { + // >= is important because if the returned length is equal to capacity, it means we're missing a trailing null + // since the returned length doesn't count the trailing null. + return Err(InternalChatTemplateError::RetryWithLargerBuffer( + returned_len, + )); } - Ok(template.to_owned()) + assert_eq!( + chat_temp.get(returned_len), + Some(&0), + "should end with null byte" + ); + + chat_temp.resize(returned_len + 1, 0); + + Ok(LlamaChatTemplate(unsafe { + CString::from_vec_with_nul_unchecked(chat_temp) + })) + } + + /// Get chat template from model. If this fails, you may either want to fail to chat or pick the + /// specific shortcode that llama.cpp supports templates it has baked-in directly into its codebase + /// as fallbacks when the model doesn't contain. NOTE: If you don't specify a chat template, then + /// it uses chatml by default which is unlikely to actually be the correct template for your model + /// and you'll get weird results back. + /// + /// You supply this into [Self::apply_chat_template] to get back a string with the appropriate template + /// substitution applied to convert a list of messages into a prompt the LLM can use to complete + /// the chat. + /// + /// # Errors + /// + /// * If the model has no chat template + /// * If the chat template is not a valid [`CString`]. + #[allow(clippy::missing_panics_doc)] // we statically know this will not panic as + pub fn get_chat_template(&self) -> Result { + // Typical chat templates are quite small. Let's start with a small allocation likely to succeed. + // Ideally the performance of this would be negligible but uninitialized arrays in Rust are currently + // still not well supported so we end up initializing the chat template buffer twice. One idea might + // be to use a very small value here that will likely fail (like 0 or 1) and then use that to initialize. + // Not sure which approach is the most optimal but in practice this should work well. 
+ match self.get_chat_template_impl(200) { + Ok(t) => Ok(t), + Err(InternalChatTemplateError::Permanent(e)) => Err(e), + Err(InternalChatTemplateError::RetryWithLargerBuffer(actual_len)) => match self.get_chat_template_impl(actual_len + 1) { + Ok(t) => Ok(t), + Err(InternalChatTemplateError::Permanent(e)) => Err(e), + Err(InternalChatTemplateError::RetryWithLargerBuffer(unexpected_len)) => panic!("Was told that the template length was {actual_len} but now it's {unexpected_len}"), + } + } } /// Loads a model from a file. @@ -526,15 +607,25 @@ impl LlamaModel { /// Apply the models chat template to some messages. /// See https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template /// - /// `tmpl` of None means to use the default template provided by llama.cpp for the model + /// Unlike the llama.cpp apply_chat_template which just randomly uses the ChatML template when given + /// a null pointer for the template, this requires an explicit template to be specified. If you want to + /// use "chatml", then just do `LlamaChatTemplate::new("chatml")` or any other model name or template + /// string. + /// + /// Use [Self::get_chat_template] to retrieve the template baked into the model (this is the preferred + /// mechanism as using the wrong chat template can result in really unexpected responses from the LLM). + /// + /// You probably want to set `add_ass` to true so that the generated template string ends with a the + /// opening tag of the assistant. If you fail to leave a hanging chat tag, the model will likely generate + /// one into the output and the output may also have unexpected output aside from that. /// /// # Errors /// There are many ways this can fail. See [`ApplyChatTemplateError`] for more information. #[tracing::instrument(skip_all)] pub fn apply_chat_template( &self, - tmpl: Option, - chat: Vec, + tmpl: &LlamaChatTemplate, + chat: &[LlamaChatMessage], add_ass: bool, ) -> Result { // Buffer is twice the length of messages per their recommendation @@ -552,12 +643,7 @@ impl LlamaModel { }) .collect(); - // Set the tmpl pointer - let tmpl = tmpl.map(CString::new); - let tmpl_ptr = match &tmpl { - Some(str) => str.as_ref().map_err(Clone::clone)?.as_ptr(), - None => std::ptr::null(), - }; + let tmpl_ptr = tmpl.0.as_ptr(); let res = unsafe { llama_cpp_sys_2::llama_chat_apply_template( From 72c125549994457ad8e959df491a51d5ad088c04 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Thu, 13 Feb 2025 12:05:46 -0800 Subject: [PATCH 102/193] Run cargo fmt on project --- examples/simple/src/main.rs | 8 ++------ llama-cpp-2/src/log.rs | 3 ++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs index 2cea197d..9d4eef47 100644 --- a/examples/simple/src/main.rs +++ b/examples/simple/src/main.rs @@ -10,7 +10,6 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::Parser; use hf_hub::api::sync::ApiBuilder; use llama_cpp_2::context::params::LlamaContextParams; -use llama_cpp_2::{ggml_time_us, send_logs_to_tracing, LogOptions}; use llama_cpp_2::llama_backend::LlamaBackend; use llama_cpp_2::llama_batch::LlamaBatch; use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue; @@ -18,6 +17,7 @@ use llama_cpp_2::model::params::LlamaModelParams; use llama_cpp_2::model::LlamaModel; use llama_cpp_2::model::{AddBos, Special}; use llama_cpp_2::sampling::LlamaSampler; +use llama_cpp_2::{ggml_time_us, send_logs_to_tracing, LogOptions}; use std::ffi::CString; use std::io::Write; @@ -67,11 +67,7 @@ 
struct Args { help = "size of the prompt context (default: loaded from themodel)" )] ctx_size: Option, - #[arg( - short = 'v', - long, - help = "enable verbose llama.cpp logs", - )] + #[arg(short = 'v', long, help = "enable verbose llama.cpp logs")] verbose: bool, } diff --git a/llama-cpp-2/src/log.rs b/llama-cpp-2/src/log.rs index db6ff653..1c324b4b 100644 --- a/llama-cpp-2/src/log.rs +++ b/llama-cpp-2/src/log.rs @@ -171,7 +171,8 @@ impl State { } else { let level = self .previous_level - .load(std::sync::atomic::Ordering::Acquire) as llama_cpp_sys_2::ggml_log_level; + .load(std::sync::atomic::Ordering::Acquire) + as llama_cpp_sys_2::ggml_log_level; tracing::warn!( inferred_level = level, text = text, From 251b97eeeaaccaf33f4b06f1e7b2a3d33e55f969 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Thu, 13 Feb 2025 12:34:04 -0800 Subject: [PATCH 103/193] Pure refactor - cleanup build script The OS detection code was bothering me as it wasn't properly doing cross compilation (some places were and some weren't). Additionally, the OS detection was a bit haphazard. This is a pure cleanup that parses the information in TARGET up-front into an enum that is then checked instead of working with strings. --- llama-cpp-sys-2/build.rs | 148 +++++++++++++++++++++++++-------------- 1 file changed, 97 insertions(+), 51 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 4f0f058e..1f393e06 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -5,6 +5,23 @@ use std::path::{Path, PathBuf}; use std::process::Command; use walkdir::DirEntry; +enum WindowsVariant { + Msvc, + Other, +} + +enum AppleVariant { + MacOS, + Other, +} + +enum TargetOs { + Windows(WindowsVariant), + Apple(AppleVariant), + Linux, + Android, +} + macro_rules! debug_log { ($($arg:tt)*) => { if std::env::var("BUILD_DEBUG").is_ok() { @@ -13,6 +30,30 @@ macro_rules! 
debug_log { }; } +fn parse_target_os() -> Result<(TargetOs, String), String> { + let target = env::var("TARGET").unwrap(); + + if target.contains("windows") { + if target.ends_with("-windows-msvc") { + Ok((TargetOs::Windows(WindowsVariant::Msvc), target)) + } else { + Ok((TargetOs::Windows(WindowsVariant::Other), target)) + } + } else if target.contains("linux") { + Ok((TargetOs::Linux, target)) + } else if target.contains("apple") { + if target.ends_with("-apple-darwin") { + Ok((TargetOs::Apple(AppleVariant::MacOS), target)) + } else { + Ok((TargetOs::Apple(AppleVariant::Other), target)) + } + } else if target.contains("android") { + Ok((TargetOs::Android, target)) + } else { + Err(target) + } +} + fn get_cargo_target_dir() -> Result> { let out_dir = env::var("OUT_DIR")?; let path = PathBuf::from(out_dir); @@ -132,7 +173,8 @@ fn is_hidden(e: &DirEntry) -> bool { fn main() { println!("cargo:rerun-if-changed=build.rs"); - let target = env::var("TARGET").unwrap(); + let (target_os, target_triple) = + parse_target_os().unwrap_or_else(|t| panic!("Failed to parse target os {t}")); let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); let target_dir = get_cargo_target_dir().unwrap(); @@ -152,7 +194,7 @@ fn main() { println!("cargo:rerun-if-env-changed=LLAMA_BUILD_SHARED_LIBS"); println!("cargo:rerun-if-env-changed=LLAMA_STATIC_CRT"); - debug_log!("TARGET: {}", target); + debug_log!("TARGET: {}", target_triple); debug_log!("CARGO_MANIFEST_DIR: {}", manifest_dir); debug_log!("TARGET_DIR: {}", target_dir.display()); debug_log!("OUT_DIR: {}", out_dir.display()); @@ -232,15 +274,13 @@ fn main() { if build_shared_libs { "ON" } else { "OFF" }, ); - if cfg!(target_os = "macos") { + if matches!(target_os, TargetOs::Apple(_)) { config.define("GGML_BLAS", "OFF"); } - if cfg!(windows) { - config.static_crt(static_crt); - } + config.static_crt(static_crt); - if target.contains("android") { + if matches!(target_os, TargetOs::Android) { // build flags for android taken from this doc // https://github.com/ggerganov/llama.cpp/blob/master/docs/android.md let android_ndk = env::var("ANDROID_NDK") @@ -257,21 +297,21 @@ fn main() { } else { config.define("ANDROID_PLATFORM", "android-28"); } - if target.contains("aarch64") { + if target_triple.contains("aarch64") { config.cflag("-march=armv8.7a"); config.cxxflag("-march=armv8.7a"); - } else if target.contains("armv7") { + } else if target_triple.contains("armv7") { config.cflag("-march=armv8.7a"); config.cxxflag("-march=armv8.7a"); - } else if target.contains("x86_64") { + } else if target_triple.contains("x86_64") { config.cflag("-march=x86-64"); config.cxxflag("-march=x86-64"); - } else if target.contains("i686") { + } else if target_triple.contains("i686") { config.cflag("-march=i686"); config.cxxflag("-march=i686"); } else { // Rather than guessing just fail. 
- panic!("Unsupported Android target {target}"); + panic!("Unsupported Android target {target_triple}"); } config.define("GGML_LLAMAFILE", "OFF"); if cfg!(feature = "shared-stdcxx") { @@ -282,16 +322,19 @@ fn main() { if cfg!(feature = "vulkan") { config.define("GGML_VULKAN", "ON"); - if cfg!(windows) { - let vulkan_path = env::var("VULKAN_SDK") - .expect("Please install Vulkan SDK and ensure that VULKAN_SDK env variable is set"); - let vulkan_lib_path = Path::new(&vulkan_path).join("Lib"); - println!("cargo:rustc-link-search={}", vulkan_lib_path.display()); - println!("cargo:rustc-link-lib=vulkan-1"); - } - - if cfg!(target_os = "linux") { - println!("cargo:rustc-link-lib=vulkan"); + match target_os { + TargetOs::Windows(_) => { + let vulkan_path = env::var("VULKAN_SDK").expect( + "Please install Vulkan SDK and ensure that VULKAN_SDK env variable is set", + ); + let vulkan_lib_path = Path::new(&vulkan_path).join("Lib"); + println!("cargo:rustc-link-search={}", vulkan_lib_path.display()); + println!("cargo:rustc-link-lib=vulkan-1"); + } + TargetOs::Linux => { + println!("cargo:rustc-link-lib=vulkan"); + } + _ => (), } } @@ -302,7 +345,7 @@ fn main() { // Android doesn't have OpenMP support AFAICT and openmp is a default feature. Do this here // rather than modifying the defaults in Cargo.toml just in case someone enables the OpenMP feature // and tries to build for Android anyway. - if cfg!(feature = "openmp") && !target.contains("android") { + if cfg!(feature = "openmp") && !matches!(target_os, TargetOs::Android) { config.define("GGML_OPENMP", "ON"); } else { config.define("GGML_OPENMP", "OFF"); @@ -341,38 +384,41 @@ fn main() { } // OpenMP - if cfg!(feature = "openmp") && target.contains("gnu") { + if cfg!(feature = "openmp") && target_triple.contains("gnu") { println!("cargo:rustc-link-lib=gomp"); } - // Windows debug - if cfg!(all(debug_assertions, windows)) { - println!("cargo:rustc-link-lib=dylib=msvcrtd"); - } - - // // macOS - if cfg!(target_os = "macos") { - println!("cargo:rustc-link-lib=framework=Foundation"); - println!("cargo:rustc-link-lib=framework=Metal"); - println!("cargo:rustc-link-lib=framework=MetalKit"); - println!("cargo:rustc-link-lib=framework=Accelerate"); - println!("cargo:rustc-link-lib=c++"); - } - - // Linux - if cfg!(target_os = "linux") { - println!("cargo:rustc-link-lib=dylib=stdc++"); - } - - if target.contains("apple") { - // On (older) OSX we need to link against the clang runtime, - // which is hidden in some non-default path. - // - // More details at https://github.com/alexcrichton/curl-rust/issues/279. - if let Some(path) = macos_link_search_path() { - println!("cargo:rustc-link-lib=clang_rt.osx"); - println!("cargo:rustc-link-search={}", path); + match target_os { + TargetOs::Windows(WindowsVariant::Msvc) => { + if cfg!(debug_assertions) { + println!("cargo:rustc-link-lib=dylib=msvcrtd"); + } + } + TargetOs::Linux => { + println!("cargo:rustc-link-lib=dylib=stdc++"); + } + TargetOs::Apple(variant) => { + println!("cargo:rustc-link-lib=framework=Foundation"); + println!("cargo:rustc-link-lib=framework=Metal"); + println!("cargo:rustc-link-lib=framework=MetalKit"); + println!("cargo:rustc-link-lib=framework=Accelerate"); + println!("cargo:rustc-link-lib=c++"); + + match variant { + AppleVariant::MacOS => { + // On (older) OSX we need to link against the clang runtime, + // which is hidden in some non-default path. + // + // More details at https://github.com/alexcrichton/curl-rust/issues/279. 
+ if let Some(path) = macos_link_search_path() { + println!("cargo:rustc-link-lib=clang_rt.osx"); + println!("cargo:rustc-link-search={}", path); + } + } + AppleVariant::Other => (), + } } + _ => (), } // copy DLLs to target From 5a4dbd4429a1969d8940b67ba8b8a92dcd0366be Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Thu, 13 Feb 2025 12:41:48 -0800 Subject: [PATCH 104/193] Fix CPU inference performance when building MSVC Rust debug Workaround for https://github.com/rust-lang/cmake-rs/issues/240 --- llama-cpp-sys-2/build.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 1f393e06..b832fbbc 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -278,6 +278,22 @@ fn main() { config.define("GGML_BLAS", "OFF"); } + if (cfg!(debug_assertions) + || std::env::var("PROFILE").as_ref().map(String::as_str) == Ok("debug")) + && matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc)) + && profile == "Release" + { + // Debug Rust builds under MSVC turn off optimization even though we're ideally building the release profile of llama.cpp. + // Looks like an upstream bug: + // https://github.com/rust-lang/cmake-rs/issues/240 + // For now explicitly reinject the optimization flags that a CMake Release build is expected to have on in this scenario. + // This fixes CPU inference performance when part of a Rust debug build. + for flag in &["/O2", "/DNDEBUG", "/Ob2"] { + config.cflag(flag); + config.cxxflag(flag); + } + } + config.static_crt(static_crt); if matches!(target_os, TargetOs::Android) { From 1039ca3a2560b7ae72f88e2f1ce85e7ee123c199 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 14 Feb 2025 01:23:12 +0000 Subject: [PATCH 105/193] Bump version to 0.1.95 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f34972e..46c6c844 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.94" +version = "0.1.95" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.94" +version = "0.1.95" dependencies = [ "encoding_rs", "enumflags2", @@ -665,7 +665,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.94" +version = "0.1.95" dependencies = [ "bindgen", "cc", @@ -1105,7 +1105,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.94" +version = "0.1.95" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 874753c3..75a68d28 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.94" +version = "0.1.95" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index e15e7334..9d0e4964 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.94" +version = "0.1.95" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git 
a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index bdca9c4e..34aeadd5 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.94" +version = "0.1.95" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 5af8ff38..dc66bb0b 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.94" +version = "0.1.95" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From b808a686a35bbd2771510204c3df1a96204ce998 Mon Sep 17 00:00:00 2001 From: J / Jacob Babich Date: Fri, 14 Feb 2025 14:48:57 -0500 Subject: [PATCH 106/193] Fast forward to https://github.com/ggerganov/llama.cpp/commit/300907b2110cc17b4337334dc397e05de2d8f5e0 --- llama-cpp-sys-2/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 6171c9d2..300907b2 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 6171c9d25820ccf676b243c172868819d882848f +Subproject commit 300907b2110cc17b4337334dc397e05de2d8f5e0 From 7ba0b95d3119d74dcb14a6a85b6216251594a823 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Fri, 14 Feb 2025 14:23:02 -0800 Subject: [PATCH 107/193] Fix the build crashing if target/ and src folders are on different disks --- llama-cpp-sys-2/build.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index b832fbbc..e227a951 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -96,7 +96,6 @@ fn extract_lib_names(out_dir: &Path, build_shared_libs: bool) -> Vec { stem_str.strip_prefix("lib").unwrap_or(stem_str) } else { if path.extension() == Some(std::ffi::OsStr::new("a")) { - // panic!("renaming {:?} to {:?}", &path, path.join(format!("lib{}.a", stem_str))); let target = path.parent().unwrap().join(format!("lib{}.a", stem_str)); std::fs::rename(&path, &target).unwrap_or_else(|e| { panic!("Failed to rename {path:?} to {target:?}: {e:?}"); @@ -374,11 +373,18 @@ fn main() { .always_configure(false); let build_dir = config.build(); - std::fs::rename( - llama_src.join("common/build-info.cpp"), - build_dir.join("build-info.cpp"), - ) - .unwrap(); + let build_info_src = llama_src.join("common/build-info.cpp"); + let build_info_target = build_dir.join("build-info.cpp"); + std::fs::rename(&build_info_src,&build_info_target).unwrap_or_else(|move_e| { + // Rename may fail if the target directory is on a different filesystem/disk from the source. + // Fall back to copy + delete to achieve the same effect in this case. + std::fs::copy(&build_info_src, &build_info_src).unwrap_or_else(|copy_e| { + panic!("Failed to rename {build_info_src:?} to {build_info_target:?}. 
Move failed with {move_e:?} and copy failed with {copy_e:?}"); + }); + std::fs::remove_file(&build_info_src).unwrap_or_else(|e| { + panic!("Failed to delete {build_info_src:?} after copying to {build_info_target:?}: {e:?} (move failed because {move_e:?})"); + }); + }); // Search paths println!("cargo:rustc-link-search={}", out_dir.join("lib").display()); From 6f3e3ec0f217cab2e4e11bc93e30eb75f6fd783e Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sat, 15 Feb 2025 17:53:48 +0000 Subject: [PATCH 108/193] Bump version to 0.1.96 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 46c6c844..84452ae3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.95" +version = "0.1.96" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.95" +version = "0.1.96" dependencies = [ "encoding_rs", "enumflags2", @@ -665,7 +665,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.95" +version = "0.1.96" dependencies = [ "bindgen", "cc", @@ -1105,7 +1105,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.95" +version = "0.1.96" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 75a68d28..b4959c43 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.95" +version = "0.1.96" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 9d0e4964..70ceaf49 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.95" +version = "0.1.96" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 34aeadd5..efca6570 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.95" +version = "0.1.96" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index dc66bb0b..bff5ca8e 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.95" +version = "0.1.96" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From c9555830184a825a69f89ce2460bd84aedec1316 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Sat, 15 Feb 2025 15:02:24 -0800 Subject: [PATCH 109/193] Fix Android build Android's triple is linux-android. Move the detection of Android above Linux as otherwise Android builds are detected as plain vanilla Linux & break because they try to leverage OpenMP. 
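Below is a minimal, hypothetical sketch of the reasoning above (not the crate's actual build.rs; the `classify` function and its return values are illustrative only). Because Android target triples such as aarch64-linux-android also contain the substring "linux", the Android branch has to be tested before the generic Linux branch, exactly as this patch reorders the checks:

    // Illustrative only: Android triples contain "linux", so they must be
    // checked before the generic Linux branch or they fall through into it.
    fn classify(triple: &str) -> &'static str {
        if triple.contains("windows") {
            "windows"
        } else if triple.contains("apple") {
            "apple"
        } else if triple.contains("android") {
            // e.g. "aarch64-linux-android" is matched here, not as plain Linux
            "android"
        } else if triple.contains("linux") {
            "linux"
        } else {
            "unknown"
        }
    }

    fn main() {
        // With the branches in this order the Android triple is classified
        // correctly; swapping the android/linux checks would report "linux".
        assert_eq!(classify("aarch64-linux-android"), "android");
        assert_eq!(classify("x86_64-unknown-linux-gnu"), "linux");
    }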
--- llama-cpp-sys-2/build.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index e227a951..ec4ac7ce 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -39,8 +39,6 @@ fn parse_target_os() -> Result<(TargetOs, String), String> { } else { Ok((TargetOs::Windows(WindowsVariant::Other), target)) } - } else if target.contains("linux") { - Ok((TargetOs::Linux, target)) } else if target.contains("apple") { if target.ends_with("-apple-darwin") { Ok((TargetOs::Apple(AppleVariant::MacOS), target)) @@ -49,6 +47,8 @@ fn parse_target_os() -> Result<(TargetOs, String), String> { } } else if target.contains("android") { Ok((TargetOs::Android, target)) + } else if target.contains("linux") { + Ok((TargetOs::Linux, target)) } else { Err(target) } From 146c17de0552271972779c539c84505a830692e8 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sat, 15 Feb 2025 23:18:03 +0000 Subject: [PATCH 110/193] Bump version to 0.1.97 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 84452ae3..b45248c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.96" +version = "0.1.97" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.96" +version = "0.1.97" dependencies = [ "encoding_rs", "enumflags2", @@ -665,7 +665,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.96" +version = "0.1.97" dependencies = [ "bindgen", "cc", @@ -1105,7 +1105,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.96" +version = "0.1.97" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index b4959c43..24f83abf 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.96" +version = "0.1.97" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 70ceaf49..ac3d0378 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.96" +version = "0.1.97" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index efca6570..53b8b7bb 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.96" +version = "0.1.97" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index bff5ca8e..dc346900 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.96" +version = "0.1.97" edition = "2021" license = "MIT OR Apache-2.0" repository = 
"https://github.com/utilityai/llama-cpp-rs" From 806b59e400ab6fd97b3c5fdc5d75a231614d7c6f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 05:57:58 +0000 Subject: [PATCH 111/193] chore(deps): bump clap from 4.5.28 to 4.5.29 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.28 to 4.5.29. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.28...clap_complete-v4.5.29) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b45248c1..5133c586 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.28" +version = "4.5.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e77c3243bd94243c03672cb5154667347c457ca271254724f9f393aee1c05ff" +checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.27" +version = "4.5.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" +checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 47d93f5a..3e8302cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.13" anyhow = "1.0.95" -clap = "4.5.28" +clap = "4.5.29" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 868b24332b32c587012b0b432c29c8b918f6cccf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 05:58:10 +0000 Subject: [PATCH 112/193] chore(deps): bump cc from 1.2.13 to 1.2.14 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.13 to 1.2.14. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.13...cc-v1.2.14) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b45248c1..a6cbb525 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.13" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda" +checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 47d93f5a..322d8179 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.13" +cc = "1.2.14" anyhow = "1.0.95" clap = "4.5.28" encoding_rs = "0.8.35" From f3211665c4df89ac2c41d08637e7ddab94aa63bb Mon Sep 17 00:00:00 2001 From: Navid Haghighat Date: Thu, 20 Feb 2025 02:32:59 +0100 Subject: [PATCH 113/193] Add Logit Bias Sampler --- llama-cpp-2/src/logit_bias.rs | 93 +++++++++++++++++++++++++++++++++++ llama-cpp-2/src/sampling.rs | 37 ++++++++++++++ llama-cpp-2/src/token.rs | 1 + 3 files changed, 131 insertions(+) create mode 100644 llama-cpp-2/src/logit_bias.rs diff --git a/llama-cpp-2/src/logit_bias.rs b/llama-cpp-2/src/logit_bias.rs new file mode 100644 index 00000000..631c9395 --- /dev/null +++ b/llama-cpp-2/src/logit_bias.rs @@ -0,0 +1,93 @@ +//! Safe wrapper around `llama_logit_bias`. +use crate::token::LlamaToken; + +/// A transparent wrapper around `llama_logit_bias`. +/// +/// Represents a bias to be applied to a specific token during text generation. +/// The bias modifies the likelihood of the token being selected. +/// +/// Do not rely on `repr(transparent)` for this type. It should be considered an implementation +/// detail and may change across minor versions. +#[derive(Clone, Copy, Debug, PartialEq)] +#[repr(transparent)] +#[allow(clippy::module_name_repetitions)] +pub struct LlamaLogitBias { + logit_bias: llama_cpp_sys_2::llama_logit_bias, +} + +impl LlamaLogitBias { + /// Creates a new logit bias for a specific token with the given bias value. + /// + /// # Examples + /// ``` + /// # use llama_cpp_2::token::{LlamaToken, logit_bias::LlamaLogitBias}; + /// let token = LlamaToken::new(1); + /// let bias = LlamaLogitBias::new(token, 1.5); + /// ``` + #[must_use] + pub fn new(LlamaToken(token): LlamaToken, bias: f32) -> Self { + Self { + logit_bias: llama_cpp_sys_2::llama_logit_bias { + token, + bias, + }, + } + } + + /// Gets the token this bias applies to. + /// + /// # Examples + /// ``` + /// # use llama_cpp_2::token::{LlamaToken, logit_bias::LlamaLogitBias}; + /// let token = LlamaToken::new(1); + /// let bias = LlamaLogitBias::new(token, 1.5); + /// assert_eq!(bias.token(), token); + /// ``` + #[must_use] + pub fn token(&self) -> LlamaToken { + LlamaToken(self.logit_bias.token) + } + + /// Gets the bias value. + /// + /// # Examples + /// ``` + /// # use llama_cpp_2::token::{LlamaToken, logit_bias::LlamaLogitBias}; + /// let token = LlamaToken::new(1); + /// let bias = LlamaLogitBias::new(token, 1.5); + /// assert_eq!(bias.bias(), 1.5); + /// ``` + #[must_use] + pub fn bias(&self) -> f32 { + self.logit_bias.bias + } + + /// Sets the token this bias applies to. 
+ /// + /// # Examples + /// ``` + /// # use llama_cpp_2::token::{LlamaToken, logit_bias::LlamaLogitBias}; + /// let token = LlamaToken::new(1); + /// let mut bias = LlamaLogitBias::new(token, 1.5); + /// let new_token = LlamaToken::new(2); + /// bias.set_token(new_token); + /// assert_eq!(bias.token(), new_token); + /// ``` + pub fn set_token(&mut self, token: LlamaToken) { + self.logit_bias.token = token.0; + } + + /// Sets the bias value. + /// + /// # Examples + /// ``` + /// # use llama_cpp_2::token::{LlamaToken, logit_bias::LlamaLogitBias}; + /// let token = LlamaToken::new(1); + /// let mut bias = LlamaLogitBias::new(token, 1.5); + /// bias.set_bias(2.0); + /// assert_eq!(bias.bias(), 2.0); + /// ``` + pub fn set_bias(&mut self, bias: f32) { + self.logit_bias.bias = bias; + } +} \ No newline at end of file diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index d79f351b..1c9663bf 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -7,6 +7,7 @@ use std::fmt::{Debug, Formatter}; use crate::context::LlamaContext; use crate::model::LlamaModel; use crate::token::data_array::LlamaTokenDataArray; +use crate::token::logit_bias::LlamaLogitBias; use crate::token::LlamaToken; /// A safe wrapper around `llama_sampler`. @@ -376,6 +377,42 @@ impl LlamaSampler { let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_greedy() }; Self { sampler } } + + /// Creates a sampler that applies bias values to specific tokens during sampling. + /// + /// # Parameters + /// - ``n_vocab``: [`LlamaModel::n_vocab`] + /// - ``biases``: Slice of [`LlamaLogitBias`] values specifying token-bias pairs + /// + /// # Example + /// ```rust + /// use llama_cpp_2::token::{LlamaToken, logit_bias::LlamaLogitBias}; + /// use llama_cpp_2::sampling::LlamaSampler; + /// + /// let biases = vec![ + /// LlamaLogitBias::new(LlamaToken(1), 1.5), // Increase probability of token 1 + /// LlamaLogitBias::new(LlamaToken(2), -1.0), // Decrease probability of token 2 + /// ]; + /// + /// // Assuming vocab_size of 32000 + /// let sampler = LlamaSampler::logit_bias(32000, &biases); + /// ``` + #[must_use] + pub fn logit_bias(n_vocab: i32, biases: &[LlamaLogitBias]) -> Self { + + let data = biases.as_ptr().cast::(); + + let sampler = unsafe { + llama_cpp_sys_2::llama_sampler_init_logit_bias( + n_vocab, + biases.len() as i32, + data, + ) + }; + + Self { sampler } + } + } impl Drop for LlamaSampler { diff --git a/llama-cpp-2/src/token.rs b/llama-cpp-2/src/token.rs index 3019420d..abb4fbbf 100644 --- a/llama-cpp-2/src/token.rs +++ b/llama-cpp-2/src/token.rs @@ -5,6 +5,7 @@ use std::fmt::Display; pub mod data; pub mod data_array; +pub mod logit_bias; /// A safe wrapper for `llama_token`. 
#[repr(transparent)] From 1f83d19ccd53415f2e2d95d6a1db65d650eb40f0 Mon Sep 17 00:00:00 2001 From: Navid Haghighat Date: Thu, 20 Feb 2025 03:49:26 +0100 Subject: [PATCH 114/193] Move logit_bias.rs to the correct location --- llama-cpp-2/src/{ => token}/logit_bias.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llama-cpp-2/src/{ => token}/logit_bias.rs (100%) diff --git a/llama-cpp-2/src/logit_bias.rs b/llama-cpp-2/src/token/logit_bias.rs similarity index 100% rename from llama-cpp-2/src/logit_bias.rs rename to llama-cpp-2/src/token/logit_bias.rs From e9b0bd7d2548bedf12b2d04b71001495fad7ee09 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Thu, 20 Feb 2025 21:46:24 -0800 Subject: [PATCH 115/193] Expose a bunch of information about the model It can be useful to understand things about the model being loaded. --- llama-cpp-2/src/model.rs | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 8b19c4bb..6425dc79 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -92,6 +92,15 @@ impl LlamaChatMessage { } } +/// The Rope type that's used within the model. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RopeType { + Norm, + NeoX, + MRope, + Vision, +} + /// How to determine if we should prepend a bos token to tokens #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum AddBos { @@ -446,6 +455,50 @@ impl LlamaModel { unsafe { llama_cpp_sys_2::llama_n_embd(self.model.as_ptr()) } } + /// Returns the total size of all the tensors in the model in bytes. + pub fn size(&self) -> u64 { + unsafe { llama_cpp_sys_2::llama_model_size(self.model.as_ptr()) } + } + + /// Returns the number of parameters in the model. + pub fn n_params(&self) -> u64 { + unsafe { llama_cpp_sys_2::llama_model_n_params(self.model.as_ptr()) } + } + + /// Returns whether the model is a recurrent network (Mamba, RWKV, etc) + pub fn is_recurrent(&self) -> bool { + unsafe { llama_cpp_sys_2::llama_model_is_recurrent(self.model.as_ptr()) } + } + + /// Returns the number of layers within the model. + pub fn n_layer(&self) -> u32 { + // It's never possible for this to panic because while the API interface is defined as an int32_t, + // the field it's accessing is a uint32_t. + u32::try_from(unsafe { llama_cpp_sys_2::llama_model_n_layer(self.model.as_ptr()) }).unwrap() + } + + /// Returns the number of attention heads within the model. + pub fn n_head(&self) -> u32 { + // It's never possible for this to panic because while the API interface is defined as an int32_t, + // the field it's accessing is a uint32_t. + u32::try_from(unsafe { llama_cpp_sys_2::llama_model_n_head(self.model.as_ptr()) }).unwrap() + } + + /// Returns the rope type of the model. 
+ pub fn rope_type(&self) -> Option { + match unsafe { llama_cpp_sys_2::llama_model_rope_type(self.model.as_ptr()) } { + llama_cpp_sys_2::LLAMA_ROPE_TYPE_NONE => None, + llama_cpp_sys_2::LLAMA_ROPE_TYPE_NORM => Some(RopeType::Norm), + llama_cpp_sys_2::LLAMA_ROPE_TYPE_NEOX => Some(RopeType::NeoX), + llama_cpp_sys_2::LLAMA_ROPE_TYPE_MROPE => Some(RopeType::MRope), + llama_cpp_sys_2::LLAMA_ROPE_TYPE_VISION => Some(RopeType::Vision), + rope_type => { + tracing::error!(rope_type = rope_type, "Unexpected rope type from llama.cpp"); + None + } + } + } + fn get_chat_template_impl( &self, capacity: usize, From 899c21763f78ca6745f8ce25518f607b64a27a51 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Thu, 20 Feb 2025 21:50:51 -0800 Subject: [PATCH 116/193] Expose additional information about the initialized backend --- llama-cpp-2/src/llama_backend.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llama-cpp-2/src/llama_backend.rs b/llama-cpp-2/src/llama_backend.rs index 938356f7..1cc3fa3d 100644 --- a/llama-cpp-2/src/llama_backend.rs +++ b/llama-cpp-2/src/llama_backend.rs @@ -70,6 +70,21 @@ impl LlamaBackend { Ok(LlamaBackend {}) } + /// Was the code built for a GPU backend & is a supported one available. + pub fn supports_gpu_offload(&self) -> bool { + unsafe { llama_cpp_sys_2::llama_supports_gpu_offload() } + } + + /// Does this platform support loading the model via mmap. + pub fn supports_mmap(&self) -> bool { + unsafe { llama_cpp_sys_2::llama_supports_mmap() } + } + + /// Does this platform support locking the model in RAM. + pub fn supports_mlock(&self) -> bool { + unsafe { llama_cpp_sys_2::llama_supports_mlock() } + } + /// Change the output of llama.cpp's logging to be voided instead of pushed to `stderr`. pub fn void_logs(&mut self) { unsafe extern "C" fn void_log( From 3dca53f2c90d6f3bbfcac3ad22dc8e5a4dd7e6cb Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 21 Feb 2025 20:15:04 +0000 Subject: [PATCH 117/193] Bump version to 0.1.98 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20a5e3bb..48f1a2a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.97" +version = "0.1.98" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.97" +version = "0.1.98" dependencies = [ "encoding_rs", "enumflags2", @@ -665,7 +665,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.97" +version = "0.1.98" dependencies = [ "bindgen", "cc", @@ -1105,7 +1105,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.97" +version = "0.1.98" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 24f83abf..c223491e 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.97" +version = "0.1.98" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index ac3d0378..7c9a5c24 100644 --- a/examples/simple/Cargo.toml +++ 
b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.97" +version = "0.1.98" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 53b8b7bb..e836af25 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.97" +version = "0.1.98" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index dc346900..5cf25353 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.97" +version = "0.1.98" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 30df407c364a752076a18f740f206f174afcc186 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 21 Feb 2025 20:22:58 +0000 Subject: [PATCH 118/193] Bump version to 0.1.99 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 48f1a2a1..ba71de31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.98" +version = "0.1.99" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.98" +version = "0.1.99" dependencies = [ "encoding_rs", "enumflags2", @@ -665,7 +665,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.98" +version = "0.1.99" dependencies = [ "bindgen", "cc", @@ -1105,7 +1105,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.98" +version = "0.1.99" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index c223491e..07a22ddc 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.98" +version = "0.1.99" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 7c9a5c24..a891fc7b 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.98" +version = "0.1.99" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index e836af25..cb96b4f5 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.98" +version = "0.1.99" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 5cf25353..8ab91931 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = 
"llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.98" +version = "0.1.99" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From dabcb105ccfeac309f63d760cec63d0de4ad3bbf Mon Sep 17 00:00:00 2001 From: Navid Haghighat Date: Sat, 22 Feb 2025 20:27:08 +0100 Subject: [PATCH 119/193] Add grammar_lazy sampler --- llama-cpp-2/src/sampling.rs | 43 +++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 1c9663bf..c0c1c84c 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -239,6 +239,49 @@ impl LlamaSampler { Self { sampler } } + /// Lazy grammar sampler, introduced in + /// + /// This sampler enforces grammar rules only when specific trigger words or tokens are encountered. + /// + /// # Panics + /// - If `grammar_str` or `grammar_root` contain null bytes + /// - If any trigger word contains null bytes + #[must_use] + pub fn grammar_lazy( + model: &LlamaModel, + grammar_str: &str, + grammar_root: &str, + trigger_words: impl IntoIterator>, + trigger_tokens: &[LlamaToken], + ) -> Self { + let grammar_str = CString::new(grammar_str).unwrap(); + let grammar_root = CString::new(grammar_root).unwrap(); + + let trigger_word_cstrings: Vec = trigger_words + .into_iter() + .map(|word| CString::new(word.as_ref()).unwrap()) + .collect(); + + let mut trigger_word_ptrs: Vec<*const c_char> = trigger_word_cstrings + .iter() + .map(|cs| cs.as_ptr()) + .collect(); + + let sampler = unsafe { + llama_cpp_sys_2::llama_sampler_init_grammar_lazy( + model.vocab_ptr(), + grammar_str.as_ptr(), + grammar_root.as_ptr(), + trigger_word_ptrs.as_mut_ptr(), + trigger_word_ptrs.len(), + trigger_tokens.as_ptr().cast(), + trigger_tokens.len(), + ) + }; + + Self { sampler } + } + /// DRY sampler, designed by p-e-w, as described in: /// , porting Koboldcpp /// implementation authored by pi6am: From c1e17d78740bc4d904b65f9c4b3588f30a41a67b Mon Sep 17 00:00:00 2001 From: Navid Haghighat Date: Sat, 22 Feb 2025 20:34:17 +0100 Subject: [PATCH 120/193] Add top_n_sigma sampler --- llama-cpp-2/src/sampling.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index c0c1c84c..8488ed4f 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -191,6 +191,37 @@ impl LlamaSampler { Self { sampler } } + /// Top-nσ sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" + /// + /// + /// This method filters logits by selecting only those within *n* standard deviations of the mean. 
+ /// + /// # Parameters + /// - `n`: Number of standard deviations from the mean to include in sampling + /// + /// # Example + /// ```rust + /// use llama_cpp_2::sampling::LlamaSampler; + /// use llama_cpp_2::token::{ + /// LlamaToken, + /// data::LlamaTokenData, + /// data_array::LlamaTokenDataArray + /// }; + /// + /// let mut data_array = LlamaTokenDataArray::new(vec![ + /// LlamaTokenData::new(LlamaToken(0), 0.0, 0.0), + /// LlamaTokenData::new(LlamaToken(1), 1.0, 0.0), + /// LlamaTokenData::new(LlamaToken(2), 2.0, 0.0), + /// ], false); + /// + /// data_array.apply_sampler(&mut LlamaSampler::top_n_sigma(2.0)); + /// ``` + #[must_use] + pub fn top_n_sigma(n: f32) -> Self { + let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_top_n_sigma(n) }; + Self { sampler } + } + /// Locally Typical Sampling implementation described in the paper . #[must_use] pub fn typical(p: f32, min_keep: usize) -> Self { From 71a8e22a0a84091e19bdc0cdb06467ab4b36cdf4 Mon Sep 17 00:00:00 2001 From: Navid Haghighat Date: Sat, 22 Feb 2025 20:39:10 +0100 Subject: [PATCH 121/193] Add reset sampler --- llama-cpp-2/src/sampling.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 8488ed4f..982cd524 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -62,6 +62,15 @@ impl LlamaSampler { self } + /// Resets the internal state of the sampler. + /// + /// This can be useful when you want to start fresh with a sampler without creating a new instance. + pub fn reset(&mut self) { + unsafe { + llama_cpp_sys_2::llama_sampler_reset(self.sampler); + } + } + /// Combines a list of samplers into a single sampler that applies each component sampler one /// after another. /// From 914bce3c2672c04560c496ba7d5f2438ad807b80 Mon Sep 17 00:00:00 2001 From: Navid Haghighat Date: Sat, 22 Feb 2025 20:51:20 +0100 Subject: [PATCH 122/193] Add get_seed method --- llama-cpp-2/src/sampling.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index 982cd524..a659ab73 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -71,6 +71,17 @@ impl LlamaSampler { } } + /// Gets the random seed used by this sampler. + /// + /// Returns: + /// - For random samplers (dist, mirostat, mirostat_v2): returns their current seed + /// - For sampler chains: returns the first non-default seed found in reverse order + /// - For all other samplers: returns 0xFFFFFFFF + #[must_use] + pub fn get_seed(&self) -> u32 { + unsafe { llama_cpp_sys_2::llama_sampler_get_seed(self.sampler) } + } + /// Combines a list of samplers into a single sampler that applies each component sampler one /// after another. 
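// Sketch only, not part of the patch above: one way the new `reset` and
// `get_seed` helpers might be combined, assuming the crate's existing `dist`,
// `greedy`, and `chain_simple` constructors.
use llama_cpp_2::sampling::LlamaSampler;

fn reseed_sketch() {
    let mut sampler = LlamaSampler::chain_simple([
        LlamaSampler::dist(1234), // random component carrying the seed
        LlamaSampler::greedy(),
    ]);
    // Per the docs above, a chain reports its first non-default seed (1234 here).
    let seed = sampler.get_seed();
    println!("sampling with seed {seed}");
    // Clears accumulated sampler state without rebuilding the chain.
    sampler.reset();
}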
/// From 75de250309a68778175eefbaf75b4814a09ff8b6 Mon Sep 17 00:00:00 2001 From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Date: Mon, 24 Feb 2025 08:31:36 -0800 Subject: [PATCH 123/193] Create LISENCE-MIT --- LISENCE-MIT | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 LISENCE-MIT diff --git a/LISENCE-MIT b/LISENCE-MIT new file mode 100644 index 00000000..7eadd881 --- /dev/null +++ b/LISENCE-MIT @@ -0,0 +1,25 @@ +Copyright (c) Dial AI + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. From 1698d45dab1b63c9aa4dd6e5090a2ad03bddc34c Mon Sep 17 00:00:00 2001 From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Date: Mon, 24 Feb 2025 08:33:01 -0800 Subject: [PATCH 124/193] Create LICENSE-APACHE --- LICENSE-APACHE | 176 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 LICENSE-APACHE diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 00000000..1b5ec8b7 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,176 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS From 97147ab00213fe252be13659fc4065b603dc9dc2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 16:37:01 +0000 Subject: [PATCH 125/193] chore(deps): bump clap from 4.5.29 to 4.5.31 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.29 to 4.5.31. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.29...v4.5.31) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba71de31..f1793022 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.29" +version = "4.5.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184" +checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.29" +version = "4.5.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9" +checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 54a63bb3..c7eb6070 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.14" anyhow = "1.0.95" -clap = "4.5.29" +clap = "4.5.31" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 93a76449a2d4b0211b58d21680721472a27da028 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 16:37:02 +0000 Subject: [PATCH 126/193] chore(deps): bump cc from 1.2.14 to 1.2.15 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.14 to 1.2.15. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.14...cc-v1.2.15) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba71de31..ece240ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.14" +version = "1.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" +checksum = "c736e259eea577f443d5c86c304f9f4ae0295c43f3ba05c21f1d66b5f06001af" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 54a63bb3..83a894cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.14" +cc = "1.2.15" anyhow = "1.0.95" clap = "4.5.29" encoding_rs = "0.8.35" From 2031385f8b72a9a01d2809c955922897ee76cbb1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 16:44:10 +0000 Subject: [PATCH 127/193] chore(deps): bump anyhow from 1.0.95 to 1.0.96 Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.95 to 1.0.96. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.95...1.0.96) --- updated-dependencies: - dependency-name: anyhow dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0b76942f..6c5851b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,9 +68,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.95" +version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" +checksum = "6b964d184e89d9b6b67dd2715bc8e74cf3107fb2b529990c90cf517326150bf4" [[package]] name = "base64" diff --git a/Cargo.toml b/Cargo.toml index 2541f7cc..def0c7eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.15" -anyhow = "1.0.95" +anyhow = "1.0.96" clap = "4.5.31" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 3997277b22462e39dc5a8c3389573877a5ee33cb Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 24 Feb 2025 16:48:20 +0000 Subject: [PATCH 128/193] Bump version to 0.1.100 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0b76942f..c0f04bfd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.99" +version = "0.1.100" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.99" +version = "0.1.100" dependencies = [ "encoding_rs", "enumflags2", @@ -665,7 +665,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.99" +version = "0.1.100" dependencies = [ "bindgen", "cc", @@ -1105,7 +1105,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.99" +version = "0.1.100" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 07a22ddc..a87a1b0e 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.99" +version = "0.1.100" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index a891fc7b..70e7deb0 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.99" +version = "0.1.100" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index cb96b4f5..c703a2be 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.99" +version = "0.1.100" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 8ab91931..3175c736 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.99" +version = "0.1.100" 
edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 2946b7f9b1cd4b7a1b2e7607344e48b0e5a9bf8e Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 24 Feb 2025 19:15:59 +0000 Subject: [PATCH 129/193] Bump version to 0.1.101 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66cf2087..3f1c9f7c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.100" +version = "0.1.101" dependencies = [ "anyhow", "clap", @@ -653,7 +653,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.100" +version = "0.1.101" dependencies = [ "encoding_rs", "enumflags2", @@ -665,7 +665,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.100" +version = "0.1.101" dependencies = [ "bindgen", "cc", @@ -1105,7 +1105,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.100" +version = "0.1.101" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index a87a1b0e..f1067b8d 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.100" +version = "0.1.101" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 70e7deb0..b3173a21 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.100" +version = "0.1.101" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index c703a2be..27a34b2d 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.100" +version = "0.1.101" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 3175c736..fe45ce25 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.100" +version = "0.1.101" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From b30abebcabfcdbe0a3cf9f511227710abc7707de Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Tue, 25 Feb 2025 11:45:56 +0100 Subject: [PATCH 130/193] explicitly apply optimization flags to msvc compiler also on other profiles than debug --- llama-cpp-sys-2/build.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ec4ac7ce..5710c472 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -277,10 +277,7 @@ fn main() { config.define("GGML_BLAS", "OFF"); } - if (cfg!(debug_assertions) - || std::env::var("PROFILE").as_ref().map(String::as_str) == Ok("debug")) - && matches!(target_os, 
TargetOs::Windows(WindowsVariant::Msvc)) - && profile == "Release" + if (matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc)) && matches!(profile.as_str(), "Release" | "RelWithDebInfo" | "MinSizeRel")) { // Debug Rust builds under MSVC turn off optimization even though we're ideally building the release profile of llama.cpp. // Looks like an upstream bug: From 1d49ebf6a784458105dede65a4f8440049623ad3 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Tue, 25 Feb 2025 12:22:03 +0100 Subject: [PATCH 131/193] use CString::new instead of Cstring::from_Str to init chat template --- llama-cpp-2/src/model.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 6425dc79..9c83a795 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU16; use std::os::raw::c_int; use std::path::Path; use std::ptr::NonNull; -use std::str::{FromStr, Utf8Error}; +use std::str::Utf8Error; use crate::context::params::LlamaContextParams; use crate::context::LlamaContext; @@ -47,7 +47,7 @@ impl LlamaChatTemplate { /// Create a new template from a string. This can either be the name of a llama.cpp [chat template](https://github.com/ggerganov/llama.cpp/blob/8a8c4ceb6050bd9392609114ca56ae6d26f5b8f5/src/llama-chat.cpp#L27-L61) /// like "chatml" or "llama3" or an actual Jinja template for llama.cpp to interpret. pub fn new(template: &str) -> Result { - Ok(Self(CString::from_str(template)?)) + Ok(Self(CString::new(template)?)) } /// Accesses the template as a c string reference. From cfa76bd567516d897aae85d868a3b26fc8295134 Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Fri, 21 Feb 2025 18:11:36 -0800 Subject: [PATCH 132/193] Don't imply dynamic llama.cpp just because CUDA is on Link against CUDA statically as well to maintain consistency with GGML_STATIC although technically that's our discretion. 
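For illustration, a downstream build that enables the `cuda` feature can still probe at
runtime whether offload is actually usable, via the backend helpers introduced earlier in
this series (a sketch under those assumptions, not code from this patch):

    use llama_cpp_2::llama_backend::LlamaBackend;

    fn main() -> Result<(), llama_cpp_2::LLamaCppError> {
        let backend = LlamaBackend::init()?;
        if backend.supports_gpu_offload() {
            println!("GPU offload available (CUDA linked statically)");
        } else {
            println!("CPU only; mmap supported: {}", backend.supports_mmap());
        }
        Ok(())
    }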
--- Cargo.lock | 10 ++++++++++ llama-cpp-2/Cargo.toml | 1 + llama-cpp-sys-2/Cargo.toml | 5 ++++- llama-cpp-sys-2/build.rs | 31 ++++++++++++++++++++++++++++++- 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 48f1a2a1..85918c3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -336,6 +336,15 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "find_cuda_helper" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad" +dependencies = [ + "glob", +] + [[package]] name = "flate2" version = "1.0.30" @@ -670,6 +679,7 @@ dependencies = [ "bindgen", "cc", "cmake", + "find_cuda_helper", "glob", "walkdir", ] diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index e836af25..97028540 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -21,6 +21,7 @@ encoding_rs = { workspace = true } [features] default = ["openmp", "android-shared-stdcxx"] cuda = ["llama-cpp-sys-2/cuda"] +cuda-no-vmm = ["cuda", "llama-cpp-sys-2/cuda-no-vmm"] metal = ["llama-cpp-sys-2/metal"] dynamic-link = ["llama-cpp-sys-2/dynamic-link"] vulkan = ["llama-cpp-sys-2/vulkan"] diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 5cf25353..6854794d 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -74,15 +74,18 @@ include = [ bindgen = { workspace = true } cc = { workspace = true, features = ["parallel"] } cmake = "0.1" +find_cuda_helper = "0.2.0" glob = "0.3.2" walkdir = "2" [features] cuda = [] +# Disables the need to dynamically link against libcuda.so / cuda.dll +cuda-no-vmm = ["cuda"] metal = [] dynamic-link = [] vulkan = [] native = [] openmp = [] # Only has an impact on Android. -shared-stdcxx = [] +shared-stdcxx = [] \ No newline at end of file diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index ec4ac7ce..206baddf 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -179,7 +179,7 @@ fn main() { let target_dir = get_cargo_target_dir().unwrap(); let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get CARGO_MANIFEST_DIR"); let llama_src = Path::new(&manifest_dir).join("llama.cpp"); - let build_shared_libs = cfg!(feature = "cuda") || cfg!(feature = "dynamic-link"); + let build_shared_libs = cfg!(feature = "dynamic-link"); let build_shared_libs = std::env::var("LLAMA_BUILD_SHARED_LIBS") .map(|v| v == "1") @@ -355,6 +355,10 @@ fn main() { if cfg!(feature = "cuda") { config.define("GGML_CUDA", "ON"); + + if cfg!(feature = "cuda-no-vmm") { + config.define("GGML_CUDA_NO_VMM", "ON"); + } } // Android doesn't have OpenMP support AFAICT and openmp is a default feature. 
Do this here @@ -394,6 +398,31 @@ fn main() { ); println!("cargo:rustc-link-search={}", build_dir.display()); + if cfg!(feature = "cuda") && !build_shared_libs { + println!("cargo:rerun-if-env-changed=CUDA_PATH"); + + for lib_dir in find_cuda_helper::find_cuda_lib_dirs() { + println!("cargo:rustc-link-search=native={}", lib_dir.display()); + } + + // Logic from ggml-cuda/CMakeLists.txt + println!("cargo:rustc-link-lib=static=cudart_static"); + if matches!(target_os, TargetOs::Windows(_)) { + println!("cargo:rustc-link-lib=static=cublas"); + println!("cargo:rustc-link-lib=static=cublasLt"); + } else { + println!("cargo:rustc-link-lib=static=cublas_static"); + println!("cargo:rustc-link-lib=static=cublasLt_static"); + } + + // Need to link against libcuda.so unless GGML_CUDA_NO_VMM is defined. + if !cfg!(feature = "cuda-no-vmm") { + println!("cargo:rustc-link-lib=cuda"); + } + + println!("cargo:rustc-link-lib=static=culibos"); + } + // Link libraries let llama_libs_kind = if build_shared_libs { "dylib" } else { "static" }; let llama_libs = extract_lib_names(&out_dir, build_shared_libs); From ef3a8a37f59245bb65e98702ba6f3bde717b1396 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 25 Feb 2025 19:56:04 +0000 Subject: [PATCH 133/193] Bump version to 0.1.102 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 37fcddd5..447bf77f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.101" +version = "0.1.102" dependencies = [ "anyhow", "clap", @@ -662,7 +662,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.101" +version = "0.1.102" dependencies = [ "encoding_rs", "enumflags2", @@ -674,7 +674,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.101" +version = "0.1.102" dependencies = [ "bindgen", "cc", @@ -1115,7 +1115,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.101" +version = "0.1.102" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index f1067b8d..c96551ec 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.101" +version = "0.1.102" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index b3173a21..cb5d4e07 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.101" +version = "0.1.102" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 8235ca03..d85ca5af 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.101" +version = "0.1.102" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index d37a143a..0d07eefd 100644 --- 
a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.101" +version = "0.1.102" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 748d58d05774d049bf91ee963ef7405d8d332618 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Tue, 25 Feb 2025 20:50:58 +0000 Subject: [PATCH 134/193] Bump version to 0.1.103 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 447bf77f..f581047b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.102" +version = "0.1.103" dependencies = [ "anyhow", "clap", @@ -662,7 +662,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.102" +version = "0.1.103" dependencies = [ "encoding_rs", "enumflags2", @@ -674,7 +674,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.102" +version = "0.1.103" dependencies = [ "bindgen", "cc", @@ -1115,7 +1115,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.102" +version = "0.1.103" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index c96551ec..c3c5b533 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.102" +version = "0.1.103" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index cb5d4e07..98867c57 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.102" +version = "0.1.103" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index d85ca5af..bf52467c 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.102" +version = "0.1.103" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 0d07eefd..6db13f98 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.102" +version = "0.1.103" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 727419ab4d30e7a53c482bbf44428d41c61e8e9a Mon Sep 17 00:00:00 2001 From: Kusaanko <39370373+kusaanko@users.noreply.github.com> Date: Wed, 26 Feb 2025 18:28:33 +0900 Subject: [PATCH 135/193] Fix to check should be the event recorded --- llama-cpp-2/src/log.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/llama-cpp-2/src/log.rs b/llama-cpp-2/src/log.rs index 1c324b4b..e77f94bb 100644 --- a/llama-cpp-2/src/log.rs +++ b/llama-cpp-2/src/log.rs 
@@ -142,16 +142,18 @@ impl State { let (meta, fields) = meta_for_level(level); tracing::dispatcher::get_default(|dispatcher| { - dispatcher.event(&tracing::Event::new( - meta, - &meta.fields().value_set(&[ - (&fields.message, Some(&text as &dyn tracing::field::Value)), - ( - &fields.target, - module.as_ref().map(|s| s as &dyn tracing::field::Value), - ), - ]), - )); + if dispatcher.enabled(meta) { + dispatcher.event(&tracing::Event::new( + meta, + &meta.fields().value_set(&[ + (&fields.message, Some(&text as &dyn tracing::field::Value)), + ( + &fields.target, + module.as_ref().map(|s| s as &dyn tracing::field::Value), + ), + ]), + )); + } }); } From 5f3a29ed9445a50a64acc4d331c8a1e914e0fe91 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 26 Feb 2025 11:58:49 +0100 Subject: [PATCH 136/193] copy src into target, not src --- llama-cpp-sys-2/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 6179ab60..e2a56b38 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -379,7 +379,7 @@ fn main() { std::fs::rename(&build_info_src,&build_info_target).unwrap_or_else(|move_e| { // Rename may fail if the target directory is on a different filesystem/disk from the source. // Fall back to copy + delete to achieve the same effect in this case. - std::fs::copy(&build_info_src, &build_info_src).unwrap_or_else(|copy_e| { + std::fs::copy(&build_info_src, &build_info_target).unwrap_or_else(|copy_e| { panic!("Failed to rename {build_info_src:?} to {build_info_target:?}. Move failed with {move_e:?} and copy failed with {copy_e:?}"); }); std::fs::remove_file(&build_info_src).unwrap_or_else(|e| { From bde92481525aa4c21c3de74911b46f4615880e2e Mon Sep 17 00:00:00 2001 From: Vitali Lovich Date: Fri, 28 Feb 2025 22:36:03 -0800 Subject: [PATCH 137/193] Expose n_head_kv --- llama-cpp-2/src/model.rs | 7 +++++++ llama-cpp-sys-2/llama.cpp | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 9c83a795..69e938c5 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -484,6 +484,13 @@ impl LlamaModel { u32::try_from(unsafe { llama_cpp_sys_2::llama_model_n_head(self.model.as_ptr()) }).unwrap() } + /// Returns the number of KV attention heads. + pub fn n_head_kv(&self) -> u32 { + // It's never possible for this to panic because while the API interface is defined as an int32_t, + // the field it's accessing is a uint32_t. + u32::try_from(unsafe { llama_cpp_sys_2::llama_model_n_head_kv(self.model.as_ptr()) }).unwrap() + } + /// Returns the rope type of the model. pub fn rope_type(&self) -> Option { match unsafe { llama_cpp_sys_2::llama_model_rope_type(self.model.as_ptr()) } { diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 300907b2..06c2b156 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 300907b2110cc17b4337334dc397e05de2d8f5e0 +Subproject commit 06c2b1561d8b882bc018554591f8c35eb04ad30e From 4809af31e29828565733138538ce8651dc92718a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Mar 2025 05:09:39 +0000 Subject: [PATCH 138/193] chore(deps): bump docker/setup-qemu-action from 3.4.0 to 3.6.0 Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3.4.0 to 3.6.0. 
- [Release notes](https://github.com/docker/setup-qemu-action/releases) - [Commits](https://github.com/docker/setup-qemu-action/compare/4574d27a4764455b42196d70a065bc6853246a25...29109295f81e9208d7d86ff1c6c12d2833863392) --- updated-dependencies: - dependency-name: docker/setup-qemu-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/llama-cpp-rs-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index 170a1d29..62f0a0ab 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -45,7 +45,7 @@ jobs: - name: checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - name: Setup QEMU - uses: docker/setup-qemu-action@4574d27a4764455b42196d70a065bc6853246a25 + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 with: platforms: arm64,amd64 - name: Set up Docker Buildx From c8a569086accd2821026dfaf840f5125c7397aaa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Mar 2025 05:58:20 +0000 Subject: [PATCH 139/193] chore(deps): bump anyhow from 1.0.96 to 1.0.97 Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.96 to 1.0.97. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.96...1.0.97) --- updated-dependencies: - dependency-name: anyhow dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f581047b..abf819d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,9 +68,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.96" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b964d184e89d9b6b67dd2715bc8e74cf3107fb2b529990c90cf517326150bf4" +checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" [[package]] name = "base64" diff --git a/Cargo.toml b/Cargo.toml index def0c7eb..74629883 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.15" -anyhow = "1.0.96" +anyhow = "1.0.97" clap = "4.5.31" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 792828aa10caa0ad9190eb0bdefaab887add06c0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 5 Mar 2025 23:27:13 +0000 Subject: [PATCH 140/193] chore(deps): bump docker/setup-buildx-action from 3.9.0 to 3.10.0 Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3.9.0 to 3.10.0. - [Release notes](https://github.com/docker/setup-buildx-action/releases) - [Commits](https://github.com/docker/setup-buildx-action/compare/f7ce87c1d6bead3e36075b2ce75da1f6cc28aaca...b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2) --- updated-dependencies: - dependency-name: docker/setup-buildx-action dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/llama-cpp-rs-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index 62f0a0ab..5277c82d 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -49,7 +49,7 @@ jobs: with: platforms: arm64,amd64 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@f7ce87c1d6bead3e36075b2ce75da1f6cc28aaca + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 - name: Build uses: docker/build-push-action@v6 with: From 958a1d0f1d1f4def8df70e7a11952a8a0bcddd1f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 5 Mar 2025 23:27:21 +0000 Subject: [PATCH 141/193] chore(deps): bump cc from 1.2.15 to 1.2.16 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.15 to 1.2.16. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.15...cc-v1.2.16) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index abf819d6..bef01d80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.15" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c736e259eea577f443d5c86c304f9f4ae0295c43f3ba05c21f1d66b5f06001af" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 74629883..f7d4fa9e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.15" +cc = "1.2.16" anyhow = "1.0.97" clap = "4.5.31" encoding_rs = "0.8.35" From 4ba3962587081a458ce521ed8f72226b2dcd184e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 17:05:02 +0000 Subject: [PATCH 142/193] chore(deps): bump ring from 0.17.8 to 0.17.13 Bumps [ring](https://github.com/briansmith/ring) from 0.17.8 to 0.17.13. - [Changelog](https://github.com/briansmith/ring/blob/main/RELEASES.md) - [Commits](https://github.com/briansmith/ring/commits) --- updated-dependencies: - dependency-name: ring dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bef01d80..ed22933b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -635,7 +635,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.52.5", + "windows-targets 0.48.5", ] [[package]] @@ -957,15 +957,14 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.8" +version = "0.17.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" dependencies = [ "cc", "cfg-if", "getrandom", "libc", - "spin", "untrusted", "windows-sys 0.52.0", ] @@ -1131,12 +1130,6 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -1437,7 +1430,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.48.0", ] [[package]] From e1b544819093e06a1b04a8cbcd3da157820067a0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Mar 2025 06:03:15 +0000 Subject: [PATCH 143/193] chore(deps): bump clap from 4.5.31 to 4.5.32 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.31 to 4.5.32. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/v4.5.31...clap_complete-v4.5.32) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 16 ++++++++-------- Cargo.toml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ed22933b..5e97a30f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.31" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" +checksum = "6088f3ae8c3608d19260cd7445411865a485688711b78b5be70d78cd96136f83" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.31" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" +checksum = "22a7ef7f676155edfb82daa97f99441f3ebf4a58d5e32f295a56259f1b6facc8" dependencies = [ "anstream", "anstyle", @@ -168,9 +168,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.28" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" dependencies = [ "heck", "proc-macro2", @@ -635,7 +635,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -1430,7 +1430,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index f7d4fa9e..d5854438 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.16" anyhow = "1.0.97" -clap = "4.5.31" +clap = "4.5.32" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 15882655ca7dbf9e99254425554d9c2f86e5d971 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Mar 2025 05:31:16 +0000 Subject: [PATCH 144/193] chore(deps): bump cc from 1.2.16 to 1.2.17 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.16 to 1.2.17. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.16...cc-v1.2.17) --- updated-dependencies: - dependency-name: cc dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5e97a30f..26391b55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.16" +version = "1.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" +checksum = "1fcb57c740ae1daf453ae85f16e37396f672b039e00d9d866e07ddb24e328e3a" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index d5854438..615fe4fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.16" +cc = "1.2.17" anyhow = "1.0.97" clap = "4.5.32" encoding_rs = "0.8.35" From 21eee351b0a06170cc6fe9e80121bf05da37b047 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Mon, 24 Mar 2025 23:47:52 +0100 Subject: [PATCH 145/193] reimplement get_chat_template --- llama-cpp-2/src/lib.rs | 11 +++-- llama-cpp-2/src/model.rs | 93 ++++++++++----------------------------- llama-cpp-sys-2/llama.cpp | 2 +- 3 files changed, 33 insertions(+), 73 deletions(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 3d79337f..6e251728 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -69,9 +69,14 @@ pub enum LLamaCppError { /// There was an error while getting the chat template from a model. #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum ChatTemplateError { - /// gguf has no chat template - #[error("the model has no meta val - returned code {0}")] - MissingTemplate(i32), + /// gguf has no chat template (by that name) + #[error("chat template not found - returned null pointer")] + MissingTemplate, + + /// chat template contained a null byte + #[error("null byte in string {0}")] + NullError(#[from] NulError), + /// The chat template was not valid utf8. #[error(transparent)] Utf8Error(#[from] std::str::Utf8Error), diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 69e938c5..e566e400 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -506,83 +506,38 @@ impl LlamaModel { } } - fn get_chat_template_impl( - &self, - capacity: usize, - ) -> Result { - // longest known template is about 1200 bytes from llama.cpp - // TODO: Once MaybeUninit support is better, this can be converted to use that instead of dummy initializing such a large array. - let mut chat_temp = vec![b'*' as u8; capacity]; - let chat_name = - CStr::from_bytes_with_nul(b"tokenizer.chat_template\0").expect("should have null byte"); - - let ret = unsafe { - llama_cpp_sys_2::llama_model_meta_val_str( - self.model.as_ptr(), - chat_name.as_ptr(), - chat_temp.as_mut_ptr() as *mut c_char, - chat_temp.len(), - ) - }; - - if ret < 0 { - return Err(InternalChatTemplateError::Permanent( - ChatTemplateError::MissingTemplate(ret), - )); - } - - let returned_len = ret as usize; - - if ret as usize >= capacity { - // >= is important because if the returned length is equal to capacity, it means we're missing a trailing null - // since the returned length doesn't count the trailing null. 
- return Err(InternalChatTemplateError::RetryWithLargerBuffer( - returned_len, - )); - } - - assert_eq!( - chat_temp.get(returned_len), - Some(&0), - "should end with null byte" - ); - - chat_temp.resize(returned_len + 1, 0); - - Ok(LlamaChatTemplate(unsafe { - CString::from_vec_with_nul_unchecked(chat_temp) - })) - } - - /// Get chat template from model. If this fails, you may either want to fail to chat or pick the - /// specific shortcode that llama.cpp supports templates it has baked-in directly into its codebase - /// as fallbacks when the model doesn't contain. NOTE: If you don't specify a chat template, then - /// it uses chatml by default which is unlikely to actually be the correct template for your model - /// and you'll get weird results back. + /// Get chat template from model by name. If the name is None, the default chat template will be returned. /// /// You supply this into [Self::apply_chat_template] to get back a string with the appropriate template /// substitution applied to convert a list of messages into a prompt the LLM can use to complete /// the chat. /// + /// You could also use an external jinja parser, like minijinja, to parse jinja templates not + /// supported by the llama.cpp template engine.. + /// /// # Errors /// - /// * If the model has no chat template + /// * If the model has no chat template by that name /// * If the chat template is not a valid [`CString`]. - #[allow(clippy::missing_panics_doc)] // we statically know this will not panic as - pub fn get_chat_template(&self) -> Result { - // Typical chat templates are quite small. Let's start with a small allocation likely to succeed. - // Ideally the performance of this would be negligible but uninitialized arrays in Rust are currently - // still not well supported so we end up initializing the chat template buffer twice. One idea might - // be to use a very small value here that will likely fail (like 0 or 1) and then use that to initialize. - // Not sure which approach is the most optimal but in practice this should work well. 
- match self.get_chat_template_impl(200) { - Ok(t) => Ok(t), - Err(InternalChatTemplateError::Permanent(e)) => Err(e), - Err(InternalChatTemplateError::RetryWithLargerBuffer(actual_len)) => match self.get_chat_template_impl(actual_len + 1) { - Ok(t) => Ok(t), - Err(InternalChatTemplateError::Permanent(e)) => Err(e), - Err(InternalChatTemplateError::RetryWithLargerBuffer(unexpected_len)) => panic!("Was told that the template length was {actual_len} but now it's {unexpected_len}"), - } + pub fn get_chat_template( + &self, + name: Option<&str>, + ) -> Result { + let name_cstr = name.map(CString::new); + let name_ptr = match name_cstr { + Some(Ok(name)) => name.as_ptr(), + _ => std::ptr::null(), + }; + let result = + unsafe { llama_cpp_sys_2::llama_model_chat_template(self.model.as_ptr(), name_ptr) }; + + // Convert result to Rust String if not null + if result.is_null() { + Err(ChatTemplateError::MissingTemplate) + } else { + let chat_template_cstr = unsafe { CStr::from_ptr(result) }; + let chat_template = CString::new(chat_template_cstr.to_bytes())?; + Ok(LlamaChatTemplate(chat_template)) } } diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 06c2b156..2b65ae30 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 06c2b1561d8b882bc018554591f8c35eb04ad30e +Subproject commit 2b65ae30299b9c67e25c51ee567e9a2ef22279ab From 6c2640f1c3e0e9a295a92d3ddb88ccad5ee5782e Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Tue, 25 Mar 2025 12:07:14 +0100 Subject: [PATCH 146/193] rename get_chat_template to chat_template --- llama-cpp-2/src/model.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index e566e400..dd99d198 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -506,20 +506,20 @@ impl LlamaModel { } } - /// Get chat template from model by name. If the name is None, the default chat template will be returned. + /// Get chat template from model by name. If the name parameter is None, the default chat template will be returned. /// /// You supply this into [Self::apply_chat_template] to get back a string with the appropriate template /// substitution applied to convert a list of messages into a prompt the LLM can use to complete /// the chat. /// - /// You could also use an external jinja parser, like minijinja, to parse jinja templates not - /// supported by the llama.cpp template engine.. + /// You could also use an external jinja parser, like [minijinja](https://github.com/mitsuhiko/minijinja), + /// to parse jinja templates not supported by the llama.cpp template engine. /// /// # Errors /// /// * If the model has no chat template by that name /// * If the chat template is not a valid [`CString`]. - pub fn get_chat_template( + fn chat_template( &self, name: Option<&str>, ) -> Result { From eaf0782551c22fa1ae241ec8f2add7c8e5962b59 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 26 Mar 2025 11:23:17 +0100 Subject: [PATCH 147/193] make LlamaModel::chat_template public again --- llama-cpp-2/src/model.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index dd99d198..669ccf02 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -519,7 +519,7 @@ impl LlamaModel { /// /// * If the model has no chat template by that name /// * If the chat template is not a valid [`CString`]. 
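// A minimal usage sketch for the name-based lookup introduced above (assumptions: `model`
// is an already-loaded `LlamaModel`; the helper name `pick_template` and the fallback
// shortcode "chatml" are illustrative only). Passing `None` asks for the model's default
// template; a named lookup such as `Some("rag")` succeeds only if the GGUF metadata
// carries a template under that name, otherwise `ChatTemplateError::MissingTemplate`.
fn pick_template(model: &LlamaModel) -> Result<LlamaChatTemplate, ChatTemplateError> {
    match model.chat_template(None) {
        Ok(template) => Ok(template),
        // No template baked into the model: fall back to one of llama.cpp's built-in
        // shortcodes. The `?` relies on the `NullError(#[from] NulError)` conversion
        // added to `ChatTemplateError` in this patch.
        Err(ChatTemplateError::MissingTemplate) => Ok(LlamaChatTemplate::new("chatml")?),
        Err(other) => Err(other),
    }
}
// The returned template is what `LlamaModel::apply_chat_template` expects when turning a
// list of chat messages into a prompt.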
- fn chat_template( + pub fn chat_template( &self, name: Option<&str>, ) -> Result { From 6ed6248b091502ff072d11916c5bb3145a669ad2 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 26 Mar 2025 15:54:51 +0100 Subject: [PATCH 148/193] rename references to get_chat_template in doc strings --- llama-cpp-2/src/model.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 669ccf02..00b5bea6 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -36,7 +36,7 @@ pub struct LlamaLoraAdapter { pub(crate) lora_adapter: NonNull, } -/// A performance-friendly wrapper around [LlamaModel::get_chat_template] which is then +/// A performance-friendly wrapper around [LlamaModel::chat_template] which is then /// fed into [LlamaModel::apply_chat_template] to convert a list of messages into an LLM /// prompt. Internally the template is stored as a CString to avoid round-trip conversions /// within the FFI. @@ -627,7 +627,7 @@ impl LlamaModel { /// use "chatml", then just do `LlamaChatTemplate::new("chatml")` or any other model name or template /// string. /// - /// Use [Self::get_chat_template] to retrieve the template baked into the model (this is the preferred + /// Use [Self::chat_template] to retrieve the template baked into the model (this is the preferred /// mechanism as using the wrong chat template can result in really unexpected responses from the LLM). /// /// You probably want to set `add_ass` to true so that the generated template string ends with a the From 593257eb3d2621dd058e33e4a57e0390750fa1bb Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 26 Mar 2025 16:00:25 +0100 Subject: [PATCH 149/193] remove unused error type --- llama-cpp-2/src/lib.rs | 6 ------ llama-cpp-2/src/model.rs | 5 ++--- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 6e251728..a1857950 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -82,12 +82,6 @@ pub enum ChatTemplateError { Utf8Error(#[from] std::str::Utf8Error), } -enum InternalChatTemplateError { - Permanent(ChatTemplateError), - /// the buffer was too small. - RetryWithLargerBuffer(usize), -} - /// Failed to Load context #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum LlamaContextLoadError { diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index 00b5bea6..cb4a33bd 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -13,9 +13,8 @@ use crate::model::params::LlamaModelParams; use crate::token::LlamaToken; use crate::token_type::{LlamaTokenAttr, LlamaTokenAttrs}; use crate::{ - ApplyChatTemplateError, ChatTemplateError, InternalChatTemplateError, LlamaContextLoadError, - LlamaLoraAdapterInitError, LlamaModelLoadError, NewLlamaChatMessageError, StringToTokenError, - TokenToStringError, + ApplyChatTemplateError, ChatTemplateError, LlamaContextLoadError, LlamaLoraAdapterInitError, + LlamaModelLoadError, NewLlamaChatMessageError, StringToTokenError, TokenToStringError, }; pub mod params; From ad4f7f5ab7372a1788d52c0136ab4c34d35e0110 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 31 Mar 2025 05:41:54 +0000 Subject: [PATCH 150/193] chore(deps): bump clap from 4.5.32 to 4.5.34 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.32 to 4.5.34. 
- [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.32...clap_complete-v4.5.34) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 26391b55..1e0f2685 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.32" +version = "4.5.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6088f3ae8c3608d19260cd7445411865a485688711b78b5be70d78cd96136f83" +checksum = "e958897981290da2a852763fe9cdb89cd36977a5d729023127095fa94d95e2ff" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.32" +version = "4.5.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a7ef7f676155edfb82daa97f99441f3ebf4a58d5e32f295a56259f1b6facc8" +checksum = "83b0f35019843db2160b5bb19ae09b4e6411ac33fc6a712003c33e03090e2489" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 615fe4fd..1bc7de90 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.17" anyhow = "1.0.97" -clap = "4.5.32" +clap = "4.5.34" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From cdb2cef9c9ae9e5cfd8f5463697b71b871d6816c Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 26 Mar 2025 15:53:18 +0100 Subject: [PATCH 151/193] implement metadata fetching methods on LlamaModel --- llama-cpp-2/src/lib.rs | 15 +++++++ llama-cpp-2/src/model.rs | 94 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index a1857950..3f3d7c00 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -82,6 +82,21 @@ pub enum ChatTemplateError { Utf8Error(#[from] std::str::Utf8Error), } +#[derive(Debug, Eq, PartialEq, thiserror::Error)] +pub enum MetaValError { + #[error("model does not have metadata key: {0}")] + MissingKey(String), + + #[error("null byte in string {0}")] + NullError(#[from] NulError), + + #[error("FromUtf8Error {0}")] + FromUtf8Error(#[from] FromUtf8Error), + + #[error("Negative return value. Likely due to a missing index or key. 
Got return value: {0}")] + NegativeReturn(i32), +} + /// Failed to Load context #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum LlamaContextLoadError { diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs index cb4a33bd..b8cd26bb 100644 --- a/llama-cpp-2/src/model.rs +++ b/llama-cpp-2/src/model.rs @@ -13,8 +13,9 @@ use crate::model::params::LlamaModelParams; use crate::token::LlamaToken; use crate::token_type::{LlamaTokenAttr, LlamaTokenAttrs}; use crate::{ - ApplyChatTemplateError, ChatTemplateError, LlamaContextLoadError, LlamaLoraAdapterInitError, - LlamaModelLoadError, NewLlamaChatMessageError, StringToTokenError, TokenToStringError, + ApplyChatTemplateError, ChatTemplateError, LlamaContextLoadError, + LlamaLoraAdapterInitError, LlamaModelLoadError, MetaValError, NewLlamaChatMessageError, + StringToTokenError, TokenToStringError, }; pub mod params; @@ -490,6 +491,59 @@ impl LlamaModel { u32::try_from(unsafe { llama_cpp_sys_2::llama_model_n_head_kv(self.model.as_ptr()) }).unwrap() } + /// Get metadata value as a string by key name + pub fn meta_val_str(&self, key: &str) -> Result { + let key_cstring = CString::new(key)?; + let key_ptr = key_cstring.as_ptr(); + + extract_meta_string( + |buf_ptr, buf_len| unsafe { + llama_cpp_sys_2::llama_model_meta_val_str( + self.model.as_ptr(), + key_ptr, + buf_ptr, + buf_len, + ) + }, + 256, + ) + } + + /// Get the number of metadata key/value pairs + pub fn meta_count(&self) -> i32 { + unsafe { llama_cpp_sys_2::llama_model_meta_count(self.model.as_ptr()) } + } + + /// Get metadata key name by index + pub fn meta_key_by_index(&self, index: i32) -> Result { + extract_meta_string( + |buf_ptr, buf_len| unsafe { + llama_cpp_sys_2::llama_model_meta_key_by_index( + self.model.as_ptr(), + index, + buf_ptr, + buf_len, + ) + }, + 256, + ) + } + + /// Get metadata value as a string by index + pub fn meta_val_str_by_index(&self, index: i32) -> Result { + extract_meta_string( + |buf_ptr, buf_len| unsafe { + llama_cpp_sys_2::llama_model_meta_val_str_by_index( + self.model.as_ptr(), + index, + buf_ptr, + buf_len, + ) + }, + 256, + ) + } + /// Returns the rope type of the model. pub fn rope_type(&self) -> Option { match unsafe { llama_cpp_sys_2::llama_model_rope_type(self.model.as_ptr()) } { @@ -690,6 +744,42 @@ impl LlamaModel { } } +/// Generic helper function for extracting string values from the C API +/// This are specifically useful for the the metadata functions, where we pass in a buffer +/// to be populated by a string, not yet knowing if the buffer is large enough. +/// If the buffer was not large enough, we get the correct length back, which can be used to +/// construct a buffer of appropriate size. +fn extract_meta_string(c_function: F, capacity: usize) -> Result +where + F: Fn(*mut c_char, usize) -> i32, +{ + let mut buffer = vec![0u8; capacity]; + + // call the foreign function + let result = c_function(buffer.as_mut_ptr() as *mut c_char, buffer.len()); + if result < 0 { + return Err(MetaValError::NegativeReturn(result)); + } + + // check if the response fit in our buffer + let returned_len = result as usize; + if returned_len >= capacity { + // buffer wasn't large enough, try again with the correct capacity. + return extract_meta_string(c_function, returned_len + 1); + } + + // verify null termination + debug_assert_eq!( + buffer.get(returned_len), + Some(&0), + "should end with null byte" + ); + + // resize, convert, and return + buffer.truncate(returned_len); + Ok(String::from_utf8(buffer)?) 
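// A usage sketch for the metadata getters added above (assumptions: `model` is a loaded
// `LlamaModel`; the helper name `dump_metadata` is illustrative; "general.architecture"
// is a standard GGUF key used only as an example). A key or index that does not exist
// surfaces as `MetaValError::NegativeReturn`.
fn dump_metadata(model: &LlamaModel) -> Result<(), MetaValError> {
    for i in 0..model.meta_count() {
        let key = model.meta_key_by_index(i)?;
        let val = model.meta_val_str_by_index(i)?;
        println!("{key} = {val}");
    }
    // Single lookup by key:
    let arch = model.meta_val_str("general.architecture")?;
    println!("architecture: {arch}");
    Ok(())
}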
+} + impl Drop for LlamaModel { fn drop(&mut self) { unsafe { llama_cpp_sys_2::llama_free_model(self.model.as_ptr()) } From c33b4a59b0c4f61d6f911b8cc1a400f0971c8e4a Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 26 Mar 2025 16:43:32 +0100 Subject: [PATCH 152/193] doc comments on MetaValError, remove unused variant --- llama-cpp-2/src/lib.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 3f3d7c00..1ffbca2b 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -82,17 +82,18 @@ pub enum ChatTemplateError { Utf8Error(#[from] std::str::Utf8Error), } +/// Failed fetching metadata value #[derive(Debug, Eq, PartialEq, thiserror::Error)] pub enum MetaValError { - #[error("model does not have metadata key: {0}")] - MissingKey(String), - + /// The provided string contains an unexpected null-byte #[error("null byte in string {0}")] NullError(#[from] NulError), + /// The returned data contains invalid UTF8 data #[error("FromUtf8Error {0}")] FromUtf8Error(#[from] FromUtf8Error), + /// Got negative return value. This happens if the key or index queried does not exist. #[error("Negative return value. Likely due to a missing index or key. Got return value: {0}")] NegativeReturn(i32), } From 5369ff22cf66d74b9bcb47c8c970cb516dfa2329 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 4 Apr 2025 20:54:56 +0000 Subject: [PATCH 153/193] chore(deps): bump openssl from 0.10.70 to 0.10.72 Bumps [openssl](https://github.com/sfackler/rust-openssl) from 0.10.70 to 0.10.72. - [Release notes](https://github.com/sfackler/rust-openssl/releases) - [Commits](https://github.com/sfackler/rust-openssl/compare/openssl-v0.10.70...openssl-v0.10.72) --- updated-dependencies: - dependency-name: openssl dependency-version: 0.10.72 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1e0f2685..25a5d202 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -762,9 +762,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.70" +version = "0.10.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" +checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da" dependencies = [ "bitflags", "cfg-if", @@ -794,9 +794,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.105" +version = "0.9.107" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" +checksum = "8288979acd84749c744a9014b4382d42b8f7b2592847b5afb2ed29e5d16ede07" dependencies = [ "cc", "libc", From 26a3ec79ffab91d45a6df1875d02ffc716a32396 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Apr 2025 06:16:05 +0000 Subject: [PATCH 154/193] chore(deps): bump clap from 4.5.34 to 4.5.35 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.34 to 4.5.35. 
- [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.34...clap_complete-v4.5.35) --- updated-dependencies: - dependency-name: clap dependency-version: 4.5.35 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 25a5d202..7205fa1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.34" +version = "4.5.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e958897981290da2a852763fe9cdb89cd36977a5d729023127095fa94d95e2ff" +checksum = "d8aa86934b44c19c50f87cc2790e19f54f7a67aedb64101c2e1a2e5ecfb73944" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.34" +version = "4.5.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83b0f35019843db2160b5bb19ae09b4e6411ac33fc6a712003c33e03090e2489" +checksum = "2414dbb2dd0695280da6ea9261e327479e9d37b0630f6b53ba2a11c60c679fd9" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 1bc7de90..5cedd4e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.17" anyhow = "1.0.97" -clap = "4.5.34" +clap = "4.5.35" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From c9cbb3dc45af7a60c2420e9fa6b8aa3cfae1bcfc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Apr 2025 06:16:29 +0000 Subject: [PATCH 155/193] chore(deps): bump cc from 1.2.17 to 1.2.18 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.17 to 1.2.18. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.17...cc-v1.2.18) --- updated-dependencies: - dependency-name: cc dependency-version: 1.2.18 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 25a5d202..0972b987 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.17" +version = "1.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fcb57c740ae1daf453ae85f16e37396f672b039e00d9d866e07ddb24e328e3a" +checksum = "525046617d8376e3db1deffb079e91cef90a89fc3ca5c185bbf8c9ecdd15cd5c" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 1bc7de90..bcfa119c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.17" +cc = "1.2.18" anyhow = "1.0.97" clap = "4.5.34" encoding_rs = "0.8.35" From 2641a6d1cf77f5813fc810e4a5866a86ec7b9784 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 14 Apr 2025 05:40:09 +0000 Subject: [PATCH 156/193] chore(deps): bump cc from 1.2.18 to 1.2.19 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.18 to 1.2.19. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.18...cc-v1.2.19) --- updated-dependencies: - dependency-name: cc dependency-version: 1.2.19 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e03e19a..de6766c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.18" +version = "1.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "525046617d8376e3db1deffb079e91cef90a89fc3ca5c185bbf8c9ecdd15cd5c" +checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 4d5cf748..05031bca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.18" +cc = "1.2.19" anyhow = "1.0.97" clap = "4.5.35" encoding_rs = "0.8.35" From 1fd0bd20a3c724ad5a925c2bd7d3030f66cdb620 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 14 Apr 2025 05:40:21 +0000 Subject: [PATCH 157/193] chore(deps): bump clap from 4.5.35 to 4.5.36 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.35 to 4.5.36. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.35...clap_complete-v4.5.36) --- updated-dependencies: - dependency-name: clap dependency-version: 4.5.36 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e03e19a..c4c708e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.35" +version = "4.5.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8aa86934b44c19c50f87cc2790e19f54f7a67aedb64101c2e1a2e5ecfb73944" +checksum = "2df961d8c8a0d08aa9945718ccf584145eee3f3aa06cddbeac12933781102e04" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.35" +version = "4.5.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2414dbb2dd0695280da6ea9261e327479e9d37b0630f6b53ba2a11c60c679fd9" +checksum = "132dbda40fb6753878316a489d5a1242a8ef2f0d9e47ba01c951ea8aa7d013a5" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 4d5cf748..6b527cd8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.18" anyhow = "1.0.97" -clap = "4.5.35" +clap = "4.5.36" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 9657f021dfe64629838c0e61d1fd19322e52fc82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 14 Apr 2025 15:18:08 +0000 Subject: [PATCH 158/193] chore(deps): bump anyhow from 1.0.97 to 1.0.98 Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.97 to 1.0.98. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.97...1.0.98) --- updated-dependencies: - dependency-name: anyhow dependency-version: 1.0.98 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ffdd81da..e2e7bf63 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,9 +68,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.97" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" [[package]] name = "base64" diff --git a/Cargo.toml b/Cargo.toml index cab5fce6..656fce3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.19" -anyhow = "1.0.97" +anyhow = "1.0.98" clap = "4.5.36" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 7c0a01a813241c1770af9acabbda31595ebbcc87 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 30 Apr 2025 18:16:44 +0200 Subject: [PATCH 159/193] LLAMA_CURL=OFF --- llama-cpp-sys-2/build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index e2a56b38..5b39722d 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -267,6 +267,7 @@ fn main() { config.define("LLAMA_BUILD_TESTS", "OFF"); config.define("LLAMA_BUILD_EXAMPLES", "OFF"); config.define("LLAMA_BUILD_SERVER", "OFF"); + config.define("LLAMA_CURL", "OFF"); config.define( "BUILD_SHARED_LIBS", From e5082c1fa8fdf45a2ca42cabeb1104de9284b408 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 30 Apr 2025 18:24:09 +0200 Subject: [PATCH 160/193] update llama.cpp to b5233 --- llama-cpp-sys-2/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 2b65ae30..ceda28ef 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 2b65ae30299b9c67e25c51ee567e9a2ef22279ab +Subproject commit ceda28ef8e310a8dee60bf275077a3eedae8e36c From 5d0ea24854db06d6b13385224282f0580bb14819 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Thu, 1 May 2025 13:23:43 +0200 Subject: [PATCH 161/193] add cmake to test build dockerfile --- test-build.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-build.Dockerfile b/test-build.Dockerfile index 8540d2f9..daa3a709 100644 --- a/test-build.Dockerfile +++ b/test-build.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} as base-cuda # Install requirements for rustup install + bindgen: https://rust-lang.github.io/rust-bindgen/requirements.html -RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev +RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev cmake RUN curl https://sh.rustup.rs -sSf | bash -s -- -y ENV PATH=/root/.cargo/bin:$PATH From 496c3685417495caa8c7b73a87939a50ee92d9f5 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Thu, 1 May 2025 14:53:52 +0200 Subject: [PATCH 162/193] add git to dockerfile --- test-build.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-build.Dockerfile b/test-build.Dockerfile index daa3a709..ca017457 100644 --- a/test-build.Dockerfile +++ b/test-build.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM 
nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} as base-cuda # Install requirements for rustup install + bindgen: https://rust-lang.github.io/rust-bindgen/requirements.html -RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev cmake +RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev cmake git RUN curl https://sh.rustup.rs -sSf | bash -s -- -y ENV PATH=/root/.cargo/bin:$PATH From ddd380a8ef61577bbc1a96015343888d5e8846d4 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Thu, 1 May 2025 14:54:24 +0200 Subject: [PATCH 163/193] change casing of 'AS' to fix docker build warning --- test-build.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test-build.Dockerfile b/test-build.Dockerfile index ca017457..383e0973 100644 --- a/test-build.Dockerfile +++ b/test-build.Dockerfile @@ -1,6 +1,6 @@ ARG CUDA_VERSION=12.3.1 ARG UBUNTU_VERSION=22.04 -FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} as base-cuda +FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS base-cuda # Install requirements for rustup install + bindgen: https://rust-lang.github.io/rust-bindgen/requirements.html RUN DEBIAN_FRONTEND=noninteractive apt update -y && apt install -y curl llvm-dev libclang-dev clang pkg-config libssl-dev cmake git @@ -10,7 +10,7 @@ ENV PATH=/root/.cargo/bin:$PATH COPY . . RUN cargo build --bin simple --features cuda -FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} as base-cuda-runtime +FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS base-cuda-runtime COPY --from=base-cuda /target/debug/simple /usr/local/bin/simple From 9f7bd2503bfd059e178f3cba6e29d35507c15d24 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Thu, 1 May 2025 17:46:50 +0200 Subject: [PATCH 164/193] add llama backend init to the doc tests that depend on it --- llama-cpp-2/src/lib.rs | 4 ++++ llama-cpp-2/src/sampling.rs | 2 ++ 2 files changed, 6 insertions(+) diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs index 1ffbca2b..f2ac5313 100644 --- a/llama-cpp-2/src/lib.rs +++ b/llama-cpp-2/src/lib.rs @@ -217,6 +217,8 @@ pub enum LlamaLoraAdapterRemoveError { /// get the time (in microseconds) according to llama.cpp /// ``` /// # use llama_cpp_2::llama_time_us; +/// # use llama_cpp_2::llama_backend::LlamaBackend; +/// let backend = LlamaBackend::init().unwrap(); /// let time = llama_time_us(); /// assert!(time > 0); /// ``` @@ -311,6 +313,8 @@ pub enum ApplyChatTemplateError { /// /// ``` /// # use std::time::Duration; +/// # use llama_cpp_2::llama_backend::LlamaBackend; +/// let backend = LlamaBackend::init().unwrap(); /// use llama_cpp_2::ggml_time_us; /// /// let start = ggml_time_us(); diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs index a659ab73..96feb402 100644 --- a/llama-cpp-2/src/sampling.rs +++ b/llama-cpp-2/src/sampling.rs @@ -117,6 +117,8 @@ impl LlamaSampler { /// data_array::LlamaTokenDataArray /// }; /// use llama_cpp_2::sampling::LlamaSampler; + /// use llama_cpp_2::llama_backend::LlamaBackend; + /// let backend = LlamaBackend::init().unwrap(); /// /// let mut data_array = LlamaTokenDataArray::new(vec![ /// LlamaTokenData::new(LlamaToken(0), 0., 0.), From eed54cb07b6dc0c82ce68c8e11b541ab0875176f Mon Sep 17 00:00:00 2001 From: Dennis Keck <26092524+fellhorn@users.noreply.github.com> Date: Fri, 2 May 2025 09:50:22 +0200 
Subject: [PATCH 165/193] Fix common/minja missing in cargo publish --- llama-cpp-sys-2/Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 6db13f98..4fd842ee 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -12,9 +12,9 @@ include = [ "build.rs", "/src", - "/llama.cpp/common/*.h", - "/llama.cpp/common/*.hpp", - "/llama.cpp/common/*.cpp", + "/llama.cpp/common/**/*.h", + "/llama.cpp/common/**/*.hpp", + "/llama.cpp/common/**/*.cpp", "/llama.cpp/ggml/include/*.h", "/llama.cpp/ggml/src/*.h", "/llama.cpp/ggml/src/*.c", @@ -88,4 +88,4 @@ vulkan = [] native = [] openmp = [] # Only has an impact on Android. -shared-stdcxx = [] \ No newline at end of file +shared-stdcxx = [] From 1587e0c48489dbd0455439752226c732296f9782 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sat, 3 May 2025 16:02:42 +0000 Subject: [PATCH 166/193] Bump version to 0.1.104 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2e7bf63..b09b1d55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.103" +version = "0.1.104" dependencies = [ "anyhow", "clap", @@ -662,7 +662,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.103" +version = "0.1.104" dependencies = [ "encoding_rs", "enumflags2", @@ -674,7 +674,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.103" +version = "0.1.104" dependencies = [ "bindgen", "cc", @@ -1114,7 +1114,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.103" +version = "0.1.104" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index c3c5b533..beeaa170 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.103" +version = "0.1.104" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 98867c57..bc628993 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.103" +version = "0.1.104" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index bf52467c..8bb61458 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.103" +version = "0.1.104" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 4fd842ee..fb4f53ed 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.103" +version = "0.1.104" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 
fd06d2183d05691349fc118d17cfeaeb7af11b1b Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sat, 3 May 2025 16:04:46 +0000 Subject: [PATCH 167/193] Bump version to 0.1.105 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b09b1d55..176491bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.104" +version = "0.1.105" dependencies = [ "anyhow", "clap", @@ -662,7 +662,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.104" +version = "0.1.105" dependencies = [ "encoding_rs", "enumflags2", @@ -674,7 +674,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.104" +version = "0.1.105" dependencies = [ "bindgen", "cc", @@ -1114,7 +1114,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.104" +version = "0.1.105" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index beeaa170..3387baf7 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.104" +version = "0.1.105" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index bc628993..c088180a 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.104" +version = "0.1.105" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 8bb61458..47b77512 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.104" +version = "0.1.105" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index fb4f53ed..4ec679ea 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.104" +version = "0.1.105" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 955f8d6f5233c4df83bd0dd1613f4a9fa344b79c Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sat, 3 May 2025 16:06:59 +0000 Subject: [PATCH 168/193] Bump version to 0.1.106 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 176491bb..47fbdd05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.105" +version = "0.1.106" dependencies = [ "anyhow", "clap", @@ -662,7 +662,7 @@ checksum = 
"643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.105" +version = "0.1.106" dependencies = [ "encoding_rs", "enumflags2", @@ -674,7 +674,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.105" +version = "0.1.106" dependencies = [ "bindgen", "cc", @@ -1114,7 +1114,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.105" +version = "0.1.106" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 3387baf7..abe54e1e 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.105" +version = "0.1.106" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index c088180a..d70d82ae 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.105" +version = "0.1.106" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 47b77512..dc5fe012 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.105" +version = "0.1.106" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 4ec679ea..fecfdb7c 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.105" +version = "0.1.106" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From e417032897a8de07b615de35bdca4ca639557de9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 3 May 2025 19:36:27 +0000 Subject: [PATCH 169/193] chore(deps): bump clap from 4.5.36 to 4.5.37 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.36 to 4.5.37. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.36...clap_complete-v4.5.37) --- updated-dependencies: - dependency-name: clap dependency-version: 4.5.37 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 47fbdd05..9eb7f89e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.36" +version = "4.5.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2df961d8c8a0d08aa9945718ccf584145eee3f3aa06cddbeac12933781102e04" +checksum = "eccb054f56cbd38340b380d4a8e69ef1f02f1af43db2f0cc817a4774d80ae071" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.36" +version = "4.5.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "132dbda40fb6753878316a489d5a1242a8ef2f0d9e47ba01c951ea8aa7d013a5" +checksum = "efd9466fac8543255d3b1fcad4762c5e116ffe808c8a3043d4263cd4fd4862a2" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 656fce3d..ac92122a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.19" anyhow = "1.0.98" -clap = "4.5.36" +clap = "4.5.37" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 1c1f8885eca1408581e43c6cab69261f28de799b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 3 May 2025 19:36:30 +0000 Subject: [PATCH 170/193] chore(deps): bump cc from 1.2.19 to 1.2.20 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.19 to 1.2.20. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.19...cc-v1.2.20) --- updated-dependencies: - dependency-name: cc dependency-version: 1.2.20 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 47fbdd05..1a9bbcf2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.19" +version = "1.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" +checksum = "8691782945451c1c383942c4874dbe63814f61cb57ef773cda2972682b7bb3c0" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 656fce3d..043607b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.19" +cc = "1.2.21" anyhow = "1.0.98" clap = "4.5.36" encoding_rs = "0.8.35" From 99f5e5c95e968d7c11e848fb02621f5d8fe6abb2 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Sat, 3 May 2025 19:37:29 +0000 Subject: [PATCH 171/193] Bump version to 0.1.107 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 47fbdd05..2eaec111 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.106" +version = "0.1.107" dependencies = [ "anyhow", "clap", @@ -662,7 +662,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.106" +version = "0.1.107" dependencies = [ "encoding_rs", "enumflags2", @@ -674,7 +674,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.106" +version = "0.1.107" dependencies = [ "bindgen", "cc", @@ -1114,7 +1114,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.106" +version = "0.1.107" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index abe54e1e..2df63a2e 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.106" +version = "0.1.107" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index d70d82ae..540486ba 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.106" +version = "0.1.107" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index dc5fe012..67e2e118 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.106" +version = "0.1.107" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index fecfdb7c..f998598d 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" 
-version = "0.1.106" +version = "0.1.107" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 4f41ba6342cf9ad91362082569b4db4df86168a0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 05:48:34 +0000 Subject: [PATCH 172/193] chore(deps): bump clap from 4.5.37 to 4.5.38 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.37 to 4.5.38. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.37...clap_complete-v4.5.38) --- updated-dependencies: - dependency-name: clap dependency-version: 4.5.38 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 566171c4..838b5b7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.37" +version = "4.5.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eccb054f56cbd38340b380d4a8e69ef1f02f1af43db2f0cc817a4774d80ae071" +checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.37" +version = "4.5.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efd9466fac8543255d3b1fcad4762c5e116ffe808c8a3043d4263cd4fd4862a2" +checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 88925096..472760d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.21" anyhow = "1.0.98" -clap = "4.5.37" +clap = "4.5.38" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From 3597d01132a76c515f8b73f78c9ff7ca5d585068 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 05:48:44 +0000 Subject: [PATCH 173/193] chore(deps): bump cc from 1.2.21 to 1.2.22 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.21 to 1.2.22. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.21...cc-v1.2.22) --- updated-dependencies: - dependency-name: cc dependency-version: 1.2.22 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 566171c4..856f8763 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.21" +version = "1.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8691782945451c1c383942c4874dbe63814f61cb57ef773cda2972682b7bb3c0" +checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 88925096..f08a2969 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.21" +cc = "1.2.22" anyhow = "1.0.98" clap = "4.5.37" encoding_rs = "0.8.35" From 7e136733f8eb093797fc650f04dddb32f1c5c407 Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Sat, 17 May 2025 19:18:24 +0800 Subject: [PATCH 174/193] Fix MacOS build --- llama-cpp-sys-2/build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 5b39722d..d2aef770 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -237,6 +237,7 @@ fn main() { .header("wrapper.h") .clang_arg(format!("-I{}", llama_src.join("include").display())) .clang_arg(format!("-I{}", llama_src.join("ggml/include").display())) + .clang_arg(format!("--target={}", env::var("HOST").unwrap())) .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .derive_partialeq(true) .allowlist_function("ggml_.*") From b6048f05afed3775a1af21a16e38709437764c75 Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Sat, 17 May 2025 19:32:03 +0800 Subject: [PATCH 175/193] Use `TARGET` instead of `HOST` --- llama-cpp-sys-2/build.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index d2aef770..b2eb559c 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -232,12 +232,16 @@ fn main() { .to_string(), ); + let bindgen_target = env::var("TARGET").or_else(|_| { + env::var("HOST") + }).expect("Failed to get TARGET or HOST environment variable"); + // Bindings let bindings = bindgen::Builder::default() .header("wrapper.h") .clang_arg(format!("-I{}", llama_src.join("include").display())) .clang_arg(format!("-I{}", llama_src.join("ggml/include").display())) - .clang_arg(format!("--target={}", env::var("HOST").unwrap())) + .clang_arg(format!("--target={}", bindgen_target)) .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .derive_partialeq(true) .allowlist_function("ggml_.*") From b9993ffc740ec30eb26cec11a1d5625f13c2daf9 Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Sat, 17 May 2025 23:46:51 +0800 Subject: [PATCH 176/193] Use existing target --- llama-cpp-sys-2/build.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index b2eb559c..df654053 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -232,16 +232,12 @@ fn main() { .to_string(), ); - let bindgen_target = env::var("TARGET").or_else(|_| { - env::var("HOST") - }).expect("Failed to get TARGET or HOST environment variable"); - // Bindings let bindings = bindgen::Builder::default() .header("wrapper.h") .clang_arg(format!("-I{}", llama_src.join("include").display())) .clang_arg(format!("-I{}", 
llama_src.join("ggml/include").display())) - .clang_arg(format!("--target={}", bindgen_target)) + .clang_arg(format!("--target={}", target_triple)) .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .derive_partialeq(true) .allowlist_function("ggml_.*") @@ -315,10 +311,7 @@ fn main() { } else { config.define("ANDROID_PLATFORM", "android-28"); } - if target_triple.contains("aarch64") { - config.cflag("-march=armv8.7a"); - config.cxxflag("-march=armv8.7a"); - } else if target_triple.contains("armv7") { + if target_triple.contains("aarch64") || target_triple.contains("armv7") { config.cflag("-march=armv8.7a"); config.cxxflag("-march=armv8.7a"); } else if target_triple.contains("x86_64") { From 9012e88d599d23efe83c90c59ecb3ad92597b8ea Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 May 2025 05:24:18 +0000 Subject: [PATCH 177/193] chore(deps): bump cc from 1.2.22 to 1.2.23 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.22 to 1.2.23. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.22...cc-v1.2.23) --- updated-dependencies: - dependency-name: cc dependency-version: 1.2.23 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a4e4566b..3f7f3918 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.22" +version = "1.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1" +checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index f4e2fd98..f1b63014 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.22" +cc = "1.2.23" anyhow = "1.0.98" clap = "4.5.38" encoding_rs = "0.8.35" From 59864fd5b3c0e8226116972fed59c3b0158f214e Mon Sep 17 00:00:00 2001 From: Britt Lewis Date: Sat, 24 May 2025 13:22:29 -0400 Subject: [PATCH 178/193] remove deprecated kv view & rename kv cache -> kv self APIs * bump llama.cpp to b5474 (259469c), latest release as of 2025-05-24 - https://github.com/ggml-org/llama.cpp/commits/259469c4b57c1a32606353bcac52ba683424a990 --- llama-cpp-2/src/context/kv_cache.rs | 141 +++------------------------- llama-cpp-sys-2/llama.cpp | 2 +- 2 files changed, 12 insertions(+), 131 deletions(-) diff --git a/llama-cpp-2/src/context/kv_cache.rs b/llama-cpp-2/src/context/kv_cache.rs index d90a6b8a..14f5b5a6 100644 --- a/llama-cpp-2/src/context/kv_cache.rs +++ b/llama-cpp-2/src/context/kv_cache.rs @@ -28,7 +28,7 @@ impl LlamaContext<'_> { /// * `dest` - The sequence id to copy the cache to. /// * `size` - The size of the cache to copy. pub fn copy_cache(&mut self, src: i32, dest: i32, size: i32) { - unsafe { llama_cpp_sys_2::llama_kv_cache_seq_cp(self.context.as_ptr(), src, dest, 0, size) } + unsafe { llama_cpp_sys_2::llama_kv_self_seq_cp(self.context.as_ptr(), src, dest, 0, size) } } /// Copy the cache from one sequence to another. 
@@ -58,7 +58,7 @@ impl LlamaContext<'_> { .map_or(Ok(-1), i32::try_from) .map_err(KvCacheConversionError::P1TooLarge)?; unsafe { - llama_cpp_sys_2::llama_kv_cache_seq_cp(self.context.as_ptr(), src, dest, p0, p1); + llama_cpp_sys_2::llama_kv_self_seq_cp(self.context.as_ptr(), src, dest, p0, p1); } Ok(()) } @@ -92,18 +92,18 @@ impl LlamaContext<'_> { let p1 = p1 .map_or(Ok(-1), i32::try_from) .map_err(KvCacheConversionError::P1TooLarge)?; - Ok(unsafe { llama_cpp_sys_2::llama_kv_cache_seq_rm(self.context.as_ptr(), src, p0, p1) }) + Ok(unsafe { llama_cpp_sys_2::llama_kv_self_seq_rm(self.context.as_ptr(), src, p0, p1) }) } /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them) #[must_use] pub fn get_kv_cache_used_cells(&self) -> i32 { - unsafe { llama_cpp_sys_2::llama_get_kv_cache_used_cells(self.context.as_ptr()) } + unsafe { llama_cpp_sys_2::llama_kv_self_used_cells(self.context.as_ptr()) } } /// Clear the KV cache pub fn clear_kv_cache(&mut self) { - unsafe { llama_cpp_sys_2::llama_kv_cache_clear(self.context.as_ptr()) } + unsafe { llama_cpp_sys_2::llama_kv_self_clear(self.context.as_ptr()) } } /// Removes all tokens that do not belong to the specified sequence @@ -112,7 +112,7 @@ impl LlamaContext<'_> { /// /// * `seq_id` - The sequence id to keep pub fn llama_kv_cache_seq_keep(&mut self, seq_id: i32) { - unsafe { llama_cpp_sys_2::llama_kv_cache_seq_keep(self.context.as_ptr(), seq_id) } + unsafe { llama_cpp_sys_2::llama_kv_self_seq_keep(self.context.as_ptr(), seq_id) } } #[allow(clippy::doc_markdown)] @@ -147,7 +147,7 @@ impl LlamaContext<'_> { .map_or(Ok(-1), i32::try_from) .map_err(KvCacheConversionError::P1TooLarge)?; unsafe { - llama_cpp_sys_2::llama_kv_cache_seq_add(self.context.as_ptr(), seq_id, p0, p1, delta); + llama_cpp_sys_2::llama_kv_self_seq_add(self.context.as_ptr(), seq_id, p0, p1, delta); } Ok(()) } @@ -183,7 +183,7 @@ impl LlamaContext<'_> { .map_or(Ok(-1), i32::try_from) .map_err(KvCacheConversionError::P1TooLarge)?; let d = c_int::from(d.get()); - unsafe { llama_cpp_sys_2::llama_kv_cache_seq_div(self.context.as_ptr(), seq_id, p0, p1, d) } + unsafe { llama_cpp_sys_2::llama_kv_self_seq_div(self.context.as_ptr(), seq_id, p0, p1, d) } Ok(()) } @@ -194,7 +194,7 @@ impl LlamaContext<'_> { /// * `seq_id` - The sequence id to get the max position for #[must_use] pub fn kv_cache_seq_pos_max(&self, seq_id: i32) -> i32 { - unsafe { llama_cpp_sys_2::llama_kv_cache_seq_pos_max(self.context.as_ptr(), seq_id) } + unsafe { llama_cpp_sys_2::llama_kv_self_seq_pos_max(self.context.as_ptr(), seq_id) } } /// Defragment the KV cache @@ -202,130 +202,11 @@ impl LlamaContext<'_> { /// - lazily on next [`LlamaContext::decode`] /// - explicitly with [`Self::kv_cache_update`] pub fn kv_cache_defrag(&mut self) { - unsafe { llama_cpp_sys_2::llama_kv_cache_defrag(self.context.as_ptr()) } + unsafe { llama_cpp_sys_2::llama_kv_self_defrag(self.context.as_ptr()) } } /// Apply the KV cache updates (such as K-shifts, defragmentation, etc.) pub fn kv_cache_update(&mut self) { - unsafe { llama_cpp_sys_2::llama_kv_cache_update(self.context.as_ptr()) } - } - - /// Returns the number of tokens in the KV cache (slow, use only for debug) - /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times - #[must_use] - pub fn get_kv_cache_token_count(&self) -> i32 { - unsafe { llama_cpp_sys_2::llama_get_kv_cache_token_count(self.context.as_ptr()) } - } - - /// Create an empty KV cache view. 
(use only for debugging purposes) - /// - /// # Parameters - /// - /// * `n_max_seq` - Maximum number of sequences that can exist in a cell. It's not an error - /// if there are more sequences in a cell than this value, however they will - /// not be visible in the view `cells_sequences`. - #[must_use] - pub fn new_kv_cache_view(&self, n_max_seq: i32) -> KVCacheView { - let view = - unsafe { llama_cpp_sys_2::llama_kv_cache_view_init(self.context.as_ptr(), n_max_seq) }; - KVCacheView { view, ctx: self } - } -} - -/// Information associated with an individual cell in the KV cache view. -#[derive(Debug)] -pub struct KVCacheViewCell { - /// The position for this cell. Takes KV cache shifts into account. - /// May be negative if the cell is not populated. - pub pos: llama_cpp_sys_2::llama_pos, -} - -/// An updateable view of the KV cache. (use only for debugging purposes) -#[derive(Debug)] -pub struct KVCacheView<'a> { - ctx: &'a LlamaContext<'a>, - view: llama_cpp_sys_2::llama_kv_cache_view, -} - -impl KVCacheView<'_> { - /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) - pub fn update(&mut self) { - unsafe { - llama_cpp_sys_2::llama_kv_cache_view_update(self.ctx.context.as_ptr(), &mut self.view); - } - } - - /// Number of KV cache cells. This will be the same as the context size. - #[must_use] - pub fn n_cells(&self) -> i32 { - self.view.n_cells - } - - /// Number of tokens in the cache. For example, if there are two populated - /// cells, the first with 1 sequence id in it and the second with 2 sequence - /// ids then you'll have 3 tokens. - #[must_use] - pub fn token_count(&self) -> i32 { - self.view.token_count - } - - /// Number of populated cache cells. - #[must_use] - pub fn used_cells(&self) -> i32 { - self.view.used_cells - } - - /// Maximum contiguous empty slots in the cache. - #[must_use] - pub fn max_contiguous(&self) -> i32 { - self.view.max_contiguous - } - - /// Index to the start of the `max_contiguous` slot range. Can be negative - /// when cache is full. - #[must_use] - pub fn max_contiguous_idx(&self) -> i32 { - self.view.max_contiguous_idx - } - - /// Information for individual cells. - /// - /// # Panics - /// - /// - if `n_cells` does not fit into usize. - pub fn cells(&self) -> impl Iterator { - unsafe { - std::slice::from_raw_parts( - self.view.cells, - usize::try_from(self.view.n_cells).expect("failed to fit n_cells into usize"), - ) - } - .iter() - .map(|&cell| KVCacheViewCell { pos: cell.pos }) - } - - /// The sequences for each cell. There will be `n_max_seq` items per cell. - /// - /// # Panics - /// - /// - if `n_cells * n_max_seq` does not fit into usize. - /// - if `n_max_seq` does not fit into usize. 
- pub fn cells_sequences(&self) -> impl Iterator { - unsafe { - std::slice::from_raw_parts( - self.view.cells_sequences, - usize::try_from(self.view.n_cells * self.view.n_seq_max) - .expect("failed to fit n_cells * n_max_seq into usize"), - ) - } - .chunks(usize::try_from(self.view.n_seq_max).expect("failed to fit n_max_seq into usize")) - } -} - -impl Drop for KVCacheView<'_> { - fn drop(&mut self) { - unsafe { - llama_cpp_sys_2::llama_kv_cache_view_free(&mut self.view); - } + unsafe { llama_cpp_sys_2::llama_kv_self_update(self.context.as_ptr()) } } } diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index ceda28ef..259469c4 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit ceda28ef8e310a8dee60bf275077a3eedae8e36c +Subproject commit 259469c4b57c1a32606353bcac52ba683424a990 From b0839c391ebbb74efda2d2852603f595c94e7ff3 Mon Sep 17 00:00:00 2001 From: Britt Lewis Date: Sat, 24 May 2025 13:24:37 -0400 Subject: [PATCH 179/193] update llama.cpp org-ref --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 625b54c7..0dfa7e0d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "llama-cpp-sys-2/llama.cpp"] path = llama-cpp-sys-2/llama.cpp - url = https://github.com/ggerganov/llama.cpp + url = https://github.com/ggml-org/llama.cpp From f8d986b3f699e04b3fba3a1da7dadb9c9773fc62 Mon Sep 17 00:00:00 2001 From: Britt Lewis Date: Sat, 24 May 2025 14:44:19 -0400 Subject: [PATCH 180/193] disable building tools post upstream reorganization * https://github.com/ggml-org/llama.cpp/pull/13249 --- llama-cpp-sys-2/build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index df654053..156eb4b4 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -268,6 +268,7 @@ fn main() { config.define("LLAMA_BUILD_TESTS", "OFF"); config.define("LLAMA_BUILD_EXAMPLES", "OFF"); config.define("LLAMA_BUILD_SERVER", "OFF"); + config.define("LLAMA_BUILD_TOOLS", "OFF"); config.define("LLAMA_CURL", "OFF"); config.define( From ff4784e62db6fe15446f325d430a876454b3ec0e Mon Sep 17 00:00:00 2001 From: Britt Lewis Date: Sat, 24 May 2025 14:45:34 -0400 Subject: [PATCH 181/193] cargo fmt in build.rs --- llama-cpp-sys-2/build.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 156eb4b4..f545ff9a 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -280,7 +280,11 @@ fn main() { config.define("GGML_BLAS", "OFF"); } - if (matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc)) && matches!(profile.as_str(), "Release" | "RelWithDebInfo" | "MinSizeRel")) + if (matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc)) + && matches!( + profile.as_str(), + "Release" | "RelWithDebInfo" | "MinSizeRel" + )) { // Debug Rust builds under MSVC turn off optimization even though we're ideally building the release profile of llama.cpp. // Looks like an upstream bug: From 9bd1cc40c2ef2fc3e1e52e68f6a905c2ca9541cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 May 2025 05:53:17 +0000 Subject: [PATCH 182/193] chore(deps): bump cc from 1.2.23 to 1.2.24 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.23 to 1.2.24. 
- [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.23...cc-v1.2.24) --- updated-dependencies: - dependency-name: cc dependency-version: 1.2.24 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f7f3918..dbb23a13 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.23" +version = "1.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766" +checksum = "16595d3be041c03b09d08d0858631facccee9221e579704070e6e9e4915d3bc7" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index f1b63014..91a7e87a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.23" +cc = "1.2.24" anyhow = "1.0.98" clap = "4.5.38" encoding_rs = "0.8.35" From 57afeb6c436eab973bfff25c898950c41a0fd0ee Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 26 May 2025 21:04:55 +0000 Subject: [PATCH 183/193] Bump version to 0.1.108 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dbb23a13..dae55148 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.107" +version = "0.1.108" dependencies = [ "anyhow", "clap", @@ -662,7 +662,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.107" +version = "0.1.108" dependencies = [ "encoding_rs", "enumflags2", @@ -674,7 +674,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.107" +version = "0.1.108" dependencies = [ "bindgen", "cc", @@ -1114,7 +1114,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.107" +version = "0.1.108" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 2df63a2e..21013e3d 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.107" +version = "0.1.108" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index 540486ba..dbf3b965 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.107" +version = "0.1.108" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 67e2e118..3ef28505 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.107" +version = "0.1.108" 
edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index f998598d..8ce03271 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" -version = "0.1.107" +version = "0.1.108" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From af7b3f33458036ac98425b2352abc7b49b46de61 Mon Sep 17 00:00:00 2001 From: Russell Wong Date: Tue, 27 May 2025 18:29:51 +0930 Subject: [PATCH 184/193] fixed llama.cpp build issue on ARM (Apple aarch64) --- llama-cpp-sys-2/build.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index f545ff9a..8e94e81d 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -336,6 +336,16 @@ fn main() { } } + if matches!(target_os, TargetOs::Linux) + && target_triple.contains("aarch64") + && !env::var(format!("CARGO_FEATURE_{}", "native".to_uppercase())).is_ok() + { + // If the native feature is not enabled, we take off the native ARM64 support. + // It is useful in docker environments where the native feature is not enabled. + config.define("GGML_NATIVE", "OFF"); + config.define("GGML_CPU_ARM_ARCH", "armv8-a"); + } + if cfg!(feature = "vulkan") { config.define("GGML_VULKAN", "ON"); match target_os { From 2ad42a81baa437ce7bb145d31a5860a740220f4f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Jun 2025 06:42:17 +0000 Subject: [PATCH 185/193] chore(deps): bump clap from 4.5.38 to 4.5.39 Bumps [clap](https://github.com/clap-rs/clap) from 4.5.38 to 4.5.39. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/clap_complete-v4.5.38...clap_complete-v4.5.39) --- updated-dependencies: - dependency-name: clap dependency-version: 4.5.39 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dae55148..38e0c30b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.38" +version = "4.5.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" +checksum = "fd60e63e9be68e5fb56422e397cf9baddded06dae1d2e523401542383bc72a9f" dependencies = [ "clap_builder", "clap_derive", @@ -156,9 +156,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.38" +version = "4.5.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" +checksum = "89cc6392a1f72bbeb820d71f32108f61fdaf18bc526e1d23954168a67759ef51" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 91a7e87a..c608eed2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ pprof = "0.13.0" bindgen = "0.69.5" cc = "1.2.24" anyhow = "1.0.98" -clap = "4.5.38" +clap = "4.5.39" encoding_rs = "0.8.35" tracing-subscriber = { version = "0.3", features = ["json"] } From ff82d97b5e2c8a6a53d843eda38443ae993198f9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Jun 2025 06:44:04 +0000 Subject: [PATCH 186/193] chore(deps): bump cc from 1.2.24 to 1.2.25 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.24 to 1.2.25. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.24...cc-v1.2.25) --- updated-dependencies: - dependency-name: cc dependency-version: 1.2.25 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dae55148..5d9c1244 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.24" +version = "1.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16595d3be041c03b09d08d0858631facccee9221e579704070e6e9e4915d3bc7" +checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 91a7e87a..f954c609 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.24" +cc = "1.2.25" anyhow = "1.0.98" clap = "4.5.38" encoding_rs = "0.8.35" From e83fa543edba8ca50e4d9ae38ec02ebb78de9800 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 5 Jun 2025 19:22:02 +0000 Subject: [PATCH 187/193] Bump version to 0.1.109 [skip ci] --- Cargo.lock | 8 ++++---- examples/embeddings/Cargo.toml | 2 +- examples/simple/Cargo.toml | 2 +- llama-cpp-2/Cargo.toml | 2 +- llama-cpp-sys-2/Cargo.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d1e4dcce..dbb9e394 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,7 +277,7 @@ checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "embeddings" -version = "0.1.108" +version = "0.1.109" dependencies = [ "anyhow", "clap", @@ -662,7 +662,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" [[package]] name = "llama-cpp-2" -version = "0.1.108" +version = "0.1.109" dependencies = [ "encoding_rs", "enumflags2", @@ -674,7 +674,7 @@ dependencies = [ [[package]] name = "llama-cpp-sys-2" -version = "0.1.108" +version = "0.1.109" dependencies = [ "bindgen", "cc", @@ -1114,7 +1114,7 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simple" -version = "0.1.108" +version = "0.1.109" dependencies = [ "anyhow", "clap", diff --git a/examples/embeddings/Cargo.toml b/examples/embeddings/Cargo.toml index 21013e3d..eb993289 100644 --- a/examples/embeddings/Cargo.toml +++ b/examples/embeddings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embeddings" -version = "0.1.108" +version = "0.1.109" edition = "2021" [dependencies] diff --git a/examples/simple/Cargo.toml b/examples/simple/Cargo.toml index dbf3b965..28d0ee6d 100644 --- a/examples/simple/Cargo.toml +++ b/examples/simple/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "simple" -version = "0.1.108" +version = "0.1.109" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml index 3ef28505..fb2f9f57 100644 --- a/llama-cpp-2/Cargo.toml +++ b/llama-cpp-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-2" description = "llama.cpp bindings for Rust" -version = "0.1.108" +version = "0.1.109" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 8ce03271..3dd9b94e 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "llama-cpp-sys-2" description = "Low Level Bindings to llama.cpp" 
-version = "0.1.108" +version = "0.1.109" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/utilityai/llama-cpp-rs" From 46b42a8180a9d65fe3a0d7399cba13bed77998fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Jun 2025 06:22:45 +0000 Subject: [PATCH 188/193] chore(deps): bump cc from 1.2.25 to 1.2.26 Bumps [cc](https://github.com/rust-lang/cc-rs) from 1.2.25 to 1.2.26. - [Release notes](https://github.com/rust-lang/cc-rs/releases) - [Changelog](https://github.com/rust-lang/cc-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-lang/cc-rs/compare/cc-v1.2.25...cc-v1.2.26) --- updated-dependencies: - dependency-name: cc dependency-version: 1.2.26 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dbb9e394..24670a16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,9 +109,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.2.25" +version = "1.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951" +checksum = "956a5e21988b87f372569b66183b78babf23ebc2e744b733e4350a752c4dafac" dependencies = [ "jobserver", "libc", diff --git a/Cargo.toml b/Cargo.toml index 0a2912d0..047fac3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ hf-hub = { version = "0.3.2" } criterion = "0.5.1" pprof = "0.13.0" bindgen = "0.69.5" -cc = "1.2.25" +cc = "1.2.26" anyhow = "1.0.98" clap = "4.5.39" encoding_rs = "0.8.35" From 04f6407fbc6e39c8a06b787b76a80b9bfafdf509 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Jun 2025 06:26:02 +0000 Subject: [PATCH 189/193] chore(deps): bump tracing-core from 0.1.33 to 0.1.34 Bumps [tracing-core](https://github.com/tokio-rs/tracing) from 0.1.33 to 0.1.34. - [Release notes](https://github.com/tokio-rs/tracing/releases) - [Commits](https://github.com/tokio-rs/tracing/compare/tracing-core-0.1.33...tracing-core-0.1.34) --- updated-dependencies: - dependency-name: tracing-core dependency-version: 0.1.34 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dbb9e394..dd5aec86 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1246,9 +1246,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" dependencies = [ "once_cell", "valuable", From b2aa713ee5803c988929ed7d761b5a44da7db842 Mon Sep 17 00:00:00 2001 From: Marcus Dunn Date: Tue, 17 Jun 2025 00:34:53 +0000 Subject: [PATCH 190/193] updated llama.cpp --- llama-cpp-sys-2/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-cpp-sys-2/llama.cpp b/llama-cpp-sys-2/llama.cpp index 259469c4..e434e691 160000 --- a/llama-cpp-sys-2/llama.cpp +++ b/llama-cpp-sys-2/llama.cpp @@ -1 +1 @@ -Subproject commit 259469c4b57c1a32606353bcac52ba683424a990 +Subproject commit e434e69183fd9e1031f4445002083178c331a28b From fbf6887c9aeee5da2acadeb42e9ff0808538a324 Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Tue, 17 Jun 2025 12:33:01 +0200 Subject: [PATCH 191/193] bump llamacpp, remove weird build-info.cpp copy --- llama-cpp-sys-2/Cargo.toml | 12 ------------ llama-cpp-sys-2/build.rs | 12 ------------ 2 files changed, 24 deletions(-) diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml index 3dd9b94e..068204da 100644 --- a/llama-cpp-sys-2/Cargo.toml +++ b/llama-cpp-sys-2/Cargo.toml @@ -24,18 +24,6 @@ include = [ "/llama.cpp/convert_hf_to_gguf.py", # Yes, it's required - # Erroneously the llama.cpp code currently generates the build-info.cpp - # into the source directory of the build instead of into the target directory - # as it should. Will try submitting something upstream to clean this up as - # well but for now explictly exclude this from the build. Previously this was - # implicitly excluded because the llama.cpp code was copied wholesale into the - # target directory for building which is why this problem wasn't visible before - # (i.e. we'd package the llama.cpp source from the submodule & thus this build-info.cpp - # generated file would still be ignored because it would only exist in the separate - # copy within the target directory. An alternative, if we do want to capture build-info.cpp - # within the package would be to change the CI task to add `--allow-dirty` to the package - # command. - "!/llama.cpp/common/build-info.cpp", "/llama.cpp/common/build-info.cpp.in", "/llama.cpp/ggml/src/ggml-cuda.cu", diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index 8e94e81d..cd5c036d 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -388,18 +388,6 @@ fn main() { .always_configure(false); let build_dir = config.build(); - let build_info_src = llama_src.join("common/build-info.cpp"); - let build_info_target = build_dir.join("build-info.cpp"); - std::fs::rename(&build_info_src,&build_info_target).unwrap_or_else(|move_e| { - // Rename may fail if the target directory is on a different filesystem/disk from the source. - // Fall back to copy + delete to achieve the same effect in this case. - std::fs::copy(&build_info_src, &build_info_target).unwrap_or_else(|copy_e| { - panic!("Failed to rename {build_info_src:?} to {build_info_target:?}. 
Move failed with {move_e:?} and copy failed with {copy_e:?}"); - }); - std::fs::remove_file(&build_info_src).unwrap_or_else(|e| { - panic!("Failed to delete {build_info_src:?} after copying to {build_info_target:?}: {e:?} (move failed because {move_e:?})"); - }); - }); // Search paths println!("cargo:rustc-link-search={}", out_dir.join("lib").display()); From a92e90b2875238036a547b1854aefaefa2f8f59f Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 18 Jun 2025 11:42:21 +0200 Subject: [PATCH 192/193] explicitly link advapi32 on windows --- llama-cpp-sys-2/build.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs index cd5c036d..4ad80680 100644 --- a/llama-cpp-sys-2/build.rs +++ b/llama-cpp-sys-2/build.rs @@ -440,6 +440,7 @@ fn main() { match target_os { TargetOs::Windows(WindowsVariant::Msvc) => { + println!("cargo:rustc-link-lib=advapi32"); if cfg!(debug_assertions) { println!("cargo:rustc-link-lib=dylib=msvcrtd"); } From 245e4241172368f01da821c0a25827781d5486bf Mon Sep 17 00:00:00 2001 From: AsbjornOlling Date: Wed, 18 Jun 2025 12:12:41 +0200 Subject: [PATCH 193/193] add ci step for windows and vulkan --- .github/workflows/llama-cpp-rs-check.yml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/llama-cpp-rs-check.yml b/.github/workflows/llama-cpp-rs-check.yml index 5277c82d..037df337 100644 --- a/.github/workflows/llama-cpp-rs-check.yml +++ b/.github/workflows/llama-cpp-rs-check.yml @@ -81,4 +81,23 @@ jobs: - name: Build run: cargo build --features sampler - name: Test - run: cargo test --features sampler \ No newline at end of file + run: cargo test --features sampler + windows-vulkan: + name: Check that it builds on windows with vulkan + runs-on: windows-latest + steps: + - name: checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + submodules: recursive + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + - name: Install Vulkan SDK + uses: jakoch/install-vulkan-sdk-action@v1.0.5 + with: + vulkan_version: 1.3.296.0 + install_runtime: true + cache: true + stripdown: true + - name: Build + run: cargo build --features "sampler vulkan"
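
Taken together, patches 178–193 change how downstream crates interact with the KV cache: the underlying FFI symbols move from `llama_kv_cache_*` to `llama_kv_self_*` and the debug-only `KVCacheView` type is removed, while the safe wrapper methods on `LlamaContext` keep their existing names. The sketch below is a minimal illustration of calling code against the updated wrapper, not part of the patch series: it assumes llama-cpp-2 0.1.108 or later, that a `LlamaContext` has already been constructed elsewhere (model loading is not shown), and that the module path `llama_cpp_2::context::LlamaContext` is unchanged.

```rust
// Sketch only: exercises the LlamaContext KV-cache wrappers retained by patch 178.
// The function name and the way `ctx` is obtained are illustrative assumptions;
// only the method calls themselves come from llama-cpp-2/src/context/kv_cache.rs.
use llama_cpp_2::context::LlamaContext;

fn recycle_kv_cache(ctx: &mut LlamaContext) {
    // Copy the used portion of sequence 0 into sequence 1
    // (now backed by llama_kv_self_seq_cp).
    let used = ctx.get_kv_cache_used_cells();
    ctx.copy_cache(0, 1, used);

    // Keep only sequence 1, then report the highest position it occupies.
    ctx.llama_kv_cache_seq_keep(1);
    let max_pos = ctx.kv_cache_seq_pos_max(1);
    println!("sequence 1 extends to position {max_pos}");

    // Defragment and apply pending updates
    // (llama_kv_self_defrag / llama_kv_self_update).
    ctx.kv_cache_defrag();
    ctx.kv_cache_update();

    // Finally clear everything (llama_kv_self_clear).
    ctx.clear_kv_cache();
}
```

For builds, the new windows-vulkan CI job in patch 193 mirrors what a local check would look like: installing the Vulkan SDK and running `cargo build --features "sampler vulkan"`.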