john-rocky · dokterbob · Jun 17, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -48,3 +48,6 @@ Examples/CoreMLLLMChat/gemma4-e2b/
 
 # W4A8 calibration data — regeneratable from gen_calib_data_real.py
 conversion/calibration_data/
+# Generated experiment artifacts (not the tracked experiments/bonsai sources)
+conversion/experiments/batching_models/
+conversion/experiments/*.log
diff --git a/Package.resolved b/Package.resolved
diff --git a/Package.swift b/Package.swift
@@ -1,4 +1,4 @@
-// swift-tools-version: 6.0
+// swift-tools-version: 6.1
 import PackageDescription
 
 let package = Package(
@@ -23,6 +23,15 @@ let package = Package(
         // / `Gemma3BundleDownloader` directly, without pulling the sample CLIs.
         .executable(name: "functiongemma-demo", targets: ["FunctionGemmaDemo"]),
         .executable(name: "embeddinggemma-demo", targets: ["EmbeddingGemmaDemo"]),
+        // pplx-embed — Swift fidelity + latency harness for the native int8
+        // encoder output (not readable from the Python bridge on macOS 26).
+        .executable(name: "pplx-embed-bench", targets: ["PplxEmbedBench"]),
+        // pplx-embed — the official embedding contract (plain + context late
+        // chunking; int8/binary/ubinary). The `PplxEmbed` runtime ships inside
+        // the CoreMLLLM target (clients still `import CoreMLLLM`); this product names
+        // it separately so a wrapper can depend on just the embedder, not the sample CLIs.
+        .library(name: "PplxEmbed", targets: ["CoreMLLLM"]),
+        .executable(name: "pplx-embed-demo", targets: ["PplxEmbedDemo"]),
     ],
     dependencies: [
         // Range widened to 1.0.x: mlx-swift-examples caps swift-transformers at
@@ -31,12 +40,21 @@ let package = Package(
         // `Tokenizer` protocol + `AutoTokenizer.from(modelFolder:)` API that
         // CoreMLLLM uses, so 1.0.x is source-compatible with 1.1.x here.
         .package(url: "https://github.com/huggingface/swift-transformers", from: "1.0.0"),
+        // HF's native Swift Hub client (standalone — does NOT pull swift-transformers,
+        // so it's orthogonal to the 1.0.x cap above). Used by PplxEmbed.load(repo:) for
+        // content-addressed snapshot downloads: the byte-identical weight.bin across
+        // buckets is fetched ONCE (then reused) — native download dedup. The `Xet` trait
+        // is REQUIRED: HF stores large files Xet-backed by default, and without it the
+        // client forces the LFS transport and 404s on Xet-only blobs. (Needs tools 6.1+.)
+        .package(url: "https://github.com/huggingface/swift-huggingface", from: "0.9.0",
+                 traits: ["Xet"]),
     ],
     targets: [
         .target(
             name: "CoreMLLLM",
             dependencies: [
                 .product(name: "Tokenizers", package: "swift-transformers"),
+                .product(name: "HuggingFace", package: "swift-huggingface"),
             ],
             swiftSettings: [.swiftLanguageMode(.v5)]
         ),
@@ -128,5 +146,25 @@ let package = Package(
             path: "Sources/ane-residency-gate",
             swiftSettings: [.swiftLanguageMode(.v5)]
         ),
+        // pplx-embed Swift fidelity + latency bench. No CoreMLLLM / tokenizer
+        // dependency — it reads pre-tokenized fixtures (exported by
+        // conversion/export_swift_fixtures.py), so it builds fast and stays self-contained.
+        .executableTarget(
+            name: "PplxEmbedBench",
+            path: "Sources/pplx-embed-bench",
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
+        // pplx-embed demo CLI — embeds a few strings (plain or context) and
+        // prints int8/binary/ubinary summaries. Uses the PplxEmbed runtime +
+        // tokenizer from the CoreMLLLM library.
+        .executableTarget(
+            name: "PplxEmbedDemo",
+            dependencies: [
+                "CoreMLLLM",
+                .product(name: "Tokenizers", package: "swift-transformers"),
+            ],
+            path: "Sources/pplx-embed-demo",
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
     ]
 )
diff --git a/README.md b/README.md
@@ -1,11 +1,13 @@
 # CoreML-LLM
 
-**On-device LLMs on the Apple Neural Engine.** Run Gemma 4, Qwen3.5, Qwen3-VL, FunctionGemma, EmbeddingGemma, and Liquid AI's LFM2.5 on iPhone with CoreML — ANE-first, battery-friendly, no server.
+**On-device LLMs on the Apple Neural Engine.** Run Gemma 4, Qwen3.5, Qwen3-VL, FunctionGemma, EmbeddingGemma, Perplexity pplx-embed, and Liquid AI's LFM2.5 on iPhone with CoreML — ANE-first, battery-friendly, no server.
 
 Where [MLX Swift](https://github.com/ml-explore/mlx-swift) is the right call when you want maximum GPU throughput, CoreML-LLM is what you use when the LLM should live on the **ANE** so the GPU stays free for the rest of the app.
 
 [![App Store](https://toolbox.marketingtools.apple.com/api/v2/badges/download-on-the-app-store/black/en-us?releaseDate=1735689600)](https://apps.apple.com/jp/app/models-zoo/id6762083207)
 
+**Embeddings:** Perplexity's `pplx-embed` (bidirectional Qwen3 encoder, plain + late-chunking) runs on the ANE via the `PplxEmbed` Swift API — see [`docs/PPLX_EMBED.md`](docs/PPLX_EMBED.md).
+
 ## Use in your app
 
 Add the package, name a model, generate.