diff --git a/.gitignore b/.gitignore
index 6e84a38..20485ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,3 +48,6 @@ Examples/CoreMLLLMChat/gemma4-e2b/
 
 # W4A8 calibration data — regeneratable from gen_calib_data_real.py
 conversion/calibration_data/
+# Generated experiment artifacts (not the tracked experiments/bonsai sources)
+conversion/experiments/batching_models/
+conversion/experiments/*.log
diff --git a/Package.resolved b/Package.resolved
index e77ef32..42fe11a 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -1,6 +1,15 @@
 {
-  "originHash" : "c8eca12331b572902235e4ef15b1dac1f7cc8320a1686ce37591974ed9c33b78",
+  "originHash" : "8fc6d2c9b6d25f8ae57b3a3d12f5749a87f36ccd1da52410b93df8e63507c74d",
   "pins" : [
+    {
+      "identity" : "async-http-client",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/swift-server/async-http-client",
+      "state" : {
+        "revision" : "3a5b74a58782c3b4c1f0bc75e9b67b10c2494e8f",
+        "version" : "1.33.1"
+      }
+    },
     {
       "identity" : "eventsource",
       "kind" : "remoteSourceControl",
@@ -10,6 +19,15 @@
         "version" : "1.4.1"
       }
     },
+    {
+      "identity" : "swift-algorithms",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-algorithms.git",
+      "state" : {
+        "revision" : "87e50f483c54e6efd60e885f7f5aa946cee68023",
+        "version" : "1.2.1"
+      }
+    },
     {
       "identity" : "swift-asn1",
       "kind" : "remoteSourceControl",
@@ -19,6 +37,15 @@
         "version" : "1.7.0"
       }
     },
+    {
+      "identity" : "swift-async-algorithms",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-async-algorithms.git",
+      "state" : {
+        "revision" : "9d349bcc328ac3c31ce40e746b5882742a0d1272",
+        "version" : "1.1.3"
+      }
+    },
     {
       "identity" : "swift-atomics",
       "kind" : "remoteSourceControl",
@@ -28,6 +55,15 @@
         "version" : "1.3.0"
       }
     },
+    {
+      "identity" : "swift-certificates",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-certificates.git",
+      "state" : {
+        "revision" : "bde8ca32a096825dfce37467137c903418c1893d",
+        "version" : "1.19.1"
+      }
+    },
     {
       "identity" : "swift-collections",
       "kind" : "remoteSourceControl",
@@ -37,6 +73,15 @@
         "version" : "1.4.1"
       }
     },
+    {
+      "identity" : "swift-configuration",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-configuration.git",
+      "state" : {
+        "revision" : "be76c4ad929eb6c4bcaf3351799f2adf9e6848a9",
+        "version" : "1.2.0"
+      }
+    },
     {
       "identity" : "swift-crypto",
       "kind" : "remoteSourceControl",
@@ -46,6 +91,33 @@
         "version" : "4.4.0"
       }
     },
+    {
+      "identity" : "swift-distributed-tracing",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-distributed-tracing.git",
+      "state" : {
+        "revision" : "dc4030184203ffafbb2ec614352487235d747fe0",
+        "version" : "1.4.1"
+      }
+    },
+    {
+      "identity" : "swift-http-structured-headers",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-http-structured-headers.git",
+      "state" : {
+        "revision" : "933538faa42c432d385f02e07df0ace7c5ecfc47",
+        "version" : "1.7.0"
+      }
+    },
+    {
+      "identity" : "swift-http-types",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-http-types.git",
+      "state" : {
+        "revision" : "db774a277f60063a32d854f2980299caf06da041",
+        "version" : "1.6.0"
+      }
+    },
     {
       "identity" : "swift-huggingface",
       "kind" : "remoteSourceControl",
@@ -64,6 +136,15 @@
         "version" : "2.3.5"
       }
     },
+    {
+      "identity" : "swift-log",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-log.git",
+      "state" : {
+        "revision" : "92448c359f00ebe36ae97d3bd9086f13c7692b5a",
+        "version" : "1.13.2"
+      }
+    },
     {
       "identity" : "swift-nio",
       "kind" : "remoteSourceControl",
@@ -73,6 +154,69 @@
         "version" : "2.98.0"
       }
     },
+    {
+      "identity" : "swift-nio-extras",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio-extras.git",
+      "state" : {
+        "revision" : "d2eeec0339074034f11a040a74aa2a341a2c4506",
+        "version" : "1.34.1"
+      }
+    },
+    {
+      "identity" : "swift-nio-http2",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio-http2.git",
+      "state" : {
+        "revision" : "61d1b44f6e4e118792be1cff88ee2bc0267c6f9a",
+        "version" : "1.44.0"
+      }
+    },
+    {
+      "identity" : "swift-nio-ssl",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio-ssl.git",
+      "state" : {
+        "revision" : "407d82d5b6cc00e1c3fb83a81b1539b70c788c5e",
+        "version" : "2.37.1"
+      }
+    },
+    {
+      "identity" : "swift-nio-transport-services",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio-transport-services.git",
+      "state" : {
+        "revision" : "67787bb645a5e67d2edcdfbe48a216cc549222d5",
+        "version" : "1.28.0"
+      }
+    },
+    {
+      "identity" : "swift-numerics",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-numerics.git",
+      "state" : {
+        "revision" : "0c0290ff6b24942dadb83a929ffaaa1481df04a2",
+        "version" : "1.1.1"
+      }
+    },
+    {
+      "identity" : "swift-service-context",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-service-context.git",
+      "state" : {
+        "revision" : "d0997351b0c7779017f88e7a93bc30a1878d7f29",
+        "version" : "1.3.0"
+      }
+    },
+    {
+      "identity" : "swift-service-lifecycle",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/swift-server/swift-service-lifecycle",
+      "state" : {
+        "revision" : "9829955b385e5bb88128b73f1b8389e9b9c3191a",
+        "version" : "2.11.0"
+      }
+    },
     {
       "identity" : "swift-system",
       "kind" : "remoteSourceControl",
@@ -91,6 +235,15 @@
         "version" : "1.3.0"
       }
     },
+    {
+      "identity" : "swift-xet",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/huggingface/swift-xet.git",
+      "state" : {
+        "revision" : "341bfd4172f6a57119bfd49bafa11cf5d21fab75",
+        "version" : "0.2.3"
+      }
+    },
     {
       "identity" : "yyjson",
       "kind" : "remoteSourceControl",
diff --git a/Package.swift b/Package.swift
index 9ec6f0c..58c84e7 100644
--- a/Package.swift
+++ b/Package.swift
@@ -1,4 +1,4 @@
-// swift-tools-version: 6.0
+// swift-tools-version: 6.1
 import PackageDescription
 
 let package = Package(
@@ -23,6 +23,15 @@ let package = Package(
         // / `Gemma3BundleDownloader` directly, without pulling the sample CLIs.
         .executable(name: "functiongemma-demo", targets: ["FunctionGemmaDemo"]),
         .executable(name: "embeddinggemma-demo", targets: ["EmbeddingGemmaDemo"]),
+        // pplx-embed — Swift fidelity + latency harness for the native int8
+        // encoder output (not readable from the Python bridge on macOS26).
+        .executable(name: "pplx-embed-bench", targets: ["PplxEmbedBench"]),
+        // pplx-embed — the official embedding contract (plain + context late
+        // chunking; int8/binary/ubinary). The `PplxEmbed` runtime ships inside
+        // the CoreMLLLM library; this product exposes it under its own name so a
+        // wrapper can depend on just the embedder without pulling the sample CLIs.
+        .library(name: "PplxEmbed", targets: ["CoreMLLLM"]),
+        .executable(name: "pplx-embed-demo", targets: ["PplxEmbedDemo"]),
     ],
     dependencies: [
         // Range widened to 1.0.x: mlx-swift-examples caps swift-transformers at
@@ -31,12 +40,21 @@ let package = Package(
         // `Tokenizer` protocol + `AutoTokenizer.from(modelFolder:)` API that
         // CoreMLLLM uses, so 1.0.x is source-compatible with 1.1.x here.
         .package(url: "https://github.com/huggingface/swift-transformers", from: "1.0.0"),
+        // HF's native Swift Hub client (standalone — does NOT pull swift-transformers,
+        // so it's orthogonal to the 1.0.x cap above). Used by PplxEmbed.load(repo:) for
+        // content-addressed snapshot downloads: the byte-identical weight.bin across
+        // buckets is fetched ONCE (then reused) — native download dedup. The `Xet` trait
+        // is REQUIRED: HF stores large files Xet-backed by default, and without it the
+        // client forces the LFS transport and 404s on Xet-only blobs. (Needs tools 6.1+.)
+        .package(url: "https://github.com/huggingface/swift-huggingface", from: "0.9.0",
+                 traits: ["Xet"]),
     ],
     targets: [
         .target(
             name: "CoreMLLLM",
             dependencies: [
                 .product(name: "Tokenizers", package: "swift-transformers"),
+                .product(name: "HuggingFace", package: "swift-huggingface"),
             ],
             swiftSettings: [.swiftLanguageMode(.v5)]
         ),
@@ -128,5 +146,25 @@ let package = Package(
             path: "Sources/ane-residency-gate",
             swiftSettings: [.swiftLanguageMode(.v5)]
         ),
+        // pplx-embed Swift fidelity + latency bench. No CoreMLLLM / tokenizer
+        // dependency — reads pre-tokenized fixtures (conversion/export_swift_fixtures.py),
+        // so it builds fast and stays self-contained.
+        .executableTarget(
+            name: "PplxEmbedBench",
+            path: "Sources/pplx-embed-bench",
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
+        // pplx-embed demo CLI — embeds a few strings (plain or context) and
+        // prints int8/binary/ubinary summaries. Uses the PplxEmbed runtime +
+        // tokenizer from the CoreMLLLM library.
+        .executableTarget(
+            name: "PplxEmbedDemo",
+            dependencies: [
+                "CoreMLLLM",
+                .product(name: "Tokenizers", package: "swift-transformers"),
+            ],
+            path: "Sources/pplx-embed-demo",
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
     ]
 )
diff --git a/README.md b/README.md
index 6be7f49..bb1adf3 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
 # CoreML-LLM
 
-**On-device LLMs on the Apple Neural Engine.** Run Gemma 4, Qwen3.5, Qwen3-VL, FunctionGemma, EmbeddingGemma, and Liquid AI's LFM2.5 on iPhone with CoreML — ANE-first, battery-friendly, no server.
+**On-device LLMs on the Apple Neural Engine.** Run Gemma 4, Qwen3.5, Qwen3-VL, FunctionGemma, EmbeddingGemma, Perplexity pplx-embed, and Liquid AI's LFM2.5 on iPhone with CoreML — ANE-first, battery-friendly, no server.
 
 Where [MLX Swift](https://github.com/ml-explore/mlx-swift) is the right call when you want maximum GPU throughput, CoreML-LLM is what you use when the LLM should live on the **ANE** so the GPU stays free for the rest of the app.
 
 [![App Store](https://toolbox.marketingtools.apple.com/api/v2/badges/download-on-the-app-store/black/en-us?releaseDate=1735689600)](https://apps.apple.com/jp/app/models-zoo/id6762083207)
 
+**Embeddings:** Perplexity's `pplx-embed` (bidirectional Qwen3 encoder, plain + late-chunking) runs on the ANE via the `PplxEmbed` Swift API — see [`docs/PPLX_EMBED.md`](docs/PPLX_EMBED.md).
+
 ## Use in your app
 
 Add the package, name a model, generate.
diff --git a/Sources/CoreMLLLM/PplxEmbed.swift b/Sources/CoreMLLLM/PplxEmbed.swift
new file mode 100644
index 0000000..637dd24
--- /dev/null
+++ b/Sources/CoreMLLLM/PplxEmbed.swift
@@ -0,0 +1,584 @@
+import CoreML
+import Foundation
+import HuggingFace
+import Tokenizers
+
+/// Runtime for Perplexity's pplx-embed (a bidirectional Qwen3 encoder → masked
+/// mean pool → tanh int8 quantize). Exposes the official pplx-embed contract:
+///
+///   plain   : `[String] -> [[Int8]]`        (1024-d int8 per text)
+///   context : `[[String]] -> [[[Int8]]]`    (per-document late chunking;
+///             per-chunk 1024-d int8)
+///
+/// Each call also exposes the `binary` (+1/-1 Float) and `ubinary` (packed
+/// UInt8[dim/8]) variants.
+///
+/// Output-format design decision. The underlying mlpackage emits native int8
+/// already (the `int8` output IS the deliverable; it's readable in Swift via
+/// the dtype-agnostic NSNumber subscript even though the Python CoreML bridge
+/// can't read int8 on macOS 26). We derive the other two formats directly from
+/// the int8 vector:
+///
+///   binary[i]  = int8[i] >= 0 ? +1 : -1
+///   ubinary    = packbits(int8[i] >= 0)
+///
+/// This is bit-exact with the reference `st_quantize` everywhere except in a
+/// tiny band around x=0: the reference branches on the raw pre-tanh value
+/// `x >= 0`, whereas we branch on the rounded int8. `round(tanh(x)*127)` is 0
+/// only for |x| below roughly atanh(0.5/127) ≈ 0.004 and is otherwise
+/// sign-faithful, so the int8-derived sign differs from the raw sign only for a
+/// small negative x that rounds to int8 0 — we map 0 to the `>= 0` (positive)
+/// branch to match the reference's tie direction at x = 0. For strictly
+/// bit-exact binary/ubinary against a `pooled_fp16`-output model, build with
+/// `--output-mode pooled_fp16` and apply all three quantizers in Swift; we ship
+/// the int8-derived path because it needs only one model and one forward pass.
+///
+/// I/O contract of the underlying mlpackages (from build_pplx_embed_bundle.py):
+///   plain:
+///     input_ids       (1, L)   int32
+///     attention_mask  (1, L)   fp16   (1.0 valid, 0.0 pad)
+///     → embedding     (1, 1024) int8
+///   context:
+///     input_ids       (1, L)   int32
+///     attention_mask  (1, L)   fp16
+///     pool_matrix     (32, L)  fp16   (row k = 1/n_k over chunk k's span)
+///     → embedding     (32, 1024) int8 (only first n_chunks rows are valid)
+public final class PplxEmbed {
+
+    /// The three published pplx-embed output formats.
+    public enum Format: String, Sendable {
+        case int8      // the model's native output: one Int8 per dimension
+        case binary    // +1/-1 Float per dimension (int8[i] >= 0 ? +1 : -1)
+        case ubinary   // sign bits packed into UInt8[dim/8]
+    }
+
+    /// Per-bundle config parsed from model_config.json.
+    public struct BucketConfig: Sendable {
+        public let maxSeqLen: Int    // for dynamic, the RangeDim upper bound
+        public let embedDim: Int     // "hidden_size" from the config, else PplxEmbed.embedDim
+        public let variant: String   // "plain" | "context"
+        public let dynamic: Bool     // flexible RangeDim model (GPU; the >max-bucket catch-all)
+        public let url: URL          // the bucket directory (holds encoder.* and hf_model/)
+    }
+
+    public static let embedDim = 1024
+    public static let nMaxChunks = 32
+
+    private let tokenizer: Tokenizer
+    private let sepTokenId: Int
+    private let computeUnits: MLComputeUnits
+
+    /// Fixed ANE buckets, sorted ascending by maxSeqLen.
+    private let buckets: [BucketConfig]
+    /// Optional flexible RangeDim catch-all for inputs larger than the biggest
+    /// fixed bucket. Runs on the GPU (flexible shapes force CPU fallback on ANE),
+    /// non-padded (actual length). nil if no dynamic bundle was provided.
+    private let dynamicBucket: BucketConfig?
+    private let variant: String
+
+    /// Lazily compiled+loaded fixed-bucket models, keyed by bucket maxSeqLen.
+    private var loaded: [Int: MLModel] = [:]
+    /// Lazily loaded dynamic model (GPU).
+    private var dynamicModel: MLModel?
+    private let lock = NSLock()
+
+    private init(tokenizer: Tokenizer, sepTokenId: Int, buckets: [BucketConfig],
+                 dynamicBucket: BucketConfig?, variant: String, computeUnits: MLComputeUnits) {
+        self.tokenizer = tokenizer
+        self.sepTokenId = sepTokenId
+        self.buckets = buckets
+        self.dynamicBucket = dynamicBucket
+        self.variant = variant
+        self.computeUnits = computeUnits
+    }
+
+    // MARK: - Loading
+
+    /// Load a pplx-embed bundle from disk.
+    ///
+    /// `bundleDir` may be either:
+    ///   * a directory of bucket subdirectories (e.g. `output/pplx-embed/`
+    ///     containing `L512-int8/`, `L1024-int8/`, …) — every bucket accepted by
+    ///     `parseBucket(at:)` is collected for token-length-based selection, or
+    ///   * a single bucket directory directly containing `encoder.mlpackage` or
+    ///     `encoder.mlmodelc` (e.g. `output/pplx-embed-context/L512-int8/`).
+    ///
+    /// Models are compiled + loaded lazily on first use per bucket; the tokenizer
+    /// is loaded eagerly from `hf_model/` of the smallest fixed bucket (or of the
+    /// dynamic bucket when the bundle has no fixed one).
+    public static func load(
+        bundleDir: URL,
+        computeUnits: MLComputeUnits = .cpuAndNeuralEngine
+    ) async throws -> PplxEmbed {
+        let fileManager = FileManager.default
+        func bundleHas(_ name: String) -> Bool {
+            fileManager.fileExists(atPath: bundleDir.appendingPathComponent(name).path)
+        }
+
+        // Candidate bucket dirs: bundleDir itself when it directly holds an encoder,
+        // otherwise its subdirectories in name order.
+        let candidates: [URL]
+        if bundleHas("encoder.mlpackage") || bundleHas("encoder.mlmodelc") {
+            candidates = [bundleDir]
+        } else {
+            let listing = (try? fileManager.contentsOfDirectory(
+                at: bundleDir, includingPropertiesForKeys: nil)) ?? []
+            candidates = listing
+                .sorted { $0.lastPathComponent < $1.lastPathComponent }
+                .filter { (try? $0.resourceValues(forKeys: [.isDirectoryKey]))?.isDirectory == true }
+        }
+        let found = candidates.compactMap { parseBucket(at: $0) }
+
+        guard let anyBucket = found.first else {
+            throw CoreMLLLMError.modelNotFound(
+                "no pplx-embed bucket with encoder.mlpackage/.mlmodelc under \(bundleDir.path)")
+        }
+
+        // Variant of the first fixed bucket, else of the first bucket found.
+        let variant = (found.first(where: { !$0.dynamic }) ?? anyBucket).variant
+        let sameVariant = found.filter { $0.variant == variant }
+        // Fixed ANE buckets (sorted ascending) + at most one dynamic GPU catch-all.
+        let fixedBuckets = sameVariant.filter { !$0.dynamic }.sorted { $0.maxSeqLen < $1.maxSeqLen }
+        let dynamicBucket = sameVariant.first(where: { $0.dynamic })
+
+        let tokenizerDir = (fixedBuckets.first ?? dynamicBucket!).url.appendingPathComponent("hf_model")
+        let tokenizer = try await AutoTokenizer.from(modelFolder: tokenizerDir)
+        let sepId = sepTokenId(fromHFDir: tokenizerDir) ?? 151643
+        return PplxEmbed(tokenizer: tokenizer, sepTokenId: sepId, buckets: fixedBuckets,
+                         dynamicBucket: dynamicBucket, variant: variant, computeUnits: computeUnits)
+    }
+
+    /// Download selected buckets from a HuggingFace repo, then load them.
+    ///
+    /// Download counterpart of `conversion/upload_pplx_embed.py`: the repo holds one
+    /// subfolder per bucket plus a top-level `manifest.json` inventory. This fetches
+    /// the manifest, selects the requested fixed buckets (+ any dynamic catch-all)
+    /// for `variant`, and downloads **only those buckets' chosen-format files**, as
+    /// exact paths, via `HubClient.downloadSnapshot`. That client uses HF's
+    /// **content-addressed cache**: the encoder `weight.bin` is byte-identical across
+    /// buckets, so it is fetched **once by etag** and reused — native download dedup
+    /// (the default 3-bucket pull moves ~1.15 GB, not ~3.5 GB). The snapshot (or its
+    /// `context/` subfolder for context buckets) is then passed to `load(bundleDir:)`.
+    ///
+    /// - Parameters:
+    ///   - repo: HF repo id, e.g. `"<account>/pplx-embed-coreml"`.
+    ///   - buckets: fixed bucket sizes (L) to fetch; any dynamic catch-all in the
+    ///     manifest is always included so long inputs still work.
+    ///   - into: download destination; `nil` uses the shared HF cache (cross-call dedup).
+    ///   - computeUnits: forwarded to `load(bundleDir:)`.
+    ///   - variant: `"plain"` or `"context"`.
+    ///   - preferCompiled: when a bucket lists both formats, fetch the precompiled
+    ///     `.mlmodelc` (no on-device compile) rather than the `.mlpackage`; never both.
+    ///   - hfToken: bearer token for the manifest request and the Hub client.
+    ///   - onProgress: called with the download's `fractionCompleted`.
+    /// - Throws: `CoreMLLLMError.modelNotFound` if no manifest bucket matches.
+    @discardableResult
+    public static func load(
+        repo: String,
+        buckets: [Int] = [512, 1024, 2048],
+        into directory: URL? = nil,
+        computeUnits: MLComputeUnits = .cpuAndNeuralEngine,
+        variant: String = "plain",
+        preferCompiled: Bool = true,
+        hfToken: String? = nil,
+        onProgress: ((Double) -> Void)? = nil
+    ) async throws -> PplxEmbed {
+        let manifest = try await fetchManifest(repo: repo, hfToken: hfToken)
+        let want = Set(buckets)
+
+        // Select this variant's buckets (requested sizes + any dynamic catch-all) and
+        // collect each one's exact chosen-format file paths.
+        var matching: [String] = []
+        var hasContextSubfolder = false
+        for b in manifest.buckets where b.variant == variant {
+            guard b.dynamic || want.contains(b.maxSeqLen) else { continue }
+            if b.subfolder.hasPrefix("context/") { hasContextSubfolder = true }
+            matching.append(contentsOf: b.selectFiles(preferCompiled: preferCompiled))
+        }
+        guard !matching.isEmpty else {
+            throw CoreMLLLMError.modelNotFound(
+                "no \(variant) buckets in \(repo) manifest match \(buckets)")
+        }
+
+        let client = makeHubClient(hfToken: hfToken)
+        let repoID = Repo.ID(stringLiteral: repo)
+        let snapshot: URL
+        if let directory {
+            snapshot = try await client.downloadSnapshot(
+                of: repoID, kind: .model, to: directory, matching: matching,
+                progressHandler: { p in onProgress?(p.fractionCompleted) })
+        } else {
+            snapshot = try await client.downloadSnapshot(
+                of: repoID, kind: .model, matching: matching,
+                progressHandler: { p in onProgress?(p.fractionCompleted) })
+        }
+
+        // Context subfolders live under `<repo>/context/`; point load there so the
+        // bucket dirs (context/L512-int8/…) are discovered as top-level entries.
+        let loadDir = (variant == "context" && hasContextSubfolder)
+            ? snapshot.appendingPathComponent("context") : snapshot
+        return try await load(bundleDir: loadDir, computeUnits: computeUnits)
+    }
+
+    /// Build a `HubClient`: an explicit bearer-token client for a non-empty
+    /// `hfToken`, otherwise the default client (env/anonymous token).
+    private static func makeHubClient(hfToken: String?) -> HubClient {
+        guard let token = hfToken, !token.isEmpty else { return HubClient() }
+        let host = URL(string: "https://huggingface.co")!
+        return HubClient(host: host, bearerToken: token)
+    }
+
+    // MARK: - Manifest
+
+    /// A single bucket record decoded from the repo's `manifest.json`.
+    private struct ManifestBucket {
+        let subfolder: String
+        let variant: String
+        let dynamic: Bool
+        let maxSeqLen: Int
+        let formats: [String]   // e.g. ["mlmodelc", "mlpackage"]
+        let files: [String]     // exact repo-relative paths (subfolder-prefixed)
+
+        /// Paths to fetch for the chosen format: every entry of `files` except the
+        /// other format's encoder dir, so shared files (model_config.json,
+        /// hf_model/…) are kept. They go to `downloadSnapshot(matching:)` as exact
+        /// paths, not wildcards: `listFiles(recursive:)` also returns *directory*
+        /// entries, and a glob like `encoder.mlmodelc/*` would match (and 404 on) them.
+        func selectFiles(preferCompiled: Bool) -> [String] {
+            let wanted = preferCompiled ? "mlmodelc" : "mlpackage"
+            let keep = formats.contains(wanted) ? wanted : (formats.first ?? wanted)
+            let dropExt = (keep == "mlmodelc") ? "mlpackage" : "mlmodelc"
+            let dropPrefix = "\(subfolder)/encoder.\(dropExt)/"
+            return files.filter { path in !path.hasPrefix(dropPrefix) }
+        }
+    }
+    private struct Manifest { let buckets: [ManifestBucket] }
+
+    /// Fetch + parse `manifest.json` from a HF repo. Throws on an invalid repo id,
+    /// an HTTP status >= 400, or a manifest lacking a top-level `buckets` array.
+    private static func fetchManifest(repo: String, hfToken: String?) async throws -> Manifest {
+        let urlStr = "https://huggingface.co/\(repo)/resolve/main/manifest.json"
+        guard let url = URL(string: urlStr) else { throw CoreMLLLMError.modelNotFound("invalid HF repo id \(repo)") }
+        var req = URLRequest(url: url)
+        // Like makeHubClient: an empty token means anonymous, never a bare "Bearer ".
+        if let hfToken, !hfToken.isEmpty { req.setValue("Bearer \(hfToken)", forHTTPHeaderField: "Authorization") }
+        let (data, resp) = try await URLSession.shared.data(for: req)
+        if let http = resp as? HTTPURLResponse, http.statusCode >= 400 {
+            throw Gemma3BundleDownloader.Error.httpStatus(
+                http.statusCode, url: urlStr, body: String(data: data, encoding: .utf8) ?? "")
+        }
+        guard let j = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
+              let raw = j["buckets"] as? [[String: Any]]
+        else { throw CoreMLLLMError.modelNotFound("malformed manifest.json in \(repo)") }
+
+        let buckets: [ManifestBucket] = raw.compactMap { e in
+            guard let subfolder = e["subfolder"] as? String else { return nil }
+            let dynamic = (e["dynamic"] as? Bool) ?? false
+            // Fixed buckets carry an integer "bucket"; dynamic uses dynamic_upper/max_seq_len.
+            let maxSeqLen = (e["bucket"] as? Int) ?? (e["dynamic_upper"] as? Int)
+                ?? (e["max_seq_len"] as? Int) ?? 0
+            let variant = (e["variant"] as? String) ?? "plain"
+            let formats = (e["formats"] as? [String]) ?? ["mlpackage"]
+            let files = ((e["files"] as? [[String: Any]]) ?? []).compactMap { $0["path"] as? String }
+            return ManifestBucket(subfolder: subfolder, variant: variant, dynamic: dynamic,
+                                  maxSeqLen: maxSeqLen, formats: formats, files: files)
+        }
+        return Manifest(buckets: buckets)
+    }
+
+    /// Read one bucket directory into a `BucketConfig`, or return nil when it is
+    /// not a shippable bucket: it needs an encoder (`.mlpackage` or `.mlmodelc`),
+    /// `hf_model/`, a parseable model_config.json, int8 output and fp16 weights.
+    private static func parseBucket(at dir: URL) -> BucketConfig? {
+        let fm = FileManager.default
+        func has(_ name: String) -> Bool {
+            fm.fileExists(atPath: dir.appendingPathComponent(name).path)
+        }
+        guard has("encoder.mlpackage") || has("encoder.mlmodelc"), has("hf_model") else {
+            return nil
+        }
+
+        guard let data = try? Data(contentsOf: dir.appendingPathComponent("model_config.json")),
+              let cfg = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
+        else { return nil }
+
+        // Skip pooled_fp16 variants; a missing "output_mode" counts as int8.
+        if let mode = cfg["output_mode"] as? String, mode != "int8" { return nil }
+        // Ship the fp16-weight models only; skip experimental weight-quant bundles
+        // (they share output_mode "int8" but would duplicate a bucket size).
+        if let quant = cfg["quantization_weights"] as? String, quant != "fp16" { return nil }
+
+        let isDynamic = (cfg["dynamic"] as? Bool) ?? false
+        // Fixed bucket: integer "bucket". Dynamic: "bucket" is a string ("1..N");
+        // use dynamic_upper as the effective max. Both fall back to max_seq_len.
+        let declaredMax = cfg["max_seq_len"] as? Int
+        let seqLen = isDynamic
+            ? ((cfg["dynamic_upper"] as? Int) ?? declaredMax ?? 8192)
+            : ((cfg["bucket"] as? Int) ?? declaredMax ?? 512)
+
+        // Variant comes from the config, else is guessed from the directory path.
+        let variantName = (cfg["variant"] as? String)
+            ?? (dir.path.contains("context") ? "context" : "plain")
+
+        return BucketConfig(maxSeqLen: seqLen,
+                            embedDim: (cfg["hidden_size"] as? Int) ?? PplxEmbed.embedDim,
+                            variant: variantName, dynamic: isDynamic, url: dir)
+    }
+
+    /// Read the `<|endoftext|>` id from `added_tokens.json` in `hfDir`.
+    /// Returns nil when the file is unreadable, is not a JSON object, or has no
+    /// integer entry for that token (pplx tokenizer: 151643).
+    private static func sepTokenId(fromHFDir hfDir: URL) -> Int? {
+        let tokensURL = hfDir.appendingPathComponent("added_tokens.json")
+        guard let raw = try? Data(contentsOf: tokensURL) else { return nil }
+        let parsed = try? JSONSerialization.jsonObject(with: raw) as? [String: Any]
+        return parsed?["<|endoftext|>"] as? Int
+    }
+
+    /// Fixed-bucket model accessor: returns the cached `MLModel` for bucket `L`,
+    /// or loads it (compiling `encoder.mlpackage` when no `encoder.mlmodelc`
+    /// exists), stores it in `loaded`, and returns it. Runs while holding `lock`.
+    private func model(forBucket L: Int) throws -> MLModel {
+        lock.lock()
+        defer { lock.unlock() }
+        if let cached = loaded[L] { return cached }
+
+        guard let cfg = buckets.first(where: { $0.maxSeqLen == L }) else {
+            throw CoreMLLLMError.modelNotFound("no loaded bucket for L=\(L)")
+        }
+        let precompiled = cfg.url.appendingPathComponent("encoder.mlmodelc")
+        let modelURL = try FileManager.default.fileExists(atPath: precompiled.path)
+            ? precompiled
+            : compileSync(cfg.url.appendingPathComponent("encoder.mlpackage"))
+        let options = MLModelConfiguration()
+        options.computeUnits = computeUnits
+        let fresh = try MLModel(contentsOf: modelURL, configuration: options)
+        loaded[L] = fresh
+        return fresh
+    }
+
+    /// Synchronous compile wrapper: compiles the `.mlpackage` at `pkg` and returns
+    /// the compiled model's URL. MLModel.compileModel is async on newer SDKs but the
+    /// legacy throwing overload is sync; using it keeps the model accessors non-async.
+    private func compileSync(_ pkg: URL) throws -> URL {
+        try MLModel.compileModel(at: pkg)
+    }
+
+    /// Flexible RangeDim catch-all accessor: cached in `dynamicModel`, loaded with
+    /// `.cpuAndGPU` (flexible shapes force CPU fallback on the ANE). Holds `lock`.
+    private func loadDynamicModel(_ cfg: BucketConfig) throws -> MLModel {
+        lock.lock(); defer { lock.unlock() }
+        if let cached = dynamicModel { return cached }
+
+        let precompiled = cfg.url.appendingPathComponent("encoder.mlmodelc")
+        let modelURL = try FileManager.default.fileExists(atPath: precompiled.path)
+            ? precompiled : compileSync(cfg.url.appendingPathComponent("encoder.mlpackage"))
+        let gpuConfig = MLModelConfiguration()
+        gpuConfig.computeUnits = .cpuAndGPU
+        let fresh = try MLModel(contentsOf: modelURL, configuration: gpuConfig)
+        dynamicModel = fresh
+        return fresh
+    }
+
+    /// First bucket (they are sorted ascending) with maxSeqLen >= n, else the
+    /// largest one. Traps if `buckets` is empty (`last` is force-unwrapped).
+    private func bucket(forTokens n: Int) -> BucketConfig {
+        buckets.first(where: { $0.maxSeqLen >= n }) ?? buckets.last!
+    }
+
+    // MARK: - Plain API
+
+    /// Encode texts into 1024-d int8 embeddings (one row per text).
+    public func embed(_ texts: [String]) throws -> [[Int8]] {
+        try texts.map { try embedOne($0) }
+    }
+
+    /// Encode texts and return the requested format.
+    /// - int8:    `[[Int8]]` (1024-d)
+    /// - binary:  `[[Float]]` (1024-d, +1/-1)
+    /// - ubinary: `[[UInt8]]` (128 packed bytes)
+    public func embedInt8(_ texts: [String]) throws -> [[Int8]] {
+        try embed(texts)
+    }
+
+    public func embedBinary(_ texts: [String]) throws -> [[Float]] {
+        try embed(texts).map(PplxEmbed.binary(fromInt8:))
+    }
+
+    public func embedUBinary(_ texts: [String]) throws -> [[UInt8]] {
+        try embed(texts).map(PplxEmbed.ubinary(fromInt8:))
+    }
+
+    /// Embed one text: tokenize, route to the flexible or a fixed-bucket model, read the row.
+    private func embedOne(_ text: String) throws -> [Int8] {
+        let tokens = tokenizer.encode(text: text)
+        let fixedCeiling = buckets.last?.maxSeqLen ?? 0
+
+        // Overflow route: token counts above the last fixed bucket run on the flexible
+        // model at their own length (no padding), clipped to that model's maxSeqLen.
+        if let flex = dynamicBucket, tokens.count > fixedCeiling {
+            let clipped = Array(tokens.prefix(flex.maxSeqLen))
+            let flexModel = try loadDynamicModel(flex)
+            let features = try MLDictionaryFeatureProvider(dictionary: [
+                "input_ids": try makeInputIds(clipped, L: clipped.count),
+                "attention_mask": try makeAttentionMask(n: clipped.count, L: clipped.count),
+            ])
+            return try readPlainRow(flexModel.prediction(from: features))
+        }
+
+        // Regular route: take the bucket picked by bucket(forTokens:), clip the tokens
+        // to its length, and let the input builders pad the remainder.
+        let padLen = bucket(forTokens: tokens.count).maxSeqLen
+        let kept = Array(tokens.prefix(padLen))
+        let bucketModel = try model(forBucket: padLen)
+        let features = try MLDictionaryFeatureProvider(dictionary: [
+            "input_ids": try makeInputIds(kept, L: padLen),
+            "attention_mask": try makeAttentionMask(n: kept.count, L: padLen),
+        ])
+        return try readPlainRow(bucketModel.prediction(from: features))
+    }
+
+    /// Copy the "embedding" output into `[Int8]`, keeping at most `embedDim`
+    /// leading elements; throws `predictionFailed` when that output is absent.
+    private func readPlainRow(_ out: MLFeatureProvider) throws -> [Int8] {
+        guard let embedding = out.featureValue(for: "embedding")?.multiArrayValue else {
+            throw CoreMLLLMError.predictionFailed
+        }
+        let width = min(PplxEmbed.embedDim, embedding.count)
+        // Elements are read through the NSNumber subscript and narrowed with int8Value.
+        return (0..<width).map { embedding[$0].int8Value }
+    }
+
+    // MARK: - Context API (late chunking)
+
+    /// Late-chunking embed. Each document is a list of chunk strings that are
+    /// embedded together in one window; the result holds, per document, the
+    /// per-chunk int8 rows in chunk order.
+    public func embedContext(_ documents: [[String]]) throws -> [[[Int8]]] {
+        try documents.map { chunks in try embedContextOne(chunks) }
+    }
+
+    public func embedContextBinary(_ documents: [[String]]) throws -> [[[Float]]] {
+        try embedContext(documents).map { rows in rows.map(PplxEmbed.binary(fromInt8:)) }
+    }
+
+    public func embedContextUBinary(_ documents: [[String]]) throws -> [[[UInt8]]] {
+        try embedContext(documents).map { rows in rows.map(PplxEmbed.ubinary(fromInt8:)) }
+    }
+
+    /// Embed one document's chunks in a single window; one int8 row per recovered span.
+    private func embedContextOne(_ chunks: [String]) throws -> [[Int8]] {
+        precondition(variant == "context",
+                     "embedContext requires a context bundle (variant=context)")
+        guard !chunks.isEmpty else { return [] }
+
+        // Join the chunks with the literal separator text and tokenize the whole
+        // window once; positions holding sepTokenId among the kept tokens are then
+        // used to recover the per-chunk spans.
+        let sep = "<|endoftext|>"
+        let joined = chunks.joined(separator: sep)
+        var ids = tokenizer.encode(text: joined)
+
+        let L = bucket(forTokens: ids.count).maxSeqLen
+        if ids.count > L { ids = Array(ids.prefix(L)) }
+        let n = ids.count
+
+        // Recover chunk spans: [start, sep) (SEP excluded), next start = sep+1,
+        // final chunk runs to n.
+        var spans: [(Int, Int)] = []
+        var start = 0
+        for i in 0..<n where ids[i] == sepTokenId {
+            spans.append((start, i))
+            start = i + 1
+        }
+        spans.append((start, n))
+        // Cap at the model's max chunk count.
+        if spans.count > PplxEmbed.nMaxChunks {
+            spans = Array(spans.prefix(PplxEmbed.nMaxChunks))
+        }
+        let nChunks = spans.count
+
+        let inputIds = try makeInputIds(ids, L: L)
+        let attn = try makeAttentionMask(n: n, L: L)
+        let pool = try makePoolMatrix(spans: spans, L: L)
+
+        let encoder = try model(forBucket: L)
+        let out = try encoder.prediction(from: MLDictionaryFeatureProvider(dictionary: [
+            "input_ids": inputIds,
+            "attention_mask": attn,
+            "pool_matrix": pool,
+        ]))
+        guard let arr = out.featureValue(for: "embedding")?.multiArrayValue else {
+            throw CoreMLLLMError.predictionFailed
+        }
+        // Only the first nChunks rows of embedDim values are read. An output too small
+        // to hold them is reported as a failed prediction, not an out-of-range subscript.
+        let D = PplxEmbed.embedDim
+        guard arr.count >= nChunks * D else { throw CoreMLLLMError.predictionFailed }
+        var result: [[Int8]] = []
+        result.reserveCapacity(nChunks)
+        for c in 0..<nChunks {
+            let base = c * D
+            result.append((0..<D).map { arr[base + $0].int8Value })
+        }
+        return result
+    }
+
+    // MARK: - Input builders
+
+    private func makeInputIds(_ ids: [Int], L: Int) throws -> MLMultiArray {
+        let tensor = try MLMultiArray(shape: [1, NSNumber(value: L)], dataType: .int32)
+        let slots = tensor.dataPointer.bindMemory(to: Int32.self, capacity: L)
+        for pos in 0..<L { slots[pos] = pos < ids.count ? Int32(ids[pos]) : 0 }  // zero-fill past ids.count
+        return tensor
+    }
+
+    /// (1, L) fp16 mask written as raw bit patterns: 1.0 for the first n slots, 0 after.
+    private func makeAttentionMask(n: Int, L: Int) throws -> MLMultiArray {
+        let mask = try MLMultiArray(shape: [1, NSNumber(value: L)], dataType: .float16)
+        let halves = mask.dataPointer.bindMemory(to: UInt16.self, capacity: L)
+        for pos in 0..<L { halves[pos] = pos < n ? 0x3C00 : 0 }  // 0x3C00 is 1.0 in fp16
+        return mask
+    }
+
+    /// (nMaxChunks, L) fp16 pool matrix: row k carries 1/len_k on the columns of
+    /// span k ([start, end)); empty spans and unused rows stay all-zero.
+    private func makePoolMatrix(spans: [(Int, Int)], L: Int) throws -> MLMultiArray {
+        let rowCount = PplxEmbed.nMaxChunks
+        let matrix = try MLMultiArray(shape: [NSNumber(value: rowCount), NSNumber(value: L)],
+                                      dataType: .float16)
+        let cells = matrix.dataPointer.bindMemory(to: UInt16.self, capacity: rowCount * L)
+        for idx in 0..<(rowCount * L) { cells[idx] = 0 }
+        // Spans beyond rowCount have no row to land in and are ignored.
+        for (row, span) in spans.prefix(rowCount).enumerated() {
+            let (lo, hi) = span
+            if hi <= lo { continue }
+            let weight = float16Bits(1.0 / Float(hi - lo))
+            let rowStart = row * L
+            for col in lo..<hi { cells[rowStart + col] = weight }
+        }
+        return matrix
+    }
+
+    // MARK: - Format derivation
+
+    /// Sign map: non-negative int8 (zero included) → +1, negative → -1.
+    public static func binary(fromInt8 v: [Int8]) -> [Float] {
+        v.map { component in component < 0 ? Float(-1) : Float(1) }
+    }
+
+    /// numpy-packbits layout: the (v[i] >= 0) bit of element i goes MSB-first into byte i/8.
+    public static func ubinary(fromInt8 v: [Int8]) -> [UInt8] {
+        var packed = [UInt8](repeating: 0, count: (v.count + 7) / 8)
+        for (index, component) in v.enumerated() where component >= 0 {
+            let (byte, bit) = index.quotientAndRemainder(dividingBy: 8)
+            packed[byte] |= UInt8(0x80) >> UInt8(bit)
+        }
+        return packed
+    }
+
+    /// Bit pattern of `x` after conversion to half precision via `Float16(x)`.
+    private func float16Bits(_ x: Float) -> UInt16 {
+        return Float16(x).bitPattern
+    }
+}
diff --git a/Sources/pplx-embed-bench/main.swift b/Sources/pplx-embed-bench/main.swift
new file mode 100644
index 0000000..0ae85f4
--- /dev/null
+++ b/Sources/pplx-embed-bench/main.swift
@@ -0,0 +1,127 @@
+// pplx-embed-bench — Swift fidelity + latency harness for the pplx-embed CoreML model.
+//
+// Native int8 model output is not readable from the Python CoreML bridge on macOS26,
+// so this validates the int8 deliverable in Swift: it loads the model, runs the
+// pre-tokenized fixtures from export_swift_fixtures.py, reads the int8 output via the
+// dtype-agnostic NSNumber subscript, computes cosine vs the fp32-reference int8, and
+// times warm latency.
+//
+// Usage:
+//   swift run -c release pplx-embed-bench \
+//       --model <bundleDir with encoder.mlpackage|.mlmodelc> \
+//       --fixtures /tmp/pplx_fixtures.json \
+//       --compute-units cpuAndNE --iters 20
+
+import CoreML
+import Foundation
+
+struct Fixture: Decodable { let text: String; let input_ids: [Int]; let n: Int; let ref_int8: [Int] }  // one case: token ids, valid-token count n, reference int8 vector
+struct Fixtures: Decodable { let L: Int; let hf_repo: String; let items: [Fixture] }  // fixture file root; L is the input length the arrays are shaped to
+
+/// Value after the first `name` on the command line; `def` when absent or valueless.
+func arg(_ name: String, _ def: String) -> String {
+    let argv = CommandLine.arguments
+    return argv.firstIndex(of: name).flatMap { at in at + 1 < argv.count ? Optional(argv[at + 1]) : nil } ?? def
+}
+
+func computeUnits(_ s: String) -> MLComputeUnits {
+    // Any other name (e.g. "cpuAndNE") falls through to CPU + Neural Engine.
+    let named: [String: MLComputeUnits] = [
+        "all": .all, "cpuOnly": .cpuOnly,
+        "cpuAndGPU": .cpuAndGPU,
+    ]
+    return named[s] ?? .cpuAndNeuralEngine
+}
+
+/// Cosine of `a` and `b`; NaN if the lengths differ or a squared norm is < 1e-12.
+func cosine(_ a: [Double], _ b: [Double]) -> Double {
+    var dot = 0.0, na = 0.0, nb = 0.0
+    for (x, y) in zip(a, b) { dot += x * y; na += x * x; nb += y * y }
+    return (a.count != b.count || na < 1e-12 || nb < 1e-12) ? .nan : dot / (sqrt(na) * sqrt(nb))
+}
+
+let modelDir = arg("--model", "")
+let fixturesPath = arg("--fixtures", "/tmp/pplx_fixtures.json")
+let cuName = arg("--compute-units", "cpuAndNE")
+let iters = Int(arg("--iters", "20")) ?? 20   // a non-numeric --iters falls back to 20
+
+guard !modelDir.isEmpty else { FileHandle.standardError.write("--model required\n".data(using: .utf8)!); exit(2) }
+
+let fx = try JSONDecoder().decode(Fixtures.self, from: Data(contentsOf: URL(fileURLWithPath: fixturesPath)))
+let L = fx.L   // input length used to shape input_ids / attention_mask
+
+// Resolve the model: prefer encoder.mlmodelc in the bundle dir, else compile encoder.mlpackage.
+let cfg = MLModelConfiguration()
+cfg.computeUnits = computeUnits(cuName)
+let bundle = URL(fileURLWithPath: modelDir)
+let mlmodelc = bundle.appendingPathComponent("encoder.mlmodelc")
+let mlpackage = bundle.appendingPathComponent("encoder.mlpackage")
+let modelURL: URL
+if FileManager.default.fileExists(atPath: mlmodelc.path) {
+    modelURL = mlmodelc
+} else if FileManager.default.fileExists(atPath: mlpackage.path) {
+    print("compiling \(mlpackage.lastPathComponent) …")
+    modelURL = try await MLModel.compileModel(at: mlpackage)
+} else {
+    // Neither exists: treat --model itself as the model (compiled first if it is a .mlpackage).
+    modelURL = bundle.pathExtension == "mlpackage"
+        ? try await MLModel.compileModel(at: bundle) : bundle
+}
+let model = try MLModel(contentsOf: modelURL, configuration: cfg)
+print("loaded \(modelURL.lastPathComponent)  compute-units=\(cuName)  L=\(L)  fixtures=\(fx.items.count)")
+
+func makeInputs(_ f: Fixture) throws -> MLDictionaryFeatureProvider {
+    let tokenArray = try MLMultiArray(shape: [1, NSNumber(value: L)], dataType: .int32)
+    let maskArray = try MLMultiArray(shape: [1, NSNumber(value: L)], dataType: .float16)
+    for slot in 0..<L {   // ids zero-padded past the fixture's tokens; mask is 1.0 on the first f.n slots
+        tokenArray[slot] = NSNumber(value: Int32(slot < f.input_ids.count ? f.input_ids[slot] : 0))
+        maskArray[slot] = NSNumber(value: Float(slot < f.n ? 1.0 : 0.0))
+    }
+    return try MLDictionaryFeatureProvider(dictionary: ["input_ids": tokenArray, "attention_mask": maskArray])
+}
+
+/// Every element of the "embedding" output as a Double, paired with the array's
+/// dataType raw value rendered as text. Returns nil when the output is missing
+/// or is not a multi-array.
+func readEmbedding(_ out: MLFeatureProvider) -> ([Double], String)? {
+    guard let embedding = out.featureValue(for: "embedding")?.multiArrayValue else { return nil }
+    let values = (0..<embedding.count).map { embedding[$0].doubleValue }   // dtype-agnostic read
+    return (values, "\(embedding.dataType.rawValue)")
+}
+
+// Fidelity pass: cosine between each fixture's model output and its reference int8 vector.
+var cosines: [Double] = []
+var dtypeSeen = ""
+for f in fx.items {
+    let prov = try makeInputs(f)
+    let out = try model.prediction(from: prov)
+    guard let (vec, dt) = readEmbedding(out) else { print("  no embedding output"); continue }
+    dtypeSeen = dt
+    let ref = f.ref_int8.map { Double($0) }
+    let c = cosine(vec, ref)
+    cosines.append(c)
+    print(String(format: "  n=%4d  cos=%.6f  %@", f.n, c, String(f.text.prefix(28))))
+}
+let valid = cosines.filter { !$0.isNaN }   // NaN cosines are left out of the mean/min
+let minc = valid.min() ?? Double.nan
+let meanc = valid.isEmpty ? Double.nan : valid.reduce(0, +) / Double(valid.count)
+print(String(format: "[FIDELITY] mean=%.6f min=%.6f  (output MLMultiArray dataType.rawValue=%@)", meanc, minc, dtypeSeen))
+
+// Latency pass (warm) on the longest fixture; needs at least one fixture and one timed run.
+guard let longest = fx.items.max(by: { $0.n < $1.n }) else { FileHandle.standardError.write("no fixtures to time\n".data(using: .utf8)!); exit(2) }
+let provL = try makeInputs(longest)
+_ = try model.prediction(from: provL)   // warm
+var times: [Double] = []
+for _ in 0..<max(1, iters) {   // at least one timed run: --iters <= 0 would leave `times` empty or trap the range
+    let t0 = DispatchTime.now().uptimeNanoseconds
+    _ = try model.prediction(from: provL)
+    let t1 = DispatchTime.now().uptimeNanoseconds
+    times.append(Double(t1 - t0) / 1_000_000.0)
+}
+times.sort()
+let med = times[times.count / 2]
+let mean = times.reduce(0, +) / Double(times.count)
+print(String(format: "[LATENCY] median=%.1fms mean=%.1fms min=%.1fms max=%.1fms  (n=%d, L=%d, tokens=%d, units=%@)",
+             med, mean, times.first!, times.last!, times.count, L, longest.n, cuName))
+
+let gate = 0.997
+print((minc >= gate) ? "PASS vs 0.997 gate" : "FAIL vs 0.997 gate")
diff --git a/Sources/pplx-embed-demo/main.swift b/Sources/pplx-embed-demo/main.swift
new file mode 100644
index 0000000..baafe93
--- /dev/null
+++ b/Sources/pplx-embed-demo/main.swift
@@ -0,0 +1,146 @@
+// pplx-embed-demo — minimal CLI around the PplxEmbed runtime.
+//
+// Plain:
+//   swift run -c release pplx-embed-demo \
+//       --bundle-dir output/pplx-embed \
+//       --text "hello world" --text "bonjour le monde" --format int8
+//
+// Context (late chunking) — each --text is one document; use ';;' inside a
+// --text to split that document into its chunks:
+//   swift run -c release pplx-embed-demo \
+//       --bundle-dir output/pplx-embed-context/L512-int8 \
+//       --context --text "first chunk;;second chunk"
+
+import CoreML
+import CoreMLLLM
+import Foundation
+
+func args(_ name: String) -> [String] {   // values following each `name`; a consumed value is skipped
+    let argv = CommandLine.arguments
+    var values: [String] = [], cursor = 0
+    while cursor + 1 < argv.count {
+        if argv[cursor] == name { values.append(argv[cursor + 1]); cursor += 1 }
+        cursor += 1
+    }
+    return values
+}
+func arg(_ name: String, _ def: String) -> String { args(name).first ?? def }  // first value given for `name`, else `def`
+func flag(_ name: String) -> Bool { CommandLine.arguments.contains(name) }  // true when `name` appears anywhere in the arguments
+
+func computeUnits(_ s: String) -> MLComputeUnits {
+    // Any other name (e.g. "cpuAndNE") falls through to CPU + Neural Engine.
+    let named: [String: MLComputeUnits] = [
+        "all": .all, "cpuOnly": .cpuOnly,
+        "cpuAndGPU": .cpuAndGPU,
+    ]
+    return named[s] ?? .cpuAndNeuralEngine
+}
+
+func l2norm(_ v: [Int8]) -> Double {
+    // Each component is widened to Double before squaring.
+    let sumOfSquares = v.reduce(0.0) { acc, x in acc + Double(x) * Double(x) }
+    return sumOfSquares.squareRoot()
+}
+
+func summarize(_ v: [Int8]) -> String {
+    let leading = v.prefix(8).map { value in String(value) }.joined(separator: ", ")
+    return "dim=\(v.count) first8=[\(leading)] l2=" + String(format: "%.2f", l2norm(v))
+}
+
+// Either --bundle-dir (local) or --repo (download from HuggingFace) is required; --repo wins if both are set.
+let bundleDir = arg("--bundle-dir", "")
+let repo = arg("--repo", "")
+guard !bundleDir.isEmpty || !repo.isEmpty else {
+    FileHandle.standardError.write("--bundle-dir or --repo required\n".data(using: .utf8)!)
+    exit(2)
+}
+let texts = args("--text")
+guard !texts.isEmpty else {
+    FileHandle.standardError.write("at least one --text required\n".data(using: .utf8)!)
+    exit(2)
+}
+let isContext = flag("--context")
+let format = PplxEmbed.Format(rawValue: arg("--format", "int8")) ?? .int8   // unrecognised values fall back to int8
+let cu = computeUnits(arg("--compute-units", "cpuAndNE"))
+let asJSON = flag("--json")   // emit raw int8 vectors as JSON (for parity checks)
+
+let embedder: PplxEmbed
+if !repo.isEmpty {
+    // Download-then-run: pull only the requested buckets from HF (content-addressed
+    // cache → the shared weight.bin is fetched once), then load.
+    let buckets = args("--buckets").compactMap { Int($0) }   // repeat --buckets per value; non-integers are dropped
+    let cacheDir = args("--cache-dir").first.map { URL(fileURLWithPath: $0) }
+    let hfToken = args("--hf-token").first ?? ProcessInfo.processInfo.environment["HF_TOKEN"]
+    embedder = try await PplxEmbed.load(
+        repo: repo,
+        buckets: buckets.isEmpty ? [512, 1024, 2048] : buckets,
+        into: cacheDir,
+        computeUnits: cu,
+        variant: isContext ? "context" : "plain",
+        hfToken: hfToken,
+        onProgress: { frac in
+            FileHandle.standardError.write("\r[download] \(Int(frac * 100))%   "
+                .data(using: .utf8)!)
+        })
+    FileHandle.standardError.write("\n".data(using: .utf8)!)
+} else {
+    embedder = try await PplxEmbed.load(
+        bundleDir: URL(fileURLWithPath: bundleDir), computeUnits: cu)
+}
+
+// JSON mode: dump int8 vectors only (plain: [[Int8]]; context: [[[Int8]]]), then exit; --format is not consulted.
+if asJSON {
+    let enc = JSONEncoder()
+    if isContext {
+        let docs = texts.map { $0.components(separatedBy: ";;") }
+        let int8 = try embedder.embedContext(docs)
+        let data = try enc.encode(int8.map { $0.map { $0.map(Int.init) } })
+        FileHandle.standardOutput.write(data)
+    } else {
+        let int8 = try embedder.embed(texts)
+        let data = try enc.encode(int8.map { $0.map(Int.init) })
+        FileHandle.standardOutput.write(data)
+    }
+    FileHandle.standardOutput.write("\n".data(using: .utf8)!)
+    exit(0)
+}
+
+if isContext {
+    // Each --text is one document; ';;' splits it into that document's chunks.
+    let docs = texts.map { $0.components(separatedBy: ";;") }
+    let int8 = try embedder.embedContext(docs)
+    for (d, doc) in int8.enumerated() {
+        print("doc[\(d)]: \(doc.count) chunk(s)")
+        for (c, row) in doc.enumerated() {
+            switch format {
+            case .int8:
+                print("  chunk[\(c)]: \(summarize(row))")
+            case .binary:
+                let b = PplxEmbed.binary(fromInt8: row)
+                let head = b.prefix(8).map { String(Int($0)) }.joined(separator: ", ")
+                print("  chunk[\(c)]: binary dim=\(b.count) first8=[\(head)]")
+            case .ubinary:
+                let u = PplxEmbed.ubinary(fromInt8: row)
+                let head = u.prefix(8).map { String($0) }.joined(separator: ", ")
+                print("  chunk[\(c)]: ubinary bytes=\(u.count) first8=[\(head)]")
+            }
+        }
+    }
+} else {
+    let int8 = try embedder.embed(texts)
+    for (i, row) in int8.enumerated() {
+        let label = String(texts[i].prefix(40))   // first 40 characters of the input, for display only
+        switch format {
+        case .int8:
+            print("[\(i)] \"\(label)\" → \(summarize(row))")
+        case .binary:
+            let b = PplxEmbed.binary(fromInt8: row)
+            let head = b.prefix(8).map { String(Int($0)) }.joined(separator: ", ")
+            print("[\(i)] \"\(label)\" → binary dim=\(b.count) first8=[\(head)]")
+        case .ubinary:
+            let u = PplxEmbed.ubinary(fromInt8: row)
+            let head = u.prefix(8).map { String($0) }.joined(separator: ", ")
+            print("[\(i)] \"\(label)\" → ubinary bytes=\(u.count) first8=[\(head)]")
+        }
+    }
+}
diff --git a/conversion/build_pplx_embed_bundle.py b/conversion/build_pplx_embed_bundle.py
new file mode 100644
index 0000000..dbe1a10
--- /dev/null
+++ b/conversion/build_pplx_embed_bundle.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+"""Build a CoreML bundle for pplx-embed (bidirectional Qwen3 encoder).
+
+Stateless single-forward export — no KV cache, no causal mask. Fixed-length
+(`--max-seq-len`, the bucket) input + pad mask. Variable length is handled by
+padding to the nearest bucket at runtime (fixed shapes keep it on the ANE;
+RangeDim/EnumeratedShapes force CPU fallback).
+
+Output modes:
+    pooled_fp16  masked-mean → (1, 1024) fp16   — readable from the Python bridge;
+                 quantize to int8 downstream. Use for fidelity measurement.
+    int8         masked-mean → tanh → int8       — the deliverable (native int8;
+                 read via the Swift harness on macOS26).
+
+Usage:
+    python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 4096
+    python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 128 \
+        --output-mode pooled_fp16
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import sys
+
+import numpy as np
+import torch
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, ROOT)
+
+from models.qwen3_encoder import (  # noqa: E402
+    Qwen3EncoderConfig,
+    PplxEmbedModel,
+    PplxEmbedContextModel,
+    N_MAX_CHUNKS,
+    load_encoder_weights,
+    apply_fp16_residual_rescale,
+)
+
+# Residual rescale factor (see qwen3_encoder.apply_fp16_residual_rescale).
+# K=8 is the fidelity↔overflow sweet spot: validated overflow-safe (peak |h| ~37k at
+# 455 real tokens, 1.75× under fp16 max) and markedly better than K=16 on short chunks
+# (context mean 0.9987 vs 0.9911). K=4 overflows. Bump toward 16 if long inputs NaN.
+DEFAULT_RESCALE_K = 8.0
+
+
+def _snapshot_dir(hf_repo: str) -> str:
+    from huggingface_hub import snapshot_download
+    return snapshot_download(
+        hf_repo,
+        allow_patterns=["*.json", "*.safetensors", "tokenizer*", "*.txt", "*.py", "1_Pooling/*"],
+    )
+
+
+def build_bundle(
+    hf_repo: str,
+    model_name: str,
+    output_dir: str,
+    max_seq_len: int = 4096,
+    output_mode: str = "int8",
+    rescale_k: float = DEFAULT_RESCALE_K,
+    quantize: str | None = None,
+    variant: str = "plain",
+    dynamic_upper: int = 0,
+    skip_if_exists: bool = True,
+    norm_impl: str = "native",
+) -> str:
+    """Build a CoreML bundle.
+
+    dynamic_upper > 0 → a **flexible RangeDim** model (seq 1..dynamic_upper) targeting the
+    GPU — the non-padded, unbounded-length catch-all for inputs larger than the biggest fixed
+    ANE bucket. (Flexible shapes force CPU fallback on ANE and are ~10× slower than fixed
+    buckets, so this is GPU-only and reserved for >max-bucket inputs.) Otherwise a fixed-shape
+    bucket (the fast ANE path).
+    """
+    import coremltools as ct
+
+    os.makedirs(output_dir, exist_ok=True)
+    pkg = os.path.join(output_dir, "encoder.mlpackage")
+    if skip_if_exists and os.path.exists(pkg):
+        print(f"  [skip] {pkg} exists")
+        return pkg
+
+    dynamic = dynamic_upper > 0
+    print(f"[1/4] Loading {hf_repo} (config + weights; variant={variant}"
+          + (f", dynamic RangeDim 1..{dynamic_upper} GPU" if dynamic else "") + ")")
+    snap = hf_repo if os.path.isdir(hf_repo) else _snapshot_dir(hf_repo)
+    # The bucket (input shape). The RoPE table is built once to a fixed length
+    # (max_position_embeddings) inside Qwen3Encoder._build_rope and gathered to S at
+    # runtime, so it no longer tracks the bucket — that keeps weight.bin byte-identical
+    # across buckets (HF LFS stores one blob). For the dynamic RangeDim model max_seq_len
+    # is informational only (the input is RangeDim 1..dynamic_upper).
+    bucket_len = dynamic_upper if dynamic else max_seq_len
+    cfg = Qwen3EncoderConfig.from_json(os.path.join(snap, "config.json"),
+                                       max_seq_len=bucket_len, norm_impl=norm_impl)
+    if variant == "context":
+        model = PplxEmbedContextModel(cfg, output_mode=output_mode).eval()
+    else:
+        model = PplxEmbedModel(cfg, output_mode=output_mode).eval()
+    load_encoder_weights(model.encoder, snap)
+    if rescale_k:
+        print(f"[1.5/4] fp16 residual rescale 1/K (K={rescale_k})")
+        apply_fp16_residual_rescale(model.encoder, rescale_k)
+
+    if dynamic and variant == "context":
+        raise ValueError("dynamic (RangeDim) mode supports only the plain variant")
+
+    trace_len = min(512, dynamic_upper) if dynamic else max_seq_len
+    print(f"[2/4] Tracing (trace_len={trace_len}, mode={output_mode}, variant={variant}"
+          + (f", RangeDim 1..{dynamic_upper}" if dynamic else "") + ")")
+    sample_ids = torch.zeros((1, trace_len), dtype=torch.int32)
+    sample_mask = torch.ones((1, trace_len), dtype=torch.float16)
+    if dynamic:
+        seqdim = ct.RangeDim(lower_bound=1, upper_bound=dynamic_upper, default=trace_len)
+        inputs = [
+            ct.TensorType(name="input_ids", shape=(1, seqdim), dtype=np.int32),
+            ct.TensorType(name="attention_mask", shape=(1, seqdim), dtype=np.float16),
+        ]
+        trace_args = (sample_ids, sample_mask)
+    else:
+        inputs = [
+            ct.TensorType(name="input_ids", shape=(1, trace_len), dtype=np.int32),
+            ct.TensorType(name="attention_mask", shape=(1, trace_len), dtype=np.float16),
+        ]
+        if variant == "context":
+            sample_pool = torch.zeros((N_MAX_CHUNKS, trace_len), dtype=torch.float16)
+            sample_pool[0, :] = 1.0 / trace_len
+            trace_args = (sample_ids, sample_mask, sample_pool)
+            inputs.append(ct.TensorType(name="pool_matrix", shape=(N_MAX_CHUNKS, trace_len), dtype=np.float16))
+        else:
+            trace_args = (sample_ids, sample_mask)
+    with torch.no_grad():
+        traced = torch.jit.trace(model, trace_args)
+
+    out_dtype = np.int8 if output_mode == "int8" else np.float16
+    # Flexible shapes can't go on ANE (CPU fallback) → GPU; fixed buckets → ANE (ALL picks it).
+    compute_units = ct.ComputeUnit.CPU_AND_GPU if dynamic else ct.ComputeUnit.ALL
+    print(f"[3/4] Converting to CoreML (fp16, macOS26; out={out_dtype.__name__}; "
+          f"units={'CPU_AND_GPU' if dynamic else 'ALL'})")
+    mlmodel = ct.convert(
+        traced,
+        inputs=inputs,
+        outputs=[ct.TensorType(name="embedding", dtype=out_dtype)],
+        minimum_deployment_target=ct.target.macOS26,
+        compute_units=compute_units,
+    )
+
+    if quantize == "int4":
+        op = ct.optimize.coreml.OpPalettizerConfig(nbits=4, granularity="per_grouped_channel", group_size=32)
+        mlmodel = ct.optimize.coreml.palettize_weights(
+            mlmodel, ct.optimize.coreml.OptimizationConfig(global_config=op))
+        print("  applied int4 palettization (group_size=32)")
+    elif quantize == "int8":
+        op = ct.optimize.coreml.OpLinearQuantizerConfig(mode="linear_symmetric", dtype="int8")
+        mlmodel = ct.optimize.coreml.linear_quantize_weights(
+            mlmodel, ct.optimize.coreml.OptimizationConfig(global_config=op))
+        print("  applied int8 weight quantization")
+
+    if os.path.exists(pkg):
+        shutil.rmtree(pkg)
+    mlmodel.save(pkg)
+    size_mb = sum(os.path.getsize(os.path.join(dp, f))
+                  for dp, _, fns in os.walk(pkg) for f in fns) / 1024 / 1024
+    print(f"  saved {pkg} ({size_mb:.1f} MB)")
+
+    _write_model_config(output_dir, model_name, hf_repo, cfg, max_seq_len,
+                        output_mode, rescale_k, quantize, variant, dynamic_upper, norm_impl)
+    _copy_tokenizer(snap, output_dir)
+    print(f"[4/4] bundle ready at {output_dir}")
+    return pkg
+
+
+def _write_model_config(output_dir, model_name, hf_repo, cfg, max_seq_len,
+                        output_mode, rescale_k, quantize, variant="plain", dynamic_upper=0,
+                        norm_impl="native"):
+    dynamic = dynamic_upper > 0
+    out_dtype = "int8" if output_mode == "int8" else "fp16"
+    out_shape = [N_MAX_CHUNKS, 1024] if variant == "context" else [1, 1024]
+    seq_shape = [1, f"1..{dynamic_upper}"] if dynamic else [1, max_seq_len]
+    inputs = {
+        "input_ids": {"shape": seq_shape, "dtype": "int32"},
+        "attention_mask": {"shape": seq_shape, "dtype": "fp16",
+                           "doc": "1.0 for valid tokens, 0.0 for pad"},
+    }
+    if variant == "context":
+        inputs["pool_matrix"] = {"shape": [N_MAX_CHUNKS, max_seq_len], "dtype": "fp16",
+                                 "doc": "row k = 1/n_k over chunk k's span, else 0; unused rows all-zero"}
+    cfgd = {
+        "model_name": model_name,
+        "architecture": "qwen3-encoder",
+        "variant": variant,
+        "tokenizer_repo": hf_repo,
+        "parts": {"encoder": "encoder.mlpackage"},
+        "io_contract": {
+            "inputs": inputs,
+            "outputs": {
+                "embedding": {"shape": out_shape, "dtype": out_dtype,
+                              "doc": ("per-chunk embeddings; read first N_actual rows (unused rows are 0)"
+                                      if variant == "context" else "plain mean-pooled embedding")
+                                     + "; int8 = clamp(round(tanh(x)*127),-128,127)"},
+            },
+        },
+        "hidden_size": cfg.hidden_size,
+        "num_hidden_layers": cfg.num_hidden_layers,
+        "num_attention_heads": cfg.num_attention_heads,
+        "num_key_value_heads": cfg.num_key_value_heads,
+        "head_dim": cfg.head_dim,
+        "intermediate_size": cfg.intermediate_size,
+        "vocab_size": cfg.vocab_size,
+        "rope_theta": cfg.rope_theta,
+        "rms_norm_eps": cfg.rms_norm_eps,
+        "max_seq_len": max_seq_len,
+        "bucket": (f"1..{dynamic_upper}" if dynamic else max_seq_len),
+        "dynamic": dynamic,
+        "dynamic_upper": dynamic_upper if dynamic else 0,
+        "output_mode": output_mode,
+        "fp16_residual_rescale_k": rescale_k,
+        "norm_impl": norm_impl,
+        "pooling": "mean",
+        "quantization_weights": quantize or "fp16",
+        "matryoshka_dims": [1024, 512, 256, 128],
+        # Flexible RangeDim models force CPU fallback on ANE → run on GPU; fixed buckets on ANE.
+        "compute_units": "CPU_AND_GPU" if dynamic else "CPU_AND_NE",
+    }
+    path = os.path.join(output_dir, "model_config.json")
+    with open(path, "w") as f:
+        json.dump(cfgd, f, indent=2)
+    print(f"  wrote {path}")
+
+
+def _copy_tokenizer(snap, output_dir):
+    dst = os.path.join(output_dir, "hf_model")
+    os.makedirs(dst, exist_ok=True)
+    for name in os.listdir(snap):
+        if name.startswith("tokenizer") or name in (
+            "config.json", "special_tokens_map.json", "vocab.json", "merges.txt",
+            "added_tokens.json",
+        ):
+            shutil.copy2(os.path.join(snap, name), os.path.join(dst, name))
+    print(f"  copied tokenizer files → {dst}")
+
+
+def main():
+    from config import MODEL_REGISTRY
+
+    ap = argparse.ArgumentParser(description="Build CoreML bundle for pplx-embed")
+    ap.add_argument("--model", default="pplx-embed", choices=list(MODEL_REGISTRY.keys()))
+    ap.add_argument("--max-seq-len", type=int, default=4096)
+    ap.add_argument("--output-mode", default="int8", choices=["int8", "pooled_fp16"])
+    ap.add_argument("--rescale-k", type=float, default=DEFAULT_RESCALE_K)
+    ap.add_argument("--quantize", default="none", choices=["none", "int8", "int4"])
+    ap.add_argument("--variant", default="auto", choices=["auto", "plain", "context"],
+                    help="auto → context if the model name contains 'context', else plain")
+    ap.add_argument("--dynamic-upper", type=int, default=0,
+                    help="If >0, build a flexible RangeDim (1..N) GPU model (the >max-bucket "
+                         "catch-all), e.g. 8192. Plain only.")
+    ap.add_argument("--hf-dir", default=None, help="Override HF dir (skip download)")
+    ap.add_argument("--output", default=None)
+    ap.add_argument("--no-skip", action="store_true", help="Rebuild even if exists")
+    ap.add_argument("--norm-impl", default="native", choices=["ane_cat", "native"],
+                    help="RMSNorm for the 5 encoder norm sites: native (Qwen3RMSNorm rsqrt, "
+                         "default — 12-21%% faster on ANE per experiment_ane_rmsnorm.py) or "
+                         "ane_cat (shared cat/chunk LayerNorm trick).")
+    args = ap.parse_args()
+
+    reg = MODEL_REGISTRY[args.model]
+    hf_repo = args.hf_dir or reg.hf_repo
+    variant = ("context" if "context" in args.model else "plain") if args.variant == "auto" else args.variant
+    if args.dynamic_upper:
+        tag = f"dyn{args.dynamic_upper}-{args.output_mode}"
+    else:
+        tag = f"L{args.max_seq_len}-{args.output_mode}" + (f"-{args.quantize}" if args.quantize != "none" else "")
+    output = args.output or os.path.join(ROOT, "..", "output", args.model, tag)
+    quantize = None if args.quantize == "none" else args.quantize
+    build_bundle(hf_repo, args.model, output, args.max_seq_len, args.output_mode,
+                 args.rescale_k, quantize, variant=variant, dynamic_upper=args.dynamic_upper,
+                 skip_if_exists=not args.no_skip, norm_impl=args.norm_impl)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/conversion/config.py b/conversion/config.py
index 17beb27..501c8a2 100644
--- a/conversion/config.py
+++ b/conversion/config.py
@@ -68,6 +68,20 @@ class ConversionConfig:
         max_context_length=2048,
         description="EmbeddingGemma 300M - Gemma 3 bidirectional encoder, 768-d sentence embedding (Matryoshka)",
     ),
+    "pplx-embed": ConversionConfig(
+        hf_repo="perplexity-ai/pplx-embed-v1-0.6b",
+        architecture="qwen3-encoder",
+        default_context_length=4096,
+        max_context_length=32768,
+        description="Perplexity pplx-embed-v1 0.6B - bidirectional Qwen3 encoder, mean-pool, 1024-d int8 sentence embedding (plain). trust_remote_code; fixed-shape buckets.",
+    ),
+    "pplx-embed-context": ConversionConfig(
+        hf_repo="perplexity-ai/pplx-embed-context-v1-0.6b",
+        architecture="qwen3-encoder",
+        default_context_length=4096,
+        max_context_length=32768,
+        description="Perplexity pplx-embed-context-v1 0.6B - bidirectional Qwen3 encoder with late chunking (pool_matrix -> per-chunk 1024-d int8). trust_remote_code; fixed-shape buckets.",
+    ),
     "lfm2.5-350m": ConversionConfig(
         hf_repo="LiquidAI/LFM2.5-350M",
         architecture="lfm2",
diff --git a/conversion/experiment_ane_rmsnorm.py b/conversion/experiment_ane_rmsnorm.py
new file mode 100644
index 0000000..bb0d361
--- /dev/null
+++ b/conversion/experiment_ane_rmsnorm.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""A/B: native `Qwen3RMSNorm` vs the shared `ANERMSNorm` cat/chunk trick on the ANE.
+
+Isolates the RMSNorm half of the incidental finding in docs/PPLX_EMBED_GPU_RESIDENCY.md:
+a from-scratch "GPU-native" encoder rebuild ran the *ANE* path ~21% faster (28.6 vs
+36.1 ms at L=256), but it confounded three changes (Conv2d-1×1→Linear, cat/chunk→native
+RMSNorm, layout). This script changes ONLY the RMSNorm (`norm_impl` ∈ {ane_cat, native},
+see models/qwen3_encoder.py) and measures, for each L ∈ {256, 512}:
+
+  * ANE residency  — % of non-const MLProgram ops the static planner puts on the ANE
+                     (reuses audit_ane_residency.py's MLComputePlan pattern). The gate:
+                     native must STAY on the ANE (≈ the ane_cat residency, ~99%).
+  * latency        — CPU_AND_NE warm median of MLModel.predict (the metric that matters;
+                     the ANE path is the one pplx-embed ships).
+  * fidelity       — cosine of the pooled_fp16 output vs the fp32 `Reference` oracle
+                     (gate ≥ 0.99). Built with `--output-mode pooled_fp16` so the Python
+                     CoreML bridge can read the output.
+
+Decision rule (printed at the end): native WINS if it is faster on CPU_AND_NE, keeps
+residency ≥ ~99%, and holds cosine ≥ 0.99 at both L — then make it the pplx-embed
+default. Otherwise keep ane_cat and record the negative result.
+
+Usage:
+    uv run python conversion/experiment_ane_rmsnorm.py
+    uv run python conversion/experiment_ane_rmsnorm.py --lengths 256 512 --iters 30
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+import time
+from collections import Counter
+
+import numpy as np
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, ROOT)
+
+NORM_IMPLS = ("ane_cat", "native")
+
+# A representative multilingual paragraph; repeated/truncated to fill each bucket so the
+# fidelity check exercises a realistic (long, non-trivial) input at every L.
+SAMPLE_TEXT = (
+    "Embeddings map text into dense vectors so that semantically similar passages land "
+    "near each other. 東京は日本の首都であり、世界有数の大都市圏を形成しています。 "
+    "La inteligencia artificial avanza rápido y transforma la búsqueda de información. "
+    "Машинное обучение изменяет способы обработки естественного языка. "
+    "Retrieval-augmented generation grounds large language models in external knowledge."
+)
+
+
+def _build(hf_repo: str, norm_impl: str, L: int, out_root: str) -> str:
+    """Build (or reuse) the plain pooled_fp16 "pplx-embed" encoder for one (norm_impl, L) cell."""
+    from build_pplx_embed_bundle import build_bundle
+
+    # One output directory per cell, e.g. <out_root>/native-L256.
+    cell_dir = os.path.join(out_root, f"{norm_impl}-L{L}")
+    options = dict(max_seq_len=L, output_mode="pooled_fp16", quantize=None, variant="plain",
+                   dynamic_upper=0, skip_if_exists=True, norm_impl=norm_impl)
+    # build_bundle's return value (used by callers as the .mlpackage path) is passed through.
+    return build_bundle(hf_repo, "pplx-embed", cell_dir, **options)
+
+
+def _compile(pkg: str) -> str:
+    """Compile an .mlpackage → .mlmodelc via `xcrun coremlcompiler` (skip if present).
+
+    MLComputePlan.load_from_path wants a *compiled* model; pointed at a raw .mlpackage
+    it hard-aborts (uncatchable C++ exception). Compile once, reuse. The expected output
+    is <package stem>.mlmodelc next to the package, so any package name works.
+    """
+    import subprocess
+
+    out_dir, pkg_name = os.path.split(pkg.rstrip(os.sep))
+    mlmodelc = os.path.join(out_dir, os.path.splitext(pkg_name)[0] + ".mlmodelc")
+    if not os.path.isdir(mlmodelc):
+        subprocess.run(["xcrun", "coremlcompiler", "compile", pkg, out_dir], check=True, capture_output=True)
+    return mlmodelc
+
+
+def _ane_residency(mlmodelc: str) -> tuple[float, int, Counter]:
+    """Tally the planner's device label per non-const op: (ANE %, op count, by-device)."""
+    import coremltools as ct
+    from coremltools.models.compute_plan import MLComputePlan
+    from audit_ane_residency import _iter_mlprogram_ops, _device_label
+
+    plan = MLComputePlan.load_from_path(path=mlmodelc, compute_units=ct.ComputeUnit.CPU_AND_NE)
+
+    def _usage_or_none(op):
+        # Best-effort: an op whose usage lookup raises is labelled via _device_label(None).
+        try:
+            return plan.get_compute_device_usage_for_mlprogram_operation(op)
+        except Exception:
+            return None
+
+    ops = [op for _func, op in _iter_mlprogram_ops(plan.model_structure) if op.operator_name != "const"]
+    by_device: Counter = Counter(_device_label(_usage_or_none(op)) for op in ops)
+    total = len(ops)
+    ane_pct = 100.0 * by_device.get("ANE", 0) / total if total else 0.0
+    return ane_pct, total, by_device
+
+
+def _make_inputs(tokenizer, L: int):
+    """Build one right-padded (1, L) input pair from SAMPLE_TEXT; returns (inputs, n_valid)."""
+    passage = SAMPLE_TEXT
+    # Grow the passage one SAMPLE_TEXT at a time until it tokenizes to at least L tokens;
+    # the tokenizer call below then truncates to max_length=L.
+    while len(tokenizer.encode(passage)) < L:
+        passage = passage + " " + SAMPLE_TEXT
+    encoded = tokenizer([passage], return_tensors="np", truncation=True, max_length=L)
+    token_ids = encoded["input_ids"][0].astype(np.int32)
+    n_valid = int(token_ids.shape[0])
+    padded_ids, padded_mask = np.zeros((1, L), dtype=np.int32), np.zeros((1, L), dtype=np.float16)
+    padded_ids[0, :n_valid] = token_ids
+    padded_mask[0, :n_valid] = 1.0
+    return {"input_ids": padded_ids, "attention_mask": padded_mask}, n_valid
+
+
+def _latency_and_output(pkg: str, inputs: dict, iters: int, warmup: int):
+    """CPU_AND_NE warm latency (median ms, mean ms) + the "embedding" output of the last run."""
+    import coremltools as ct
+
+    m = ct.models.MLModel(pkg, compute_units=ct.ComputeUnit.CPU_AND_NE)
+    out = None
+    for _ in range(warmup):
+        out = m.predict(inputs)
+    times = []
+    for _ in range(iters):
+        t = time.perf_counter()  # monotonic + high resolution; time.time() can jump
+        out = m.predict(inputs)
+        times.append((time.perf_counter() - t) * 1000.0)
+    emb = np.asarray(out["embedding"]).astype(np.float32).reshape(1, -1)
+    return float(np.median(times)), float(np.mean(times)), emb
+
+
+def main() -> int:  # A/B driver: build, audit, time and score every (norm_impl, L) cell
+    ap = argparse.ArgumentParser(description="ANE RMSNorm A/B (native vs cat/chunk)")
+    ap.add_argument("--hf-repo", default=None, help="Override HF repo / local dir")
+    ap.add_argument("--lengths", type=int, nargs="+", default=[256, 512])
+    ap.add_argument("--iters", type=int, default=30)
+    ap.add_argument("--warmup", type=int, default=5)
+    ap.add_argument("--out-root", default=os.path.join(ROOT, "..", "output",
+                                                       "pplx-embed-rmsnorm-ab"))
+    ap.add_argument("--fidelity-gate", type=float, default=0.99)
+    args = ap.parse_args()
+
+    from config import MODEL_REGISTRY
+    hf_repo = args.hf_repo or MODEL_REGISTRY["pplx-embed"].hf_repo
+
+    import pplx_embed_reference as R
+    print(f"[ref] loading fp32 oracle {hf_repo} …")
+    ref = R.Reference(hf_repo)
+    tok = ref.tokenizer  # the oracle's tokenizer also prepares the CoreML inputs
+
+    # results[(impl, L)] = dict(ane_pct, total, latency_med, latency_mean, cosine, n)
+    results: dict[tuple[str, int], dict] = {}
+    for L in args.lengths:
+        inputs, n = _make_inputs(tok, L)
+        # fp32 reference pooled (pre-tanh) for this exact (truncated) input, padding sliced off.
+        import torch
+        ids_t = torch.from_numpy(inputs["input_ids"]).to(torch.long)[:, :n]
+        mask_t = torch.ones((1, n), dtype=torch.float32)
+        with torch.inference_mode():
+            hidden = ref.model(input_ids=ids_t, attention_mask=mask_t).last_hidden_state.float()
+            ref_pooled = R.masked_mean(hidden, mask_t).numpy().astype(np.float32)
+
+        for impl in NORM_IMPLS:
+            print(f"\n=== norm_impl={impl}  L={L}  (n_valid={n}) ===")
+            pkg = _build(hf_repo, impl, L, args.out_root)
+            mlmodelc = _compile(pkg)
+            ane_pct, total, by_dev = _ane_residency(mlmodelc)
+            print(f"  residency: ANE {ane_pct:.2f}%  ({total} ops; "
+                  f"{dict(by_dev)})")
+            med, mean, emb = _latency_and_output(pkg, inputs, args.iters, args.warmup)
+            cos = float(R.cosine_similarity(emb, ref_pooled)[0])
+            print(f"  CPU_AND_NE latency: median {med:.2f} ms  mean {mean:.2f} ms")
+            print(f"  fidelity cosine vs fp32: {cos:.6f}  (gate ≥ {args.fidelity_gate})")
+            results[(impl, L)] = dict(ane_pct=ane_pct, total=total, latency_med=med,
+                                      latency_mean=mean, cosine=cos, n=n)
+
+    # ---- summary + decision -------------------------------------------------
+    print("\n" + "=" * 72)
+    print("SUMMARY  (norm_impl × L)")
+    print("=" * 72)
+    print(f"  {'impl':<8} {'L':>5} {'ANE%':>7} {'lat_med(ms)':>12} {'cosine':>9}")
+    for L in args.lengths:
+        for impl in NORM_IMPLS:
+            r = results[(impl, L)]
+            print(f"  {impl:<8} {L:>5} {r['ane_pct']:>7.2f} {r['latency_med']:>12.2f} "
+                  f"{r['cosine']:>9.5f}")
+
+    print("\nDECISION")
+    native_wins_all = True  # cleared by the first L that fails any gate
+    for L in args.lengths:
+        a = results[("ane_cat", L)]
+        nv = results[("native", L)]
+        speedup = (a["latency_med"] / nv["latency_med"] - 1.0) * 100.0  # > 0 when native is faster
+        faster = nv["latency_med"] < a["latency_med"]  # strict: a tie does not count
+        resident = nv["ane_pct"] >= 0.99 * a["ane_pct"] and nv["ane_pct"] >= 99.0
+        fid_ok = nv["cosine"] >= args.fidelity_gate
+        verdict = "WIN" if (faster and resident and fid_ok) else "no"
+        if verdict != "WIN":
+            native_wins_all = False
+        print(f"  L={L}: native {nv['latency_med']:.2f} vs ane_cat {a['latency_med']:.2f} ms "
+              f"({speedup:+.1f}% {'faster' if faster else 'slower'}); "
+              f"ANE {nv['ane_pct']:.2f}% (resident={resident}); "
+              f"cosine {nv['cosine']:.5f} (ok={fid_ok}) → {verdict}")
+
+    print()
+    if native_wins_all:
+        print("  ✅ native RMSNorm WINS at all L → make norm_impl='native' the pplx-embed "
+              "default (build_pplx_embed_bundle.py / encoder).")
+    else:
+        print("  ❌ native does not clear the gate at every L → keep ane_cat; record the "
+              "negative result. (See per-L lines above.)")
+    return 0  # exit status does not encode the verdict
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/conversion/experiment_batching.py b/conversion/experiment_batching.py
new file mode 100644
index 0000000..09df6d6
--- /dev/null
+++ b/conversion/experiment_batching.py
@@ -0,0 +1,410 @@
+#!/usr/bin/env python3
+"""Rigorous CoreML batching throughput experiment for the pplx-embed encoder.
+
+Question: does batching (B>1) give throughput gains on Apple Silicon, and if the
+earlier quick test showed FLAT docs/sec, is that real (and why) or a measurement
+blind spot?
+
+For each (L, B) we build a pooled_fp16 batched encoder, convert with coremltools
+(shape (B, L)), then load+time it under three compute-unit settings and compute
+per-doc latency = batch_latency / B and docs/sec = B / batch_latency.
+
+We also:
+  - audit the actual compute-device placement (MLComputePlan) of a batched model,
+  - run a control (1 batch-N predict vs N sequential B=1 predicts),
+  - sanity-check that batching is real (distinct input rows -> distinct outputs).
+
+Run:
+    uv run python conversion/experiment_batching.py            # full sweep
+    uv run python conversion/experiment_batching.py --quick    # smaller sweep
+"""
+from __future__ import annotations
+
+import argparse
+import gc
+import os
+import sys
+import time
+from collections import Counter
+
+import numpy as np
+import torch
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, ROOT)
+
+import coremltools as ct  # noqa: E402
+from coremltools.models.compute_plan import MLComputePlan  # noqa: E402
+
+from models.qwen3_encoder import (  # noqa: E402
+    Qwen3EncoderConfig,
+    PplxEmbedModel,
+    load_encoder_weights,
+    apply_fp16_residual_rescale,
+)
+
+HF_REPO = "perplexity-ai/pplx-embed-v1-0.6b"
+RESCALE_K = 8.0
+
+CU = {
+    "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
+    "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
+    "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
+}
+
+
+def snapshot_dir() -> str:
+    """Return HF_REPO if it is a local directory, else the path snapshot_download returns."""
+    if not os.path.isdir(HF_REPO):
+        from huggingface_hub import snapshot_download
+        wanted = ["*.json", "*.safetensors", "tokenizer*", "*.txt", "*.py", "1_Pooling/*"]
+        # Restrict the download to the file patterns listed above.
+        return snapshot_download(HF_REPO, allow_patterns=wanted)
+    return HF_REPO
+
+
+_TORCH_CACHE: dict[int, PplxEmbedModel] = {}
+
+
+def torch_model(snap: str, rope_len: int) -> PplxEmbedModel:
+    """Return the memoized pooled_fp16 torch encoder for `rope_len` (the key ignores `snap`)."""
+    cached = _TORCH_CACHE.get(rope_len)
+    if cached is not None:
+        return cached
+    cfg = Qwen3EncoderConfig.from_json(os.path.join(snap, "config.json"), max_seq_len=rope_len)
+    fresh = PplxEmbedModel(cfg, output_mode="pooled_fp16").eval()
+    load_encoder_weights(fresh.encoder, snap)
+    apply_fp16_residual_rescale(fresh.encoder, RESCALE_K)  # applied after the weights load
+    return _TORCH_CACHE.setdefault(rope_len, fresh)
+
+
+def build_mlpackage(snap: str, L: int, B: int, out_dir: str) -> str:
+    """Trace and convert the pooled_fp16 encoder at fixed shape (B, L); reuse it if on disk.
+
+    Returns the path <out_dir>/enc_L{L}_B{B}.mlpackage. An existing path short-circuits
+    the whole trace/convert step.
+    """
+    target = os.path.join(out_dir, f"enc_L{L}_B{B}.mlpackage")
+    if not os.path.exists(target):
+        os.makedirs(out_dir, exist_ok=True)
+        shape = (B, L)
+        encoder = torch_model(snap, rope_len=L)
+        example = (torch.zeros(shape, dtype=torch.int32), torch.ones(shape, dtype=torch.float16))
+        with torch.no_grad():
+            traced = torch.jit.trace(encoder, example)
+        converted = ct.convert(
+            traced,
+            inputs=[ct.TensorType(name="input_ids", shape=shape, dtype=np.int32),
+                    ct.TensorType(name="attention_mask", shape=shape, dtype=np.float16)],
+            outputs=[ct.TensorType(name="embedding", dtype=np.float16)],
+            minimum_deployment_target=ct.target.macOS26,
+            compute_units=ct.ComputeUnit.ALL,
+        )
+        converted.save(target)
+        del traced, converted  # release the traced and converted models before collecting
+        gc.collect()
+    return target
+
+
+def make_inputs(B: int, L: int, distinct: bool = False) -> dict:
+    """Seeded (seed 0) random feeds: B distinct rows, or one random row tiled B times."""
+    rng = np.random.default_rng(0)
+    n_rows = B if distinct else 1
+    token_ids = rng.integers(1, 5000, size=(n_rows, L)).astype(np.int32)
+    if not distinct:
+        # Identical rows: every batch element carries the same tokens.
+        token_ids = np.repeat(token_ids, B, axis=0).astype(np.int32)
+    return {"input_ids": token_ids, "attention_mask": np.ones((B, L), dtype=np.float16)}
+
+
+def time_model(mlmodel, feeds: dict, n_warm: int = 3, n_runs: int = 8) -> dict:
+    """Time `mlmodel.predict(feeds)`: n_warm untimed calls, then n_runs timed ones.
+    Returns median_s / min_s / max_s in seconds plus "runs", the ascending samples.
+    The median is the element at index len // 2 (upper middle for an even count).
+    """
+    def _timed_call() -> float:
+        start = time.perf_counter()
+        mlmodel.predict(feeds)
+        return time.perf_counter() - start
+
+    for _ in range(n_warm):
+        mlmodel.predict(feeds)
+    ordered = sorted(_timed_call() for _ in range(n_runs))
+    return {"median_s": ordered[len(ordered) // 2], "min_s": ordered[0],
+            "max_s": ordered[-1], "runs": ordered}
+
+
+def device_label(usage) -> str:
+    """Classify a device-usage record as "ANE", "GPU", "CPU", "unknown", or a raw type name.
+
+    The label is derived from the class name of the usage's preferred device.
+    """
+    preferred = None
+    if usage is not None:
+        preferred = getattr(usage, "preferred_compute_device", None) or getattr(usage, "preferred", None)
+    if preferred is None:
+        return "unknown"
+    type_name = type(preferred).__name__
+    # First matching rule wins, so the ANE substrings are checked before "GPU" and "CPU".
+    rules = (("ANE", ("Neural", "ANE")), ("GPU", ("GPU",)), ("CPU", ("CPU",)))
+    return next((tag for tag, keys in rules if any(k in type_name for k in keys)), type_name)
+
+
+def _iter_ops(ms):
+    """Yield every op of every function in `ms.program`, descending into nested blocks.
+
+    Yields nothing when `ms` has no `program` attribute (or it is None).
+    """
+    program = getattr(ms, "program", None)
+    for func in (program.functions.values() if program is not None else ()):
+        pending = [func.block]
+        while pending:
+            for op in pending.pop().operations:
+                yield op
+                pending.extend(getattr(op, "blocks", ()) or ())
+
+
+_COMPILE_CACHE: dict[str, str] = {}
+
+
+def _compiled_path(pkg: str) -> str:
+    """Compile an .mlpackage to a persistent .mlmodelc once (MLComputePlan needs
+    a compiled model, and the temp one from get_compiled_model_path() is deleted
+    when its MLModel is GC'd — so copy it next to the package; memoized per pkg)."""
+    if pkg in _COMPILE_CACHE:
+        return _COMPILE_CACHE[pkg]
+    import shutil
+    m = ct.models.MLModel(pkg, compute_units=ct.ComputeUnit.CPU_ONLY)
+    tmp = m.get_compiled_model_path()
+    dst = os.path.splitext(pkg.rstrip(os.sep))[0] + ".mlmodelc"  # swap only the final suffix
+    if os.path.exists(dst):
+        shutil.rmtree(dst)   # dst can no longer equal pkg, so the source is never removed
+    shutil.copytree(tmp, dst)   # copy before `m` is GC'd / tmp is cleaned
+    del m
+    _COMPILE_CACHE[pkg] = dst
+    return dst
+
+
+def audit_devices(pkg: str, compute_unit: ct.ComputeUnit) -> Counter:
+    """Count non-const ops per planned device label for `pkg` under `compute_unit`.
+    Paths ending in ".mlpackage" are compiled first via _compiled_path."""
+    compiled = pkg if not pkg.endswith(".mlpackage") else _compiled_path(pkg)
+    plan = MLComputePlan.load_from_path(path=compiled, compute_units=compute_unit)
+    tally = Counter()
+    for op in _iter_ops(plan.model_structure):
+        if op.operator_name != "const":
+            try:
+                usage = plan.get_compute_device_usage_for_mlprogram_operation(op)
+            except Exception:
+                usage = None  # lookup failed: label comes from device_label(None)
+            tally[device_label(usage)] += 1
+    return tally
+
+
+def fmt_devs(c: Counter) -> str:
+    total = sum(c.values()) or 1  # `or 1` keeps an empty tally from dividing by zero
+    return ", ".join([f"{d}:{n}({100*n/total:.0f}%)" for d, n in c.most_common()])
+
+
+class _Tee:
+    """Minimal write/flush fan-out: whatever is written goes to every wrapped stream."""
+    def __init__(self, *streams):
+        self.streams = streams
+    def flush(self):
+        for sink in self.streams:
+            sink.flush()
+    def write(self, s):
+        for sink in self.streams:
+            sink.write(s); sink.flush()  # flush each stream right after its write
+
+
+def main():
+    """Build every (L, B) package, time each per compute unit, then audit / control / sanity."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--quick", action="store_true", help="smaller sweep (B up to 16)")
+    ap.add_argument("--runs", type=int, default=8)
+    ap.add_argument("--out-dir", default=os.path.join(ROOT, "experiments", "batching_models"))
+    ap.add_argument("--log", default=os.path.join(ROOT, "experiments", "batching_out.log"))
+    args = ap.parse_args()
+    os.makedirs(os.path.dirname(os.path.abspath(args.log)), exist_ok=True)  # bare filename: dirname is ""
+    _logf = open(args.log, "w")
+    sys.stdout = _Tee(sys.__stdout__, _logf)
+
+    Ls = [128, 512]
+    Bs = [1, 4, 16] if args.quick else [1, 4, 16, 64]
+    units = ["CPU_AND_NE", "CPU_AND_GPU", "CPU_ONLY"]
+
+    snap = snapshot_dir()
+    print(f"snapshot: {snap}")
+    print(f"Ls={Ls} Bs={Bs} units={units} runs={args.runs}\n")
+
+    # ---- Build all needed mlpackages first (one torch model per L) ----
+    pkgs: dict[tuple[int, int], str] = {}
+    for L in Ls:
+        for B in Bs:
+            print(f"[build] L={L} B={B} ...", flush=True)
+            pkgs[(L, B)] = build_mlpackage(snap, L, B, args.out_dir)
+    # free torch
+    _TORCH_CACHE.clear()
+    gc.collect()
+
+    # ---- Timing sweep ----
+    # rows: (L, B, unit) -> result
+    results = {}
+    for unit in units:
+        for L in Ls:
+            for B in Bs:
+                pkg = pkgs[(L, B)]
+                try:
+                    m = ct.models.MLModel(pkg, compute_units=CU[unit])
+                except Exception as e:
+                    print(f"  load FAIL L={L} B={B} {unit}: {e}")
+                    continue
+                feeds = make_inputs(B, L, distinct=False)
+                try:
+                    r = time_model(m, feeds, n_runs=args.runs)
+                except Exception as e:
+                    print(f"  predict FAIL L={L} B={B} {unit}: {e}")
+                    del m
+                    gc.collect()
+                    continue
+                bl = r["median_s"]
+                results[(L, B, unit)] = {
+                    "batch_lat_ms": bl * 1e3,
+                    "per_doc_ms": bl / B * 1e3,
+                    "docs_per_s": B / bl,
+                }
+                print(f"  L={L:4d} B={B:3d} {unit:12s} "
+                      f"batch={bl*1e3:8.2f}ms  per-doc={bl/B*1e3:7.3f}ms  "
+                      f"docs/s={B/bl:8.2f}")
+                del m
+                gc.collect()
+
+    # ---- Print tables ----
+    print("\n\n========== docs/sec  (rows=B, cols=unit) ==========")
+    for L in Ls:
+        print(f"\n--- L={L} ---")
+        header = "  B   " + "".join(f"{u:>14s}" for u in units)
+        print(header)
+        for B in Bs:
+            row = f"{B:4d}  "
+            for u in units:
+                r = results.get((L, B, u))
+                row += f"{r['docs_per_s']:14.2f}" if r else f"{'-':>14s}"
+            print(row)
+
+    print("\n\n========== per-doc latency ms  (rows=B, cols=unit) ==========")
+    for L in Ls:
+        print(f"\n--- L={L} ---")
+        header = "  B   " + "".join(f"{u:>14s}" for u in units)
+        print(header)
+        for B in Bs:
+            row = f"{B:4d}  "
+            for u in units:
+                r = results.get((L, B, u))
+                row += f"{r['per_doc_ms']:14.3f}" if r else f"{'-':>14s}"
+            print(row)
+
+    # ---- Speedup vs B=1 (docs/sec ratio) ----
+    print("\n\n========== batch speedup = docs/s(B) / docs/s(B=1) ==========")
+    for L in Ls:
+        print(f"\n--- L={L} ---")
+        header = "  B   " + "".join(f"{u:>14s}" for u in units)
+        print(header)
+        for B in Bs:
+            row = f"{B:4d}  "
+            for u in units:
+                r = results.get((L, B, u))
+                r1 = results.get((L, 1, u))
+                if r and r1:
+                    row += f"{r['docs_per_s']/r1['docs_per_s']:13.2f}x"
+                else:
+                    row += f"{'-':>14s}"
+            print(row)
+
+    # ---- Device audit for B=64 (or max B), L=128 ----
+    maxB = Bs[-1]
+    print(f"\n\n========== DEVICE PLACEMENT (L=128, B={maxB}) ==========")
+    pkg = pkgs[(128, maxB)]
+    for unit in units:
+        try:
+            c = audit_devices(pkg, CU[unit])
+            print(f"  requested {unit:12s} -> {fmt_devs(c)}")
+        except Exception as e:
+            print(f"  audit {unit} failed: {e}")
+    # also B=1 L=128 for contrast
+    print(f"\n  (contrast) L=128 B=1:")
+    for unit in units:
+        try:
+            c = audit_devices(pkgs[(128, 1)], CU[unit])
+            print(f"  requested {unit:12s} -> {fmt_devs(c)}")
+        except Exception as e:
+            print(f"  audit {unit} failed: {e}")
+
+    # ---- Control: 1x batch-N vs N x batch-1, per compute unit ----
+    print(f"\n\n========== CONTROL: batch-N predict vs N sequential B=1 ==========")
+    for unit in units:
+        for L in Ls:
+            B = maxB
+            mN = ct.models.MLModel(pkgs[(L, B)], compute_units=CU[unit])
+            m1 = ct.models.MLModel(pkgs[(L, 1)], compute_units=CU[unit])
+            feedsN = make_inputs(B, L, distinct=True)
+            feeds1_list = [
+                {"input_ids": feedsN["input_ids"][i:i+1],
+                 "attention_mask": feedsN["attention_mask"][i:i+1]}
+                for i in range(B)
+            ]
+            # warm
+            for _ in range(2):
+                mN.predict(feedsN)
+                m1.predict(feeds1_list[0])
+            # batch-N
+            tb = []
+            for _ in range(5):
+                t0 = time.perf_counter()
+                mN.predict(feedsN)
+                tb.append(time.perf_counter() - t0)
+            tb.sort(); batchN = tb[len(tb)//2]
+            # N sequential
+            ts = []
+            for _ in range(3):
+                t0 = time.perf_counter()
+                for f in feeds1_list:
+                    m1.predict(f)
+                ts.append(time.perf_counter() - t0)
+            ts.sort(); seqN = ts[len(ts)//2]
+            print(f"  {unit:12s} L={L:4d} B={B}:  batch-N={batchN*1e3:8.1f}ms   "
+                  f"{B}x(B=1)={seqN*1e3:8.1f}ms   speedup={seqN/batchN:5.2f}x")
+            del mN, m1
+            gc.collect()
+
+    # ---- Sanity: distinct rows -> distinct outputs ----
+    print(f"\n\n========== SANITY: batching is real (distinct rows) ==========")
+    L, B = 128, min(4, maxB)
+    m = ct.models.MLModel(pkgs[(L, B)], compute_units=ct.ComputeUnit.CPU_AND_NE)
+    feeds = make_inputs(B, L, distinct=True)
+    out = m.predict(feeds)["embedding"]
+    out = np.asarray(out)
+    print(f"  output shape: {out.shape}")
+    # pairwise check rows differ
+    allsame = True
+    for i in range(B):
+        for j in range(i+1, B):
+            d = float(np.abs(out[i] - out[j]).max())
+            if d > 1e-4:
+                allsame = False
+            print(f"  row {i} vs {j}: max|diff|={d:.4f}")
+    print(f"  -> distinct inputs give {'DUPLICATE (BROADCAST BUG!)' if allsame else 'DISTINCT'} outputs")
+    # also confirm a single row matches the B=1 model on same input
+    m1 = ct.models.MLModel(pkgs[(L, 1)], compute_units=ct.ComputeUnit.CPU_AND_NE)
+    o1 = np.asarray(m1.predict({"input_ids": feeds["input_ids"][0:1],
+                                "attention_mask": feeds["attention_mask"][0:1]})["embedding"])
+    d01 = float(np.abs(out[0] - o1[0]).max())
+    print(f"  batch row0 vs B=1 model on same input: max|diff|={d01:.4f} "
+          f"({'MATCH' if d01 < 0.05 else 'MISMATCH'})")
+
+    print("\nDONE.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/conversion/experiment_w8a8.py b/conversion/experiment_w8a8.py
new file mode 100644
index 0000000..6d0b814
--- /dev/null
+++ b/conversion/experiment_w8a8.py
@@ -0,0 +1,383 @@
+#!/usr/bin/env python3
+"""W8A8 (int8 weights + int8 ACTIVATIONS) viability probe for pplx-embed.
+
+Milestone B4. WEIGHT-only quant is a dead end for this encoder (int8 linear ~0.42
+cosine; int4 palettize ~0.905) *and* only buys 4-8% latency, because the model is
+activation/compute-bound (fp16 attention), not weight-bandwidth-bound. The real
+bandwidth lever is ACTIVATION quantization. This script answers empirically: can
+W8A8 reach acceptable fidelity, or does it hit the attention-family wall (~cos 0.57)?
+
+Pipeline (per the coremltools activation-quant flow):
+  1. Build an fp16 pooled_fp16 encoder at a SMALL bucket (L=128/256) — output_mode
+     "pooled_fp16" so the CoreML pooled vector is Python-readable on macOS26.
+  2. Calibrate activation ranges on a small multilingual corpus (tokenized + padded
+     to the bucket) via cto.experimental.linear_quantize_activations.
+  3. Quantize weights int8 (linear_symmetric) on top -> W8A8.
+  4. Predict on the eval texts, quantize the pooled output with int8_tanh_quant, and
+     compute cosine vs the fp32 Reference oracle. Report mean/min.
+
+Two activation-quant modes are exposed because the attention pad-mask uses a large
+negative sentinel (Qwen3Encoder.NEG_INF = -1e4; CoreML may lower the mask add to the
+fp16 -65504 floor). A SYMMETRIC activation quantizer maps that catastrophically:
+scale = 1e4/127 ~= 79, so real attention scores (+-10) round to ~0 and the model
+collapses. ASYMMETRIC (mode="linear") lets the range span [-1e4, +score]; when the
+span overflows fp16 the scale goes inf and coremltools SKIPS that op (left in fp16) —
+which is exactly what we want for the mask add, while every other activation
+quantizes normally.
+
+Usage:
+  uv run python conversion/experiment_w8a8.py --bucket 128 --mode asymmetric --rescale-k 8
+  uv run python conversion/experiment_w8a8.py --bucket 128 --mode symmetric --rescale-k 8
+  uv run python conversion/experiment_w8a8.py --bucket 128 --mode asymmetric --rescale-k 0   # no rescale
+  uv run python conversion/experiment_w8a8.py --all   # sweep the key variants + baselines
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import shutil
+import sys
+
+import numpy as np
+import torch
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, ROOT)
+sys.path.insert(0, os.path.join(ROOT, "models"))
+
+import coremltools as ct  # noqa: E402
+import coremltools.optimize.coreml as cto  # noqa: E402
+
+from models.qwen3_encoder import (  # noqa: E402
+    Qwen3EncoderConfig,
+    PplxEmbedModel,
+    load_encoder_weights,
+    apply_fp16_residual_rescale,
+)
+from pplx_embed_reference import Reference, int8_tanh_quant, cosine_similarity  # noqa: E402
+
+HF_REPO = "perplexity-ai/pplx-embed-v1-0.6b"
+fp16_mil = ct.converters.mil.mil.types.fp16
+
+
+# --------------------------------------------------------------------------- #
+# coremltools compatibility / behaviour patches.
+# --------------------------------------------------------------------------- #
+def _patch_coremltools_cast() -> None:
+    """Install a numpy>=2-safe replacement for the torch frontend's `_cast`.
+
+    coremltools 9 _cast() folds const int/bool casts but calls int()/bool() on
+    numpy>=2 (1,)-shaped arrays, which raises. Extract the Python scalar first.
+
+    Side effect: rebinds `_cast` on the imported
+    `coremltools.converters.mil.frontend.torch.ops` module object, so the patch
+    applies process-wide once this function has run.
+    """
+    from coremltools.converters.mil.frontend.torch import ops as _torch_ops
+    from coremltools.converters.mil.frontend.torch.ops import _get_inputs
+    from coremltools.converters.mil.mil import Builder as mb
+
+    def _cast_patched(context, node, dtype, dtype_name):
+        # `dtype` is used both as a constructor and as an isinstance() target;
+        # `dtype_name` is what gets forwarded to mb.cast for the non-const path.
+        inputs = _get_inputs(context, node, expected=1)
+        x = inputs[0]
+        # Only rank-0 values or tensors whose every dimension is 1 are accepted.
+        if not (len(x.shape) == 0 or np.all([d == 1 for d in x.shape])):
+            raise ValueError("input to cast must be either a scalar or a length 1 tensor")
+        if x.can_be_folded_to_const():
+            val = x.val
+            # The actual fix: unwrap an ndarray with >=1 dims into a Python scalar
+            # before dtype(val) is called on it.
+            if isinstance(val, np.ndarray) and val.ndim >= 1:
+                val = val.item()
+            if not isinstance(val, dtype):
+                res = mb.const(val=dtype(val), name=node.name)
+            else:
+                res = mb.const(val=val, name=node.name)
+        elif len(x.shape) > 0:
+            # Non-const tensor with at least one dim: squeeze, then emit a cast op.
+            x = mb.squeeze(x=x, name=node.name + "_item")
+            res = mb.cast(x=x, dtype=dtype_name, name=node.name)
+        else:
+            # Non-const rank-0 value: cast directly.
+            res = mb.cast(x=x, dtype=dtype_name, name=node.name)
+        context.add(res, node.name)
+
+    # Rebind the module attribute so later lookups of `_torch_ops._cast` get the patch.
+    _torch_ops._cast = _cast_patched
+
+
+def _patch_coremltools_act_quant() -> None:
+    """Make activation quantization skip ops whose `x` input is not floating point.
+
+    insert_prefix_quantize_dequantize_pair tries to wrap every supported op with a
+    quantize/dequantize pair, including ops whose input x is int32 (mask add / expand,
+    embedding path). MIL `quantize` requires float input, so int32-input ops crash with
+    'scale has dtype fp32 whereas input has dtype int32'.
+
+    Side effect: replaces `transform_op` on the pass class for the whole process.
+    """
+    from coremltools.optimize.coreml import _quantization_passes
+    from coremltools.converters.mil.mil import types as mil_types
+
+    pass_cls = _quantization_passes.insert_prefix_quantize_dequantize_pair
+    original_transform = pass_cls.transform_op
+
+    def _float_inputs_only(self, op):
+        x_input = op.inputs.get("x")
+        # Ops without an `x` input, or with a float `x`, go through the stock pass.
+        if x_input is None or mil_types.is_float(x_input.dtype):
+            return original_transform(self, op)
+        # Non-float `x`: leave the op untouched.
+        return None
+
+    pass_cls.transform_op = _float_inputs_only
+
+
+_patch_coremltools_cast()
+_patch_coremltools_act_quant()
+
+
+# --------------------------------------------------------------------------- #
+# Calibration / eval corpus.
+# --------------------------------------------------------------------------- #
+CALIBRATION_TEXTS = [
+    "The transformer architecture has revolutionized natural language processing.",
+    "Apple Silicon's Neural Engine achieves high energy efficiency for ML workloads.",
+    "Bidirectional attention lets every token attend to every other token.",
+    "Retrieval-augmented generation grounds responses in external knowledge.",
+    "El procesamiento del lenguaje natural ha avanzado mucho en los ultimos anos.",
+    "Le modele encode chaque phrase en un vecteur dense de grande dimension.",
+    "Maschinelles Lernen ermoglicht effiziente Inferenz direkt auf dem Geraet.",
+    "深層学習はテキストを密なベクトル表現に変換します。",
+    "向量检索通过余弦相似度衡量语义相关性。",
+    "machine learning",
+    "natural language processing",
+    "Cosine similarity measures semantic relatedness between embedding vectors.",
+    "Late chunking encodes long documents with a single bidirectional forward pass.",
+    "The int8 quantization of embeddings reduces memory bandwidth and latency.",
+]
+
+EVAL_TEXTS = [
+    "Quantum computing leverages superposition and entanglement for computation.",
+    "La inteligencia artificial transforma la manera en que trabajamos.",
+    "Les reseaux de neurones apprennent des representations hierarchiques.",
+    "Neuronale Netze lernen hierarchische Merkmalsrepraesentationen.",
+    "気候変動は地球規模で生態系に影響を与えています。",
+    "知识图谱将实体和关系组织成结构化的网络。",
+    "Vector databases enable fast approximate nearest neighbor search at scale.",
+    "Photosynthesis converts light energy into chemical energy in plants.",
+    "The stock market reacted sharply to the central bank's announcement.",
+    "Renewable energy sources are critical to mitigating climate change.",
+    "embeddings",
+    "A short multilingual sentence. Une phrase courte. Ein kurzer Satz.",
+]
+
+
+# --------------------------------------------------------------------------- #
+# Tokenization + padding to a fixed bucket.
+# --------------------------------------------------------------------------- #
+def tokenize_padded(tokenizer, texts: list[str], bucket: int) -> list[dict]:
+    """Tokenize each text on its own and right-pad it to exactly `bucket` tokens.
+
+    Texts longer than `bucket` are truncated by the tokenizer. Returns one dict per
+    text with `input_ids` as int32 [1, bucket] and `attention_mask` as fp16
+    [1, bucket] (1 for a real token, 0 for padding). The pad id is the tokenizer's
+    `pad_token_id`, falling back to 0 when that is unset.
+    """
+    pad_id = tokenizer.pad_token_id or 0
+    samples = []
+    for text in texts:
+        encoded = tokenizer([text], padding=False, truncation=True, max_length=bucket,
+                            return_tensors="np")
+        token_ids = encoded["input_ids"].astype(np.int32)
+        valid_mask = encoded["attention_mask"].astype(np.float16)
+        deficit = bucket - token_ids.shape[1]
+        if deficit > 0:
+            # Pad only on the right of the sequence axis; the batch axis stays at 1.
+            right_only = ((0, 0), (0, deficit))
+            token_ids = np.pad(token_ids, right_only, mode="constant", constant_values=pad_id)
+            valid_mask = np.pad(valid_mask, right_only, mode="constant", constant_values=0)
+        samples.append({"input_ids": token_ids, "attention_mask": valid_mask})
+    return samples
+
+
+# --------------------------------------------------------------------------- #
+# Build.
+# --------------------------------------------------------------------------- #
+def build_fp16_encoder(snap: str, bucket: int, rescale_k: float) -> ct.models.MLModel:
+    """Trace the pooled_fp16 encoder at a fixed [1, bucket] shape and convert it.
+
+    snap:      local checkpoint directory holding `config.json` and the weights.
+    bucket:    fixed sequence length used for both the trace and the CoreML inputs.
+    rescale_k: when truthy, passed to `apply_fp16_residual_rescale`; 0 skips it.
+    """
+    config_path = os.path.join(snap, "config.json")
+    encoder_cfg = Qwen3EncoderConfig.from_json(config_path, max_seq_len=bucket)
+    net = PplxEmbedModel(encoder_cfg, output_mode="pooled_fp16").eval()
+    load_encoder_weights(net.encoder, snap)
+    if rescale_k:
+        apply_fp16_residual_rescale(net.encoder, rescale_k)
+
+    shape = (1, bucket)
+    example_inputs = (
+        torch.zeros(shape, dtype=torch.int32),
+        torch.ones(shape, dtype=torch.float16),
+    )
+    with torch.no_grad():
+        traced = torch.jit.trace(net, example_inputs)
+
+    return ct.convert(
+        traced,
+        inputs=[
+            ct.TensorType(name="input_ids", shape=shape, dtype=np.int32),
+            ct.TensorType(name="attention_mask", shape=shape, dtype=np.float16),
+        ],
+        outputs=[ct.TensorType(name="embedding", dtype=np.float16)],
+        minimum_deployment_target=ct.target.macOS26,
+        compute_units=ct.ComputeUnit.ALL,
+    )
+
+
+def quantize_w8a8(fp16_model: ct.models.MLModel, calib: list[dict], mode: str) -> ct.models.MLModel:
+    """Quantize activations (calibrated on `calib`), then weights, to int8.
+
+    mode: 'asymmetric' -> activation mode='linear'; 'symmetric' -> 'linear_symmetric'.
+    Weights are always quantized with `linear_symmetric` int8.
+
+    Raises:
+        ValueError: if `mode` is neither 'asymmetric' nor 'symmetric'. Previously any
+            other string (e.g. a typo) silently selected symmetric quantization, which
+            would mislabel the experiment's result.
+    """
+    act_modes = {"asymmetric": "linear", "symmetric": "linear_symmetric"}
+    if mode not in act_modes:
+        raise ValueError(
+            f"unknown activation-quant mode {mode!r}; expected one of {sorted(act_modes)}"
+        )
+    act_cfg = cto.OptimizationConfig(
+        global_config=cto.experimental.OpActivationLinearQuantizerConfig(mode=act_modes[mode]),
+    )
+    model_a8 = cto.experimental.linear_quantize_activations(fp16_model, act_cfg, calib)
+
+    w_cfg = cto.OptimizationConfig(
+        global_config=cto.OpLinearQuantizerConfig(
+            mode="linear_symmetric", dtype=np.int8, weight_threshold=512,
+        )
+    )
+    return cto.linear_quantize_weights(model_a8, w_cfg)
+
+
+# --------------------------------------------------------------------------- #
+# Measure.
+# --------------------------------------------------------------------------- #
+def predict_pooled(mlmodel: ct.models.MLModel, samples: list[dict]) -> np.ndarray:
+    """Run `mlmodel` once per sample and stack the flattened float32 `embedding`
+    outputs into a single [N, D] array (one row per sample, in input order)."""
+
+    def _embed(sample: dict) -> np.ndarray:
+        feed = {"input_ids": sample["input_ids"], "attention_mask": sample["attention_mask"]}
+        prediction = mlmodel.predict(feed)
+        return np.asarray(prediction["embedding"], dtype=np.float32).reshape(-1)
+
+    return np.stack([_embed(sample) for sample in samples], axis=0)  # [N, 1024]
+
+
+def fidelity(pooled_fp16: np.ndarray, ref_int8: np.ndarray) -> tuple[float, float, np.ndarray]:
+    """Cosine fidelity of CoreML pooled outputs against the reference int8 embeddings.
+
+    The pooled vectors are passed through `int8_tanh_quant`, then compared row by row
+    with `ref_int8` using `cosine_similarity`. Non-finite cosines are dropped before
+    aggregating.
+
+    Returns:
+        (mean, min, cos) where `cos` holds only the finite per-text cosines, so it may
+        be shorter than the input and its positions then no longer line up with the
+        input texts. If no cosine is finite (e.g. the quantized model emitted NaN for
+        every text), mean and min are NaN and `cos` is empty, instead of `min()`
+        raising on an empty array.
+    """
+    cm_int8 = int8_tanh_quant(pooled_fp16).astype(np.float32)
+    cos = cosine_similarity(cm_int8, ref_int8.astype(np.float32))
+    cos = cos[np.isfinite(cos)]
+    if cos.size == 0:
+        # A fully collapsed model is a valid (bad) experimental result, not a crash.
+        return float("nan"), float("nan"), cos
+    return float(cos.mean()), float(cos.min()), cos
+
+
+# --------------------------------------------------------------------------- #
+# Driver.
+# --------------------------------------------------------------------------- #
+def run_variant(snap, tokenizer, ref_int8, bucket, mode, rescale_k, out_root, save=True):
+    """Build, quantize and score one W8A8 variant; optionally save the package.
+
+    Args:
+        snap: local checkpoint directory passed to `build_fp16_encoder`.
+        tokenizer: tokenizer used to pad the calibration and eval corpora to `bucket`.
+        ref_int8: reference int8 embeddings for `EVAL_TEXTS`.
+        bucket: fixed sequence length.
+        mode: activation-quant mode, 'asymmetric' or 'symmetric'.
+        rescale_k: residual rescale factor (0 disables the rescale).
+        out_root: directory that receives `<label>.mlpackage` when `save` is true.
+        save: whether to write the quantized model to disk.
+
+    Returns:
+        dict with the variant label, its settings, fp16 and W8A8 mean/min cosine, and
+        the saved package path (`None` when not saved).
+    """
+    # `:g` keeps integral k identical to the previous labels ("k8", "k0", "k16") while
+    # keeping fractional k distinct. int(rescale_k) mapped e.g. 0.5 and 0 to the same
+    # "k0", so one run silently overwrote the other's saved .mlpackage.
+    label = f"w8a8-{mode}-k{rescale_k:g}-L{bucket}"
+    print(f"\n{'='*70}\n{label}\n{'='*70}", flush=True)
+
+    fp16_model = build_fp16_encoder(snap, bucket, rescale_k)
+    calib = tokenize_padded(tokenizer, CALIBRATION_TEXTS, bucket)
+    eval_samples = tokenize_padded(tokenizer, EVAL_TEXTS, bucket)
+
+    # fp16 baseline fidelity (same graph, no quant) for reference.
+    fp16_pooled = predict_pooled(fp16_model, eval_samples)
+    fp16_mean, fp16_min, _ = fidelity(fp16_pooled, ref_int8)
+    print(f"  fp16 baseline:  mean={fp16_mean:.4f}  min={fp16_min:.4f}", flush=True)
+
+    print(f"  quantizing W8A8 (activation mode={mode}) ...", flush=True)
+    w8a8 = quantize_w8a8(fp16_model, calib, mode)
+    w8a8_pooled = predict_pooled(w8a8, eval_samples)
+    w8a8_mean, w8a8_min, cos = fidelity(w8a8_pooled, ref_int8)
+    print(f"  W8A8 {mode:10s}:  mean={w8a8_mean:.4f}  min={w8a8_min:.4f}", flush=True)
+    print(f"    per-text cos: {np.round(cos, 3).tolist()}", flush=True)
+
+    pkg = None
+    if save:
+        pkg = os.path.join(out_root, f"{label}.mlpackage")
+        # An .mlpackage is a directory; remove a stale one before saving over it.
+        if os.path.exists(pkg):
+            shutil.rmtree(pkg)
+        os.makedirs(out_root, exist_ok=True)
+        w8a8.save(pkg)
+        print(f"  saved {pkg}", flush=True)
+
+    return {
+        "label": label, "bucket": bucket, "mode": mode, "rescale_k": rescale_k,
+        "fp16_mean": fp16_mean, "fp16_min": fp16_min,
+        "w8a8_mean": w8a8_mean, "w8a8_min": w8a8_min, "pkg": pkg,
+    }
+
+
+def audit_ane(pkg: str) -> None:
+    """Compile a .mlpackage and report ANE/CPU/GPU op residency (no xcrun needed).
+
+    The compiled model is written next to the package as `<pkg>.mlmodelc`, replacing
+    any existing directory of that name. `const` ops are excluded from the tally. An
+    op whose device usage cannot be queried is counted as "unknown".
+
+    Raises:
+        ValueError: if the compute plan exposes no ML Program structure (previously
+            this surfaced as an AttributeError on `None`).
+    """
+    from collections import Counter
+    from coremltools.models.utils import compile_model
+    from coremltools.models.compute_plan import MLComputePlan
+
+    mlc = pkg.rstrip("/") + ".mlmodelc"
+    if os.path.exists(mlc):
+        shutil.rmtree(mlc)
+    compiled = compile_model(pkg, mlc)
+    print(f"\n=== ANE audit: {compiled} ===", flush=True)
+    plan = MLComputePlan.load_from_path(path=compiled, compute_units=ct.ComputeUnit.CPU_AND_NE)
+    ms = plan.model_structure
+    prog = getattr(ms, "program", None)
+    if prog is None:
+        raise ValueError(f"{pkg} is not an ML Program model; per-op ANE audit is unavailable")
+    by_dev = Counter()
+    by_op_dev = Counter()
+    total = 0
+
+    def walk(block):
+        # Depth-first over the block's ops, descending into any nested blocks.
+        for op in block.operations:
+            yield op
+            for nb in getattr(op, "blocks", ()) or ():
+                yield from walk(nb)
+
+    for func in prog.functions.values():
+        for op in walk(func.block):
+            if op.operator_name == "const":
+                continue
+            try:
+                usage = plan.get_compute_device_usage_for_mlprogram_operation(op)
+                pref = getattr(usage, "preferred_compute_device", None) or getattr(usage, "preferred", None)
+                # Classify by the device object's class name.
+                name = type(pref).__name__ if pref is not None else "unknown"
+                dev = "ANE" if ("Neural" in name or "ANE" in name) else ("GPU" if "GPU" in name else ("CPU" if "CPU" in name else name))
+            except Exception:
+                # Deliberate best-effort: one unqueryable op must not abort the audit.
+                dev = "unknown"
+            by_dev[dev] += 1
+            by_op_dev[(op.operator_name, dev)] += 1
+            total += 1
+    print(f"  total ops: {total}")
+    # by_dev is empty whenever total == 0, so the division below cannot hit zero.
+    for dev, n in sorted(by_dev.items(), key=lambda kv: -kv[1]):
+        print(f"    {dev}: {n}  ({100.0*n/total:.1f}%)")
+    non_ane = [(o, d, n) for (o, d), n in by_op_dev.items() if d != "ANE"]
+    if non_ane:
+        print("  non-ANE ops:")
+        for o, d, n in sorted(non_ane, key=lambda t: -t[2])[:15]:
+            print(f"    [{d:3s}] {o:<26s} {n}")
+
+
+def main():
+    """CLI driver: audit an existing package, or run one variant / the `--all` sweep
+    and print a summary table of fp16 vs W8A8 fidelity."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bucket", type=int, default=128)
+    parser.add_argument("--mode", default="asymmetric", choices=["asymmetric", "symmetric"])
+    parser.add_argument("--rescale-k", type=float, default=8.0)
+    parser.add_argument("--all", action="store_true", help="Sweep the key variants")
+    parser.add_argument("--out", default="/tmp/w8a8-experiment")
+    parser.add_argument("--no-save", action="store_true")
+    parser.add_argument("--audit", default=None, help="Compile + ANE-audit an existing .mlpackage and exit")
+    args = parser.parse_args()
+
+    # Audit-only mode needs neither the checkpoint nor the reference model.
+    if args.audit:
+        audit_ane(args.audit)
+        return
+
+    import traceback
+    from huggingface_hub import snapshot_download
+
+    snap = snapshot_download(
+        HF_REPO,
+        allow_patterns=["*.json", "*.safetensors", "tokenizer*", "*.txt", "*.py", "1_Pooling/*"],
+    )
+
+    print("Loading fp32 Reference oracle ...", flush=True)
+    ref = Reference(HF_REPO)
+    tokenizer = ref.tokenizer
+    ref_int8 = ref.embed(EVAL_TEXTS)  # [N, 1024] int8
+
+    should_save = not args.no_save
+    results = []
+    if not args.all:
+        results.append(run_variant(snap, tokenizer, ref_int8, args.bucket, args.mode,
+                                   args.rescale_k, args.out, save=should_save))
+    else:
+        # The sweep uses --bucket but ignores --mode / --rescale-k.
+        sweep = (
+            ("asymmetric", 8.0),
+            ("symmetric", 8.0),
+            ("asymmetric", 0.0),
+            ("asymmetric", 16.0),
+        )
+        for mode, k in sweep:
+            try:
+                outcome = run_variant(snap, tokenizer, ref_int8, args.bucket, mode, k,
+                                      args.out, save=should_save)
+                results.append(outcome)
+            except Exception as e:
+                # One failing variant must not abort the rest of the sweep.
+                print(f"  VARIANT FAILED ({mode}, k={k}): {e}", flush=True)
+                traceback.print_exc()
+
+    print(f"\n{'='*70}\nSUMMARY\n{'='*70}")
+    print(f"{'variant':<26s} {'fp16 mean':>10s} {'W8A8 mean':>10s} {'W8A8 min':>10s}")
+    for row in results:
+        print(f"{row['label']:<26s} {row['fp16_mean']:>10.4f} {row['w8a8_mean']:>10.4f} {row['w8a8_min']:>10.4f}")
+    print("\nReference points: fp16~0.999 | weight-only int8~0.42 | int4~0.905 | wall~0.57 | gate 0.990")
+    for row in results:
+        if row["pkg"]:
+            print(f"  artifact: {row['pkg']}")
+
+if __name__ == "__main__":
+    main()
diff --git a/conversion/export_swift_fixtures.py b/conversion/export_swift_fixtures.py
new file mode 100644
index 0000000..23aefed
--- /dev/null
+++ b/conversion/export_swift_fixtures.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""Export fidelity fixtures for the Swift pplx-embed bench.
+
+Native int8 model output is not readable from the Python CoreML bridge on
+macOS26, so fidelity/latency for the int8 deliverable is measured in Swift.
+This script produces the ground-truth side: pre-tokenized, bucket-padded inputs
+plus the fp32-reference int8 embedding for each text.
+
+Output JSON:
+    { "L": <bucket>, "hf_repo": ..., "items": [
+        {"text": str, "input_ids": [L ints], "n": int, "ref_int8": [1024 ints]} ] }
+
+Usage:
+    python conversion/export_swift_fixtures.py --max-seq-len 4096 --out /tmp/pplx_fix.json
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+
+import numpy as np
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, ROOT)
+
+import pplx_embed_reference as R  # noqa: E402
+
+SENTENCES = [
+    "hello world",
+    "Quantum computing uses qubits.",
+    "東京は日本の首都です。",
+    "Bonjour le monde.",
+    "机器学习改变世界。",
+    "المعرفة قوة.",
+    "Привет, как дела?",
+    "Machine learning has transformed how we process information. " * 8,
+]
+
+
+def main() -> int:
+    """Write the fixture JSON consumed by the Swift pplx-embed bench.
+
+    For every entry in `SENTENCES`: tokenize (truncating to `--max-seq-len`),
+    right-pad the ids with 0 to exactly L, and compute the reference int8 embedding
+    over the n unpadded tokens only. Returns 0 on success.
+    """
+    # Imported once here rather than on every loop iteration.
+    import torch
+
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--hf-repo", default="perplexity-ai/pplx-embed-v1-0.6b")
+    ap.add_argument("--max-seq-len", type=int, default=4096)
+    ap.add_argument("--out", default="/tmp/pplx_fixtures.json")
+    args = ap.parse_args()
+
+    L = args.max_seq_len
+    ref = R.Reference(args.hf_repo)
+    tok = ref.tokenizer
+
+    items = []
+    # Pure inference: run the reference forward without autograd bookkeeping (same
+    # context measure_l8192_bucket.py uses), so long inputs don't retain a graph in
+    # case Reference leaves gradients enabled.
+    with torch.inference_mode():
+        for t in SENTENCES:
+            enc = tok([t], return_tensors="pt", truncation=True, max_length=L)
+            ids = enc["input_ids"][0].tolist()
+            n = len(ids)
+            padded = ids + [0] * (L - n)
+            # Reference int8 over exactly these n tokens (matched truncation).
+            rh = ref.model(input_ids=enc["input_ids"][:, :n],
+                           attention_mask=torch.ones((1, n), dtype=torch.long)).last_hidden_state.float()
+            ref_i8 = R.int8_tanh_quant(R.masked_mean(rh, torch.ones((1, n)))).reshape(-1).astype(int).tolist()
+            items.append({"text": t, "input_ids": padded, "n": n, "ref_int8": ref_i8})
+            print(f"  fixture n={n:4d}  {t[:32]}")
+
+    out = {"L": L, "hf_repo": args.hf_repo, "items": items}
+    with open(args.out, "w") as f:
+        json.dump(out, f)
+    print(f"wrote {len(items)} fixtures (L={L}) → {args.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/conversion/measure_l8192_bucket.py b/conversion/measure_l8192_bucket.py
new file mode 100644
index 0000000..8a16ddd
--- /dev/null
+++ b/conversion/measure_l8192_bucket.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""Workstream B gate: is a fixed L=8192 ANE bucket worth shipping?
+
+The >max-bucket catch-all today is a flexible RangeDim **GPU** model (~10× slower than a
+fixed ANE bucket, because flexible shapes force ANE fallback). Inputs of 4097–8192 tokens
+take that slow path. This measures whether a **fixed L=8192 ANE bucket** instead:
+
+  1. **stays on the ANE** — compile the int8 bucket, tally op→device via MLComputePlan
+     (reuses audit_ane_residency.py). Gate: ~99% ANE (like the smaller buckets), not a
+     fall-off to CPU/GPU.
+  2. **is faster** than the dynamic GPU catch-all at a long (~8000-token) input — warm
+     median of MLModel.predict: ANE bucket (pooled_fp16, padded to 8192, CPU_AND_NE) vs
+     the dyn8192 GPU model (pooled_fp16, non-padded actual length, CPU_AND_GPU).
+  3. **holds fidelity** — cosine of both vs the fp32 `Reference` oracle (gate ≥ 0.99).
+
+Ship decision (printed): ship iff ANE-resident AND faster than the GPU catch-all.
+
+Prereqs (build first; each ~1.1 GB):
+    python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 8192
+    python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 8192 \
+        --output-mode pooled_fp16
+    python conversion/build_pplx_embed_bundle.py --model pplx-embed --dynamic-upper 8192 \
+        --output-mode pooled_fp16    # the GPU catch-all to compare against
+
+Usage:
+    uv run python conversion/measure_l8192_bucket.py --n-tokens 8000 --iters 5
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+from collections import Counter
+
+import numpy as np
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, ROOT)
+
+OUT = os.path.join(ROOT, "..", "output", "pplx-embed")
+ANE_FP16 = os.path.join(OUT, "L8192-pooled_fp16", "encoder.mlpackage")
+ANE_INT8 = os.path.join(OUT, "L8192-int8", "encoder.mlpackage")
+DYN_FP16 = os.path.join(OUT, "dyn8192-pooled_fp16", "encoder.mlpackage")
+
+SAMPLE = (
+    "Embeddings map text into dense vectors so semantically similar passages land near "
+    "each other. 東京は日本の首都であり、世界有数の大都市圏を形成しています。 "
+    "La inteligencia artificial avanza rápido. Машинное обучение меняет обработку языка. "
+    "Retrieval-augmented generation grounds large language models in external knowledge. "
+)
+
+
+def _compile(pkg: str) -> str:
+    """Return the path of `encoder.mlmodelc` next to `pkg`, compiling it with
+    `xcrun coremlcompiler` only when that directory does not already exist."""
+    bundle_dir = os.path.dirname(pkg)
+    compiled = os.path.join(bundle_dir, "encoder.mlmodelc")
+    if os.path.isdir(compiled):
+        # Reuse the existing build; it is not checked against the package's age.
+        return compiled
+    command = ["xcrun", "coremlcompiler", "compile", pkg, bundle_dir]
+    subprocess.run(command, check=True, capture_output=True)
+    return compiled
+
+
+def _residency(mlmodelc: str) -> tuple[float, int, Counter]:
+    """Tally which device each non-const op of a compiled model is planned on.
+
+    Loads the compute plan for CPU_AND_NE and returns
+    (percent of ops labelled "ANE", total ops counted, per-device Counter). An op
+    whose usage lookup raises is labelled from `None` by `_device_label`.
+    """
+    import coremltools as ct
+    from coremltools.models.compute_plan import MLComputePlan
+    from audit_ane_residency import _iter_mlprogram_ops, _device_label
+
+    plan = MLComputePlan.load_from_path(path=mlmodelc, compute_units=ct.ComputeUnit.CPU_AND_NE)
+    device_counts: Counter = Counter()
+    n_ops = 0
+    for _func_name, op in _iter_mlprogram_ops(plan.model_structure):
+        if op.operator_name != "const":
+            try:
+                usage = plan.get_compute_device_usage_for_mlprogram_operation(op)
+            except Exception:
+                usage = None
+            device_counts[_device_label(usage)] += 1
+            n_ops += 1
+    # Report 0% rather than dividing by zero when nothing was counted.
+    ane_share = 100.0 * device_counts.get("ANE", 0) / n_ops if n_ops else 0.0
+    return ane_share, n_ops, device_counts
+
+
+def _time(pkg: str, inputs: dict, units, iters: int, warmup: int):
+    """Load `pkg` on `units` and measure warm `predict` latency.
+
+    Runs `warmup` untimed predictions, then `iters` timed ones.
+
+    Returns:
+        (median latency in milliseconds, last `embedding` output as float32 [1, D]).
+
+    Raises:
+        ValueError: if `iters` < 1. With no timed iteration there is nothing to take a
+            median of, and with `warmup` == 0 as well there would be no output.
+    """
+    # Validate before the (slow) model load so a bad argument fails immediately.
+    if iters < 1:
+        raise ValueError(f"iters must be >= 1 (got {iters})")
+
+    import coremltools as ct
+    m = ct.models.MLModel(pkg, compute_units=units)
+    out = None
+    for _ in range(warmup):
+        out = m.predict(inputs)
+    ts = []
+    for _ in range(iters):
+        # perf_counter is monotonic and high-resolution; time.time() is wall-clock and
+        # can jump (NTP/clock adjustments), which corrupts latency measurements.
+        t = time.perf_counter()
+        out = m.predict(inputs)
+        ts.append((time.perf_counter() - t) * 1000.0)
+    emb = np.asarray(out["embedding"]).astype(np.float32).reshape(1, -1)
+    return float(np.median(ts)), emb
+
+
+def main() -> int:
+    """Run the L=8192 ship gate: ANE residency, latency vs the GPU catch-all, fidelity.
+
+    Returns 1 when a prerequisite model package is missing, otherwise 0. The
+    SHIP / DO NOT SHIP verdict is printed and does not affect the exit code. Invalid
+    `--n-tokens` / `--iters` values exit through argparse (code 2) before any model
+    is loaded.
+    """
+    ap = argparse.ArgumentParser(description="L=8192 ANE bucket ship gate")
+    ap.add_argument("--hf-repo", default=None)
+    ap.add_argument("--n-tokens", type=int, default=8000, help="valid tokens in the long input")
+    ap.add_argument("--iters", type=int, default=5)
+    ap.add_argument("--warmup", type=int, default=2)
+    ap.add_argument("--fidelity-gate", type=float, default=0.99)
+    args = ap.parse_args()
+
+    L = 8192  # fixed bucket length of the ANE model under test
+    # Fail fast: an input longer than the bucket cannot be padded into it, and would
+    # otherwise only blow up after the slow fp32 reference forward.
+    if not 1 <= args.n_tokens <= L:
+        ap.error(f"--n-tokens must be in [1, {L}] (got {args.n_tokens})")
+    if args.iters < 1:
+        ap.error(f"--iters must be >= 1 (got {args.iters})")
+
+    for p, what in ((ANE_FP16, "L8192 pooled_fp16"), (DYN_FP16, "dyn8192 pooled_fp16")):
+        if not os.path.isdir(p):
+            print(f"MISSING {what}: {p}\n(build it first — see this script's header.)")
+            return 1
+
+    from config import MODEL_REGISTRY
+    hf_repo = args.hf_repo or MODEL_REGISTRY["pplx-embed"].hf_repo
+    import pplx_embed_reference as R
+    import torch
+    print(f"[ref] loading fp32 oracle {hf_repo} …")
+    ref = R.Reference(hf_repo)
+    tok = ref.tokenizer
+
+    # Long multilingual input → n_tokens valid tokens.
+    text = SAMPLE
+    while len(tok.encode(text)) < args.n_tokens:
+        text = text + " " + SAMPLE
+    ids = tok([text], return_tensors="np", truncation=True, max_length=args.n_tokens)["input_ids"][0]
+    n = int(ids.shape[0])
+    print(f"[input] {n} valid tokens")
+
+    # fp32 reference pooled (this is the heavy step at long L).
+    print("[ref] fp32 forward (slow at long L) …")
+    with torch.inference_mode():
+        ids_t = torch.from_numpy(ids.astype(np.int64)).view(1, -1)
+        mask_t = torch.ones((1, n), dtype=torch.float32)
+        hidden = ref.model(input_ids=ids_t, attention_mask=mask_t).last_hidden_state.float()
+        ref_pooled = R.masked_mean(hidden, mask_t).numpy().astype(np.float32)
+
+    # --- 1. residency (int8 bucket if built, else the pooled_fp16 encoder body) -------
+    res_pkg = ANE_INT8 if os.path.isdir(ANE_INT8) else ANE_FP16
+    print(f"\n[1] ANE residency of {os.path.relpath(res_pkg, OUT)} …")
+    ane_pct, total, by_dev = _residency(_compile(res_pkg))
+    print(f"    ANE {ane_pct:.2f}%  ({total} ops; {dict(by_dev)})")
+
+    import coremltools as ct
+    # --- 2a. ANE bucket: pad to 8192, CPU_AND_NE ------------------------------------
+    pid = np.zeros((1, L), dtype=np.int32)
+    pid[0, :n] = ids
+    pam = np.zeros((1, L), dtype=np.float16)
+    pam[0, :n] = 1.0
+    print(f"\n[2a] ANE L8192 bucket latency (CPU_AND_NE, padded to {L}) …")
+    ane_ms, ane_emb = _time(ANE_FP16, {"input_ids": pid, "attention_mask": pam},
+                            ct.ComputeUnit.CPU_AND_NE, args.iters, args.warmup)
+    ane_cos = float(R.cosine_similarity(ane_emb, ref_pooled)[0])
+    print(f"     median {ane_ms:.1f} ms   cosine {ane_cos:.5f}")
+
+    # --- 2b. dynamic GPU model: actual length, CPU_AND_GPU --------------------------
+    did = ids.astype(np.int32).reshape(1, n)
+    dam = np.ones((1, n), dtype=np.float16)
+    print(f"\n[2b] dynamic GPU model latency (CPU_AND_GPU, actual {n}) …")
+    gpu_ms, gpu_emb = _time(DYN_FP16, {"input_ids": did, "attention_mask": dam},
+                            ct.ComputeUnit.CPU_AND_GPU, args.iters, args.warmup)
+    gpu_cos = float(R.cosine_similarity(gpu_emb, ref_pooled)[0])
+    print(f"     median {gpu_ms:.1f} ms   cosine {gpu_cos:.5f}")
+
+    # --- decision -------------------------------------------------------------------
+    print("\n" + "=" * 64)
+    print("L=8192 SHIP GATE")
+    print("=" * 64)
+    resident = ane_pct >= 99.0
+    faster = ane_ms < gpu_ms
+    fid_ok = ane_cos >= args.fidelity_gate
+    # Guard the ratio against a zero median.
+    speedup = gpu_ms / ane_ms if ane_ms else 0.0
+    print(f"  ANE residency : {ane_pct:.2f}%   (resident ≥99% = {resident})")
+    print(f"  latency       : ANE {ane_ms:.1f} ms  vs  GPU {gpu_ms:.1f} ms  "
+          f"({speedup:.1f}× {'faster' if faster else 'SLOWER'})")
+    print(f"  fidelity      : ANE cosine {ane_cos:.5f}  (≥{args.fidelity_gate} = {fid_ok})")
+    # All three gates must pass; the GPU model's cosine is printed above but not gated.
+    ship = resident and faster and fid_ok
+    print()
+    if ship:
+        print("  ✅ SHIP: L=8192 stays on the ANE and beats the GPU catch-all. Place "
+              "L8192-int8/ in the bundle dir; Swift auto-routes (no Swift change).")
+    else:
+        print("  ❌ DO NOT SHIP: gate not met (see above). The >4096 path stays the GPU "
+              "model; record the result.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/conversion/models/qwen3_encoder.py b/conversion/models/qwen3_encoder.py
new file mode 100644
index 0000000..c805261
--- /dev/null
+++ b/conversion/models/qwen3_encoder.py
@@ -0,0 +1,477 @@
+"""Qwen3 bidirectional encoder (ANE-optimized) for pplx-embed.
+
+Perplexity's `pplx-embed-v1-0.6b` (plain) and `pplx-embed-context-v1-0.6b` (late
+chunking) are a **bidirectional** Qwen3-0.6B encoder (`PPLXQwen3Model`, see the HF
+checkpoint's `modeling.py`): every token attends to every non-pad token, the model
+returns `last_hidden_state`, and a downstream pooling + tanh-int8 head produces the
+embedding.
+
+This is the ANE port — templated on `models/gemma3_encoder.py` but with Qwen3 math:
+  - two RMSNorms per layer, both pre-norm: input_layernorm and post_attention_layernorm (NOT Gemma's 4 sandwich norms)
+  - plain-weight RMSNorm (`x*rsqrt(..)*w`, no +1 gain — matches the working Qwen3 decoder)
+  - per-head QK-norm (RMSNorm over head_dim on Q and K, before RoPE)
+  - single RoPE table, θ=1e6 (NOT Gemma's dual local/global)
+  - SwiGLU MLP (silu), GQA 16 q / 8 kv heads, head_dim 128 (q proj = 2048)
+  - full bidirectional attention (pad-mask only, no causal triangle, no sliding window)
+  - NO embedding scaling (that is a Gemma-ism)
+
+ANE layout (docs/ANE_OPTIMIZATION_SURVEY.md + conversion/ane_ops.py): all projections
+are Conv2d(1×1) on (B, C, 1, S); RMSNorm is native rsqrt by default (cat([x,−x])→LayerNorm when
+norm_impl="ane_cat"); GQA expansion uses the local `_repeat_kv_b`; the residual stream is kept in fp32 (fp16 can overflow over 28 layers).
+Fixed trace-time sequence length — variable length is handled by padding to a bucket.
+"""
+
+from __future__ import annotations
+
+import gc
+import json
+import os
+import sys
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from ane_ops import (  # noqa: E402
+    MODEL_DTYPE,
+    ANERMSNorm,
+    apply_rotary_pos_emb,
+    stable_attention,
+)
+
+# Max chunks per document for the context (late-chunking) variant.
+N_MAX_CHUNKS = 32
+
+
+def _repeat_kv_b(x: torch.Tensor, n_rep: int, B: int, num_kv_heads: int,
+                 seq_len: int, head_dim: int) -> torch.Tensor:
+    """Repeat every KV head n_rep times: (B, kv, S, D) → (B, kv*n_rep, S, D); all dims passed explicitly."""
+    if n_rep != 1:
+        tiled = x.unsqueeze(2).expand(B, num_kv_heads, n_rep, seq_len, head_dim)
+        return tiled.reshape(B, num_kv_heads * n_rep, seq_len, head_dim)
+    return x
+
+
+class Qwen3RMSNorm(nn.Module):
+    """Plain RMSNorm, `x * rsqrt(mean(x²) + eps) * w`, evaluated in fp32 (HF Qwen3 parity).
+
+    This is the *local* A/B counterpart to the shared `ane_ops.ANERMSNorm`, which
+    implements the norm as cat([x,−x])→LayerNorm. That trick dates from when the ANE
+    had a highly-optimized LayerNorm kernel and no native `rsqrt`; on current M4 Max /
+    macOS 26 / coremltools 9 that may no longer hold (see docs/PPLX_EMBED_GPU_RESIDENCY.md).
+    With this class the pplx-embed encoder can run its 5 norm sites natively and measure.
+
+    The weight is a 1-D fp16 `.weight` of shape `(hidden,)`, the same layout as
+    `ANERMSNorm`, so weight loading does not change. The statistics are computed in
+    fp32 (fp16 `x²` can overflow for large activations) and the normalized tensor is cast
+    back to the input dtype before the weight multiply, mirroring the HF Qwen3 RMSNorm.
+    NB: coremltools lowers the whole graph to fp16 at convert time, so the fp32 here is
+    a trace-time/fidelity nicety; on device the op runs in fp16 like the rest.
+    """
+
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=MODEL_DTYPE))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        orig_dtype = x.dtype
+        x32 = x.to(torch.float32)
+        mean_sq = x32.pow(2).mean(-1, keepdim=True)
+        normed = x32 * torch.rsqrt(mean_sq + self.eps)
+        return normed.to(orig_dtype) * self.weight
+
+
+def make_norm(norm_impl: str, hidden_size: int, eps: float) -> nn.Module:
+    """Build the RMSNorm module used at the encoder's 5 local norm sites.
+
+    "native" → local `Qwen3RMSNorm` (native rsqrt; the `Qwen3EncoderConfig` default).
+    "ane_cat" → shared `ANERMSNorm` (cat/chunk LayerNorm trick). Both store a 1-D fp16
+    weight, so swapping does not affect weight loading. Other values raise ValueError.
+    """
+    if norm_impl == "native":
+        return Qwen3RMSNorm(hidden_size, eps=eps)
+    if norm_impl == "ane_cat":
+        return ANERMSNorm(hidden_size, eps=eps)
+    raise ValueError(f"Unknown norm_impl '{norm_impl}'; expected 'ane_cat' or 'native'.")
+
+
+class Qwen3EncoderConfig:
+    """Qwen3 encoder hyper-parameters (read from the pplx-embed HF config.json); missing keys take defaults."""
+
+    def __init__(self, **kwargs):
+        self.hidden_size = kwargs.get("hidden_size", 1024)
+        self.num_hidden_layers = kwargs.get("num_hidden_layers", 28)
+        self.num_attention_heads = kwargs.get("num_attention_heads", 16)
+        self.num_key_value_heads = kwargs.get("num_key_value_heads", 8)
+        self.head_dim = kwargs.get("head_dim", 128)
+        self.intermediate_size = kwargs.get("intermediate_size", 3072)
+        self.vocab_size = kwargs.get("vocab_size", 151936)
+        self.rms_norm_eps = kwargs.get("rms_norm_eps", 1e-6)
+        self.attention_bias = bool(kwargs.get("attention_bias", False))
+        # rope_theta may live at top level or under rope_parameters; a null/missing value falls through.
+        rp = kwargs.get("rope_parameters") or {}
+        self.rope_theta = float(kwargs.get("rope_theta") or rp.get("rope_theta") or 1_000_000.0)
+        self.max_position_embeddings = kwargs.get("max_position_embeddings", 32768)
+        # Trace-time fixed sequence length (the bucket).
+        self.max_seq_len = kwargs.get("max_seq_len", 4096)
+        # RMSNorm implementation for the 5 local norm sites: "native" (local Qwen3RMSNorm,
+        # native rsqrt — the shipped default) or "ane_cat" (shared ANERMSNorm cat/chunk
+        # LayerNorm trick). native is the default because the A/B (experiment_ane_rmsnorm.py /
+        # docs/PPLX_EMBED_GPU_RESIDENCY.md follow-up) found it 12.7% (L=256) / 21.5% (L=512)
+        # faster on the ANE at identical 99.81% residency and cosine 0.99998 vs the fp32 oracle,
+        # on M4 Max / macOS 26 / coremltools 9. (The cat/chunk trick predates a native ANE rsqrt.)
+        self.norm_impl = kwargs.get("norm_impl", "native")
+
+    @classmethod
+    def from_json(cls, path: str, max_seq_len: int = 4096,
+                  norm_impl: str = "native") -> "Qwen3EncoderConfig":
+        """Load `path` (using its `text_config` sub-dict when present) and override the two trace-time knobs."""
+        with open(path, encoding="utf-8") as f:
+            d = json.load(f)
+        d = d.get("text_config", d)
+        d["max_seq_len"] = max_seq_len
+        d["norm_impl"] = norm_impl
+        return cls(**d)
+
+
+class Qwen3EncoderLayer(nn.Module):
+    """One bidirectional Qwen3 block: pre-norm attention then pre-norm SwiGLU MLP, fp32 residual adds (ANE layout)."""
+
+    def __init__(self, config: Qwen3EncoderConfig):
+        super().__init__()
+        hidden = config.hidden_size
+        head_dim = config.head_dim
+        num_heads = config.num_attention_heads
+        num_kv_heads = config.num_key_value_heads
+        inter = config.intermediate_size
+        eps = config.rms_norm_eps
+        has_bias = config.attention_bias
+        norm_impl = config.norm_impl
+
+        q_dim = num_heads * head_dim
+        kv_dim = num_kv_heads * head_dim
+
+        self.self_attn = nn.ModuleDict({
+            "q_proj": nn.Conv2d(hidden, q_dim, 1, bias=has_bias, dtype=MODEL_DTYPE),
+            "k_proj": nn.Conv2d(hidden, kv_dim, 1, bias=has_bias, dtype=MODEL_DTYPE),
+            "v_proj": nn.Conv2d(hidden, kv_dim, 1, bias=has_bias, dtype=MODEL_DTYPE),
+            "o_proj": nn.Conv2d(q_dim, hidden, 1, bias=False, dtype=MODEL_DTYPE),
+            # Qwen3 QK-norm: per-head RMSNorm over head_dim, plain weight.
+            "q_norm": make_norm(norm_impl, head_dim, eps),
+            "k_norm": make_norm(norm_impl, head_dim, eps),
+        })
+        self.mlp = nn.ModuleDict({
+            "gate_proj": nn.Conv2d(hidden, inter, 1, bias=False, dtype=MODEL_DTYPE),
+            "up_proj": nn.Conv2d(hidden, inter, 1, bias=False, dtype=MODEL_DTYPE),
+            "down_proj": nn.Conv2d(inter, hidden, 1, bias=False, dtype=MODEL_DTYPE),
+        })
+        self.input_layernorm = make_norm(norm_impl, hidden, eps)
+        self.post_attention_layernorm = make_norm(norm_impl, hidden, eps)
+
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = head_dim
+        self.n_rep = num_heads // num_kv_heads
+        self.scale = float(head_dim) ** -0.5
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,   # (1, L, H) fp32
+        cos: torch.Tensor,             # (1, 1, L, head_dim)
+        sin: torch.Tensor,             # (1, 1, L, head_dim)
+        attention_mask: torch.Tensor,  # (1, 1, L, L) fp16 additive (0 / −1e4)
+        seq_len: int,
+    ) -> torch.Tensor:
+        num_heads = self.num_heads
+        num_kv_heads = self.num_kv_heads
+        head_dim = self.head_dim
+        B = hidden_states.shape[0]
+
+        residual = hidden_states
+        # Normalize in fp32 then downcast: the pre-norm residual can exceed fp16
+        # max (65504) over 28 layers, so casting *before* the norm would inf.
+        # RMSNorm is scale-invariant; its output is O(1) and fp16-safe.
+        normed = self.input_layernorm(hidden_states).to(MODEL_DTYPE)
+
+        # (B, H, 1, L) layout for Conv2d.
+        x = normed.permute(0, 2, 1).unsqueeze(2)
+
+        # Q/K/V: (B, q_dim, 1, L) → (B, heads, L, head_dim).
+        q = self.self_attn["q_proj"](x).view(B, num_heads, head_dim, seq_len).permute(0, 1, 3, 2)
+        k = self.self_attn["k_proj"](x).view(B, num_kv_heads, head_dim, seq_len).permute(0, 1, 3, 2)
+        v = self.self_attn["v_proj"](x).view(B, num_kv_heads, head_dim, seq_len).permute(0, 1, 3, 2)
+
+        # QK-norm per head, then RoPE (the reshape targets the shape the permute already produced).
+        q = self.self_attn["q_norm"](q.reshape(B, num_heads, seq_len, head_dim))
+        k = self.self_attn["k_norm"](k.reshape(B, num_kv_heads, seq_len, head_dim))
+        q, k = apply_rotary_pos_emb(q, k, cos, sin)
+
+        # GQA expansion: repeat each KV head n_rep times so K/V match the query head count.
+        k = _repeat_kv_b(k, self.n_rep, B, num_kv_heads, seq_len, head_dim)
+        v = _repeat_kv_b(v, self.n_rep, B, num_kv_heads, seq_len, head_dim)
+
+        # Bidirectional attention (fp32), pad-mask only. scale = 1/sqrt(head_dim).
+        attn_out = stable_attention(q, k, v, self.scale, attention_mask)
+
+        # (B, heads, L, head_dim) → (B, L, q_dim) → Conv2d o_proj.
+        attn_out = attn_out.permute(0, 2, 1, 3).contiguous().view(B, seq_len, num_heads * head_dim)
+        attn_out = self.self_attn["o_proj"](
+            attn_out.permute(0, 2, 1).unsqueeze(2)
+        ).squeeze(2).permute(0, 2, 1)
+
+        # fp32 residual add (attn_out is fp16 → upcast).
+        hidden_states = residual + attn_out.to(torch.float32)
+
+        # MLP: post_attention_layernorm → SwiGLU → residual.
+        residual = hidden_states
+        normed = self.post_attention_layernorm(hidden_states).to(MODEL_DTYPE)
+        x_mlp = normed.permute(0, 2, 1).unsqueeze(2)
+        gate = self.mlp["gate_proj"](x_mlp)
+        up = self.mlp["up_proj"](x_mlp)
+        mlp_out = self.mlp["down_proj"](F.silu(gate) * up).squeeze(2).permute(0, 2, 1)
+        hidden_states = residual + mlp_out.to(torch.float32)
+
+        return hidden_states
+
+
+class Qwen3Encoder(nn.Module):
+    """Bidirectional Qwen3 encoder backbone → last_hidden_state (no pooling).
+
+    Input:
+        input_ids      (1, L) int32
+        attention_mask (1, L) fp16 — 1.0 for valid tokens, 0.0 for pad
+    Output:
+        hidden_states  (1, L, hidden_size) MODEL_DTYPE — final-norm output, cast down from the fp32 residual stream
+    """
+
+    NEG_INF = -1.0e4  # ANE-safe additive-mask value
+
+    def __init__(self, config: Qwen3EncoderConfig):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [Qwen3EncoderLayer(config) for _ in range(config.num_hidden_layers)]
+        )
+        self.norm = make_norm(config.norm_impl, config.hidden_size, eps=config.rms_norm_eps)
+        self._build_rope(config)
+
+    def _build_rope(self, config: Qwen3EncoderConfig):
+        head_dim = config.head_dim
+        # RoPE table size is DECOUPLED from the bucket (max_seq_len). We always build
+        # it to a single fixed length (max_position_embeddings, 32768) so the baked
+        # cos/sin constants are byte-identical across every bucket — that makes the
+        # whole CoreML weight.bin identical across buckets, so HF LFS / on-disk store
+        # it once instead of one ~1.19 GB blob per L. forward() slices [:S] at trace
+        # time; a runtime position_ids `gather` keeps that slice from being const-
+        # folded back into a per-bucket [S, head_dim] constant (verified by sha256).
+        L = config.max_position_embeddings
+        t = torch.arange(L).float()
+        inv = 1.0 / (config.rope_theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
+        freqs = torch.einsum("i,j->ij", t, inv)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(MODEL_DTYPE))
+        self.register_buffer("sin_cached", emb.sin().to(MODEL_DTYPE))
+
+    def _pad_mask(self, attention_mask: torch.Tensor, S: int) -> torch.Tensor:
+        """(B, S) {1 valid, 0 pad} → (B, 1, S, S) additive fp16 (0 / −1e4), key-side."""
+        B = attention_mask.shape[0]
+        key_pad = (1.0 - attention_mask).to(MODEL_DTYPE) * self.NEG_INF
+        return key_pad.view(B, 1, 1, S).expand(B, 1, S, S)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,       # (1, S) int32
+        attention_mask: torch.Tensor,  # (1, S) fp16
+    ) -> torch.Tensor:
+        # Derive the sequence length from the input, not config — this makes the
+        # same graph serve both fixed buckets (S == bucket, static) and a flexible
+        # RangeDim export (S dynamic, GPU).
+        head_dim = self.config.head_dim
+        S = input_ids.shape[1]
+
+        # No embedding scaling (Qwen3). Keep residual stream in fp32.
+        hidden = self.embed_tokens(input_ids).to(torch.float32)
+
+        # RoPE: the cos/sin tables are built once to a FIXED length
+        # (max_position_embeddings, 32768) so they are byte-identical across every
+        # bucket — that makes the whole CoreML weight.bin identical across buckets.
+        # A plain static `cos_cached[:S]` slice would be const-folded back into a
+        # per-bucket [S, head_dim] constant (verified: it defeats the dedup). To keep
+        # the slice fold-proof we GATHER rows [0..S-1] using position_ids derived from
+        # a runtime input (attention_mask), so the indices are runtime-dependent and
+        # coremltools cannot const-fold the gather. This needs NO new model input.
+        position_ids = (
+            torch.cumsum(torch.ones_like(attention_mask, dtype=torch.float32), dim=1) - 1.0
+        ).to(torch.int32)                                  # (1, S) = [[0,1,…,S-1]]
+        pos = position_ids[0]                              # (S,)
+        cos = self.cos_cached.index_select(0, pos).view(1, 1, S, head_dim)
+        sin = self.sin_cached.index_select(0, pos).view(1, 1, S, head_dim)
+        mask = self._pad_mask(attention_mask, S)
+
+        for layer in self.layers:
+            hidden = layer(hidden, cos, sin, mask, S)
+
+        return self.norm(hidden).to(MODEL_DTYPE)
+
+
+class PplxEmbedModel(nn.Module):
+    """Plain pplx-embed end to end: token ids → encoder → masked-mean pooled embedding.
+
+    output_mode:
+      "pooled_fp16" — masked-mean cast to fp16 (readable from the Python bridge; meant
+                      for fidelity iteration; quantize to int8 downstream).
+      "int8"        — masked-mean → tanh → clamp(round(·*127), −128, 127) → int8
+                      (the deliverable; native int8 output, read via the Swift harness).
+    """
+
+    def __init__(self, config: Qwen3EncoderConfig, output_mode: str = "pooled_fp16"):
+        super().__init__()
+        assert output_mode in ("pooled_fp16", "int8")
+        self.output_mode = output_mode
+        self.encoder = Qwen3Encoder(config)
+
+    def _masked_mean(self, hidden: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        weights = attention_mask.to(torch.float32).unsqueeze(-1)   # (1, L, 1)
+        total = (hidden.to(torch.float32) * weights).sum(dim=1)    # (1, H)
+        count = weights.sum(dim=1).clamp_min(1.0)                  # (1, 1), floored at one token
+        return total / count
+
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        states = self.encoder(input_ids, attention_mask)           # (1, L, H)
+        pooled = self._masked_mean(states, attention_mask)         # (1, H) fp32
+        if self.output_mode != "pooled_fp16":
+            # int8 tanh head (matches st_quantize.py: torch.round, qmin=−128).
+            quantized = torch.clamp(torch.round(torch.tanh(pooled) * 127.0), -128, 127)
+            return quantized.to(torch.int8)
+        return pooled.to(MODEL_DTYPE)
+
+
+class PplxEmbedContextModel(nn.Module):
+    """Late-chunking (context) forward: one encoder pass over the window, then per-chunk pooling.
+
+    Inputs:
+        input_ids      (1, L) int32
+        attention_mask (1, L) fp16  — 1.0 valid, 0.0 pad
+        pool_matrix    (N_max, L) fp16 — row k = normalized mean weights over chunk k's
+                       token span (1/n_k on the span, else 0); unused rows are all-zero.
+    Output:
+        chunk_embeddings (N_max, 1024) — int8 or fp16. Unused rows → 0 vector (skip them).
+
+    The pooling is one matmul, `pool_matrix @ hidden`, so a single encoder covers both plain
+    (one row = 1/n over all valid tokens) and context. See the pool_matrix lesson.
+    """
+
+    def __init__(self, config: Qwen3EncoderConfig, output_mode: str = "pooled_fp16",
+                 n_max: int = N_MAX_CHUNKS):
+        super().__init__()
+        assert output_mode in ("pooled_fp16", "int8")
+        self.n_max = n_max
+        self.output_mode = output_mode
+        self.encoder = Qwen3Encoder(config)
+
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
+                pool_matrix: torch.Tensor) -> torch.Tensor:
+        states = self.encoder(input_ids, attention_mask)           # (1, L, H)
+        flat = states.squeeze(0).to(torch.float32)                 # (L, H)
+        pooled = pool_matrix.to(torch.float32) @ flat              # (N_max, H)
+        if self.output_mode != "pooled_fp16":
+            quantized = torch.clamp(torch.round(torch.tanh(pooled) * 127.0), -128, 127)
+            return quantized.to(torch.int8)
+        return pooled.to(MODEL_DTYPE)
+
+
+# --------------------------------------------------------------------------- #
+# Weight loading.
+# --------------------------------------------------------------------------- #
+_CONV2D_SUFFIXES = (
+    ".q_proj.weight", ".k_proj.weight", ".v_proj.weight", ".o_proj.weight",
+    ".gate_proj.weight", ".up_proj.weight", ".down_proj.weight",
+)
+
+
+def _map_weight(hf_name: str) -> str | None:
+    """Translate a pplx-embed checkpoint key into the local Qwen3Encoder parameter name.
+
+    Keys may or may not carry a leading `model.` prefix (the checkpoint is
+    sentence-transformer style); anything the encoder does not own maps to None.
+    """
+    prefix = "model."
+    key = hf_name[len(prefix):] if hf_name.startswith(prefix) else hf_name
+    # Top-level tensors the encoder owns map to themselves.
+    if key in ("embed_tokens.weight", "norm.weight"):
+        return key
+    # Per-layer tensors keep the HF layer naming.
+    if key.startswith("layers."):
+        return key
+    return None  # lm_head.weight (tied embeddings) and anything unrecognized
+
+
+def load_encoder_weights(encoder: Qwen3Encoder, hf_dir: str) -> None:
+    """Copy every mapped *.safetensors tensor in `hf_dir` into `encoder` (2-D projections → Conv2d 1×1); warn on gaps."""
+    import safetensors.torch
+
+    st_files = sorted(f for f in os.listdir(hf_dir) if f.endswith(".safetensors"))
+    if not st_files:
+        raise FileNotFoundError(f"No .safetensors in {hf_dir}")
+
+    loaded = 0
+    seen: set[str] = set()
+    for st_file in st_files:
+        state = safetensors.torch.load_file(os.path.join(hf_dir, st_file))
+        for hf_name, tensor in state.items():
+            local = _map_weight(hf_name)
+            if local is None:
+                continue
+            tensor = tensor.to(MODEL_DTYPE)
+            # Linear (out, in) weights become Conv2d (out, in, 1, 1) kernels.
+            if local.endswith(_CONV2D_SUFFIXES) and tensor.dim() == 2:
+                tensor = tensor.unsqueeze(-1).unsqueeze(-1)
+            owner, _, leaf = local.rpartition(".")
+            param = getattr(encoder.get_submodule(owner), leaf)
+            if param.shape != tensor.shape:
+                raise ValueError(f"Shape mismatch {hf_name}->{local}: {param.shape} vs {tensor.shape}")
+            with torch.no_grad():
+                param.copy_(tensor)
+            loaded += 1
+            seen.add(local)
+        del state
+        gc.collect()
+    print(f"  loaded {loaded} tensors into Qwen3Encoder from {len(st_files)} file(s)")
+    missing = sorted(n for n, _ in encoder.named_parameters() if n not in seen)
+    if missing:  # a parameter never copied would silently keep its init value
+        print(f"  WARNING: {len(missing)} encoder parameter(s) not found in checkpoint, e.g. {missing[:5]}")
+
+
+def apply_fp16_residual_rescale(encoder: Qwen3Encoder, K: float) -> None:
+    """Shrink the residual stream by 1/K (in place) so fp16 lowering doesn't overflow.
+
+    This 28-layer encoder's activations exceed fp16 max (65504) in deep layers —
+    specifically the `down_proj` accumulation (3072→1024) infs out around layer 19.
+    coremltools lowers float ops to fp16, so this bites on-device.
+
+    Because Qwen3 is **pre-norm** and every sublayer input (and the final `norm`) goes through a
+    scale-invariant RMSNorm, scaling embed_tokens, every o_proj and every down_proj by 1/K makes
+    every stored residual and down_proj accumulation K× smaller while the pooled embedding stays
+    unchanged (up to the norm eps). K must be positive and finite, otherwise ValueError is raised.
+    """
+    if not 0.0 < float(K) < float("inf"):  # also rejects NaN
+        raise ValueError(f"K must be a positive finite number, got {K!r}")
+    inv = 1.0 / float(K)
+    with torch.no_grad():
+        encoder.embed_tokens.weight.mul_(inv)
+        for layer in encoder.layers:
+            layer.self_attn["o_proj"].weight.mul_(inv)
+            layer.mlp["down_proj"].weight.mul_(inv)
+
+
+__all__ = [
+    "Qwen3EncoderConfig", "Qwen3EncoderLayer", "Qwen3Encoder",
+    "Qwen3RMSNorm", "make_norm",
+    "PplxEmbedModel", "PplxEmbedContextModel", "N_MAX_CHUNKS",
+    "load_encoder_weights", "apply_fp16_residual_rescale",
+]
diff --git a/conversion/pplx_embed_reference.py b/conversion/pplx_embed_reference.py
new file mode 100644
index 0000000..557cd4d
--- /dev/null
+++ b/conversion/pplx_embed_reference.py
@@ -0,0 +1,215 @@
+"""Golden fp32 reference oracle for pplx-embed (Perplexity) on CoreML-LLM.
+
+This is the ground truth every CoreML fidelity comparison is measured against:
+
+    HF fp32 forward  ->  masked-mean (plain) / pool_matrix matmul (context)
+                     ->  st_quantize int8 / binary / ubinary
+
+CRITICAL — quantizer parity. We mirror the model's own ``st_quantize.py`` EXACTLY:
+
+    int8   = clamp(round(tanh(x) * 127), -128, 127)   # torch.round = HALF-TO-EVEN
+    binary = where(x >= 0, +1.0, -1.0)                 # float32 +/-1
+    ubinary= packbits(x >= 0)                          # uint8 [..., dim/8]
+
+NOTE the two traps, both deliberately followed here:
+  * **torch.round** (round-half-to-even / banker's), NOT the paper's / the parallel
+    effort's HALF-UP ``floor(127*tanh+0.5)`` — they differ by +/-1 at exact halves.
+  * **qmin = -128** (not -127). Note -128 is never actually reached: tanh(x)*127 in
+    (-127, 127), so round() bottoms out at -127; the -128 clamp is purely defensive.
+
+Pooling matches the reference ``modeling.py``:
+  * plain   : mean over valid (non-pad) tokens.
+  * context : late chunking -- encode the whole window once (bidirectional), then
+    mean-pool each chunk's token span. Chunks are joined with the tokenizer's
+    sep_token; the SEP token itself and padding are excluded from every chunk.
+    We express per-chunk pooling as a single matmul with a ``pool_matrix`` so the
+    same formulation drops straight into the CoreML graph (ANE-friendly); the plain
+    embed is the degenerate one-chunk case (row 0 = 1/n_valid over the valid tokens, 0 elsewhere).
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+import numpy as np
+import torch
+
+Quantization = Literal["int8", "binary", "ubinary"]
+N_MAX_CHUNKS = 32
+
+
+# --------------------------------------------------------------------------- #
+# Quantizers — bit-for-bit mirrors of st_quantize.py (operate in torch float32).
+# --------------------------------------------------------------------------- #
+def int8_tanh_quant(x: torch.Tensor | np.ndarray) -> np.ndarray:
+    """tanh, scale by 127, torch.round (half-to-even), clamp to [-128, 127]. Returns int8 ndarray."""
+    scaled = torch.as_tensor(x, dtype=torch.float32).tanh().mul(127.0)
+    rounded = scaled.round()
+    clipped = rounded.clamp(-128, 127)
+    return clipped.to(torch.int8).cpu().numpy()
+
+
+def binary_tanh_quant(x: torch.Tensor | np.ndarray) -> np.ndarray:
+    """Sign quantizer: +1.0 where x >= 0, otherwise -1.0; returned as a float32 ndarray."""
+    nonneg = torch.as_tensor(x, dtype=torch.float32).ge(0)
+    return torch.where(nonneg, 1.0, -1.0).cpu().numpy().astype(np.float32)
+
+
+def ubinary_pack(x: torch.Tensor | np.ndarray) -> np.ndarray:
+    """Pack the sign bits (x >= 0 → 1) along the last axis with np.packbits. Returns uint8 ndarray [..., dim/8]."""
+    as_f32 = torch.as_tensor(x, dtype=torch.float32).cpu().numpy()
+    nonneg = np.greater_equal(as_f32, 0)
+    return np.packbits(nonneg, axis=-1)
+
+
+def quantize(x: torch.Tensor | np.ndarray, quantization: Quantization = "int8") -> np.ndarray:
+    """Dispatch to the named quantizer: "int8" (default), "binary" or "ubinary"; anything else is a ValueError."""
+    if quantization not in ("int8", "binary", "ubinary"):
+        raise ValueError(f"Invalid quantization '{quantization}'; expected int8/binary/ubinary.")
+    if quantization == "int8":
+        return int8_tanh_quant(x)
+    packer = binary_tanh_quant if quantization == "binary" else ubinary_pack
+    return packer(x)
+
+
+# --------------------------------------------------------------------------- #
+# Pooling.
+# --------------------------------------------------------------------------- #
+def masked_mean(hidden: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+    """Average `hidden` [B,L,D] over the positions where `mask` [B,L] is set -> [B,D]; denominator floored at 1e-9."""
+    weights = mask.to(hidden.dtype).unsqueeze(-1)             # [B,L,1]
+    totals = torch.sum(hidden * weights, dim=1)               # [B,D]
+    n_tokens = torch.clamp(weights.sum(dim=1), min=1e-9)      # [B,1]
+    return totals / n_tokens
+
+
+def make_pool_matrix(spans: list[tuple[int, int]], L: int, n_max: int = N_MAX_CHUNKS) -> np.ndarray:
+    """Build the [n_max, L] float32 pooling matrix: row k holds 1/n_k on chunk k's [start,end) span.
+
+    Spans past the first n_max are ignored; empty spans and unused rows stay all-zero, which pools
+    to the zero vector -> callers MUST skip those rows (NaN under cosine). Plain = one span (0, n_valid)."""
+    weights = np.zeros((n_max, L), dtype=np.float32)
+    # zip against range(n_max) caps the number of rows filled at n_max.
+    for row, (lo, hi) in zip(range(n_max), spans):
+        if hi > lo:
+            weights[row, lo:hi] = 1.0 / float(hi - lo)
+    return weights
+
+
+def embed_context(hidden: torch.Tensor | np.ndarray, pool_matrix: np.ndarray,
+                  quantization: Quantization = "int8") -> np.ndarray:
+    """Pool per chunk, then quantize: `pool_matrix` [n_max,L] times `hidden` [L,D], passed to `quantize`."""
+    weights = torch.as_tensor(pool_matrix, dtype=torch.float32)
+    states = torch.as_tensor(hidden, dtype=torch.float32)
+    chunk_means = torch.matmul(weights, states)        # [n_max, D]
+    return quantize(chunk_means, quantization)
+
+
+# --------------------------------------------------------------------------- #
+# Fidelity helper.
+# --------------------------------------------------------------------------- #
+def cosine_similarity(a: np.ndarray, b: np.ndarray, eps: float = 1e-8) -> np.ndarray:
+    """Per-row cosine of two [N,D] arrays (1-D inputs count as one row); rows with a norm <= eps give NaN."""
+    a, b = (np.asarray(m, dtype=np.float32) for m in (a, b))
+    if a.ndim == 1:
+        a = a[None]
+        b = b[None]
+    norm_a, norm_b = np.linalg.norm(a, axis=-1), np.linalg.norm(b, axis=-1)
+    usable = (norm_a > eps) & (norm_b > eps)
+    out = np.full(a.shape[0], np.nan, dtype=np.float32)
+    if usable.any():
+        dots = (a[usable] * b[usable]).sum(-1)
+        out[usable] = dots / (norm_a[usable] * norm_b[usable])
+    return out
+
+
+# --------------------------------------------------------------------------- #
+# The HF fp32 oracle.
+# --------------------------------------------------------------------------- #
+@dataclass
+class _Loaded:
+    model: torch.nn.Module  # model handle stored by Reference.__init__ (already .eval() and moved to the device)
+    tokenizer: object  # tokenizer loaded from the same repo; typed loosely as object
+
+
+class Reference:
+    """Loads a pplx-embed checkpoint (fp32 on CPU by default) and produces golden embeddings.
+
+    >>> ref = Reference("perplexity-ai/pplx-embed-v1-0.6b")
+    >>> ref.embed(["hello world"]).shape          # (1, 1024), dtype int8
+    """
+
+    def __init__(self, hf_repo: str = "perplexity-ai/pplx-embed-v1-0.6b",
+                 device: str = "cpu", dtype: torch.dtype = torch.float32):
+        from transformers import AutoModel, AutoTokenizer
+
+        self.hf_repo = hf_repo
+        self.device = device
+        self.dtype = dtype
+        model = AutoModel.from_pretrained(hf_repo, trust_remote_code=True, dtype=dtype)
+        model.eval().to(device)
+        tokenizer = AutoTokenizer.from_pretrained(hf_repo, trust_remote_code=True)
+        self._l = _Loaded(model=model, tokenizer=tokenizer)
+
+    @property
+    def model(self) -> torch.nn.Module:
+        return self._l.model
+
+    @property
+    def tokenizer(self):
+        return self._l.tokenizer
+
+    @torch.inference_mode()
+    def hidden_states(self, texts: list[str]) -> tuple[torch.Tensor, torch.Tensor]:
+        """Tokenize + bidirectional forward. Returns (last_hidden_state [B,L,D], mask [B,L])."""
+        enc = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+        enc = {k: v.to(self.device) for k, v in enc.items()}
+        out = self.model(**enc)
+        return out.last_hidden_state.float(), enc["attention_mask"].float()
+
+    @torch.inference_mode()
+    def embed(self, texts: list[str], quantization: Quantization = "int8") -> np.ndarray:
+        """Plain embed: masked-mean over valid tokens -> quantize. Returns [B, 1024]."""
+        hidden, mask = self.hidden_states(texts)
+        pooled = masked_mean(hidden, mask)             # [B, D]
+        return quantize(pooled, quantization)
+
+    @torch.inference_mode()
+    def embed_chunks(self, documents: list[list[str]],
+                     quantization: Quantization = "int8") -> list[np.ndarray]:
+        """Context embed (late chunking): join chunks with sep_token, encode the whole
+        window once, mean-pool each chunk's span. Returns one [n_chunks, 1024] array per doc.
+        Spans are anchored on the valid-token range, so left- and right-padded batches both work.
+        Mirrors modeling.py: SEP tokens and padding are excluded from every chunk."""
+        sep = self.tokenizer.sep_token
+        sep_id = self.tokenizer.sep_token_id
+        joined = [sep.join(chunks) for chunks in documents]
+        enc = self.tokenizer(joined, padding=True, truncation=True, return_tensors="pt")
+        enc = {k: v.to(self.device) for k, v in enc.items()}
+        out = self.model(**enc)
+        hidden = out.last_hidden_state.float()         # [B,L,D]
+        input_ids = enc["input_ids"]
+        mask = enc["attention_mask"]
+
+        results: list[np.ndarray] = []
+        for b in range(input_ids.shape[0]):
+            valid = mask[b].bool()
+            idx = valid.nonzero(as_tuple=True)[0]      # valid positions, whichever side is padded
+            start, stop = (int(idx[0]), int(idx[-1]) + 1) if idx.numel() else (0, 0)
+            sep_pos = ((input_ids[b] == sep_id) & valid).nonzero(as_tuple=True)[0].tolist()
+            spans: list[tuple[int, int]] = []
+            for sp in sep_pos:
+                spans.append((start, sp))              # chunk is [start, sep) — SEP excluded
+                start = sp + 1
+            spans.append((start, stop))                # final chunk runs to the last valid token
+            L = hidden.shape[1]
+            P = make_pool_matrix(spans, L, n_max=len(spans))
+            results.append(embed_context(hidden[b], P, quantization))
+        return results
+
+
+__all__ = [
+    "Reference", "Quantization", "N_MAX_CHUNKS",
+    "int8_tanh_quant", "binary_tanh_quant", "ubinary_pack", "quantize",
+    "masked_mean", "make_pool_matrix", "embed_context", "cosine_similarity",
+]
diff --git a/conversion/test_pplx_embed_parity.py b/conversion/test_pplx_embed_parity.py
new file mode 100644
index 0000000..da2aca2
--- /dev/null
+++ b/conversion/test_pplx_embed_parity.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""Parity check: ANE Qwen3 encoder (PplxEmbedModel) vs the fp32 golden reference.
+
+Validates the architecture port *before* CoreML conversion — fast CPU loop, small
+fixed seq-len. Compares both the fp16 pooled embedding and the int8 output against
+conversion/pplx_embed_reference.py on a small multilingual sample.
+
+Usage:
+    python conversion/test_pplx_embed_parity.py                 # plain, L=64, K=8
+    python conversion/test_pplx_embed_parity.py --max-seq-len 128 --rescale-k 16
+
+Pass criteria (cosine vs fp32, zero-norm rows excluded):
+    pooled fp16 ≥ 0.999   (encoder port fidelity)
+    int8        ≥ 0.997   (foundation gate)
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+import numpy as np
+import torch
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, ROOT)
+
+import pplx_embed_reference as R  # noqa: E402
+from models.qwen3_encoder import (  # noqa: E402
+    Qwen3EncoderConfig,
+    PplxEmbedModel,
+    load_encoder_weights,
+    apply_fp16_residual_rescale,
+)
+
+SENTENCES = [
+    "hello world",
+    "Bonjour le monde.",
+    "東京は日本の首都です。",
+    "Embeddings are dense vectors.",
+    "La inteligencia artificial avanza rápido.",
+    "Das Wetter ist heute schön.",
+    "机器学习改变世界。",
+    "Quantum computing uses qubits.",
+    "Привет, как дела?",
+    "المعرفة قوة.",
+    "The mitochondria is the powerhouse of the cell.",
+    "Tokyo Shanghai Paris Berlin Cairo.",
+]
+
+
+def _snapshot_dir(hf_repo: str) -> str:  # returns whatever snapshot_download returns for the repo
+    from huggingface_hub import snapshot_download  # function-local import
+    return snapshot_download(hf_repo, allow_patterns=["*.json", "*.safetensors", "tokenizer*", "*.txt", "*.py"])  # only files matching these patterns
+
+
+def main() -> int:  # returns the process exit code: 0 on PASS, 1 on FAIL
+    ap = argparse.ArgumentParser(description="pplx-embed ANE-encoder parity test")
+    ap.add_argument("--hf-repo", default="perplexity-ai/pplx-embed-v1-0.6b")
+    ap.add_argument("--max-seq-len", type=int, default=64)
+    ap.add_argument("--rescale-k", type=float, default=8.0,
+                    help="fp16 residual rescale factor (0 disables; K=8 is the default sweet spot)")
+    ap.add_argument("--pooled-gate", type=float, default=0.999)
+    ap.add_argument("--int8-gate", type=float, default=0.997)
+    args = ap.parse_args()
+
+    snap = _snapshot_dir(args.hf_repo)
+    L = args.max_seq_len
+    K = args.rescale_k or None  # --rescale-k 0 becomes None, so the rescale step below is skipped
+
+    cfg = Qwen3EncoderConfig.from_json(os.path.join(snap, "config.json"), max_seq_len=L)
+    print(f"[cfg] hidden={cfg.hidden_size} layers={cfg.num_hidden_layers} "
+          f"heads={cfg.num_attention_heads}/{cfg.num_key_value_heads} hd={cfg.head_dim} "
+          f"theta={cfg.rope_theta} L={L} K={K}")
+
+    m_pool = PplxEmbedModel(cfg, "pooled_fp16").eval()  # model built in "pooled_fp16" mode
+    load_encoder_weights(m_pool.encoder, snap)
+    m_int8 = PplxEmbedModel(cfg, "int8").eval()  # second model, built in "int8" mode
+    load_encoder_weights(m_int8.encoder, snap)
+    if K:  # both encoders get the same rescale factor
+        apply_fp16_residual_rescale(m_pool.encoder, K)
+        apply_fp16_residual_rescale(m_int8.encoder, K)
+
+    ref = R.Reference(args.hf_repo)  # reference side; its tokenizer feeds both paths
+    tok = ref.tokenizer
+
+    cos_pool, cos_int8 = [], []  # one cosine per sentence, per output mode
+    for t in SENTENCES:
+        enc = tok([t], return_tensors="pt", truncation=True, max_length=L)  # at most L tokens
+        ids = enc["input_ids"]
+        n = ids.shape[1]
+        pid = torch.zeros((1, L), dtype=torch.int32); pid[0, :n] = ids[0].to(torch.int32)  # ids, zero-filled out to length L
+        pam = torch.zeros((1, L), dtype=torch.float16); pam[0, :n] = 1.0  # mask: 1.0 on the first n slots, 0.0 after
+        with torch.no_grad():
+            o_pool = m_pool(pid, pam).numpy().astype(np.float32)
+            o_int8 = m_int8(pid, pam).numpy().astype(np.float32)
+        ref_pool = R.masked_mean(*ref.hidden_states([t])).numpy().astype(np.float32)  # NOTE(review): reference is called on the raw text, no max_length=L here — confirm for inputs longer than L
+        ref_int8 = ref.embed([t]).astype(np.float32)
+        cp = R.cosine_similarity(o_pool, ref_pool)[0]
+        ci = R.cosine_similarity(o_int8, ref_int8)[0]
+        cos_pool.append(cp); cos_int8.append(ci)
+        print(f"[txt] n={n:3d} pooled={cp:.6f} int8={ci:.6f}  {t[:28]}")
+
+    cp = np.array(cos_pool); ci = np.array(cos_int8)  # cp/ci are reused here as the per-sentence arrays
+    pooled_min, int8_min = float(np.nanmin(cp)), float(np.nanmin(ci))  # nan-aware: NaN entries are skipped
+    n_nan = int(np.isnan(cp).sum() + np.isnan(ci).sum())
+    print(f"\n[POOLED] mean={np.nanmean(cp):.6f} min={pooled_min:.6f}  (gate ≥ {args.pooled_gate})")
+    print(f"[INT8]   mean={np.nanmean(ci):.6f} min={int8_min:.6f}  (gate ≥ {args.int8_gate})")
+    ok = (n_nan == 0) and (pooled_min >= args.pooled_gate) and (int8_min >= args.int8_gate)  # any NaN fails the run, even though the stats above skip NaNs
+    print(f"\n{'PASS' if ok else 'FAIL'}  (nan={n_nan})")
+    return 0 if ok else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/conversion/upload_pplx_embed.py b/conversion/upload_pplx_embed.py
new file mode 100644
index 0000000..09c0b7f
--- /dev/null
+++ b/conversion/upload_pplx_embed.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+"""Publish the prebuilt pplx-embed CoreML buckets to a single HuggingFace repo.
+
+End users should **download** the finished `.mlpackage` buckets, not regenerate them
+(conversion needs the toolkit + minutes per bucket). This script mirrors the local
+bundle layout into one HF repo with **per-bucket subfolders**, so a consumer pulls only
+the bucket(s) they need (the Swift downloader fetches an explicit file list — see
+`PplxEmbed.load(repo:buckets:)` / `Gemma3BundleDownloader`).
+
+Single-repo rationale (see docs/PPLX_EMBED.md): the repo convention is one HF repo per
+model family with subfolders; each bucket `.mlpackage` embeds the same ~1.1 GB weights
+but they are **byte-identical across buckets** (bucket size only changes the traced shape
++ RoPE length), so HF content-addressed LFS stores the blob once — several repos would
+not save storage.
+
+Repo layout (target `<account>/pplx-embed-coreml`):
+    L512-int8/   L1024-int8/   …   L8192-int8/   dyn8192-int8/   (plain)
+    context/L512-int8/  …                                        (context variant)
+    manifest.json   README.md
+Each bucket subfolder mirrors the local bundle: `encoder.mlpackage/` (or `.mlmodelc/`),
+`model_config.json`, `hf_model/` tokenizer json. The upstream `hf_model/*.safetensors`
+are excluded (ship only the tokenizer json — matches the other CoreML repos).
+
+This script does NOT upload directly. It compiles (optionally), stages a clean repo tree
+(hardlinks + manifest.json + README.md), ensures the repo exists, and prints the resumable
+`hf upload-large-folder` command for you to run — that uploader is parallel, xet-accelerated,
+shows realtime progress, and resumes if interrupted (re-run the same command).
+
+Usage:
+    # compile + ship both formats, plain + context:
+    uv run python conversion/upload_pplx_embed.py --repo <account>/pplx-embed-coreml \
+        --plain-dir output/pplx-embed --context-dir output/pplx-embed-context --compile
+    # then run the printed command, e.g.:
+    hf upload-large-folder <account>/pplx-embed-coreml output/pplx-embed-coreml-stage --repo-type=model
+    # restrict to specific buckets:
+    uv run python conversion/upload_pplx_embed.py --repo <account>/pplx-embed-coreml \
+        --plain-dir output/pplx-embed --buckets L512-int8 L1024-int8 L2048-int8
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from pathlib import Path
+
+# Files inside a bucket dir to never upload (the re-conversion-only weights).
+_EXCLUDE_SUFFIXES = (".safetensors",)
+
+
+def _is_shippable_bucket(bucket_dir: Path) -> bool:
+    """A bucket ships iff its model_config.json is an int8-output, fp16-weight model.
+
+    Mirrors PplxEmbed.swift's parseBucket filter so we publish exactly the set the
+    Swift runtime will load (skips pooled_fp16 fidelity bundles + weight-quant probes).
+    """
+    cfg = bucket_dir / "model_config.json"
+    if not cfg.exists():  # no config at all → not a bucket we can classify
+        return False
+    try:
+        j = json.loads(cfg.read_text())
+    except Exception:  # deliberate best-effort: unreadable / malformed JSON → not shippable
+        return False
+    return (isinstance(j, dict) and j.get("output_mode") == "int8"  # non-object JSON (e.g. a list) → False, not AttributeError
+            and (j.get("quantization_weights") or "fp16") == "fp16")  # missing/empty weight quantization counts as fp16
+
+
+def _model_dir_name(bucket_dir: Path) -> str | None:
+    """Return 'encoder.mlmodelc' if that dir exists, else 'encoder.mlpackage' if it exists, else None."""
+    if (bucket_dir / "encoder.mlmodelc").is_dir():  # the compiled dir is checked first
+        return "encoder.mlmodelc"
+    if (bucket_dir / "encoder.mlpackage").is_dir():
+        return "encoder.mlpackage"
+    return None
+
+
+def _compile_bucket(bucket_dir: Path) -> None:
+    """Compile encoder.mlpackage → encoder.mlmodelc in-place (skip if present).
+
+    Ship-compiled path: precompiled `.mlmodelc` removes the consumer's first-load
+    `MLModel.compileModel` step and matches the repo's other CoreML releases. CoreML
+    `.mlmodelc` is loadable across the deployment target's devices (iPhone + Mac).
+    """
+    import subprocess
+
+    mlmodelc = bucket_dir / "encoder.mlmodelc"
+    pkg = bucket_dir / "encoder.mlpackage"
+    if mlmodelc.is_dir() or not pkg.is_dir():  # already compiled, or no package to compile → no-op
+        return
+    subprocess.run(["xcrun", "coremlcompiler", "compile", str(pkg), str(bucket_dir)],  # output dir is the bucket itself
+                   check=True)  # check=True: a non-zero compiler exit raises CalledProcessError
+
+
+def _present_model_dirs(bucket_dir: Path) -> list[str]:
+    """Names of the model dirs that exist in this bucket (0, 1 or 2), `.mlmodelc` listed first."""
+    return [d for d in ("encoder.mlmodelc", "encoder.mlpackage")
+            if (bucket_dir / d).is_dir()]
+
+
+def _bucket_files(bucket_dir: Path) -> list[str]:
+    """Relative file paths (POSIX) to ship for one bucket, excluding safetensors.
+
+    Includes every present model dir (so 'ship both' uploads both encoder.mlmodelc
+    and encoder.mlpackage); the Swift client selectively downloads only one format.
+    """
+    files: list[str] = []
+    model_dirs = _present_model_dirs(bucket_dir)
+    if not model_dirs:  # no model dir → empty list (config/tokenizer are not listed either)
+        return files
+    for model_dir in model_dirs:
+        for p in sorted((bucket_dir / model_dir).rglob("*")):  # sorted → stable ordering
+            if p.is_file() and not p.name.endswith(_EXCLUDE_SUFFIXES):
+                files.append(p.relative_to(bucket_dir).as_posix())
+    # Then model_config.json and everything under hf_model/ (same safetensors exclusion).
+    if (bucket_dir / "model_config.json").is_file():
+        files.append("model_config.json")
+    hf = bucket_dir / "hf_model"
+    if hf.is_dir():
+        for p in sorted(hf.rglob("*")):
+            if p.is_file() and not p.name.endswith(_EXCLUDE_SUFFIXES):
+                files.append(p.relative_to(bucket_dir).as_posix())
+    return files
+
+
+def _discover_buckets(plain_dir: Path | None, context_dir: Path | None,
+                      only: set[str] | None) -> list[tuple[str, Path]]:
+    """Return [(repo_subfolder, local_bucket_dir)] for every shippable bucket.
+
+    Plain buckets map to their dirname (e.g. 'L512-int8'); context buckets are
+    prefixed with 'context/' (e.g. 'context/L512-int8').
+    """
+    out: list[tuple[str, Path]] = []
+    if plain_dir and plain_dir.is_dir():  # a missing or None dir is silently skipped
+        for d in sorted(p for p in plain_dir.iterdir() if p.is_dir()):
+            if only and d.name not in only:  # `only` matches the bare directory name
+                continue
+            if _is_shippable_bucket(d):
+                out.append((d.name, d))
+    if context_dir and context_dir.is_dir():
+        for d in sorted(p for p in context_dir.iterdir() if p.is_dir()):
+            if only and d.name not in only:  # same bare-name filter (no 'context/' prefix)
+                continue
+            if _is_shippable_bucket(d):
+                out.append((f"context/{d.name}", d))
+    return out  # plain buckets first, then context buckets
+
+
+def _build_manifest(buckets: list[tuple[str, Path]], repo: str) -> dict:  # one entry per bucket plus repo-level totals
+    # Size-only manifest: the Swift `load(repo:)` derives download globs from each
+    # bucket's subfolder + formats, and the HF Swift Hub client's content-addressed
+    # cache dedups the byte-identical weight.bin by etag on download — so no per-file
+    # sha is needed here (and staging stays instant, no ~14 GB hash).
+    base_url = f"https://huggingface.co/{repo}/resolve/main"
+    entries = []
+    total = 0
+    for subfolder, bucket_dir in buckets:
+        cfg = json.loads((bucket_dir / "model_config.json").read_text())
+        files_meta = []
+        for rel in _bucket_files(bucket_dir):
+            p = bucket_dir / rel
+            size = p.stat().st_size
+            total += size  # plain sum of file sizes; identical files in several buckets are counted each time
+            repo_path = f"{subfolder}/{rel}"
+            files_meta.append({
+                "path": repo_path,
+                "url": f"{base_url}/{repo_path}",
+                "size_bytes": size,
+            })
+        formats = [d.split(".", 1)[1] for d in _present_model_dirs(bucket_dir)]  # 'encoder.mlmodelc' → 'mlmodelc'
+        entries.append({
+            "subfolder": subfolder,
+            "variant": cfg.get("variant", "plain"),
+            "bucket": cfg.get("bucket"),
+            "dynamic": bool(cfg.get("dynamic", False)),
+            "dynamic_upper": cfg.get("dynamic_upper", 0),
+            "max_seq_len": cfg.get("max_seq_len"),
+            "norm_impl": cfg.get("norm_impl", "ane_cat"),
+            "formats": formats,   # e.g. ["mlmodelc", "mlpackage"]; Swift picks one
+            "files": files_meta,
+        })
+    # Aggregate of the formats actually shipped: "both", "mlmodelc", or "mlpackage".
+    all_formats = {f for _s, d in buckets for f in
+                   (x.split(".", 1)[1] for x in _present_model_dirs(d))}
+    fmt = "both" if all_formats == {"mlmodelc", "mlpackage"} else next(iter(all_formats), "mlpackage")  # empty set → "mlpackage"
+    return {
+        "model_id": Path(repo).name,
+        "repo": repo,
+        "format": fmt,
+        "buckets": entries,
+        "total_size": total,
+    }
+
+
+def _readme(repo: str, manifest: dict) -> str:  # NOTE(review): the template always says both formats are published; manifest["format"] is never read here
+    rows = []  # one markdown table row per manifest bucket
+    for b in manifest["buckets"]:
+        n = sum(f["size_bytes"] for f in b["files"])  # total bytes of this bucket's files
+        kind = "dynamic GPU catch-all" if b["dynamic"] else "fixed ANE bucket"
+        rows.append(f"| `{b['subfolder']}/` | {b['variant']} | {b['bucket']} | "
+                    f"{kind} | {n / 1e9:.2f} GB |")  # size shown in decimal GB (bytes / 1e9)
+    table = "\n".join(rows)  # interpolated into the template below, together with `repo`
+    return f"""\
+---
+language: multilingual
+license: apache-2.0
+base_model: perplexity-ai/pplx-embed-v1-0.6b
+tags:
+  - coreml
+  - apple-neural-engine
+  - qwen3
+  - sentence-embedding
+  - on-device
+library_name: coreml
+---
+
+# pplx-embed for Apple CoreML (ANE-optimized)
+
+CoreML conversion of Perplexity's
+[`pplx-embed-v1-0.6b`](https://huggingface.co/perplexity-ai/pplx-embed-v1-0.6b)
+(a bidirectional Qwen3-0.6B encoder → masked-mean pool → tanh-int8 head) produced with
+the [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) pipeline. Targets macOS 26.
+
+Each subfolder is a **fixed-shape sequence-length bucket** that stays resident on the
+Apple Neural Engine (flexible shapes force CPU fallback). At runtime the Swift package
+pads each input to the smallest bucket that fits; inputs longer than the largest fixed
+bucket fall through to the `dyn*-int8/` flexible GPU catch-all. The encoder uses native
+RMSNorm and a single fixed RoPE table — the ANE-fastest path on M4 Max / macOS 26.
+
+## Buckets in this repo
+
+| Subfolder | Variant | Bucket (L) | Kind | Size |
+|---|---|---|---|---|
+{table}
+
+The encoder `weight.bin` is **byte-identical across every bucket** (a single fixed-size
+RoPE table makes the weights independent of bucket length). So HF stores the weight blob
+**once**, and the HF content-addressed cache fetches it **once by etag** on download —
+pulling several buckets costs ~1.15 GB total, not ~1.15 GB × N.
+
+## Use it
+
+Via the [CoreML-LLM Swift package](https://github.com/john-rocky/CoreML-LLM). It uses the
+HF Swift Hub client, so only the buckets you request are downloaded and the shared weight
+is fetched once into the content-addressed cache:
+
+```swift
+import CoreMLLLM
+let embedder = try await PplxEmbed.load(
+    repo: "{repo}",
+    buckets: [512, 1024, 2048])       // shared HF cache; weight fetched once by etag
+let vecs = try embedder.embed(["On-device embeddings", "Bonjour le monde"])  // [[Int8]]
+```
+
+Each bucket is published in both `.mlpackage` and precompiled `.mlmodelc`; pass
+`preferCompiled: false` for the portable package. Or download the bundle directory
+yourself and load it with `load(bundleDir:)`.
+
+## I/O contract (per bucket `model_config.json`)
+
+- `input_ids (1, L) int32`, `attention_mask (1, L) fp16` (1.0 valid, 0.0 pad)
+- `embedding (1, 1024) int8` — `clamp(round(tanh(x)*127), -128, 127)`; derive
+  `binary`/`ubinary` from the int8 sign (see `PplxEmbed`).
+
+## License
+
+Inherits the base model's [license](https://huggingface.co/perplexity-ai/pplx-embed-v1-0.6b).
+"""
+
+
+def _stage_repo_tree(buckets: list[tuple[str, Path]], stage_dir: Path,
+                     manifest: dict, repo: str) -> None:
+    """Build a clean tree mirroring the repo layout via hardlinks (zero-copy when they succeed).
+
+    `hf upload-large-folder` mirrors a local folder to the repo root, so we stage one:
+    each shippable file is **hardlinked** to its real source under
+    `stage_dir/<subfolder>/<rel>`, plus manifest.json + README.md. Hardlinks are
+    indistinguishable from real files to the uploader (no symlink-following caveat) and
+    cost no extra disk (same inode); we fall back to symlink then copy if hardlinking
+    fails (e.g. cross-filesystem). Re-running rebuilds the tree from scratch.
+    """
+    import shutil
+    if stage_dir.exists():  # an existing stage dir is deleted wholesale before rebuilding
+        shutil.rmtree(stage_dir)
+    stage_dir.mkdir(parents=True)
+    for subfolder, bucket_dir in buckets:
+        for rel in _bucket_files(bucket_dir):
+            src = (bucket_dir / rel).resolve()  # absolute source path, so a fallback symlink is not relative to the stage dir
+            dst = stage_dir / subfolder / rel
+            dst.parent.mkdir(parents=True, exist_ok=True)
+            try:
+                os.link(src, dst)            # hardlink — zero copy, real-file semantics
+            except OSError:
+                try:
+                    os.symlink(src, dst)  # second choice: symlink to the resolved source
+                except OSError:
+                    shutil.copy2(src, dst)  # last resort: a real copy
+    (stage_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
+    (stage_dir / "README.md").write_text(_readme(repo, manifest))
+
+
+def main() -> int:  # exit code: 0 after staging + printing the upload command, 1 if nothing is shippable
+    ap = argparse.ArgumentParser(description="Upload pplx-embed CoreML buckets to HF")
+    ap.add_argument("--repo", required=True, help="Target HF repo id (e.g. acct/pplx-embed-coreml)")
+    ap.add_argument("--plain-dir", default="output/pplx-embed",
+                    help="Local dir of plain buckets (Lxxxx-int8/, dynNNNN-int8/)")
+    ap.add_argument("--context-dir", default=None,
+                    help="Local dir of context buckets (uploaded under context/)")
+    ap.add_argument("--buckets", nargs="*", default=None,
+                    help="Restrict to these bucket dir names (e.g. L512-int8 L1024-int8)")
+    ap.add_argument("--compile", action="store_true",
+                    help="Compile each bucket's encoder.mlpackage → encoder.mlmodelc and ship "
+                         "BOTH formats (no on-device compile for consumers; .mlmodelc+.mlpackage "
+                         "share a bucket's weight.bin so this does not double the payload).")
+    ap.add_argument("--stage-dir", default=None,
+                    help="Where to build the upload tree (default: <plain-dir>/../pplx-embed-coreml-stage)")
+    ap.add_argument("--no-create-repo", action="store_true",
+                    help="Don't create the HF repo (the upload command will).")
+    args = ap.parse_args()
+
+    plain_dir = Path(args.plain_dir).resolve() if args.plain_dir else None
+    context_dir = Path(args.context_dir).resolve() if args.context_dir else None
+    only = set(args.buckets) if args.buckets else None  # empty/absent --buckets means "no filter"
+
+    buckets = _discover_buckets(plain_dir, context_dir, only)
+    if not buckets:
+        print("No shippable buckets found (need int8-output, fp16-weight model_config.json).")
+        return 1
+
+    if args.compile:  # compile before counting files so the listing below includes the new .mlmodelc
+        print("Compiling buckets → encoder.mlmodelc …")
+        for _subfolder, d in buckets:
+            _compile_bucket(d)
+
+    print(f"Discovered {len(buckets)} bucket(s) for {args.repo}:")
+    for subfolder, d in buckets:
+        n = len(_bucket_files(d))
+        print(f"  {subfolder}  ({n} files, {', '.join(_present_model_dirs(d)) or None})  ← {d}")  # list every shipped format, not just one
+
+    manifest = _build_manifest(buckets, args.repo)
+    total_gb = manifest["total_size"] / 1e9
+    print(f"\nTotal payload (pre-LFS-dedup): {total_gb:.2f} GB across "
+          f"{sum(len(b['files']) for b in manifest['buckets'])} files")
+
+    # Stage a clean tree (hardlinks where possible) mirroring the repo + manifest.json + README.md.
+    out_base = plain_dir.parent if plain_dir else Path.cwd()
+    stage_dir = Path(args.stage_dir).resolve() if args.stage_dir \
+        else (out_base / "pplx-embed-coreml-stage")
+    _stage_repo_tree(buckets, stage_dir, manifest, args.repo)
+    print(f"\nStaged repo tree → {stage_dir}  (hardlinks + manifest.json + README.md)")
+
+    # Ensure the repo exists so the resumable uploader can push straight to it.
+    token = os.environ.get("HF_TOKEN") or None  # empty HF_TOKEN is treated as unset
+    from huggingface_hub import HfApi, create_repo
+    if token is None:
+        try:
+            print(f"Using cached HF login: {HfApi().whoami().get('name')}")
+        except Exception:  # best-effort probe: staging already succeeded, so only warn
+            print("NOTE: no HF_TOKEN and no cached login — run `huggingface-cli login` "
+                  "before uploading.")
+    if not args.no_create_repo:
+        try:
+            create_repo(args.repo, repo_type="model", exist_ok=True, token=token)
+            print(f"Repo ready: https://huggingface.co/{args.repo}")
+        except Exception as e:  # non-fatal by design: the failure is reported and the script continues
+            print(f"create_repo skipped ({str(e)[:80]}) — the upload command will create it.")
+
+    print("\nNow run this to upload — resumable, parallel, xet-accelerated, realtime "
+          "progress (re-run the SAME command to resume if interrupted):\n")
+    print(f"  hf upload-large-folder {args.repo} {stage_dir} --repo-type=model\n")
+    print("Weights dedupe across buckets: the encoder uses a single fixed RoPE table, so "
+          "every plain bucket (and its .mlmodelc+.mlpackage) shares ONE ~1.15 GB weight.bin; "
+          "the context variant is a second blob. HF LFS stores each unique blob once, so the "
+          "real upload is ~2 weight blobs regardless of how many buckets you ship. Restrict "
+          "buckets by re-running this script with e.g. --buckets L512-int8 L1024-int8.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/conversion/upload_pplx_embed_dedup.py b/conversion/upload_pplx_embed_dedup.py
new file mode 100644
index 0000000..09b0bd4
--- /dev/null
+++ b/conversion/upload_pplx_embed_dedup.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""Guaranteed-minimal-transfer upload of the staged pplx-embed repo tree.
+
+The staged tree has byte-identical `weight.bin`s across buckets (single fixed RoPE
+table), but `hf upload-large-folder` re-uploads identical oids across its parallel
+batches, so dedup "doesn't work" and you push ~14 GB instead of ~2.3 GB.
+
+This uploads each UNIQUE file content exactly once, then uses HF **server-side copy**
+(`CommitOperationCopy`) to materialize every duplicate path from the already-uploaded
+blob — no re-upload. Net transfer = the unique blobs only (the 2 weight blobs + the
+unique small/tokenizer files).
+
+Run (stop any in-flight `hf upload-large-folder` first):
+    HF_HUB_DISABLE_XET=1 uv run python conversion/upload_pplx_embed_dedup.py \
+        --repo dokterbob/pplx-embed-coreml \
+        --stage output/pplx-embed-coreml-stage
+"""
+from __future__ import annotations
+
+import argparse
+import hashlib
+import os
+from pathlib import Path
+
+# Duplicates at/above this size are materialized via server-side Copy (no re-upload).
+# Only the ~1.15 GB weight.bin clears this bar — and it is always LFS, so Copy is safe.
+# Smaller duplicates (tokenizer, graph files) are just re-added; that upload is tiny and
+# avoids any "Copy a non-LFS file" edge case.
+_LFS_MIN = 50 * 1024 * 1024  # 50 MB
+
+
+def _sha256(path: Path, chunk: int = 1 << 20) -> str:
+    """Hex SHA-256 of the file at `path`, streamed in `chunk`-byte reads."""
+    digest = hashlib.sha256()
+    with open(path, "rb") as fh:
+        # iter(callable, sentinel): keep reading until read() returns b"" (EOF)
+        for block in iter(lambda: fh.read(chunk), b""):
+            digest.update(block)
+    # the hex string is what the caller groups files by
+    return digest.hexdigest()
+
+
+def main() -> int:  # exit code: 0 on success, 1 if the stage dir or HF credentials are missing
+    ap = argparse.ArgumentParser(description="Dedup-minimal upload of the pplx-embed stage")
+    ap.add_argument("--repo", required=True)
+    ap.add_argument("--stage", default="output/pplx-embed-coreml-stage")
+    args = ap.parse_args()
+
+    stage = Path(args.stage).resolve()
+    if not stage.is_dir():
+        print(f"stage dir not found: {stage}")
+        return 1
+
+    from huggingface_hub import (
+        HfApi, create_repo, CommitOperationAdd, CommitOperationCopy,
+    )
+
+    # Walk the staged tree; group repo paths by content sha (the dedup key).
+    print("Hashing staged files (local; finds the unique blobs) …")  # NOTE(review): the full hash pass runs before the credential check further down
+    entries: list[tuple[str, Path, str, int]] = []  # (repo_path, local, sha, size)
+    for root, dirs, fns in os.walk(stage):
+        # Skip dot-dirs (e.g. `.cache/huggingface/` that `hf upload-large-folder`
+        # writes into the folder for resume tracking) — HF rejects `.cache/` paths.
+        dirs[:] = [d for d in dirs if not d.startswith(".")]  # in-place edit so os.walk does not descend into them
+        for fn in fns:
+            if fn.startswith("."):  # dot-files are skipped as well
+                continue
+            lp = Path(root) / fn
+            rp = lp.relative_to(stage).as_posix()
+            entries.append((rp, lp, _sha256(lp), lp.stat().st_size))
+
+    canonical: dict[str, str] = {}        # sha -> first repo_path (the uploaded copy)
+    adds: list = []                       # unique content to upload
+    copies: list = []                     # dups → server-side copy
+    readds: list = []                     # tiny dups → just re-add (cheap)
+    for rp, lp, sha, size in sorted(entries, key=lambda e: e[0]):  # path order makes the canonical pick deterministic
+        if sha not in canonical:
+            canonical[sha] = rp
+            adds.append(CommitOperationAdd(path_in_repo=rp, path_or_fileobj=str(lp)))
+        elif size >= _LFS_MIN:
+            copies.append(CommitOperationCopy(src_path_in_repo=canonical[sha], path_in_repo=rp))
+        else:
+            readds.append(CommitOperationAdd(path_in_repo=rp, path_or_fileobj=str(lp)))
+
+    uniq_gb = sum(e[3] for e in entries if canonical[e[2]] == e[0]) / 1e9  # bytes of canonical paths only; re-added small dups are not included
+    total_gb = sum(e[3] for e in entries) / 1e9
+    print(f"  {len(entries)} files → {len(adds)} unique to upload "
+          f"({uniq_gb:.2f} GB transferred) + {len(copies)} server-side copies "
+          f"+ {len(readds)} tiny re-adds. (apparent total {total_gb:.1f} GB)")
+
+    token = os.environ.get("HF_TOKEN") or None  # empty HF_TOKEN is treated as unset
+    if token is None:
+        try:
+            print(f"Using cached HF login: {HfApi().whoami().get('name')}")
+        except Exception:  # unlike a warning, a failed probe here aborts the run
+            print("ERROR: no HF_TOKEN and no cached login (`huggingface-cli login`).")
+            return 1
+    api = HfApi(token=token)
+    create_repo(args.repo, repo_type="model", exist_ok=True, token=token)
+
+    # Commit 1: every unique blob + the tiny duplicates (these establish the copy sources).
+    print(f"\n[1/2] Uploading {len(adds) + len(readds)} unique/small files "
+          f"(~{uniq_gb:.2f} GB over the wire) …")
+    api.create_commit(args.repo, operations=adds + readds, repo_type="model",
+                      commit_message="upload unique blobs + small files (deduped)")
+
+    # Commit 2: server-side copy the large duplicates from their canonical path.
+    if copies:  # skipped entirely when there are no large duplicates
+        print(f"[2/2] Server-side copying {len(copies)} duplicate weight blobs "
+              f"(no re-upload) …")
+        api.create_commit(args.repo, operations=copies, repo_type="model",
+                          commit_message="server-side copy deduped weight.bin across buckets")
+
+    print(f"\n✅ Done. https://huggingface.co/{args.repo}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/docs/ANE_RMSNORM_FOLLOWUP.md b/docs/ANE_RMSNORM_FOLLOWUP.md
new file mode 100644
index 0000000..418353e
--- /dev/null
+++ b/docs/ANE_RMSNORM_FOLLOWUP.md
@@ -0,0 +1,45 @@
+# Follow-up: native RMSNorm for the shared `ane_ops.ANERMSNorm` (all decoder families)
+
+**Status:** proposed, **not implemented**. Needs broad re-validation before any change.
+
+## What we found (pplx-embed only)
+
+`conversion/ane_ops.ANERMSNorm` implements RMSNorm via the `cat([x, −x]) → LayerNorm →
+chunk` identity, chosen years ago because "the ANE has a highly optimized LayerNorm
+kernel" and lacked a fast native `rsqrt`. On the pplx-embed bidirectional Qwen3 encoder
+we A/B'd that against a native `x * rsqrt(mean(x²) + eps) * w` RMSNorm
+(`conversion/models/qwen3_encoder.Qwen3RMSNorm`, selectable via `norm_impl`), changing
+**only** the 5 norm sites and holding Conv2d-1×1 projections and tensor layout fixed
+(`conversion/experiment_ane_rmsnorm.py`):
+
+| L   | ane_cat (cat/chunk) | native (rsqrt) | speedup | ANE residency | cosine vs fp32 |
+|-----|--------------------:|---------------:|--------:|--------------:|---------------:|
+| 256 | 35.98 ms            | 31.92 ms       | **+12.7%** | 99.81% (both) | 0.99998 |
+| 512 | 100.40 ms           | 82.61 ms       | **+21.5%** | 99.81% (both) | 0.99998 |
+
+Environment: Apple M4 Max, macOS 26, coremltools 9, torch 2.11, B=1, K=8 residual
+rescale, `pooled_fp16`. Native RMSNorm stays **fully ANE-resident** (the planner runs
+`pow`/`reduce_mean`/`rsqrt` on the ANE here) and is fidelity-neutral. The cat/chunk trick
+is now a *de-optimization* on this chip/OS/coremltools combination.
+
+## Why this is only applied to pplx-embed so far
+
+`ane_ops.ANERMSNorm` is shared by ~10 decoder families (Gemma3/4, LFM2, Qwen3.5,
+Qwen3-VL, `base_model`). pplx-embed is a **bidirectional encoder, B=1, fp32 residual,
+fixed full-attention** — a different regime from the **stateful causal decoders** (KV
+cache, T=1 decode + T=32 prefill, sliding/full sandwich norms, the `(1+w)` gain
+convention). The win may or may not carry over; a per-op kernel choice that helps a
+1×L encoder pass need not help a chunked decode step.
+
+## Proposed work (separate PR)
+
+1. Add a `norm_impl` (or `native_rmsnorm=True`) switch to `ane_ops.ANERMSNorm`
+   **without changing its default**, mirroring `Qwen3RMSNorm` (store the same 1-D weight;
+   keep the `plus_one_gain` convention in `ane_norm_from_hf`).
+2. A/B per family with that family's existing latency harness (e.g.
+   `probe_e2e_linear_latency.py`-style), measuring decode **and** prefill, ANE residency,
+   and end-to-end output parity — *not* just a single encoder pass.
+3. Flip the shared default to native **only** for families where it is faster *and*
+   residency/parity hold; leave the others on `ane_cat`.
+
+Do **not** flip the shared default globally off the pplx-embed result alone.
diff --git a/docs/PPLX_EMBED.md b/docs/PPLX_EMBED.md
new file mode 100644
index 0000000..6a4b02f
--- /dev/null
+++ b/docs/PPLX_EMBED.md
@@ -0,0 +1,128 @@
+# pplx-embed — Perplexity embedding models on the ANE
+
+Adds a **bidirectional Qwen3 encoder** path that converts Perplexity's pplx-embed models to CoreML
+and runs them on the Apple Neural Engine (macOS Tahoe / `macOS26`):
+
+- `perplexity-ai/pplx-embed-v1-0.6b` — **plain** sentence embeddings (mean-pool → 1024-d int8).
+- `perplexity-ai/pplx-embed-context-v1-0.6b` — **late chunking** (per-chunk embeddings via a
+  `pool_matrix` matmul; one encoder pass over the whole window).
+
+The encoder is a 28-layer bidirectional Qwen3-0.6B (GQA 16/8, head_dim 128, SwiGLU, QK-norm,
+RoPE θ=1e6) built on the existing ANE primitives (Conv2d-1×1 projections, native RMSNorm,
+`repeat_kv_ane`, `stable_attention`). Output matches the model's own `st_quantize.py` exactly:
+int8 = `clamp(round(tanh(x)·127), −128, 127)` (`torch.round`, half-to-even), plus `binary`
+(sign) and `ubinary` (packbits).
+
+## Files
+
+| what | where |
+|---|---|
+| Model registry | `conversion/config.py` → `pplx-embed`, `pplx-embed-context` |
+| Encoder | `conversion/models/qwen3_encoder.py` |
+| Bundle builder | `conversion/build_pplx_embed_bundle.py` |
+| Golden fp32 reference (oracle) | `conversion/pplx_embed_reference.py` |
+| Parity test | `conversion/test_pplx_embed_parity.py` |
+| ANE RMSNorm A/B | `conversion/experiment_ane_rmsnorm.py` |
+| HF uploader | `conversion/upload_pplx_embed.py` (single repo, per-bucket subfolders) |
+| Swift runtime | `Sources/CoreMLLLM/PplxEmbed.swift` (+ `pplx-embed-demo`, `pplx-embed-bench`) |
+
+## Build
+
+```bash
+# A fixed-shape ANE bucket (the fast path), plain int8 output:
+python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 512
+# Context (late chunking) variant:
+python conversion/build_pplx_embed_bundle.py --model pplx-embed-context --max-seq-len 512
+# The flexible GPU catch-all for inputs larger than the biggest bucket (up to 8192):
+python conversion/build_pplx_embed_bundle.py --model pplx-embed --dynamic-upper 8192
+```
+
+Verify fidelity against the fp32 reference (CPU, fast):
+
+```bash
+python conversion/test_pplx_embed_parity.py        # pooled ≥0.999, int8 ≥0.997
+```
+
+## Use (Swift)
+
+```swift
+let embedder = try await PplxEmbed.load(bundleDir: URL(fileURLWithPath: "output/pplx-embed"))
+let vectors = try embedder.embed(["hello world", "bonjour le monde"])   // [[Int8]] (1024-d)
+// also: embedBinary / embedUBinary; embedContext([[String]]) for late chunking
+```
+
+`embed()` tokenizes, selects the **smallest fixed bucket** that fits, pads/masks, and runs on the
+ANE. Inputs larger than the biggest bucket are routed to the flexible RangeDim model on the GPU
+(non-padded). Run the CLI demo with `swift run -c release pplx-embed-demo --bundle-dir output/pplx-embed --text "…"`.
+
+### Download prebuilt models from Hugging Face
+
+End users can **download** the prebuilt CoreML buckets instead of regenerating them (conversion
+needs the toolkit + minutes per bucket). The buckets live in one repo with per-bucket subfolders
+(`<account>/pplx-embed-coreml`; final id confirmed at publish time) plus a `manifest.json`
+inventory. `PplxEmbed.load(repo:)` reads the manifest and **selectively** downloads only the
+requested buckets (+ the dynamic catch-all) — never the whole repo:
+
+```swift
+let embedder = try await PplxEmbed.load(
+    repo: "<account>/pplx-embed-coreml",
+    buckets: [512, 1024, 2048],      // only these subfolders + tokenizer are fetched
+    into: appSupportDir)             // preferCompiled: true → pulls .mlmodelc (no on-device compile)
+let vectors = try embedder.embed(["hello world"])
+```
+
+Each bucket is published in **both** formats — precompiled `.mlmodelc` (default; no on-device
+compile) and the portable `.mlpackage` (`preferCompiled: false`). The repo hosts both, but the
+client downloads only one format's ~1.1 GB weights per bucket. The demo takes `--repo`:
+`swift run -c release pplx-embed-demo --repo <account>/pplx-embed-coreml --buckets 512 --text "…"`.
+
+Publish with `conversion/upload_pplx_embed.py` (single repo, per-bucket subfolders, `--compile` to
+ship both `.mlmodelc`+`.mlpackage`): it compiles, **stages a clean repo tree** (hardlinks +
+`manifest.json` + README card), ensures the repo exists, and prints a resumable
+`hf upload-large-folder <repo> <stage> --repo-type=model` command (parallel, xet-accelerated,
+realtime progress; re-run to resume). Every bucket's `weight.bin` is now **byte-identical** (the
+RoPE cos/sin tables are built once to a fixed length — `max_position_embeddings`, 32768 — and
+gathered to `S` at runtime via `position_ids` derived from `attention_mask`, so they no longer scale
+with `max_seq_len`; verified L512≡L1024 by sha256). So HF LFS stores the ~1.2 GB blob **once** across
+all buckets (was ~7 GB for 6 buckets), and `.mlmodelc`↔`.mlpackage` within a bucket still dedup too.
+The runtime gather is fold-proof — a plain static `[:S]` slice gets const-folded back to a per-bucket
+constant — and needs no new model input / Swift change. Fidelity unchanged (CoreML L512 cosine vs the
+fp32 oracle 0.99996; ANE residency 99.3%).
+
+## Design notes
+
+- **Fixed-shape buckets, one `.mlpackage` per bucket.** Flexible shapes (EnumeratedShapes/RangeDim)
+  force a fallback off the ANE and are ~10× slower; fixed buckets stay 99.8% on the ANE. Pad each
+  input to the smallest fitting bucket. Latency is O(L²) with a sharp knee at L=1024→2048.
+- **Flexible GPU model is the >max-bucket catch-all only.** Built with `--dynamic-upper N`
+  (RangeDim 1..N), it runs on the GPU, unpadded, for any length up to N — correct (cos 0.999) but
+  ~10× slower than a fixed bucket, so it's used only when no bucket fits.
+- **L=8192 is NOT an ANE bucket — the largest fixed ANE bucket is 4096.** A fixed L=8192 bucket
+  *statically* plans to 99.81% ANE, but the ANE **runtime fails to execute it**
+  (`ANEProgramProcessRequestDirect status=0x15: Program Inference error`,
+  `conversion/measure_l8192_bucket.py`): at 8192 the full bidirectional-attention intermediates
+  (16 heads × 8192² fp16 ≈ 2 GB per score tensor) exceed ANE buffer limits, and the ANE graph
+  compile itself takes ~25 min. So inputs of 4097–8192 tokens stay on the **dynamic GPU
+  catch-all** (which already covers them). `chunk-and-pool` to stay within a bucket would change
+  plain-embedding semantics, so it's a separate design question, not a drop-in.
+- **Native RMSNorm (`norm_impl="native"`, the default).** The 5 encoder norm sites use native
+  `x·rsqrt(mean(x²)+eps)·w` rather than the shared `ane_ops.ANERMSNorm` cat([x,−x])→LayerNorm
+  trick. The trick predates a fast native ANE rsqrt; on M4 Max / macOS 26 / coremltools 9 native
+  is **12–21% faster** on the ANE at identical 99.81% residency and cosine 0.99998
+  (`conversion/experiment_ane_rmsnorm.py`; see [`PPLX_EMBED_GPU_RESIDENCY.md`](PPLX_EMBED_GPU_RESIDENCY.md)
+  and [`ANE_RMSNORM_FOLLOWUP.md`](ANE_RMSNORM_FOLLOWUP.md)). Build with `--norm-impl ane_cat` to
+  fall back. This is local to the pplx-embed encoder — the shared decoder `ANERMSNorm` is untouched.
+- **fp16 residual rescale (K=8).** The 28-layer `down_proj` accumulation overflows fp16; scaling
+  `embed_tokens`/`o_proj`/`down_proj` by 1/K is exact for a pre-norm net (scale-invariant norms)
+  and keeps activations in range. K=8 is the fidelity/overflow sweet spot.
+- **macOS26 native int8 output** is not readable from the Python CoreML bridge; read it in Swift
+  (the `pplx-embed-bench` harness does). Fidelity is otherwise measured via a `pooled_fp16`-output
+  variant in Python.
+- **Throughput:** ANE batch-1 at the smallest bucket is both the lowest-latency and
+  highest-throughput path; batching is not a useful lever on CoreML (see below).
+
+See [`PPLX_EMBED_W8A8.md`](PPLX_EMBED_W8A8.md) (weight/activation quantization is not viable for
+this model), [`PPLX_EMBED_BATCHING.md`](PPLX_EMBED_BATCHING.md) (batching is not a useful
+throughput lever — the ANE is batch-1 by design), and
+[`PPLX_EMBED_GPU_RESIDENCY.md`](PPLX_EMBED_GPU_RESIDENCY.md) (why the GPU `CPU_AND_GPU` path has
+low GPU residency — an inherent CoreML partitioner behavior at B=1, not a fixable issue).
diff --git a/docs/PPLX_EMBED_BATCHING.md b/docs/PPLX_EMBED_BATCHING.md
new file mode 100644
index 0000000..8182341
--- /dev/null
+++ b/docs/PPLX_EMBED_BATCHING.md
@@ -0,0 +1,205 @@
+# CoreML batching throughput — pplx-embed encoder (Apple Silicon)
+
+**Question.** Does CoreML batching (B>1) raise throughput for the pplx-embed
+bidirectional Qwen3-0.6B encoder on Apple Silicon? An earlier quick test (L=512,
+pooled_fp16, warm) showed FLAT docs/sec across B — ANE ~9/s, GPU ~4.7/s. Is that
+real, and why? (All numbers below are on an **Apple M4 Max**, macOS 26.5.1.)
+
+**Verdict (bottom line).** The flat result is **real, not a measurement bug**, but
+it is **device-specific**:
+
+- **ANE (`CPU_AND_NE`) does NOT batch — it gets *worse* with B.** Per-doc latency
+  is flat-to-rising; throughput *drops* to ~0.68–0.71× of B=1 at L=128. The ANE is
+  a batch-1-oriented fixed-function accelerator: it serializes batch rows and adds
+  per-row overhead. Confirmed.
+- **GPU (`CPU_AND_GPU`) batches, but only modestly and only when one sequence
+  under-fills the GPU (small L).** At L=128 it gains up to **1.44×** (B=16); at
+  L=512 a single sequence already saturates the GPU, so batching is flat (~1.0×,
+  even regressing to 0.92× at B=64).
+- **CPU/BLAS (`CPU_ONLY`) batches the most — up to ~1.6× at L=128** (B=16), ~1.1×
+  at L=512. This is the Accelerate/BLAS GEMM batching control behaving as expected,
+  and it is the largest batch win of the three backends — but it is off a slow
+  baseline, so in *absolute* docs/sec it never beats batch-1 ANE.
+
+**So: does CoreML batching help this encoder? Only marginally (~1.4–1.6× on
+GPU/CPU at short sequences, nothing — worse — on the ANE).** The reason is
+architectural, not a bug: the fast path (ANE) is the one backend that fundamentally
+can't batch (it serializes the batch axis), and the backends that *can* batch
+(GPU/CPU) are the slow paths whose batch gains are small and saturate by L≈512.
+
+---
+
+## Setup
+
+- Machine: **Apple M4 Max**, macOS 26.5.1 (arm64). coremltools 9.0. (Numbers are
+  machine-specific; the *qualitative* conclusions — ANE doesn't batch, GPU saturates
+  by L≈512 — should generalize, but absolute throughput will differ on other chips.)
+- Model: `PplxEmbedModel(cfg, output_mode="pooled_fp16")`, fp16 residual rescale K=8,
+  traced+converted at shape **(B, L)** per cell, `minimum_deployment_target=macOS26`,
+  converted with `compute_units=ALL`, then **loaded** under each compute-unit setting
+  (`CPU_AND_NE`, `CPU_AND_GPU`, `CPU_ONLY`) so each backend is forced.
+- Timing: each (L, B, unit) shape **warmed** (3 predicts) then **median of 8** timed
+  predicts. Inputs are fp16 all-ones mask, random int32 ids.
+- `per-doc latency = batch_latency / B`, `docs/sec = B / batch_latency`.
+- Script: `conversion/experiment_batching.py` (parametrized, reproducible).
+- A second independent `--quick` run (runs=6) reproduced the ANE numbers within noise
+  (B=1 L=128: 72 vs 71 docs/s; B=1 L=512: 10.4 vs 9.9 docs/s), confirming stability.
+
+## docs/sec (rows = B, cols = compute unit)
+
+### L=128
+| B  | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY |
+|----|-----------:|------------:|---------:|
+| 1  |      71.18 |       18.37 |    20.87 |
+| 4  |      49.06 |       20.78 |    28.30 |
+| 16 |      50.75 |       26.41 |    33.37 |
+| 64 |      48.70 |       23.87 |    32.37 |
+
+### L=512
+| B  | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY |
+|----|-----------:|------------:|---------:|
+| 1  |       9.94 |        4.58 |     6.29 |
+| 4  |       9.06 |        4.95 |     6.91 |
+| 16 |       8.98 |        4.66 |     6.96 |
+| 64 |       7.32 |        4.20 |     6.86 |
+
+## per-doc latency (ms) — the key view (flat = no batch gain)
+
+### L=128
+| B  | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY |
+|----|-----------:|------------:|---------:|
+| 1  |     14.048 |      54.425 |   47.918 |
+| 4  |     20.383 |      48.116 |   35.336 |
+| 16 |     19.706 |      37.863 |   29.963 |
+| 64 |     20.532 |      41.886 |   30.892 |
+
+### L=512
+| B  | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY |
+|----|-----------:|------------:|---------:|
+| 1  |    100.583 |     218.390 |  159.103 |
+| 4  |    110.398 |     201.895 |  144.813 |
+| 16 |    111.340 |     214.736 |  143.760 |
+| 64 |    136.588 |     238.156 |  145.773 |
+
+## batch speedup = docs/s(B) / docs/s(B=1)
+
+### L=128
+| B  | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY |
+|----|-----------:|------------:|---------:|
+| 4  |      0.69× |       1.13× |    1.36× |
+| 16 |      0.71× |     **1.44×** |  **1.60×** |
+| 64 |      0.68× |       1.30× |    1.55× |
+
+### L=512
+| B  | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY |
+|----|-----------:|------------:|---------:|
+| 4  |      0.91× |       1.08× |    1.10× |
+| 16 |      0.90× |       1.02× |    1.11× |
+| 64 |      0.74× |       0.92× |    1.09× |
+
+---
+
+## Reading the data
+
+1. **ANE: per-doc latency is flat-to-rising and throughput *falls* below 1.0×.**
+   B=1 L=128 is the single fastest cell at **71 docs/s** (14.0 ms). Going to B≥4
+   *raises* per-doc latency to ~20 ms (0.68–0.71×). The ANE runs the batch as B
+   sequential single-row passes plus marshalling overhead — exactly hypothesis #1
+   ("ANE is batch-1 by design"). This is the dominant fact and it holds at **both**
+   L=128 and L=512, so it is **not** a small-L under-utilization artifact.
+
+2. **GPU does parallelize the batch — but only while the GPU is under-filled.**
+   At L=128 the single-sequence GPU pass under-utilizes the ALUs, so batching to
+   B=16 cuts per-doc latency 54→38 ms (**1.44×**). At L=512 one 512-token bidirectional
+   pass already fills the GPU, so batching is flat (1.0–1.08×) and even regresses at
+   B=64 (0.92×, thermal/occupancy). This is hypothesis #3 resolved: batching helps on
+   GPU *only* at small L, and the effect is bounded (~1.4×).
+
+3. **CPU/BLAS batches the most (the control did its job).** `CPU_ONLY` lowers the
+   GEMMs onto Accelerate/BLAS, which amortizes per-call overhead across the batch:
+   L=128 gains up to **1.60×** (B=16), L=512 up to ~1.11×. Largest *relative* gain of
+   the three, but off the slowest baseline, so it never wins in absolute docs/sec.
+
+4. **Why the earlier "flat" quick test looked flat.** It used **L=512** and reported
+   ANE (~9/s) and GPU (~4.7/s). At L=512 *both* of those backends are genuinely flat
+   (ANE structurally; GPU because 512 tokens already saturate it). The quick test
+   wasn't buggy — it just happened to pick the one sequence length where even the
+   batchable backend (GPU) has nothing left to give. Had it also probed **L=128 on
+   GPU/CPU**, it would have seen the modest 1.4–1.6× gains. The blind spot was
+   "only tested L=512, only looked at docs/sec deltas that are real-but-zero there."
+
+## Why the batching gains are bounded
+
+- The **fast path is the ANE**, and the ANE is a fixed-function, batch-1 engine: it
+  streams one (C,1,S) tile at a time and there is no batch axis to parallelize over,
+  so B>1 is pure serialization — it can never give a batch speedup (here it gives a
+  *slowdown*: 0.68–0.91× of B=1 throughput).
+- The **CoreML GPU path *can* batch-parallelize** (hypothesis #2 — "CoreML GPU is
+  incapable of batching" — is therefore *false*), **but** for this 0.6B encoder a single
+  sequence of L≥~256 already saturates the GPU, so there is no spare occupancy left for
+  batching to fill. The gain you can still capture (small L, where one sequence
+  under-fills the ALUs) tops out around 1.4×.
+
+Net: on this model, the throughput-optimal strategy on Apple Silicon is **batch-1 on
+the ANE** (71 docs/s at L=128, 10 docs/s at L=512); **batching is not a useful lever** —
+at most ~1.4–1.6× on GPU/CPU at short sequences, and a net loss on the ANE.
+
+## Device-placement audit (MLComputePlan)
+
+The batched models were compiled (`MLModel.get_compiled_model_path()` → copied to a
+stable `.mlmodelc`; `MLComputePlan.load_from_path` aborts on a raw `.mlpackage` in
+coremltools 9.0) and every MLProgram op's `preferred_compute_device` was tallied.
+This is the *static* compute plan (non-`const` ops only).
+
+| model        | CPU_AND_NE         | CPU_AND_GPU            | CPU_ONLY    |
+|--------------|--------------------|-----------------------|-------------|
+| L=128, B=1   | **ANE 100%**, CPU 0% | CPU 96%, GPU 4%      | CPU 100%    |
+| L=128, B=64  | **ANE 100%**, CPU 0% | CPU 84%, **GPU 16%** | CPU 100%    |
+| L=512, B=1   | **ANE 100%**, CPU 0% | CPU 86%, GPU 14%     | CPU 100%    |
+
+(1975 compute ops total per model.)
+
+**This is the decisive control: the batched (B=64) model is 100% on the ANE — it does
+NOT fall back to CPU.** So the flat-to-rising ANE per-doc latency is *not* a hidden
+device-fallback or a measurement bug: the ANE genuinely accepts the batched graph and
+runs it, but serializes the batch axis. Hypothesis #1 confirmed; the earlier "flat"
+reading was a true hardware property, not a wrong-device artifact.
+
+**The `CPU_AND_GPU` GPU residency is low (4–16%) — investigated and explained, not a
+bug.** A dedicated follow-up ([`PPLX_EMBED_GPU_RESIDENCY.md`](PPLX_EMBED_GPU_RESIDENCY.md))
+ruled out the obvious suspects: it is **not** an fp32-vs-fp16 issue (coremltools lowers
+the whole graph to fp16, so there are no fp32 ops to push to CPU) and **not** an ANE-tuning
+artifact (a from-scratch GPU-native rebuild — `nn.Linear`/native-RMSNorm/`(B,S,H)` — lands
+at the same ~12% GPU). The real cause is CoreML's **static `CPU_AND_GPU` partitioner**:
+for a single-sequence (B=1) transformer it places only the weight-backed projections
+(`conv`/`linear`), plus `silu` and `gather`, on the GPU and routes all elementwise / reductions / layout
+ops **and the attention `matmul`+`softmax`** to the CPU. The static plan is **accurate,
+not misleading** — `CPU_AND_GPU` is genuinely *slower* than `CPU_ONLY` at B=1 (0.70–0.83×),
+so the GPU isn't carrying hidden work. None of this is on the critical path (the shipping
+path is the ANE fixed buckets, 99.8% ANE); no graph change raises GPU residency or makes
+the GPU win at B=1.
+
+## Sanity — batching is real (no broadcast bug)
+
+Feeding **distinct** input rows (random ids per row) to the B=4 model yields **distinct
+output rows** (pairwise max|diff| ≈ 0.34–0.49, not ~0). And batch **row 0 exactly
+matches the standalone B=1 model** on the same input (max|diff| = 0.0000, MATCH). So
+each batch element is encoded independently and correctly — the batch timings are
+genuine per-document work, and the flat throughput is real.
+
+## Control — batch-N vs N × (B=1), total wall time (B=64, `conversion/experiments/control.log`)
+
+Direct head-to-head: one batch-64 predict vs 64 sequential B=1 predicts, total wall time.
+
+| unit \ L    | L=128 batch-64 / 64×(B=1) / speedup | L=512 batch-64 / 64×(B=1) / speedup |
+|-------------|-------------------------------------|-------------------------------------|
+| CPU_AND_NE  | 1313.7ms / 905.1ms / **0.69×**      | 8326.8ms / 6442.6ms / **0.77×**     |
+| CPU_AND_GPU | 2764.4ms / 3530.0ms / **1.28×**     | 16315ms / 14310ms / **0.88×**       |
+| CPU_ONLY    | 2021.5ms / 3027.9ms / **1.50×**     | 9325.8ms / 10233ms / **1.10×**      |
+
+This is the cleanest statement of the result: on the **ANE you are better off looping
+N single-document predicts** than issuing one batch-N call (0.69–0.77×, i.e. batching
+*loses*). On **GPU and CPU, batching helps at short sequences** (1.28× / 1.50× at L=128)
+but the GPU gain evaporates by L=512 (0.88×, GPU saturated) while CPU/BLAS keeps a small
+edge (1.10×). In *absolute* docs/sec, batch-1 ANE is still the throughput winner
+everywhere.
diff --git a/docs/PPLX_EMBED_GPU_RESIDENCY.md b/docs/PPLX_EMBED_GPU_RESIDENCY.md
new file mode 100644
index 0000000..82b0f32
--- /dev/null
+++ b/docs/PPLX_EMBED_GPU_RESIDENCY.md
@@ -0,0 +1,198 @@
+# pplx-embed encoder: why GPU residency is low under `CPU_AND_GPU`
+
+**TL;DR.** The reviewer's hypothesis — that the ANE-oriented graph (Conv2d‑1×1,
+cat/chunk RMSNorm, **fp32** residual/attention/norm) forces ops onto the CPU under
+`CPU_AND_GPU` — is **refuted**. Two independent tests prove it:
+
+1. An **fp16-only** encoder (residual + attention + norms all fp16) produces a
+   *byte-identical* MLProgram op→device tally to the fp32 encoder, with identical
+   latency. coremltools' default conversion pipeline already lowers the whole fp32
+   trace to fp16 in MIL, so there are **no fp32 compute ops left to push to the CPU**.
+2. A **fully GPU‑native** encoder rebuilt from scratch (`nn.Linear`/matmul instead
+   of Conv2d‑1×1, native `x·rsqrt(mean(x²)+eps)·w` RMSNorm instead of cat/chunk, plain
+   `(B,S,H)` layout, no permutes/tile) gets **12% GPU** — effectively the same as
+   the shipped encoder's **8.5%**. Removing every ANE‑ism did *not* move the needle.
+
+The real cause is a property of CoreML's **static GPU planner**: under
+`CPU_AND_GPU` it places only **weight‑backed matmul-family ops** (`conv`/`linear`,
+plus `silu`, `gather`) on the GPU and routes **all elementwise, reduction, softmax,
+layout, and the attention matmul** to the CPU. At **B=1** the resulting CPU↔GPU
+handoffs cost more than the GPU saves: **`CPU_AND_GPU` is *slower* than `CPU_ONLY`**
+(0.70–0.83×). This is not a fixable graph/implementation issue — it is how the
+backend partitions a single‑sequence transformer.
+
+Environment: Apple M4 Max, macOS 26, coremltools 9, torch 2.11. Fixed‑shape
+`pooled_fp16`, K=8 residual rescale, B=1, L∈{256,512}. Static placement via
+`MLComputePlan` on the compiled `.mlmodelc`; timing is `MLModel.predict` median.
+
+---
+
+## 1. Op / dtype breakdown (L=256, B=1, `CPU_AND_GPU`)
+
+Every non-const compute op in the MLProgram is **FLOAT16** (confirmed by reading the
+MIL spec proto: 1976 fp16 outputs, 2 int32, 1 bool). The dtype hypothesis fails at
+the proto level — there is no fp32 to assign anywhere.
+
+### Shipped ANE-tuned encoder (Conv2d‑1×1, cat/chunk RMSNorm) — **8.5% GPU**
+
+| op type        | count | device      | note |
+|----------------|------:|-------------|------|
+| mul            | 452   | **CPU** 100% | RoPE, RMSNorm scale, masking |
+| transpose      | 252   | **CPU** 100% | (B,C,1,S) layout shuffles |
+| conv (1×1)     | 196   | **GPU 71%** / CPU 29% | q/k/v/gate/up→GPU; o/down (out=1024)→CPU |
+| reshape        | 169   | **CPU** 100% | |
+| concat         | 169   | **CPU** 100% | cat([x,−x]) RMSNorm + rotate_half |
+| split          | 169   | **CPU** 100% | chunk() RMSNorm + rotate_half |
+| add            | 141   | **CPU** 100% | residual adds, mask add |
+| layer_norm     | 113   | **CPU** 100% | the RMSNorm kernel |
+| expand_dims    | 113   | **CPU** 100% | |
+| tile           | 57    | **CPU** 100% | GQA k/v expansion |
+| matmul         | 56    | **CPU** 100% | **attention scores + ctx (no weight const)** |
+| softmax        | 28    | **CPU** 100% | attention |
+| silu           | 28    | **GPU** 100% | MLP activation |
+| gather         | 1     | **GPU** 100% | embedding lookup |
+
+### GPU‑native rebuild (Linear/matmul, native RMSNorm, (B,S,H)) — **12% GPU**
+
+| op type      | count | device | note |
+|--------------|------:|--------|------|
+| linear       | 196   | **GPU** 100% | all projections now GPU |
+| silu         | 28    | **GPU** 100% | |
+| gather       | 1     | **GPU** 100% | |
+| mul / add    | 452 / 254 | CPU 100% | RoPE, RMSNorm scale, residual |
+| reshape      | 169   | CPU 100% | |
+| pow / reduce_mean / rsqrt | 113 each | CPU 100% | native RMSNorm internals |
+| transpose    | 112   | CPU 100% | |
+| split / concat | 56 each | CPU 100% | rotate_half |
+| matmul / softmax | 56 / 28 | CPU 100% | **attention still on CPU** |
+| tile / expand_dims | 57 / 57 | CPU 100% | GQA expansion |
+
+**Observation:** switching Conv2d→Linear moved `o_proj`/`down_proj` onto the GPU
+(196 vs 140 projection ops on the GPU), but the *entire elementwise/reduction/attention mass stayed on
+the CPU* — including the attention `matmul`+`softmax`, which the planner never puts
+on the GPU because they have no constant weight to anchor a GPU kernel. Net GPU share
+rose only 8.5%→12%, and latency did not improve.
+
+---
+
+## 2. Root cause
+
+CoreML's `CPU_AND_GPU` **static** partitioner is conservative for single-sequence
+(B=1) transformers. It assigns to the GPU only the ops whose dominant operand is a
+**constant weight** (the projection `conv`/`linear`, the `silu` fused after them, and
+`gather`). Everything data-dependent — elementwise (`mul`/`add`), the RMSNorm
+reductions, the **attention `matmul`/`softmax`**, and all layout ops (`transpose`/
+`reshape`/`concat`/`split`/`tile`/`expand_dims`) — is left on the CPU. The graph then
+ping‑pongs CPU→GPU→CPU around each projection.
+
+This is independent of the encoder's ANE tuning:
+- **dtype** is not the lever (everything is fp16 post-lowering; fp16/fp32 paths are
+  identical ops),
+- the **Conv2d‑1×1 + cat/chunk RMSNorm + (B,C,1,S) layout** is not the lever (a
+  textbook Linear/native‑RMSNorm/(B,S,H) graph partitions the same way).
+
+### Is the static plan misleading? No — confirmed by timing.
+
+If the GPU were secretly carrying work, `CPU_AND_GPU` would beat `CPU_ONLY`. It does
+not — it is **slower**, because the handoff overhead for the few GPU ops exceeds
+their benefit at B=1:
+
+| variant (L=256, B=1) | CPU_ONLY | CPU_AND_GPU | CPU_AND_NE | GPU speedup vs CPU_ONLY |
+|----------------------|---------:|------------:|-----------:|------------------------:|
+| ANE-tuned (fp32 resid) | 82.3 ms | 98.6 ms | **36.1 ms** | **0.83× (slower)** |
+| ANE-tuned (fp16 resid) | — | 98.6 ms | 36.0 ms | — (identical to fp32) |
+| GPU-native rebuild     | 68.0 ms | 96.5 ms | **28.6 ms** | **0.70× (slower)** |
+
+At L=512 the picture is the same: GPU share rises to 14.2% (more matmul work) but
+`CPU_AND_GPU` (220 ms) is still ~2.2× slower than `CPU_AND_NE` (100 ms).
+
+The only regime where the GPU helps (per `docs/PPLX_EMBED_BATCHING.md`) is **small L
+with batch B≫1** (L=128, B=16: ~1.4×), where the projection matmuls grow enough to
+amortize the handoff. At B=1 there is nothing to amortize.
+
+---
+
+## 3. Mitigation — before/after
+
+| metric (L=256, B=1)                | shipped (ANE fp32) | fp16 path | GPU-native | verdict |
+|------------------------------------|-------------------:|----------:|-----------:|---------|
+| GPU residency (static, CPU_AND_GPU)| 8.5%               | 8.5%      | 12.0%      | ~no change |
+| CPU_AND_GPU latency                | 98.6 ms            | 98.6 ms   | 96.5 ms    | ~no change |
+| CPU_AND_NE latency (for context)   | 36.1 ms            | 36.0 ms   | 28.6 ms    | best path unchanged |
+| fidelity (cosine vs HF fp32 oracle)| 0.99993            | 0.99993   | 0.99997    | all PASS (gate 0.99) |
+
+- **fp16 residual path:** fidelity holds (0.99993, identical to fp32), but it buys
+  **zero** GPU residency or latency. Not worth shipping as a GPU lever.
+- **GPU-native rebuild:** raises static GPU share modestly (8.5%→12%) and is even a
+  touch faster on `CPU_AND_NE` (28.6 vs 36.1 ms — interesting as an ANE micro-opt,
+  not the question here), but `CPU_AND_GPU` is unchanged and still slower than CPU.
+
+There is **no graph change that materially raises GPU residency or makes the GPU
+path win at B=1.** The bottleneck is the planner's CPU/GPU partition, not the ops we
+emit.
+
+### Does it help the dynamic RangeDim GPU model?
+
+No. The >max-bucket flexible RangeDim model is GPU-only because **flexible shapes
+force a fallback off the ANE**, not because a GPU‑tuned graph would run well. Its ~10× slowness
+vs a fixed ANE bucket is the same CPU‑heavy `CPU_AND_GPU` partition shown here plus
+RangeDim overhead. A GPU-native graph would not fix it; only a fixed shape (→ANE)
+does. The right lever for >max-bucket inputs is **more/larger fixed ANE buckets** (only up to L=4096:
+per `docs/PPLX_EMBED.md` a fixed L=8192 bucket fails at ANE runtime), or **chunk-and-pool** to stay within a bucket — not a GPU-tuned encoder.
+
+---
+
+## 4. Verdict & recommendation
+
+**Inherent CoreML `CPU_AND_GPU` backend behavior for single-sequence transformers —
+not a fixable implementation issue in our encoder.** Evidence:
+
+1. fp16 and fp32 encoders compile to identical op→device plans (dtype is not the
+   lever; everything is fp16 post-lowering).
+2. A clean GPU-native graph (no Conv2d‑1×1, no cat/chunk RMSNorm, no fp32) lands at
+   the same ~8–12% GPU share — removing the ANE-isms changes nothing.
+3. The static plan is **accurate**, not misleading: `CPU_AND_GPU` is *slower* than
+   `CPU_ONLY` (0.70–0.83×), so the GPU genuinely is not carrying meaningful work at
+   B=1. The CPU/GPU handoff dominates.
+
+**Recommendations:**
+
+- **Keep the shipped encoder as-is (fp32 residual, ANE-tuned).** It is correct,
+  fidelity-safe, and the ANE path (`CPU_AND_NE`, 99.8% ANE, 36 ms) is by far the
+  fastest. Do **not** add an fp16 path as a "GPU lever" — it does nothing for the
+  GPU. (An fp16 residual path was prototyped during this investigation and is
+  fidelity-safe, but provides no GPU benefit, so it was not kept.)
+- **Do not invest in a GPU-tuned encoder variant.** It will not beat the ANE bucket
+  and will not even beat `CPU_ONLY` at B=1.
+- **For the >max-bucket catch-all,** prefer adding a larger fixed ANE bucket (up to L=4096 — per
+  `docs/PPLX_EMBED.md`, a fixed L=8192 bucket fails at ANE runtime) or chunk-and-pool over the
+  dynamic GPU model. If the GPU model must stay, the only knob that helps is **batching at small L**
+  (≤128, B≫1, ~1.4×), which the catch-all use case (single long doc) does not exercise.
+- **One incidental finding worth a follow-up:** the GPU-native rebuild ran the *ANE*
+  path slightly faster (28.6 vs 36.1 ms at L=256). That is an ANE micro-optimization
+  question (native RMSNorm vs cat/chunk on this chip/OS), orthogonal to GPU residency
+  — flagged, not pursued here.
+
+  **Resolved (follow-up).** `conversion/experiment_ane_rmsnorm.py` isolated the RMSNorm
+  (changing *only* the 5 encoder norm sites, `norm_impl=native` vs `ane_cat`, holding
+  Conv2d-1×1 and layout fixed): native `x·rsqrt(mean(x²)+eps)·w` RMSNorm is **12.7% faster at
+  L=256 and 21.5% faster at L=512** on `CPU_AND_NE` (M4 Max / macOS 26 / coremltools 9),
+  at **identical 99.81% ANE residency** and cosine **0.99998** vs the fp32 oracle. So the
+  RMSNorm alone accounts for roughly half of the GPU-native rebuild's ANE speedup at L=256 (≈4 ms of ≈7.5 ms) —
+  the cat([x,−x])→LayerNorm trick (chosen years ago because the ANE lacked a fast native
+  rsqrt) is now a *de-optimization* on this stack. **`norm_impl=native` is now the
+  pplx-embed encoder default.** A shared rollout to the other decoder families' shared
+  `ane_ops.ANERMSNorm` is a separate flagged follow-up (`docs/ANE_RMSNORM_FOLLOWUP.md`).
+
+---
+
+## Method
+
+For each variant (shipped ANE-tuned encoder; an fp16-residual prototype; a from-scratch
+GPU-native encoder using `nn.Linear`/native RMSNorm/`(B,S,H)` layout sharing the same
+weights): build a fixed-shape `pooled_fp16` model (`build_pplx_embed_bundle.py`), compile
+to `.mlmodelc`, tally every non-const MLProgram op by `preferred_compute_device` **and**
+output dtype via `MLComputePlan` (the op×dtype×device breakdown above), then time
+`MLModel.predict` (median) under `CPU_ONLY` / `CPU_AND_GPU` / `CPU_AND_NE` and check
+cosine fidelity vs the fp32 `Reference` oracle. The op-device tally is read from the
+compiled model's MIL spec proto.
diff --git a/docs/PPLX_EMBED_W8A8.md b/docs/PPLX_EMBED_W8A8.md
new file mode 100644
index 0000000..23ecba7
--- /dev/null
+++ b/docs/PPLX_EMBED_W8A8.md
@@ -0,0 +1,182 @@
+# W8A8 (int8 weights + int8 ACTIVATIONS) viability — milestone B4
+
+**Question.** Weight-only quant is a proven dead end for this encoder (int8 linear ~0.42
+cosine, int4 palettize ~0.905) *and* buys only 4–8% latency because the forward is
+activation/compute-bound (fp16 attention), not weight-bandwidth-bound. The only real
+bandwidth lever is **activation** quantization. So: can W8A8 reach acceptable fidelity, or
+does it hit the documented **~cos 0.57 wall** on this attention family?
+
+**Model.** `perplexity-ai/pplx-embed-v1-0.6b` — 28-layer bidirectional Qwen3-0.6B encoder,
+fp16, head_dim 128, GQA 16/8, SwiGLU, QK-norm, RoPE θ=1e6. Built via
+`conversion/models/qwen3_encoder.py` (`PplxEmbedModel(cfg, output_mode="pooled_fp16")`),
+measured against `conversion/pplx_embed_reference.py` (`Reference.embed` fp32 oracle,
+int8-tanh output, cosine).
+
+---
+
+## Approach (reproducible: `conversion/experiment_w8a8.py`)
+
+1. Build an fp16 `pooled_fp16` encoder at a **small bucket** (L=128 default) so the pooled
+   vector is Python-readable on macOS26 (native int8 output is *not* Python-readable;
+   pooled_fp16 is). `compute_units=ALL`, `minimum_deployment_target=macOS26`.
+2. Calibrate activation ranges on a 14-text multilingual corpus (en/es/fr/de/ja/zh + short
+   fragments), tokenized and right-padded to the bucket, via
+   `cto.experimental.linear_quantize_activations`.
+3. Quantize weights int8 (`linear_symmetric`, `weight_threshold=512`) on top → W8A8.
+4. Predict on 12 held-out multilingual eval texts, apply `int8_tanh_quant` to the CoreML
+   pooled output, and compute cosine vs `Reference.embed`. Report mean/min.
+
+**Activation-quant mode is the crux** (parametrized `--mode asymmetric|symmetric`):
+
+- The pad-mask add uses `Qwen3Encoder.NEG_INF = -1e4`; CoreML lowers this toward the fp16
+  floor (−65504). A **symmetric** activation quantizer sets `scale ≈ 1e4/127 ≈ 79`, so real
+  attention scores (±10) round to ≈0 → after 28 layers the output collapses. This is the
+  mechanism behind the documented wall when symmetric quant is used.
+- **Asymmetric** (`mode="linear"`) lets the range span `[−1e4, +score]`; when that span
+  overflows fp16 the computed scale goes `inf`, coremltools' `isinf` guard fires, and the
+  op is **skipped (left in fp16)** — exactly the desired behaviour for the mask add, while
+  every other (small-range) activation quantizes correctly. This is the only mode with a
+  chance of beating the wall.
+
+Two coremltools-9 patches are required for the activation-quant pass to run at all (both in
+the script, written upstream-native):
+- `_cast` const-fold extracts a Python scalar before `int()/bool()` (numpy≥2 (1,)-array fix).
+- `insert_prefix_quantize_dequantize_pair.transform_op` skips ops whose input `x` is
+  non-float (int32 mask/embedding path) — MIL `quantize` requires float input.
+
+Contingencies swept by `--all`: asymmetric vs symmetric; rescale K ∈ {0 (none), 8, 16}
+(the fp16 residual rescale interacts with activation ranges — K shrinks the residual stream
+K×, changing what the activation quantizer sees).
+
+---
+
+## Reference points (measured previously, this repo / knowledge base)
+
+| config | cos vs fp32 ref (int8 output) | source |
+|---|---|---|
+| fp16 baseline | **0.999** (min 0.99912) | `weight-quant-is-a-dead-end.md` |
+| int8 `linear_quantize_weights` (weight-only) | **0.42** mean (min 0.006) | same |
+| int4 `palettize_weights` (g=32, weight-only) | **0.905** | same |
+| documented **A8 / W8A8 wall** on this attention family | **~0.57** | `contingency-fixes.md`, plan |
+| fidelity gate | **0.990** | `fidelity-gates.md` |
+
+---
+
+## Results (W8A8 — MEASURED, L=128, 12 multilingual eval texts)
+
+Run: `uv run python conversion/experiment_w8a8.py --all --bucket 128`. Each variant: 14-text
+calibration via `cto.experimental.linear_quantize_activations`, then int8 weight quant.
+fp16 baseline is the same graph with no quant (sanity: it reproduces the ~0.999 fp16 number).
+
+| variant | activation mode | rescale K | fp16 mean | **W8A8 mean cos** | W8A8 min | beats 0.57 wall? | ≥0.990 gate? |
+|---|---|---|---|---|---|---|---|
+| `w8a8-asymmetric-k8-L128`  | asymmetric | 8  | 0.9999 | **0.0157** | −0.0219 | ❌ NO | ❌ |
+| `w8a8-symmetric-k8-L128`   | symmetric  | 8  | 0.9999 | **0.0020** | −0.0459 | ❌ NO | ❌ |
+| `w8a8-asymmetric-k0-L128`  | asymmetric | 0  | 0.9996 | **0.0191** | −0.0078 | ❌ NO | ❌ |
+| `w8a8-asymmetric-k16-L128` | asymmetric | 16 | 0.9995 | **0.0201** | −0.0105 | ❌ NO | ❌ |
+
+**All four W8A8 variants collapse to cosine ≈ 0** (statistically orthogonal to the reference —
+the embedding carries no signal). This is *worse* than the documented ~0.57 wall and far worse
+than the weight-only int8 number (0.42). Activation int8 is even more destructive than weight
+int8 on this encoder.
+
+Key observations:
+- The collapse is **independent of all the contingencies**: asymmetric vs symmetric makes no
+  meaningful difference (both ≈0), and the fp16 residual rescale K (0 / 8 / 16) does not move
+  it. So the failure is not the mask-sentinel scale blow-up alone, nor the rescale interaction —
+  it is that per-tensor int8 activation quant across this 28-layer bidirectional graph destroys
+  the representation outright.
+- The `linear_quantize_activations` pass emits **fp16 overflow / NaN-scale / invalid-cast
+  RuntimeWarnings** during `insert_prefix_quantize_dequantize_pair` (the −1e4 mask sentinel and
+  other large-range activations overflow the int8 affine `zero_point` computation). Some ops are
+  skipped (left fp16) as designed, but enough activations are quantized to wreck the signal.
+- The fp16 baseline on the identical graph is 0.9995–0.9999, so the build/measure harness is
+  correct — the loss is entirely from the activation quantization.
+
+### ANE residency (MEASURED — W8A8 *does* stay on ANE)
+
+`uv run python conversion/experiment_w8a8.py --audit /tmp/w8a8-experiment/w8a8-asymmetric-k8-L128.mlpackage`
+(compiles via `coremltools.models.utils.compile_model` + `MLComputePlan`):
+
+```
+total ops: 3531
+  ANE: 3322  (94.1%)
+  unknown: 199  (5.6%)   constexpr_blockwise_shift_scale  (int8 weight-dequant consts, not compute dispatch)
+  CPU: 10   (0.3%)       greater_equal/add/select/gather  (int8-tanh + pooling tail)
+```
+
+So residency is **not** the blocker — the W8A8 model compiles and runs **94% ANE-resident**
+(the only non-ANE compute is the tiny pooling/tanh tail, identical to fp16). The int8 weights
+lower to `constexpr_blockwise_shift_scale` consts. The model is perfectly deployable on ANE; it
+just produces garbage.
+
+---
+
+## VERDICT: NOT VIABLE (post-training). Needs rotation pre-conditioning or QAT.
+
+Naive post-training W8A8 is **dead on this encoder** — it does not approach the 0.990 gate, does
+not beat the ~0.57 wall, and in fact collapses all the way to **cos ≈ 0** (worse than weight-only
+int8's 0.42). Neither asymmetric activation quant nor any rescale-K setting rescues it. ANE
+residency is fine (94%), so the failure is purely numerical fidelity, not a fallback problem.
+
+**Why it collapses this hard** (vs the documented 0.57): this is a 28-layer *bidirectional*
+encoder where every layer's input passes through QK-norm + RoPE and the residual stream is held
+in fp32 specifically because activations are wide / outlier-heavy. Per-tensor uniform int8
+activation quant sets one scale per tensor from the max, so a few outliers dictate the step
+size and the bulk of the heavy-tailed distribution rounds toward zero; compounded over 28
+layers the signal is annihilated. The mask sentinel (−1e4) makes at
+least one attention-input tensor's range pathological (the overflow warnings), and the
+asymmetric "skip on inf" trick only saves *that* op — every other quantized activation is still
+crushed. Uniform int8 simply cannot represent this activation distribution.
+
+**Path to viability** (matches `docs/QUANTIZATION_SURVEY.md`): the bandwidth win requires
+activation quant, and activation quant requires **outlier pre-conditioning**:
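+- **SpinQuant / QuaRot** — fold a learned (SpinQuant) or Hadamard (QuaRot) rotation into the
+  weights at *zero* runtime cost (this holds for the offline-foldable rotations only; the
+  online Hadamards are not free — see the mitigation-feasibility table below); it spreads
+  activation outliers across channels so int8 activations become representable. This is the
+  highest-ROI next step.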
+- **SmoothQuant** — migrate per-channel activation scale into the weights pre-quant.
+- Failing those, full **QAT**.
+
+Until one of those is in place, the shipping configuration remains **fp16 + buckets** (the
+weight-quant lesson's conclusion stands, now extended: *activation* quant is also a post-training
+dead end without rotation/QAT).
+
+> Honest negative result: the wall is not just confirmed, it is *deeper* than documented for this
+> port — post-training W8A8 lands at ≈0, not 0.57. The 0.57 figure in the knowledge base likely
+> reflects a partial / A8-only or differently-scoped experiment; full per-tensor W8A8 here is ≈0.
+
+---
+
+## Mitigation feasibility (researched 2026-06) + the latency reality
+
+Recovery candidates and — critically — whether they map to the ANE's fixed op set:
+
+| approach | recovery (LLM literature) | ANE-deployable? | effort |
+|---|---|---|---|
+| **SmoothQuant** — per-channel scale migrated activation→weight | W8A8 "negligible loss" on LLMs; "alone insufficient" for total collapse | yes (folds into weights, no runtime ops) | low |
+| **Rotation (QuaRot/SpinQuant)** | 4-bit ~99% zero-shot; 8-bit "negligible" (extrapolated) | **partial** — R1/R2 fold offline, but the down-proj/value-path **online Hadamards have no adjacent linear layer to absorb them** → extra runtime ops the ANE may reject/spill; **no public QuaRot/SpinQuant-on-ANE precedent** | high |
+| **QAT + distillation** — distil from the fp32 teacher on **unlabeled** text (no labels/contrastive pipeline) | 8-bit "almost lossless"; total collapse likely needs **full** QAT, not LoRA | yes (weights only; deployed graph stays standard int8 matmul) | highest |
+
+QAT-distillation is the only path with **both** strong recovery and clean ANE deployment.
+
+### But the premise is wrong — int8 activations barely help here (MEASURED)
+
+W8A8 exists to buy ANE bandwidth via int8 *activations*. Measured (L=128, cpuAndNE):
+
+| precision | median latency |
+|---|---|
+| fp16 (pooled) | 14.0 ms |
+| W8A8 (int8 act) | 12.7 ms (**~9% faster**) |
+
+The ANE is fp16-native; int8-activation matmul is only marginally faster, and the attention
+score matmuls (activation×activation) that dominate at large L are not int8-accelerated at all.
+With weight quant's ~4–8%, the **whole quantization latency upside is ~10%** — not the 2× the
+bandwidth intuition suggests. So even a *perfect* fidelity recovery (weeks of QAT, or a rotation
+reimplementation that may not map to ANE) would buy ~10% latency. **Not worth it.** Ship fp16 +
+buckets (0.999, 99.8% ANE, 101 ms at L=512); revisit only if a future ANE accelerates int8
+compute, or if memory (not latency) becomes the binding constraint.
+
+## Files
+
+- `conversion/experiment_w8a8.py` — builds + measures W8A8 fidelity, parametrized
+  (`--mode`, `--rescale-k`, `--bucket`, `--all`), saves `.mlpackage` artifacts for ANE audit.
+- `docs/PPLX_EMBED_W8A8.md` — this note.