diff --git a/.gitignore b/.gitignore index 6e84a38..20485ef 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ Examples/CoreMLLLMChat/gemma4-e2b/ # W4A8 calibration data — regeneratable from gen_calib_data_real.py conversion/calibration_data/ +# Generated experiment artifacts (not the tracked experiments/bonsai sources) +conversion/experiments/batching_models/ +conversion/experiments/*.log diff --git a/Package.resolved b/Package.resolved index e77ef32..42fe11a 100644 --- a/Package.resolved +++ b/Package.resolved @@ -1,6 +1,15 @@ { - "originHash" : "c8eca12331b572902235e4ef15b1dac1f7cc8320a1686ce37591974ed9c33b78", + "originHash" : "8fc6d2c9b6d25f8ae57b3a3d12f5749a87f36ccd1da52410b93df8e63507c74d", "pins" : [ + { + "identity" : "async-http-client", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/async-http-client", + "state" : { + "revision" : "3a5b74a58782c3b4c1f0bc75e9b67b10c2494e8f", + "version" : "1.33.1" + } + }, { "identity" : "eventsource", "kind" : "remoteSourceControl", @@ -10,6 +19,15 @@ "version" : "1.4.1" } }, + { + "identity" : "swift-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-algorithms.git", + "state" : { + "revision" : "87e50f483c54e6efd60e885f7f5aa946cee68023", + "version" : "1.2.1" + } + }, { "identity" : "swift-asn1", "kind" : "remoteSourceControl", @@ -19,6 +37,15 @@ "version" : "1.7.0" } }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms.git", + "state" : { + "revision" : "9d349bcc328ac3c31ce40e746b5882742a0d1272", + "version" : "1.1.3" + } + }, { "identity" : "swift-atomics", "kind" : "remoteSourceControl", @@ -28,6 +55,15 @@ "version" : "1.3.0" } }, + { + "identity" : "swift-certificates", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-certificates.git", + "state" : { + "revision" : "bde8ca32a096825dfce37467137c903418c1893d", 
+ "version" : "1.19.1" + } + }, { "identity" : "swift-collections", "kind" : "remoteSourceControl", @@ -37,6 +73,15 @@ "version" : "1.4.1" } }, + { + "identity" : "swift-configuration", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-configuration.git", + "state" : { + "revision" : "be76c4ad929eb6c4bcaf3351799f2adf9e6848a9", + "version" : "1.2.0" + } + }, { "identity" : "swift-crypto", "kind" : "remoteSourceControl", @@ -46,6 +91,33 @@ "version" : "4.4.0" } }, + { + "identity" : "swift-distributed-tracing", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-distributed-tracing.git", + "state" : { + "revision" : "dc4030184203ffafbb2ec614352487235d747fe0", + "version" : "1.4.1" + } + }, + { + "identity" : "swift-http-structured-headers", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-http-structured-headers.git", + "state" : { + "revision" : "933538faa42c432d385f02e07df0ace7c5ecfc47", + "version" : "1.7.0" + } + }, + { + "identity" : "swift-http-types", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-http-types.git", + "state" : { + "revision" : "db774a277f60063a32d854f2980299caf06da041", + "version" : "1.6.0" + } + }, { "identity" : "swift-huggingface", "kind" : "remoteSourceControl", @@ -64,6 +136,15 @@ "version" : "2.3.5" } }, + { + "identity" : "swift-log", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-log.git", + "state" : { + "revision" : "92448c359f00ebe36ae97d3bd9086f13c7692b5a", + "version" : "1.13.2" + } + }, { "identity" : "swift-nio", "kind" : "remoteSourceControl", @@ -73,6 +154,69 @@ "version" : "2.98.0" } }, + { + "identity" : "swift-nio-extras", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-nio-extras.git", + "state" : { + "revision" : "d2eeec0339074034f11a040a74aa2a341a2c4506", + "version" : "1.34.1" + } + }, + { + "identity" : "swift-nio-http2", + 
"kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-nio-http2.git", + "state" : { + "revision" : "61d1b44f6e4e118792be1cff88ee2bc0267c6f9a", + "version" : "1.44.0" + } + }, + { + "identity" : "swift-nio-ssl", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-nio-ssl.git", + "state" : { + "revision" : "407d82d5b6cc00e1c3fb83a81b1539b70c788c5e", + "version" : "2.37.1" + } + }, + { + "identity" : "swift-nio-transport-services", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-nio-transport-services.git", + "state" : { + "revision" : "67787bb645a5e67d2edcdfbe48a216cc549222d5", + "version" : "1.28.0" + } + }, + { + "identity" : "swift-numerics", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-numerics.git", + "state" : { + "revision" : "0c0290ff6b24942dadb83a929ffaaa1481df04a2", + "version" : "1.1.1" + } + }, + { + "identity" : "swift-service-context", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-service-context.git", + "state" : { + "revision" : "d0997351b0c7779017f88e7a93bc30a1878d7f29", + "version" : "1.3.0" + } + }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle", + "state" : { + "revision" : "9829955b385e5bb88128b73f1b8389e9b9c3191a", + "version" : "2.11.0" + } + }, { "identity" : "swift-system", "kind" : "remoteSourceControl", @@ -91,6 +235,15 @@ "version" : "1.3.0" } }, + { + "identity" : "swift-xet", + "kind" : "remoteSourceControl", + "location" : "https://github.com/huggingface/swift-xet.git", + "state" : { + "revision" : "341bfd4172f6a57119bfd49bafa11cf5d21fab75", + "version" : "0.2.3" + } + }, { "identity" : "yyjson", "kind" : "remoteSourceControl", diff --git a/Package.swift b/Package.swift index 9ec6f0c..58c84e7 100644 --- a/Package.swift +++ b/Package.swift @@ -1,4 +1,4 @@ -// 
swift-tools-version: 6.0 +// swift-tools-version: 6.1 import PackageDescription let package = Package( @@ -23,6 +23,15 @@ let package = Package( // / `Gemma3BundleDownloader` directly, without pulling the sample CLIs. .executable(name: "functiongemma-demo", targets: ["FunctionGemmaDemo"]), .executable(name: "embeddinggemma-demo", targets: ["EmbeddingGemmaDemo"]), + // pplx-embed — Swift fidelity + latency harness for the native int8 + // encoder output (not readable from the Python bridge on macOS26). + .executable(name: "pplx-embed-bench", targets: ["PplxEmbedBench"]), + // pplx-embed — the official embedding contract (plain + context late + // chunking; int8/binary/ubinary). The `PplxEmbed` runtime ships inside + // the CoreMLLLM library; this product exposes it under its own name so a + // wrapper can depend on just the embedder without pulling the sample CLIs. + .library(name: "PplxEmbed", targets: ["CoreMLLLM"]), + .executable(name: "pplx-embed-demo", targets: ["PplxEmbedDemo"]), ], dependencies: [ // Range widened to 1.0.x: mlx-swift-examples caps swift-transformers at @@ -31,12 +40,21 @@ let package = Package( // `Tokenizer` protocol + `AutoTokenizer.from(modelFolder:)` API that // CoreMLLLM uses, so 1.0.x is source-compatible with 1.1.x here. .package(url: "https://github.com/huggingface/swift-transformers", from: "1.0.0"), + // HF's native Swift Hub client (standalone — does NOT pull swift-transformers, + // so it's orthogonal to the 1.0.x cap above). Used by PplxEmbed.load(repo:) for + // content-addressed snapshot downloads: the byte-identical weight.bin across + // buckets is fetched ONCE (then reused) — native download dedup. The `Xet` trait + // is REQUIRED: HF stores large files Xet-backed by default, and without it the + // client forces the LFS transport and 404s on Xet-only blobs. (Needs tools 6.1+.) 
+ .package(url: "https://github.com/huggingface/swift-huggingface", from: "0.9.0", + traits: ["Xet"]), ], targets: [ .target( name: "CoreMLLLM", dependencies: [ .product(name: "Tokenizers", package: "swift-transformers"), + .product(name: "HuggingFace", package: "swift-huggingface"), ], swiftSettings: [.swiftLanguageMode(.v5)] ), @@ -128,5 +146,25 @@ let package = Package( path: "Sources/ane-residency-gate", swiftSettings: [.swiftLanguageMode(.v5)] ), + // pplx-embed Swift fidelity + latency bench. No CoreMLLLM / tokenizer + // dependency — reads pre-tokenized fixtures (conversion/export_swift_fixtures.py), + // so it builds fast and stays self-contained. + .executableTarget( + name: "PplxEmbedBench", + path: "Sources/pplx-embed-bench", + swiftSettings: [.swiftLanguageMode(.v5)] + ), + // pplx-embed demo CLI — embeds a few strings (plain or context) and + // prints int8/binary/ubinary summaries. Uses the PplxEmbed runtime + + // tokenizer from the CoreMLLLM library. + .executableTarget( + name: "PplxEmbedDemo", + dependencies: [ + "CoreMLLLM", + .product(name: "Tokenizers", package: "swift-transformers"), + ], + path: "Sources/pplx-embed-demo", + swiftSettings: [.swiftLanguageMode(.v5)] + ), ] ) diff --git a/README.md b/README.md index 6be7f49..bb1adf3 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ # CoreML-LLM -**On-device LLMs on the Apple Neural Engine.** Run Gemma 4, Qwen3.5, Qwen3-VL, FunctionGemma, EmbeddingGemma, and Liquid AI's LFM2.5 on iPhone with CoreML — ANE-first, battery-friendly, no server. +**On-device LLMs on the Apple Neural Engine.** Run Gemma 4, Qwen3.5, Qwen3-VL, FunctionGemma, EmbeddingGemma, Perplexity pplx-embed, and Liquid AI's LFM2.5 on iPhone with CoreML — ANE-first, battery-friendly, no server. Where [MLX Swift](https://github.com/ml-explore/mlx-swift) is the right call when you want maximum GPU throughput, CoreML-LLM is what you use when the LLM should live on the **ANE** so the GPU stays free for the rest of the app. 
[![App Store](https://toolbox.marketingtools.apple.com/api/v2/badges/download-on-the-app-store/black/en-us?releaseDate=1735689600)](https://apps.apple.com/jp/app/models-zoo/id6762083207) +**Embeddings:** Perplexity's `pplx-embed` (bidirectional Qwen3 encoder, plain + late-chunking) runs on the ANE via the `PplxEmbed` Swift API — see [`docs/PPLX_EMBED.md`](docs/PPLX_EMBED.md). + ## Use in your app Add the package, name a model, generate. diff --git a/Sources/CoreMLLLM/PplxEmbed.swift b/Sources/CoreMLLLM/PplxEmbed.swift new file mode 100644 index 0000000..637dd24 --- /dev/null +++ b/Sources/CoreMLLLM/PplxEmbed.swift @@ -0,0 +1,584 @@ +import CoreML +import Foundation +import HuggingFace +import Tokenizers + +/// Runtime for Perplexity's pplx-embed (a bidirectional Qwen3 encoder → masked +/// mean pool → tanh int8 quantize). Exposes the official pplx-embed contract: +/// +/// plain : `[String] -> [[Int8]]` (1024-d int8 per text) +/// context : `[[String]] -> [[[Int8]]]` (per-document late chunking; +/// per-chunk 1024-d int8) +/// +/// Each call also exposes the `binary` (+1/-1 Float) and `ubinary` (packed +/// UInt8[dim/8]) variants. +/// +/// Output-format design decision. The underlying mlpackage emits native int8 +/// already (the `int8` output IS the deliverable; it's readable in Swift via +/// the dtype-agnostic NSNumber subscript even though the Python CoreML bridge +/// can't read int8 on macOS 26). We derive the other two formats directly from +/// the int8 vector: +/// +/// binary[i] = int8[i] >= 0 ? +1 : -1 +/// ubinary = packbits(int8[i] >= 0) +/// +/// This is bit-exact with the reference `st_quantize` everywhere except the +/// measure-zero x≈0 case: the reference branches on the raw pre-tanh value +/// `x >= 0`, whereas we branch on the rounded int8. 
Since `round(tanh(x)*127)` +/// is 0 only in a tiny neighbourhood of x=0 and is otherwise sign-faithful, +/// the int8-derived sign agrees with the raw sign except when |x| is so small +/// it rounds to int8 0 — there we map 0 to the `>= 0` (positive) branch to +/// match the reference's tie direction. For strictly bit-exact binary/ubinary +/// against a `pooled_fp16`-output model, build with +/// `--output-mode pooled_fp16` and apply all three quantizers in Swift; we ship +/// the int8-derived path because it needs only one model and one forward pass. +/// +/// I/O contract of the underlying mlpackages (from build_pplx_embed_bundle.py): +/// plain: +/// input_ids (1, L) int32 +/// attention_mask (1, L) fp16 (1.0 valid, 0.0 pad) +/// → embedding (1, 1024) int8 +/// context: +/// input_ids (1, L) int32 +/// attention_mask (1, L) fp16 +/// pool_matrix (32, L) fp16 (row k = 1/n_k over chunk k's span) +/// → embedding (32, 1024) int8 (only first n_chunks rows are valid) +public final class PplxEmbed { + + /// The three published pplx-embed output formats. + public enum Format: String, Sendable { + case int8 + case binary + case ubinary + } + + /// Per-bundle config parsed from model_config.json. + public struct BucketConfig: Sendable { + public let maxSeqLen: Int // for dynamic, the RangeDim upper bound + public let embedDim: Int + public let variant: String // "plain" | "context" + public let dynamic: Bool // flexible RangeDim model (GPU; the >max-bucket catch-all) + public let url: URL + } + + public static let embedDim = 1024 + public static let nMaxChunks = 32 + + private let tokenizer: Tokenizer + private let sepTokenId: Int + private let computeUnits: MLComputeUnits + + /// Fixed ANE buckets, sorted ascending by maxSeqLen. + private let buckets: [BucketConfig] + /// Optional flexible RangeDim catch-all for inputs larger than the biggest + /// fixed bucket. Runs on the GPU (flexible shapes force CPU fallback on ANE), + /// non-padded (actual length). 
nil if no dynamic bundle was provided. + private let dynamicBucket: BucketConfig? + private let variant: String + + /// Lazily compiled+loaded fixed-bucket models, keyed by bucket maxSeqLen. + private var loaded: [Int: MLModel] = [:] + /// Lazily loaded dynamic model (GPU). + private var dynamicModel: MLModel? + private let lock = NSLock() + + private init(tokenizer: Tokenizer, sepTokenId: Int, buckets: [BucketConfig], + dynamicBucket: BucketConfig?, variant: String, computeUnits: MLComputeUnits) { + self.tokenizer = tokenizer + self.sepTokenId = sepTokenId + self.buckets = buckets + self.dynamicBucket = dynamicBucket + self.variant = variant + self.computeUnits = computeUnits + } + + // MARK: - Loading + + /// Load a pplx-embed bundle. + /// + /// `bundleDir` may be either: + /// * a directory of bucket subdirectories (e.g. `output/pplx-embed/` + /// containing `L512-int8/`, `L1024-int8/`, …) — all int8 buckets are + /// discovered and used for token-length-based bucket selection, or + /// * a single bucket directory directly containing `encoder.mlpackage` + /// (e.g. `output/pplx-embed-context/L512-int8/`). + /// + /// Models are compiled + loaded lazily on first use per bucket; the + /// tokenizer is loaded eagerly from the first bucket's `hf_model/`. + public static func load( + bundleDir: URL, + computeUnits: MLComputeUnits = .cpuAndNeuralEngine + ) async throws -> PplxEmbed { + let fm = FileManager.default + var buckets: [BucketConfig] = [] + + // A single bucket dir directly contains encoder.mlpackage / .mlmodelc. + let isSingle = fm.fileExists(atPath: bundleDir.appendingPathComponent("encoder.mlpackage").path) + || fm.fileExists(atPath: bundleDir.appendingPathComponent("encoder.mlmodelc").path) + + if isSingle { + if let c = parseBucket(at: bundleDir) { buckets.append(c) } + } else { + let entries = (try? fm.contentsOfDirectory(at: bundleDir, + includingPropertiesForKeys: nil)) ?? 
[] + for e in entries.sorted(by: { $0.lastPathComponent < $1.lastPathComponent }) { + guard (try? e.resourceValues(forKeys: [.isDirectoryKey]))?.isDirectory == true + else { continue } + if let c = parseBucket(at: e) { buckets.append(c) } + } + } + + guard !buckets.isEmpty else { + throw CoreMLLLMError.modelNotFound( + "no pplx-embed bucket with encoder.mlpackage/.mlmodelc under \(bundleDir.path)") + } + + // Dominant variant from the fixed buckets (a dynamic-only bundle is plain). + let variant = (buckets.first { !$0.dynamic } ?? buckets.first!).variant + let matching = buckets.filter { $0.variant == variant } + // Fixed ANE buckets (sorted ascending) + at most one dynamic GPU catch-all. + let fixed = matching.filter { !$0.dynamic }.sorted { $0.maxSeqLen < $1.maxSeqLen } + let dynamic = matching.first { $0.dynamic } + + let hfDir = (fixed.first ?? dynamic!).url.appendingPathComponent("hf_model") + let tokenizer = try await AutoTokenizer.from(modelFolder: hfDir) + let sepId = sepTokenId(fromHFDir: hfDir) ?? 151643 + + return PplxEmbed(tokenizer: tokenizer, sepTokenId: sepId, buckets: fixed, + dynamicBucket: dynamic, variant: variant, computeUnits: computeUnits) + } + + /// Download selected buckets from a HuggingFace repo, then load them. + /// + /// Publishes-as-download path (companion to `conversion/upload_pplx_embed.py`): + /// the repo holds one subfolder per bucket plus a top-level `manifest.json` + /// inventory. This fetches the manifest, selects the requested fixed buckets + /// (+ the dynamic GPU catch-all, if present) for `variant`, and downloads **only + /// those subfolders' chosen-format files** via the HF Swift Hub client + /// (`HubClient.downloadSnapshot`, glob-filtered). 
Crucially, that client uses HF's + /// **content-addressed cache**: the encoder `weight.bin` is byte-identical across + /// every bucket, so it is fetched **once by etag** and reused for the rest — native + /// download dedup (the default 3-bucket pull moves ~1.15 GB, not ~3.5 GB). The + /// returned snapshot is then handed to the local `load(bundleDir:)` unchanged. + /// + /// - Parameters: + /// - repo: HF repo id, e.g. `"/pplx-embed-coreml"`. + /// - buckets: fixed bucket sizes (L) to fetch, e.g. `[512, 1024, 2048]`. + /// The dynamic catch-all (if the repo has one) is always included so long + /// inputs still work. + /// - into: ignored when `nil` (uses the shared HF cache, enabling cross-call and + /// cross-client dedup); pass a directory to download into it instead. + /// - variant: `"plain"` or `"context"`. + /// - preferCompiled: when the repo ships both formats, download the precompiled + /// `.mlmodelc` (no on-device compile) rather than the `.mlpackage`. Only the + /// chosen format's weights are fetched per bucket, never both. + @discardableResult + public static func load( + repo: String, + buckets: [Int] = [512, 1024, 2048], + into directory: URL? = nil, + computeUnits: MLComputeUnits = .cpuAndNeuralEngine, + variant: String = "plain", + preferCompiled: Bool = true, + hfToken: String? = nil, + onProgress: ((Double) -> Void)? = nil + ) async throws -> PplxEmbed { + let manifest = try await fetchManifest(repo: repo, hfToken: hfToken) + let want = Set(buckets) + + // Select this variant's buckets (requested sizes + any dynamic catch-all) and + // collect each one's exact chosen-format file paths. 
+ var matching: [String] = [] + var hasContextSubfolder = false + for b in manifest.buckets where b.variant == variant { + guard b.dynamic || want.contains(b.maxSeqLen) else { continue } + if b.subfolder.hasPrefix("context/") { hasContextSubfolder = true } + matching.append(contentsOf: b.selectFiles(preferCompiled: preferCompiled)) + } + guard !matching.isEmpty else { + throw CoreMLLLMError.modelNotFound( + "no \(variant) buckets in \(repo) manifest match \(buckets)") + } + + let client = makeHubClient(hfToken: hfToken) + let repoID = Repo.ID(stringLiteral: repo) + let snapshot: URL + if let directory { + snapshot = try await client.downloadSnapshot( + of: repoID, kind: .model, to: directory, matching: matching, + progressHandler: { p in onProgress?(p.fractionCompleted) }) + } else { + snapshot = try await client.downloadSnapshot( + of: repoID, kind: .model, matching: matching, + progressHandler: { p in onProgress?(p.fractionCompleted) }) + } + + // Context subfolders live under `/context/`; point load there so the + // bucket dirs (context/L512-int8/…) are discovered as top-level entries. + let loadDir = (variant == "context" && hasContextSubfolder) + ? snapshot.appendingPathComponent("context") : snapshot + return try await load(bundleDir: loadDir, computeUnits: computeUnits) + } + + /// Build a `HubClient` (env/anonymous token, or an explicit bearer token). + private static func makeHubClient(hfToken: String?) -> HubClient { + if let hfToken, !hfToken.isEmpty { + return HubClient(host: URL(string: "https://huggingface.co")!, bearerToken: hfToken) + } + return HubClient() + } + + // MARK: - Manifest + + /// One bucket entry parsed from the repo's `manifest.json`. + private struct ManifestBucket { + let subfolder: String + let variant: String + let dynamic: Bool + let maxSeqLen: Int + let formats: [String] // e.g. 
["mlmodelc", "mlpackage"] + let files: [String] // exact repo-relative paths (subfolder-prefixed) + + /// Exact file paths to fetch for the chosen format: shared files + /// (model_config.json, hf_model/…) + only the chosen format's encoder dir. We + /// pass these to `downloadSnapshot(matching:)` as exact patterns rather than + /// wildcards — `listFiles(recursive:)` also returns *directory* entries, and a + /// glob like `encoder.mlmodelc/*` would match (and 404 trying to GET) the + /// `analytics/`/`weights/` directories. + func selectFiles(preferCompiled: Bool) -> [String] { + let preferred = preferCompiled ? "mlmodelc" : "mlpackage" + let chosen = formats.contains(preferred) ? preferred : (formats.first ?? preferred) + let otherDir = "\(subfolder)/encoder.\(chosen == "mlmodelc" ? "mlpackage" : "mlmodelc")/" + return files.filter { !$0.hasPrefix(otherDir) } + } + } + private struct Manifest { let buckets: [ManifestBucket] } + + /// Fetch + parse `manifest.json` from a HF repo. + private static func fetchManifest(repo: String, hfToken: String?) async throws -> Manifest { + let urlStr = "https://huggingface.co/\(repo)/resolve/main/manifest.json" + var req = URLRequest(url: URL(string: urlStr)!) + if let hfToken { req.setValue("Bearer \(hfToken)", forHTTPHeaderField: "Authorization") } + let (data, resp) = try await URLSession.shared.data(for: req) + if let http = resp as? HTTPURLResponse, http.statusCode >= 400 { + throw Gemma3BundleDownloader.Error.httpStatus( + http.statusCode, url: urlStr, body: String(data: data, encoding: .utf8) ?? "") + } + guard let j = try? JSONSerialization.jsonObject(with: data) as? [String: Any], + let raw = j["buckets"] as? [[String: Any]] + else { throw CoreMLLLMError.modelNotFound("malformed manifest.json in \(repo)") } + + let buckets: [ManifestBucket] = raw.compactMap { e in + guard let subfolder = e["subfolder"] as? String else { return nil } + let dynamic = (e["dynamic"] as? Bool) ?? 
false + // Fixed buckets carry an integer "bucket"; dynamic uses dynamic_upper/max_seq_len. + let maxSeqLen = (e["bucket"] as? Int) + ?? (e["dynamic_upper"] as? Int) + ?? (e["max_seq_len"] as? Int) ?? 0 + let variant = (e["variant"] as? String) ?? "plain" + let formats = (e["formats"] as? [String]) ?? ["mlpackage"] + let fileObjs = (e["files"] as? [[String: Any]]) ?? [] + let files = fileObjs.compactMap { $0["path"] as? String } + return ManifestBucket(subfolder: subfolder, variant: variant, + dynamic: dynamic, maxSeqLen: maxSeqLen, + formats: formats, files: files) + } + return Manifest(buckets: buckets) + } + + /// Parse a single bucket directory's model_config.json. Only accepts + /// int8-output buckets (the deliverable format). + private static func parseBucket(at dir: URL) -> BucketConfig? { + let fm = FileManager.default + let hasModel = fm.fileExists(atPath: dir.appendingPathComponent("encoder.mlpackage").path) + || fm.fileExists(atPath: dir.appendingPathComponent("encoder.mlmodelc").path) + guard hasModel, + fm.fileExists(atPath: dir.appendingPathComponent("hf_model").path) + else { return nil } + + let cfgURL = dir.appendingPathComponent("model_config.json") + guard let data = try? Data(contentsOf: cfgURL), + let j = try? JSONSerialization.jsonObject(with: data) as? [String: Any] + else { return nil } + + let outputMode = j["output_mode"] as? String ?? "int8" + guard outputMode == "int8" else { return nil } // skip pooled_fp16 variants + // Ship the fp16-weight models only; skip experimental weight-quant bundles + // (they share output_mode "int8" but would duplicate a bucket size). + let weightQuant = j["quantization_weights"] as? String ?? "fp16" + guard weightQuant == "fp16" else { return nil } + + let dynamic = (j["dynamic"] as? Bool) ?? false + // Fixed bucket: integer "bucket". Dynamic: "bucket" is a string ("1..N"); + // use dynamic_upper as the effective max. + let maxSeqLen: Int + if dynamic { + maxSeqLen = (j["dynamic_upper"] as? Int) ?? 
(j["max_seq_len"] as? Int) ?? 8192 + } else { + maxSeqLen = (j["bucket"] as? Int) ?? (j["max_seq_len"] as? Int) ?? 512 + } + let embedDim = (j["hidden_size"] as? Int) ?? PplxEmbed.embedDim + let variant = (j["variant"] as? String) + ?? (dir.path.contains("context") ? "context" : "plain") + + return BucketConfig(maxSeqLen: maxSeqLen, embedDim: embedDim, + variant: variant, dynamic: dynamic, url: dir) + } + + private static func sepTokenId(fromHFDir hfDir: URL) -> Int? { + // <|endoftext|> id from added_tokens.json (pplx tokenizer: 151643). + let url = hfDir.appendingPathComponent("added_tokens.json") + guard let data = try? Data(contentsOf: url), + let j = try? JSONSerialization.jsonObject(with: data) as? [String: Any], + let id = j["<|endoftext|>"] as? Int + else { return nil } + return id + } + + private func model(forBucket L: Int) throws -> MLModel { + lock.lock(); defer { lock.unlock() } + if let m = loaded[L] { return m } + guard let cfg = buckets.first(where: { $0.maxSeqLen == L }) else { + throw CoreMLLLMError.modelNotFound("no loaded bucket for L=\(L)") + } + let mlConfig = MLModelConfiguration() + mlConfig.computeUnits = computeUnits + + let compiled = cfg.url.appendingPathComponent("encoder.mlmodelc") + let pkg = cfg.url.appendingPathComponent("encoder.mlpackage") + let modelURL: URL + if FileManager.default.fileExists(atPath: compiled.path) { + modelURL = compiled + } else { + modelURL = try compileSync(pkg) + } + let m = try MLModel(contentsOf: modelURL, configuration: mlConfig) + loaded[L] = m + return m + } + + /// Synchronous compile wrapper (MLModel.compileModel is async on newer SDKs + /// but the legacy throwing overload is sync). Use the sync overload to keep + /// the model accessor non-async. + private func compileSync(_ pkg: URL) throws -> URL { + try MLModel.compileModel(at: pkg) + } + + /// Load the flexible RangeDim catch-all model on the GPU (flexible shapes + /// force CPU fallback on the ANE, so this path is GPU-only). 
+ private func loadDynamicModel(_ cfg: BucketConfig) throws -> MLModel { + lock.lock(); defer { lock.unlock() } + if let m = dynamicModel { return m } + let mlConfig = MLModelConfiguration() + mlConfig.computeUnits = .cpuAndGPU + let compiled = cfg.url.appendingPathComponent("encoder.mlmodelc") + let pkg = cfg.url.appendingPathComponent("encoder.mlpackage") + let url = FileManager.default.fileExists(atPath: compiled.path) + ? compiled : try compileSync(pkg) + let m = try MLModel(contentsOf: url, configuration: mlConfig) + dynamicModel = m + return m + } + + /// Pick the smallest bucket whose maxSeqLen >= n; if none, the largest. + private func bucket(forTokens n: Int) -> BucketConfig { + for b in buckets where b.maxSeqLen >= n { return b } + return buckets.last! + } + + // MARK: - Plain API + + /// Encode texts into 1024-d int8 embeddings (one row per text). + public func embed(_ texts: [String]) throws -> [[Int8]] { + try texts.map { try embedOne($0) } + } + + /// Encode texts and return the requested format. + /// - int8: `[[Int8]]` (1024-d) + /// - binary: `[[Float]]` (1024-d, +1/-1) + /// - ubinary: `[[UInt8]]` (128 packed bytes) + public func embedInt8(_ texts: [String]) throws -> [[Int8]] { + try embed(texts) + } + + public func embedBinary(_ texts: [String]) throws -> [[Float]] { + try embed(texts).map { PplxEmbed.binary(fromInt8: $0) } + } + + public func embedUBinary(_ texts: [String]) throws -> [[UInt8]] { + try embed(texts).map { PplxEmbed.ubinary(fromInt8: $0) } + } + + private func embedOne(_ text: String) throws -> [Int8] { + var ids = tokenizer.encode(text: text) + let largestFixed = buckets.last?.maxSeqLen ?? 0 + + // Catch-all: inputs larger than the biggest fixed bucket go to the flexible + // GPU model, non-padded (actual length, capped at the RangeDim upper bound). 
+ if let dyn = dynamicBucket, ids.count > largestFixed { + let L = min(ids.count, dyn.maxSeqLen) + if ids.count > L { ids = Array(ids.prefix(L)) } + let n = ids.count + let out = try loadDynamicModel(dyn).prediction(from: MLDictionaryFeatureProvider(dictionary: [ + "input_ids": try makeInputIds(ids, L: n), + "attention_mask": try makeAttentionMask(n: n, L: n), + ])) + return try readPlainRow(out) + } + + // Fast path: smallest fixed ANE bucket that fits, padded to the bucket. + let bucket = bucket(forTokens: ids.count) + let L = bucket.maxSeqLen + if ids.count > L { ids = Array(ids.prefix(L)) } + let n = ids.count + let out = try model(forBucket: L).prediction(from: MLDictionaryFeatureProvider(dictionary: [ + "input_ids": try makeInputIds(ids, L: L), + "attention_mask": try makeAttentionMask(n: n, L: L), + ])) + return try readPlainRow(out) + } + + /// Read a (1, 1024) int8 "embedding" output into [Int8]. + private func readPlainRow(_ out: MLFeatureProvider) throws -> [Int8] { + guard let arr = out.featureValue(for: "embedding")?.multiArrayValue else { + throw CoreMLLLMError.predictionFailed + } + let d = min(PplxEmbed.embedDim, arr.count) + var vec = [Int8](repeating: 0, count: d) + for i in 0.. [[[Int8]]] { + try documents.map { try embedContextOne($0) } + } + + public func embedContextBinary(_ documents: [[String]]) throws -> [[[Float]]] { + try embedContext(documents).map { doc in doc.map { PplxEmbed.binary(fromInt8: $0) } } + } + + public func embedContextUBinary(_ documents: [[String]]) throws -> [[[UInt8]]] { + try embedContext(documents).map { doc in doc.map { PplxEmbed.ubinary(fromInt8: $0) } } + } + + private func embedContextOne(_ chunks: [String]) throws -> [[Int8]] { + precondition(variant == "context", + "embedContext requires a context bundle (variant=context)") + guard !chunks.isEmpty else { return [] } + + // Join chunks with the sep token, then tokenize the whole window once. 
+ // The tokenizer adds the literal <|endoftext|> between chunks; we locate + // its ids among the valid tokens to recover chunk spans. + let sep = "<|endoftext|>" + let joined = chunks.joined(separator: sep) + var ids = tokenizer.encode(text: joined) + + let bucket = bucket(forTokens: ids.count) + let L = bucket.maxSeqLen + if ids.count > L { ids = Array(ids.prefix(L)) } + let n = ids.count + + // Recover chunk spans: [start, sep) (SEP excluded), next start = sep+1, + // final chunk runs to n. + var spans: [(Int, Int)] = [] + var start = 0 + for i in 0.. PplxEmbed.nMaxChunks { + spans = Array(spans.prefix(PplxEmbed.nMaxChunks)) + } + let nChunks = spans.count + + let inputIds = try makeInputIds(ids, L: L) + let attn = try makeAttentionMask(n: n, L: L) + let pool = try makePoolMatrix(spans: spans, L: L) + + let model = try model(forBucket: L) + let out = try model.prediction(from: MLDictionaryFeatureProvider(dictionary: [ + "input_ids": inputIds, + "attention_mask": attn, + "pool_matrix": pool, + ])) + guard let arr = out.featureValue(for: "embedding")?.multiArrayValue else { + throw CoreMLLLMError.predictionFailed + } + // (32, 1024) int8 — read only the first nChunks rows (rest are all-zero). + let D = PplxEmbed.embedDim + var result: [[Int8]] = [] + result.reserveCapacity(nChunks) + for c in 0.. MLMultiArray { + let arr = try MLMultiArray(shape: [1, NSNumber(value: L)], dataType: .int32) + let p = arr.dataPointer.bindMemory(to: Int32.self, capacity: L) + for i in 0.. MLMultiArray { + let arr = try MLMultiArray(shape: [1, NSNumber(value: L)], dataType: .float16) + let p = arr.dataPointer.bindMemory(to: UInt16.self, capacity: L) + let one: UInt16 = 0x3C00 // 1.0 in fp16 + for i in 0.. 
MLMultiArray { + let rows = PplxEmbed.nMaxChunks + let arr = try MLMultiArray(shape: [NSNumber(value: rows), NSNumber(value: L)], + dataType: .float16) + let p = arr.dataPointer.bindMemory(to: UInt16.self, capacity: rows * L) + for i in 0..<(rows * L) { p[i] = 0 } + for (k, span) in spans.enumerated() where k < rows { + let (s, e) = span + let count = e - s + guard count > 0 else { continue } + let w = float16Bits(Float(1.0) / Float(count)) + let base = k * L + for col in s..= 0 ? +1 : -1 (matches reference x>=0 branch). + public static func binary(fromInt8 v: [Int8]) -> [Float] { + v.map { $0 >= 0 ? Float(1) : Float(-1) } + } + + /// ubinary = packbits(int8[i] >= 0), MSB-first per byte (numpy packbits). + public static func ubinary(fromInt8 v: [Int8]) -> [UInt8] { + let nBytes = (v.count + 7) / 8 + var out = [UInt8](repeating: 0, count: nBytes) + for i in 0..= 0 { + out[i / 8] |= UInt8(1 << (7 - (i % 8))) + } + return out + } + + /// Float → IEEE-754 binary16 bit pattern (native Float16 round). + private func float16Bits(_ x: Float) -> UInt16 { + Float16(x).bitPattern + } +} diff --git a/Sources/pplx-embed-bench/main.swift b/Sources/pplx-embed-bench/main.swift new file mode 100644 index 0000000..0ae85f4 --- /dev/null +++ b/Sources/pplx-embed-bench/main.swift @@ -0,0 +1,127 @@ +// pplx-embed-bench — Swift fidelity + latency harness for the pplx-embed CoreML model. +// +// Native int8 model output is not readable from the Python CoreML bridge on macOS26, +// so this validates the int8 deliverable in Swift: it loads the model, runs the +// pre-tokenized fixtures from export_swift_fixtures.py, reads the int8 output via the +// dtype-agnostic NSNumber subscript, computes cosine vs the fp32-reference int8, and +// times warm latency. 
+// +// Usage: +// swift run -c release pplx-embed-bench \ +// --model \ +// --fixtures /tmp/pplx_fixtures.json \ +// --compute-units cpuAndNE --iters 20 + +import CoreML +import Foundation + +struct Fixture: Decodable { let text: String; let input_ids: [Int]; let n: Int; let ref_int8: [Int] } +struct Fixtures: Decodable { let L: Int; let hf_repo: String; let items: [Fixture] } + +func arg(_ name: String, _ def: String) -> String { + let a = CommandLine.arguments + if let i = a.firstIndex(of: name), i + 1 < a.count { return a[i + 1] } + return def +} + +func computeUnits(_ s: String) -> MLComputeUnits { + switch s { + case "all": return .all + case "cpuOnly": return .cpuOnly + case "cpuAndGPU": return .cpuAndGPU + default: return .cpuAndNeuralEngine + } +} + +func cosine(_ a: [Double], _ b: [Double]) -> Double { + var dot = 0.0, na = 0.0, nb = 0.0 + for i in 0.. MLDictionaryFeatureProvider { + let ids = try MLMultiArray(shape: [1, NSNumber(value: L)], dataType: .int32) + let attn = try MLMultiArray(shape: [1, NSNumber(value: L)], dataType: .float16) + for i in 0.. ([Double], String)? { + guard let arr = out.featureValue(for: "embedding")?.multiArrayValue else { return nil } + let d = arr.count + var v = [Double](repeating: 0, count: d) + for i in 0..= gate) ? "PASS vs 0.997 gate" : "FAIL vs 0.997 gate") diff --git a/Sources/pplx-embed-demo/main.swift b/Sources/pplx-embed-demo/main.swift new file mode 100644 index 0000000..baafe93 --- /dev/null +++ b/Sources/pplx-embed-demo/main.swift @@ -0,0 +1,146 @@ +// pplx-embed-demo — minimal CLI around the PplxEmbed runtime. 
+// +// Plain: +// swift run -c release pplx-embed-demo \ +// --bundle-dir output/pplx-embed \ +// --text "hello world" --text "bonjour le monde" --format int8 +// +// Context (late chunking) — each --text is one chunk of a single document; +// use ';;' inside a --text to split a document into multiple chunks: +// swift run -c release pplx-embed-demo \ +// --bundle-dir output/pplx-embed-context/L512-int8 \ +// --context --text "first chunk;;second chunk" + +import CoreML +import CoreMLLLM +import Foundation + +func args(_ name: String) -> [String] { + let a = CommandLine.arguments + var out: [String] = [] + var i = 0 + while i < a.count { + if a[i] == name, i + 1 < a.count { out.append(a[i + 1]); i += 2 } else { i += 1 } + } + return out +} +func arg(_ name: String, _ def: String) -> String { args(name).first ?? def } +func flag(_ name: String) -> Bool { CommandLine.arguments.contains(name) } + +func computeUnits(_ s: String) -> MLComputeUnits { + switch s { + case "all": return .all + case "cpuOnly": return .cpuOnly + case "cpuAndGPU": return .cpuAndGPU + default: return .cpuAndNeuralEngine + } +} + +func l2norm(_ v: [Int8]) -> Double { + var s = 0.0 + for x in v { s += Double(x) * Double(x) } + return (s).squareRoot() +} + +func summarize(_ v: [Int8]) -> String { + let head = v.prefix(8).map { String($0) }.joined(separator: ", ") + return "dim=\(v.count) first8=[\(head)] l2=\(String(format: "%.2f", l2norm(v)))" +} + +// Either --bundle-dir (local) or --repo (download from HuggingFace) is required. +let bundleDir = arg("--bundle-dir", "") +let repo = arg("--repo", "") +guard !bundleDir.isEmpty || !repo.isEmpty else { + FileHandle.standardError.write("--bundle-dir or --repo required\n".data(using: .utf8)!) + exit(2) +} +let texts = args("--text") +guard !texts.isEmpty else { + FileHandle.standardError.write("at least one --text required\n".data(using: .utf8)!) 
+ exit(2) +} +let isContext = flag("--context") +let format = PplxEmbed.Format(rawValue: arg("--format", "int8")) ?? .int8 +let cu = computeUnits(arg("--compute-units", "cpuAndNE")) +let asJSON = flag("--json") // emit raw int8 vectors as JSON (for parity checks) + +let embedder: PplxEmbed +if !repo.isEmpty { + // Download-then-run: pull only the requested buckets from HF (content-addressed + // cache → the shared weight.bin is fetched once), then load. + let buckets = args("--buckets").compactMap { Int($0) } + let cacheDir = args("--cache-dir").first.map { URL(fileURLWithPath: $0) } + let hfToken = args("--hf-token").first ?? ProcessInfo.processInfo.environment["HF_TOKEN"] + embedder = try await PplxEmbed.load( + repo: repo, + buckets: buckets.isEmpty ? [512, 1024, 2048] : buckets, + into: cacheDir, + computeUnits: cu, + variant: isContext ? "context" : "plain", + hfToken: hfToken, + onProgress: { frac in + FileHandle.standardError.write("\r[download] \(Int(frac * 100))% " + .data(using: .utf8)!) + }) + FileHandle.standardError.write("\n".data(using: .utf8)!) +} else { + embedder = try await PplxEmbed.load( + bundleDir: URL(fileURLWithPath: bundleDir), computeUnits: cu) +} + +// JSON mode: dump int8 vectors only (plain: [[Int8]]; context: [[[Int8]]]). +if asJSON { + let enc = JSONEncoder() + if isContext { + let docs = texts.map { $0.components(separatedBy: ";;") } + let int8 = try embedder.embedContext(docs) + let data = try enc.encode(int8.map { $0.map { $0.map(Int.init) } }) + FileHandle.standardOutput.write(data) + } else { + let int8 = try embedder.embed(texts) + let data = try enc.encode(int8.map { $0.map(Int.init) }) + FileHandle.standardOutput.write(data) + } + FileHandle.standardOutput.write("\n".data(using: .utf8)!) + exit(0) +} + +if isContext { + // Each --text is a document; ';;' splits it into chunks. 
+ let docs = texts.map { $0.components(separatedBy: ";;") } + let int8 = try embedder.embedContext(docs) + for (d, doc) in int8.enumerated() { + print("doc[\(d)]: \(doc.count) chunk(s)") + for (c, row) in doc.enumerated() { + switch format { + case .int8: + print(" chunk[\(c)]: \(summarize(row))") + case .binary: + let b = PplxEmbed.binary(fromInt8: row) + let head = b.prefix(8).map { String(Int($0)) }.joined(separator: ", ") + print(" chunk[\(c)]: binary dim=\(b.count) first8=[\(head)]") + case .ubinary: + let u = PplxEmbed.ubinary(fromInt8: row) + let head = u.prefix(8).map { String($0) }.joined(separator: ", ") + print(" chunk[\(c)]: ubinary bytes=\(u.count) first8=[\(head)]") + } + } + } +} else { + let int8 = try embedder.embed(texts) + for (i, row) in int8.enumerated() { + let label = String(texts[i].prefix(40)) + switch format { + case .int8: + print("[\(i)] \"\(label)\" → \(summarize(row))") + case .binary: + let b = PplxEmbed.binary(fromInt8: row) + let head = b.prefix(8).map { String(Int($0)) }.joined(separator: ", ") + print("[\(i)] \"\(label)\" → binary dim=\(b.count) first8=[\(head)]") + case .ubinary: + let u = PplxEmbed.ubinary(fromInt8: row) + let head = u.prefix(8).map { String($0) }.joined(separator: ", ") + print("[\(i)] \"\(label)\" → ubinary bytes=\(u.count) first8=[\(head)]") + } + } +} diff --git a/conversion/build_pplx_embed_bundle.py b/conversion/build_pplx_embed_bundle.py new file mode 100644 index 0000000..dbe1a10 --- /dev/null +++ b/conversion/build_pplx_embed_bundle.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 +"""Build a CoreML bundle for pplx-embed (bidirectional Qwen3 encoder). + +Stateless single-forward export — no KV cache, no causal mask. Fixed-length +(`--max-seq-len`, the bucket) input + pad mask. Variable length is handled by +padding to the nearest bucket at runtime (fixed shapes keep it on the ANE; +RangeDim/EnumeratedShapes force CPU fallback). 
+ +Output modes: + pooled_fp16 masked-mean → (1, 1024) fp16 — readable from the Python bridge; + quantize to int8 downstream. Use for fidelity measurement. + int8 masked-mean → tanh → int8 — the deliverable (native int8; + read via the Swift harness on macOS26). + +Usage: + python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 4096 + python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 128 \ + --output-mode pooled_fp16 +""" +from __future__ import annotations + +import argparse +import json +import os +import shutil +import sys + +import numpy as np +import torch + +ROOT = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, ROOT) + +from models.qwen3_encoder import ( # noqa: E402 + Qwen3EncoderConfig, + PplxEmbedModel, + PplxEmbedContextModel, + N_MAX_CHUNKS, + load_encoder_weights, + apply_fp16_residual_rescale, +) + +# Residual rescale factor (see qwen3_encoder.apply_fp16_residual_rescale). +# K=8 is the fidelity↔overflow sweet spot: validated overflow-safe (peak |h| ~37k at +# 455 real tokens, 1.75× under fp16 max) and markedly better than K=16 on short chunks +# (context mean 0.9987 vs 0.9911). K=4 overflows. Bump toward 16 if long inputs NaN. +DEFAULT_RESCALE_K = 8.0 + + +def _snapshot_dir(hf_repo: str) -> str: + from huggingface_hub import snapshot_download + return snapshot_download( + hf_repo, + allow_patterns=["*.json", "*.safetensors", "tokenizer*", "*.txt", "*.py", "1_Pooling/*"], + ) + + +def build_bundle( + hf_repo: str, + model_name: str, + output_dir: str, + max_seq_len: int = 4096, + output_mode: str = "int8", + rescale_k: float = DEFAULT_RESCALE_K, + quantize: str | None = None, + variant: str = "plain", + dynamic_upper: int = 0, + skip_if_exists: bool = True, + norm_impl: str = "native", +) -> str: + """Build a CoreML bundle. 
+ + dynamic_upper > 0 → a **flexible RangeDim** model (seq 1..dynamic_upper) targeting the + GPU — the non-padded, unbounded-length catch-all for inputs larger than the biggest fixed + ANE bucket. (Flexible shapes force CPU fallback on ANE and are ~10× slower than fixed + buckets, so this is GPU-only and reserved for >max-bucket inputs.) Otherwise a fixed-shape + bucket (the fast ANE path). + """ + import coremltools as ct + + os.makedirs(output_dir, exist_ok=True) + pkg = os.path.join(output_dir, "encoder.mlpackage") + if skip_if_exists and os.path.exists(pkg): + print(f" [skip] {pkg} exists") + return pkg + + dynamic = dynamic_upper > 0 + print(f"[1/4] Loading {hf_repo} (config + weights; variant={variant}" + + (f", dynamic RangeDim 1..{dynamic_upper} GPU" if dynamic else "") + ")") + snap = hf_repo if os.path.isdir(hf_repo) else _snapshot_dir(hf_repo) + # The bucket (input shape). The RoPE table is built once to a fixed length + # (max_position_embeddings) inside Qwen3Encoder._build_rope and gathered to S at + # runtime, so it no longer tracks the bucket — that keeps weight.bin byte-identical + # across buckets (HF LFS stores one blob). For the dynamic RangeDim model max_seq_len + # is informational only (the input is RangeDim 1..dynamic_upper). 
+ bucket_len = dynamic_upper if dynamic else max_seq_len + cfg = Qwen3EncoderConfig.from_json(os.path.join(snap, "config.json"), + max_seq_len=bucket_len, norm_impl=norm_impl) + if variant == "context": + model = PplxEmbedContextModel(cfg, output_mode=output_mode).eval() + else: + model = PplxEmbedModel(cfg, output_mode=output_mode).eval() + load_encoder_weights(model.encoder, snap) + if rescale_k: + print(f"[1.5/4] fp16 residual rescale 1/K (K={rescale_k})") + apply_fp16_residual_rescale(model.encoder, rescale_k) + + if dynamic and variant == "context": + raise ValueError("dynamic (RangeDim) mode supports only the plain variant") + + trace_len = min(512, dynamic_upper) if dynamic else max_seq_len + print(f"[2/4] Tracing (trace_len={trace_len}, mode={output_mode}, variant={variant}" + + (f", RangeDim 1..{dynamic_upper}" if dynamic else "") + ")") + sample_ids = torch.zeros((1, trace_len), dtype=torch.int32) + sample_mask = torch.ones((1, trace_len), dtype=torch.float16) + if dynamic: + seqdim = ct.RangeDim(lower_bound=1, upper_bound=dynamic_upper, default=trace_len) + inputs = [ + ct.TensorType(name="input_ids", shape=(1, seqdim), dtype=np.int32), + ct.TensorType(name="attention_mask", shape=(1, seqdim), dtype=np.float16), + ] + trace_args = (sample_ids, sample_mask) + else: + inputs = [ + ct.TensorType(name="input_ids", shape=(1, trace_len), dtype=np.int32), + ct.TensorType(name="attention_mask", shape=(1, trace_len), dtype=np.float16), + ] + if variant == "context": + sample_pool = torch.zeros((N_MAX_CHUNKS, trace_len), dtype=torch.float16) + sample_pool[0, :] = 1.0 / trace_len + trace_args = (sample_ids, sample_mask, sample_pool) + inputs.append(ct.TensorType(name="pool_matrix", shape=(N_MAX_CHUNKS, trace_len), dtype=np.float16)) + else: + trace_args = (sample_ids, sample_mask) + with torch.no_grad(): + traced = torch.jit.trace(model, trace_args) + + out_dtype = np.int8 if output_mode == "int8" else np.float16 + # Flexible shapes can't go on ANE (CPU fallback) → 
GPU; fixed buckets → ANE (ALL picks it). + compute_units = ct.ComputeUnit.CPU_AND_GPU if dynamic else ct.ComputeUnit.ALL + print(f"[3/4] Converting to CoreML (fp16, macOS26; out={out_dtype.__name__}; " + f"units={'CPU_AND_GPU' if dynamic else 'ALL'})") + mlmodel = ct.convert( + traced, + inputs=inputs, + outputs=[ct.TensorType(name="embedding", dtype=out_dtype)], + minimum_deployment_target=ct.target.macOS26, + compute_units=compute_units, + ) + + if quantize == "int4": + op = ct.optimize.coreml.OpPalettizerConfig(nbits=4, granularity="per_grouped_channel", group_size=32) + mlmodel = ct.optimize.coreml.palettize_weights( + mlmodel, ct.optimize.coreml.OptimizationConfig(global_config=op)) + print(" applied int4 palettization (group_size=32)") + elif quantize == "int8": + op = ct.optimize.coreml.OpLinearQuantizerConfig(mode="linear_symmetric", dtype="int8") + mlmodel = ct.optimize.coreml.linear_quantize_weights( + mlmodel, ct.optimize.coreml.OptimizationConfig(global_config=op)) + print(" applied int8 weight quantization") + + if os.path.exists(pkg): + shutil.rmtree(pkg) + mlmodel.save(pkg) + size_mb = sum(os.path.getsize(os.path.join(dp, f)) + for dp, _, fns in os.walk(pkg) for f in fns) / 1024 / 1024 + print(f" saved {pkg} ({size_mb:.1f} MB)") + + _write_model_config(output_dir, model_name, hf_repo, cfg, max_seq_len, + output_mode, rescale_k, quantize, variant, dynamic_upper, norm_impl) + _copy_tokenizer(snap, output_dir) + print(f"[4/4] bundle ready at {output_dir}") + return pkg + + +def _write_model_config(output_dir, model_name, hf_repo, cfg, max_seq_len, + output_mode, rescale_k, quantize, variant="plain", dynamic_upper=0, + norm_impl="native"): + dynamic = dynamic_upper > 0 + out_dtype = "int8" if output_mode == "int8" else "fp16" + out_shape = [N_MAX_CHUNKS, 1024] if variant == "context" else [1, 1024] + seq_shape = [1, f"1..{dynamic_upper}"] if dynamic else [1, max_seq_len] + inputs = { + "input_ids": {"shape": seq_shape, "dtype": "int32"}, + 
"attention_mask": {"shape": seq_shape, "dtype": "fp16", + "doc": "1.0 for valid tokens, 0.0 for pad"}, + } + if variant == "context": + inputs["pool_matrix"] = {"shape": [N_MAX_CHUNKS, max_seq_len], "dtype": "fp16", + "doc": "row k = 1/n_k over chunk k's span, else 0; unused rows all-zero"} + cfgd = { + "model_name": model_name, + "architecture": "qwen3-encoder", + "variant": variant, + "tokenizer_repo": hf_repo, + "parts": {"encoder": "encoder.mlpackage"}, + "io_contract": { + "inputs": inputs, + "outputs": { + "embedding": {"shape": out_shape, "dtype": out_dtype, + "doc": ("per-chunk embeddings; read first N_actual rows (unused rows are 0)" + if variant == "context" else "plain mean-pooled embedding") + + "; int8 = clamp(round(tanh(x)*127),-128,127)"}, + }, + }, + "hidden_size": cfg.hidden_size, + "num_hidden_layers": cfg.num_hidden_layers, + "num_attention_heads": cfg.num_attention_heads, + "num_key_value_heads": cfg.num_key_value_heads, + "head_dim": cfg.head_dim, + "intermediate_size": cfg.intermediate_size, + "vocab_size": cfg.vocab_size, + "rope_theta": cfg.rope_theta, + "rms_norm_eps": cfg.rms_norm_eps, + "max_seq_len": max_seq_len, + "bucket": (f"1..{dynamic_upper}" if dynamic else max_seq_len), + "dynamic": dynamic, + "dynamic_upper": dynamic_upper if dynamic else 0, + "output_mode": output_mode, + "fp16_residual_rescale_k": rescale_k, + "norm_impl": norm_impl, + "pooling": "mean", + "quantization_weights": quantize or "fp16", + "matryoshka_dims": [1024, 512, 256, 128], + # Flexible RangeDim models force CPU fallback on ANE → run on GPU; fixed buckets on ANE. 
+ "compute_units": "CPU_AND_GPU" if dynamic else "CPU_AND_NE", + } + path = os.path.join(output_dir, "model_config.json") + with open(path, "w") as f: + json.dump(cfgd, f, indent=2) + print(f" wrote {path}") + + +def _copy_tokenizer(snap, output_dir): + dst = os.path.join(output_dir, "hf_model") + os.makedirs(dst, exist_ok=True) + for name in os.listdir(snap): + if name.startswith("tokenizer") or name in ( + "config.json", "special_tokens_map.json", "vocab.json", "merges.txt", + "added_tokens.json", + ): + shutil.copy2(os.path.join(snap, name), os.path.join(dst, name)) + print(f" copied tokenizer files → {dst}") + + +def main(): + from config import MODEL_REGISTRY + + ap = argparse.ArgumentParser(description="Build CoreML bundle for pplx-embed") + ap.add_argument("--model", default="pplx-embed", choices=list(MODEL_REGISTRY.keys())) + ap.add_argument("--max-seq-len", type=int, default=4096) + ap.add_argument("--output-mode", default="int8", choices=["int8", "pooled_fp16"]) + ap.add_argument("--rescale-k", type=float, default=DEFAULT_RESCALE_K) + ap.add_argument("--quantize", default="none", choices=["none", "int8", "int4"]) + ap.add_argument("--variant", default="auto", choices=["auto", "plain", "context"], + help="auto → context if the model name contains 'context', else plain") + ap.add_argument("--dynamic-upper", type=int, default=0, + help="If >0, build a flexible RangeDim (1..N) GPU model (the >max-bucket " + "catch-all), e.g. 8192. 
Plain only.") + ap.add_argument("--hf-dir", default=None, help="Override HF dir (skip download)") + ap.add_argument("--output", default=None) + ap.add_argument("--no-skip", action="store_true", help="Rebuild even if exists") + ap.add_argument("--norm-impl", default="native", choices=["ane_cat", "native"], + help="RMSNorm for the 5 encoder norm sites: native (Qwen3RMSNorm rsqrt, " + "default — 12-21%% faster on ANE per experiment_ane_rmsnorm.py) or " + "ane_cat (shared cat/chunk LayerNorm trick).") + args = ap.parse_args() + + reg = MODEL_REGISTRY[args.model] + hf_repo = args.hf_dir or reg.hf_repo + variant = ("context" if "context" in args.model else "plain") if args.variant == "auto" else args.variant + if args.dynamic_upper: + tag = f"dyn{args.dynamic_upper}-{args.output_mode}" + else: + tag = f"L{args.max_seq_len}-{args.output_mode}" + (f"-{args.quantize}" if args.quantize != "none" else "") + output = args.output or os.path.join(ROOT, "..", "output", args.model, tag) + quantize = None if args.quantize == "none" else args.quantize + build_bundle(hf_repo, args.model, output, args.max_seq_len, args.output_mode, + args.rescale_k, quantize, variant=variant, dynamic_upper=args.dynamic_upper, + skip_if_exists=not args.no_skip, norm_impl=args.norm_impl) + + +if __name__ == "__main__": + main() diff --git a/conversion/config.py b/conversion/config.py index 17beb27..501c8a2 100644 --- a/conversion/config.py +++ b/conversion/config.py @@ -68,6 +68,20 @@ class ConversionConfig: max_context_length=2048, description="EmbeddingGemma 300M - Gemma 3 bidirectional encoder, 768-d sentence embedding (Matryoshka)", ), + "pplx-embed": ConversionConfig( + hf_repo="perplexity-ai/pplx-embed-v1-0.6b", + architecture="qwen3-encoder", + default_context_length=4096, + max_context_length=32768, + description="Perplexity pplx-embed-v1 0.6B - bidirectional Qwen3 encoder, mean-pool, 1024-d int8 sentence embedding (plain). 
trust_remote_code; fixed-shape buckets.", + ), + "pplx-embed-context": ConversionConfig( + hf_repo="perplexity-ai/pplx-embed-context-v1-0.6b", + architecture="qwen3-encoder", + default_context_length=4096, + max_context_length=32768, + description="Perplexity pplx-embed-context-v1 0.6B - bidirectional Qwen3 encoder with late chunking (pool_matrix -> per-chunk 1024-d int8). trust_remote_code; fixed-shape buckets.", + ), "lfm2.5-350m": ConversionConfig( hf_repo="LiquidAI/LFM2.5-350M", architecture="lfm2", diff --git a/conversion/experiment_ane_rmsnorm.py b/conversion/experiment_ane_rmsnorm.py new file mode 100644 index 0000000..bb0d361 --- /dev/null +++ b/conversion/experiment_ane_rmsnorm.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +"""A/B: native `Qwen3RMSNorm` vs the shared `ANERMSNorm` cat/chunk trick on the ANE. + +Isolates the RMSNorm half of the incidental finding in docs/PPLX_EMBED_GPU_RESIDENCY.md: +a from-scratch "GPU-native" encoder rebuild ran the *ANE* path ~21% faster (28.6 vs +36.1 ms at L=256), but it confounded three changes (Conv2d-1×1→Linear, cat/chunk→native +RMSNorm, layout). This script changes ONLY the RMSNorm (`norm_impl` ∈ {ane_cat, native}, +see models/qwen3_encoder.py) and measures, for each L ∈ {256, 512}: + + * ANE residency — % of non-const MLProgram ops the static planner puts on the ANE + (reuses audit_ane_residency.py's MLComputePlan pattern). The gate: + native must STAY on the ANE (≈ the ane_cat residency, ~99%). + * latency — CPU_AND_NE warm median of MLModel.predict (the metric that matters; + the ANE path is the one pplx-embed ships). + * fidelity — cosine of the pooled_fp16 output vs the fp32 `Reference` oracle + (gate ≥ 0.99). Built with `--output-mode pooled_fp16` so the Python + CoreML bridge can read the output. + +Decision rule (printed at the end): native WINS if it is faster on CPU_AND_NE, keeps +residency ≥ ~99%, and holds cosine ≥ 0.99 at both L — then make it the pplx-embed +default. 
Otherwise keep ane_cat and record the negative result. + +Usage: + uv run python conversion/experiment_ane_rmsnorm.py + uv run python conversion/experiment_ane_rmsnorm.py --lengths 256 512 --iters 30 +""" +from __future__ import annotations + +import argparse +import os +import sys +import time +from collections import Counter + +import numpy as np + +ROOT = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, ROOT) + +NORM_IMPLS = ("ane_cat", "native") + +# A representative multilingual paragraph; repeated/truncated to fill each bucket so the +# fidelity check exercises a realistic (long, non-trivial) input at every L. +SAMPLE_TEXT = ( + "Embeddings map text into dense vectors so that semantically similar passages land " + "near each other. 東京は日本の首都であり、世界有数の大都市圏を形成しています。 " + "La inteligencia artificial avanza rápido y transforma la búsqueda de información. " + "Машинное обучение изменяет способы обработки естественного языка. " + "Retrieval-augmented generation grounds large language models in external knowledge." +) + + +def _build(hf_repo: str, norm_impl: str, L: int, out_root: str) -> str: + """Build a pooled_fp16 bucket for (norm_impl, L); returns the .mlpackage path.""" + from build_pplx_embed_bundle import build_bundle + + out_dir = os.path.join(out_root, f"{norm_impl}-L{L}") + return build_bundle( + hf_repo, "pplx-embed", out_dir, max_seq_len=L, output_mode="pooled_fp16", + quantize=None, variant="plain", dynamic_upper=0, skip_if_exists=True, + norm_impl=norm_impl, + ) + + +def _compile(pkg: str) -> str: + """Compile an .mlpackage → .mlmodelc via `xcrun coremlcompiler` (skip if present). + + MLComputePlan.load_from_path wants a *compiled* model; pointed at a raw .mlpackage + it hard-aborts (uncatchable C++ exception). Compile once, reuse. 
+ """ + import subprocess + + out_dir = os.path.dirname(pkg) + mlmodelc = os.path.join(out_dir, "encoder.mlmodelc") + if not os.path.isdir(mlmodelc): + subprocess.run(["xcrun", "coremlcompiler", "compile", pkg, out_dir], + check=True, capture_output=True) + return mlmodelc + + +def _ane_residency(mlmodelc: str) -> tuple[float, int, Counter]: + """Static op→device tally via MLComputePlan. Returns (ANE %, total ops, by-device).""" + import coremltools as ct + from coremltools.models.compute_plan import MLComputePlan + from audit_ane_residency import _iter_mlprogram_ops, _device_label + + plan = MLComputePlan.load_from_path(path=mlmodelc, compute_units=ct.ComputeUnit.CPU_AND_NE) + by_device: Counter = Counter() + total = 0 + for _func, op in _iter_mlprogram_ops(plan.model_structure): + if op.operator_name == "const": + continue + try: + usage = plan.get_compute_device_usage_for_mlprogram_operation(op) + except Exception: + usage = None + by_device[_device_label(usage)] += 1 + total += 1 + ane_pct = 100.0 * by_device.get("ANE", 0) / total if total else 0.0 + return ane_pct, total, by_device + + +def _make_inputs(tokenizer, L: int): + """Tokenize SAMPLE_TEXT (repeated to ~fill L), pad to L. Returns (inputs, n_valid).""" + text = SAMPLE_TEXT + # Repeat until the tokenized length comfortably exceeds L, then truncate to L. 
+ while len(tokenizer.encode(text)) < L: + text = text + " " + SAMPLE_TEXT + enc = tokenizer([text], return_tensors="np", truncation=True, max_length=L) + ids = enc["input_ids"][0].astype(np.int32) + n = int(ids.shape[0]) + pid = np.zeros((1, L), dtype=np.int32) + pid[0, :n] = ids + pam = np.zeros((1, L), dtype=np.float16) + pam[0, :n] = 1.0 + return {"input_ids": pid, "attention_mask": pam}, n + + +def _latency_and_output(pkg: str, inputs: dict, iters: int, warmup: int): + """CPU_AND_NE warm median latency (ms) + the pooled fp16 output of the last run.""" + import coremltools as ct + + m = ct.models.MLModel(pkg, compute_units=ct.ComputeUnit.CPU_AND_NE) + out = None + for _ in range(warmup): + out = m.predict(inputs) + times = [] + for _ in range(iters): + t = time.time() + out = m.predict(inputs) + times.append((time.time() - t) * 1000.0) + emb = np.asarray(out["embedding"]).astype(np.float32).reshape(1, -1) + return float(np.median(times)), float(np.mean(times)), emb + + +def main() -> int: + ap = argparse.ArgumentParser(description="ANE RMSNorm A/B (native vs cat/chunk)") + ap.add_argument("--hf-repo", default=None, help="Override HF repo / local dir") + ap.add_argument("--lengths", type=int, nargs="+", default=[256, 512]) + ap.add_argument("--iters", type=int, default=30) + ap.add_argument("--warmup", type=int, default=5) + ap.add_argument("--out-root", default=os.path.join(ROOT, "..", "output", + "pplx-embed-rmsnorm-ab")) + ap.add_argument("--fidelity-gate", type=float, default=0.99) + args = ap.parse_args() + + from config import MODEL_REGISTRY + hf_repo = args.hf_repo or MODEL_REGISTRY["pplx-embed"].hf_repo + + import pplx_embed_reference as R + print(f"[ref] loading fp32 oracle {hf_repo} …") + ref = R.Reference(hf_repo) + tok = ref.tokenizer + + # results[(impl, L)] = dict(ane_pct, total, latency_med, latency_mean, cosine, n) + results: dict[tuple[str, int], dict] = {} + for L in args.lengths: + inputs, n = _make_inputs(tok, L) + # fp32 reference pooled 
(pre-tanh) for this exact (truncated) input. + import torch + ids_t = torch.from_numpy(inputs["input_ids"]).to(torch.long)[:, :n] + mask_t = torch.ones((1, n), dtype=torch.float32) + with torch.inference_mode(): + hidden = ref.model(input_ids=ids_t, attention_mask=mask_t).last_hidden_state.float() + ref_pooled = R.masked_mean(hidden, mask_t).numpy().astype(np.float32) + + for impl in NORM_IMPLS: + print(f"\n=== norm_impl={impl} L={L} (n_valid={n}) ===") + pkg = _build(hf_repo, impl, L, args.out_root) + mlmodelc = _compile(pkg) + ane_pct, total, by_dev = _ane_residency(mlmodelc) + print(f" residency: ANE {ane_pct:.2f}% ({total} ops; " + f"{dict(by_dev)})") + med, mean, emb = _latency_and_output(pkg, inputs, args.iters, args.warmup) + cos = float(R.cosine_similarity(emb, ref_pooled)[0]) + print(f" CPU_AND_NE latency: median {med:.2f} ms mean {mean:.2f} ms") + print(f" fidelity cosine vs fp32: {cos:.6f} (gate ≥ {args.fidelity_gate})") + results[(impl, L)] = dict(ane_pct=ane_pct, total=total, latency_med=med, + latency_mean=mean, cosine=cos, n=n) + + # ---- summary + decision ------------------------------------------------- + print("\n" + "=" * 72) + print("SUMMARY (norm_impl × L)") + print("=" * 72) + print(f" {'impl':<8} {'L':>5} {'ANE%':>7} {'lat_med(ms)':>12} {'cosine':>9}") + for L in args.lengths: + for impl in NORM_IMPLS: + r = results[(impl, L)] + print(f" {impl:<8} {L:>5} {r['ane_pct']:>7.2f} {r['latency_med']:>12.2f} " + f"{r['cosine']:>9.5f}") + + print("\nDECISION") + native_wins_all = True + for L in args.lengths: + a = results[("ane_cat", L)] + nv = results[("native", L)] + speedup = (a["latency_med"] / nv["latency_med"] - 1.0) * 100.0 + faster = nv["latency_med"] < a["latency_med"] + resident = nv["ane_pct"] >= 0.99 * a["ane_pct"] and nv["ane_pct"] >= 99.0 + fid_ok = nv["cosine"] >= args.fidelity_gate + verdict = "WIN" if (faster and resident and fid_ok) else "no" + if verdict != "WIN": + native_wins_all = False + print(f" L={L}: native 
{nv['latency_med']:.2f} vs ane_cat {a['latency_med']:.2f} ms " + f"({speedup:+.1f}% {'faster' if faster else 'slower'}); " + f"ANE {nv['ane_pct']:.2f}% (resident={resident}); " + f"cosine {nv['cosine']:.5f} (ok={fid_ok}) → {verdict}") + + print() + if native_wins_all: + print(" ✅ native RMSNorm WINS at all L → make norm_impl='native' the pplx-embed " + "default (build_pplx_embed_bundle.py / encoder).") + else: + print(" ❌ native does not clear the gate at every L → keep ane_cat; record the " + "negative result. (See per-L lines above.)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/conversion/experiment_batching.py b/conversion/experiment_batching.py new file mode 100644 index 0000000..09df6d6 --- /dev/null +++ b/conversion/experiment_batching.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +"""Rigorous CoreML batching throughput experiment for the pplx-embed encoder. + +Question: does batching (B>1) give throughput gains on Apple Silicon, and if the +earlier quick test showed FLAT docs/sec, is that real (and why) or a measurement +blind spot? + +For each (L, B) we build a pooled_fp16 batched encoder, convert with coremltools +(shape (B, L)), then load+time it under three compute-unit settings and compute +per-doc latency = batch_latency / B and docs/sec = B / batch_latency. + +We also: + - audit the actual compute-device placement (MLComputePlan) of a batched model, + - run a control (1 batch-N predict vs N sequential B=1 predicts), + - sanity-check that batching is real (distinct input rows -> distinct outputs). 
def snapshot_dir() -> str:
    """Return a local directory that holds the ``HF_REPO`` checkpoint files.

    If ``HF_REPO`` already names an existing directory it is returned unchanged;
    otherwise the files matching the allow-list are obtained through
    ``huggingface_hub.snapshot_download`` and that function's result is returned.
    """
    if os.path.isdir(HF_REPO):
        return HF_REPO
    # Imported lazily so a purely local run never touches huggingface_hub.
    from huggingface_hub import snapshot_download
    return snapshot_download(
        HF_REPO,
        allow_patterns=["*.json", "*.safetensors", "tokenizer*", "*.txt", "*.py", "1_Pooling/*"],
    )


# One weight-loaded torch encoder per rope_len, shared by every batch size B.
_TORCH_CACHE: dict[int, PplxEmbedModel] = {}


def torch_model(snap: str, rope_len: int) -> PplxEmbedModel:
    """Build (once per rope_len) the weight-loaded, rescaled torch encoder.

    Args:
        snap: checkpoint directory containing ``config.json`` and the weights.
        rope_len: value passed to the config as ``max_seq_len``; also the cache key.

    Returns:
        The cached ``PplxEmbedModel`` in eval mode with ``output_mode="pooled_fp16"``.
    """
    if rope_len in _TORCH_CACHE:
        return _TORCH_CACHE[rope_len]
    cfg = Qwen3EncoderConfig.from_json(os.path.join(snap, "config.json"), max_seq_len=rope_len)
    model = PplxEmbedModel(cfg, output_mode="pooled_fp16").eval()
    load_encoder_weights(model.encoder, snap)
    apply_fp16_residual_rescale(model.encoder, RESCALE_K)
    _TORCH_CACHE[rope_len] = model
    return model


def build_mlpackage(snap: str, L: int, B: int, out_dir: str) -> str:
    """Convert a (B, L) pooled_fp16 encoder to a .mlpackage. Cached on disk.

    Returns ``<out_dir>/enc_L{L}_B{B}.mlpackage``. A package that already exists at
    that path is reused as-is (no completeness or freshness check is performed).
    """
    pkg = os.path.join(out_dir, f"enc_L{L}_B{B}.mlpackage")
    if os.path.exists(pkg):
        return pkg
    os.makedirs(out_dir, exist_ok=True)
    model = torch_model(snap, rope_len=L)
    # Trace with fixed-shape dummy inputs: all-zero ids and an all-ones mask.
    sample_ids = torch.zeros((B, L), dtype=torch.int32)
    sample_mask = torch.ones((B, L), dtype=torch.float16)
    with torch.no_grad():
        traced = torch.jit.trace(model, (sample_ids, sample_mask))
    inputs = [
        ct.TensorType(name="input_ids", shape=(B, L), dtype=np.int32),
        ct.TensorType(name="attention_mask", shape=(B, L), dtype=np.float16),
    ]
    mlmodel = ct.convert(
        traced,
        inputs=inputs,
        outputs=[ct.TensorType(name="embedding", dtype=np.float16)],
        minimum_deployment_target=ct.target.macOS26,
        compute_units=ct.ComputeUnit.ALL,
    )
    mlmodel.save(pkg)
    # Drop the traced graph and converted model before the next conversion starts.
    del traced, mlmodel
    gc.collect()
    return pkg


def make_inputs(B: int, L: int, distinct: bool = False) -> dict:
    """Create deterministic (seed 0) feeds of shape (B, L) for the encoder.

    With ``distinct=False`` a single random row is repeated B times; with
    ``distinct=True`` every row is drawn independently. Token ids are int32 in
    [1, 5000); the attention mask is all ones in float16.
    """
    rng = np.random.default_rng(0)
    if distinct:
        ids = rng.integers(1, 5000, size=(B, L)).astype(np.int32)
    else:
        ids = rng.integers(1, 5000, size=(1, L)).astype(np.int32)
        ids = np.repeat(ids, B, axis=0).astype(np.int32)
    mask = np.ones((B, L), dtype=np.float16)
    return {"input_ids": ids, "attention_mask": mask}


def time_model(mlmodel, feeds: dict, n_warm: int = 3, n_runs: int = 8) -> dict:
    """Time ``mlmodel.predict(feeds)`` and summarise the per-call latency.

    Args:
        mlmodel: any object exposing ``predict(feeds)``.
        feeds: input dict handed to every ``predict`` call.
        n_warm: untimed warm-up calls made before measuring.
        n_runs: timed calls; must be at least 1.

    Returns:
        Dict with ``median_s``, ``min_s``, ``max_s`` (seconds) and ``runs``, the
        timed samples sorted ascending.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1.
    """
    import statistics

    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")
    for _ in range(n_warm):
        mlmodel.predict(feeds)
    samples = []
    for _ in range(n_runs):
        t0 = time.perf_counter()
        mlmodel.predict(feeds)
        samples.append(time.perf_counter() - t0)
    samples.sort()
    return {
        # True median: for an even number of runs this averages the two middle
        # samples instead of reporting the upper-middle one.
        "median_s": statistics.median(samples),
        "min_s": samples[0],
        "max_s": samples[-1],
        "runs": samples,
    }


def device_label(usage) -> str:
    """Map a compute-device usage object to "ANE", "GPU", "CPU" or "unknown".

    The preferred device is read from ``usage.preferred_compute_device`` (falling
    back to ``usage.preferred``) and classified by substrings of its class name;
    an unrecognised class name is returned verbatim.
    """
    if usage is None:
        return "unknown"
    pref = getattr(usage, "preferred_compute_device", None) or getattr(usage, "preferred", None)
    if pref is None:
        return "unknown"
    name = type(pref).__name__
    if "Neural" in name or "ANE" in name:
        return "ANE"
    if "GPU" in name:
        return "GPU"
    if "CPU" in name:
        return "CPU"
    return name
def _iter_ops(ms):
    """Yield every operation of a model structure's ML Program, nested blocks included.

    Yields nothing when ``ms`` has no ``program`` attribute (or it is ``None``).
    Operations of a block are yielded in order; blocks nested inside an operation
    are visited after the block that contains them has been exhausted.
    """
    prog = getattr(ms, "program", None)
    if prog is None:
        return
    for func in prog.functions.values():
        stack = [func.block]
        while stack:
            blk = stack.pop()
            for op in blk.operations:
                yield op
                stack.extend(getattr(op, "blocks", ()) or ())


# Maps an .mlpackage path to the persistent .mlmodelc compiled from it.
_COMPILE_CACHE: dict[str, str] = {}


def _mlmodelc_dst(pkg: str) -> str:
    """Return the sibling ``.mlmodelc`` path for ``pkg``; never ``pkg`` itself."""
    base = os.path.normpath(pkg)
    root, ext = os.path.splitext(base)
    # Only the final extension is swapped, so a parent directory that happens to
    # contain ".mlpackage" is left untouched.
    return (root if ext == ".mlpackage" else base) + ".mlmodelc"


def _compiled_path(pkg: str) -> str:
    """Compile an .mlpackage to a persistent .mlmodelc once (MLComputePlan needs
    a compiled model, and the temp one from get_compiled_model_path() is deleted
    when its MLModel is GC'd — so copy it to a stable location).

    The destination sits next to ``pkg`` with the extension swapped. An existing
    destination is removed and rewritten. Results are memoised per ``pkg`` for the
    lifetime of the process.
    """
    if pkg in _COMPILE_CACHE:
        return _COMPILE_CACHE[pkg]
    import shutil
    m = ct.models.MLModel(pkg, compute_units=ct.ComputeUnit.CPU_ONLY)
    tmp = m.get_compiled_model_path()
    dst = _mlmodelc_dst(pkg)
    if os.path.exists(dst):
        shutil.rmtree(dst)
    shutil.copytree(tmp, dst)  # copy before `m` is GC'd / tmp is cleaned
    del m
    _COMPILE_CACHE[pkg] = dst
    return dst


def audit_devices(pkg: str, compute_unit: ct.ComputeUnit) -> Counter:
    """Count non-const operations per preferred compute device.

    Args:
        pkg: an ``.mlpackage`` (compiled on demand) or an already compiled model path.
        compute_unit: compute-unit setting the compute plan is loaded with.

    Returns:
        Counter keyed by ``device_label`` ("ANE", "GPU", "CPU", "unknown", ...).
    """
    path = _compiled_path(pkg) if pkg.endswith(".mlpackage") else pkg
    plan = MLComputePlan.load_from_path(path=path, compute_units=compute_unit)
    ms = plan.model_structure
    by_dev = Counter()
    for op in _iter_ops(ms):
        if op.operator_name == "const":
            continue
        try:
            usage = plan.get_compute_device_usage_for_mlprogram_operation(op)
        except Exception:
            # Best effort: an op whose usage cannot be queried is counted as "unknown".
            usage = None
        by_dev[device_label(usage)] += 1
    return by_dev


def fmt_devs(c: Counter) -> str:
    """Format a device Counter as ``"ANE:3(75%), CPU:1(25%)"``, most common first."""
    tot = sum(c.values()) or 1  # avoid dividing by zero for an empty counter
    return ", ".join(f"{d}:{n}({100*n/tot:.0f}%)" for d, n in c.most_common())


class _Tee:
    """Minimal write-only stream that duplicates output to several streams."""

    def __init__(self, *streams):
        self.streams = streams

    def write(self, s):
        # Flush after every write so the log file keeps pace with the console.
        for st in self.streams:
            st.write(s)
            st.flush()

    def flush(self):
        for st in self.streams:
            st.flush()
ap.add_argument("--runs", type=int, default=8) + ap.add_argument("--out-dir", default=os.path.join(ROOT, "experiments", "batching_models")) + ap.add_argument("--log", default=os.path.join(ROOT, "experiments", "batching_out.log")) + args = ap.parse_args() + + os.makedirs(os.path.dirname(args.log), exist_ok=True) + _logf = open(args.log, "w") + sys.stdout = _Tee(sys.__stdout__, _logf) + + Ls = [128, 512] + Bs = [1, 4, 16] if args.quick else [1, 4, 16, 64] + units = ["CPU_AND_NE", "CPU_AND_GPU", "CPU_ONLY"] + + snap = snapshot_dir() + print(f"snapshot: {snap}") + print(f"Ls={Ls} Bs={Bs} units={units} runs={args.runs}\n") + + # ---- Build all needed mlpackages first (one torch model per L) ---- + pkgs: dict[tuple[int, int], str] = {} + for L in Ls: + for B in Bs: + print(f"[build] L={L} B={B} ...", flush=True) + pkgs[(L, B)] = build_mlpackage(snap, L, B, args.out_dir) + # free torch + _TORCH_CACHE.clear() + gc.collect() + + # ---- Timing sweep ---- + # rows: (L, B, unit) -> result + results = {} + for unit in units: + for L in Ls: + for B in Bs: + pkg = pkgs[(L, B)] + try: + m = ct.models.MLModel(pkg, compute_units=CU[unit]) + except Exception as e: + print(f" load FAIL L={L} B={B} {unit}: {e}") + continue + feeds = make_inputs(B, L, distinct=False) + try: + r = time_model(m, feeds, n_runs=args.runs) + except Exception as e: + print(f" predict FAIL L={L} B={B} {unit}: {e}") + del m + gc.collect() + continue + bl = r["median_s"] + results[(L, B, unit)] = { + "batch_lat_ms": bl * 1e3, + "per_doc_ms": bl / B * 1e3, + "docs_per_s": B / bl, + } + print(f" L={L:4d} B={B:3d} {unit:12s} " + f"batch={bl*1e3:8.2f}ms per-doc={bl/B*1e3:7.3f}ms " + f"docs/s={B/bl:8.2f}") + del m + gc.collect() + + # ---- Print tables ---- + print("\n\n========== docs/sec (rows=B, cols=unit) ==========") + for L in Ls: + print(f"\n--- L={L} ---") + header = " B " + "".join(f"{u:>14s}" for u in units) + print(header) + for B in Bs: + row = f"{B:4d} " + for u in units: + r = results.get((L, B, u)) + 
row += f"{r['docs_per_s']:14.2f}" if r else f"{'-':>14s}" + print(row) + + print("\n\n========== per-doc latency ms (rows=B, cols=unit) ==========") + for L in Ls: + print(f"\n--- L={L} ---") + header = " B " + "".join(f"{u:>14s}" for u in units) + print(header) + for B in Bs: + row = f"{B:4d} " + for u in units: + r = results.get((L, B, u)) + row += f"{r['per_doc_ms']:14.3f}" if r else f"{'-':>14s}" + print(row) + + # ---- Speedup vs B=1 (docs/sec ratio) ---- + print("\n\n========== batch speedup = docs/s(B) / docs/s(B=1) ==========") + for L in Ls: + print(f"\n--- L={L} ---") + header = " B " + "".join(f"{u:>14s}" for u in units) + print(header) + for B in Bs: + row = f"{B:4d} " + for u in units: + r = results.get((L, B, u)) + r1 = results.get((L, 1, u)) + if r and r1: + row += f"{r['docs_per_s']/r1['docs_per_s']:13.2f}x" + else: + row += f"{'-':>14s}" + print(row) + + # ---- Device audit for B=64 (or max B), L=128 ---- + maxB = Bs[-1] + print(f"\n\n========== DEVICE PLACEMENT (L=128, B={maxB}) ==========") + pkg = pkgs[(128, maxB)] + for unit in units: + try: + c = audit_devices(pkg, CU[unit]) + print(f" requested {unit:12s} -> {fmt_devs(c)}") + except Exception as e: + print(f" audit {unit} failed: {e}") + # also B=1 L=128 for contrast + print(f"\n (contrast) L=128 B=1:") + for unit in units: + try: + c = audit_devices(pkgs[(128, 1)], CU[unit]) + print(f" requested {unit:12s} -> {fmt_devs(c)}") + except Exception as e: + print(f" audit {unit} failed: {e}") + + # ---- Control: 1x batch-N vs N x batch-1 (best compute unit per case) ---- + print(f"\n\n========== CONTROL: batch-N predict vs N sequential B=1 ==========") + for unit in ["CPU_AND_NE", "CPU_AND_GPU", "CPU_ONLY"]: + for L in Ls: + B = maxB + mN = ct.models.MLModel(pkgs[(L, B)], compute_units=CU[unit]) + m1 = ct.models.MLModel(pkgs[(L, 1)], compute_units=CU[unit]) + feedsN = make_inputs(B, L, distinct=True) + feeds1_list = [ + {"input_ids": feedsN["input_ids"][i:i+1], + "attention_mask": 
feedsN["attention_mask"][i:i+1]} + for i in range(B) + ] + # warm + for _ in range(2): + mN.predict(feedsN) + m1.predict(feeds1_list[0]) + # batch-N + tb = [] + for _ in range(5): + t0 = time.perf_counter() + mN.predict(feedsN) + tb.append(time.perf_counter() - t0) + tb.sort(); batchN = tb[len(tb)//2] + # N sequential + ts = [] + for _ in range(3): + t0 = time.perf_counter() + for f in feeds1_list: + m1.predict(f) + ts.append(time.perf_counter() - t0) + ts.sort(); seqN = ts[len(ts)//2] + print(f" {unit:12s} L={L:4d} B={B}: batch-N={batchN*1e3:8.1f}ms " + f"{B}x(B=1)={seqN*1e3:8.1f}ms speedup={seqN/batchN:5.2f}x") + del mN, m1 + gc.collect() + + # ---- Sanity: distinct rows -> distinct outputs ---- + print(f"\n\n========== SANITY: batching is real (distinct rows) ==========") + L, B = 128, min(4, maxB) + m = ct.models.MLModel(pkgs[(L, B)], compute_units=ct.ComputeUnit.CPU_AND_NE) + feeds = make_inputs(B, L, distinct=True) + out = m.predict(feeds)["embedding"] + out = np.asarray(out) + print(f" output shape: {out.shape}") + # pairwise check rows differ + allsame = True + for i in range(B): + for j in range(i+1, B): + d = float(np.abs(out[i] - out[j]).max()) + if d > 1e-4: + allsame = False + print(f" row {i} vs {j}: max|diff|={d:.4f}") + print(f" -> distinct inputs give {'DUPLICATE (BROADCAST BUG!)' if allsame else 'DISTINCT'} outputs") + # also confirm a single row matches the B=1 model on same input + m1 = ct.models.MLModel(pkgs[(L, 1)], compute_units=ct.ComputeUnit.CPU_AND_NE) + o1 = np.asarray(m1.predict({"input_ids": feeds["input_ids"][0:1], + "attention_mask": feeds["attention_mask"][0:1]})["embedding"]) + d01 = float(np.abs(out[0] - o1[0]).max()) + print(f" batch row0 vs B=1 model on same input: max|diff|={d01:.4f} " + f"({'MATCH' if d01 < 0.05 else 'MISMATCH'})") + + print("\nDONE.") + + +if __name__ == "__main__": + main() diff --git a/conversion/experiment_w8a8.py b/conversion/experiment_w8a8.py new file mode 100644 index 0000000..6d0b814 --- /dev/null +++ 
b/conversion/experiment_w8a8.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python3 +"""W8A8 (int8 weights + int8 ACTIVATIONS) viability probe for pplx-embed. + +Milestone B4. WEIGHT-only quant is a dead end for this encoder (int8 linear ~0.42 +cosine; int4 palettize ~0.905) *and* only buys 4-8% latency, because the model is +activation/compute-bound (fp16 attention), not weight-bandwidth-bound. The real +bandwidth lever is ACTIVATION quantization. This script answers empirically: can +W8A8 reach acceptable fidelity, or does it hit the attention-family wall (~cos 0.57)? + +Pipeline (per the coremltools activation-quant flow): + 1. Build an fp16 pooled_fp16 encoder at a SMALL bucket (L=128/256) — output_mode + "pooled_fp16" so the CoreML pooled vector is Python-readable on macOS26. + 2. Calibrate activation ranges on a small multilingual corpus (tokenized + padded + to the bucket) via cto.experimental.linear_quantize_activations. + 3. Quantize weights int8 (linear_symmetric) on top -> W8A8. + 4. Predict on the eval texts, quantize the pooled output with int8_tanh_quant, and + compute cosine vs the fp32 Reference oracle. Report mean/min. + +Two activation-quant modes are exposed because the attention pad-mask uses a large +negative sentinel (Qwen3Encoder.NEG_INF = -1e4; CoreML may lower the mask add to the +fp16 -65504 floor). A SYMMETRIC activation quantizer maps that catastrophically: +scale = 1e4/127 ~= 79, so real attention scores (+-10) round to ~0 and the model +collapses. ASYMMETRIC (mode="linear") lets the range span [-1e4, +score]; when the +span overflows fp16 the scale goes inf and coremltools SKIPS that op (left in fp16) — +which is exactly what we want for the mask add, while every other activation +quantizes normally. 
+ +Usage: + uv run python conversion/experiment_w8a8.py --bucket 128 --mode asymmetric --rescale-k 8 + uv run python conversion/experiment_w8a8.py --bucket 128 --mode symmetric --rescale-k 8 + uv run python conversion/experiment_w8a8.py --bucket 128 --mode asymmetric --rescale-k 0 # no rescale + uv run python conversion/experiment_w8a8.py --all # sweep the key variants + baselines +""" +from __future__ import annotations + +import argparse +import os +import shutil +import sys + +import numpy as np +import torch + +ROOT = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, ROOT) +sys.path.insert(0, os.path.join(ROOT, "models")) + +import coremltools as ct # noqa: E402 +import coremltools.optimize.coreml as cto # noqa: E402 + +from models.qwen3_encoder import ( # noqa: E402 + Qwen3EncoderConfig, + PplxEmbedModel, + load_encoder_weights, + apply_fp16_residual_rescale, +) +from pplx_embed_reference import Reference, int8_tanh_quant, cosine_similarity # noqa: E402 + +HF_REPO = "perplexity-ai/pplx-embed-v1-0.6b" +fp16_mil = ct.converters.mil.mil.types.fp16 + + +# --------------------------------------------------------------------------- # +# coremltools compatibility / behaviour patches. +# --------------------------------------------------------------------------- # +def _patch_coremltools_cast() -> None: + """coremltools 9 _cast() folds const int/bool casts but calls int()/bool() on + numpy>=2 (1,)-shaped arrays, which raises. 
Extract the Python scalar first.""" + from coremltools.converters.mil.frontend.torch import ops as _torch_ops + from coremltools.converters.mil.frontend.torch.ops import _get_inputs + from coremltools.converters.mil.mil import Builder as mb + + def _cast_patched(context, node, dtype, dtype_name): + inputs = _get_inputs(context, node, expected=1) + x = inputs[0] + if not (len(x.shape) == 0 or np.all([d == 1 for d in x.shape])): + raise ValueError("input to cast must be either a scalar or a length 1 tensor") + if x.can_be_folded_to_const(): + val = x.val + if isinstance(val, np.ndarray) and val.ndim >= 1: + val = val.item() + if not isinstance(val, dtype): + res = mb.const(val=dtype(val), name=node.name) + else: + res = mb.const(val=val, name=node.name) + elif len(x.shape) > 0: + x = mb.squeeze(x=x, name=node.name + "_item") + res = mb.cast(x=x, dtype=dtype_name, name=node.name) + else: + res = mb.cast(x=x, dtype=dtype_name, name=node.name) + context.add(res, node.name) + + _torch_ops._cast = _cast_patched + + +def _patch_coremltools_act_quant() -> None: + """insert_prefix_quantize_dequantize_pair tries to wrap every supported op with a + quantize/dequantize pair, including ops whose input x is int32 (mask add / expand, + embedding path). MIL `quantize` requires float input, so int32-input ops crash with + 'scale has dtype fp32 whereas input has dtype int32'. 
Skip non-float-input ops.""" + from coremltools.optimize.coreml import _quantization_passes + from coremltools.converters.mil.mil import types as mil_types + + _orig = _quantization_passes.insert_prefix_quantize_dequantize_pair.transform_op + + def _patched(self, op): + x_var = op.inputs.get("x") + if x_var is not None and not mil_types.is_float(x_var.dtype): + return + return _orig(self, op) + + _quantization_passes.insert_prefix_quantize_dequantize_pair.transform_op = _patched + + +_patch_coremltools_cast() +_patch_coremltools_act_quant() + + +# --------------------------------------------------------------------------- # +# Calibration / eval corpus. +# --------------------------------------------------------------------------- # +CALIBRATION_TEXTS = [ + "The transformer architecture has revolutionized natural language processing.", + "Apple Silicon's Neural Engine achieves high energy efficiency for ML workloads.", + "Bidirectional attention lets every token attend to every other token.", + "Retrieval-augmented generation grounds responses in external knowledge.", + "El procesamiento del lenguaje natural ha avanzado mucho en los ultimos anos.", + "Le modele encode chaque phrase en un vecteur dense de grande dimension.", + "Maschinelles Lernen ermoglicht effiziente Inferenz direkt auf dem Geraet.", + "深層学習はテキストを密なベクトル表現に変換します。", + "向量检索通过余弦相似度衡量语义相关性。", + "machine learning", + "natural language processing", + "Cosine similarity measures semantic relatedness between embedding vectors.", + "Late chunking encodes long documents with a single bidirectional forward pass.", + "The int8 quantization of embeddings reduces memory bandwidth and latency.", +] + +EVAL_TEXTS = [ + "Quantum computing leverages superposition and entanglement for computation.", + "La inteligencia artificial transforma la manera en que trabajamos.", + "Les reseaux de neurones apprennent des representations hierarchiques.", + "Neuronale Netze lernen hierarchische Merkmalsrepraesentationen.", + 
"気候変動は地球規模で生態系に影響を与えています。", + "知识图谱将实体和关系组织成结构化的网络。", + "Vector databases enable fast approximate nearest neighbor search at scale.", + "Photosynthesis converts light energy into chemical energy in plants.", + "The stock market reacted sharply to the central bank's announcement.", + "Renewable energy sources are critical to mitigating climate change.", + "embeddings", + "A short multilingual sentence. Une phrase courte. Ein kurzer Satz.", +] + + +# --------------------------------------------------------------------------- # +# Tokenization + padding to a fixed bucket. +# --------------------------------------------------------------------------- # +def tokenize_padded(tokenizer, texts: list[str], bucket: int) -> list[dict]: + """Right-pad each text to `bucket`. Returns list of {input_ids, attention_mask} + with input_ids int32 [1,L] and attention_mask fp16 [1,L] (1 valid / 0 pad).""" + pad_id = tokenizer.pad_token_id or 0 + out = [] + for t in texts: + enc = tokenizer([t], padding=False, truncation=True, max_length=bucket, + return_tensors="np") + ids = enc["input_ids"].astype(np.int32) + mask = enc["attention_mask"].astype(np.float16) + L = ids.shape[1] + if L < bucket: + pad = bucket - L + ids = np.concatenate([ids, np.full((1, pad), pad_id, np.int32)], axis=1) + mask = np.concatenate([mask, np.zeros((1, pad), np.float16)], axis=1) + out.append({"input_ids": ids, "attention_mask": mask}) + return out + + +# --------------------------------------------------------------------------- # +# Build. 
# --------------------------------------------------------------------------- #
def build_fp16_encoder(snap: str, bucket: int, rescale_k: float) -> ct.models.MLModel:
    """Trace the pooled_fp16 encoder at a fixed (1, bucket) shape and convert it.

    The residual rescale is applied only when ``rescale_k`` is truthy, so passing
    0 builds the un-rescaled model.
    """
    config_path = os.path.join(snap, "config.json")
    encoder_cfg = Qwen3EncoderConfig.from_json(config_path, max_seq_len=bucket)
    net = PplxEmbedModel(encoder_cfg, output_mode="pooled_fp16").eval()
    load_encoder_weights(net.encoder, snap)
    if rescale_k:
        apply_fp16_residual_rescale(net.encoder, rescale_k)

    shape = (1, bucket)
    # Dummy trace inputs: all-zero ids with an all-ones mask.
    example = (
        torch.zeros(shape, dtype=torch.int32),
        torch.ones(shape, dtype=torch.float16),
    )
    with torch.no_grad():
        traced = torch.jit.trace(net, example)

    return ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="input_ids", shape=shape, dtype=np.int32),
            ct.TensorType(name="attention_mask", shape=shape, dtype=np.float16),
        ],
        outputs=[ct.TensorType(name="embedding", dtype=np.float16)],
        minimum_deployment_target=ct.target.macOS26,
        compute_units=ct.ComputeUnit.ALL,
    )


def quantize_w8a8(fp16_model: ct.models.MLModel, calib: list[dict], mode: str) -> ct.models.MLModel:
    """mode: 'asymmetric' -> activation mode='linear'; 'symmetric' -> 'linear_symmetric'.

    Activations are quantized first using the calibration samples; int8
    linear-symmetric weight quantization is then applied to that result.
    Any ``mode`` other than 'asymmetric' selects the symmetric activation quantizer.
    """
    if mode == "asymmetric":
        activation_mode = "linear"
    else:
        activation_mode = "linear_symmetric"

    activation_cfg = cto.OptimizationConfig(
        global_config=cto.experimental.OpActivationLinearQuantizerConfig(mode=activation_mode),
    )
    activations_quantized = cto.experimental.linear_quantize_activations(
        fp16_model, activation_cfg, calib
    )

    weight_cfg = cto.OptimizationConfig(
        global_config=cto.OpLinearQuantizerConfig(
            mode="linear_symmetric", dtype=np.int8, weight_threshold=512,
        )
    )
    return cto.linear_quantize_weights(activations_quantized, weight_cfg)


# --------------------------------------------------------------------------- #
# Measure.
def predict_pooled(mlmodel: ct.models.MLModel, samples: list[dict]) -> np.ndarray:
    """Run ``mlmodel`` on each sample and stack the flattened float32 embeddings.

    Returns an array with one row per sample ([N, 1024] for this encoder).
    """
    rows = []
    for s in samples:
        out = mlmodel.predict({"input_ids": s["input_ids"], "attention_mask": s["attention_mask"]})
        rows.append(np.asarray(out["embedding"], dtype=np.float32).reshape(-1))
    return np.stack(rows, axis=0)  # [N, 1024]


def fidelity(pooled_fp16: np.ndarray, ref_int8: np.ndarray) -> tuple[float, float, np.ndarray]:
    """Cosine fidelity of the int8-quantized pooled output against the reference.

    Returns ``(mean, min, cos)`` where ``cos`` holds only the finite cosines.
    Because non-finite entries are dropped, ``cos`` can be shorter than the input
    and no longer index-aligned with it. If no finite cosine remains (for example
    the model emitted NaNs for every text), mean and min are NaN instead of the
    call failing on an empty reduction.
    """
    cm_int8 = int8_tanh_quant(pooled_fp16).astype(np.float32)
    cos = np.asarray(cosine_similarity(cm_int8, ref_int8.astype(np.float32)))
    cos = cos[np.isfinite(cos)]
    if cos.size == 0:
        return float("nan"), float("nan"), cos
    return float(cos.mean()), float(cos.min()), cos
+ fp16_pooled = predict_pooled(fp16_model, eval_samples) + fp16_mean, fp16_min, _ = fidelity(fp16_pooled, ref_int8) + print(f" fp16 baseline: mean={fp16_mean:.4f} min={fp16_min:.4f}", flush=True) + + print(f" quantizing W8A8 (activation mode={mode}) ...", flush=True) + w8a8 = quantize_w8a8(fp16_model, calib, mode) + w8a8_pooled = predict_pooled(w8a8, eval_samples) + w8a8_mean, w8a8_min, cos = fidelity(w8a8_pooled, ref_int8) + print(f" W8A8 {mode:10s}: mean={w8a8_mean:.4f} min={w8a8_min:.4f}", flush=True) + print(f" per-text cos: {np.round(cos, 3).tolist()}", flush=True) + + pkg = None + if save: + pkg = os.path.join(out_root, f"{label}.mlpackage") + if os.path.exists(pkg): + shutil.rmtree(pkg) + os.makedirs(out_root, exist_ok=True) + w8a8.save(pkg) + print(f" saved {pkg}", flush=True) + + return { + "label": label, "bucket": bucket, "mode": mode, "rescale_k": rescale_k, + "fp16_mean": fp16_mean, "fp16_min": fp16_min, + "w8a8_mean": w8a8_mean, "w8a8_min": w8a8_min, "pkg": pkg, + } + + +def audit_ane(pkg: str) -> None: + """Compile a .mlpackage and report ANE/CPU/GPU op residency (no xcrun needed).""" + from collections import Counter + from coremltools.models.utils import compile_model + from coremltools.models.compute_plan import MLComputePlan + + mlc = pkg.rstrip("/") + ".mlmodelc" + if os.path.exists(mlc): + shutil.rmtree(mlc) + compiled = compile_model(pkg, mlc) + print(f"\n=== ANE audit: {compiled} ===", flush=True) + plan = MLComputePlan.load_from_path(path=compiled, compute_units=ct.ComputeUnit.CPU_AND_NE) + ms = plan.model_structure + prog = getattr(ms, "program", None) + by_dev = Counter() + by_op_dev = Counter() + total = 0 + + def walk(block, fn): + for op in block.operations: + yield fn, op + for nb in getattr(op, "blocks", ()) or (): + yield from walk(nb, fn) + + for fn, func in prog.functions.items(): + for fname, op in walk(func.block, fn): + if op.operator_name == "const": + continue + try: + usage = 
plan.get_compute_device_usage_for_mlprogram_operation(op) + pref = getattr(usage, "preferred_compute_device", None) or getattr(usage, "preferred", None) + name = type(pref).__name__ if pref is not None else "unknown" + dev = "ANE" if ("Neural" in name or "ANE" in name) else ("GPU" if "GPU" in name else ("CPU" if "CPU" in name else name)) + except Exception: + dev = "unknown" + by_dev[dev] += 1 + by_op_dev[(op.operator_name, dev)] += 1 + total += 1 + print(f" total ops: {total}") + for dev, n in sorted(by_dev.items(), key=lambda kv: -kv[1]): + print(f" {dev}: {n} ({100.0*n/total:.1f}%)") + non_ane = [(o, d, n) for (o, d), n in by_op_dev.items() if d != "ANE"] + if non_ane: + print(" non-ANE ops:") + for o, d, n in sorted(non_ane, key=lambda t: -t[2])[:15]: + print(f" [{d:3s}] {o:<26s} {n}") + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--bucket", type=int, default=128) + ap.add_argument("--mode", default="asymmetric", choices=["asymmetric", "symmetric"]) + ap.add_argument("--rescale-k", type=float, default=8.0) + ap.add_argument("--all", action="store_true", help="Sweep the key variants") + ap.add_argument("--out", default="/tmp/w8a8-experiment") + ap.add_argument("--no-save", action="store_true") + ap.add_argument("--audit", default=None, help="Compile + ANE-audit an existing .mlpackage and exit") + args = ap.parse_args() + + if args.audit: + audit_ane(args.audit) + return + + from huggingface_hub import snapshot_download + snap = snapshot_download( + HF_REPO, + allow_patterns=["*.json", "*.safetensors", "tokenizer*", "*.txt", "*.py", "1_Pooling/*"], + ) + + print("Loading fp32 Reference oracle ...", flush=True) + ref = Reference(HF_REPO) + tokenizer = ref.tokenizer + ref_int8 = ref.embed(EVAL_TEXTS) # [N, 1024] int8 + + results = [] + if args.all: + variants = [ + (args.bucket, "asymmetric", 8.0), + (args.bucket, "symmetric", 8.0), + (args.bucket, "asymmetric", 0.0), + (args.bucket, "asymmetric", 16.0), + ] + for bucket, mode, k in variants: 
+ try: + results.append(run_variant(snap, tokenizer, ref_int8, bucket, mode, k, + args.out, save=not args.no_save)) + except Exception as e: + print(f" VARIANT FAILED ({mode}, k={k}): {e}", flush=True) + import traceback; traceback.print_exc() + else: + results.append(run_variant(snap, tokenizer, ref_int8, args.bucket, args.mode, + args.rescale_k, args.out, save=not args.no_save)) + + print(f"\n{'='*70}\nSUMMARY\n{'='*70}") + print(f"{'variant':<26s} {'fp16 mean':>10s} {'W8A8 mean':>10s} {'W8A8 min':>10s}") + for r in results: + print(f"{r['label']:<26s} {r['fp16_mean']:>10.4f} {r['w8a8_mean']:>10.4f} {r['w8a8_min']:>10.4f}") + print("\nReference points: fp16~0.999 | weight-only int8~0.42 | int4~0.905 | wall~0.57 | gate 0.990") + for r in results: + if r["pkg"]: + print(f" artifact: {r['pkg']}") + + +if __name__ == "__main__": + main() diff --git a/conversion/export_swift_fixtures.py b/conversion/export_swift_fixtures.py new file mode 100644 index 0000000..23aefed --- /dev/null +++ b/conversion/export_swift_fixtures.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +"""Export fidelity fixtures for the Swift pplx-embed bench. + +Native int8 model output is not readable from the Python CoreML bridge on +macOS26, so fidelity/latency for the int8 deliverable is measured in Swift. +This script produces the ground-truth side: pre-tokenized, bucket-padded inputs +plus the fp32-reference int8 embedding for each text. 
def main() -> int:
    """Write the Swift fidelity fixtures (padded token ids + reference int8) to JSON.

    For every entry of ``SENTENCES`` the text is tokenized (truncated to
    ``--max-seq-len``), right-padded with 0 to exactly that length, and the
    reference int8 embedding is computed over the unpadded tokens only.

    Returns:
        0 on success (used as the process exit status).
    """
    import torch

    ap = argparse.ArgumentParser()
    ap.add_argument("--hf-repo", default="perplexity-ai/pplx-embed-v1-0.6b")
    ap.add_argument("--max-seq-len", type=int, default=4096)
    ap.add_argument("--out", default="/tmp/pplx_fixtures.json")
    args = ap.parse_args()

    L = args.max_seq_len
    ref = R.Reference(args.hf_repo)
    tok = ref.tokenizer

    items = []
    for t in SENTENCES:
        enc = tok([t], return_tensors="pt", truncation=True, max_length=L)
        ids = enc["input_ids"][0].tolist()
        n = len(ids)
        if n > L:
            # Truncation is requested above, so this should not happen; fail loudly
            # rather than emit a row whose length differs from the declared L.
            raise ValueError(f"tokenizer returned {n} tokens for max_length={L}")
        padded = ids + [0] * (L - n)
        # Reference int8 over exactly these n tokens (matched truncation).
        # Inference only: no autograd graph is needed for the reference forward pass.
        with torch.inference_mode():
            rh = ref.model(input_ids=enc["input_ids"][:, :n],
                           attention_mask=torch.ones((1, n), dtype=torch.long)).last_hidden_state.float()
            pooled = R.masked_mean(rh, torch.ones((1, n)))
        ref_i8 = R.int8_tanh_quant(pooled).reshape(-1).astype(int).tolist()
        items.append({"text": t, "input_ids": padded, "n": n, "ref_int8": ref_i8})
        print(f"  fixture n={n:4d}  {t[:32]}")

    out = {"L": L, "hf_repo": args.hf_repo, "items": items}
    with open(args.out, "w") as f:
        json.dump(out, f)
    print(f"wrote {len(items)} fixtures (L={L}) → {args.out}")
    return 0
+ +Prereqs (build first; each ~1.1 GB): + python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 8192 + python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 8192 \ + --output-mode pooled_fp16 + python conversion/build_pplx_embed_bundle.py --model pplx-embed --dynamic-upper 8192 \ + --output-mode pooled_fp16 # the GPU catch-all to compare against + +Usage: + uv run python conversion/measure_l8192_bucket.py --n-tokens 8000 --iters 5 +""" +from __future__ import annotations + +import argparse +import os +import subprocess +import sys +import time +from collections import Counter + +import numpy as np + +ROOT = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, ROOT) + +OUT = os.path.join(ROOT, "..", "output", "pplx-embed") +ANE_FP16 = os.path.join(OUT, "L8192-pooled_fp16", "encoder.mlpackage") +ANE_INT8 = os.path.join(OUT, "L8192-int8", "encoder.mlpackage") +DYN_FP16 = os.path.join(OUT, "dyn8192-pooled_fp16", "encoder.mlpackage") + +SAMPLE = ( + "Embeddings map text into dense vectors so semantically similar passages land near " + "each other. 東京は日本の首都であり、世界有数の大都市圏を形成しています。 " + "La inteligencia artificial avanza rápido. Машинное обучение меняет обработку языка. " + "Retrieval-augmented generation grounds large language models in external knowledge. 
" +) + + +def _compile(pkg: str) -> str: + out_dir = os.path.dirname(pkg) + mlmodelc = os.path.join(out_dir, "encoder.mlmodelc") + if not os.path.isdir(mlmodelc): + subprocess.run(["xcrun", "coremlcompiler", "compile", pkg, out_dir], + check=True, capture_output=True) + return mlmodelc + + +def _residency(mlmodelc: str) -> tuple[float, int, Counter]: + import coremltools as ct + from coremltools.models.compute_plan import MLComputePlan + from audit_ane_residency import _iter_mlprogram_ops, _device_label + + plan = MLComputePlan.load_from_path(path=mlmodelc, compute_units=ct.ComputeUnit.CPU_AND_NE) + by_dev: Counter = Counter() + total = 0 + for _f, op in _iter_mlprogram_ops(plan.model_structure): + if op.operator_name == "const": + continue + try: + usage = plan.get_compute_device_usage_for_mlprogram_operation(op) + except Exception: + usage = None + by_dev[_device_label(usage)] += 1 + total += 1 + return (100.0 * by_dev.get("ANE", 0) / total if total else 0.0), total, by_dev + + +def _time(pkg: str, inputs: dict, units, iters: int, warmup: int): + import coremltools as ct + m = ct.models.MLModel(pkg, compute_units=units) + out = None + for _ in range(warmup): + out = m.predict(inputs) + ts = [] + for _ in range(iters): + t = time.time() + out = m.predict(inputs) + ts.append((time.time() - t) * 1000.0) + emb = np.asarray(out["embedding"]).astype(np.float32).reshape(1, -1) + return float(np.median(ts)), emb + + +def main() -> int: + ap = argparse.ArgumentParser(description="L=8192 ANE bucket ship gate") + ap.add_argument("--hf-repo", default=None) + ap.add_argument("--n-tokens", type=int, default=8000, help="valid tokens in the long input") + ap.add_argument("--iters", type=int, default=5) + ap.add_argument("--warmup", type=int, default=2) + ap.add_argument("--fidelity-gate", type=float, default=0.99) + args = ap.parse_args() + + for p, what in ((ANE_FP16, "L8192 pooled_fp16"), (DYN_FP16, "dyn8192 pooled_fp16")): + if not os.path.isdir(p): + print(f"MISSING {what}: 
{p}\n(build it first — see this script's header.)") + return 1 + + from config import MODEL_REGISTRY + hf_repo = args.hf_repo or MODEL_REGISTRY["pplx-embed"].hf_repo + import pplx_embed_reference as R + import torch + print(f"[ref] loading fp32 oracle {hf_repo} …") + ref = R.Reference(hf_repo) + tok = ref.tokenizer + + # Long multilingual input → n_tokens valid tokens. + text = SAMPLE + while len(tok.encode(text)) < args.n_tokens: + text = text + " " + SAMPLE + ids = tok([text], return_tensors="np", truncation=True, max_length=args.n_tokens)["input_ids"][0] + n = int(ids.shape[0]) + print(f"[input] {n} valid tokens") + + # fp32 reference pooled (this is the heavy step at long L). + print("[ref] fp32 forward (slow at long L) …") + with torch.inference_mode(): + ids_t = torch.from_numpy(ids.astype(np.int64)).view(1, -1) + mask_t = torch.ones((1, n), dtype=torch.float32) + hidden = ref.model(input_ids=ids_t, attention_mask=mask_t).last_hidden_state.float() + ref_pooled = R.masked_mean(hidden, mask_t).numpy().astype(np.float32) + + # --- 1. residency (int8 bucket if built, else the pooled_fp16 encoder body) ------- + res_pkg = ANE_INT8 if os.path.isdir(ANE_INT8) else ANE_FP16 + print(f"\n[1] ANE residency of {os.path.relpath(res_pkg, OUT)} …") + ane_pct, total, by_dev = _residency(_compile(res_pkg)) + print(f" ANE {ane_pct:.2f}% ({total} ops; {dict(by_dev)})") + + import coremltools as ct + L = 8192 + # --- 2a. ANE bucket: pad to 8192, CPU_AND_NE ------------------------------------ + pid = np.zeros((1, L), dtype=np.int32) + pid[0, :n] = ids + pam = np.zeros((1, L), dtype=np.float16) + pam[0, :n] = 1.0 + print(f"\n[2a] ANE L8192 bucket latency (CPU_AND_NE, padded to {L}) …") + ane_ms, ane_emb = _time(ANE_FP16, {"input_ids": pid, "attention_mask": pam}, + ct.ComputeUnit.CPU_AND_NE, args.iters, args.warmup) + ane_cos = float(R.cosine_similarity(ane_emb, ref_pooled)[0]) + print(f" median {ane_ms:.1f} ms cosine {ane_cos:.5f}") + + # --- 2b. 
dynamic GPU model: actual length, CPU_AND_GPU -------------------------- + did = ids.astype(np.int32).reshape(1, n) + dam = np.ones((1, n), dtype=np.float16) + print(f"\n[2b] dynamic GPU model latency (CPU_AND_GPU, actual {n}) …") + gpu_ms, gpu_emb = _time(DYN_FP16, {"input_ids": did, "attention_mask": dam}, + ct.ComputeUnit.CPU_AND_GPU, args.iters, args.warmup) + gpu_cos = float(R.cosine_similarity(gpu_emb, ref_pooled)[0]) + print(f" median {gpu_ms:.1f} ms cosine {gpu_cos:.5f}") + + # --- decision ------------------------------------------------------------------- + print("\n" + "=" * 64) + print("L=8192 SHIP GATE") + print("=" * 64) + resident = ane_pct >= 99.0 + faster = ane_ms < gpu_ms + fid_ok = ane_cos >= args.fidelity_gate + speedup = gpu_ms / ane_ms if ane_ms else 0.0 + print(f" ANE residency : {ane_pct:.2f}% (resident ≥99% = {resident})") + print(f" latency : ANE {ane_ms:.1f} ms vs GPU {gpu_ms:.1f} ms " + f"({speedup:.1f}× {'faster' if faster else 'SLOWER'})") + print(f" fidelity : ANE cosine {ane_cos:.5f} (≥{args.fidelity_gate} = {fid_ok})") + ship = resident and faster and fid_ok + print() + if ship: + print(" ✅ SHIP: L=8192 stays on the ANE and beats the GPU catch-all. Place " + "L8192-int8/ in the bundle dir; Swift auto-routes (no Swift change).") + else: + print(" ❌ DO NOT SHIP: gate not met (see above). The >4096 path stays the GPU " + "model; record the result.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/conversion/models/qwen3_encoder.py b/conversion/models/qwen3_encoder.py new file mode 100644 index 0000000..c805261 --- /dev/null +++ b/conversion/models/qwen3_encoder.py @@ -0,0 +1,477 @@ +"""Qwen3 bidirectional encoder (ANE-optimized) for pplx-embed. 
+ +Perplexity's `pplx-embed-v1-0.6b` (plain) and `pplx-embed-context-v1-0.6b` (late +chunking) are a **bidirectional** Qwen3-0.6B encoder (`PPLXQwen3Model`, see the HF +checkpoint's `modeling.py`): every token attends to every non-pad token, the model +returns `last_hidden_state`, and a downstream pooling + tanh-int8 head produces the +embedding. + +This is the ANE port — templated on `models/gemma3_encoder.py` but with Qwen3 math: + - single pre/post RMSNorm per layer (pre-norm; NOT Gemma's 4 sandwich norms) + - plain-weight RMSNorm (`x*rsqrt(..)*w`, no +1 gain — matches the working Qwen3 decoder) + - per-head QK-norm (RMSNorm over head_dim on Q and K, before RoPE) + - single RoPE table, θ=1e6 (NOT Gemma's dual local/global) + - SwiGLU MLP (silu), GQA 16 q / 8 kv heads, head_dim 128 (q proj = 2048) + - full bidirectional attention (pad-mask only, no causal triangle, no sliding window) + - NO embedding scaling (that is a Gemma-ism) + +ANE layout (docs/ANE_OPTIMIZATION_SURVEY.md + conversion/ane_ops.py): all projections +are Conv2d(1×1) on (B, C, 1, S); RMSNorm is native rsqrt (default) or cat([x,−x])→LayerNorm; GQA expansion uses +the local `_repeat_kv_b`; the residual stream is kept in fp32 (fp16 can overflow over 28 layers). +Fixed trace-time sequence length — variable length is handled by padding to a bucket. +""" + +from __future__ import annotations + +import gc +import json +import os +import sys + +import torch +import torch.nn as nn +import torch.nn.functional as F + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from ane_ops import ( # noqa: E402 + MODEL_DTYPE, + ANERMSNorm, + apply_rotary_pos_emb, + stable_attention, +) + +# Max chunks per document for the context (late-chunking) variant. 
+N_MAX_CHUNKS = 32 + + +def _repeat_kv_b(x: torch.Tensor, n_rep: int, B: int, num_kv_heads: int, + seq_len: int, head_dim: int) -> torch.Tensor: + """Batched GQA expansion: (B, kv, S, D) → (B, kv*n_rep, S, D), explicit shapes.""" + if n_rep == 1: + return x + x = x.unsqueeze(2).expand(B, num_kv_heads, n_rep, seq_len, head_dim) + return x.reshape(B, num_kv_heads * n_rep, seq_len, head_dim) + + +class Qwen3RMSNorm(nn.Module): + """Native RMSNorm `x * rsqrt(mean(x²)+eps) * w`, computed in fp32 (HF Qwen3 parity). + + A *local* A/B alternative to the shared `ane_ops.ANERMSNorm` cat([x,−x])→LayerNorm + trick. That trick was chosen years ago because the ANE had a highly-optimized + LayerNorm kernel and no native `rsqrt`; on current M4 Max / macOS 26 / coremltools 9 + that may no longer hold (see docs/PPLX_EMBED_GPU_RESIDENCY.md). This class lets the + pplx-embed encoder switch the 5 norm sites to native RMSNorm and measure. + + It stores a 1-D fp16 weight exactly like `ANERMSNorm`, so weight loading is + unchanged (both are a plain `.weight` of shape `(hidden,)`). The normalization is + done in fp32 (fp16 `x²` can overflow for large activations) and returned in the + input dtype, mirroring the HF Qwen3 RMSNorm the fp32 reference already matches. + NB: coremltools lowers the whole graph to fp16 at convert time, so the fp32 here is + a trace-time/fidelity nicety; on device the op runs in fp16 like the rest. 
+ """ + + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size, dtype=MODEL_DTYPE)) + self.eps = eps + self.hidden_size = hidden_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + in_dtype = x.dtype + x = x.to(torch.float32) + var = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(var + self.eps) + return x.to(in_dtype) * self.weight + + +def make_norm(norm_impl: str, hidden_size: int, eps: float) -> nn.Module: + """Select the RMSNorm implementation for the encoder's 5 local norm sites. + + "ane_cat" → shared `ANERMSNorm` (cat/chunk LayerNorm trick). "native" (the + Qwen3EncoderConfig default) → local `Qwen3RMSNorm` (native rsqrt). Both store a 1-D fp16 + weight, so swapping does not affect weight loading. + """ + if norm_impl == "native": + return Qwen3RMSNorm(hidden_size, eps=eps) + if norm_impl == "ane_cat": + return ANERMSNorm(hidden_size, eps=eps) + raise ValueError(f"Unknown norm_impl '{norm_impl}'; expected 'ane_cat' or 'native'.") + + +class Qwen3EncoderConfig: + """Qwen3 encoder config (read from the pplx-embed HF config.json).""" + + def __init__(self, **kwargs): + self.hidden_size = kwargs.get("hidden_size", 1024) + self.num_hidden_layers = kwargs.get("num_hidden_layers", 28) + self.num_attention_heads = kwargs.get("num_attention_heads", 16) + self.num_key_value_heads = kwargs.get("num_key_value_heads", 8) + self.head_dim = kwargs.get("head_dim", 128) + self.intermediate_size = kwargs.get("intermediate_size", 3072) + self.vocab_size = kwargs.get("vocab_size", 151936) + self.rms_norm_eps = kwargs.get("rms_norm_eps", 1e-6) + self.attention_bias = bool(kwargs.get("attention_bias", False)) + # rope_theta may live at top level or under rope_parameters. 
+ rp = kwargs.get("rope_parameters") or {} + self.rope_theta = float(kwargs.get("rope_theta", rp.get("rope_theta", 1_000_000.0))) + self.max_position_embeddings = kwargs.get("max_position_embeddings", 32768) + # Trace-time fixed sequence length (the bucket). + self.max_seq_len = kwargs.get("max_seq_len", 4096) + # RMSNorm implementation for the 5 local norm sites: "native" (local + # Qwen3RMSNorm, native rsqrt — the shipped default) or "ane_cat" (shared + # ANERMSNorm cat/chunk LayerNorm trick). native is the default because the A/B + # (experiment_ane_rmsnorm.py / docs/PPLX_EMBED_GPU_RESIDENCY.md follow-up) found + # it 12.7% (L=256) / 21.5% (L=512) faster on the ANE at identical 99.81% + # residency and cosine 0.99998 vs the fp32 oracle, on M4 Max / macOS 26 / + # coremltools 9. (The cat/chunk trick predates a native ANE rsqrt.) + self.norm_impl = kwargs.get("norm_impl", "native") + + @classmethod + def from_json(cls, path: str, max_seq_len: int = 4096, + norm_impl: str = "native") -> "Qwen3EncoderConfig": + with open(path) as f: + d = json.load(f) + d = d.get("text_config", d) + d["max_seq_len"] = max_seq_len + d["norm_impl"] = norm_impl + return cls(**d) + + +class Qwen3EncoderLayer(nn.Module): + """One bidirectional Qwen3 block (pre-norm, ANE layout).""" + + def __init__(self, config: Qwen3EncoderConfig): + super().__init__() + hidden = config.hidden_size + head_dim = config.head_dim + num_heads = config.num_attention_heads + num_kv_heads = config.num_key_value_heads + inter = config.intermediate_size + eps = config.rms_norm_eps + has_bias = config.attention_bias + norm_impl = config.norm_impl + + q_dim = num_heads * head_dim + kv_dim = num_kv_heads * head_dim + + self.self_attn = nn.ModuleDict({ + "q_proj": nn.Conv2d(hidden, q_dim, 1, bias=has_bias, dtype=MODEL_DTYPE), + "k_proj": nn.Conv2d(hidden, kv_dim, 1, bias=has_bias, dtype=MODEL_DTYPE), + "v_proj": nn.Conv2d(hidden, kv_dim, 1, bias=has_bias, dtype=MODEL_DTYPE), + "o_proj": nn.Conv2d(q_dim, hidden, 1, 
bias=False, dtype=MODEL_DTYPE), + # Qwen3 QK-norm: per-head RMSNorm over head_dim, plain weight. + "q_norm": make_norm(norm_impl, head_dim, eps), + "k_norm": make_norm(norm_impl, head_dim, eps), + }) + self.mlp = nn.ModuleDict({ + "gate_proj": nn.Conv2d(hidden, inter, 1, bias=False, dtype=MODEL_DTYPE), + "up_proj": nn.Conv2d(hidden, inter, 1, bias=False, dtype=MODEL_DTYPE), + "down_proj": nn.Conv2d(inter, hidden, 1, bias=False, dtype=MODEL_DTYPE), + }) + self.input_layernorm = make_norm(norm_impl, hidden, eps) + self.post_attention_layernorm = make_norm(norm_impl, hidden, eps) + + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + self.n_rep = num_heads // num_kv_heads + self.scale = float(head_dim) ** -0.5 + + def forward( + self, + hidden_states: torch.Tensor, # (1, L, H) fp32 + cos: torch.Tensor, # (1, 1, L, head_dim) + sin: torch.Tensor, # (1, 1, L, head_dim) + attention_mask: torch.Tensor, # (1, 1, L, L) fp16 additive (0 / −1e4) + seq_len: int, + ) -> torch.Tensor: + num_heads = self.num_heads + num_kv_heads = self.num_kv_heads + head_dim = self.head_dim + B = hidden_states.shape[0] + + residual = hidden_states + # Normalize in fp32 then downcast: the pre-norm residual can exceed fp16 + # max (65504) over 28 layers, so casting *before* the norm would inf. + # RMSNorm is scale-invariant; its output is O(1) and fp16-safe. + normed = self.input_layernorm(hidden_states).to(MODEL_DTYPE) + + # (B, H, 1, L) layout for Conv2d. + x = normed.permute(0, 2, 1).unsqueeze(2) + + # Q/K/V: (B, q_dim, 1, L) → (B, heads, L, head_dim). + q = self.self_attn["q_proj"](x).view(B, num_heads, head_dim, seq_len).permute(0, 1, 3, 2) + k = self.self_attn["k_proj"](x).view(B, num_kv_heads, head_dim, seq_len).permute(0, 1, 3, 2) + v = self.self_attn["v_proj"](x).view(B, num_kv_heads, head_dim, seq_len).permute(0, 1, 3, 2) + + # QK-norm per head, then RoPE. 
+ q = self.self_attn["q_norm"](q.reshape(B, num_heads, seq_len, head_dim)) + k = self.self_attn["k_norm"](k.reshape(B, num_kv_heads, seq_len, head_dim)) + q, k = apply_rotary_pos_emb(q, k, cos, sin) + + # GQA expansion (ANE-safe, batched). + k = _repeat_kv_b(k, self.n_rep, B, num_kv_heads, seq_len, head_dim) + v = _repeat_kv_b(v, self.n_rep, B, num_kv_heads, seq_len, head_dim) + + # Bidirectional attention (fp32), pad-mask only. scale = 1/sqrt(head_dim). + attn_out = stable_attention(q, k, v, self.scale, attention_mask) + + # (B, heads, L, head_dim) → (B, L, q_dim) → Conv2d o_proj. + attn_out = attn_out.permute(0, 2, 1, 3).contiguous().view(B, seq_len, num_heads * head_dim) + attn_out = self.self_attn["o_proj"]( + attn_out.permute(0, 2, 1).unsqueeze(2) + ).squeeze(2).permute(0, 2, 1) + + # fp32 residual add (attn_out is fp16 → upcast). + hidden_states = residual + attn_out.to(torch.float32) + + # MLP: post_attention_layernorm → SwiGLU → residual. + residual = hidden_states + normed = self.post_attention_layernorm(hidden_states).to(MODEL_DTYPE) + x_mlp = normed.permute(0, 2, 1).unsqueeze(2) + gate = self.mlp["gate_proj"](x_mlp) + up = self.mlp["up_proj"](x_mlp) + mlp_out = self.mlp["down_proj"](F.silu(gate) * up).squeeze(2).permute(0, 2, 1) + hidden_states = residual + mlp_out.to(torch.float32) + + return hidden_states + + +class Qwen3Encoder(nn.Module): + """Bidirectional Qwen3 encoder backbone → last_hidden_state (no pooling). 
+ + Input: + input_ids (1, L) int32 + attention_mask (1, L) fp16 — 1.0 for valid tokens, 0.0 for pad + Output: + hidden_states (1, L, hidden_size) MODEL_DTYPE (fp16) — final-norm output, downcast from the fp32 residual stream + """ + + NEG_INF = -1.0e4 # ANE-safe additive-mask value + + def __init__(self, config: Qwen3EncoderConfig): + super().__init__() + self.config = config + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList( + [Qwen3EncoderLayer(config) for _ in range(config.num_hidden_layers)] + ) + self.norm = make_norm(config.norm_impl, config.hidden_size, eps=config.rms_norm_eps) + self._build_rope(config) + + def _build_rope(self, config: Qwen3EncoderConfig): + head_dim = config.head_dim + # RoPE table size is DECOUPLED from the bucket (max_seq_len). We always build + # it to a single fixed length (max_position_embeddings, 32768) so the baked + # cos/sin constants are byte-identical across every bucket — that makes the + # whole CoreML weight.bin identical across buckets, so HF LFS / on-disk store + # it once instead of one ~1.19 GB blob per L. forward() slices [:S] at trace + # time; a runtime position_ids `gather` keeps that slice from being const- + # folded back into a per-bucket [S, head_dim] constant (verified by sha256). 
+ L = config.max_position_embeddings + t = torch.arange(L).float() + inv = 1.0 / (config.rope_theta ** (torch.arange(0, head_dim, 2).float() / head_dim)) + freqs = torch.einsum("i,j->ij", t, inv) + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(MODEL_DTYPE)) + self.register_buffer("sin_cached", emb.sin().to(MODEL_DTYPE)) + + def _pad_mask(self, attention_mask: torch.Tensor, S: int) -> torch.Tensor: + """(B, S) {1 valid, 0 pad} → (B, 1, S, S) additive fp16 (0 / −1e4), key-side.""" + B = attention_mask.shape[0] + key_pad = (1.0 - attention_mask).to(MODEL_DTYPE) * self.NEG_INF + return key_pad.view(B, 1, 1, S).expand(B, 1, S, S) + + def forward( + self, + input_ids: torch.Tensor, # (1, S) int32 + attention_mask: torch.Tensor, # (1, S) fp16 + ) -> torch.Tensor: + # Derive the sequence length from the input, not config — this makes the + # same graph serve both fixed buckets (S == bucket, static) and a flexible + # RangeDim export (S dynamic, GPU). + head_dim = self.config.head_dim + S = input_ids.shape[1] + + # No embedding scaling (Qwen3). Keep residual stream in fp32. + hidden = self.embed_tokens(input_ids).to(torch.float32) + + # RoPE: the cos/sin tables are built once to a FIXED length + # (max_position_embeddings, 32768) so they are byte-identical across every + # bucket — that makes the whole CoreML weight.bin identical across buckets. + # A plain static `cos_cached[:S]` slice would be const-folded back into a + # per-bucket [S, head_dim] constant (verified: it defeats the dedup). To keep + # the slice fold-proof we GATHER rows [0..S-1] using position_ids derived from + # a runtime input (attention_mask), so the indices are runtime-dependent and + # coremltools cannot const-fold the gather. This needs NO new model input. 
+ position_ids = ( + torch.cumsum(torch.ones_like(attention_mask, dtype=torch.float32), dim=1) - 1.0 + ).to(torch.int32) # (1, S) = [[0,1,…,S-1]] + pos = position_ids[0] # (S,) + cos = self.cos_cached.index_select(0, pos).view(1, 1, S, head_dim) + sin = self.sin_cached.index_select(0, pos).view(1, 1, S, head_dim) + mask = self._pad_mask(attention_mask, S) + + for layer in self.layers: + hidden = layer(hidden, cos, sin, mask, S) + + return self.norm(hidden).to(MODEL_DTYPE) + + +class PplxEmbedModel(nn.Module): + """Full pplx-embed plain forward: tokens → pooled embedding. + + output_mode: + "pooled_fp16" — masked-mean → fp16 (readable from the Python bridge; for + fidelity iteration; quantize to int8 downstream). + "int8" — masked-mean → tanh → clamp(round(·*127), −128, 127) → int8 + (the deliverable; native int8 output, read via the Swift harness). + """ + + def __init__(self, config: Qwen3EncoderConfig, output_mode: str = "pooled_fp16"): + super().__init__() + assert output_mode in ("pooled_fp16", "int8") + self.encoder = Qwen3Encoder(config) + self.output_mode = output_mode + + def _masked_mean(self, hidden: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + mask = attention_mask.to(torch.float32).unsqueeze(-1) # (1, L, 1) + summed = (hidden.to(torch.float32) * mask).sum(dim=1) # (1, H) + denom = mask.sum(dim=1).clamp_min(1.0) # (1, 1) + return summed / denom + + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + hidden = self.encoder(input_ids, attention_mask) # (1, L, H) + pooled = self._masked_mean(hidden, attention_mask) # (1, H) fp32 + if self.output_mode == "pooled_fp16": + return pooled.to(MODEL_DTYPE) + # int8 tanh head (matches st_quantize.py: torch.round, qmin=−128). + q = torch.clamp(torch.round(torch.tanh(pooled) * 127.0), -128, 127) + return q.to(torch.int8) + + +class PplxEmbedContextModel(nn.Module): + """Context (late-chunking) forward: encode the whole window once, pool per chunk. 
+ + Inputs: + input_ids (1, L) int32 + attention_mask (1, L) fp16 — 1.0 valid, 0.0 pad + pool_matrix (N_max, L) fp16 — row k = normalized mean weights over chunk k's + token span (1/n_k on the span, else 0); unused rows are all-zero. + Output: + chunk_embeddings (N_max, 1024) — int8 or fp16. Unused rows → 0 vector (skip them). + + Pooling is a single matmul `pool_matrix @ hidden`, so the same encoder serves plain + (one row = 1/n over all valid tokens) and context. See the pool_matrix lesson. + """ + + def __init__(self, config: Qwen3EncoderConfig, output_mode: str = "pooled_fp16", + n_max: int = N_MAX_CHUNKS): + super().__init__() + assert output_mode in ("pooled_fp16", "int8") + self.encoder = Qwen3Encoder(config) + self.output_mode = output_mode + self.n_max = n_max + + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, + pool_matrix: torch.Tensor) -> torch.Tensor: + hidden = self.encoder(input_ids, attention_mask) # (1, L, H) + h = hidden.squeeze(0).to(torch.float32) # (L, H) + pooled = pool_matrix.to(torch.float32) @ h # (N_max, H) + if self.output_mode == "pooled_fp16": + return pooled.to(MODEL_DTYPE) + q = torch.clamp(torch.round(torch.tanh(pooled) * 127.0), -128, 127) + return q.to(torch.int8) + + +# --------------------------------------------------------------------------- # +# Weight loading. +# --------------------------------------------------------------------------- # +_CONV2D_SUFFIXES = ( + ".q_proj.weight", ".k_proj.weight", ".v_proj.weight", ".o_proj.weight", + ".gate_proj.weight", ".up_proj.weight", ".down_proj.weight", +) + + +def _map_weight(hf_name: str) -> str | None: + """Map a pplx-embed checkpoint key → local Qwen3Encoder param name. + + The checkpoint is sentence-transformer style (no `model.` prefix); accept both. 
+ """ + name = hf_name[len("model."):] if hf_name.startswith("model.") else hf_name + if name == "embed_tokens.weight": + return "embed_tokens.weight" + if name == "norm.weight": + return "norm.weight" + if name == "lm_head.weight": + return None # encoder needs no LM head (tied embeddings) + if name.startswith("layers."): + return name # local layout mirrors HF layer naming + return None + + +def load_encoder_weights(encoder: Qwen3Encoder, hf_dir: str) -> None: + """Load pplx-embed weights into a Qwen3Encoder (reshaping projections to Conv2d).""" + import safetensors.torch + + st_files = sorted(f for f in os.listdir(hf_dir) if f.endswith(".safetensors")) + if not st_files: + raise FileNotFoundError(f"No .safetensors in {hf_dir}") + + loaded = 0 + seen: set[str] = set() + for st_file in st_files: + state = safetensors.torch.load_file(os.path.join(hf_dir, st_file)) + for hf_name, tensor in state.items(): + local = _map_weight(hf_name) + if local is None: + continue + tensor = tensor.to(MODEL_DTYPE) + if any(local.endswith(suf) for suf in _CONV2D_SUFFIXES) and tensor.dim() == 2: + tensor = tensor.unsqueeze(-1).unsqueeze(-1) + parts = local.split(".") + target = encoder + for p in parts[:-1]: + target = getattr(target, p) + param = getattr(target, parts[-1]) + if param.shape != tensor.shape: + raise ValueError(f"Shape mismatch {hf_name}->{local}: {param.shape} vs {tensor.shape}") + with torch.no_grad(): + param.copy_(tensor) + loaded += 1 + seen.add(local) + del state + gc.collect() + print(f" loaded {loaded} tensors into Qwen3Encoder from {len(st_files)} file(s)") + return None + + +def apply_fp16_residual_rescale(encoder: Qwen3Encoder, K: float) -> None: + """Shrink the residual stream by 1/K so fp16 lowering doesn't overflow. + + This 28-layer encoder's activations exceed fp16 max (65504) in deep layers — + specifically the `down_proj` accumulation (3072→1024) infs out around layer 19. + coremltools lowers float ops to fp16, so this bites on-device. 
+ + Because Qwen3 is **pre-norm** and every sublayer input goes through a + scale-invariant RMSNorm (and so does the final `norm`), scaling + embed_tokens, every o_proj, every down_proj by 1/K + makes every stored residual and every down_proj accumulation exactly K× + smaller while leaving the pooled embedding mathematically unchanged + (the scale-invariant final norm cancels the 1/K factor). + """ + inv = 1.0 / float(K) + with torch.no_grad(): + encoder.embed_tokens.weight.mul_(inv) + for layer in encoder.layers: + layer.self_attn["o_proj"].weight.mul_(inv) + layer.mlp["down_proj"].weight.mul_(inv) + + +__all__ = [ + "Qwen3EncoderConfig", "Qwen3EncoderLayer", "Qwen3Encoder", + "Qwen3RMSNorm", "make_norm", + "PplxEmbedModel", "PplxEmbedContextModel", "N_MAX_CHUNKS", + "load_encoder_weights", "apply_fp16_residual_rescale", +] diff --git a/conversion/pplx_embed_reference.py b/conversion/pplx_embed_reference.py new file mode 100644 index 0000000..557cd4d --- /dev/null +++ b/conversion/pplx_embed_reference.py @@ -0,0 +1,215 @@ +"""Golden fp32 reference oracle for pplx-embed (Perplexity) on CoreML-LLM. + +This is the ground truth every CoreML fidelity comparison is measured against: + + HF fp32 forward -> masked-mean (plain) / pool_matrix matmul (context) + -> st_quantize int8 / binary / ubinary + +CRITICAL — quantizer parity. We mirror the model's own ``st_quantize.py`` EXACTLY: + + int8 = clamp(round(tanh(x) * 127), -128, 127) # torch.round = HALF-TO-EVEN + binary = where(x >= 0, +1.0, -1.0) # float32 +/-1 + ubinary= packbits(x >= 0) # uint8 [..., dim/8] + +NOTE the two traps, both deliberately followed here: + * **torch.round** (round-half-to-even / banker's), NOT the paper's / the parallel + effort's HALF-UP ``floor(127*tanh+0.5)`` — they differ by +/-1 at exact halves. + * **qmin = -128** (not -127). Note -128 is never actually reached: tanh(x)*127 in + (-127, 127), so round() bottoms out at -127; the -128 clamp is purely defensive. 
+ +Pooling matches the reference ``modeling.py``: + * plain : mean over valid (non-pad) tokens. + * context : late chunking -- encode the whole window once (bidirectional), then + mean-pool each chunk's token span. Chunks are joined with the tokenizer's + sep_token; the SEP token itself and padding are excluded from every chunk. + We express per-chunk pooling as a single matmul with a ``pool_matrix`` so the + same formulation drops straight into the CoreML graph (ANE-friendly); the plain + embed is the degenerate one-chunk case (row 0 = 1/L over all valid tokens). +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + +import numpy as np +import torch + +Quantization = Literal["int8", "binary", "ubinary"] +N_MAX_CHUNKS = 32 + + +# --------------------------------------------------------------------------- # +# Quantizers — bit-for-bit mirrors of st_quantize.py (operate in torch float32). +# --------------------------------------------------------------------------- # +def int8_tanh_quant(x: torch.Tensor | np.ndarray) -> np.ndarray: + """clamp(round(tanh(x) * 127), -128, 127) via torch.round. Returns int8 ndarray.""" + t = torch.as_tensor(x, dtype=torch.float32) + soft = torch.tanh(t) + q = torch.clamp(torch.round(soft * 127.0), -128, 127) + return q.to(torch.int8).cpu().numpy() + + +def binary_tanh_quant(x: torch.Tensor | np.ndarray) -> np.ndarray: + """where(x >= 0, +1.0, -1.0). Returns float32 ndarray of +/-1.""" + t = torch.as_tensor(x, dtype=torch.float32) + return torch.where(t >= 0, 1.0, -1.0).cpu().numpy().astype(np.float32) + + +def ubinary_pack(x: torch.Tensor | np.ndarray) -> np.ndarray: + """packbits(x >= 0) along the last axis. 
Returns uint8 ndarray [..., dim/8].""" + t = torch.as_tensor(x, dtype=torch.float32) + bits = (t.cpu().numpy() >= 0) + return np.packbits(bits, axis=-1) + + +def quantize(x: torch.Tensor | np.ndarray, quantization: Quantization = "int8") -> np.ndarray: + if quantization == "int8": + return int8_tanh_quant(x) + if quantization == "binary": + return binary_tanh_quant(x) + if quantization == "ubinary": + return ubinary_pack(x) + raise ValueError(f"Invalid quantization '{quantization}'; expected int8/binary/ubinary.") + + +# --------------------------------------------------------------------------- # +# Pooling. +# --------------------------------------------------------------------------- # +def masked_mean(hidden: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """[B,L,D] x [B,L] -> [B,D] mean over valid tokens (clamped denom, like modeling.py).""" + m = mask.unsqueeze(-1).to(hidden.dtype) # [B,L,1] + summed = (hidden * m).sum(dim=1) # [B,D] + counts = m.sum(dim=1).clamp(min=1e-9) # [B,1] + return summed / counts + + +def make_pool_matrix(spans: list[tuple[int, int]], L: int, n_max: int = N_MAX_CHUNKS) -> np.ndarray: + """[n_max, L] float32; row k = normalized mean weights over chunk k's [start,end) span. + + Unused rows (>= len(spans)) are all-zero -> tanh(0)=0 -> zero vector; callers MUST + skip them (NaN under cosine). The plain case is one span (0, n_valid).""" + P = np.zeros((n_max, L), dtype=np.float32) + for k, (start, end) in enumerate(spans[:n_max]): + n = end - start + if n > 0: + P[k, start:end] = 1.0 / float(n) + return P + + +def embed_context(hidden: torch.Tensor | np.ndarray, pool_matrix: np.ndarray, + quantization: Quantization = "int8") -> np.ndarray: + """Late-chunking pool + quant: (pool_matrix @ hidden) -> quantize. 
hidden [L,D].""" + h = torch.as_tensor(hidden, dtype=torch.float32) + P = torch.as_tensor(pool_matrix, dtype=torch.float32) + pooled = P @ h # [n_max, D] + return quantize(pooled, quantization) + + +# --------------------------------------------------------------------------- # +# Fidelity helper. +# --------------------------------------------------------------------------- # +def cosine_similarity(a: np.ndarray, b: np.ndarray, eps: float = 1e-8) -> np.ndarray: + """Row-wise cosine for [N,D] arrays. Near-zero-norm rows -> NaN (caller excludes).""" + a = np.asarray(a, dtype=np.float32) + b = np.asarray(b, dtype=np.float32) + if a.ndim == 1: + a, b = a[None], b[None] + na = np.linalg.norm(a, axis=-1) + nb = np.linalg.norm(b, axis=-1) + sim = np.full(a.shape[0], np.nan, dtype=np.float32) + valid = (na > eps) & (nb > eps) + if valid.any(): + sim[valid] = (a[valid] * b[valid]).sum(-1) / (na[valid] * nb[valid]) + return sim + + +# --------------------------------------------------------------------------- # +# The HF fp32 oracle. +# --------------------------------------------------------------------------- # +@dataclass +class _Loaded: + model: torch.nn.Module + tokenizer: object + + +class Reference: + """Loads a pplx-embed checkpoint (fp32, CPU) and produces golden embeddings. 
+ + >>> ref = Reference("perplexity-ai/pplx-embed-v1-0.6b") + >>> ref.embed(["hello world"]).shape # (1, 1024), dtype int8 + """ + + def __init__(self, hf_repo: str = "perplexity-ai/pplx-embed-v1-0.6b", + device: str = "cpu", dtype: torch.dtype = torch.float32): + from transformers import AutoModel, AutoTokenizer + + self.hf_repo = hf_repo + self.device = device + self.dtype = dtype + model = AutoModel.from_pretrained(hf_repo, trust_remote_code=True, dtype=dtype) + model.eval().to(device) + tokenizer = AutoTokenizer.from_pretrained(hf_repo, trust_remote_code=True) + self._l = _Loaded(model=model, tokenizer=tokenizer) + + @property + def model(self) -> torch.nn.Module: + return self._l.model + + @property + def tokenizer(self): + return self._l.tokenizer + + @torch.inference_mode() + def hidden_states(self, texts: list[str]) -> tuple[torch.Tensor, torch.Tensor]: + """Tokenize + bidirectional forward. Returns (last_hidden_state [B,L,D], mask [B,L]).""" + enc = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt") + enc = {k: v.to(self.device) for k, v in enc.items()} + out = self.model(**enc) + return out.last_hidden_state.float(), enc["attention_mask"].float() + + @torch.inference_mode() + def embed(self, texts: list[str], quantization: Quantization = "int8") -> np.ndarray: + """Plain embed: masked-mean over valid tokens -> quantize. Returns [B, 1024].""" + hidden, mask = self.hidden_states(texts) + pooled = masked_mean(hidden, mask) # [B, D] + return quantize(pooled, quantization) + + @torch.inference_mode() + def embed_chunks(self, documents: list[list[str]], + quantization: Quantization = "int8") -> list[np.ndarray]: + """Context embed (late chunking): join chunks with sep_token, encode the whole + window once, mean-pool each chunk's span. Returns one [n_chunks, 1024] array per doc. 
+ + Mirrors modeling.py: SEP tokens and padding are excluded from every chunk.""" + sep = self.tokenizer.sep_token + sep_id = self.tokenizer.sep_token_id + joined = [sep.join(chunks) for chunks in documents] + enc = self.tokenizer(joined, padding=True, truncation=True, return_tensors="pt") + enc = {k: v.to(self.device) for k, v in enc.items()} + out = self.model(**enc) + hidden = out.last_hidden_state.float() # [B,L,D] + input_ids = enc["input_ids"] + mask = enc["attention_mask"] + + results: list[np.ndarray] = [] + for b in range(input_ids.shape[0]): + valid = mask[b].bool() + n_valid = int(valid.sum().item()) + sep_pos = ((input_ids[b] == sep_id) & valid).nonzero(as_tuple=True)[0].tolist() + spans: list[tuple[int, int]] = [] + start = 0 + for sp in sep_pos: + spans.append((start, sp)) # chunk is [start, sep) — SEP excluded + start = sp + 1 + spans.append((start, n_valid)) # final chunk to last valid token + L = hidden.shape[1] + P = make_pool_matrix(spans, L, n_max=len(spans)) + results.append(embed_context(hidden[b], P, quantization)) + return results + + +__all__ = [ + "Reference", "Quantization", "N_MAX_CHUNKS", + "int8_tanh_quant", "binary_tanh_quant", "ubinary_pack", "quantize", + "masked_mean", "make_pool_matrix", "embed_context", "cosine_similarity", +] diff --git a/conversion/test_pplx_embed_parity.py b/conversion/test_pplx_embed_parity.py new file mode 100644 index 0000000..da2aca2 --- /dev/null +++ b/conversion/test_pplx_embed_parity.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""Parity check: ANE Qwen3 encoder (PplxEmbedModel) vs the fp32 golden reference. + +Validates the architecture port *before* CoreML conversion — fast CPU loop, small +fixed seq-len. Compares both the fp16 pooled embedding and the int8 output against +conversion/pplx_embed_reference.py on a small multilingual sample. 
+ +Usage: + python conversion/test_pplx_embed_parity.py # plain, L=64, K=8 + python conversion/test_pplx_embed_parity.py --max-seq-len 128 --rescale-k 16 + +Pass criteria (cosine vs fp32, zero-norm rows excluded): + pooled fp16 ≥ 0.999 (encoder port fidelity) + int8 ≥ 0.997 (foundation gate) +""" +from __future__ import annotations + +import argparse +import os +import sys + +import numpy as np +import torch + +ROOT = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, ROOT) + +import pplx_embed_reference as R # noqa: E402 +from models.qwen3_encoder import ( # noqa: E402 + Qwen3EncoderConfig, + PplxEmbedModel, + load_encoder_weights, + apply_fp16_residual_rescale, +) + +SENTENCES = [ + "hello world", + "Bonjour le monde.", + "東京は日本の首都です。", + "Embeddings are dense vectors.", + "La inteligencia artificial avanza rápido.", + "Das Wetter ist heute schön.", + "机器学习改变世界。", + "Quantum computing uses qubits.", + "Привет, как дела?", + "المعرفة قوة.", + "The mitochondria is the powerhouse of the cell.", + "Tokyo Shanghai Paris Berlin Cairo.", +] + + +def _snapshot_dir(hf_repo: str) -> str: + from huggingface_hub import snapshot_download + return snapshot_download(hf_repo, allow_patterns=["*.json", "*.safetensors", "tokenizer*", "*.txt", "*.py"]) + + +def main() -> int: + ap = argparse.ArgumentParser(description="pplx-embed ANE-encoder parity test") + ap.add_argument("--hf-repo", default="perplexity-ai/pplx-embed-v1-0.6b") + ap.add_argument("--max-seq-len", type=int, default=64) + ap.add_argument("--rescale-k", type=float, default=8.0, + help="fp16 residual rescale factor (0 disables; K=8 is the default sweet spot)") + ap.add_argument("--pooled-gate", type=float, default=0.999) + ap.add_argument("--int8-gate", type=float, default=0.997) + args = ap.parse_args() + + snap = _snapshot_dir(args.hf_repo) + L = args.max_seq_len + K = args.rescale_k or None + + cfg = Qwen3EncoderConfig.from_json(os.path.join(snap, "config.json"), max_seq_len=L) + print(f"[cfg] 
hidden={cfg.hidden_size} layers={cfg.num_hidden_layers} " + f"heads={cfg.num_attention_heads}/{cfg.num_key_value_heads} hd={cfg.head_dim} " + f"theta={cfg.rope_theta} L={L} K={K}") + + m_pool = PplxEmbedModel(cfg, "pooled_fp16").eval() + load_encoder_weights(m_pool.encoder, snap) + m_int8 = PplxEmbedModel(cfg, "int8").eval() + load_encoder_weights(m_int8.encoder, snap) + if K: + apply_fp16_residual_rescale(m_pool.encoder, K) + apply_fp16_residual_rescale(m_int8.encoder, K) + + ref = R.Reference(args.hf_repo) + tok = ref.tokenizer + + cos_pool, cos_int8 = [], [] + for t in SENTENCES: + enc = tok([t], return_tensors="pt", truncation=True, max_length=L) + ids = enc["input_ids"] + n = ids.shape[1] + pid = torch.zeros((1, L), dtype=torch.int32); pid[0, :n] = ids[0].to(torch.int32) + pam = torch.zeros((1, L), dtype=torch.float16); pam[0, :n] = 1.0 + with torch.no_grad(): + o_pool = m_pool(pid, pam).numpy().astype(np.float32) + o_int8 = m_int8(pid, pam).numpy().astype(np.float32) + ref_pool = R.masked_mean(*ref.hidden_states([t])).numpy().astype(np.float32) + ref_int8 = ref.embed([t]).astype(np.float32) + cp = R.cosine_similarity(o_pool, ref_pool)[0] + ci = R.cosine_similarity(o_int8, ref_int8)[0] + cos_pool.append(cp); cos_int8.append(ci) + print(f"[txt] n={n:3d} pooled={cp:.6f} int8={ci:.6f} {t[:28]}") + + cp = np.array(cos_pool); ci = np.array(cos_int8) + pooled_min, int8_min = float(np.nanmin(cp)), float(np.nanmin(ci)) + n_nan = int(np.isnan(cp).sum() + np.isnan(ci).sum()) + print(f"\n[POOLED] mean={np.nanmean(cp):.6f} min={pooled_min:.6f} (gate ≥ {args.pooled_gate})") + print(f"[INT8] mean={np.nanmean(ci):.6f} min={int8_min:.6f} (gate ≥ {args.int8_gate})") + ok = (n_nan == 0) and (pooled_min >= args.pooled_gate) and (int8_min >= args.int8_gate) + print(f"\n{'PASS' if ok else 'FAIL'} (nan={n_nan})") + return 0 if ok else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/conversion/upload_pplx_embed.py b/conversion/upload_pplx_embed.py new file 
mode 100644 index 0000000..09c0b7f --- /dev/null +++ b/conversion/upload_pplx_embed.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +"""Publish the prebuilt pplx-embed CoreML buckets to a single HuggingFace repo. + +End users should **download** the finished `.mlpackage` buckets, not regenerate them +(conversion needs the toolkit + minutes per bucket). This script mirrors the local +bundle layout into one HF repo with **per-bucket subfolders**, so a consumer pulls only +the bucket(s) they need (the Swift downloader fetches an explicit file list — see +`PplxEmbed.load(repo:buckets:)` / `Gemma3BundleDownloader`). + +Single-repo rationale (see docs/PPLX_EMBED.md): the repo convention is one HF repo per +model family with subfolders; each bucket `.mlpackage` embeds the same ~1.1 GB weights +but they are **byte-identical across buckets** (bucket size only changes the traced shape ++ RoPE length), so HF content-addressed LFS stores the blob once — several repos would +not save storage. + +Repo layout (target `<acct>/pplx-embed-coreml`): + L512-int8/ L1024-int8/ … L8192-int8/ dyn8192-int8/ (plain) + context/L512-int8/ … (context variant) + manifest.json README.md +Each bucket subfolder mirrors the local bundle: `encoder.mlpackage/` (or `.mlmodelc/`), +`model_config.json`, `hf_model/` tokenizer json. The upstream `hf_model/*.safetensors` +are excluded (ship only the tokenizer json — matches the other CoreML repos). + +This script does NOT upload directly. It compiles (optionally), stages a clean repo tree +(hardlinks + manifest.json + README.md), ensures the repo exists, and prints the resumable +`hf upload-large-folder` command for you to run — that uploader is parallel, xet-accelerated, +shows realtime progress, and resumes if interrupted (re-run the same command). 
+ +Usage: + # compile + ship both formats, plain + context: + uv run python conversion/upload_pplx_embed.py --repo <acct>/pplx-embed-coreml \ + --plain-dir output/pplx-embed --context-dir output/pplx-embed-context --compile + # then run the printed command, e.g.: + hf upload-large-folder <acct>/pplx-embed-coreml output/pplx-embed-coreml-stage --repo-type=model + # restrict to specific buckets: + uv run python conversion/upload_pplx_embed.py --repo <acct>/pplx-embed-coreml \ + --plain-dir output/pplx-embed --buckets L512-int8 L1024-int8 L2048-int8 +""" +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + +# Files inside a bucket dir to never upload (the re-conversion-only weights). +_EXCLUDE_SUFFIXES = (".safetensors",) + + +def _is_shippable_bucket(bucket_dir: Path) -> bool: + """A bucket ships iff its model_config.json is an int8-output, fp16-weight model. + + Mirrors PplxEmbed.swift's parseBucket filter so we publish exactly the set the + Swift runtime will load (skips pooled_fp16 fidelity bundles + weight-quant probes). + """ + cfg = bucket_dir / "model_config.json" + if not cfg.exists(): + return False + try: + j = json.loads(cfg.read_text()) + except Exception: + return False + return (j.get("output_mode") == "int8" + and (j.get("quantization_weights") or "fp16") == "fp16") + + +def _model_dir_name(bucket_dir: Path) -> str | None: + """Return 'encoder.mlmodelc' if compiled present, else 'encoder.mlpackage'.""" + if (bucket_dir / "encoder.mlmodelc").is_dir(): + return "encoder.mlmodelc" + if (bucket_dir / "encoder.mlpackage").is_dir(): + return "encoder.mlpackage" + return None + + +def _compile_bucket(bucket_dir: Path) -> None: + """Compile encoder.mlpackage → encoder.mlmodelc in-place (skip if present). + + Ship-compiled path: precompiled `.mlmodelc` removes the consumer's first-load + `MLModel.compileModel` step and matches the repo's other CoreML releases. 
CoreML + `.mlmodelc` is loadable across the deployment target's devices (iPhone + Mac). + """ + import subprocess + + mlmodelc = bucket_dir / "encoder.mlmodelc" + pkg = bucket_dir / "encoder.mlpackage" + if mlmodelc.is_dir() or not pkg.is_dir(): + return + subprocess.run(["xcrun", "coremlcompiler", "compile", str(pkg), str(bucket_dir)], + check=True) + + +def _present_model_dirs(bucket_dir: Path) -> list[str]: + """Model dirs present in this bucket (both, if shipping both formats).""" + return [d for d in ("encoder.mlmodelc", "encoder.mlpackage") + if (bucket_dir / d).is_dir()] + + +def _bucket_files(bucket_dir: Path) -> list[str]: + """Relative file paths (POSIX) to ship for one bucket, excluding safetensors. + + Includes every present model dir (so 'ship both' uploads both encoder.mlmodelc + and encoder.mlpackage); the Swift client selectively downloads only one format. + """ + files: list[str] = [] + model_dirs = _present_model_dirs(bucket_dir) + if not model_dirs: + return files + for model_dir in model_dirs: + for p in sorted((bucket_dir / model_dir).rglob("*")): + if p.is_file() and not p.name.endswith(_EXCLUDE_SUFFIXES): + files.append(p.relative_to(bucket_dir).as_posix()) + # model_config.json + tokenizer json (exclude any safetensors defensively). + if (bucket_dir / "model_config.json").is_file(): + files.append("model_config.json") + hf = bucket_dir / "hf_model" + if hf.is_dir(): + for p in sorted(hf.rglob("*")): + if p.is_file() and not p.name.endswith(_EXCLUDE_SUFFIXES): + files.append(p.relative_to(bucket_dir).as_posix()) + return files + + +def _discover_buckets(plain_dir: Path | None, context_dir: Path | None, + only: set[str] | None) -> list[tuple[str, Path]]: + """Return [(repo_subfolder, local_bucket_dir)] for every shippable bucket. + + Plain buckets map to their dirname (e.g. 'L512-int8'); context buckets are + prefixed with 'context/' (e.g. 'context/L512-int8'). 
+ """ + out: list[tuple[str, Path]] = [] + if plain_dir and plain_dir.is_dir(): + for d in sorted(p for p in plain_dir.iterdir() if p.is_dir()): + if only and d.name not in only: + continue + if _is_shippable_bucket(d): + out.append((d.name, d)) + if context_dir and context_dir.is_dir(): + for d in sorted(p for p in context_dir.iterdir() if p.is_dir()): + if only and d.name not in only: + continue + if _is_shippable_bucket(d): + out.append((f"context/{d.name}", d)) + return out + + +def _build_manifest(buckets: list[tuple[str, Path]], repo: str) -> dict: + # Size-only manifest: the Swift `load(repo:)` derives download globs from each + # bucket's subfolder + formats, and the HF Swift Hub client's content-addressed + # cache dedups the byte-identical weight.bin by etag on download — so no per-file + # sha is needed here (and staging stays instant, no ~14 GB hash). + base_url = f"https://huggingface.co/{repo}/resolve/main" + entries = [] + total = 0 + for subfolder, bucket_dir in buckets: + cfg = json.loads((bucket_dir / "model_config.json").read_text()) + files_meta = [] + for rel in _bucket_files(bucket_dir): + p = bucket_dir / rel + size = p.stat().st_size + total += size + repo_path = f"{subfolder}/{rel}" + files_meta.append({ + "path": repo_path, + "url": f"{base_url}/{repo_path}", + "size_bytes": size, + }) + formats = [d.split(".", 1)[1] for d in _present_model_dirs(bucket_dir)] + entries.append({ + "subfolder": subfolder, + "variant": cfg.get("variant", "plain"), + "bucket": cfg.get("bucket"), + "dynamic": bool(cfg.get("dynamic", False)), + "dynamic_upper": cfg.get("dynamic_upper", 0), + "max_seq_len": cfg.get("max_seq_len"), + "norm_impl": cfg.get("norm_impl", "ane_cat"), + "formats": formats, # e.g. ["mlmodelc", "mlpackage"]; Swift picks one + "files": files_meta, + }) + # Aggregate of the formats actually shipped: "both", "mlmodelc", or "mlpackage". 
+ all_formats = {f for _s, d in buckets for f in + (x.split(".", 1)[1] for x in _present_model_dirs(d))} + fmt = "both" if all_formats == {"mlmodelc", "mlpackage"} else next(iter(all_formats), "mlpackage") + return { + "model_id": Path(repo).name, + "repo": repo, + "format": fmt, + "buckets": entries, + "total_size": total, + } + + +def _readme(repo: str, manifest: dict) -> str: + rows = [] + for b in manifest["buckets"]: + n = sum(f["size_bytes"] for f in b["files"]) + kind = "dynamic GPU catch-all" if b["dynamic"] else "fixed ANE bucket" + rows.append(f"| `{b['subfolder']}/` | {b['variant']} | {b['bucket']} | " + f"{kind} | {n / 1e9:.2f} GB |") + table = "\n".join(rows) + return f"""\ +--- +language: multilingual +license: apache-2.0 +base_model: perplexity-ai/pplx-embed-v1-0.6b +tags: + - coreml + - apple-neural-engine + - qwen3 + - sentence-embedding + - on-device +library_name: coreml +--- + +# pplx-embed for Apple CoreML (ANE-optimized) + +CoreML conversion of Perplexity's +[`pplx-embed-v1-0.6b`](https://huggingface.co/perplexity-ai/pplx-embed-v1-0.6b) +(a bidirectional Qwen3-0.6B encoder → masked-mean pool → tanh-int8 head) produced with +the [CoreML-LLM](https://github.com/john-rocky/CoreML-LLM) pipeline. Targets macOS 26. + +Each subfolder is a **fixed-shape sequence-length bucket** that stays resident on the +Apple Neural Engine (flexible shapes force CPU fallback). At runtime the Swift package +pads each input to the smallest bucket that fits; inputs longer than the largest fixed +bucket fall through to the `dyn*-int8/` flexible GPU catch-all. The encoder uses native +RMSNorm and a single fixed RoPE table — the ANE-fastest path on M4 Max / macOS 26. + +## Buckets in this repo + +| Subfolder | Variant | Bucket (L) | Kind | Size | +|---|---|---|---|---| +{table} + +The encoder `weight.bin` is **byte-identical across every bucket** (a single fixed-size +RoPE table makes the weights independent of bucket length). 
So HF stores the weight blob +**once**, and the HF content-addressed cache fetches it **once by etag** on download — +pulling several buckets costs ~1.15 GB total, not ~1.15 GB × N. + +## Use it + +Via the [CoreML-LLM Swift package](https://github.com/john-rocky/CoreML-LLM). It uses the +HF Swift Hub client, so only the buckets you request are downloaded and the shared weight +is fetched once into the content-addressed cache: + +```swift +import CoreMLLLM +let embedder = try await PplxEmbed.load( + repo: "{repo}", + buckets: [512, 1024, 2048]) // shared HF cache; weight fetched once by etag +let vecs = try embedder.embed(["On-device embeddings", "Bonjour le monde"]) // [[Int8]] +``` + +Each bucket is published in both `.mlpackage` and precompiled `.mlmodelc`; pass +`preferCompiled: false` for the portable package. Or download the bundle directory +yourself and load it with `load(bundleDir:)`. + +## I/O contract (per bucket `model_config.json`) + +- `input_ids (1, L) int32`, `attention_mask (1, L) fp16` (1.0 valid, 0.0 pad) +- `embedding (1, 1024) int8` — `clamp(round(tanh(x)*127), -128, 127)`; derive + `binary`/`ubinary` from the int8 sign (see `PplxEmbed`). + +## License + +Inherits the base model's [license](https://huggingface.co/perplexity-ai/pplx-embed-v1-0.6b). +""" + + +def _stage_repo_tree(buckets: list[tuple[str, Path]], stage_dir: Path, + manifest: dict, repo: str) -> None: + """Build a clean tree mirroring the repo layout via hardlinks (no copy). + + `hf upload-large-folder` mirrors a local folder to the repo root, so we stage one: + each shippable file is **hardlinked** to its real source under + `stage_dir/<subfolder>/<rel>`, plus manifest.json + README.md. Hardlinks are + indistinguishable from real files to the uploader (no symlink-following caveat) and + cost no extra disk (same inode); we fall back to symlink then copy if hardlinking + fails (e.g. cross-filesystem). Re-running rebuilds the tree from scratch. 
+ """ + import shutil + if stage_dir.exists(): + shutil.rmtree(stage_dir) + stage_dir.mkdir(parents=True) + for subfolder, bucket_dir in buckets: + for rel in _bucket_files(bucket_dir): + src = (bucket_dir / rel).resolve() + dst = stage_dir / subfolder / rel + dst.parent.mkdir(parents=True, exist_ok=True) + try: + os.link(src, dst) # hardlink — zero copy, real-file semantics + except OSError: + try: + os.symlink(src, dst) + except OSError: + shutil.copy2(src, dst) + (stage_dir / "manifest.json").write_text(json.dumps(manifest, indent=2)) + (stage_dir / "README.md").write_text(_readme(repo, manifest)) + + +def main() -> int: + ap = argparse.ArgumentParser(description="Upload pplx-embed CoreML buckets to HF") + ap.add_argument("--repo", required=True, help="Target HF repo id (e.g. acct/pplx-embed-coreml)") + ap.add_argument("--plain-dir", default="output/pplx-embed", + help="Local dir of plain buckets (Lxxxx-int8/, dynNNNN-int8/)") + ap.add_argument("--context-dir", default=None, + help="Local dir of context buckets (uploaded under context/)") + ap.add_argument("--buckets", nargs="*", default=None, + help="Restrict to these bucket dir names (e.g. 
L512-int8 L1024-int8)") + ap.add_argument("--compile", action="store_true", + help="Compile each bucket's encoder.mlpackage → encoder.mlmodelc and ship " + "BOTH formats (no on-device compile for consumers; .mlmodelc+.mlpackage " + "share a bucket's weight.bin so this does not double the payload).") + ap.add_argument("--stage-dir", default=None, + help="Where to build the upload tree (default: /../pplx-embed-coreml-stage)") + ap.add_argument("--no-create-repo", action="store_true", + help="Don't create the HF repo (the upload command will).") + args = ap.parse_args() + + plain_dir = Path(args.plain_dir).resolve() if args.plain_dir else None + context_dir = Path(args.context_dir).resolve() if args.context_dir else None + only = set(args.buckets) if args.buckets else None + + buckets = _discover_buckets(plain_dir, context_dir, only) + if not buckets: + print("No shippable buckets found (need int8-output, fp16-weight model_config.json).") + return 1 + + if args.compile: + print("Compiling buckets → encoder.mlmodelc …") + for _subfolder, d in buckets: + _compile_bucket(d) + + print(f"Discovered {len(buckets)} bucket(s) for {args.repo}:") + for subfolder, d in buckets: + n = len(_bucket_files(d)) + print(f" {subfolder} ({n} files, {_model_dir_name(d)}) ← {d}") + + manifest = _build_manifest(buckets, args.repo) + total_gb = manifest["total_size"] / 1e9 + print(f"\nTotal payload (pre-LFS-dedup): {total_gb:.2f} GB across " + f"{sum(len(b['files']) for b in manifest['buckets'])} files") + + # Stage a clean tree (symlinks) mirroring the repo + manifest.json + README.md. + out_base = plain_dir.parent if plain_dir else Path.cwd() + stage_dir = Path(args.stage_dir).resolve() if args.stage_dir \ + else (out_base / "pplx-embed-coreml-stage") + _stage_repo_tree(buckets, stage_dir, manifest, args.repo) + print(f"\nStaged repo tree → {stage_dir} (symlinks + manifest.json + README.md)") + + # Ensure the repo exists so the resumable uploader can push straight to it. 
+ token = os.environ.get("HF_TOKEN") or None + from huggingface_hub import HfApi, create_repo + if token is None: + try: + print(f"Using cached HF login: {HfApi().whoami().get('name')}") + except Exception: + print("NOTE: no HF_TOKEN and no cached login — run `huggingface-cli login` " + "before uploading.") + if not args.no_create_repo: + try: + create_repo(args.repo, repo_type="model", exist_ok=True, token=token) + print(f"Repo ready: https://huggingface.co/{args.repo}") + except Exception as e: + print(f"create_repo skipped ({str(e)[:80]}) — the upload command will create it.") + + print("\nNow run this to upload — resumable, parallel, xet-accelerated, realtime " + "progress (re-run the SAME command to resume if interrupted):\n") + print(f" hf upload-large-folder {args.repo} {stage_dir} --repo-type=model\n") + print("Weights dedupe across buckets: the encoder uses a single fixed RoPE table, so " + "every plain bucket (and its .mlmodelc+.mlpackage) shares ONE ~1.15 GB weight.bin; " + "the context variant is a second blob. HF LFS stores each unique blob once, so the " + "real upload is ~2 weight blobs regardless of how many buckets you ship. Restrict " + "buckets by re-running this script with e.g. --buckets L512-int8 L1024-int8.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/conversion/upload_pplx_embed_dedup.py b/conversion/upload_pplx_embed_dedup.py new file mode 100644 index 0000000..09b0bd4 --- /dev/null +++ b/conversion/upload_pplx_embed_dedup.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +"""Guaranteed-minimal-transfer upload of the staged pplx-embed repo tree. + +The staged tree has byte-identical `weight.bin`s across buckets (single fixed RoPE +table), but `hf upload-large-folder` re-uploads identical oids across its parallel +batches, so dedup "doesn't work" and you push ~14 GB instead of ~2.3 GB. 
+ +This uploads each UNIQUE file content exactly once, then uses HF **server-side copy** +(`CommitOperationCopy`) to materialize every duplicate path from the already-uploaded +blob — no re-upload. Net transfer = the unique blobs only (the 2 weight blobs + the +unique small/tokenizer files). + +Run (stop any in-flight `hf upload-large-folder` first): + HF_HUB_DISABLE_XET=1 uv run python conversion/upload_pplx_embed_dedup.py \ + --repo dokterbob/pplx-embed-coreml \ + --stage output/pplx-embed-coreml-stage +""" +from __future__ import annotations + +import argparse +import hashlib +import os +from pathlib import Path + +# Duplicates at/above this size are materialized via server-side Copy (no re-upload). +# Only the ~1.15 GB weight.bin clears this bar — and it is always LFS, so Copy is safe. +# Smaller duplicates (tokenizer, graph files) are just re-added; that upload is tiny and +# avoids any "Copy a non-LFS file" edge case. +_LFS_MIN = 50 * 1024 * 1024 # 50 MB + + +def _sha256(path: Path, chunk: int = 1 << 20) -> str: + h = hashlib.sha256() + with open(path, "rb") as f: + while True: + b = f.read(chunk) + if not b: + break + h.update(b) + return h.hexdigest() + + +def main() -> int: + ap = argparse.ArgumentParser(description="Dedup-minimal upload of the pplx-embed stage") + ap.add_argument("--repo", required=True) + ap.add_argument("--stage", default="output/pplx-embed-coreml-stage") + args = ap.parse_args() + + stage = Path(args.stage).resolve() + if not stage.is_dir(): + print(f"stage dir not found: {stage}") + return 1 + + from huggingface_hub import ( + HfApi, create_repo, CommitOperationAdd, CommitOperationCopy, + ) + + # Walk the staged tree; group repo paths by content sha (the dedup key). + print("Hashing staged files (local; finds the unique blobs) …") + entries: list[tuple[str, Path, str, int]] = [] # (repo_path, local, sha, size) + for root, dirs, fns in os.walk(stage): + # Skip dot-dirs (e.g. 
`.cache/huggingface/` that `hf upload-large-folder` + # writes into the folder for resume tracking) — HF rejects `.cache/` paths. + dirs[:] = [d for d in dirs if not d.startswith(".")] + for fn in fns: + if fn.startswith("."): + continue + lp = Path(root) / fn + rp = lp.relative_to(stage).as_posix() + entries.append((rp, lp, _sha256(lp), lp.stat().st_size)) + + canonical: dict[str, str] = {} # sha -> first repo_path (the uploaded copy) + adds: list = [] # unique content to upload + copies: list = [] # dups → server-side copy + readds: list = [] # tiny dups → just re-add (cheap) + for rp, lp, sha, size in sorted(entries, key=lambda e: e[0]): + if sha not in canonical: + canonical[sha] = rp + adds.append(CommitOperationAdd(path_in_repo=rp, path_or_fileobj=str(lp))) + elif size >= _LFS_MIN: + copies.append(CommitOperationCopy(src_path_in_repo=canonical[sha], path_in_repo=rp)) + else: + readds.append(CommitOperationAdd(path_in_repo=rp, path_or_fileobj=str(lp))) + + uniq_gb = sum(e[3] for e in entries if canonical[e[2]] == e[0]) / 1e9 + total_gb = sum(e[3] for e in entries) / 1e9 + print(f" {len(entries)} files → {len(adds)} unique to upload " + f"({uniq_gb:.2f} GB transferred) + {len(copies)} server-side copies " + f"+ {len(readds)} tiny re-adds. (apparent total {total_gb:.1f} GB)") + + token = os.environ.get("HF_TOKEN") or None + if token is None: + try: + print(f"Using cached HF login: {HfApi().whoami().get('name')}") + except Exception: + print("ERROR: no HF_TOKEN and no cached login (`huggingface-cli login`).") + return 1 + api = HfApi(token=token) + create_repo(args.repo, repo_type="model", exist_ok=True, token=token) + + # Commit 1: every unique blob + the tiny duplicates (these establish the copy sources). 
+ print(f"\n[1/2] Uploading {len(adds) + len(readds)} unique/small files " + f"(~{uniq_gb:.2f} GB over the wire) …") + api.create_commit(args.repo, operations=adds + readds, repo_type="model", + commit_message="upload unique blobs + small files (deduped)") + + # Commit 2: server-side copy the large duplicates from their canonical path. + if copies: + print(f"[2/2] Server-side copying {len(copies)} duplicate weight blobs " + f"(no re-upload) …") + api.create_commit(args.repo, operations=copies, repo_type="model", + commit_message="server-side copy deduped weight.bin across buckets") + + print(f"\n✅ Done. https://huggingface.co/{args.repo}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/docs/ANE_RMSNORM_FOLLOWUP.md b/docs/ANE_RMSNORM_FOLLOWUP.md new file mode 100644 index 0000000..418353e --- /dev/null +++ b/docs/ANE_RMSNORM_FOLLOWUP.md @@ -0,0 +1,45 @@ +# Follow-up: native RMSNorm for the shared `ane_ops.ANERMSNorm` (all decoder families) + +**Status:** proposed, **not implemented**. Needs broad re-validation before any change. + +## What we found (pplx-embed only) + +`conversion/ane_ops.ANERMSNorm` implements RMSNorm via the `cat([x, −x]) → LayerNorm → +chunk` identity, chosen years ago because "the ANE has a highly optimized LayerNorm +kernel" and lacked a fast native `rsqrt`. 
On the pplx-embed bidirectional Qwen3 encoder +we A/B'd that against a native `x * rsqrt(mean(x²) + eps) * w` RMSNorm +(`conversion/models/qwen3_encoder.Qwen3RMSNorm`, selectable via `norm_impl`), changing +**only** the 5 norm sites and holding Conv2d-1×1 projections and tensor layout fixed +(`conversion/experiment_ane_rmsnorm.py`): + +| L | ane_cat (cat/chunk) | native (rsqrt) | speedup | ANE residency | cosine vs fp32 | +|-----|--------------------:|---------------:|--------:|--------------:|---------------:| +| 256 | 35.98 ms | 31.92 ms | **+12.7%** | 99.81% (both) | 0.99998 | +| 512 | 100.40 ms | 82.61 ms | **+21.5%** | 99.81% (both) | 0.99998 | + +Environment: Apple M4 Max, macOS 26, coremltools 9, torch 2.11, B=1, K=8 residual +rescale, `pooled_fp16`. Native RMSNorm stays **fully ANE-resident** (the planner runs +`pow`/`reduce_mean`/`rsqrt` on the ANE here) and is fidelity-neutral. The cat/chunk trick +is now a *de-optimization* on this chip/OS/coremltools combination. + +## Why this is only applied to pplx-embed so far + +`ane_ops.ANERMSNorm` is shared by ~10 decoder families (Gemma3/4, LFM2, Qwen3.5, +Qwen3-VL, `base_model`). pplx-embed is a **bidirectional encoder, B=1, fp32 residual, +fixed full-attention** — a different regime from the **stateful causal decoders** (KV +cache, T=1 decode + T=32 prefill, sliding/full sandwich norms, the `(1+w)` gain +convention). The win may or may not carry over; a per-op kernel choice that helps a +1×L encoder pass need not help a chunked decode step. + +## Proposed work (separate PR) + +1. Add a `norm_impl` (or `native_rmsnorm=True`) switch to `ane_ops.ANERMSNorm` + **without changing its default**, mirroring `Qwen3RMSNorm` (store the same 1-D weight; + keep the `plus_one_gain` convention in `ane_norm_from_hf`). +2. A/B per family with that family's existing latency harness (e.g. 
+ `probe_e2e_linear_latency.py`-style), measuring decode **and** prefill, ANE residency, + and end-to-end output parity — *not* just a single encoder pass. +3. Flip the shared default to native **only** for families where it is faster *and* + residency/parity hold; leave the others on `ane_cat`. + +Do **not** flip the shared default globally off the pplx-embed result alone. diff --git a/docs/PPLX_EMBED.md b/docs/PPLX_EMBED.md new file mode 100644 index 0000000..6a4b02f --- /dev/null +++ b/docs/PPLX_EMBED.md @@ -0,0 +1,128 @@ +# pplx-embed — Perplexity embedding models on the ANE + +Adds a **bidirectional Qwen3 encoder** path that converts Perplexity's pplx-embed models to CoreML +and runs them on the Apple Neural Engine (macOS Tahoe / `macOS26`): + +- `perplexity-ai/pplx-embed-v1-0.6b` — **plain** sentence embeddings (mean-pool → 1024-d int8). +- `perplexity-ai/pplx-embed-context-v1-0.6b` — **late chunking** (per-chunk embeddings via a + `pool_matrix` matmul; one encoder pass over the whole window). + +The encoder is a 28-layer bidirectional Qwen3-0.6B (GQA 16/8, head_dim 128, SwiGLU, QK-norm, +RoPE θ=1e6) built on the existing ANE primitives (Conv2d-1×1 projections, native RMSNorm, +`repeat_kv_ane`, `stable_attention`). Output matches the model's own `st_quantize.py` exactly: +int8 = `clamp(round(tanh(x)·127), −128, 127)` (`torch.round`, half-to-even), plus `binary` +(sign) and `ubinary` (packbits). 
+ +## Files + +| what | where | +|---|---| +| Model registry | `conversion/config.py` → `pplx-embed`, `pplx-embed-context` | +| Encoder | `conversion/models/qwen3_encoder.py` | +| Bundle builder | `conversion/build_pplx_embed_bundle.py` | +| Golden fp32 reference (oracle) | `conversion/pplx_embed_reference.py` | +| Parity test | `conversion/test_pplx_embed_parity.py` | +| ANE RMSNorm A/B | `conversion/experiment_ane_rmsnorm.py` | +| HF uploader | `conversion/upload_pplx_embed.py` (single repo, per-bucket subfolders) | +| Swift runtime | `Sources/CoreMLLLM/PplxEmbed.swift` (+ `pplx-embed-demo`, `pplx-embed-bench`) | + +## Build + +```bash +# A fixed-shape ANE bucket (the fast path), plain int8 output: +python conversion/build_pplx_embed_bundle.py --model pplx-embed --max-seq-len 512 +# Context (late chunking) variant: +python conversion/build_pplx_embed_bundle.py --model pplx-embed-context --max-seq-len 512 +# The flexible GPU catch-all for inputs larger than the biggest bucket (up to 8192): +python conversion/build_pplx_embed_bundle.py --model pplx-embed --dynamic-upper 8192 +``` + +Verify fidelity against the fp32 reference (CPU, fast): + +```bash +python conversion/test_pplx_embed_parity.py # pooled ≥0.999, int8 ≥0.997 +``` + +## Use (Swift) + +```swift +let embedder = try await PplxEmbed.load(bundleDir: URL(fileURLWithPath: "output/pplx-embed")) +let vectors = try embedder.embed(["hello world", "bonjour le monde"]) // [[Int8]] (1024-d) +// also: embedBinary / embedUBinary; embedContext([[String]]) for late chunking +``` + +`embed()` tokenizes, selects the **smallest fixed bucket** that fits, pads/masks, and runs on the +ANE. Inputs larger than the biggest bucket are routed to the flexible RangeDim model on the GPU +(non-padded). Run the CLI demo with `swift run -c release pplx-embed-demo --bundle-dir output/pplx-embed --text "…"`. 
+ +### Download prebuilt models from Hugging Face + +End users can **download** the prebuilt CoreML buckets instead of regenerating them (conversion +needs the toolkit + minutes per bucket). The buckets live in one repo with per-bucket subfolders +(`<acct>/pplx-embed-coreml`; final id confirmed at publish time) plus a `manifest.json` +inventory. `PplxEmbed.load(repo:)` reads the manifest and **selectively** downloads only the +requested buckets (+ the dynamic catch-all) — never the whole repo: + +```swift +let embedder = try await PplxEmbed.load( + repo: "<acct>/pplx-embed-coreml", + buckets: [512, 1024, 2048], // only these subfolders + tokenizer are fetched + into: appSupportDir) // preferCompiled: true → pulls .mlmodelc (no on-device compile) +let vectors = try embedder.embed(["hello world"]) +``` + +Each bucket is published in **both** formats — precompiled `.mlmodelc` (default; no on-device +compile) and the portable `.mlpackage` (`preferCompiled: false`). The repo hosts both, but the +client downloads only one format's ~1.1 GB weights per bucket. The demo takes `--repo`: +`swift run -c release pplx-embed-demo --repo <acct>/pplx-embed-coreml --buckets 512 --text "…"`. + +Publish with `conversion/upload_pplx_embed.py` (single repo, per-bucket subfolders, `--compile` to +ship both `.mlmodelc`+`.mlpackage`): it compiles, **stages a clean repo tree** (hardlinks + +`manifest.json` + README card), ensures the repo exists, and prints a resumable +`hf upload-large-folder <repo> <stage-dir> --repo-type=model` command (parallel, xet-accelerated, +realtime progress; re-run to resume). Every bucket's `weight.bin` is now **byte-identical** (the +RoPE cos/sin tables are built once to a fixed length — `max_position_embeddings`, 32768 — and +gathered to `S` at runtime via `position_ids` derived from `attention_mask`, so they no longer scale +with `max_seq_len`; verified L512≡L1024 by sha256). 
So HF LFS stores the ~1.2 GB blob **once** across +all buckets (was ~7 GB for 6 buckets), and `.mlmodelc`↔`.mlpackage` within a bucket still dedup too. +The runtime gather is fold-proof — a plain static `[:S]` slice gets const-folded back to a per-bucket +constant — and needs no new model input / Swift change. Fidelity unchanged (CoreML L512 cosine vs the +fp32 oracle 0.99996; ANE residency 99.3%). + +## Design notes + +- **Fixed-shape buckets, one `.mlpackage` per bucket.** Flexible shapes (EnumeratedShapes/RangeDim) + force CPU fallback on the ANE and are ~10× slower; fixed buckets stay 99.8% on the ANE. Pad each + input to the smallest fitting bucket. Latency is O(L²) with a sharp knee at L=1024→2048. +- **Flexible GPU model is the >max-bucket catch-all only.** Built with `--dynamic-upper N` + (RangeDim 1..N), it runs on the GPU non-padded for unbounded length — correct (cos 0.999) but + ~10× slower than a fixed bucket, so it's used only when no bucket fits. +- **L=8192 is NOT an ANE bucket — the largest fixed ANE bucket is 4096.** A fixed L=8192 bucket + *statically* plans to 99.81% ANE, but the ANE **runtime fails to execute it** + (`ANEProgramProcessRequestDirect status=0x15: Program Inference error`, + `conversion/measure_l8192_bucket.py`): at 8192 the full bidirectional-attention intermediates + (16 heads × 8192² fp16 ≈ 2 GB per score tensor) exceed ANE buffer limits, and the ANE graph + compile itself takes ~25 min. So inputs of 4097–8192 tokens stay on the **dynamic GPU + catch-all** (which already covers them). `chunk-and-pool` to stay within a bucket would change + plain-embedding semantics, so it's a separate design question, not a drop-in. +- **Native RMSNorm (`norm_impl="native"`, the default).** The 5 encoder norm sites use native + `x·rsqrt(mean(x²)+eps)·w` rather than the shared `ane_ops.ANERMSNorm` cat([x,−x])→LayerNorm + trick. 
The trick predates a fast native ANE rsqrt; on M4 Max / macOS 26 / coremltools 9 native + is **12–21% faster** on the ANE at identical 99.81% residency and cosine 0.99998 + (`conversion/experiment_ane_rmsnorm.py`; see [`PPLX_EMBED_GPU_RESIDENCY.md`](PPLX_EMBED_GPU_RESIDENCY.md) + and [`ANE_RMSNORM_FOLLOWUP.md`](ANE_RMSNORM_FOLLOWUP.md)). Build with `--norm-impl ane_cat` to + fall back. This is local to the pplx-embed encoder — the shared decoder `ANERMSNorm` is untouched. +- **fp16 residual rescale (K=8).** The 28-layer `down_proj` accumulation overflows fp16; scaling + `embed_tokens`/`o_proj`/`down_proj` by 1/K is exact for a pre-norm net (scale-invariant norms) + and keeps activations in range. K=8 is the fidelity/overflow sweet spot. +- **macOS26 native int8 output** is not readable from the Python CoreML bridge; read it in Swift + (the `pplx-embed-bench` harness does). Fidelity is otherwise measured via a `pooled_fp16`-output + variant in Python. +- **Throughput:** ANE batch-1 at the smallest bucket is both the lowest-latency and + highest-throughput path; batching is not a useful lever on CoreML (see below). + +See [`PPLX_EMBED_W8A8.md`](PPLX_EMBED_W8A8.md) (weight/activation quantization is not viable for +this model), [`PPLX_EMBED_BATCHING.md`](PPLX_EMBED_BATCHING.md) (batching is not a useful +throughput lever — the ANE is batch-1 by design), and +[`PPLX_EMBED_GPU_RESIDENCY.md`](PPLX_EMBED_GPU_RESIDENCY.md) (why the GPU `CPU_AND_GPU` path has +low GPU residency — an inherent CoreML partitioner behavior at B=1, not a fixable issue). diff --git a/docs/PPLX_EMBED_BATCHING.md b/docs/PPLX_EMBED_BATCHING.md new file mode 100644 index 0000000..8182341 --- /dev/null +++ b/docs/PPLX_EMBED_BATCHING.md @@ -0,0 +1,205 @@ +# CoreML batching throughput — pplx-embed encoder (Apple Silicon) + +**Question.** Does CoreML batching (B>1) raise throughput for the pplx-embed +bidirectional Qwen3-0.6B encoder on Apple Silicon? 
An earlier quick test (L=512, +pooled_fp16, warm) showed FLAT docs/sec across B — ANE ~9/s, GPU ~4.7/s. Is that +real, and why? (All numbers below are on an **Apple M4 Max**, macOS 26.5.1.) + +**Verdict (bottom line).** The flat result is **real, not a measurement bug**, but +it is **device-specific**: + +- **ANE (`CPU_AND_NE`) does NOT batch — it gets *worse* with B.** Per-doc latency + is flat-to-rising; throughput *drops* to ~0.68–0.71× of B=1 at L=128. The ANE is + a batch-1-oriented fixed-function accelerator: it serializes batch rows and adds + per-row overhead. Confirmed. +- **GPU (`CPU_AND_GPU`) batches, but only modestly and only when one sequence + under-fills the GPU (small L).** At L=128 it gains up to **1.44×** (B=16); at + L=512 a single sequence already saturates the GPU, so batching is flat (~1.0×, + even regressing to 0.92× at B=64). +- **CPU/BLAS (`CPU_ONLY`) batches the most — up to ~1.6× at L=128** (B=16), ~1.1× + at L=512. This is the Accelerate/BLAS GEMM batching control behaving as expected, + and it is the largest batch win of the three backends — but it is off a slow + baseline, so in *absolute* docs/sec it never beats batch-1 ANE. + +**So: does CoreML batching help this encoder? Only marginally (~1.4–1.6× on +GPU/CPU at short sequences, nothing — worse — on the ANE).** The reason is +architectural, not a bug: the fast path (ANE) is the one backend that fundamentally +can't batch (it serializes the batch axis), and the backends that *can* batch +(GPU/CPU) are the slow paths whose batch gains are small and saturate by L≈512. + +--- + +## Setup + +- Machine: **Apple M4 Max**, macOS 26.5.1 (arm64). coremltools 9.0. (Numbers are + machine-specific; the *qualitative* conclusions — ANE doesn't batch, GPU saturates + by L≈512 — should generalize, but absolute throughput will differ on other chips.) 
+- Model: `PplxEmbedModel(cfg, output_mode="pooled_fp16")`, fp16 residual rescale K=8, + traced+converted at shape **(B, L)** per cell, `minimum_deployment_target=macOS26`, + converted with `compute_units=ALL`, then **loaded** under each compute-unit setting + (`CPU_AND_NE`, `CPU_AND_GPU`, `CPU_ONLY`) so each backend is forced. +- Timing: each (L, B, unit) shape **warmed** (3 predicts) then **median of 8** timed + predicts. Inputs are fp16 all-ones mask, random int32 ids. +- `per-doc latency = batch_latency / B`, `docs/sec = B / batch_latency`. +- Script: `conversion/experiment_batching.py` (parametrized, reproducible). +- A second independent `--quick` run (runs=6) reproduced the ANE numbers within noise + (B=1 L=128: 72 vs 71 docs/s; B=1 L=512: 10.4 vs 9.9 docs/s), confirming stability. + +## docs/sec (rows = B, cols = compute unit) + +### L=128 +| B | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY | +|----|-----------:|------------:|---------:| +| 1 | 71.18 | 18.37 | 20.87 | +| 4 | 49.06 | 20.78 | 28.30 | +| 16 | 50.75 | 26.41 | 33.37 | +| 64 | 48.70 | 23.87 | 32.37 | + +### L=512 +| B | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY | +|----|-----------:|------------:|---------:| +| 1 | 9.94 | 4.58 | 6.29 | +| 4 | 9.06 | 4.95 | 6.91 | +| 16 | 8.98 | 4.66 | 6.96 | +| 64 | 7.32 | 4.20 | 6.86 | + +## per-doc latency (ms) — the key view (flat = no batch gain) + +### L=128 +| B | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY | +|----|-----------:|------------:|---------:| +| 1 | 14.048 | 54.425 | 47.918 | +| 4 | 20.383 | 48.116 | 35.336 | +| 16 | 19.706 | 37.863 | 29.963 | +| 64 | 20.532 | 41.886 | 30.892 | + +### L=512 +| B | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY | +|----|-----------:|------------:|---------:| +| 1 | 100.583 | 218.390 | 159.103 | +| 4 | 110.398 | 201.895 | 144.813 | +| 16 | 111.340 | 214.736 | 143.760 | +| 64 | 136.588 | 238.156 | 145.773 | + +## batch speedup = docs/s(B) / docs/s(B=1) + +### L=128 +| B | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY | 
+|----|-----------:|------------:|---------:| +| 4 | 0.69× | 1.13× | 1.36× | +| 16 | 0.71× | **1.44×** | **1.60×** | +| 64 | 0.68× | 1.30× | 1.55× | + +### L=512 +| B | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY | +|----|-----------:|------------:|---------:| +| 4 | 0.91× | 1.08× | 1.10× | +| 16 | 0.90× | 1.02× | 1.11× | +| 64 | 0.74× | 0.92× | 1.09× | + +--- + +## Reading the data + +1. **ANE: per-doc latency is flat-to-rising and throughput *falls* below 1.0×.** + B=1 L=128 is the single fastest cell at **71 docs/s** (14.0 ms). Going to B≥4 + *raises* per-doc latency to ~20 ms (0.68–0.71×). The ANE runs the batch as B + sequential single-row passes plus marshalling overhead — exactly hypothesis #1 + ("ANE is batch-1 by design"). This is the dominant fact and it holds at **both** + L=128 and L=512, so it is **not** a small-L under-utilization artifact. + +2. **GPU does parallelize the batch — but only while the GPU is under-filled.** + At L=128 the single-sequence GPU pass under-utilizes the ALUs, so batching to + B=16 cuts per-doc latency 54→38 ms (**1.44×**). At L=512 one 512-token bidirectional + pass already fills the GPU, so batching is flat (1.0–1.08×) and even regresses at + B=64 (0.92×, thermal/occupancy). This is hypothesis #3 resolved: batching helps on + GPU *only* at small L, and the effect is bounded (~1.4×). + +3. **CPU/BLAS batches the most (the control did its job).** `CPU_ONLY` lowers the + GEMMs onto Accelerate/BLAS, which amortizes per-call overhead across the batch: + L=128 gains up to **1.60×** (B=16), L=512 up to ~1.11×. Largest *relative* gain of + the three, but off the slowest baseline, so it never wins in absolute docs/sec. + +4. **Why the earlier "flat" quick test looked flat.** It used **L=512** and reported + ANE (~9/s) and GPU (~4.7/s). At L=512 *both* of those backends are genuinely flat + (ANE structurally; GPU because 512 tokens already saturate it). 
The quick test + wasn't buggy — it just happened to pick the one sequence length where even the + batchable backend (GPU) has nothing left to give. Had it also probed **L=128 on + GPU/CPU**, it would have seen the modest 1.4–1.6× gains. The blind spot was + "only tested L=512, only looked at docs/sec deltas that are real-but-zero there." + +## Why the batching gains are bounded + +- The **fast path is the ANE**, and the ANE is a fixed-function, batch-1 engine: it + streams one (C,1,S) tile at a time and there is no batch axis to parallelize over, + so B>1 is pure serialization — it can never give a batch speedup (it gives a + *slowdown*: 0.68–0.91× of B=1 in the sweep above). +- The **CoreML GPU path *can* batch-parallelize** (hypothesis #2 — "CoreML GPU is + incapable of batching" — is therefore *false*), **but** for this 0.6B encoder a single + sequence already saturates the GPU by L≈512 (only L=128 and L=512 were measured, so the exact crossover between them is not pinned down), so there is no spare occupancy left for + batching to fill. The gain you can still capture (small L, where one sequence + under-fills the ALUs) tops out around 1.4×. + +Net: on this model, the throughput-optimal strategy on Apple Silicon is **batch-1 on +the ANE** (71 docs/s at L=128, 10 docs/s at L=512); **batching is not a useful lever** — +at most ~1.4–1.6× on GPU/CPU at short sequences, and a net loss on the ANE. + +## Device-placement audit (MLComputePlan) + +The batched models were compiled (`MLModel.get_compiled_model_path()` → copied to a +stable `.mlmodelc`; `MLComputePlan.load_from_path` aborts on a raw `.mlpackage` in +coremltools 9.0) and every MLProgram op's `preferred_compute_device` was tallied. +This is the *static* compute plan (non-`const` ops only). 
+ +| model | CPU_AND_NE | CPU_AND_GPU | CPU_ONLY | +|--------------|--------------------|-----------------------|-------------| +| L=128, B=1 | **ANE 100%**, CPU 0% | CPU 96%, GPU 4% | CPU 100% | +| L=128, B=64 | **ANE 100%**, CPU 0% | CPU 84%, **GPU 16%** | CPU 100% | +| L=512, B=1 | **ANE 100%**, CPU 0% | CPU 86%, GPU 14% | CPU 100% | + +(1975 compute ops total per model.) + +**This is the decisive control: the batched (B=64) model is 100% on the ANE — it does +NOT fall back to CPU.** So the flat-to-rising ANE per-doc latency is *not* a hidden +device-fallback or a measurement bug: the ANE genuinely accepts the batched graph and +runs it, but serializes the batch axis. Hypothesis #1 confirmed; the earlier "flat" +reading was a true hardware property, not a wrong-device artifact. + +**The `CPU_AND_GPU` GPU residency is low (4–16%) — investigated and explained, not a +bug.** A dedicated follow-up ([`PPLX_EMBED_GPU_RESIDENCY.md`](PPLX_EMBED_GPU_RESIDENCY.md)) +ruled out the obvious suspects: it is **not** an fp32-vs-fp16 issue (coremltools lowers +the whole graph to fp16, so there are no fp32 ops to push to CPU) and **not** an ANE-tuning +artifact (a from-scratch GPU-native rebuild — `nn.Linear`/native-RMSNorm/`(B,S,H)` — lands +at the same ~12% GPU). The real cause is CoreML's **static `CPU_AND_GPU` partitioner**: +for a single-sequence (B=1) transformer it places only weight-backed matmuls (`conv`/ +`linear`, `silu`, `gather`) on the GPU and routes all elementwise / reductions / layout +ops **and the attention `matmul`+`softmax`** to the CPU. The static plan is **accurate, +not misleading** — `CPU_AND_GPU` is genuinely *slower* than `CPU_ONLY` at B=1 (0.70–0.83×), +so the GPU isn't carrying hidden work. None of this is on the critical path (the shipping +path is the ANE fixed buckets, 99.8% ANE); no graph change raises GPU residency or makes +the GPU win at B=1. 
+ +## Sanity — batching is real (no broadcast bug) + +Feeding **distinct** input rows (random ids per row) to the B=4 model yields **distinct +output rows** (pairwise max|diff| ≈ 0.34–0.49, not ~0). And batch **row 0 exactly +matches the standalone B=1 model** on the same input (max|diff| = 0.0000, MATCH). So +each batch element is encoded independently and correctly — the batch timings are +genuine per-document work, the flat throughput is real. + +## Control — batch-N vs N × (B=1), total wall time (B=64, `conversion/experiments/control.log`) + +Direct head-to-head: one batch-64 predict vs 64 sequential B=1 predicts, total wall time. + +| unit \ L | L=128 batch-64 / 64×(B=1) / speedup | L=512 batch-64 / 64×(B=1) / speedup | +|-------------|-------------------------------------|-------------------------------------| +| CPU_AND_NE | 1313.7ms / 905.1ms / **0.69×** | 8326.8ms / 6442.6ms / **0.77×** | +| CPU_AND_GPU | 2764.4ms / 3530.0ms / **1.28×** | 16315ms / 14310ms / **0.88×** | +| CPU_ONLY | 2021.5ms / 3027.9ms / **1.50×** | 9325.8ms / 10233ms / **1.10×** | + +This is the cleanest statement of the result: on the **ANE you are better off looping +N single-document predicts** than issuing one batch-N call (0.69–0.77×, i.e. batching +*loses*). On **GPU and CPU, batching helps at short sequences** (1.28× / 1.50× at L=128) +but the GPU gain evaporates by L=512 (0.88×, GPU saturated) while CPU/BLAS keeps a small +edge (1.10×). In *absolute* docs/sec, batch-1 ANE is still the throughput winner +everywhere. diff --git a/docs/PPLX_EMBED_GPU_RESIDENCY.md b/docs/PPLX_EMBED_GPU_RESIDENCY.md new file mode 100644 index 0000000..82b0f32 --- /dev/null +++ b/docs/PPLX_EMBED_GPU_RESIDENCY.md @@ -0,0 +1,198 @@ +# pplx-embed encoder: why GPU residency is low under `CPU_AND_GPU` + +**TL;DR.** The reviewer's hypothesis — that the ANE-oriented graph (Conv2d‑1×1, +cat/chunk RMSNorm, **fp32** residual/attention/norm) forces ops onto the CPU under +`CPU_AND_GPU` — is **refuted**. 
Two independent tests show this: + +1. An **fp16-only** encoder (residual + attention + norms all fp16) produces a + *byte-identical* MLProgram op→device tally to the fp32 encoder, with identical + latency. coremltools' default conversion pipeline already lowers the whole fp32 + trace to fp16 in MIL, so there are **no fp32 compute ops left to push to the CPU**. +2. A **fully GPU‑native** encoder rebuilt from scratch (`nn.Linear`/matmul instead + of Conv2d‑1×1, native `x*rsqrt(mean(x²))·w` RMSNorm instead of cat/chunk, plain + `(B,S,H)` layout, far fewer permutes — §1 lists 112 `transpose` ops vs 252, with the GQA `tile` unchanged) gets **12% GPU** — essentially the same low share as + the shipped encoder's **8.5%**. Removing every ANE‑ism did *not* move the needle. + +The real cause is a property of CoreML's **static GPU planner**: under +`CPU_AND_GPU` it places only **weight‑backed matmul-family ops** (`conv`/`linear`, +plus `silu`, `gather`) on the GPU and routes **all elementwise, reduction, softmax, +layout, and the attention matmul** to the CPU. At **B=1** the resulting CPU↔GPU +handoffs cost more than the GPU saves: **`CPU_AND_GPU` is *slower* than `CPU_ONLY`** +(0.70–0.83×). This is not a fixable graph/implementation issue — it is how the +backend partitions a single‑sequence transformer. + +Environment: Apple M4 Max, macOS 26, coremltools 9, torch 2.11. Fixed‑shape +`pooled_fp16`, K=8 residual rescale, B=1, L∈{256,512}. Static placement via +`MLComputePlan` on the compiled `.mlmodelc`; timing is `MLModel.predict` median. + +--- + +## 1. Op / dtype breakdown (L=256, B=1, `CPU_AND_GPU`) + +Every floating-point compute op in the MLProgram is **FLOAT16** (confirmed by reading the +MIL spec proto: 1976 fp16 outputs, plus 2 int32 and 1 bool that are not floating-point at all). The dtype hypothesis fails at +the proto level — there is no fp32 to assign anywhere. 
+ +### Shipped ANE-tuned encoder (Conv2d‑1×1, cat/chunk RMSNorm) — **8.5% GPU** + +| op type | count | device | note | +|----------------|------:|-------------|------| +| mul | 452 | **CPU** 100% | RoPE, RMSNorm scale, masking | +| transpose | 252 | **CPU** 100% | (B,C,1,S) layout shuffles | +| conv (1×1) | 196 | **GPU 71%** / CPU 29% | q/k/v/gate/up→GPU; o/down (out=1024)→CPU | +| reshape | 169 | **CPU** 100% | | +| concat | 169 | **CPU** 100% | cat([x,−x]) RMSNorm + rotate_half | +| split | 169 | **CPU** 100% | chunk() RMSNorm + rotate_half | +| add | 141 | **CPU** 100% | residual adds, mask add | +| layer_norm | 113 | **CPU** 100% | the RMSNorm kernel | +| expand_dims | 113 | **CPU** 100% | | +| tile | 57 | **CPU** 100% | GQA k/v expansion | +| matmul | 56 | **CPU** 100% | **attention scores + ctx (no weight const)** | +| softmax | 28 | **CPU** 100% | attention | +| silu | 28 | **GPU** 100% | MLP activation | +| gather | 1 | **GPU** 100% | embedding lookup | + +### GPU‑native rebuild (Linear/matmul, native RMSNorm, (B,S,H)) — **12% GPU** + +| op type | count | device | note | +|--------------|------:|--------|------| +| linear | 196 | **GPU** 100% | all projections now GPU | +| silu | 28 | **GPU** 100% | | +| gather | 1 | **GPU** 100% | | +| mul / add | 452 / 254 | CPU 100% | RoPE, RMSNorm scale, residual | +| reshape | 169 | CPU 100% | | +| pow / reduce_mean / rsqrt | 113 each | CPU 100% | native RMSNorm internals | +| transpose | 112 | CPU 100% | | +| split / concat | 56 each | CPU 100% | rotate_half | +| matmul / softmax | 56 / 28 | CPU 100% | **attention still on CPU** | +| tile / expand_dims | 57 / 57 | CPU 100% | GQA expansion | + +**Observation:** switching Conv2d→Linear moved `o_proj`/`down_proj` onto the GPU +(196 vs 140 GPU ops), but the *entire elementwise/reduction/attention mass stayed on +the CPU* — including the attention `matmul`+`softmax`, which the planner never puts +on the GPU because they have no constant weight to anchor a GPU kernel. 
Net GPU share +rose only 8.5%→12%, and latency did not improve. + +--- + +## 2. Root cause + +CoreML's `CPU_AND_GPU` **static** partitioner is conservative for single-sequence +(B=1) transformers. It assigns to the GPU only the ops whose dominant operand is a +**constant weight** (the projection `conv`/`linear`, the `silu` fused after them, and +`gather`). Everything data-dependent — elementwise (`mul`/`add`), the RMSNorm +reductions, the **attention `matmul`/`softmax`**, and all layout ops (`transpose`/ +`reshape`/`concat`/`split`/`tile`/`expand_dims`) — is left on the CPU. The graph then +ping‑pongs CPU→GPU→CPU around each projection. + +This is independent of the encoder's ANE tuning: +- **dtype** is not the lever (everything is fp16 post-lowering; fp16/fp32 paths are + identical ops), +- the **Conv2d‑1×1 + cat/chunk RMSNorm + (B,C,1,S) layout** is not the lever (a + textbook Linear/native‑RMSNorm/(B,S,H) graph partitions the same way). + +### Is the static plan misleading? No — confirmed by timing. + +If the GPU were secretly carrying work, `CPU_AND_GPU` would beat `CPU_ONLY`. It does +not — it is **slower**, because the handoff overhead for the few GPU ops exceeds +their benefit at B=1: + +| variant (L=256, B=1) | CPU_ONLY | CPU_AND_GPU | CPU_AND_NE | GPU speedup vs CPU_ONLY | +|----------------------|---------:|------------:|-----------:|------------------------:| +| ANE-tuned (fp32 resid) | 82.3 ms | 98.6 ms | **36.1 ms** | **0.83× (slower)** | +| ANE-tuned (fp16 resid) | — | 98.6 ms | 36.0 ms | — (identical to fp32) | +| GPU-native rebuild | 68.0 ms | 96.5 ms | **28.6 ms** | **0.70× (slower)** | + +At L=512 the picture is the same: GPU share rises to 14.2% (more matmul work) but +`CPU_AND_GPU` (220 ms) is still ~2.2× slower than `CPU_AND_NE` (100 ms). + +The only regime where the GPU helps (per `docs/PPLX_EMBED_BATCHING.md`) is **small L +with batch B≫1** (L=128, B=16: ~1.4×), where the projection matmuls grow enough to +amortize the handoff. 
At B=1 there is nothing to amortize. + +--- + +## 3. Mitigation — before/after + +| metric (L=256, B=1) | shipped (ANE fp32) | fp16 path | GPU-native | verdict | +|------------------------------------|-------------------:|----------:|-----------:|---------| +| GPU residency (static, CPU_AND_GPU)| 8.5% | 8.5% | 12.0% | ~no change | +| CPU_AND_GPU latency | 98.6 ms | 98.6 ms | 96.5 ms | ~no change | +| CPU_AND_NE latency (for context) | 36.1 ms | 36.0 ms | 28.6 ms | best path unchanged | +| fidelity (cosine vs HF fp32 oracle)| 0.99993 | 0.99993 | 0.99997 | all PASS (gate 0.99) | + +- **fp16 residual path:** fidelity holds (0.99993, identical to fp32), but it buys + **zero** GPU residency or latency. Not worth shipping as a GPU lever. +- **GPU-native rebuild:** raises static GPU share modestly (8.5%→12%) and is even a + touch faster on `CPU_AND_NE` (28.6 vs 36.1 ms — interesting as an ANE micro-opt, + not the question here), but `CPU_AND_GPU` is unchanged and still slower than CPU. + +There is **no graph change that materially raises GPU residency or makes the GPU +path win at B=1.** The bottleneck is the planner's CPU/GPU partition, not the ops we +emit. + +### Does it help the dynamic RangeDim GPU model? + +No. The >max-bucket flexible RangeDim model is GPU-only because **flexible shapes +force a fallback off the ANE**, not because a GPU‑tuned graph would run well. Its ~10× slowness +vs a fixed ANE bucket is the same CPU‑heavy `CPU_AND_GPU` partition shown here plus +RangeDim overhead. A GPU-native graph would not fix it; only a fixed shape (→ANE) +does. The right lever for >max-bucket inputs is **more/larger fixed ANE buckets**, or +**chunk-and-pool** to stay within a bucket — not a GPU-tuned encoder. + +--- + +## 4. Verdict & recommendation + +**Inherent CoreML `CPU_AND_GPU` backend behavior for single-sequence transformers — +not a fixable implementation issue in our encoder.** Evidence: + +1. 
fp16 and fp32 encoders compile to identical op→device plans (dtype is not the + lever; everything is fp16 post-lowering). +2. A clean GPU-native graph (no Conv2d‑1×1, no cat/chunk RMSNorm, no fp32) lands at + the same ~8–12% GPU share — removing the ANE-isms changes nothing. +3. The static plan is **accurate**, not misleading: `CPU_AND_GPU` is *slower* than + `CPU_ONLY` (0.70–0.83×), so the GPU genuinely is not carrying meaningful work at + B=1. The CPU/GPU handoff dominates. + +**Recommendations:** + +- **Keep the shipped encoder as-is (fp32 residual, ANE-tuned).** It is correct, + fidelity-safe, and the ANE path (`CPU_AND_NE`, 99.8% ANE, 36 ms) is by far the + fastest. Do **not** add an fp16 path as a "GPU lever" — it does nothing for the + GPU. (An fp16 residual path was prototyped during this investigation and is + fidelity-safe, but provides no GPU benefit, so it was not kept.) +- **Do not invest in a GPU-tuned encoder variant.** It will not beat the ANE bucket + and will not even beat `CPU_ONLY` at B=1. +- **For the >max-bucket catch-all,** prefer adding a larger fixed ANE bucket or + chunk-and-pool over the dynamic GPU model. If the GPU model must stay, the only + knob that helps is **batching at small L** (≤128, B≫1, ~1.4×), which the catch-all + use case (single long doc) does not exercise. +- **One incidental finding worth a follow-up:** the GPU-native rebuild ran the *ANE* + path slightly faster (28.6 vs 36.1 ms at L=256). That is an ANE micro-optimization + question (native RMSNorm vs cat/chunk on this chip/OS), orthogonal to GPU residency + — flagged, not pursued here. 
+ + **Resolved (follow-up).** `conversion/experiment_ane_rmsnorm.py` isolated the RMSNorm + (changing *only* the 5 encoder norm sites, `norm_impl=native` vs `ane_cat`, holding + Conv2d-1×1 and layout fixed): native `rsqrt(mean(x²))·w` RMSNorm is **12.7% faster at + L=256 and 21.5% faster at L=512** on `CPU_AND_NE` (M4 Max / macOS 26 / coremltools 9), + at **identical 99.81% ANE residency** and cosine **0.99998** vs the fp32 oracle. So the + RMSNorm alone accounts for most of the GPU-native rebuild's ANE speedup (at L=256 the rebuild went 36.1→28.6 ms, while the norm swap alone is 12.7% faster) — + the cat([x,−x])→LayerNorm trick (chosen years ago because the ANE lacked a fast native + rsqrt) is now a *de-optimization* on this stack. **`norm_impl=native` is now the + pplx-embed encoder default.** A wider rollout to the decoder families' shared + `ane_ops.ANERMSNorm` is a separate flagged follow-up (`docs/ANE_RMSNORM_FOLLOWUP.md`). + +--- + +## Method + +For each variant (shipped ANE-tuned encoder; an fp16-residual prototype; a from-scratch +GPU-native encoder using `nn.Linear`/native RMSNorm/`(B,S,H)` layout sharing the same +weights): build a fixed-shape `pooled_fp16` model (`build_pplx_embed_bundle.py`), compile +to `.mlmodelc`, tally every non-const MLProgram op by `preferred_compute_device` (via `MLComputePlan`) **and** by +output dtype (the op×dtype×device breakdown above), then time +`MLModel.predict` (median) under `CPU_ONLY` / `CPU_AND_GPU` / `CPU_AND_NE` and check +cosine fidelity vs the fp32 `Reference` oracle. Device placement comes from `MLComputePlan`; the output dtypes are read from the +compiled model's MIL spec proto. 
diff --git a/docs/PPLX_EMBED_W8A8.md b/docs/PPLX_EMBED_W8A8.md new file mode 100644 index 0000000..23ecba7 --- /dev/null +++ b/docs/PPLX_EMBED_W8A8.md @@ -0,0 +1,182 @@ +# W8A8 (int8 weights + int8 ACTIVATIONS) viability — milestone B4 + +**Question.** Weight-only quant is a proven dead end for this encoder (int8 linear ~0.42 +cosine, int4 palettize ~0.905) *and* buys only 4–8% latency because the forward is +activation/compute-bound (fp16 attention), not weight-bandwidth-bound. The only real +bandwidth lever is **activation** quantization. So: can W8A8 reach acceptable fidelity, or +does it hit the documented **~cos 0.57 wall** on this attention family? + +**Model.** `perplexity-ai/pplx-embed-v1-0.6b` — 28-layer bidirectional Qwen3-0.6B encoder, +fp16, head_dim 128, GQA 16/8, SwiGLU, QK-norm, RoPE θ=1e6. Built via +`conversion/models/qwen3_encoder.py` (`PplxEmbedModel(cfg, output_mode="pooled_fp16")`), +measured against `conversion/pplx_embed_reference.py` (`Reference.embed` fp32 oracle, +int8-tanh output, cosine). + +--- + +## Approach (reproducible: `conversion/experiment_w8a8.py`) + +1. Build an fp16 `pooled_fp16` encoder at a **small bucket** (L=128 default) so the pooled + vector is Python-readable on macOS26 (native int8 output is *not* Python-readable; + pooled_fp16 is). `compute_units=ALL`, `minimum_deployment_target=macOS26`. +2. Calibrate activation ranges on a 14-text multilingual corpus (en/es/fr/de/ja/zh + short + fragments), tokenized and right-padded to the bucket, via + `cto.experimental.linear_quantize_activations`. +3. Quantize weights int8 (`linear_symmetric`, `weight_threshold=512`) on top → W8A8. +4. Predict on 12 held-out multilingual eval texts, apply `int8_tanh_quant` to the CoreML + pooled output, and compute cosine vs `Reference.embed`. Report mean/min. 
+ +**Activation-quant mode is the crux** (parametrized `--mode asymmetric|symmetric`): + +- The pad-mask add uses `Qwen3Encoder.NEG_INF = -1e4`; CoreML lowers this toward the fp16 + floor (−65504). A **symmetric** activation quantizer sets `scale ≈ 1e4/127 ≈ 79`, so real + attention scores (±10) round to ≈0 → after 28 layers the output collapses. This is the + mechanism behind the documented wall when symmetric quant is used. +- **Asymmetric** (`mode="linear"`) lets the range span `[−1e4, +score]`; when that span + overflows fp16 the computed scale goes `inf`, coremltools' `isinf` guard fires, and the + op is **skipped (left in fp16)** — exactly the desired behaviour for the mask add, while + every other (small-range) activation quantizes correctly. This is the only mode with a + chance of beating the wall. + +Two coremltools-9 patches are required for the activation-quant pass to run at all (both in +the script, written upstream-native): +- `_cast` const-fold extracts a Python scalar before `int()/bool()` (numpy≥2 (1,)-array fix). +- `insert_prefix_quantize_dequantize_pair.transform_op` skips ops whose input `x` is + non-float (int32 mask/embedding path) — MIL `quantize` requires float input. + +Contingencies swept by `--all`: asymmetric vs symmetric; rescale K ∈ {0 (none), 8, 16} +(the fp16 residual rescale interacts with activation ranges — K shrinks the residual stream +K×, changing what the activation quantizer sees). 
+ +--- + +## Reference points (measured previously, this repo / knowledge base) + +| config | cos vs fp32 ref (int8 output) | source | +|---|---|---| +| fp16 baseline | **0.999** (min 0.99912) | `weight-quant-is-a-dead-end.md` | +| int8 `linear_quantize_weights` (weight-only) | **0.42** mean (min 0.006) | same | +| int4 `palettize_weights` (g=32, weight-only) | **0.905** | same | +| documented **A8 / W8A8 wall** on this attention family | **~0.57** | `contingency-fixes.md`, plan | +| fidelity gate | **0.990** | `fidelity-gates.md` | + +--- + +## Results (W8A8 — MEASURED, L=128, 12 multilingual eval texts) + +Run: `uv run python conversion/experiment_w8a8.py --all --bucket 128`. Each variant: 14-text +calibration via `cto.experimental.linear_quantize_activations`, then int8 weight quant. +fp16 baseline is the same graph with no quant (sanity: it reproduces the ~0.999 fp16 number). + +| variant | activation mode | rescale K | fp16 mean | **W8A8 mean cos** | W8A8 min | beats 0.57 wall? | ≥0.990 gate? | +|---|---|---|---|---|---|---|---| +| `w8a8-asymmetric-k8-L128` | asymmetric | 8 | 0.9999 | **0.0157** | −0.0219 | ❌ NO | ❌ | +| `w8a8-symmetric-k8-L128` | symmetric | 8 | 0.9999 | **0.0020** | −0.0459 | ❌ NO | ❌ | +| `w8a8-asymmetric-k0-L128` | asymmetric | 0 | 0.9996 | **0.0191** | −0.0078 | ❌ NO | ❌ | +| `w8a8-asymmetric-k16-L128` | asymmetric | 16 | 0.9995 | **0.0201** | −0.0105 | ❌ NO | ❌ | + +**All four W8A8 variants collapse to cosine ≈ 0** (statistically orthogonal to the reference — +the embedding carries no signal). This is *worse* than the documented ~0.57 wall and far worse +than the weight-only int8 number (0.42). Activation int8 is even more destructive than weight +int8 on this encoder. + +Key observations: +- The collapse is **independent of all the contingencies**: asymmetric vs symmetric makes no + meaningful difference (both ≈0), and the fp16 residual rescale K (0 / 8 / 16) does not move + it. 
So the failure is not the mask-sentinel scale blow-up alone, nor the rescale interaction — + it is that per-tensor int8 activation quant across this 28-layer bidirectional graph destroys + the representation outright. +- The `linear_quantize_activations` pass emits **fp16 overflow / NaN-scale / invalid-cast + RuntimeWarnings** during `insert_prefix_quantize_dequantize_pair` (the −1e4 mask sentinel and + other large-range activations overflow the int8 affine `zero_point` computation). Some ops are + skipped (left fp16) as designed, but enough activations are quantized to wreck the signal. +- The fp16 baseline on the identical graph is 0.9995–0.9999, so the build/measure harness is + correct — the loss is entirely from the activation quantization. + +### ANE residency (MEASURED — W8A8 *does* stay on ANE) + +`uv run python conversion/experiment_w8a8.py --audit /tmp/w8a8-experiment/w8a8-asymmetric-k8-L128.mlpackage` +(compiles via `coremltools.models.utils.compile_model` + `MLComputePlan`): + +``` +total ops: 3531 + ANE: 3322 (94.1%) + unknown: 199 (5.6%) constexpr_blockwise_shift_scale (int8 weight-dequant consts, not compute dispatch) + CPU: 10 (0.3%) greater_equal/add/select/gather (int8-tanh + pooling tail) +``` + +So residency is **not** the blocker — the W8A8 model compiles and runs **94% ANE-resident** +(the only non-ANE compute is the tiny pooling/tanh tail, identical to fp16). The int8 weights +lower to `constexpr_blockwise_shift_scale` consts. The model is perfectly deployable on ANE; it +just produces garbage. + +--- + +## VERDICT: NOT VIABLE (post-training). Needs rotation pre-conditioning or QAT. + +Naive post-training W8A8 is **dead on this encoder** — it does not approach the 0.990 gate, does +not beat the ~0.57 wall, and in fact collapses all the way to **cos ≈ 0** (worse than weight-only +int8's 0.42). Neither asymmetric activation quant nor any rescale-K setting rescues it. 
ANE +residency is fine (94%), so the failure is purely numerical fidelity, not a fallback problem. + +**Why it collapses this hard** (vs the documented 0.57): this is a 28-layer *bidirectional* +encoder where every layer's input passes through QK-norm + RoPE and the residual stream is held +in fp32 specifically because activations are wide / outlier-heavy. Per-tensor uniform int8 +activation quant sets one scale per tensor from the max, so the heavy-tailed bulk rounds toward +zero; compounded over 28 layers the signal is annihilated. The mask sentinel (−1e4) makes at +least one attention-input tensor's range pathological (the overflow warnings), and the +asymmetric "skip on inf" trick only saves *that* op — every other quantized activation is still +crushed. Uniform int8 simply cannot represent this activation distribution. + +**Path to viability** (matches `docs/QUANTIZATION_SURVEY.md`): the bandwidth win requires +activation quant, and activation quant requires **outlier pre-conditioning**: +- **SpinQuant / QuaRot** — fold a learned (SpinQuant) or Hadamard (QuaRot) rotation into the + weights; the foldable rotations (R1/R2) add *zero* runtime cost, whereas the online Hadamards on + the down-proj/value path cannot be folded and remain runtime ops. The rotation spreads activation + outliers across channels so int8 activations become representable. This is the highest-ROI next step. +- **SmoothQuant** — migrate per-channel activation scale into the weights pre-quant. +- Failing those, full **QAT**. + +Until one of those is in place, the shipping configuration remains **fp16 + buckets** (the +weight-quant lesson's conclusion stands, now extended: *activation* quant is also a post-training +dead end without rotation/QAT). + +> Honest negative result: the wall is not just confirmed, it is *deeper* than documented for this +> port — post-training W8A8 lands at ≈0, not 0.57. The 0.57 figure in the knowledge base likely +> reflects a partial / A8-only or differently-scoped experiment; full per-tensor W8A8 here is ≈0. 
+ +--- + +## Mitigation feasibility (researched 2026-06) + the latency reality + +Recovery candidates and — critically — whether they map to the ANE's fixed op set: + +| approach | recovery (LLM literature) | ANE-deployable? | effort | +|---|---|---|---| +| **SmoothQuant** — per-channel scale migrated activation→weight | W8A8 "negligible loss" on LLMs; "alone insufficient" for total collapse | yes (folds into weights, no runtime ops) | low | +| **Rotation (QuaRot/SpinQuant)** | 4-bit ~99% zero-shot; 8-bit "negligible" (extrapolated) | **partial** — R1/R2 fold offline, but the down-proj/value-path **online Hadamards have no adjacent linear to absorb them** → extra runtime ops the ANE may reject/spill; **no public QuaRot/SpinQuant-on-ANE precedent** | high | +| **QAT + distillation** — distil from the fp32 teacher on **unlabeled** text (no labels/contrastive pipeline) | 8-bit "almost lossless"; total collapse likely needs **full** QAT, not LoRA | yes (weights only; deployed graph stays standard int8 matmul) | highest | + +QAT-distillation is the only path with **both** strong recovery and clean ANE deployment. + +### But the premise is wrong — int8 activations barely help here (MEASURED) + +W8A8 exists to buy ANE bandwidth via int8 *activations*. Measured (L=128, cpuAndNE): + +| precision | median latency | +|---|---| +| fp16 (pooled) | 14.0 ms | +| W8A8 (int8 act) | 12.7 ms (**~9% faster**) | + +The ANE is fp16-native; int8-activation matmul is only marginally faster, and the attention +score matmuls (activation×activation) that dominate at large L are not int8-accelerated at all. +With weight quant's ~4–8%, the **whole quantization latency upside is ~10%** — not the 2× the +bandwidth intuition suggests. So even a *perfect* fidelity recovery (weeks of QAT, or a rotation +reimplementation that may not map to ANE) would buy only ~10% latency. 
**Not worth it.** Ship fp16 + +buckets (0.999, 99.8% ANE, 101 ms at L=512); revisit only if a future ANE accelerates int8 +compute, or if memory (not latency) becomes the binding constraint. + +## Files + +- `conversion/experiment_w8a8.py` — builds + measures W8A8 fidelity, parametrized + (`--mode`, `--rescale-k`, `--bucket`, `--all`), saves `.mlpackage` artifacts for ANE audit. +- `docs/PPLX_EMBED_W8A8.md` — this note.