diff --git a/Libraries/IntegrationTestHelpers/IntegrationTestHelpers.swift b/Libraries/IntegrationTestHelpers/IntegrationTestHelpers.swift index 0f424175f..7b6b54239 100644 --- a/Libraries/IntegrationTestHelpers/IntegrationTestHelpers.swift +++ b/Libraries/IntegrationTestHelpers/IntegrationTestHelpers.swift @@ -2,7 +2,6 @@ // Integration packages inject their own Downloader and TokenizerLoader, then call // these functions which run the test and throw on failure. -import CoreImage import Foundation import MLX import MLXEmbedders @@ -10,6 +9,10 @@ import MLXLLM import MLXLMCommon import MLXVLM +#if canImport(CoreImage) + import CoreImage +#endif + // Both MLXLMCommon and MLXEmbedders define ModelContainer. public typealias LLModelContainer = MLXLMCommon.ModelContainer public typealias EmbeddingModelContainer = MLXEmbedders.EmbedderModelContainer @@ -176,18 +179,23 @@ public enum ChatSessionTests { } public static func visionModel(container: LLModelContainer) async throws { - let session = ChatSession(container, generateParameters: generateParameters) - let redImage = CIImage(color: .red).cropped( - to: CGRect(x: 0, y: 0, width: 100, height: 100)) + #if canImport(CoreImage) + let session = ChatSession(container, generateParameters: generateParameters) + let redImage = CIImage(color: .red).cropped( + to: CGRect(x: 0, y: 0, width: 100, height: 100)) - let result = try await streamAndCollect( - session.streamResponse( - to: "What color is this image? Reply with just the color name.", - image: .ciImage(redImage)), label: "Vision") - try check( - result.lowercased().contains("red"), - "Expected 'red' in response, got: \(result)" - ) + let result = try await streamAndCollect( + session.streamResponse( + to: "What color is this image? Reply with just the color name.", + image: .ciImage(redImage)), label: "Vision") + try check( + result.lowercased().contains("red"), + "Expected 'red' in response, got: \(result)" + ) + #else + fatalError( + "Vision model test requires CoreImage, which is not available on this platform.") + #endif } public static func streamDetailsWithTools(container: LLModelContainer) async throws { diff --git a/Libraries/MLXLMCommon/ChatSession.swift b/Libraries/MLXLMCommon/ChatSession.swift index 648e39548..4e893d5fd 100644 --- a/Libraries/MLXLMCommon/ChatSession.swift +++ b/Libraries/MLXLMCommon/ChatSession.swift @@ -1,9 +1,12 @@ // Copyright © 2025 Apple Inc. -import CoreGraphics import Foundation import MLX +#if canImport(CoreGraphics) + import CoreGraphics +#endif + /// Configuration for speculative decoding in a `ChatSession`. /// /// Speculative decoding uses a small draft model to propose candidate tokens diff --git a/Libraries/MLXLMCommon/Linux/CoreGraphics.swift b/Libraries/MLXLMCommon/Linux/CoreGraphics.swift new file mode 100644 index 000000000..3190c27eb --- /dev/null +++ b/Libraries/MLXLMCommon/Linux/CoreGraphics.swift @@ -0,0 +1,22 @@ +// Copyright © 2026 Apple Inc. + +#if !canImport(CoreGraphics) + + public typealias CGFloat = Double + + public struct CGSize: Sendable { + public var width: CGFloat + public var height: CGFloat + + public init(width: CGFloat, height: CGFloat) { + self.width = width + self.height = height + } + + public init(width: Int, height: Int) { + self.width = CGFloat(width) + self.height = CGFloat(height) + } + } + +#endif diff --git a/Libraries/MLXLMCommon/Linux/CoreMedia.swift b/Libraries/MLXLMCommon/Linux/CoreMedia.swift new file mode 100644 index 000000000..2867bc409 --- /dev/null +++ b/Libraries/MLXLMCommon/Linux/CoreMedia.swift @@ -0,0 +1,10 @@ +// Copyright © 2026 Apple Inc. + +#if !canImport(CoreMedia) + + public struct CMTime { + public var value: Int64 + public var timescale: Int32 + } + +#endif diff --git a/Libraries/MLXLMCommon/Linux/Logger.swift b/Libraries/MLXLMCommon/Linux/Logger.swift new file mode 100644 index 000000000..0b7036908 --- /dev/null +++ b/Libraries/MLXLMCommon/Linux/Logger.swift @@ -0,0 +1,29 @@ +// Copyright © 2026 Apple Inc. + +#if canImport(os) + + import os + + typealias Logger = os.Logger + +#else + + final class Logger: Sendable { + private let subsystem: String + private let category: String + + init(subsystem: String, category: String) { + self.subsystem = subsystem + self.category = category + } + + func info(_ message: String) { + print("[INFO] [\(subsystem).\(category)] \(message)") + } + + func error(_ message: String) { + print("[ERROR] [\(subsystem).\(category)] \(message)") + } + } + +#endif diff --git a/Libraries/MLXLMCommon/Linux/String+Linux.swift b/Libraries/MLXLMCommon/Linux/String+Linux.swift new file mode 100644 index 000000000..9f0569e88 --- /dev/null +++ b/Libraries/MLXLMCommon/Linux/String+Linux.swift @@ -0,0 +1,13 @@ +// Copyright © 2026 Apple Inc. + +import Foundation + +#if os(Linux) + + extension String { + public init(localized resource: String) { + self = resource + } + } + +#endif diff --git a/Libraries/MLXLMCommon/ParoQuant/ParoQuantLoader.swift b/Libraries/MLXLMCommon/ParoQuant/ParoQuantLoader.swift index ebf9d60b9..7e7b301b5 100644 --- a/Libraries/MLXLMCommon/ParoQuant/ParoQuantLoader.swift +++ b/Libraries/MLXLMCommon/ParoQuant/ParoQuantLoader.swift @@ -1,7 +1,6 @@ import Foundation import MLX import MLXNN -import os private let logger = Logger(subsystem: "mlx-swift-lm", category: "paroquant") diff --git a/Libraries/MLXLMCommon/UserInput.swift b/Libraries/MLXLMCommon/UserInput.swift index 09fc7d226..5f7371bbf 100644 --- a/Libraries/MLXLMCommon/UserInput.swift +++ b/Libraries/MLXLMCommon/UserInput.swift @@ -1,10 +1,15 @@ // Copyright © 2024 Apple Inc. -@preconcurrency import AVFoundation -import CoreImage import Foundation import MLX +#if canImport(AVFoundation) + @preconcurrency import AVFoundation +#endif +#if canImport(CoreImage) + import CoreImage +#endif + public typealias Message = [String: any Sendable] /// Container for raw user input. @@ -40,101 +45,131 @@ public struct UserInput { } public struct VideoFrame { - public let frame: CIImage + public let image: Image public let timeStamp: CMTime - public init(frame: CIImage, timeStamp: CMTime) { - self.frame = frame + public init(image: Image, timeStamp: CMTime) { + self.image = image self.timeStamp = timeStamp } + + #if canImport(CoreImage) + + @available( + *, deprecated, + message: "Use init(image:, timeStamp:) instead" + ) + public init(frame: CIImage, timeStamp: CMTime) { + self.image = .ciImage(frame) + self.timeStamp = timeStamp + } + + @available( + *, deprecated, + message: "Use image.asCIImage()" + ) + public var frame: CIImage { + return try! image.asCIImage() + } + + #endif } /// Representation of a video resource. public enum Video { - case avAsset(AVAsset) + #if canImport(AVFoundation) + case avAsset(AVAsset) + #endif case url(URL) /// Useful for decoded frames held in memory case frames([VideoFrame]) - @available( - *, deprecated, - message: "Use MediaProcessing.asProcessedSequence() with the Video directly" - ) - public func asAVAsset() -> AVAsset { - switch self { - case .avAsset(let asset): - return asset - case .url(let url): - return AVAsset(url: url) - case .frames: - fatalError( - "calling asAVAsset() on Video Input with VideoFames provided is unsupported and deprecated - please use MediaProcessing.asProcessedSequence() instead" - ) + #if canImport(AVFoundation) + @available( + *, deprecated, + message: "Use MediaProcessing.asProcessedSequence() with the Video directly" + ) + public func asAVAsset() -> AVAsset { + switch self { + case .avAsset(let asset): + return asset + case .url(let url): + return AVAsset(url: url) + case .frames: + fatalError( + "calling asAVAsset() on Video Input with VideoFames provided is unsupported and deprecated - please use MediaProcessing.asProcessedSequence() instead" + ) + } } - } + #endif } /// Representation of an image resource. public enum Image { - case ciImage(CIImage) + #if canImport(CoreImage) + case ciImage(CIImage) + #endif case url(URL) case array(MLXArray) - public func asCIImage() throws -> CIImage { - switch self { - case .ciImage(let image): - return image - - case .url(let url): - if let image = CIImage(contentsOf: url) { + #if canImport(CoreImage) + public func asCIImage() throws -> CIImage { + switch self { + case .ciImage(let image): return image - } - throw UserInputError.unableToLoad(url) - - case .array(let array): - guard array.ndim == 3 else { - throw UserInputError.arrayError("array must have 3 dimensions: \(array.ndim)") - } - - var array = array - - // convert to 0 .. 255 - if array.max().item(Float.self) <= 1.0 { - array = array * 255 - } - // planar -> pixels - switch array.dim(0) { - case 3, 4: - // channels first (planar) - array = array.transposed(1, 2, 0) - default: - break + case .url(let url): + if let image = CIImage(contentsOf: url) { + return image + } + throw UserInputError.unableToLoad(url) + + case .array(let array): + guard array.ndim == 3 else { + throw UserInputError.arrayError( + "array must have 3 dimensions: \(array.ndim)") + } + + var array = array + + // convert to 0 .. 255 + if array.max().item(Float.self) <= 1.0 { + array = array * 255 + } + + // planar -> pixels + switch array.dim(0) { + case 3, 4: + // channels first (planar) + array = array.transposed(1, 2, 0) + default: + break + } + + // 4 components per pixel + switch array.dim(-1) { + case 3: + // pad to 4 bytes per pixel + array = padded(array, widths: [0, 0, [0, 1]], value: MLXArray(255)) + case 4: + // good + break + default: + throw UserInputError.arrayError( + "channel dimension must be last and 3/4: \(array.shape)") + } + + let arrayData = array.asData() + let (H, W, _) = array.shape3 + let cs = CGColorSpace(name: CGColorSpace.sRGB)! + + return CIImage( + bitmapData: arrayData.data, bytesPerRow: W * 4, + size: .init(width: W, height: H), + format: .RGBA8, colorSpace: cs) } - - // 4 components per pixel - switch array.dim(-1) { - case 3: - // pad to 4 bytes per pixel - array = padded(array, widths: [0, 0, [0, 1]], value: MLXArray(255)) - case 4: - // good - break - default: - throw UserInputError.arrayError( - "channel dimension must be last and 3/4: \(array.shape)") - } - - let arrayData = array.asData() - let (H, W, _) = array.shape3 - let cs = CGColorSpace(name: CGColorSpace.sRGB)! - - return CIImage( - bitmapData: arrayData.data, bytesPerRow: W * 4, - size: .init(width: W, height: H), - format: .RGBA8, colorSpace: cs) } - } + #endif } /// Representation of processing to apply to media. diff --git a/Libraries/MLXVLM/MediaProcessing.swift b/Libraries/MLXVLM/MediaProcessing.swift index 1d75072b1..0b91f11cc 100644 --- a/Libraries/MLXVLM/MediaProcessing.swift +++ b/Libraries/MLXVLM/MediaProcessing.swift @@ -443,8 +443,8 @@ public enum MediaProcessing { case .success(requestedTime: _, let image, actualTime: let actual): let ciImage = CIImage( cgImage: image, options: [.colorSpace: CGColorSpace(name: CGColorSpace.sRGB)!]) - let frame = try frameProcessing(.init(frame: ciImage, timeStamp: actual)) - ciImages.append(frame.frame) + let frame = try frameProcessing(.init(image: .ciImage(ciImage), timeStamp: actual)) + ciImages.append(try frame.image.asCIImage()) timestamps.append(frame.timeStamp) case .failure(requestedTime: _, _): break @@ -511,8 +511,8 @@ public enum MediaProcessing { if let targetIndex { let videoFrame = videoFrames[targetIndex] let frame = try frameProcessing( - .init(frame: videoFrame.frame, timeStamp: videoFrame.timeStamp)) - ciImages.append(frame.frame) + .init(image: videoFrame.image, timeStamp: videoFrame.timeStamp)) + ciImages.append(try frame.image.asCIImage()) timestamps.append(frame.timeStamp) } } diff --git a/Libraries/MLXVLM/Models/Qwen25VL.swift b/Libraries/MLXVLM/Models/Qwen25VL.swift index abd4912d9..ebea2aaca 100644 --- a/Libraries/MLXVLM/Models/Qwen25VL.swift +++ b/Libraries/MLXVLM/Models/Qwen25VL.swift @@ -760,7 +760,7 @@ public struct Qwen25VLProcessor: UserInputProcessor { ) { frame in // first apply the user requested resizing, etc. if any let resizedImage = MediaProcessing.apply( - frame.frame, processing: input.processing) + try frame.image.asCIImage(), processing: input.processing) if resizedSize == .zero { let size = resizedImage.extent.size let (resizedHeight, resizedWidth) = try QwenVL.targetSize( @@ -770,7 +770,7 @@ public struct Qwen25VLProcessor: UserInputProcessor { resizedSize = CGSize(width: resizedWidth, height: resizedHeight) } let processedImage = preprocess(image: resizedImage, resizedSize: resizedSize) - return VideoFrame(frame: processedImage, timeStamp: frame.timeStamp) + return VideoFrame(image: .ciImage(processedImage), timeStamp: frame.timeStamp) } videosAsImageSequences.append(imageSequence.frames) diff --git a/Libraries/MLXVLM/Models/Qwen2VL.swift b/Libraries/MLXVLM/Models/Qwen2VL.swift index 701f755f1..05b051c0e 100644 --- a/Libraries/MLXVLM/Models/Qwen2VL.swift +++ b/Libraries/MLXVLM/Models/Qwen2VL.swift @@ -599,7 +599,7 @@ public struct Qwen2VLProcessor: UserInputProcessor { ) { frame in // first apply the user requested resizing, etc. if any let resizedImage = MediaProcessing.apply( - frame.frame, processing: input.processing) + try frame.image.asCIImage(), processing: input.processing) if resizedSize == .zero { let size = resizedImage.extent.size let (resizedHeight, resizedWidth) = try QwenVL.targetSize( @@ -609,7 +609,7 @@ public struct Qwen2VLProcessor: UserInputProcessor { resizedSize = CGSize(width: resizedWidth, height: resizedHeight) } let processedImage = preprocess(image: resizedImage, resizedSize: resizedSize) - return VideoFrame(frame: processedImage, timeStamp: frame.timeStamp) + return VideoFrame(image: .ciImage(processedImage), timeStamp: frame.timeStamp) } videosAsImageSequences.append(imageSequence.frames) diff --git a/Libraries/MLXVLM/Models/Qwen3VL.swift b/Libraries/MLXVLM/Models/Qwen3VL.swift index 523553a16..775f9553b 100644 --- a/Libraries/MLXVLM/Models/Qwen3VL.swift +++ b/Libraries/MLXVLM/Models/Qwen3VL.swift @@ -112,7 +112,8 @@ public struct Qwen3VLProcessor: UserInputProcessor { let sequence = try await MediaProcessing.asProcessedSequence( video, targetFPS: { _ in Double(2) } ) { frame in - let processed = MediaProcessing.apply(frame.frame, processing: input.processing) + let processed = MediaProcessing.apply( + try frame.image.asCIImage(), processing: input.processing) if resizedSize == .zero { let size = processed.extent.size let (height, width) = try QwenVL.targetSize( @@ -124,7 +125,7 @@ public struct Qwen3VLProcessor: UserInputProcessor { resizedSize = CGSize(width: width, height: height) } let finalImage = preprocess(image: processed, resizedSize: resizedSize) - return VideoFrame(frame: finalImage, timeStamp: frame.timeStamp) + return VideoFrame(image: .ciImage(finalImage), timeStamp: frame.timeStamp) } accumulatedFrames.append(sequence.frames) } diff --git a/Libraries/MLXVLM/Models/SmolVLM2.swift b/Libraries/MLXVLM/Models/SmolVLM2.swift index c10772054..5591aa76c 100644 --- a/Libraries/MLXVLM/Models/SmolVLM2.swift +++ b/Libraries/MLXVLM/Models/SmolVLM2.swift @@ -320,14 +320,15 @@ public struct SmolVLMProcessor: UserInputProcessor { } ) { frame in - let processedFrame = frame.frame + let processedFrame = try frame.image + .asCIImage() .toSRGB() .resampled( to: CGSize(width: fixedImageSize, height: fixedImageSize), method: CIImage.ResamplingMethod.lanczos ) .normalized(mean: config.imageMeanTuple, std: config.imageStdTuple) - return VideoFrame(frame: processedFrame, timeStamp: frame.timeStamp) + return VideoFrame(image: .ciImage(processedFrame), timeStamp: frame.timeStamp) } let thwFrames = (0 ..< processedFrames.frames.count).map { diff --git a/Tests/MLXLMTests/MediaProcessingTests.swift b/Tests/MLXLMTests/MediaProcessingTests.swift index 9c6b7e7a6..efe16c2d7 100644 --- a/Tests/MLXLMTests/MediaProcessingTests.swift +++ b/Tests/MLXLMTests/MediaProcessingTests.swift @@ -72,9 +72,10 @@ public class MediaProcesingTests: XCTestCase { // We know video is exactly 5 seconds long, expect 10 samples let frames = try await MediaProcessing.asProcessedSequence(video, samplesPerSecond: 2) { frame in - let image = preprocess(image: frame.frame, resizedSize: .init(width: 224, height: 224)) + let image = preprocess( + image: try frame.image.asCIImage(), resizedSize: .init(width: 224, height: 224)) - return VideoFrame.init(frame: image, timeStamp: frame.timeStamp) + return VideoFrame.init(image: .ciImage(image), timeStamp: frame.timeStamp) } XCTAssert(frames.frames.count == 10) @@ -101,7 +102,7 @@ public class MediaProcesingTests: XCTestCase { for i in 0 ..< (seconds * framerate) { let image = imageWithColor(colors.randomElement()!) let timeStamp: CMTime = .init(value: Int64(i), timescale: Int32(framerate)) - rawFrames.append(VideoFrame(frame: image, timeStamp: timeStamp)) + rawFrames.append(VideoFrame(image: .ciImage(image), timeStamp: timeStamp)) } // Bogus preprocessing values @@ -117,9 +118,10 @@ public class MediaProcesingTests: XCTestCase { // We know video is exactly 5 seconds long, expect 10 samples let frames = try await MediaProcessing.asProcessedSequence(video, samplesPerSecond: 2) { frame in - let image = preprocess(image: frame.frame, resizedSize: .init(width: 224, height: 224)) + let image = preprocess( + image: try frame.image.asCIImage(), resizedSize: .init(width: 224, height: 224)) - return VideoFrame.init(frame: image, timeStamp: frame.timeStamp) + return VideoFrame.init(image: .ciImage(image), timeStamp: frame.timeStamp) } XCTAssert(frames.frames.count == 10)