Improve docs #235

Merged (6 commits) on Aug 12, 2023
8 changes: 4 additions & 4 deletions README.md
@@ -64,15 +64,15 @@ importing the package:
julia> using Metal

julia> Metal.versioninfo()
macOS 13.3.1, Darwin 22.4.0
macOS 13.5.0, Darwin 22.6.0

Toolchain:
- Julia: 1.9.0
- Julia: 1.9.3
- LLVM: 14.0.6

Julia packages:
- Metal.jl: 0.3.0
- Metal_LLVM_Tools_jll: 0.3.0+2
- Metal.jl: 0.5.0
- Metal_LLVM_Tools_jll: 0.5.1+0

1 device:
- Apple M2 Max (64.000 KiB allocated)
15 changes: 10 additions & 5 deletions docs/make.jl
@@ -25,18 +25,23 @@ function main()
modules = [Metal],
pages = Any[
"Home" => "index.md",
"Usage" => Any[
"usage/overview.md",
"usage/array.md",
"usage/kernel.md",
],
"Profiling" => "profiling.md",
"API reference" => Any[
"api/essentials.md",
"api/compiler.md",
"api/kernel.md",
"api/array.md",
"api/mps.md",
],
"Usage" => Any[
"usage/overview.md",
"usage/array.md",
"FAQ" => Any[
"faq/faq.md",
"faq/contributing.md",
],
"Profiling" => "profiling.md",
"FAQ" => "faq.md",
]
)
end
10 changes: 8 additions & 2 deletions docs/src/api/array.md
@@ -1,4 +1,10 @@
# Array programming

The Metal array type, `MtlArray`, generally implements the Base array interface and all of its
expected methods.
The Metal array type, `MtlArray`, generally implements the Base array interface
and all of its expected methods.

However, there is the special function `mtl` for transferring an array over to the GPU. For compatibility reasons, it will automatically convert arrays of `Float64` to `Float32`.

```@docs
mtl
```
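For illustration, a minimal REPL sketch of this conversion (assuming the output formatting of a recent Metal.jl; the exact array type parameters may differ):

```julia
julia> using Metal

julia> x = [1.0, 2.0, 3.0];   # Float64 array on the CPU

julia> gx = mtl(x);           # transfer to the GPU; Float64 is narrowed to Float32

julia> eltype(gx)
Float32
```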
1 change: 1 addition & 0 deletions docs/src/api/kernel.md
@@ -53,6 +53,7 @@ MtlThreadGroupArray
## Synchronization

```@docs
MemoryFlags
threadgroup_barrier
simdgroup_barrier
```
28 changes: 28 additions & 0 deletions docs/src/api/mps.md
@@ -0,0 +1,28 @@
# Metal Performance Shaders

This section lists the package's public functionality that corresponds to the Metal
Performance Shaders functions. For more information about these functions, or to see
which functions have yet to be implemented in this package, please consult
the [Metal Performance Shaders Documentation](https://developer.apple.com/documentation/metalperformanceshaders?language=objc).

## Matrices and Vectors

```@docs
MPS.MPSMatrix
MPS.MPSVector
```

### Matrix Arithmetic Operators

```@docs
MPS.matmul!
MPS.matvecmul!
MPS.topk
MPS.topk!
```

### Linear Algebra

Many of the currently implemented MPS functions are for linear algebra operations.
Therefore, you use them by calling the corresponding LinearAlgebra function with an
`MtlArray`. They are nonetheless listed below:
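As a usage sketch (assuming `Float32` element types, which MPS supports), calling the standard `LinearAlgebra` API on `MtlArray`s is all that is needed:

```julia
using LinearAlgebra, Metal

a = MtlArray(rand(Float32, 4, 8))
b = MtlArray(rand(Float32, 8, 2))
c = MtlArray{Float32}(undef, 4, 2)

mul!(c, a, b)                    # plain LinearAlgebra call, MPS-backed for MtlMatrix inputs

Array(c) ≈ Array(a) * Array(b)   # sanity check against the CPU result
```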
9 changes: 6 additions & 3 deletions docs/src/contributing.md → docs/src/faq/contributing.md
@@ -43,7 +43,7 @@ skip the first two steps.
Some Metal functions map directly to Apple intermediate representation intrinsics. In this
case, wrapping them into Metal.jl is relatively easy. All that needs to be done is to create
a mapping from a Julia function via a simple ccall. See the
[threadgroup barrier implementation](../../src/device/intrinsics/synchronization.jl#L43) for
[threadgroup barrier implementation](https://github.com/JuliaGPU/Metal.jl/blob/main/src/device/intrinsics/synchronization.jl#L43-L44) for
reference.
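In rough outline, such a mapping is a hypothetical sketch along these lines (the intrinsic name, argument types, and values here are placeholders; take the real ones from the linked implementation):

```julia
# Hypothetical sketch only: "air.wg.barrier" and the argument value are
# placeholders, not the actual intrinsic used by Metal.jl. This is device-side
# code and only makes sense inside a kernel compiled by Metal.jl.
@inline function my_threadgroup_barrier()
    ccall("extern air.wg.barrier", llvmcall, Cvoid, (Int32,), Int32(0))
    return
end
```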

However, the Metal documentation doesn't tell you what the format of the intrinsic names
@@ -84,7 +84,10 @@ optimized away. Double-check that the kernel's IR makes sense for what you wrote
Metal exposes a special interface to its library of optimized kernels. Rather than accepting
the normal set of input GPU data structures, it requires special `MPS` datatypes that assume
row-major memory layout. As this is not the Julia default, adapt accordingly. Adding MPS
functionality should be mostly straightforward, so this can be an easy entrypoint to helping.
functionality should be mostly straightforward, so this can be an easy entry point to helping.
To get started, you can have a look at the [Metal Performance Shaders
Documentation](https://developer.apple.com/documentation/metalperformanceshaders?language=objc)
from Apple.

## Exposing your Interface

@@ -97,7 +100,7 @@ The only thing beyond this is exporting into the global namespace. That would be
functions/structures/macros with clear and common use-cases (`MtlArray` or `@metal`).

Additionally, you can override non-Metal.jl functions like `LinearAlgebra.mul!` seen
[here](../../lib/mps/linalg.jl#L63). This is essentially (ab)using multiple dispatch to
[here](https://github.com/JuliaGPU/Metal.jl/blob/main/lib/mps/linalg.jl#L34). This is essentially (ab)using multiple dispatch to
specialize for certain cases (usually for more performant execution).
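A condensed, hypothetical sketch of such an override (the real method in `lib/mps/linalg.jl` handles more signatures and argument checking):

```julia
using LinearAlgebra, Metal

# Hypothetical sketch: route the 5-argument mul! for MtlMatrix inputs to the
# MPS-backed matmul! kernel instead of Julia's generic fallback.
function LinearAlgebra.mul!(c::MtlMatrix, a::MtlMatrix, b::MtlMatrix,
                            alpha::Number, beta::Number)
    Metal.MPS.matmul!(c, a, b, alpha, beta)
end
```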

If your function is only available from within GPU kernels (like thread indexing intrinsics).
4 changes: 3 additions & 1 deletion docs/src/faq.md → docs/src/faq/faq.md
@@ -1,5 +1,7 @@
# Frequently Asked Questions

## Can you wrap this Metal API?

Most likely. Any help on designing or implementing high-level wrappers for MSL's low-level functionality
is greatly appreciated, so please consider [contributing](contributing.md) your uses of these APIs on the
respective repositories.
respective repositories.
8 changes: 4 additions & 4 deletions docs/src/index.md
@@ -1,6 +1,6 @@
# MacOS GPU programming in Julia

The Metal.jl package is the main entrypoint for GPU programming on MacOS in Julia. The package
The Metal.jl package is the main entry point for GPU programming on MacOS in Julia. The package
makes it possible to do so at various abstraction levels, from easy-to-use arrays down to
hand-written kernels using low-level Metal APIs.

@@ -37,13 +37,13 @@ Pkg.test("Metal")
The following resources may also be of interest (although are mainly focused on the CUDA GPU
backend):

- Effectively using GPUs with Julia: [video](https://www.youtube.com/watch?v=7Yq1UyncDNc),
- Effectively using GPUs with Julia:
[slides](https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/)
- How Julia is compiled to GPUs: [video](https://www.youtube.com/watch?v=Fz-ogmASMAE)

## Contributing

If you want to help improve this package, look at [the contributing page](contributing.md) for more details.
If you want to help improve this package, look at [the contributing page](faq/contributing.md) for more details.

## Acknowledgements

@@ -60,5 +60,5 @@ Some of the software in this ecosystem was developed as part of academic research
would like to help support it, please star the repository as such metrics may help us secure
funding in the future. If you use our software as part of your research, teaching, or other
activities, we would be grateful if you could cite our work. The
[CITATION.bib](https://github.com/JuliaGPU/Metal.jl/blob/master/CITATION.bib) file in the
[CITATION.cff](https://github.com/JuliaGPU/Metal.jl/blob/main/CITATION.cff) file in the
root of this repository lists the relevant papers.
4 changes: 2 additions & 2 deletions docs/src/profiling.md
@@ -18,8 +18,8 @@ Note that the allocations as reported by BenchmarkTools are CPU allocations.

## Application profiling

For profiling large applications, simple timings are insufficient. Instead, we want a
overview of how and when the GPU was active, to avoid times where the device was idle and/or
For profiling large applications, simple timings are insufficient. Instead, we want an
overview of how and when the GPU was active to avoid times where the device was idle and/or
find which kernels need optimization.

As we cannot use the Julia profiler for this task, we will use Metal's GPU profiler directly.
22 changes: 11 additions & 11 deletions docs/src/usage/array.md
@@ -26,18 +26,18 @@ The `MtlArray` type aims to implement the `AbstractArray` interface, and provide
implementations of methods that are commonly used when working with arrays. That means you
can construct `MtlArray`s in the same way as regular `Array` objects:

```julia
```jldoctest
julia> MtlArray{Int}(undef, 2)
2-element MtlVector{Int64}:
2-element MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
0
0

julia> MtlArray{Int}(undef, (1,2))
1×2 MtlMatrix{Int64}:
1×2 MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
0 0

julia> similar(ans)
1×2 MtlMatrix{Int64}:
1×2 MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
0 0
```

@@ -46,7 +46,7 @@ Copying memory to or from the GPU can be expressed using constructors as well, o

```jldoctest
julia> a = MtlArray([1,2])
2-element MtlVector{Int64}:
2-element MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
1
2

@@ -73,11 +73,11 @@ perform simple element-wise operations you can use `map` or `broadcast`:
julia> a = MtlArray{Float32}(undef, (1,2));

julia> a .= 5
1×2 MtlMatrix{Float32}:
1×2 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
5.0 5.0

julia> map(sin, a)
1×2 MtlMatrix{Float32}:
1×2 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
-0.958924 -0.958924
```

@@ -86,23 +86,23 @@ To reduce the dimensionality of arrays, Metal.jl implements the various flavours

```jldoctest
julia> a = Metal.ones(2,3)
2×3 MtlMatrix{Float32}:
2×3 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
1.0 1.0 1.0
1.0 1.0 1.0

julia> reduce(+, a)
6.0f0

julia> mapreduce(sin, *, a; dims=2)
2×1 MtlMatrix{Float32}:
2×1 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
0.59582335
0.59582335

julia> b = Metal.zeros(1)
1-element MtlVector{Float32}:
1-element MtlVector{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
0.0

julia> Base.mapreducedim!(identity, +, b, a)
1×1 MtlMatrix{Float32}:
1×1 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
6.0
```
31 changes: 10 additions & 21 deletions docs/src/metal-programming.md → docs/src/usage/kernel.md
@@ -1,15 +1,4 @@
# Introduction to Metal Programming in Julia

*A quick and dirty introduction to GPU programming on MacOS*

## High-Level Array Operations

This document isn't meant to cover the high-level array operations that Metal.jl enables.
Performing `Array` operations on `MtlArrays` should *just work* if the types are compatible.
For example, `gpu_c .= gpu_a .+ gpu_b`
Thus, this document is more about an introduction to explicit GPU *kernel* programming.

## Kernel Programming
# Kernel programming

Metal.jl is based off of Apple's [Metal Shading Language (MSL)](https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf)
and Metal framework. The interface allows you to utilize the graphics and computing power of
@@ -53,18 +42,18 @@ also query what the grid and threadgroup sizes are as well.

For Metal.jl, these values are accessed via the following functions:

- thread_index_in_threadgroup()
- grid_size_Xd()
- thread_position_in_grid_Xd()
- thread_position_in_threadgroup_Xd()
- threadgroup_position_in_grid_Xd()
- threadgroups_per_grid_Xd()
- threads_per_grid_Xd()
- threads_per_threadgroup_Xd()
- `thread_index_in_threadgroup()`
- `grid_size_Xd()`
- `thread_position_in_grid_Xd()`
- `thread_position_in_threadgroup_Xd()`
- `threadgroup_position_in_grid_Xd()`
- `threadgroups_per_grid_Xd()`
- `threads_per_grid_Xd()`
- `threads_per_threadgroup_Xd()`

*Where 'X' is 1, 2, or 3 according to the number of dimensions requested.*

Using these in a kernel (taken directly from the [vadd example](../../examples/vadd.jl)):
Using these in a kernel (taken directly from the [vadd example](https://github.com/JuliaGPU/Metal.jl/blob/main/examples/vadd.jl)):

```julia
function vadd(a, b, c)
5 changes: 0 additions & 5 deletions docs/src/usage/overview.md
@@ -39,8 +39,3 @@ a = Metal.zeros(1024)
b = Metal.ones(1024)
a.^2 .+ sin.(b)
```

When possible, these operations integrate with existing vendor libraries. For example,
multiplying matrices or generating random numbers will automatically dispatch to these
high-quality libraries, if types are supported, and fall back to generic implementations
otherwise.
15 changes: 11 additions & 4 deletions lib/mps/matrix.jl
@@ -118,8 +118,11 @@ end
"""
matMulMPS(a::MtlMatrix, b::MtlMatrix, c::MtlMatrix, alpha=1, beta=1,
transpose_left=false, transpose_right=false)
A `MPSMatrixMultiplication` kernel thay computes:
`c = alpha * op(a) * beta * op(b) + beta * C`

Perform `c = alpha * op(a) * beta * op(b) + beta * C`.
This function should not typically be used. Rather, use the normal `LinearAlgebra` interface
with any `MtlArray` and it should be accelerated using Metal Performance Shaders.
"""
function matmul!(c::MtlMatrix, a::MtlMatrix, b::MtlMatrix,
alpha::Number=true, beta::Number=true,
@@ -146,7 +149,7 @@ function matmul!(c::MtlMatrix, a::MtlMatrix, b::MtlMatrix,
encode!(cmdbuf, mat_mul_kernel, mps_b, mps_a, mps_c)
commit!(cmdbuf)

c
return c
end

export MPSMatrixFindTopK
@@ -187,6 +190,8 @@ Return the indices in `I` and the values in `V`.

`k` cannot be greater than 16.

Uses `MPSMatrixFindTopK`.

See also: [`topk`](@ref).
"""
function topk!(A::MtlMatrix{T}, I::MtlMatrix{UInt32}, V::MtlMatrix{T}, k) where {T<:MtlFloat}
@@ -197,7 +202,7 @@ function topk!(A::MtlMatrix{T}, I::MtlMatrix{UInt32}, V::MtlMatrix{T}, k) where
@assert size(V,1) >= k "Matrix 'V' must be large enough for k rows"
@assert size(V,2) >= size(A,2) "Matrix 'V' must have at least as many columns as A"

_topk!(A,I,V,k)
return _topk!(A,I,V,k)
end
@inline function _topk!(A::MtlMatrix{T}, I::MtlMatrix{UInt32}, V::MtlMatrix{T}, k) where {T<:MtlFloat}
# Create MPS-compatible matrix from the MtlArrays
@@ -215,7 +220,7 @@ end
encode!(cmdbuf, topk_kernel, mps_a, mps_i, mps_v)
commit!(cmdbuf)

I, V
return I, V
end

"""
@@ -226,6 +231,8 @@ Return the indices in `I` and the values in `V`.

`k` cannot be greater than 16.

Uses `MPSMatrixFindTopK`.

See also: [`topk!`](@ref).
"""
function topk(A::MtlMatrix{T,S}, k) where {T<:MtlFloat,S}