diff --git a/.gitignore b/.gitignore index f33838e..a85dddb 100644 --- a/.gitignore +++ b/.gitignore @@ -154,3 +154,4 @@ docs/site/ # committed for packages, but should be committed for applications that require a static # environment. Manifest.toml +*.json diff --git a/multi-layer-perceptron/avalon_notebook.ipynb b/multi-layer-perceptron/avalon_notebook.ipynb new file mode 100644 index 0000000..caa2c1d --- /dev/null +++ b/multi-layer-perceptron/avalon_notebook.ipynb @@ -0,0 +1,612 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "using Yota;\n", + "using MLDatasets;\n", + "using NNlib;\n", + "using Statistics;\n", + "using Distributions;\n", + "using Functors;\n", + "using Optimisers;\n", + "using MLUtils: DataLoader;\n", + "using OneHotArrays: onehotbatch\n", + "using Metrics;\n", + "using TimerOutputs;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linear " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "mutable struct Linear\n", + " W::AbstractMatrix{T} where T\n", + " b::AbstractVector{T} where T\n", + "end\n", + "\n", + "@functor Linear\n", + "\n", + "# Init\n", + "function Linear(in_features::Int, out_features::Int)\n", + " k_sqrt = sqrt(1 / in_features)\n", + " d = Uniform(-k_sqrt, k_sqrt)\n", + " return Linear(rand(d, out_features, in_features), rand(d, out_features))\n", + "end\n", + "Linear(in_out::Pair{Int, Int}) = Linear(in_out[1], in_out[2])\n", + "\n", + "function Base.show(io::IO, l::Linear)\n", + " o, i = size(l.W)\n", + " print(io, \"Linear($i=>$o)\")\n", + "end\n", + "\n", + "# Forward\n", + "(l::Linear)(x::Union{AbstractVector{T}, AbstractMatrix{T}}) where T = l.W * x .+ l.b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Logit Cross Entropy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "logitcrossentropy (generic function with 1 method)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "function logitcrossentropy(ŷ, y; dims=1, agg=mean)\n", + " # Compute cross entropy loss from logits\n", + " # Cross entropy computed from NLL loss on logsoftmax of model outputs\n", + " agg(.-sum(y .* logsoftmax(ŷ; dims=dims); dims=dims));\n", + " end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define the model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "mutable struct Net\n", + " fc1::Linear\n", + " fc2::Linear\n", + "end\n", + "\n", + "# Need to mark functor for Optimizer to work\n", + "@functor Net\n", + "\n", + "# Init\n", + "Net() = Net(\n", + " Linear(28*28, 100),\n", + " Linear(100, 10)\n", + ")\n", + "\n", + "# Forward\n", + "function (model::Net)(x::AbstractArray)\n", + " x = reshape(x, 28*28, :)\n", + " x = model.fc1(x)\n", + " x = relu(x)\n", + " x = model.fc2(x)\n", + " return x\n", + "end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = MNIST(dir=\"/Users/trevoryu/Code/data/mnist\", split=:train);\n", + 
"test_dataset = MNIST(dir=\"/Users/trevoryu/Code/data/mnist\", split=:test);\n", + "\n", + "X_train = train_dataset.features;\n", + "Y_train = train_dataset.targets;\n", + "\n", + "X_test = test_dataset.features;\n", + "Y_test = test_dataset.targets;" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(28, 28, 10000)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "size(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Flatten features to be 784 dim\n", + "X_train = reshape(X_train, 784, :); # (dim x batch)\n", + "X_test = reshape(X_test, 784, :);" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert targets to one-hot vectors\n", + "Y_train = onehotbatch(Y_train, 0:9);\n", + "Y_test = onehotbatch(Y_test, 0:9); # (dim x batch)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 128;\n", + "train_loader = DataLoader((X_train, Y_train), shuffle=true, batchsize=batch_size);\n", + "test_loader = DataLoader((X_test, Y_test), shuffle=false, batchsize=batch_size);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training setup" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Net(Linear(784=>100), Linear(100=>10))" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Make model\n", + "mlp = Net()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup Adam optimizer\n", + "# Default Β is (0.9, 0.999)\n", + "state = Optimisers.setup(Optimisers.Adam(1e-3), mlp);" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "loss_function (generic function with 1 method)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create objective function to optimize\n", + "function loss_function(model::Net, x::AbstractArray, y::AbstractArray)\n", + " ŷ = model(x)\n", + " loss = logitcrossentropy(ŷ, y)\n", + " return loss\n", + "end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluation function" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "evaluate (generic function with 1 method)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "function evaluate(mlp, test_loader)\n", + " preds = []\n", + " targets = []\n", + " for (x, y) in test_loader\n", + " # Get model predictions\n", + " # Note argmax of nd-array gives CartesianIndex\n", + " # Need to grab the first element of each CartesianIndex to get the true index\n", + " logits = mlp(x)\n", + " ŷ = map(i -> i[1], argmax(logits, dims=1))\n", + " append!(preds, ŷ)\n", + "\n", + " # Get true labels\n", + " true_label = map(i -> i[1], argmax(y, dims=1))\n", + " append!(targets, true_label)\n", + " end\n", + " accuracy = sum(preds .== targets) / length(targets)\n", + " return accuracy\n", + "end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 
Training loop" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0m\u001b[1m ────────────────────────────────────────────────────────────────────\u001b[22m\n", + "\u001b[0m\u001b[1m \u001b[22m Time Allocations \n", + " ─────────────────────── ────────────────────────\n", + " Tot / % measured: 341ms / 0.0% 45.3MiB / 0.0% \n", + "\n", + " Section ncalls time %tot avg alloc %tot avg\n", + " ────────────────────────────────────────────────────────────────────\n", + "\u001b[0m\u001b[1m ────────────────────────────────────────────────────────────────────\u001b[22m" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Setup timing output\n", + "const to = TimerOutput()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: epoch 1 loss = 0.43098854388539676\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 1 eval accuracy = 0.9329\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 2 loss = 0.20844910683454007\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 2 eval accuracy = 0.9482\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 3 loss = 0.15448442086061948\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 3 eval accuracy = 0.9593\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 4 loss = 0.12039177602707632\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 4 eval accuracy = 0.966\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 5 loss = 0.0983232690863611\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 5 eval accuracy = 0.9675\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 6 loss = 0.08252257340927549\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 6 eval accuracy = 0.972\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 7 loss = 0.07077896451852372\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 7 eval accuracy = 0.9729\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 8 loss = 0.06109750930723943\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 8 eval accuracy = 0.9755\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 9 loss = 0.05307631371444329\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 9 eval accuracy = 0.976\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 10 loss = 0.04616427247831039\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 10 eval accuracy = 0.9757\n", + "└ @ Main In[15]:22\n" + ] + } + ], + "source": [ + "last_loss = 0;\n", + "@timeit to \"total_training_time\" begin\n", + " for epoch in 1:10\n", + " timing_name = epoch > 1 ? \"average_epoch_training_time\" : \"train_jit\"\n", + " @timeit to timing_name begin\n", + " losses = []\n", + " for (x, y) in train_loader\n", + " # loss_function does forward pass\n", + " # Yota.jl grad function computes model parameter gradients in g[2]\n", + " loss, g = grad(loss_function, mlp, x, y)\n", + " \n", + " # Optimiser updates parameters\n", + " Optimisers.update!(state, mlp, g[2])\n", + " push!(losses, loss)\n", + " end\n", + " last_loss = mean(losses)\n", + " @info(\"epoch $epoch loss = $(mean(losses))\")\n", + " end\n", + " timing_name = epoch > 1 ? 
\"average_inference_time\" : \"eval_jit\"\n", + " @timeit to timing_name begin\n", + " acc = evaluate(mlp, test_loader)\n", + " @info(\"epoch $epoch eval accuracy = $(acc)\")\n", + " end\n", + " end\n", + "end" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0m\u001b[1m ────────────────────────────────────────────────────────────────────────────────\u001b[22m\n", + "\u001b[0m\u001b[1m \u001b[22m Time Allocations \n", + " ─────────────────────── ────────────────────────\n", + " Tot / % measured: 33.6s / 98.1% 29.0GiB / 99.7% \n", + "\n", + " Section ncalls time %tot avg alloc %tot avg\n", + " ────────────────────────────────────────────────────────────────────────────────\n", + " total_training_time 1 32.9s 100.0% 32.9s 28.9GiB 100.0% 28.9GiB\n", + " train_jit 1 26.2s 79.7% 26.2s 10.0GiB 34.8% 10.0GiB\n", + " average_epoch_tr... 9 5.82s 17.7% 646ms 17.6GiB 60.8% 1.95GiB\n", + " eval_jit 1 559ms 1.7% 559ms 283MiB 1.0% 283MiB\n", + " average_inferenc... 9 305ms 0.9% 33.9ms 1.02GiB 3.5% 116MiB\n", + "\u001b[0m\u001b[1m ────────────────────────────────────────────────────────────────────────────────\u001b[22m" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6461179073333333" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train time\n", + "# Exclude jit time\n", + "average_epoch_train_time = TimerOutputs.time(to[\"total_training_time\"][\"average_epoch_training_time\"]) / (9 * 1e9) # Outputs in nanoseconds, conver to seconds" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.42875644444444444" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Eval batch time\n", + "# Exclude jit time\n", + "num_batches = length(test_loader)\n", + "average_eval_batch_time = TimerOutputs.time(to[\"total_training_time\"][\"average_inference_time\"]) / (9 * 1e6 * num_batches) # Outputs in nanoseconds, conver to milliseconds" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9757" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_train_time = TimerOutputs.time(to[\"total_training_time\"]) / 1e9 # Convert nanos to seconds\n", + "final_eval_accuracy = evaluate(mlp, test_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dict{String, Any} with 9 entries:\n", + " \"task\" => \"classification\"\n", + " \"framework_name\" => \"Avalon.jl\"\n", + " \"final_trianing_loss\" => 0.0461643\n", + " \"total_training_time\" => 32.9112\n", + " \"average_epoch_training_time\" => 0.646118\n", + " \"final_evaluation_accuracy\" => 0.9757\n", + " \"model_name\" => \"MLP\"\n", + " \"dataset\" => \"MNIST Digits\"\n", + " \"average_batch_inference_time\" => 0.428756" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics = Dict(\n", + " \"model_name\" => \"MLP\",\n", + " \"dataset\" => \"MNIST Digits\",\n", + " \"framework_name\" => 
\"Avalon.jl\",\n", + " \"task\" => \"classification\",\n", + " \"total_training_time\" => total_train_time,\n", + " \"average_epoch_training_time\" => average_epoch_train_time,\n", + " \"average_batch_inference_time\" => average_eval_batch_time,\n", + " \"final_trianing_loss\" => last_loss,\n", + " \"final_evaluation_accuracy\" => final_eval_accuracy\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "using JSON;\n", + "\n", + "open(\"m1-avalon-mlp.json\",\"w\") do f\n", + " JSON.print(f, metrics)\n", + "end" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 1.8.0", + "language": "julia", + "name": "julia-1.8" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.8.0" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/multi-layer-perceptron/avalon_test.jl b/multi-layer-perceptron/avalon_test.jl new file mode 100644 index 0000000..af2e756 --- /dev/null +++ b/multi-layer-perceptron/avalon_test.jl @@ -0,0 +1,202 @@ +# Avalon.jl implementation of multi-layer perceptron + +# Imports +using Yota; +using MLDatasets; +using NNlib; +using Statistics; +using Distributions; +using Functors; +using Optimisers; +using MLUtils: DataLoader; +using OneHotArrays: onehotbatch +using Metrics; +using TimerOutputs; +using JSON; + + +# Primitives +# Linear layer +mutable struct Linear + W::AbstractMatrix{T} where T + b::AbstractVector{T} where T +end + +@functor Linear + +# Init +function Linear(in_features::Int, out_features::Int) + k_sqrt = sqrt(1 / in_features) + d = Uniform(-k_sqrt, k_sqrt) + return Linear(rand(d, out_features, in_features), rand(d, out_features)) +end +Linear(in_out::Pair{Int, Int}) = Linear(in_out[1], in_out[2]) + +function Base.show(io::IO, l::Linear) + o, i = size(l.W) + print(io, "Linear($i=>$o)") +end + +# Forward +(l::Linear)(x::Union{AbstractVector{T}, AbstractMatrix{T}}) where T = l.W * x .+ l.b + +# Cross entropy loss +function logitcrossentropy(ŷ, y; dims=1, agg=mean) + # Compute cross entropy loss from logits + # Cross entropy computed from NLL loss on logsoftmax of model outputs + agg(.-sum(y .* logsoftmax(ŷ; dims=dims); dims=dims)); +end + + +# Model definition +mutable struct Net + fc1::Linear + fc2::Linear +end + +# Need to mark functor for Optimizer to work +@functor Net + +# Init +Net() = Net( + Linear(28*28, 100), + Linear(100, 10) +) + +# Forward +function (model::Net)(x::AbstractArray) + x = reshape(x, 28*28, :) + x = model.fc1(x) + x = relu(x) + x = model.fc2(x) + return x +end + +# Create objective function to optimize +function loss_function(model::Net, x::AbstractArray, y::AbstractArray) + ŷ = model(x) + loss = logitcrossentropy(ŷ, y) + return loss +end + + +# Evaluation function +function evaluate(mlp::Net, test_loader::DataLoader)::Number + preds = [] + targets = [] + for (x, y) in test_loader + # Get model predictions + # Note argmax of nd-array gives CartesianIndex + # Need to grab the first element of each CartesianIndex to get the true index + logits = mlp(x) + ŷ = map(i -> i[1], argmax(logits, dims=1)) + append!(preds, ŷ) + + # Get true labels + true_label = map(i -> i[1], argmax(y, dims=1)) + append!(targets, true_label) + end + accuracy = sum(preds .== targets) / length(targets) + return accuracy +end + + +# Data loading and processing +function 
get_data_loaders(; batch_size=128)
+    # Data loading
+    train_dataset = MNIST(split=:train);
+    test_dataset = MNIST(split=:test);
+
+    X_train = train_dataset.features;
+    Y_train = train_dataset.targets;
+
+    X_test = test_dataset.features;
+    Y_test = test_dataset.targets;
+
+    # Flatten features to be 784 dim
+    X_train = reshape(X_train, 784, :); # (dim x batch)
+    X_test = reshape(X_test, 784, :);
+
+    # Convert targets to one-hot vectors
+    Y_train = onehotbatch(Y_train, 0:9);
+    Y_test = onehotbatch(Y_test, 0:9); # (dim x batch)
+
+    train_loader = DataLoader((X_train, Y_train), shuffle=true, batchsize=batch_size);
+    test_loader = DataLoader((X_test, Y_test), shuffle=false, batchsize=batch_size);
+    return train_loader, test_loader
+end
+
+# Setup timing
+const to = TimerOutput()
+
+
+function main()
+    train_loader, test_loader = get_data_loaders(batch_size=128)
+
+    # Setup model and optimizer
+    mlp = Net()
+
+    # Default β is (0.9, 0.999)
+    state = Optimisers.setup(Optimisers.Adam(1e-3), mlp);
+
+    # Training loop
+    last_loss = 0;
+    @timeit to "total_training_time" begin
+        for epoch in 1:10
+            # Record the first epoch in a separate timer to isolate JIT compilation
+            timing_name = epoch > 1 ? "train_epoch" : "train_jit"
+            @timeit to timing_name begin
+                losses = []
+                for (x, y) in train_loader
+                    # loss_function does forward pass
+                    # Yota.jl grad function computes model parameter gradients in g[2]
+                    loss, g = grad(loss_function, mlp, x, y)
+
+                    # Optimiser updates parameters
+                    Optimisers.update!(state, mlp, g[2])
+                    push!(losses, loss)
+                end
+                last_loss = mean(losses)
+                @info("epoch $epoch loss = $(mean(losses))")
+            end
+            timing_name = epoch > 1 ? "eval_epoch" : "eval_jit"
+            @timeit to timing_name begin
+                acc = evaluate(mlp, test_loader)
+                @info("epoch $epoch eval accuracy = $(acc)")
+            end
+        end
+    end
+
+    # Compute timing metrics
+    # Outputs in nanoseconds, convert to seconds
+    average_epoch_train_time = TimerOutputs.time(to["total_training_time"]["train_epoch"]) / (9 * 1e9)
+    total_train_time = TimerOutputs.time(to["total_training_time"]) / 1e9
+
+
+    num_batches = length(test_loader)
+    # Outputs in nanoseconds, convert to milliseconds
+    average_eval_batch_time = TimerOutputs.time(to["total_training_time"]["eval_epoch"]) / (9 * 1e6 * num_batches)
+
+    final_eval_accuracy = evaluate(mlp, test_loader)
+
+    metrics = Dict(
+        "model_name" => "MLP",
+        "dataset" => "MNIST Digits",
+        "framework_name" => "Avalon.jl",
+        "task" => "classification",
+        "total_training_time" => total_train_time,
+        "average_epoch_training_time" => average_epoch_train_time,
+        "average_batch_inference_time" => average_eval_batch_time,
+        "final_training_loss" => last_loss,
+        "final_evaluation_accuracy" => final_eval_accuracy
+    )
+    open("m1-avalon-mlp.json","w") do f
+        JSON.print(f, metrics)
+    end
+end
+
+
+# Run main function
+if abspath(PROGRAM_FILE) == @__FILE__
+    main()
+end
\ No newline at end of file
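
A quick way to sanity-check the emitted benchmark file is sketched below. This is not part of the diff above: it assumes avalon_test.jl (or the notebook) has already been run so that m1-avalon-mlp.json exists in the working directory, and it uses only JSON.parsefile from JSON.jl, which the script already depends on.

using JSON

# Load the metrics written by avalon_test.jl / the notebook
metrics = JSON.parsefile("m1-avalon-mlp.json")

# Keys mirror the Dict built in main() above
expected_keys = [
    "model_name", "dataset", "framework_name", "task",
    "total_training_time", "average_epoch_training_time",
    "average_batch_inference_time", "final_training_loss",
    "final_evaluation_accuracy",
]

# Fail loudly if any key is missing or the accuracy is out of range
@assert all(k -> haskey(metrics, k), expected_keys)
@assert 0.0 <= metrics["final_evaluation_accuracy"] <= 1.0
println("metrics OK: ", metrics["framework_name"], " / ", metrics["model_name"])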