diff --git a/.gitignore b/.gitignore index f33838e..a85dddb 100644 --- a/.gitignore +++ b/.gitignore @@ -154,3 +154,4 @@ docs/site/ # committed for packages, but should be committed for applications that require a static # environment. Manifest.toml +*.json diff --git a/multi-layer-perceptron/avalon_notebook.ipynb b/multi-layer-perceptron/avalon_notebook.ipynb new file mode 100644 index 0000000..caa2c1d --- /dev/null +++ b/multi-layer-perceptron/avalon_notebook.ipynb @@ -0,0 +1,612 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "using Yota;\n", + "using MLDatasets;\n", + "using NNlib;\n", + "using Statistics;\n", + "using Distributions;\n", + "using Functors;\n", + "using Optimisers;\n", + "using MLUtils: DataLoader;\n", + "using OneHotArrays: onehotbatch\n", + "using Metrics;\n", + "using TimerOutputs;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Primitives" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linear " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "mutable struct Linear\n", + " W::AbstractMatrix{T} where T\n", + " b::AbstractVector{T} where T\n", + "end\n", + "\n", + "@functor Linear\n", + "\n", + "# Init\n", + "function Linear(in_features::Int, out_features::Int)\n", + " k_sqrt = sqrt(1 / in_features)\n", + " d = Uniform(-k_sqrt, k_sqrt)\n", + " return Linear(rand(d, out_features, in_features), rand(d, out_features))\n", + "end\n", + "Linear(in_out::Pair{Int, Int}) = Linear(in_out[1], in_out[2])\n", + "\n", + "function Base.show(io::IO, l::Linear)\n", + " o, i = size(l.W)\n", + " print(io, \"Linear($i=>$o)\")\n", + "end\n", + "\n", + "# Forward\n", + "(l::Linear)(x::Union{AbstractVector{T}, AbstractMatrix{T}}) where T = l.W * x .+ l.b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Logit Cross Entropy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "logitcrossentropy (generic function with 1 method)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "function logitcrossentropy(ŷ, y; dims=1, agg=mean)\n", + " # Compute cross entropy loss from logits\n", + " # Cross entropy computed from NLL loss on logsoftmax of model outputs\n", + " agg(.-sum(y .* logsoftmax(ŷ; dims=dims); dims=dims));\n", + " end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define the model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "mutable struct Net\n", + " fc1::Linear\n", + " fc2::Linear\n", + "end\n", + "\n", + "# Need to mark functor for Optimizer to work\n", + "@functor Net\n", + "\n", + "# Init\n", + "Net() = Net(\n", + " Linear(28*28, 100),\n", + " Linear(100, 10)\n", + ")\n", + "\n", + "# Forward\n", + "function (model::Net)(x::AbstractArray)\n", + " x = reshape(x, 28*28, :)\n", + " x = model.fc1(x)\n", + " x = relu(x)\n", + " x = model.fc2(x)\n", + " return x\n", + "end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = MNIST(dir=\"/Users/trevoryu/Code/data/mnist\", split=:train);\n", + 
"test_dataset = MNIST(dir=\"/Users/trevoryu/Code/data/mnist\", split=:test);\n", + "\n", + "X_train = train_dataset.features;\n", + "Y_train = train_dataset.targets;\n", + "\n", + "X_test = test_dataset.features;\n", + "Y_test = test_dataset.targets;" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(28, 28, 10000)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "size(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Flatten features to be 784 dim\n", + "X_train = reshape(X_train, 784, :); # (dim x batch)\n", + "X_test = reshape(X_test, 784, :);" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert targets to one-hot vectors\n", + "Y_train = onehotbatch(Y_train, 0:9);\n", + "Y_test = onehotbatch(Y_test, 0:9); # (dim x batch)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 128;\n", + "train_loader = DataLoader((X_train, Y_train), shuffle=true, batchsize=batch_size);\n", + "test_loader = DataLoader((X_test, Y_test), shuffle=false, batchsize=batch_size);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training setup" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Net(Linear(784=>100), Linear(100=>10))" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Make model\n", + "mlp = Net()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup Adam optimizer\n", + "# Default Β is (0.9, 0.999)\n", + "state = Optimisers.setup(Optimisers.Adam(1e-3), mlp);" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "loss_function (generic function with 1 method)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create objective function to optimize\n", + "function loss_function(model::Net, x::AbstractArray, y::AbstractArray)\n", + " ŷ = model(x)\n", + " loss = logitcrossentropy(ŷ, y)\n", + " return loss\n", + "end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluation function" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "evaluate (generic function with 1 method)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "function evaluate(mlp, test_loader)\n", + " preds = []\n", + " targets = []\n", + " for (x, y) in test_loader\n", + " # Get model predictions\n", + " # Note argmax of nd-array gives CartesianIndex\n", + " # Need to grab the first element of each CartesianIndex to get the true index\n", + " logits = mlp(x)\n", + " ŷ = map(i -> i[1], argmax(logits, dims=1))\n", + " append!(preds, ŷ)\n", + "\n", + " # Get true labels\n", + " true_label = map(i -> i[1], argmax(y, dims=1))\n", + " append!(targets, true_label)\n", + " end\n", + " accuracy = sum(preds .== targets) / length(targets)\n", + " return accuracy\n", + "end" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 
Training loop" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0m\u001b[1m ────────────────────────────────────────────────────────────────────\u001b[22m\n", + "\u001b[0m\u001b[1m \u001b[22m Time Allocations \n", + " ─────────────────────── ────────────────────────\n", + " Tot / % measured: 341ms / 0.0% 45.3MiB / 0.0% \n", + "\n", + " Section ncalls time %tot avg alloc %tot avg\n", + " ────────────────────────────────────────────────────────────────────\n", + "\u001b[0m\u001b[1m ────────────────────────────────────────────────────────────────────\u001b[22m" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Setup timing output\n", + "const to = TimerOutput()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: epoch 1 loss = 0.43098854388539676\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 1 eval accuracy = 0.9329\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 2 loss = 0.20844910683454007\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 2 eval accuracy = 0.9482\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 3 loss = 0.15448442086061948\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 3 eval accuracy = 0.9593\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 4 loss = 0.12039177602707632\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 4 eval accuracy = 0.966\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 5 loss = 0.0983232690863611\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 5 eval accuracy = 0.9675\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 6 loss = 0.08252257340927549\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 6 eval accuracy = 0.972\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 7 loss = 0.07077896451852372\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 7 eval accuracy = 0.9729\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 8 loss = 0.06109750930723943\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 8 eval accuracy = 0.9755\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 9 loss = 0.05307631371444329\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 9 eval accuracy = 0.976\n", + "└ @ Main In[15]:22\n", + "┌ Info: epoch 10 loss = 0.04616427247831039\n", + "└ @ Main In[15]:17\n", + "┌ Info: epoch 10 eval accuracy = 0.9757\n", + "└ @ Main In[15]:22\n" + ] + } + ], + "source": [ + "last_loss = 0;\n", + "@timeit to \"total_training_time\" begin\n", + " for epoch in 1:10\n", + " timing_name = epoch > 1 ? \"average_epoch_training_time\" : \"train_jit\"\n", + " @timeit to timing_name begin\n", + " losses = []\n", + " for (x, y) in train_loader\n", + " # loss_function does forward pass\n", + " # Yota.jl grad function computes model parameter gradients in g[2]\n", + " loss, g = grad(loss_function, mlp, x, y)\n", + " \n", + " # Optimiser updates parameters\n", + " Optimisers.update!(state, mlp, g[2])\n", + " push!(losses, loss)\n", + " end\n", + " last_loss = mean(losses)\n", + " @info(\"epoch $epoch loss = $(mean(losses))\")\n", + " end\n", + " timing_name = epoch > 1 ? 
\"average_inference_time\" : \"eval_jit\"\n", + " @timeit to timing_name begin\n", + " acc = evaluate(mlp, test_loader)\n", + " @info(\"epoch $epoch eval accuracy = $(acc)\")\n", + " end\n", + " end\n", + "end" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0m\u001b[1m ────────────────────────────────────────────────────────────────────────────────\u001b[22m\n", + "\u001b[0m\u001b[1m \u001b[22m Time Allocations \n", + " ─────────────────────── ────────────────────────\n", + " Tot / % measured: 33.6s / 98.1% 29.0GiB / 99.7% \n", + "\n", + " Section ncalls time %tot avg alloc %tot avg\n", + " ────────────────────────────────────────────────────────────────────────────────\n", + " total_training_time 1 32.9s 100.0% 32.9s 28.9GiB 100.0% 28.9GiB\n", + " train_jit 1 26.2s 79.7% 26.2s 10.0GiB 34.8% 10.0GiB\n", + " average_epoch_tr... 9 5.82s 17.7% 646ms 17.6GiB 60.8% 1.95GiB\n", + " eval_jit 1 559ms 1.7% 559ms 283MiB 1.0% 283MiB\n", + " average_inferenc... 9 305ms 0.9% 33.9ms 1.02GiB 3.5% 116MiB\n", + "\u001b[0m\u001b[1m ────────────────────────────────────────────────────────────────────────────────\u001b[22m" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6461179073333333" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train time\n", + "# Exclude jit time\n", + "average_epoch_train_time = TimerOutputs.time(to[\"total_training_time\"][\"average_epoch_training_time\"]) / (9 * 1e9) # Outputs in nanoseconds, conver to seconds" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.42875644444444444" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Eval batch time\n", + "# Exclude jit time\n", + "num_batches = length(test_loader)\n", + "average_eval_batch_time = TimerOutputs.time(to[\"total_training_time\"][\"average_inference_time\"]) / (9 * 1e6 * num_batches) # Outputs in nanoseconds, conver to milliseconds" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9757" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_train_time = TimerOutputs.time(to[\"total_training_time\"]) / 1e9 # Convert nanos to seconds\n", + "final_eval_accuracy = evaluate(mlp, test_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dict{String, Any} with 9 entries:\n", + " \"task\" => \"classification\"\n", + " \"framework_name\" => \"Avalon.jl\"\n", + " \"final_trianing_loss\" => 0.0461643\n", + " \"total_training_time\" => 32.9112\n", + " \"average_epoch_training_time\" => 0.646118\n", + " \"final_evaluation_accuracy\" => 0.9757\n", + " \"model_name\" => \"MLP\"\n", + " \"dataset\" => \"MNIST Digits\"\n", + " \"average_batch_inference_time\" => 0.428756" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics = Dict(\n", + " \"model_name\" => \"MLP\",\n", + " \"dataset\" => \"MNIST Digits\",\n", + " \"framework_name\" => 
\"Avalon.jl\",\n", + " \"task\" => \"classification\",\n", + " \"total_training_time\" => total_train_time,\n", + " \"average_epoch_training_time\" => average_epoch_train_time,\n", + " \"average_batch_inference_time\" => average_eval_batch_time,\n", + " \"final_trianing_loss\" => last_loss,\n", + " \"final_evaluation_accuracy\" => final_eval_accuracy\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "using JSON;\n", + "\n", + "open(\"m1-avalon-mlp.json\",\"w\") do f\n", + " JSON.print(f, metrics)\n", + "end" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 1.8.0", + "language": "julia", + "name": "julia-1.8" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.8.0" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/multi-layer-perceptron/avalon_test.jl b/multi-layer-perceptron/avalon_test.jl new file mode 100644 index 0000000..af2e756 --- /dev/null +++ b/multi-layer-perceptron/avalon_test.jl @@ -0,0 +1,202 @@ +# Avalon.jl implementation of multi-layer perceptron + +# Imports +using Yota; +using MLDatasets; +using NNlib; +using Statistics; +using Distributions; +using Functors; +using Optimisers; +using MLUtils: DataLoader; +using OneHotArrays: onehotbatch +using Metrics; +using TimerOutputs; +using JSON; + + +# Primitives +# Linear layer +mutable struct Linear + W::AbstractMatrix{T} where T + b::AbstractVector{T} where T +end + +@functor Linear + +# Init +function Linear(in_features::Int, out_features::Int) + k_sqrt = sqrt(1 / in_features) + d = Uniform(-k_sqrt, k_sqrt) + return Linear(rand(d, out_features, in_features), rand(d, out_features)) +end +Linear(in_out::Pair{Int, Int}) = Linear(in_out[1], in_out[2]) + +function Base.show(io::IO, l::Linear) + o, i = size(l.W) + print(io, "Linear($i=>$o)") +end + +# Forward +(l::Linear)(x::Union{AbstractVector{T}, AbstractMatrix{T}}) where T = l.W * x .+ l.b + +# Cross entropy loss +function logitcrossentropy(ŷ, y; dims=1, agg=mean) + # Compute cross entropy loss from logits + # Cross entropy computed from NLL loss on logsoftmax of model outputs + agg(.-sum(y .* logsoftmax(ŷ; dims=dims); dims=dims)); +end + + +# Model definition +mutable struct Net + fc1::Linear + fc2::Linear +end + +# Need to mark functor for Optimizer to work +@functor Net + +# Init +Net() = Net( + Linear(28*28, 100), + Linear(100, 10) +) + +# Forward +function (model::Net)(x::AbstractArray) + x = reshape(x, 28*28, :) + x = model.fc1(x) + x = relu(x) + x = model.fc2(x) + return x +end + +# Create objective function to optimize +function loss_function(model::Net, x::AbstractArray, y::AbstractArray) + ŷ = model(x) + loss = logitcrossentropy(ŷ, y) + return loss +end + + +# Evaluation function +function evaluate(mlp::Net, test_loader::DataLoader)::Number + preds = [] + targets = [] + for (x, y) in test_loader + # Get model predictions + # Note argmax of nd-array gives CartesianIndex + # Need to grab the first element of each CartesianIndex to get the true index + logits = mlp(x) + ŷ = map(i -> i[1], argmax(logits, dims=1)) + append!(preds, ŷ) + + # Get true labels + true_label = map(i -> i[1], argmax(y, dims=1)) + append!(targets, true_label) + end + accuracy = sum(preds .== targets) / length(targets) + return accuracy +end + + +# Data loading and processing +function 
get_data_loaders(; batch_size=128)
+    # Data loading
+    train_dataset = MNIST(split=:train);
+    test_dataset = MNIST(split=:test);
+
+    X_train = train_dataset.features;
+    Y_train = train_dataset.targets;
+
+    X_test = test_dataset.features;
+    Y_test = test_dataset.targets;
+
+    # Flatten features to be 784 dim
+    X_train = reshape(X_train, 784, :); # (dim x batch)
+    X_test = reshape(X_test, 784, :);
+
+    # Convert targets to one-hot vectors
+    Y_train = onehotbatch(Y_train, 0:9);
+    Y_test = onehotbatch(Y_test, 0:9); # (dim x batch)
+
+    train_loader = DataLoader((X_train, Y_train), shuffle=true, batchsize=batch_size);
+    test_loader = DataLoader((X_test, Y_test), shuffle=false, batchsize=batch_size);
+    return train_loader, test_loader
+end
+
+# Setup timing
+const to = TimerOutput()
+
+
+function main()
+    train_loader, test_loader = get_data_loaders(batch_size=128)
+
+    # Setup model and optimizer
+    mlp = Net()
+
+    # Default β is (0.9, 0.999)
+    state = Optimisers.setup(Optimisers.Adam(1e-3), mlp);
+
+    # Training loop
+    last_loss = 0;
+    @timeit to "total_training_time" begin
+        for epoch in 1:10
+            # Record the first epoch in a separate timer to isolate JIT compilation
+            timing_name = epoch > 1 ? "train_epoch" : "train_jit"
+            @timeit to timing_name begin
+                losses = []
+                for (x, y) in train_loader
+                    # loss_function does forward pass
+                    # Yota.jl grad function computes model parameter gradients in g[2]
+                    loss, g = grad(loss_function, mlp, x, y)
+
+                    # Optimiser updates parameters
+                    Optimisers.update!(state, mlp, g[2])
+                    push!(losses, loss)
+                end
+                last_loss = mean(losses)
+                @info("epoch $epoch loss = $(mean(losses))")
+            end
+            timing_name = epoch > 1 ? "eval_epoch" : "eval_jit"
+            @timeit to timing_name begin
+                acc = evaluate(mlp, test_loader)
+                @info("epoch $epoch eval accuracy = $(acc)")
+            end
+        end
+    end
+
+    # Compute timing metrics
+    # Outputs in nanoseconds, convert to seconds
+    average_epoch_train_time = TimerOutputs.time(to["total_training_time"]["train_epoch"]) / (9 * 1e9)
+    total_train_time = TimerOutputs.time(to["total_training_time"]) / 1e9
+
+
+    num_batches = length(test_loader)
+    # Outputs in nanoseconds, convert to milliseconds
+    average_eval_batch_time = TimerOutputs.time(to["total_training_time"]["eval_epoch"]) / (9 * 1e6 * num_batches)
+
+    final_eval_accuracy = evaluate(mlp, test_loader)
+
+    metrics = Dict(
+        "model_name" => "MLP",
+        "dataset" => "MNIST Digits",
+        "framework_name" => "Avalon.jl",
+        "task" => "classification",
+        "total_training_time" => total_train_time,
+        "average_epoch_training_time" => average_epoch_train_time,
+        "average_batch_inference_time" => average_eval_batch_time,
+        "final_training_loss" => last_loss,
+        "final_evaluation_accuracy" => final_eval_accuracy
+    )
+    open("m1-avalon-mlp.json","w") do f
+        JSON.print(f, metrics)
+    end
+end
+
+
+# Run main function
+if abspath(PROGRAM_FILE) == @__FILE__
+    main()
+end
\ No newline at end of file
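
A quick way to sanity-check the emitted benchmark file is sketched below. This is not part of the diff above: it assumes avalon_test.jl (or the notebook) has already been run so that m1-avalon-mlp.json exists in the working directory, and it uses only JSON.parsefile from JSON.jl, which the script already depends on.

using JSON

# Load the metrics written by avalon_test.jl / the notebook
metrics = JSON.parsefile("m1-avalon-mlp.json")

# Keys mirror the Dict built in main() above
expected_keys = [
    "model_name", "dataset", "framework_name", "task",
    "total_training_time", "average_epoch_training_time",
    "average_batch_inference_time", "final_training_loss",
    "final_evaluation_accuracy",
]

# Fail loudly if any key is missing or the accuracy is out of range
@assert all(k -> haskey(metrics, k), expected_keys)
@assert 0.0 <= metrics["final_evaluation_accuracy"] <= 1.0
println("metrics OK: ", metrics["framework_name"], " / ", metrics["model_name"])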