diff --git a/torch-neuronx/inference/hf_pretrained_perceiver_multimodal_inference.ipynb b/torch-neuronx/inference/hf_pretrained_perceiver_multimodal_inference.ipynb
index a99e04a..574113c 100644
--- a/torch-neuronx/inference/hf_pretrained_perceiver_multimodal_inference.ipynb
+++ b/torch-neuronx/inference/hf_pretrained_perceiver_multimodal_inference.ipynb
@@ -1,789 +1,798 @@
{
- "cells": [
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## HuggingFace Multimodal Perceiver Inference on Trn1 / Inf2"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Introduction**\n",
- "\n",
- "This notebook demonstrates how to compile and run the HuggingFace Multimodal Perceiver model to classify and autoencode video inputs on Neuron. The script is loosely based on HuggingFace's official tutorial for running inference on the multimodal perceiver at https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Perceiver/Perceiver_for_Multimodal_Autoencoding.ipynb\n",
- "\n",
- "This notebook can be run on the smallest Inf2 instance `inf2.xlarge`"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Verify that this Jupyter notebook is running the Python kernel environment that was set up according to the [PyTorch Installation Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/torch-neuronx.html#setup-torch-neuronx). You can select the kernel from the 'Kernel -> Change Kernel' option on the top of this Jupyter notebook page."
- ]
- },
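- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Optionally, run the sanity-check cell below (it is not part of the original tutorial) to confirm that the selected kernel can import `torch` and `torch_neuronx` before continuing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Optional sanity check (not part of the original tutorial): these imports fail\n",
- "# with an ImportError if the Neuron PyTorch environment is not active.\n",
- "import torch\n",
- "import torch_neuronx\n",
- "print(torch.__version__)"
- ]
- },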
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Install Dependencies**\n",
- "\n",
- "This tutorial requires the following pip packages to be installed:\n",
- "- `torch-neuronx`\n",
- "- `neuronx-cc`\n",
- "- `transformers==4.30.2`\n",
- "- `opencv-python-headless`\n",
- "- `imageio`\n",
- "- `scipy`\n",
- "- `accelerate`\n",
- "Furthermore, it requires the `ffmpeg` video-audio converter which is used to extract audio from the input videos.\n",
- "\n",
- "`torch-neuronx` and `neuronx-cc` should be installed when you configure your environment following the Inf2 setup guide. The remaining dependencies can be installed below:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%env TOKENIZERS_PARALLELISM=True #Supresses tokenizer warnings making errors easier to detect\n",
- "!pip install transformers==4.30.2 opencv-python-headless==4.8.0.74 imageio scipy accelerate opencv-python==4.8.0.74\n",
- "\n",
- "!wget https://johnvansickle.com/ffmpeg/builds/ffmpeg-git-amd64-static.tar.xz\n",
- "!tar xvf ffmpeg-git-amd64-static.tar.xz\n",
- "!mv ffmpeg-git-*-amd64-static/ffmpeg .\n",
- "!rm -rf ffmpeg-git-*-amd64-static ffmpeg-git-amd64-static.tar.xz"
- ]
- },
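- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "As a quick optional check (not part of the original tutorial), the cell below prints the version banner of the static `ffmpeg` binary downloaded above to confirm it runs from the notebook's working directory:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Optional: confirm the static ffmpeg binary extracted above is runnable\n",
- "!./ffmpeg -version | head -n 1"
- ]
- },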
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Imports**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import base64\n",
- "import os\n",
- "import ssl\n",
- "import re\n",
- "from urllib import request\n",
- "import cv2\n",
- "import imageio\n",
- "import time\n",
- "import random\n",
- "from tqdm import tqdm\n",
- "import numpy as np\n",
- "import scipy.io.wavfile\n",
- "from IPython.display import HTML\n",
- "\n",
- "from typing import Optional, Tuple, Union\n",
- "from transformers import PerceiverForMultimodalAutoencoding\n",
- "from transformers.modeling_outputs import BaseModelOutputWithCrossAttentions\n",
- "from transformers.models.perceiver.modeling_perceiver import PerceiverBasicDecoder, PerceiverClassifierOutput\n",
- "from transformers.models.perceiver.modeling_perceiver import restructure\n",
- "import torch\n",
- "import torch.nn as nn\n",
- "import torch_neuronx"
- ]
- },
- {
- "attachments": {},
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Video Preprocessing Utilities**\n",
- "\n",
- "The following code cell defines some useful functions for fetching, preprocessing and visualizing the input video. Most of these are taken directly from HuggingFace's official multimodal perceiver tutorial at https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Perceiver/Perceiver_for_Multimodal_Autoencoding.ipynb. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Utilities to fetch videos from UCF101 dataset\n",
- "UCF_ROOT = 'https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/'\n",
- "_VIDEO_LIST = None\n",
- "_CACHE_DIR_NAME = \"video_cache\"\n",
- "\n",
- "os.makedirs(\"video_cache\", exist_ok=True)\n",
- "# As of July 2020, crcv.ucf.edu doesn't use a certificate accepted by the\n",
- "# default Colab environment anymore.\n",
- "unverified_context = ssl._create_unverified_context()\n",
- "\n",
- "def list_ucf_videos():\n",
- " \"\"\"Lists videos available in UCF101 dataset.\"\"\"\n",
- " global _VIDEO_LIST\n",
- " if not _VIDEO_LIST:\n",
- " index = request.urlopen(UCF_ROOT, context=unverified_context).read().decode('utf-8')\n",
- " videos = re.findall('(v_[\\w_]+\\.avi)', index)\n",
- " _VIDEO_LIST = sorted(set(videos))\n",
- " return list(_VIDEO_LIST)\n",
- "\n",
- "def fetch_ucf_video(video):\n",
- " \"\"\"Fetchs a video and cache into local filesystem.\"\"\"\n",
- " cache_path = os.path.join(_CACHE_DIR_NAME, video)\n",
- " if not os.path.exists(cache_path):\n",
- " urlpath = request.urljoin(UCF_ROOT, video)\n",
- " print('Fetching %s => %s' % (urlpath, cache_path))\n",
- " data = request.urlopen(urlpath, context=unverified_context).read()\n",
- " open(cache_path, \"wb\").write(data)\n",
- " return cache_path\n",
- "\n",
- "# Utilities to open video files using CV2\n",
- "def crop_center_square(frame):\n",
- " y, x = frame.shape[0:2]\n",
- " min_dim = min(y, x)\n",
- " start_x = (x // 2) - (min_dim // 2)\n",
- " start_y = (y // 2) - (min_dim // 2)\n",
- " return frame[start_y:start_y+min_dim,start_x:start_x+min_dim]\n",
- "\n",
- "def load_video(path, max_frames=0, resize=(224, 224)):\n",
- " cap = cv2.VideoCapture(path)\n",
- " frames = []\n",
- " try:\n",
- " while True:\n",
- " ret, frame = cap.read()\n",
- " if not ret:\n",
- " break\n",
- " frame = crop_center_square(frame)\n",
- " frame = cv2.resize(frame, resize)\n",
- " frame = frame[:, :, [2, 1, 0]]\n",
- " frames.append(frame)\n",
- "\n",
- " if len(frames) == max_frames:\n",
- " break\n",
- " finally:\n",
- " cap.release()\n",
- " return np.array(frames) / 255.0\n",
- "\n",
- "def to_gif(images):\n",
- " converted_images = np.clip(images * 255, 0, 255).astype(np.uint8)\n",
- " imageio.mimsave('./animation.gif', converted_images, duration=40, loop=100)\n",
- " with open('./animation.gif', 'rb') as f:\n",
- " gif_64 = base64.b64encode(f.read()).decode('utf-8')\n",
- " return HTML('
' % gif_64)\n",
- "\n",
- "def play_audio(data, sample_rate=48000):\n",
- " scipy.io.wavfile.write('tmp_audio.wav', sample_rate, data)\n",
- "\n",
- " with open('./tmp_audio.wav', 'rb') as f:\n",
- " audio_64 = base64.b64encode(f.read()).decode('utf-8')\n",
- " return HTML('