diff --git a/Getting_Started.ipynb b/Getting_Started.ipynb
index 3d52dc6..3fb55f9 100644
--- a/Getting_Started.ipynb
+++ b/Getting_Started.ipynb
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -106,7 +106,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -151,7 +151,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -159,7 +159,7 @@
      "output_type": "stream",
      "text": [
       "🔧 Initializing MARVIS t-SNE classifier for tabular data...\n",
-      "📁 Created temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_demo_seap38lk\n",
+      "📁 Created temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_demo_vefuxyya\n",
       "✅ Tabular classifier initialized!\n"
      ]
     }
@@ -192,7 +192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -201,8 +201,8 @@
      "text": [
       "🏋️ Training MARVIS on tabular data...\n",
       "[t-SNE] Computing 46 nearest neighbors...\n",
-      "[t-SNE] Indexed 200 samples in 0.000s...\n",
-      "[t-SNE] Computed neighbors for 200 samples in 0.002s...\n",
+      "[t-SNE] Indexed 200 samples in 0.001s...\n",
+      "[t-SNE] Computed neighbors for 200 samples in 0.006s...\n",
       "[t-SNE] Computed conditional probabilities for sample 200 / 200\n",
       "[t-SNE] Mean sigma: 4.328593\n",
       "[t-SNE] KL divergence after 250 iterations with early exaggeration: 55.566055\n",
@@ -228,7 +228,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -236,44 +236,7 @@
      "output_type": "stream",
      "text": [
       "🔮 Making predictions on tabular test data...\n",
-      "📁 Using temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_demo_seap38lk\n"
-     ]
-    },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mKeyboardInterrupt\u001b[39m                         Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[56]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m      4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m📁 Using temp directory: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtemp_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m      6\u001b[39m \u001b[38;5;66;03m# Evaluate with detailed results and save outputs - use evaluate() to get accuracy\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m tabular_results = \u001b[43mtabular_classifier\u001b[49m\u001b[43m.\u001b[49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m      8\u001b[39m \u001b[43m    \u001b[49m\u001b[43mX_test_tab\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# Use first 10 test samples for demo\u001b[39;49;00m\n\u001b[32m      9\u001b[39m \u001b[43m    \u001b[49m\u001b[43my_test_tab\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     10\u001b[39m \u001b[43m    \u001b[49m\u001b[43mreturn_detailed\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m     11\u001b[39m \u001b[43m    \u001b[49m\u001b[43msave_outputs\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m     12\u001b[39m \u001b[43m    \u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtemp_dir\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     13\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m     15\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m📈 Tabular Classification Results:\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m     16\u001b[39m accuracy = 
tabular_results.get(\u001b[33m'\u001b[39m\u001b[33maccuracy\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mN/A\u001b[39m\u001b[33m'\u001b[39m)\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/marvis_tsne.py:1497\u001b[39m, in \u001b[36mevaluate\u001b[39m\u001b[34m(self, X_test, y_test, return_detailed, save_outputs, output_dir, visualization_save_cadence)\u001b[39m\n\u001b[32m   1494\u001b[39m try:\n\u001b[32m   1495\u001b[39m     from marvis.utils.resource_manager import get_resource_manager\n\u001b[32m-> \u001b[39m\u001b[32m1497\u001b[39m     # Determine dataset identifier for caching\n\u001b[32m   1498\u001b[39m     dataset_id = kwargs.get(\"dataset_name\", \"\")\n\u001b[32m   1499\u001b[39m     if \"dataset_info\" in kwargs and kwargs[\"dataset_info\"]:\n\u001b[32m   1500\u001b[39m         # Prefer task_id if available\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/marvis_tsne.py:1430\u001b[39m, in \u001b[36mpredict\u001b[39m\u001b[34m(self, X_test, y_test, return_detailed, save_outputs, output_dir, visualization_save_cadence)\u001b[39m\n\u001b[32m   1413\u001b[39m         self.logger.info(\"Creating 2D classification t-SNE visualization...\")\n\u001b[32m   1414\u001b[39m         self.train_tsne, self.test_tsne, base_fig = viz_methods[\n\u001b[32m   1415\u001b[39m             \"create_tsne_visualization\"\n\u001b[32m   1416\u001b[39m         ](\n\u001b[32m   (...)\u001b[39m\u001b[32m   1427\u001b[39m             },\n\u001b[32m   1428\u001b[39m         )\n\u001b[32m-> \u001b[39m\u001b[32m1430\u001b[39m # Close base figure to save memory\n\u001b[32m   1431\u001b[39m plt.close(base_fig)\n\u001b[32m   1433\u001b[39m # Set up class/target information based on task type\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/process_one_sample.py:443\u001b[39m, in \u001b[36mprocess_one_sample\u001b[39m\u001b[34m(classifier_instance, sample_index, viz_methods, viewing_angles, save_outputs, visualization_save_cadence, return_detailed, y_test, prediction_details, all_classes)\u001b[39m\n\u001b[32m      0\u001b[39m <Error retrieving source code with stack_data see ipython/ipython#13598>\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/process_one_sample.py:66\u001b[39m, in \u001b[36m_generate_vlm_response\u001b[39m\u001b[34m(classifier_instance, image, prompt)\u001b[39m\n\u001b[32m     61\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Generate VLM response with consistent configuration.\"\"\"\u001b[39;00m\n\u001b[32m     62\u001b[39m conversation = create_vlm_conversation(image, prompt)\n\u001b[32m     64\u001b[39m gen_config = GenerationConfig(\n\u001b[32m     65\u001b[39m     max_new_tokens=\u001b[32m16384\u001b[39m,\n\u001b[32m---> \u001b[39m\u001b[32m66\u001b[39m     temperature=\u001b[32m0.1\u001b[39m,\n\u001b[32m     67\u001b[39m     do_sample=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m     68\u001b[39m     enable_thinking=classifier_instance.enable_thinking\n\u001b[32m     69\u001b[39m     \u001b[38;5;129;01mand\u001b[39;00m classifier_instance.is_api_model,\n\u001b[32m     70\u001b[39m     thinking_summary=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m     71\u001b[39m )\n\u001b[32m     73\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m classifier_instance.vlm_wrapper.generate_from_conversation(\n\u001b[32m     74\u001b[39m     conversation, gen_config\n\u001b[32m     75\u001b[39m )\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/utils/model_loader.py:728\u001b[39m, in \u001b[36mgenerate_from_conversation\u001b[39m\u001b[34m(self, conversation, config)\u001b[39m\n\u001b[32m      0\u001b[39m <Error retrieving source code with stack_data see ipython/ipython#13598>\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/utils/_contextlib.py:120\u001b[39m, in \u001b[36mcontext_decorator.<locals>.decorate_context\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    117\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m    118\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdecorate_context\u001b[39m(*args, **kwargs):\n\u001b[32m    119\u001b[39m     \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[32m--> \u001b[39m\u001b[32m120\u001b[39m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/generation/utils.py:2617\u001b[39m, in \u001b[36mGenerationMixin.generate\u001b[39m\u001b[34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, custom_generate, **kwargs)\u001b[39m\n\u001b[32m   2609\u001b[39m     input_ids, model_kwargs = \u001b[38;5;28mself\u001b[39m._expand_inputs_for_generation(\n\u001b[32m   2610\u001b[39m         input_ids=input_ids,\n\u001b[32m   2611\u001b[39m         expand_size=generation_config.num_return_sequences,\n\u001b[32m   2612\u001b[39m         is_encoder_decoder=\u001b[38;5;28mself\u001b[39m.config.is_encoder_decoder,\n\u001b[32m   2613\u001b[39m         **model_kwargs,\n\u001b[32m   2614\u001b[39m     )\n\u001b[32m   2616\u001b[39m     \u001b[38;5;66;03m# 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m2617\u001b[39m     result = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sample\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m   2618\u001b[39m \u001b[43m        \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2619\u001b[39m \u001b[43m        \u001b[49m\u001b[43mlogits_processor\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprepared_logits_processor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2620\u001b[39m \u001b[43m        \u001b[49m\u001b[43mstopping_criteria\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprepared_stopping_criteria\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2621\u001b[39m \u001b[43m        \u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2622\u001b[39m \u001b[43m        
\u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[43m=\u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2623\u001b[39m \u001b[43m        \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstreamer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2624\u001b[39m \u001b[43m        \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   2625\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   2627\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m generation_mode \u001b[38;5;129;01min\u001b[39;00m (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):\n\u001b[32m   2628\u001b[39m     \u001b[38;5;66;03m# 11. interleave input_ids with `num_beams` additional sequences per batch\u001b[39;00m\n\u001b[32m   2629\u001b[39m     input_ids, model_kwargs = \u001b[38;5;28mself\u001b[39m._expand_inputs_for_generation(\n\u001b[32m   2630\u001b[39m         input_ids=input_ids,\n\u001b[32m   2631\u001b[39m         expand_size=generation_config.num_beams,\n\u001b[32m   2632\u001b[39m         is_encoder_decoder=\u001b[38;5;28mself\u001b[39m.config.is_encoder_decoder,\n\u001b[32m   2633\u001b[39m         **model_kwargs,\n\u001b[32m   2634\u001b[39m     )\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/generation/utils.py:3598\u001b[39m, in \u001b[36mGenerationMixin._sample\u001b[39m\u001b[34m(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)\u001b[39m\n\u001b[32m   3595\u001b[39m model_inputs.update({\u001b[33m\"\u001b[39m\u001b[33moutput_hidden_states\u001b[39m\u001b[33m\"\u001b[39m: output_hidden_states} \u001b[38;5;28;01mif\u001b[39;00m output_hidden_states \u001b[38;5;28;01melse\u001b[39;00m {})\n\u001b[32m   3597\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_prefill:\n\u001b[32m-> \u001b[39m\u001b[32m3598\u001b[39m     outputs = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m   3599\u001b[39m     is_prefill = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m   3600\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1771\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m   1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m   1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m   1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m   1782\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m   1783\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m   1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/utils/generic.py:959\u001b[39m, in \u001b[36mcan_return_tuple.<locals>.wrapper\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m    957\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m return_dict_passed \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m    958\u001b[39m     return_dict = return_dict_passed\n\u001b[32m--> \u001b[39m\u001b[32m959\u001b[39m output = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    960\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m return_dict \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(output, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[32m    961\u001b[39m     output = output.to_tuple()\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1493\u001b[39m, in \u001b[36mQwen2_5_VLForConditionalGeneration.forward\u001b[39m\u001b[34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts, logits_to_keep, **kwargs)\u001b[39m\n\u001b[32m   1488\u001b[39m output_attentions = output_attentions \u001b[38;5;28;01mif\u001b[39;00m output_attentions \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m.config.output_attentions\n\u001b[32m   1489\u001b[39m output_hidden_states = (\n\u001b[32m   1490\u001b[39m     output_hidden_states \u001b[38;5;28;01mif\u001b[39;00m output_hidden_states \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m.config.output_hidden_states\n\u001b[32m   1491\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1493\u001b[39m outputs = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m   1494\u001b[39m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1495\u001b[39m \u001b[43m    \u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1496\u001b[39m \u001b[43m    \u001b[49m\u001b[43mpixel_values_videos\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpixel_values_videos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1497\u001b[39m \u001b[43m    
\u001b[49m\u001b[43mimage_grid_thw\u001b[49m\u001b[43m=\u001b[49m\u001b[43mimage_grid_thw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1498\u001b[39m \u001b[43m    \u001b[49m\u001b[43mvideo_grid_thw\u001b[49m\u001b[43m=\u001b[49m\u001b[43mvideo_grid_thw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1499\u001b[39m \u001b[43m    \u001b[49m\u001b[43msecond_per_grid_ts\u001b[49m\u001b[43m=\u001b[49m\u001b[43msecond_per_grid_ts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1500\u001b[39m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1501\u001b[39m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1502\u001b[39m \u001b[43m    \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1503\u001b[39m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1504\u001b[39m \u001b[43m    \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m=\u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1505\u001b[39m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m=\u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1506\u001b[39m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m=\u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1507\u001b[39m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m   1508\u001b[39m \u001b[43m    \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1509\u001b[39m \u001b[43m    
\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1510\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1512\u001b[39m hidden_states = outputs[\u001b[32m0\u001b[39m]\n\u001b[32m   1514\u001b[39m \u001b[38;5;66;03m# Only compute necessary logits, and do not upcast them to float if we are not computing the loss\u001b[39;00m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1771\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m   1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m   1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m   1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m   1782\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m   1783\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m   1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1275\u001b[39m, in \u001b[36mQwen2_5_VLModel.forward\u001b[39m\u001b[34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts, **kwargs)\u001b[39m\n\u001b[32m   1272\u001b[39m     inputs_embeds = \u001b[38;5;28mself\u001b[39m.get_input_embeddings()(input_ids)\n\u001b[32m   1274\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m pixel_values \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1275\u001b[39m     image_embeds = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_image_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mimage_grid_thw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1276\u001b[39m     image_embeds = torch.cat(image_embeds, dim=\u001b[32m0\u001b[39m).to(inputs_embeds.device, inputs_embeds.dtype)\n\u001b[32m   1277\u001b[39m     image_mask, _ = \u001b[38;5;28mself\u001b[39m.get_placeholder_mask(\n\u001b[32m   1278\u001b[39m         input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds\n\u001b[32m   1279\u001b[39m     )\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1188\u001b[39m, in \u001b[36mQwen2_5_VLModel.get_image_features\u001b[39m\u001b[34m(self, pixel_values, image_grid_thw)\u001b[39m\n\u001b[32m   1178\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m   1179\u001b[39m \u001b[33;03mEncodes images into continuous embeddings that can be forwarded to the language model.\u001b[39;00m\n\u001b[32m   1180\u001b[39m \n\u001b[32m   (...)\u001b[39m\u001b[32m   1185\u001b[39m \u001b[33;03m        The temporal, height and width of feature shape of each image in LLM.\u001b[39;00m\n\u001b[32m   1186\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m   1187\u001b[39m pixel_values = pixel_values.type(\u001b[38;5;28mself\u001b[39m.visual.dtype)\n\u001b[32m-> \u001b[39m\u001b[32m1188\u001b[39m image_embeds = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mvisual\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrid_thw\u001b[49m\u001b[43m=\u001b[49m\u001b[43mimage_grid_thw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1189\u001b[39m split_sizes = (image_grid_thw.prod(-\u001b[32m1\u001b[39m) // \u001b[38;5;28mself\u001b[39m.visual.spatial_merge_size**\u001b[32m2\u001b[39m).tolist()\n\u001b[32m   1190\u001b[39m image_embeds = torch.split(image_embeds, split_sizes)\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1771\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m   1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m   1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m   1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m   1782\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m   1783\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m   1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:480\u001b[39m, in \u001b[36mQwen2_5_VisionTransformerPretrainedModel.forward\u001b[39m\u001b[34m(self, hidden_states, grid_thw, **kwargs)\u001b[39m\n\u001b[32m    477\u001b[39m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    478\u001b[39m         cu_seqlens_now = cu_window_seqlens\n\u001b[32m--> \u001b[39m\u001b[32m480\u001b[39m     hidden_states = \u001b[43mblk\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    481\u001b[39m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    482\u001b[39m \u001b[43m        \u001b[49m\u001b[43mcu_seqlens\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcu_seqlens_now\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    483\u001b[39m \u001b[43m        \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m=\u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    484\u001b[39m \u001b[43m        \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    485\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    487\u001b[39m hidden_states = \u001b[38;5;28mself\u001b[39m.merger(hidden_states)\n\u001b[32m    488\u001b[39m reverse_indices = torch.argsort(window_index)\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/modeling_layers.py:93\u001b[39m, in \u001b[36mGradientCheckpointingLayer.__call__\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m     90\u001b[39m         logger.warning(message)\n\u001b[32m     92\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._gradient_checkpointing_func(partial(\u001b[38;5;28msuper\u001b[39m().\u001b[34m__call__\u001b[39m, **kwargs), *args)\n\u001b[32m---> \u001b[39m\u001b[32m93\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1771\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m   1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m   1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m   1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m   1782\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m   1783\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m   1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:308\u001b[39m, in \u001b[36mQwen2_5_VLVisionBlock.forward\u001b[39m\u001b[34m(self, hidden_states, cu_seqlens, rotary_pos_emb, position_embeddings, **kwargs)\u001b[39m\n\u001b[32m    300\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mforward\u001b[39m(\n\u001b[32m    301\u001b[39m     \u001b[38;5;28mself\u001b[39m,\n\u001b[32m    302\u001b[39m     hidden_states: torch.Tensor,\n\u001b[32m   (...)\u001b[39m\u001b[32m    306\u001b[39m     **kwargs,\n\u001b[32m    307\u001b[39m ) -> torch.Tensor:\n\u001b[32m--> \u001b[39m\u001b[32m308\u001b[39m     hidden_states = hidden_states + \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mattn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    309\u001b[39m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnorm1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    310\u001b[39m \u001b[43m        \u001b[49m\u001b[43mcu_seqlens\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcu_seqlens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    311\u001b[39m \u001b[43m        \u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    312\u001b[39m \u001b[43m        \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m=\u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    313\u001b[39m \u001b[43m        \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    314\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    315\u001b[39m     hidden_states = hidden_states + \u001b[38;5;28mself\u001b[39m.mlp(\u001b[38;5;28mself\u001b[39m.norm2(hidden_states))\n\u001b[32m    
316\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m hidden_states\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1771\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m   1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m   1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m   1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m   1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m   1782\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m   1783\u001b[39m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m   1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:268\u001b[39m, in \u001b[36mQwen2_5_VLVisionAttention.forward\u001b[39m\u001b[34m(self, hidden_states, cu_seqlens, rotary_pos_emb, position_embeddings, **kwargs)\u001b[39m\n\u001b[32m    264\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    265\u001b[39m     \u001b[38;5;66;03m# Other implementations: Process each chunk separately\u001b[39;00m\n\u001b[32m    266\u001b[39m     lengths = cu_seqlens[\u001b[32m1\u001b[39m:] - cu_seqlens[:-\u001b[32m1\u001b[39m]\n\u001b[32m    267\u001b[39m     splits = [\n\u001b[32m--> \u001b[39m\u001b[32m268\u001b[39m         torch.split(tensor, \u001b[43mlengths\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtolist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m, dim=\u001b[32m2\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m tensor \u001b[38;5;129;01min\u001b[39;00m (query_states, key_states, value_states)\n\u001b[32m    269\u001b[39m     ]\n\u001b[32m    271\u001b[39m     attn_outputs = [\n\u001b[32m    272\u001b[39m         attention_interface(\n\u001b[32m    273\u001b[39m             \u001b[38;5;28mself\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m    283\u001b[39m         \u001b[38;5;28;01mfor\u001b[39;00m q, k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(*splits)\n\u001b[32m    284\u001b[39m     ]\n\u001b[32m    285\u001b[39m     attn_output = torch.cat(attn_outputs, dim=\u001b[32m1\u001b[39m)\n",
-      "\u001b[31mKeyboardInterrupt\u001b[39m: "
+      "📁 Using temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_demo_vefuxyya\n"
      ]
     }
    ],
@@ -815,123 +778,29 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "# Evaluate audio classifier if available and trained\nif audio_available and 'audio_test_paths' in locals():\n    print(\"🔮 Making predictions on audio test data...\")\n    \n    print(f\"📁 Using audio temp directory: {temp_dir_audio}\")\n    \n    try:\n        # Evaluate on a small subset using evaluate() method\n        audio_results = audio_classifier.evaluate(\n            audio_test_paths[:5],  # First 5 test samples\n            y_test_audio[:5],\n            return_detailed=True,\n            save_outputs=True,\n            output_dir=str(temp_dir_audio)\n        )\n        \n        print(\"\\n📈 Audio Classification Results:\")\n        accuracy = audio_results.get('accuracy', 'N/A')\n        total_samples = len(audio_test_paths[:5])\n        completed_samples = audio_results.get('completed_samples', total_samples)\n        \n        print(f\"   • Accuracy: {accuracy}\")\n        print(f\"   • Completed samples: {completed_samples}\")\n        \n        # Display predictions\n        predictions = audio_results.get('predictions', [])\n        if predictions:\n            print(\"\\n🎯 Sample Audio Predictions:\")\n            for i, pred in enumerate(predictions[:3]):\n                predicted_class = pred if isinstance(pred, str) else class_names_audio[pred] if pred < len(class_names_audio) else f\"Class {pred}\"\n                true_class = class_names_audio[y_test_audio[i]]\n                print(f\"   Sample {i+1}: Predicted='{predicted_class}', True='{true_class}'\")\n        \n        # Display audio visualization\n        print(\"\\n🖼️ Displaying t-SNE visualization for audio data...\")\n        \n        viz_files_audio = list(Path(temp_dir_audio).glob(\"*.png\"))\n        \n        if viz_files_audio:\n            latest_viz_audio = max(viz_files_audio, key=lambda p: p.stat().st_mtime)\n            print(f\"📊 Found audio visualization: {latest_viz_audio.name}\")\n            \n            
display(IPImage(filename=str(latest_viz_audio), width=600))\n            \n            print(\"\\n🎨 Audio Visualization Features:\")\n            print(\"   • Each point represents an audio sample in t-SNE space\")\n            print(\"   • Whisper embeddings capture audio frequency patterns\")\n            print(\"   • Different tones cluster based on frequency content\")\n            print(\"   • VLM recognizes patterns to classify audio by frequency\")\n        else:\n            print(\"⚠️ No audio visualization found.\")\n            \n    except Exception as e:\n        print(f\"❌ Error during audio evaluation: {e}\")\n        print(\"This might be due to:\")\n        print(\"   • Whisper model loading issues\")\n        print(\"   • VLM API access limitations\") \n        print(\"   • Audio processing dependencies\")\n        \nelse:\n    print(\"⚠️ Skipping audio evaluation - no audio data available\")\n    print(\"Audio classification would work with:\")\n    print(\"   • Real audio files or synthetic audio data\")\n    print(\"   • Proper soundfile and Whisper dependencies\")\n    print(\"   • VLM model access\")"
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "🔮 Making predictions on audio test data...\n",
-      "📁 Using audio temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_audio_demo_j3wuld80\n",
-      "❌ Error: Audio classifier must be fitted before making predictions\n",
-      "💡 Make sure the training cell above executed successfully\n"
-     ]
-    }
-   ],
    "source": [
-    "# Evaluate audio classifier if available and trained\n",
+    "# Evaluate audio classifier if available (fits it first when it has not been trained yet)\n",
     "if audio_available and 'audio_test_paths' in locals():\n",
     "    print(\"🔮 Making predictions on audio test data...\")\n",
     "    \n",
     "    print(f\"📁 Using audio temp directory: {temp_dir_audio}\")\n",
     "    \n",
     "    try:\n",
-    "        # Check if classifier is fitted\n",
+    "        # Check if classifier needs to be trained first\n",
     "        if not hasattr(audio_classifier, 'train_embeddings') or audio_classifier.train_embeddings is None:\n",
-    "            print(\"❌ Error: Audio classifier must be fitted before making predictions\")\n",
-    "            print(\"💡 Make sure the training cell above executed successfully\")\n",
-    "        else:\n",
-    "            # Evaluate on a small subset using evaluate() method\n",
-    "            audio_results = audio_classifier.evaluate(\n",
-    "                audio_test_paths[:5],  # First 5 test samples\n",
-    "                y_test_audio[:5],\n",
-    "                return_detailed=True,\n",
-    "                save_outputs=True,\n",
-    "                output_dir=str(temp_dir_audio)\n",
-    "            )\n",
-    "            \n",
-    "            print(\"\\n📈 Audio Classification Results:\")\n",
-    "            accuracy = audio_results.get('accuracy', 'N/A')\n",
-    "            total_samples = len(audio_test_paths[:5])\n",
-    "            completed_samples = audio_results.get('completed_samples', total_samples)\n",
-    "            \n",
-    "            print(f\"   • Accuracy: {accuracy}\")\n",
-    "            print(f\"   • Completed samples: {completed_samples}\")\n",
-    "            \n",
-    "            # Display predictions\n",
-    "            predictions = audio_results.get('predictions', [])\n",
-    "            if predictions:\n",
-    "                print(\"\\n🎯 Sample Audio Predictions:\")\n",
-    "                for i, pred in enumerate(predictions[:3]):\n",
-    "                    predicted_class = pred if isinstance(pred, str) else class_names_audio[pred] if pred < len(class_names_audio) else f\"Class {pred}\"\n",
-    "                    true_class = class_names_audio[y_test_audio[i]]\n",
-    "                    print(f\"   Sample {i+1}: Predicted='{predicted_class}', True='{true_class}'\")\n",
-    "            \n",
-    "            # Display audio visualization\n",
-    "            print(\"\\n🖼️ Displaying t-SNE visualization for audio data...\")\n",
+    "            print(\"🏋️ Training audio classifier first...\")\n",
     "            \n",
-    "            viz_files_audio = list(Path(temp_dir_audio).glob(\"*.png\"))\n",
-    "            \n",
-    "            if viz_files_audio:\n",
-    "                latest_viz_audio = max(viz_files_audio, key=lambda p: p.stat().st_mtime)\n",
-    "                print(f\"📊 Found audio visualization: {latest_viz_audio.name}\")\n",
-    "                \n",
-    "                display(IPImage(filename=str(latest_viz_audio), width=600))\n",
-    "                \n",
-    "                print(\"\\n🎨 Audio Visualization Features:\")\n",
-    "                print(\"   • Each point represents an audio sample in t-SNE space\")\n",
-    "                print(\"   • Whisper embeddings capture audio frequency patterns\")\n",
-    "                print(\"   • Different tones cluster based on frequency content\")\n",
-    "                print(\"   • VLM recognizes patterns to classify audio by frequency\")\n",
-    "            else:\n",
-    "                print(\"⚠️ No audio visualization found.\")\n",
+    "            # Train the audio classifier with the created audio files\n",
+    "            audio_classifier.fit(\n",
+    "                audio_train_paths,  # Training audio file paths\n",
+    "                y_train_audio,      # Training labels\n",
+    "                audio_test_paths[:5],  # Use subset of test data for demo\n",
+    "                class_names=class_names_audio,\n",
+    "                task_type='classification'\n",
+    "            )\n",
     "            \n",
-    "    except Exception as e:\n",
-    "        print(f\"❌ Error during audio evaluation: {e}\")\n",
-    "        print(\"This might be due to:\")\n",
-    "        print(\"   • Whisper model loading issues\")\n",
-    "        print(\"   • VLM API access limitations\") \n",
-    "        print(\"   • Audio processing dependencies\")\n",
-    "        print(\"   • Model not being fitted properly\")\n",
+    "            print(\"✅ Audio classifier trained successfully!\")\n",
     "        \n",
-    "else:\n",
-    "    print(\"⚠️ Skipping audio evaluation - no audio data or classifier not trained\")\n",
-    "    print(\"Audio classification would work with:\")\n",
-    "    print(\"   • Real audio files or synthetic audio data\")\n",
-    "    print(\"   • Proper soundfile and Whisper dependencies\")\n",
-    "    print(\"   • VLM model access\")\n",
-    "    print(\"   • Successfully trained audio classifier\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "🔮 Making predictions on audio test data...\n",
-      "📁 Using audio temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_audio_demo_j3wuld80\n",
-      "❌ Error during audio evaluation: Model must be fitted before making predictions\n",
-      "This might be due to Whisper model loading or VLM API issues.\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Evaluate audio classifier if available\n",
-    "if audio_available and 'audio_test_paths' in locals():\n",
-    "    print(\"🔮 Making predictions on audio test data...\")\n",
-    "    \n",
-    "    print(f\"📁 Using audio temp directory: {temp_dir_audio}\")\n",
-    "    \n",
-    "    try:\n",
     "        # Evaluate on a small subset using evaluate() method\n",
     "        audio_results = audio_classifier.evaluate(\n",
     "            audio_test_paths[:5],  # First 5 test samples\n",
@@ -979,12 +848,17 @@
     "            \n",
     "    except Exception as e:\n",
     "        print(f\"❌ Error during audio evaluation: {e}\")\n",
-    "        print(\"This might be due to Whisper model loading or VLM API issues.\")\n",
+    "        print(\"This might be due to:\")\n",
+    "        print(\"   • Whisper model loading issues\")\n",
+    "        print(\"   • VLM API access limitations\") \n",
+    "        print(\"   • Audio processing dependencies\")\n",
+    "        import traceback\n",
+    "        traceback.print_exc()\n",
     "        \n",
     "else:\n",
-    "    print(\"⚠️ Skipping audio evaluation - no audio data or classifier not trained\")\n",
+    "    print(\"⚠️ Skipping audio evaluation - no audio data available\")\n",
     "    print(\"Audio classification would work with:\")\n",
-    "    print(\"   • Real audio files\")\n",
+    "    print(\"   • Real audio files or synthetic audio data\")\n",
     "    print(\"   • Proper soundfile and Whisper dependencies\")\n",
     "    print(\"   • VLM model access\")"
    ]
@@ -1038,26 +912,6 @@
     "print(\"   • Focus: Cross-method pattern comparison\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "TypeError",
-     "evalue": "get_whisper_embeddings() got an unexpected keyword argument 'whisper_model'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mTypeError\u001b[39m                                 Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[51]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;66;03m# Train the audio classifier if audio data is available\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m audio_available \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m'\u001b[39m\u001b[33maudio_train_paths\u001b[39m\u001b[33m'\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mlocals\u001b[39m():\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m     \u001b[43maudio_classifier\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m      4\u001b[39m \u001b[43m        \u001b[49m\u001b[43maudio_train_paths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# Training audio file paths\u001b[39;49;00m\n\u001b[32m      5\u001b[39m \u001b[43m        \u001b[49m\u001b[43my_train_audio\u001b[49m\u001b[43m,\u001b[49m\u001b[43m      \u001b[49m\u001b[38;5;66;43;03m# Training labels\u001b[39;49;00m\n\u001b[32m      6\u001b[39m \u001b[43m        \u001b[49m\u001b[43maudio_test_paths\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# Use subset of test data for demo\u001b[39;49;00m\n\u001b[32m      7\u001b[39m \u001b[43m        \u001b[49m\u001b[43mclass_names\u001b[49m\u001b[43m=\u001b[49m\u001b[43mclass_names_audio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m      8\u001b[39m \u001b[43m        \u001b[49m\u001b[43mtask_type\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mclassification\u001b[39;49m\u001b[33;43m'\u001b[39;49m\n\u001b[32m      9\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/marvis_tsne.py:1056\u001b[39m, in \u001b[36mfit\u001b[39m\u001b[34m(self, X_train, y_train, X_test, class_names, task_type, **kwargs)\u001b[39m\n\u001b[32m   1053\u001b[39m task_id = dataset_info.get(\u001b[33m\"\u001b[39m\u001b[33mtask_id\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01mif\u001b[39;00m dataset_info \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m   1055\u001b[39m \u001b[38;5;66;03m# For non-tabular modalities, use special task IDs if task_id not provided\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1056\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m task_id \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.modality == \u001b[33m\"\u001b[39m\u001b[33mvision\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m   1057\u001b[39m     task_id = VISION_CLASSIFICATION_TASK_ID\n\u001b[32m   1058\u001b[39m     \u001b[38;5;28mself\u001b[39m.logger.debug(\n\u001b[32m   1059\u001b[39m         \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mUsing special vision classification task_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtask_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m   1060\u001b[39m     )\n",
-      "\u001b[31mTypeError\u001b[39m: get_whisper_embeddings() got an unexpected keyword argument 'whisper_model'"
-     ]
-    }
-   ],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1190,84 +1044,109 @@
     "print(\"   ✅ No need for extensive hyperparameter tuning\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 💬 Part 5: Interactive Chat with MARVIS\n",
+    "\n",
+    "One of MARVIS's unique features is the ability to have a natural language conversation about your predictions! After running `predict()` or `evaluate()`, you can use the `.chat()` method to ask questions about the results, discuss patterns, or get explanations.\n",
+    "\n",
+    "Let's demonstrate this powerful feature:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Summary of results across modalities\n",
-    "print(\"📊 MARVIS Performance Summary Across Modalities\")\n",
-    "print(\"=\" * 60)\n",
+    "# Chat demonstration using the tabular classifier from earlier\n",
+    "print(\"💬 Starting Interactive Chat with MARVIS\")\n",
+    "print(\"=\" * 50)\n",
     "\n",
-    "# Collect results from all completed experiments\n",
-    "results_summary = {}\n",
-    "\n",
-    "# Tabular data results\n",
-    "if 'tabular_results' in locals():\n",
-    "    results_summary[\"Tabular Data\"] = {\n",
-    "        \"samples\": len(X_test_tab[:10]) if 'X_test_tab' in locals() else 'N/A',\n",
-    "        \"features\": X_tabular.shape[1] if 'X_tabular' in locals() else 'N/A',\n",
-    "        \"classes\": len(class_names_tab) if 'class_names_tab' in locals() else 'N/A',\n",
-    "        \"accuracy\": tabular_results.get('accuracy', 'N/A'),\n",
-    "        \"method\": \"TabPFN embeddings → t-SNE → VLM reasoning\"\n",
-    "    }\n",
+    "# Check if we have a classifier with predictions available\n",
+    "if 'tabular_classifier' in locals() and hasattr(tabular_classifier, '_last_prediction_context'):\n",
+    "    print(\"✅ Using tabular classifier with existing predictions\")\n",
+    "    \n",
+    "    try:\n",
+    "        # First conversation - ask about model performance\n",
+    "        print(\"\\n🧠 User: How well did the model perform on the test data?\")\n",
+    "        response1 = tabular_classifier.chat(\"How well did the model perform on the test data?\")\n",
+    "        print(f\"🤖 MARVIS: {response1}\")\n",
+    "        \n",
+    "        print(\"\\n\" + \"-\" * 60)\n",
+    "        \n",
+    "        # Second conversation - ask about patterns\n",
+    "        print(\"\\n🧠 User: What patterns did you observe in the visualization?\") \n",
+    "        response2 = tabular_classifier.chat(\"What patterns did you observe in the visualization?\")\n",
+    "        print(f\"🤖 MARVIS: {response2}\")\n",
+    "        \n",
+    "        print(\"\\n\" + \"-\" * 60)\n",
+    "        \n",
+    "        # Third conversation - ask for improvement suggestions\n",
+    "        print(\"\\n🧠 User: How could we improve the classification results?\")\n",
+    "        response3 = tabular_classifier.chat(\"How could we improve the classification results?\")\n",
+    "        print(f\"🤖 MARVIS: {response3}\")\n",
+    "        \n",
+    "        print(\"\\n\" + \"-\" * 60)\n",
+    "        \n",
+    "        # Show chat history\n",
+    "        print(\"\\n📚 Chat History Summary:\")\n",
+    "        history = tabular_classifier.get_chat_history()\n",
+    "        for i, exchange in enumerate(history, 1):\n",
+    "            print(f\"   {i}. User: {exchange['user'][:50]}...\" if len(exchange['user']) > 50 else f\"   {i}. User: {exchange['user']}\")\n",
+    "            print(f\"      MARVIS: {exchange['assistant'][:50]}...\" if len(exchange['assistant']) > 50 else f\"      MARVIS: {exchange['assistant']}\")\n",
+    "        \n",
+    "        print(f\"\\n💡 Total chat exchanges: {len(history)}\")\n",
+    "        \n",
+    "    except Exception as e:\n",
+    "        print(f\"❌ Chat demonstration failed: {e}\")\n",
+    "        print(\"💡 This might be due to VLM model limitations or API access issues.\")\n",
+    "        print(\"🔧 In a real scenario with proper model access, the chat would work as shown.\")\n",
+    "        \n",
+    "        # Show what the conversation would look like\n",
+    "        print(\"\\n🎭 Example Conversation (Simulated):\")\n",
+    "        print(\"\\n🧠 User: How well did the model perform on the test data?\")\n",
+    "        print(\"🤖 MARVIS: Based on the recent evaluation, the model achieved 90% accuracy on 10 test samples. The high accuracy suggests that the t-SNE visualization effectively captured the underlying patterns in your tabular data, allowing the vision language model to distinguish between Class A, Class B, and Class C with good reliability.\")\n",
+    "        \n",
+    "        print(\"\\n🧠 User: What patterns did you observe in the visualization?\")\n",
+    "        print(\"🤖 MARVIS: In the t-SNE visualization, I observed three distinct clusters corresponding to your three classes. The clusters showed good separation with minimal overlap, which explains the high classification accuracy. The KNN connections (k=5) revealed that similar data points were properly grouped together, indicating that the TabPFN embeddings captured meaningful feature relationships.\")\n",
+    "        \n",
+    "        print(\"\\n🧠 User: How could we improve the classification results?\")\n",
+    "        print(\"🤖 MARVIS: Several strategies could potentially improve results: 1) Increase the number of training samples for better embedding quality, 2) Experiment with different t-SNE perplexity values (currently 15) to optimize cluster separation, 3) Try 3D visualization for complex datasets, 4) Consider multi-visualization mode to combine PCA and t-SNE insights, or 5) Use a more powerful VLM model for enhanced reasoning capability.\")\n",
     "\n",
-    "# Image data results  \n",
-    "if 'image_results' in locals():\n",
-    "    results_summary[\"Image Data\"] = {\n",
-    "        \"samples\": len(X_test_img[:8]) if 'X_test_img' in locals() else 'N/A',\n",
-    "        \"features\": \"8x8 pixels (DINOV2 embeddings)\",\n",
-    "        \"classes\": len(class_names_img) if 'class_names_img' in locals() else 'N/A',\n",
-    "        \"accuracy\": image_results.get('accuracy', 'N/A'),\n",
-    "        \"method\": \"Raw pixels → DINOV2 → t-SNE → VLM reasoning\"\n",
-    "    }\n",
+    "else:\n",
+    "    print(\"⚠️ No classifier with predictions available for chat demonstration\")\n",
+    "    print(\"💡 The chat feature requires running predict() or evaluate() first\")\n",
+    "    \n",
+    "    # Show example chat workflow\n",
+    "    print(\"\\n🎭 Example Chat Workflow:\")\n",
+    "    print(\"\"\"\n",
+    "# 1. Train and evaluate your model\n",
+    "classifier = MarvisTsneClassifier(modality=\"tabular\", vlm_model_id=\"Qwen/Qwen2.5-VL-3B-Instruct\")\n",
+    "classifier.fit(X_train, y_train, X_test, class_names=[\"A\", \"B\", \"C\"])\n",
+    "results = classifier.evaluate(X_test, y_test)\n",
     "\n",
-    "# Audio data results\n",
-    "if 'audio_available' in locals() and audio_available and 'audio_results' in locals():\n",
-    "    results_summary[\"Audio Data\"] = {\n",
-    "        \"samples\": len(audio_test_paths[:5]) if 'audio_test_paths' in locals() else 'N/A',\n",
-    "        \"features\": \"Whisper embeddings\",\n",
-    "        \"classes\": len(class_names_audio) if 'class_names_audio' in locals() else 'N/A',\n",
-    "        \"accuracy\": audio_results.get('accuracy', 'N/A'),\n",
-    "        \"method\": \"Audio → Whisper → t-SNE → VLM reasoning\"\n",
-    "    }\n",
+    "# 2. Start chatting about the results\n",
+    "response = classifier.chat(\"How accurate were the predictions?\")\n",
+    "print(response)\n",
     "\n",
-    "# Multi-visualization results\n",
-    "if 'multi_viz_results' in locals():\n",
-    "    results_summary[\"Multi-Visualization\"] = {\n",
-    "        \"samples\": len(X_test_tab[:8]) if 'X_test_tab' in locals() else 'N/A',\n",
-    "        \"features\": \"Same tabular data\",\n",
-    "        \"classes\": len(class_names_tab) if 'class_names_tab' in locals() else 'N/A',\n",
-    "        \"accuracy\": multi_viz_results.get('accuracy', 'N/A'),\n",
-    "        \"method\": \"PCA + t-SNE → Multi-panel → VLM reasoning\"\n",
-    "    }\n",
+    "# 3. Continue the conversation\n",
+    "response = classifier.chat(\"What patterns did you see in the data?\")\n",
+    "print(response)\n",
     "\n",
-    "# Display results\n",
-    "for modality, info in results_summary.items():\n",
-    "    print(f\"\\n🎯 {modality}:\")\n",
-    "    print(f\"   • Test samples: {info['samples']}\")\n",
-    "    print(f\"   • Features: {info['features']}\")\n",
-    "    print(f\"   • Classes: {info['classes']}\")\n",
-    "    print(f\"   • Accuracy: {info['accuracy']}\")\n",
-    "    print(f\"   • Method: {info['method']}\")\n",
+    "# 4. Ask follow-up questions\n",
+    "response = classifier.chat(\"Which samples were misclassified and why?\")\n",
+    "print(response)\n",
     "\n",
-    "print(\"\\n🎉 Key MARVIS Advantages:\")\n",
-    "print(\"   ✅ Unified interface across modalities (tabular, image, audio)\")\n",
-    "print(\"   ✅ Visual reasoning using state-of-the-art VLMs\")\n",
-    "print(\"   ✅ Interpretable predictions through visualization\")\n",
-    "print(\"   ✅ Multi-visualization support for robust classification\")\n",
-    "print(\"   ✅ Handles few-shot learning scenarios effectively\")\n",
-    "print(\"   ✅ No need for extensive hyperparameter tuning\")\n",
-    "print(\"   ✅ Interactive chat for discussing predictions\")\n",
-    "\n",
-    "print(f\"\\n📊 Experiments completed: {len(results_summary)}\")\n",
-    "if len(results_summary) == 0:\n",
-    "    print(\"   ⚠️ No experiments completed successfully\")\n",
-    "elif len(results_summary) < 4:\n",
-    "    print(\"   ⚠️ Some experiments may have failed due to missing dependencies\")\n",
-    "else:\n",
-    "    print(\"   ✅ All experiments completed successfully!\")"
+    "# 5. Get chat history\n",
+    "history = classifier.get_chat_history()\n",
+    "print(f\"Had {len(history)} exchanges\")\n",
+    "\n",
+    "# 6. Clear history if needed\n",
+    "classifier.clear_chat_history()\n",
+    "\"\"\")"
    ]
   },
   {
@@ -1442,105 +1321,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Chat demonstration using the tabular classifier from earlier\n",
-    "print(\"💬 Starting Interactive Chat with MARVIS\")\n",
-    "print(\"=\" * 50)\n",
-    "\n",
-    "# Check if we have a classifier with predictions available\n",
-    "if 'tabular_classifier' in locals() and hasattr(tabular_classifier, '_last_prediction_context'):\n",
-    "    print(\"✅ Using tabular classifier with existing predictions\")\n",
-    "    \n",
-    "    try:\n",
-    "        # First conversation - ask about model performance\n",
-    "        print(\"\\n🧠 User: How well did the model perform on the test data?\")\n",
-    "        response1 = tabular_classifier.chat(\"How well did the model perform on the test data?\")\n",
-    "        print(f\"🤖 MARVIS: {response1}\")\n",
-    "        \n",
-    "        print(\"\\n\" + \"-\" * 60)\n",
-    "        \n",
-    "        # Second conversation - ask about patterns\n",
-    "        print(\"\\n🧠 User: What patterns did you observe in the visualization?\") \n",
-    "        response2 = tabular_classifier.chat(\"What patterns did you observe in the visualization?\")\n",
-    "        print(f\"🤖 MARVIS: {response2}\")\n",
-    "        \n",
-    "        print(\"\\n\" + \"-\" * 60)\n",
-    "        \n",
-    "        # Third conversation - ask for improvement suggestions\n",
-    "        print(\"\\n🧠 User: How could we improve the classification results?\")\n",
-    "        response3 = tabular_classifier.chat(\"How could we improve the classification results?\")\n",
-    "        print(f\"🤖 MARVIS: {response3}\")\n",
-    "        \n",
-    "        print(\"\\n\" + \"-\" * 60)\n",
-    "        \n",
-    "        # Show chat history\n",
-    "        print(\"\\n📚 Chat History Summary:\")\n",
-    "        history = tabular_classifier.get_chat_history()\n",
-    "        for i, exchange in enumerate(history, 1):\n",
-    "            print(f\"   {i}. User: {exchange['user'][:50]}...\" if len(exchange['user']) > 50 else f\"   {i}. User: {exchange['user']}\")\n",
-    "            print(f\"      MARVIS: {exchange['assistant'][:50]}...\" if len(exchange['assistant']) > 50 else f\"      MARVIS: {exchange['assistant']}\")\n",
-    "        \n",
-    "        print(f\"\\n💡 Total chat exchanges: {len(history)}\")\n",
-    "        \n",
-    "    except Exception as e:\n",
-    "        print(f\"❌ Chat demonstration failed: {e}\")\n",
-    "        print(\"💡 This might be due to VLM model limitations or API access issues.\")\n",
-    "        print(\"🔧 In a real scenario with proper model access, the chat would work as shown.\")\n",
-    "        \n",
-    "        # Show what the conversation would look like\n",
-    "        print(\"\\n🎭 Example Conversation (Simulated):\")\n",
-    "        print(\"\\n🧠 User: How well did the model perform on the test data?\")\n",
-    "        print(\"🤖 MARVIS: Based on the recent evaluation, the model achieved 90% accuracy on 10 test samples. The high accuracy suggests that the t-SNE visualization effectively captured the underlying patterns in your tabular data, allowing the vision language model to distinguish between Class A, Class B, and Class C with good reliability.\")\n",
-    "        \n",
-    "        print(\"\\n🧠 User: What patterns did you observe in the visualization?\")\n",
-    "        print(\"🤖 MARVIS: In the t-SNE visualization, I observed three distinct clusters corresponding to your three classes. The clusters showed good separation with minimal overlap, which explains the high classification accuracy. The KNN connections (k=5) revealed that similar data points were properly grouped together, indicating that the TabPFN embeddings captured meaningful feature relationships.\")\n",
-    "        \n",
-    "        print(\"\\n🧠 User: How could we improve the classification results?\")\n",
-    "        print(\"🤖 MARVIS: Several strategies could potentially improve results: 1) Increase the number of training samples for better embedding quality, 2) Experiment with different t-SNE perplexity values (currently 15) to optimize cluster separation, 3) Try 3D visualization for complex datasets, 4) Consider multi-visualization mode to combine PCA and t-SNE insights, or 5) Use a more powerful VLM model for enhanced reasoning capability.\")\n",
-    "\n",
-    "else:\n",
-    "    print(\"⚠️ No classifier with predictions available for chat demonstration\")\n",
-    "    print(\"💡 The chat feature requires running predict() or evaluate() first\")\n",
-    "    \n",
-    "    # Show example chat workflow\n",
-    "    print(\"\\n🎭 Example Chat Workflow:\")\n",
-    "    print(\"\"\"\n",
-    "# 1. Train and evaluate your model\n",
-    "classifier = MarvisTsneClassifier(modality=\"tabular\", vlm_model_id=\"Qwen/Qwen2.5-VL-3B-Instruct\")\n",
-    "classifier.fit(X_train, y_train, X_test, class_names=[\"A\", \"B\", \"C\"])\n",
-    "results = classifier.evaluate(X_test, y_test)\n",
-    "\n",
-    "# 2. Start chatting about the results\n",
-    "response = classifier.chat(\"How accurate were the predictions?\")\n",
-    "print(response)\n",
-    "\n",
-    "# 3. Continue the conversation\n",
-    "response = classifier.chat(\"What patterns did you see in the data?\")\n",
-    "print(response)\n",
-    "\n",
-    "# 4. Ask follow-up questions\n",
-    "response = classifier.chat(\"Which samples were misclassified and why?\")\n",
-    "print(response)\n",
-    "\n",
-    "# 5. Get chat history\n",
-    "history = classifier.get_chat_history()\n",
-    "print(f\"Had {len(history)} exchanges\")\n",
-    "\n",
-    "# 6. Clear history if needed\n",
-    "classifier.clear_chat_history()\n",
-    "\"\"\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 💬 Part 5: Interactive Chat with MARVIS\n",
-    "\n",
-    "One of MARVIS's unique features is the ability to have a natural language conversation about your predictions! After running `predict()` or `evaluate()`, you can use the `.chat()` method to ask questions about the results, discuss patterns, or get explanations.\n",
-    "\n",
-    "Let's demonstrate this powerful feature:"
-   ]
+   "source": []
   },
   {
    "cell_type": "markdown",
@@ -1736,4 +1517,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/marvis/models/marvis_tsne.py b/marvis/models/marvis_tsne.py
index 3a511d4..6b80e54 100644
--- a/marvis/models/marvis_tsne.py
+++ b/marvis/models/marvis_tsne.py
@@ -2065,17 +2065,45 @@ def chat(self, user_input: str, max_history: int = 10) -> str:
             # Generate response using the VLM
             self.logger.info("Generating chat response...")
             
-            # Use the same interface as predictions but with text-only input
+            # Use the VLM wrapper interface for chat
             if hasattr(self.vlm_wrapper, 'generate_response'):
+                # Use the standard generate_response interface
                 response = self.vlm_wrapper.generate_response(
                     text_input=chat_prompt,
                     image_input=None,  # Text-only conversation
                     max_tokens=1000,
                     temperature=0.7  # Slightly higher temperature for conversational responses
                 )
+            elif hasattr(self.vlm_wrapper, 'generate'):
+                # Use the direct generate interface with proper parameters
+                from marvis.utils.model_loader import GenerationConfig
+                config = GenerationConfig(
+                    max_new_tokens=512,
+                    temperature=0.7,
+                    do_sample=True,
+                    top_p=0.9
+                )
+                response = self.vlm_wrapper.generate(
+                    inputs=chat_prompt,
+                    config=config
+                )
+            elif hasattr(self.vlm_wrapper, 'generate_from_conversation'):
+                # Use conversation interface if available
+                from marvis.utils.model_loader import GenerationConfig
+                conversation = [{"role": "user", "content": chat_prompt}]
+                config = GenerationConfig(
+                    max_new_tokens=512,
+                    temperature=0.7,
+                    do_sample=True,
+                    top_p=0.9
+                )
+                response = self.vlm_wrapper.generate_from_conversation(
+                    conversation,
+                    config
+                )
             else:
-                # Fallback for different wrapper interfaces
-                response = self.vlm_wrapper.chat(chat_prompt)
+                # Final fallback - raise informative error
+                raise AttributeError(f"VLM wrapper {type(self.vlm_wrapper)} doesn't have a supported generation method")
                 
             # Clean up response if needed
             if isinstance(response, dict) and 'text' in response: