diff --git a/Getting_Started.ipynb b/Getting_Started.ipynb index 3d52dc6..3fb55f9 100644 --- a/Getting_Started.ipynb +++ b/Getting_Started.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -151,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -159,7 +159,7 @@ "output_type": "stream", "text": [ "šŸ”§ Initializing MARVIS t-SNE classifier for tabular data...\n", - "šŸ“ Created temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_demo_seap38lk\n", + "šŸ“ Created temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_demo_vefuxyya\n", "āœ… Tabular classifier initialized!\n" ] } @@ -192,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -201,8 +201,8 @@ "text": [ "šŸ‹ļø Training MARVIS on tabular data...\n", "[t-SNE] Computing 46 nearest neighbors...\n", - "[t-SNE] Indexed 200 samples in 0.000s...\n", - "[t-SNE] Computed neighbors for 200 samples in 0.002s...\n", + "[t-SNE] Indexed 200 samples in 0.001s...\n", + "[t-SNE] Computed neighbors for 200 samples in 0.006s...\n", "[t-SNE] Computed conditional probabilities for sample 200 / 200\n", "[t-SNE] Mean sigma: 4.328593\n", "[t-SNE] KL divergence after 250 iterations with early exaggeration: 55.566055\n", @@ -228,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -236,44 +236,7 @@ "output_type": "stream", "text": [ "šŸ”® Making predictions on tabular test data...\n", - "šŸ“ Using temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_demo_seap38lk\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[56]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mšŸ“ Using temp directory: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtemp_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# Evaluate with detailed results and save outputs - use evaluate() to get accuracy\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m tabular_results = \u001b[43mtabular_classifier\u001b[49m\u001b[43m.\u001b[49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 8\u001b[39m \u001b[43m \u001b[49m\u001b[43mX_test_tab\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Use first 10 test samples for demo\u001b[39;49;00m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[43my_test_tab\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_detailed\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[43msave_outputs\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtemp_dir\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 15\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33mšŸ“ˆ Tabular Classification Results:\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 16\u001b[39m accuracy = tabular_results.get(\u001b[33m'\u001b[39m\u001b[33maccuracy\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mN/A\u001b[39m\u001b[33m'\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/marvis_tsne.py:1497\u001b[39m, in \u001b[36mevaluate\u001b[39m\u001b[34m(self, X_test, y_test, return_detailed, save_outputs, output_dir, visualization_save_cadence)\u001b[39m\n\u001b[32m 1494\u001b[39m try:\n\u001b[32m 1495\u001b[39m from marvis.utils.resource_manager import get_resource_manager\n\u001b[32m-> \u001b[39m\u001b[32m1497\u001b[39m # Determine dataset identifier for caching\n\u001b[32m 1498\u001b[39m dataset_id = kwargs.get(\"dataset_name\", \"\")\n\u001b[32m 1499\u001b[39m if \"dataset_info\" in kwargs and kwargs[\"dataset_info\"]:\n\u001b[32m 1500\u001b[39m # Prefer task_id if available\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/marvis_tsne.py:1430\u001b[39m, in \u001b[36mpredict\u001b[39m\u001b[34m(self, X_test, y_test, return_detailed, save_outputs, output_dir, visualization_save_cadence)\u001b[39m\n\u001b[32m 1413\u001b[39m self.logger.info(\"Creating 2D classification t-SNE visualization...\")\n\u001b[32m 1414\u001b[39m self.train_tsne, self.test_tsne, base_fig = viz_methods[\n\u001b[32m 1415\u001b[39m \"create_tsne_visualization\"\n\u001b[32m 1416\u001b[39m ](\n\u001b[32m (...)\u001b[39m\u001b[32m 1427\u001b[39m },\n\u001b[32m 1428\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1430\u001b[39m # Close base figure to save memory\n\u001b[32m 1431\u001b[39m plt.close(base_fig)\n\u001b[32m 1433\u001b[39m # Set up class/target information based on task type\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/process_one_sample.py:443\u001b[39m, in \u001b[36mprocess_one_sample\u001b[39m\u001b[34m(classifier_instance, sample_index, viz_methods, viewing_angles, save_outputs, visualization_save_cadence, return_detailed, y_test, prediction_details, all_classes)\u001b[39m\n\u001b[32m 0\u001b[39m \n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/process_one_sample.py:66\u001b[39m, in \u001b[36m_generate_vlm_response\u001b[39m\u001b[34m(classifier_instance, image, prompt)\u001b[39m\n\u001b[32m 61\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Generate VLM response with consistent configuration.\"\"\"\u001b[39;00m\n\u001b[32m 62\u001b[39m conversation = create_vlm_conversation(image, prompt)\n\u001b[32m 64\u001b[39m gen_config = GenerationConfig(\n\u001b[32m 65\u001b[39m max_new_tokens=\u001b[32m16384\u001b[39m,\n\u001b[32m---> \u001b[39m\u001b[32m66\u001b[39m temperature=\u001b[32m0.1\u001b[39m,\n\u001b[32m 67\u001b[39m do_sample=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m 68\u001b[39m enable_thinking=classifier_instance.enable_thinking\n\u001b[32m 69\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m classifier_instance.is_api_model,\n\u001b[32m 70\u001b[39m thinking_summary=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 71\u001b[39m )\n\u001b[32m 73\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m classifier_instance.vlm_wrapper.generate_from_conversation(\n\u001b[32m 74\u001b[39m conversation, gen_config\n\u001b[32m 75\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/utils/model_loader.py:728\u001b[39m, in \u001b[36mgenerate_from_conversation\u001b[39m\u001b[34m(self, conversation, config)\u001b[39m\n\u001b[32m 0\u001b[39m \n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/utils/_contextlib.py:120\u001b[39m, in \u001b[36mcontext_decorator..decorate_context\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 117\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 118\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdecorate_context\u001b[39m(*args, **kwargs):\n\u001b[32m 119\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[32m--> \u001b[39m\u001b[32m120\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/generation/utils.py:2617\u001b[39m, in \u001b[36mGenerationMixin.generate\u001b[39m\u001b[34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, custom_generate, **kwargs)\u001b[39m\n\u001b[32m 2609\u001b[39m input_ids, model_kwargs = \u001b[38;5;28mself\u001b[39m._expand_inputs_for_generation(\n\u001b[32m 2610\u001b[39m input_ids=input_ids,\n\u001b[32m 2611\u001b[39m expand_size=generation_config.num_return_sequences,\n\u001b[32m 2612\u001b[39m is_encoder_decoder=\u001b[38;5;28mself\u001b[39m.config.is_encoder_decoder,\n\u001b[32m 2613\u001b[39m **model_kwargs,\n\u001b[32m 2614\u001b[39m )\n\u001b[32m 2616\u001b[39m \u001b[38;5;66;03m# 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m2617\u001b[39m result = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sample\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2618\u001b[39m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2619\u001b[39m \u001b[43m \u001b[49m\u001b[43mlogits_processor\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprepared_logits_processor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2620\u001b[39m \u001b[43m \u001b[49m\u001b[43mstopping_criteria\u001b[49m\u001b[43m=\u001b[49m\u001b[43mprepared_stopping_criteria\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2621\u001b[39m \u001b[43m \u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2622\u001b[39m \u001b[43m \u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[43m=\u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2623\u001b[39m \u001b[43m \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstreamer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2624\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2625\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2627\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m generation_mode \u001b[38;5;129;01min\u001b[39;00m (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):\n\u001b[32m 2628\u001b[39m \u001b[38;5;66;03m# 11. interleave input_ids with `num_beams` additional sequences per batch\u001b[39;00m\n\u001b[32m 2629\u001b[39m input_ids, model_kwargs = \u001b[38;5;28mself\u001b[39m._expand_inputs_for_generation(\n\u001b[32m 2630\u001b[39m input_ids=input_ids,\n\u001b[32m 2631\u001b[39m expand_size=generation_config.num_beams,\n\u001b[32m 2632\u001b[39m is_encoder_decoder=\u001b[38;5;28mself\u001b[39m.config.is_encoder_decoder,\n\u001b[32m 2633\u001b[39m **model_kwargs,\n\u001b[32m 2634\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/generation/utils.py:3598\u001b[39m, in \u001b[36mGenerationMixin._sample\u001b[39m\u001b[34m(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)\u001b[39m\n\u001b[32m 3595\u001b[39m model_inputs.update({\u001b[33m\"\u001b[39m\u001b[33moutput_hidden_states\u001b[39m\u001b[33m\"\u001b[39m: output_hidden_states} \u001b[38;5;28;01mif\u001b[39;00m output_hidden_states \u001b[38;5;28;01melse\u001b[39;00m {})\n\u001b[32m 3597\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m is_prefill:\n\u001b[32m-> \u001b[39m\u001b[32m3598\u001b[39m outputs = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 3599\u001b[39m is_prefill = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 3600\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/utils/generic.py:959\u001b[39m, in \u001b[36mcan_return_tuple..wrapper\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 957\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m return_dict_passed \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 958\u001b[39m return_dict = return_dict_passed\n\u001b[32m--> \u001b[39m\u001b[32m959\u001b[39m output = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 960\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m return_dict \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(output, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[32m 961\u001b[39m output = output.to_tuple()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1493\u001b[39m, in \u001b[36mQwen2_5_VLForConditionalGeneration.forward\u001b[39m\u001b[34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts, logits_to_keep, **kwargs)\u001b[39m\n\u001b[32m 1488\u001b[39m output_attentions = output_attentions \u001b[38;5;28;01mif\u001b[39;00m output_attentions \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m.config.output_attentions\n\u001b[32m 1489\u001b[39m output_hidden_states = (\n\u001b[32m 1490\u001b[39m output_hidden_states \u001b[38;5;28;01mif\u001b[39;00m output_hidden_states \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m.config.output_hidden_states\n\u001b[32m 1491\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1493\u001b[39m outputs = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1494\u001b[39m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1495\u001b[39m \u001b[43m \u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1496\u001b[39m \u001b[43m \u001b[49m\u001b[43mpixel_values_videos\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpixel_values_videos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1497\u001b[39m \u001b[43m \u001b[49m\u001b[43mimage_grid_thw\u001b[49m\u001b[43m=\u001b[49m\u001b[43mimage_grid_thw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1498\u001b[39m \u001b[43m \u001b[49m\u001b[43mvideo_grid_thw\u001b[49m\u001b[43m=\u001b[49m\u001b[43mvideo_grid_thw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1499\u001b[39m \u001b[43m \u001b[49m\u001b[43msecond_per_grid_ts\u001b[49m\u001b[43m=\u001b[49m\u001b[43msecond_per_grid_ts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1500\u001b[39m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1501\u001b[39m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1502\u001b[39m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1503\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1504\u001b[39m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m=\u001b[49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1505\u001b[39m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m=\u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1506\u001b[39m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m=\u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1507\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 1508\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_position\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_position\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1509\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1510\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1512\u001b[39m hidden_states = outputs[\u001b[32m0\u001b[39m]\n\u001b[32m 1514\u001b[39m \u001b[38;5;66;03m# Only compute necessary logits, and do not upcast them to float if we are not computing the loss\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1275\u001b[39m, in \u001b[36mQwen2_5_VLModel.forward\u001b[39m\u001b[34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts, **kwargs)\u001b[39m\n\u001b[32m 1272\u001b[39m inputs_embeds = \u001b[38;5;28mself\u001b[39m.get_input_embeddings()(input_ids)\n\u001b[32m 1274\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m pixel_values \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1275\u001b[39m image_embeds = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_image_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mimage_grid_thw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1276\u001b[39m image_embeds = torch.cat(image_embeds, dim=\u001b[32m0\u001b[39m).to(inputs_embeds.device, inputs_embeds.dtype)\n\u001b[32m 1277\u001b[39m image_mask, _ = \u001b[38;5;28mself\u001b[39m.get_placeholder_mask(\n\u001b[32m 1278\u001b[39m input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds\n\u001b[32m 1279\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1188\u001b[39m, in \u001b[36mQwen2_5_VLModel.get_image_features\u001b[39m\u001b[34m(self, pixel_values, image_grid_thw)\u001b[39m\n\u001b[32m 1178\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 1179\u001b[39m \u001b[33;03mEncodes images into continuous embeddings that can be forwarded to the language model.\u001b[39;00m\n\u001b[32m 1180\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 1185\u001b[39m \u001b[33;03m The temporal, height and width of feature shape of each image in LLM.\u001b[39;00m\n\u001b[32m 1186\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 1187\u001b[39m pixel_values = pixel_values.type(\u001b[38;5;28mself\u001b[39m.visual.dtype)\n\u001b[32m-> \u001b[39m\u001b[32m1188\u001b[39m image_embeds = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mvisual\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrid_thw\u001b[49m\u001b[43m=\u001b[49m\u001b[43mimage_grid_thw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1189\u001b[39m split_sizes = (image_grid_thw.prod(-\u001b[32m1\u001b[39m) // \u001b[38;5;28mself\u001b[39m.visual.spatial_merge_size**\u001b[32m2\u001b[39m).tolist()\n\u001b[32m 1190\u001b[39m image_embeds = torch.split(image_embeds, split_sizes)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:480\u001b[39m, in \u001b[36mQwen2_5_VisionTransformerPretrainedModel.forward\u001b[39m\u001b[34m(self, hidden_states, grid_thw, **kwargs)\u001b[39m\n\u001b[32m 477\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 478\u001b[39m cu_seqlens_now = cu_window_seqlens\n\u001b[32m--> \u001b[39m\u001b[32m480\u001b[39m hidden_states = \u001b[43mblk\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 481\u001b[39m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 482\u001b[39m \u001b[43m \u001b[49m\u001b[43mcu_seqlens\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcu_seqlens_now\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 483\u001b[39m \u001b[43m \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m=\u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 484\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 485\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 487\u001b[39m hidden_states = \u001b[38;5;28mself\u001b[39m.merger(hidden_states)\n\u001b[32m 488\u001b[39m reverse_indices = torch.argsort(window_index)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/modeling_layers.py:93\u001b[39m, in \u001b[36mGradientCheckpointingLayer.__call__\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 90\u001b[39m logger.warning(message)\n\u001b[32m 92\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._gradient_checkpointing_func(partial(\u001b[38;5;28msuper\u001b[39m().\u001b[34m__call__\u001b[39m, **kwargs), *args)\n\u001b[32m---> \u001b[39m\u001b[32m93\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:308\u001b[39m, in \u001b[36mQwen2_5_VLVisionBlock.forward\u001b[39m\u001b[34m(self, hidden_states, cu_seqlens, rotary_pos_emb, position_embeddings, **kwargs)\u001b[39m\n\u001b[32m 300\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mforward\u001b[39m(\n\u001b[32m 301\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 302\u001b[39m hidden_states: torch.Tensor,\n\u001b[32m (...)\u001b[39m\u001b[32m 306\u001b[39m **kwargs,\n\u001b[32m 307\u001b[39m ) -> torch.Tensor:\n\u001b[32m--> \u001b[39m\u001b[32m308\u001b[39m hidden_states = hidden_states + \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mattn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 309\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnorm1\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 310\u001b[39m \u001b[43m \u001b[49m\u001b[43mcu_seqlens\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcu_seqlens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 311\u001b[39m \u001b[43m \u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 312\u001b[39m \u001b[43m \u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m=\u001b[49m\u001b[43mposition_embeddings\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 313\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 314\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 315\u001b[39m hidden_states = hidden_states + \u001b[38;5;28mself\u001b[39m.mlp(\u001b[38;5;28mself\u001b[39m.norm2(hidden_states))\n\u001b[32m 316\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m hidden_states\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/torch/nn/modules/module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/marvis/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:268\u001b[39m, in \u001b[36mQwen2_5_VLVisionAttention.forward\u001b[39m\u001b[34m(self, hidden_states, cu_seqlens, rotary_pos_emb, position_embeddings, **kwargs)\u001b[39m\n\u001b[32m 264\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 265\u001b[39m \u001b[38;5;66;03m# Other implementations: Process each chunk separately\u001b[39;00m\n\u001b[32m 266\u001b[39m lengths = cu_seqlens[\u001b[32m1\u001b[39m:] - cu_seqlens[:-\u001b[32m1\u001b[39m]\n\u001b[32m 267\u001b[39m splits = [\n\u001b[32m--> \u001b[39m\u001b[32m268\u001b[39m torch.split(tensor, \u001b[43mlengths\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtolist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m, dim=\u001b[32m2\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m tensor \u001b[38;5;129;01min\u001b[39;00m (query_states, key_states, value_states)\n\u001b[32m 269\u001b[39m ]\n\u001b[32m 271\u001b[39m attn_outputs = [\n\u001b[32m 272\u001b[39m attention_interface(\n\u001b[32m 273\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 283\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m q, k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(*splits)\n\u001b[32m 284\u001b[39m ]\n\u001b[32m 285\u001b[39m attn_output = torch.cat(attn_outputs, dim=\u001b[32m1\u001b[39m)\n", - "\u001b[31mKeyboardInterrupt\u001b[39m: " + "šŸ“ Using temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_demo_vefuxyya\n" ] } ], @@ -815,123 +778,29 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Evaluate audio classifier if available and trained\nif audio_available and 'audio_test_paths' in locals():\n print(\"šŸ”® Making predictions on audio test data...\")\n \n print(f\"šŸ“ Using audio temp directory: {temp_dir_audio}\")\n \n try:\n # Evaluate on a small subset using evaluate() method\n audio_results = audio_classifier.evaluate(\n audio_test_paths[:5], # First 5 test samples\n y_test_audio[:5],\n return_detailed=True,\n save_outputs=True,\n output_dir=str(temp_dir_audio)\n )\n \n print(\"\\nšŸ“ˆ Audio Classification Results:\")\n accuracy = audio_results.get('accuracy', 'N/A')\n total_samples = len(audio_test_paths[:5])\n completed_samples = audio_results.get('completed_samples', total_samples)\n \n print(f\" • Accuracy: {accuracy}\")\n print(f\" • Completed samples: {completed_samples}\")\n \n # Display predictions\n predictions = audio_results.get('predictions', [])\n if predictions:\n print(\"\\nšŸŽÆ Sample Audio Predictions:\")\n for i, pred in enumerate(predictions[:3]):\n predicted_class = pred if isinstance(pred, str) else class_names_audio[pred] if pred < len(class_names_audio) else f\"Class {pred}\"\n true_class = class_names_audio[y_test_audio[i]]\n print(f\" Sample {i+1}: Predicted='{predicted_class}', True='{true_class}'\")\n \n # Display audio visualization\n print(\"\\nšŸ–¼ļø Displaying t-SNE visualization for audio data...\")\n \n viz_files_audio = list(Path(temp_dir_audio).glob(\"*.png\"))\n \n if viz_files_audio:\n latest_viz_audio = max(viz_files_audio, key=lambda p: p.stat().st_mtime)\n print(f\"šŸ“Š Found audio visualization: {latest_viz_audio.name}\")\n \n display(IPImage(filename=str(latest_viz_audio), width=600))\n \n print(\"\\nšŸŽØ Audio Visualization Features:\")\n print(\" • Each point represents an audio sample in t-SNE space\")\n print(\" • Whisper embeddings capture audio frequency patterns\")\n print(\" • Different tones cluster based on frequency content\")\n print(\" • VLM recognizes patterns to classify audio by frequency\")\n else:\n print(\"āš ļø No audio visualization found.\")\n \n except Exception as e:\n print(f\"āŒ Error during audio evaluation: {e}\")\n print(\"This might be due to:\")\n print(\" • Whisper model loading issues\")\n print(\" • VLM API access limitations\") \n print(\" • Audio processing dependencies\")\n \nelse:\n print(\"āš ļø Skipping audio evaluation - no audio data available\")\n print(\"Audio classification would work with:\")\n print(\" • Real audio files or synthetic audio data\")\n print(\" • Proper soundfile and Whisper dependencies\")\n print(\" • VLM model access\")" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "šŸ”® Making predictions on audio test data...\n", - "šŸ“ Using audio temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_audio_demo_j3wuld80\n", - "āŒ Error: Audio classifier must be fitted before making predictions\n", - "šŸ’” Make sure the training cell above executed successfully\n" - ] - } - ], "source": [ - "# Evaluate audio classifier if available and trained\n", + "# Evaluate audio classifier if available\n", "if audio_available and 'audio_test_paths' in locals():\n", " print(\"šŸ”® Making predictions on audio test data...\")\n", " \n", " print(f\"šŸ“ Using audio temp directory: {temp_dir_audio}\")\n", " \n", " try:\n", - " # Check if classifier is fitted\n", + " # Check if classifier needs to be trained first\n", " if not hasattr(audio_classifier, 'train_embeddings') or audio_classifier.train_embeddings is None:\n", - " print(\"āŒ Error: Audio classifier must be fitted before making predictions\")\n", - " print(\"šŸ’” Make sure the training cell above executed successfully\")\n", - " else:\n", - " # Evaluate on a small subset using evaluate() method\n", - " audio_results = audio_classifier.evaluate(\n", - " audio_test_paths[:5], # First 5 test samples\n", - " y_test_audio[:5],\n", - " return_detailed=True,\n", - " save_outputs=True,\n", - " output_dir=str(temp_dir_audio)\n", - " )\n", - " \n", - " print(\"\\nšŸ“ˆ Audio Classification Results:\")\n", - " accuracy = audio_results.get('accuracy', 'N/A')\n", - " total_samples = len(audio_test_paths[:5])\n", - " completed_samples = audio_results.get('completed_samples', total_samples)\n", - " \n", - " print(f\" • Accuracy: {accuracy}\")\n", - " print(f\" • Completed samples: {completed_samples}\")\n", - " \n", - " # Display predictions\n", - " predictions = audio_results.get('predictions', [])\n", - " if predictions:\n", - " print(\"\\nšŸŽÆ Sample Audio Predictions:\")\n", - " for i, pred in enumerate(predictions[:3]):\n", - " predicted_class = pred if isinstance(pred, str) else class_names_audio[pred] if pred < len(class_names_audio) else f\"Class {pred}\"\n", - " true_class = class_names_audio[y_test_audio[i]]\n", - " print(f\" Sample {i+1}: Predicted='{predicted_class}', True='{true_class}'\")\n", - " \n", - " # Display audio visualization\n", - " print(\"\\nšŸ–¼ļø Displaying t-SNE visualization for audio data...\")\n", + " print(\"šŸ‹ļø Training audio classifier first...\")\n", " \n", - " viz_files_audio = list(Path(temp_dir_audio).glob(\"*.png\"))\n", - " \n", - " if viz_files_audio:\n", - " latest_viz_audio = max(viz_files_audio, key=lambda p: p.stat().st_mtime)\n", - " print(f\"šŸ“Š Found audio visualization: {latest_viz_audio.name}\")\n", - " \n", - " display(IPImage(filename=str(latest_viz_audio), width=600))\n", - " \n", - " print(\"\\nšŸŽØ Audio Visualization Features:\")\n", - " print(\" • Each point represents an audio sample in t-SNE space\")\n", - " print(\" • Whisper embeddings capture audio frequency patterns\")\n", - " print(\" • Different tones cluster based on frequency content\")\n", - " print(\" • VLM recognizes patterns to classify audio by frequency\")\n", - " else:\n", - " print(\"āš ļø No audio visualization found.\")\n", + " # Train the audio classifier with the created audio files\n", + " audio_classifier.fit(\n", + " audio_train_paths, # Training audio file paths\n", + " y_train_audio, # Training labels\n", + " audio_test_paths[:5], # Use subset of test data for demo\n", + " class_names=class_names_audio,\n", + " task_type='classification'\n", + " )\n", " \n", - " except Exception as e:\n", - " print(f\"āŒ Error during audio evaluation: {e}\")\n", - " print(\"This might be due to:\")\n", - " print(\" • Whisper model loading issues\")\n", - " print(\" • VLM API access limitations\") \n", - " print(\" • Audio processing dependencies\")\n", - " print(\" • Model not being fitted properly\")\n", + " print(\"āœ… Audio classifier trained successfully!\")\n", " \n", - "else:\n", - " print(\"āš ļø Skipping audio evaluation - no audio data or classifier not trained\")\n", - " print(\"Audio classification would work with:\")\n", - " print(\" • Real audio files or synthetic audio data\")\n", - " print(\" • Proper soundfile and Whisper dependencies\")\n", - " print(\" • VLM model access\")\n", - " print(\" • Successfully trained audio classifier\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "šŸ”® Making predictions on audio test data...\n", - "šŸ“ Using audio temp directory: /var/folders/nh/rgvpvzyd32d9v1gwjtp7_brr0000gn/T/marvis_audio_demo_j3wuld80\n", - "āŒ Error during audio evaluation: Model must be fitted before making predictions\n", - "This might be due to Whisper model loading or VLM API issues.\n" - ] - } - ], - "source": [ - "# Evaluate audio classifier if available\n", - "if audio_available and 'audio_test_paths' in locals():\n", - " print(\"šŸ”® Making predictions on audio test data...\")\n", - " \n", - " print(f\"šŸ“ Using audio temp directory: {temp_dir_audio}\")\n", - " \n", - " try:\n", " # Evaluate on a small subset using evaluate() method\n", " audio_results = audio_classifier.evaluate(\n", " audio_test_paths[:5], # First 5 test samples\n", @@ -979,12 +848,17 @@ " \n", " except Exception as e:\n", " print(f\"āŒ Error during audio evaluation: {e}\")\n", - " print(\"This might be due to Whisper model loading or VLM API issues.\")\n", + " print(\"This might be due to:\")\n", + " print(\" • Whisper model loading issues\")\n", + " print(\" • VLM API access limitations\") \n", + " print(\" • Audio processing dependencies\")\n", + " import traceback\n", + " traceback.print_exc()\n", " \n", "else:\n", - " print(\"āš ļø Skipping audio evaluation - no audio data or classifier not trained\")\n", + " print(\"āš ļø Skipping audio evaluation - no audio data available\")\n", " print(\"Audio classification would work with:\")\n", - " print(\" • Real audio files\")\n", + " print(\" • Real audio files or synthetic audio data\")\n", " print(\" • Proper soundfile and Whisper dependencies\")\n", " print(\" • VLM model access\")" ] @@ -1038,26 +912,6 @@ "print(\" • Focus: Cross-method pattern comparison\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "get_whisper_embeddings() got an unexpected keyword argument 'whisper_model'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[51]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Train the audio classifier if audio data is available\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m audio_available \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m'\u001b[39m\u001b[33maudio_train_paths\u001b[39m\u001b[33m'\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mlocals\u001b[39m():\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43maudio_classifier\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[43maudio_train_paths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Training audio file paths\u001b[39;49;00m\n\u001b[32m 5\u001b[39m \u001b[43m \u001b[49m\u001b[43my_train_audio\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Training labels\u001b[39;49;00m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43maudio_test_paths\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Use subset of test data for demo\u001b[39;49;00m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mclass_names\u001b[49m\u001b[43m=\u001b[49m\u001b[43mclass_names_audio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 8\u001b[39m \u001b[43m \u001b[49m\u001b[43mtask_type\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mclassification\u001b[39;49m\u001b[33;43m'\u001b[39;49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/CloudStorage/GoogleDrive-penfever@gmail.com/My Drive/Current Papers/marvis/marvis/marvis/models/marvis_tsne.py:1056\u001b[39m, in \u001b[36mfit\u001b[39m\u001b[34m(self, X_train, y_train, X_test, class_names, task_type, **kwargs)\u001b[39m\n\u001b[32m 1053\u001b[39m task_id = dataset_info.get(\u001b[33m\"\u001b[39m\u001b[33mtask_id\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01mif\u001b[39;00m dataset_info \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1055\u001b[39m \u001b[38;5;66;03m# For non-tabular modalities, use special task IDs if task_id not provided\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1056\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m task_id \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.modality == \u001b[33m\"\u001b[39m\u001b[33mvision\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 1057\u001b[39m task_id = VISION_CLASSIFICATION_TASK_ID\n\u001b[32m 1058\u001b[39m \u001b[38;5;28mself\u001b[39m.logger.debug(\n\u001b[32m 1059\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mUsing special vision classification task_id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtask_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 1060\u001b[39m )\n", - "\u001b[31mTypeError\u001b[39m: get_whisper_embeddings() got an unexpected keyword argument 'whisper_model'" - ] - } - ], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -1190,84 +1044,109 @@ "print(\" āœ… No need for extensive hyperparameter tuning\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## šŸ’¬ Part 5: Interactive Chat with MARVIS\n", + "\n", + "One of MARVIS's unique features is the ability to have a natural language conversation about your predictions! After running `predict()` or `evaluate()`, you can use the `.chat()` method to ask questions about the results, discuss patterns, or get explanations.\n", + "\n", + "Let's demonstrate this powerful feature:" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Summary of results across modalities\n", - "print(\"šŸ“Š MARVIS Performance Summary Across Modalities\")\n", - "print(\"=\" * 60)\n", + "# Chat demonstration using the tabular classifier from earlier\n", + "print(\"šŸ’¬ Starting Interactive Chat with MARVIS\")\n", + "print(\"=\" * 50)\n", "\n", - "# Collect results from all completed experiments\n", - "results_summary = {}\n", - "\n", - "# Tabular data results\n", - "if 'tabular_results' in locals():\n", - " results_summary[\"Tabular Data\"] = {\n", - " \"samples\": len(X_test_tab[:10]) if 'X_test_tab' in locals() else 'N/A',\n", - " \"features\": X_tabular.shape[1] if 'X_tabular' in locals() else 'N/A',\n", - " \"classes\": len(class_names_tab) if 'class_names_tab' in locals() else 'N/A',\n", - " \"accuracy\": tabular_results.get('accuracy', 'N/A'),\n", - " \"method\": \"TabPFN embeddings → t-SNE → VLM reasoning\"\n", - " }\n", + "# Check if we have a classifier with predictions available\n", + "if 'tabular_classifier' in locals() and hasattr(tabular_classifier, '_last_prediction_context'):\n", + " print(\"āœ… Using tabular classifier with existing predictions\")\n", + " \n", + " try:\n", + " # First conversation - ask about model performance\n", + " print(\"\\n🧠 User: How well did the model perform on the test data?\")\n", + " response1 = tabular_classifier.chat(\"How well did the model perform on the test data?\")\n", + " print(f\"šŸ¤– MARVIS: {response1}\")\n", + " \n", + " print(\"\\n\" + \"-\" * 60)\n", + " \n", + " # Second conversation - ask about patterns\n", + " print(\"\\n🧠 User: What patterns did you observe in the visualization?\") \n", + " response2 = tabular_classifier.chat(\"What patterns did you observe in the visualization?\")\n", + " print(f\"šŸ¤– MARVIS: {response2}\")\n", + " \n", + " print(\"\\n\" + \"-\" * 60)\n", + " \n", + " # Third conversation - ask for improvement suggestions\n", + " print(\"\\n🧠 User: How could we improve the classification results?\")\n", + " response3 = tabular_classifier.chat(\"How could we improve the classification results?\")\n", + " print(f\"šŸ¤– MARVIS: {response3}\")\n", + " \n", + " print(\"\\n\" + \"-\" * 60)\n", + " \n", + " # Show chat history\n", + " print(\"\\nšŸ“š Chat History Summary:\")\n", + " history = tabular_classifier.get_chat_history()\n", + " for i, exchange in enumerate(history, 1):\n", + " print(f\" {i}. User: {exchange['user'][:50]}...\" if len(exchange['user']) > 50 else f\" {i}. User: {exchange['user']}\")\n", + " print(f\" MARVIS: {exchange['assistant'][:50]}...\" if len(exchange['assistant']) > 50 else f\" MARVIS: {exchange['assistant']}\")\n", + " \n", + " print(f\"\\nšŸ’” Total chat exchanges: {len(history)}\")\n", + " \n", + " except Exception as e:\n", + " print(f\"āŒ Chat demonstration failed: {e}\")\n", + " print(\"šŸ’” This might be due to VLM model limitations or API access issues.\")\n", + " print(\"šŸ”§ In a real scenario with proper model access, the chat would work as shown.\")\n", + " \n", + " # Show what the conversation would look like\n", + " print(\"\\nšŸŽ­ Example Conversation (Simulated):\")\n", + " print(\"\\n🧠 User: How well did the model perform on the test data?\")\n", + " print(\"šŸ¤– MARVIS: Based on the recent evaluation, the model achieved 90% accuracy on 10 test samples. The high accuracy suggests that the t-SNE visualization effectively captured the underlying patterns in your tabular data, allowing the vision language model to distinguish between Class A, Class B, and Class C with good reliability.\")\n", + " \n", + " print(\"\\n🧠 User: What patterns did you observe in the visualization?\")\n", + " print(\"šŸ¤– MARVIS: In the t-SNE visualization, I observed three distinct clusters corresponding to your three classes. The clusters showed good separation with minimal overlap, which explains the high classification accuracy. The KNN connections (k=5) revealed that similar data points were properly grouped together, indicating that the TabPFN embeddings captured meaningful feature relationships.\")\n", + " \n", + " print(\"\\n🧠 User: How could we improve the classification results?\")\n", + " print(\"šŸ¤– MARVIS: Several strategies could potentially improve results: 1) Increase the number of training samples for better embedding quality, 2) Experiment with different t-SNE perplexity values (currently 15) to optimize cluster separation, 3) Try 3D visualization for complex datasets, 4) Consider multi-visualization mode to combine PCA and t-SNE insights, or 5) Use a more powerful VLM model for enhanced reasoning capability.\")\n", "\n", - "# Image data results \n", - "if 'image_results' in locals():\n", - " results_summary[\"Image Data\"] = {\n", - " \"samples\": len(X_test_img[:8]) if 'X_test_img' in locals() else 'N/A',\n", - " \"features\": \"8x8 pixels (DINOV2 embeddings)\",\n", - " \"classes\": len(class_names_img) if 'class_names_img' in locals() else 'N/A',\n", - " \"accuracy\": image_results.get('accuracy', 'N/A'),\n", - " \"method\": \"Raw pixels → DINOV2 → t-SNE → VLM reasoning\"\n", - " }\n", + "else:\n", + " print(\"āš ļø No classifier with predictions available for chat demonstration\")\n", + " print(\"šŸ’” The chat feature requires running predict() or evaluate() first\")\n", + " \n", + " # Show example chat workflow\n", + " print(\"\\nšŸŽ­ Example Chat Workflow:\")\n", + " print(\"\"\"\n", + "# 1. Train and evaluate your model\n", + "classifier = MarvisTsneClassifier(modality=\"tabular\", vlm_model_id=\"Qwen/Qwen2.5-VL-3B-Instruct\")\n", + "classifier.fit(X_train, y_train, X_test, class_names=[\"A\", \"B\", \"C\"])\n", + "results = classifier.evaluate(X_test, y_test)\n", "\n", - "# Audio data results\n", - "if 'audio_available' in locals() and audio_available and 'audio_results' in locals():\n", - " results_summary[\"Audio Data\"] = {\n", - " \"samples\": len(audio_test_paths[:5]) if 'audio_test_paths' in locals() else 'N/A',\n", - " \"features\": \"Whisper embeddings\",\n", - " \"classes\": len(class_names_audio) if 'class_names_audio' in locals() else 'N/A',\n", - " \"accuracy\": audio_results.get('accuracy', 'N/A'),\n", - " \"method\": \"Audio → Whisper → t-SNE → VLM reasoning\"\n", - " }\n", + "# 2. Start chatting about the results\n", + "response = classifier.chat(\"How accurate were the predictions?\")\n", + "print(response)\n", "\n", - "# Multi-visualization results\n", - "if 'multi_viz_results' in locals():\n", - " results_summary[\"Multi-Visualization\"] = {\n", - " \"samples\": len(X_test_tab[:8]) if 'X_test_tab' in locals() else 'N/A',\n", - " \"features\": \"Same tabular data\",\n", - " \"classes\": len(class_names_tab) if 'class_names_tab' in locals() else 'N/A',\n", - " \"accuracy\": multi_viz_results.get('accuracy', 'N/A'),\n", - " \"method\": \"PCA + t-SNE → Multi-panel → VLM reasoning\"\n", - " }\n", + "# 3. Continue the conversation\n", + "response = classifier.chat(\"What patterns did you see in the data?\")\n", + "print(response)\n", "\n", - "# Display results\n", - "for modality, info in results_summary.items():\n", - " print(f\"\\nšŸŽÆ {modality}:\")\n", - " print(f\" • Test samples: {info['samples']}\")\n", - " print(f\" • Features: {info['features']}\")\n", - " print(f\" • Classes: {info['classes']}\")\n", - " print(f\" • Accuracy: {info['accuracy']}\")\n", - " print(f\" • Method: {info['method']}\")\n", + "# 4. Ask follow-up questions\n", + "response = classifier.chat(\"Which samples were misclassified and why?\")\n", + "print(response)\n", "\n", - "print(\"\\nšŸŽ‰ Key MARVIS Advantages:\")\n", - "print(\" āœ… Unified interface across modalities (tabular, image, audio)\")\n", - "print(\" āœ… Visual reasoning using state-of-the-art VLMs\")\n", - "print(\" āœ… Interpretable predictions through visualization\")\n", - "print(\" āœ… Multi-visualization support for robust classification\")\n", - "print(\" āœ… Handles few-shot learning scenarios effectively\")\n", - "print(\" āœ… No need for extensive hyperparameter tuning\")\n", - "print(\" āœ… Interactive chat for discussing predictions\")\n", - "\n", - "print(f\"\\nšŸ“Š Experiments completed: {len(results_summary)}\")\n", - "if len(results_summary) == 0:\n", - " print(\" āš ļø No experiments completed successfully\")\n", - "elif len(results_summary) < 4:\n", - " print(\" āš ļø Some experiments may have failed due to missing dependencies\")\n", - "else:\n", - " print(\" āœ… All experiments completed successfully!\")" + "# 5. Get chat history\n", + "history = classifier.get_chat_history()\n", + "print(f\"Had {len(history)} exchanges\")\n", + "\n", + "# 6. Clear history if needed\n", + "classifier.clear_chat_history()\n", + "\"\"\")" ] }, { @@ -1442,105 +1321,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Chat demonstration using the tabular classifier from earlier\n", - "print(\"šŸ’¬ Starting Interactive Chat with MARVIS\")\n", - "print(\"=\" * 50)\n", - "\n", - "# Check if we have a classifier with predictions available\n", - "if 'tabular_classifier' in locals() and hasattr(tabular_classifier, '_last_prediction_context'):\n", - " print(\"āœ… Using tabular classifier with existing predictions\")\n", - " \n", - " try:\n", - " # First conversation - ask about model performance\n", - " print(\"\\n🧠 User: How well did the model perform on the test data?\")\n", - " response1 = tabular_classifier.chat(\"How well did the model perform on the test data?\")\n", - " print(f\"šŸ¤– MARVIS: {response1}\")\n", - " \n", - " print(\"\\n\" + \"-\" * 60)\n", - " \n", - " # Second conversation - ask about patterns\n", - " print(\"\\n🧠 User: What patterns did you observe in the visualization?\") \n", - " response2 = tabular_classifier.chat(\"What patterns did you observe in the visualization?\")\n", - " print(f\"šŸ¤– MARVIS: {response2}\")\n", - " \n", - " print(\"\\n\" + \"-\" * 60)\n", - " \n", - " # Third conversation - ask for improvement suggestions\n", - " print(\"\\n🧠 User: How could we improve the classification results?\")\n", - " response3 = tabular_classifier.chat(\"How could we improve the classification results?\")\n", - " print(f\"šŸ¤– MARVIS: {response3}\")\n", - " \n", - " print(\"\\n\" + \"-\" * 60)\n", - " \n", - " # Show chat history\n", - " print(\"\\nšŸ“š Chat History Summary:\")\n", - " history = tabular_classifier.get_chat_history()\n", - " for i, exchange in enumerate(history, 1):\n", - " print(f\" {i}. User: {exchange['user'][:50]}...\" if len(exchange['user']) > 50 else f\" {i}. User: {exchange['user']}\")\n", - " print(f\" MARVIS: {exchange['assistant'][:50]}...\" if len(exchange['assistant']) > 50 else f\" MARVIS: {exchange['assistant']}\")\n", - " \n", - " print(f\"\\nšŸ’” Total chat exchanges: {len(history)}\")\n", - " \n", - " except Exception as e:\n", - " print(f\"āŒ Chat demonstration failed: {e}\")\n", - " print(\"šŸ’” This might be due to VLM model limitations or API access issues.\")\n", - " print(\"šŸ”§ In a real scenario with proper model access, the chat would work as shown.\")\n", - " \n", - " # Show what the conversation would look like\n", - " print(\"\\nšŸŽ­ Example Conversation (Simulated):\")\n", - " print(\"\\n🧠 User: How well did the model perform on the test data?\")\n", - " print(\"šŸ¤– MARVIS: Based on the recent evaluation, the model achieved 90% accuracy on 10 test samples. The high accuracy suggests that the t-SNE visualization effectively captured the underlying patterns in your tabular data, allowing the vision language model to distinguish between Class A, Class B, and Class C with good reliability.\")\n", - " \n", - " print(\"\\n🧠 User: What patterns did you observe in the visualization?\")\n", - " print(\"šŸ¤– MARVIS: In the t-SNE visualization, I observed three distinct clusters corresponding to your three classes. The clusters showed good separation with minimal overlap, which explains the high classification accuracy. The KNN connections (k=5) revealed that similar data points were properly grouped together, indicating that the TabPFN embeddings captured meaningful feature relationships.\")\n", - " \n", - " print(\"\\n🧠 User: How could we improve the classification results?\")\n", - " print(\"šŸ¤– MARVIS: Several strategies could potentially improve results: 1) Increase the number of training samples for better embedding quality, 2) Experiment with different t-SNE perplexity values (currently 15) to optimize cluster separation, 3) Try 3D visualization for complex datasets, 4) Consider multi-visualization mode to combine PCA and t-SNE insights, or 5) Use a more powerful VLM model for enhanced reasoning capability.\")\n", - "\n", - "else:\n", - " print(\"āš ļø No classifier with predictions available for chat demonstration\")\n", - " print(\"šŸ’” The chat feature requires running predict() or evaluate() first\")\n", - " \n", - " # Show example chat workflow\n", - " print(\"\\nšŸŽ­ Example Chat Workflow:\")\n", - " print(\"\"\"\n", - "# 1. Train and evaluate your model\n", - "classifier = MarvisTsneClassifier(modality=\"tabular\", vlm_model_id=\"Qwen/Qwen2.5-VL-3B-Instruct\")\n", - "classifier.fit(X_train, y_train, X_test, class_names=[\"A\", \"B\", \"C\"])\n", - "results = classifier.evaluate(X_test, y_test)\n", - "\n", - "# 2. Start chatting about the results\n", - "response = classifier.chat(\"How accurate were the predictions?\")\n", - "print(response)\n", - "\n", - "# 3. Continue the conversation\n", - "response = classifier.chat(\"What patterns did you see in the data?\")\n", - "print(response)\n", - "\n", - "# 4. Ask follow-up questions\n", - "response = classifier.chat(\"Which samples were misclassified and why?\")\n", - "print(response)\n", - "\n", - "# 5. Get chat history\n", - "history = classifier.get_chat_history()\n", - "print(f\"Had {len(history)} exchanges\")\n", - "\n", - "# 6. Clear history if needed\n", - "classifier.clear_chat_history()\n", - "\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## šŸ’¬ Part 5: Interactive Chat with MARVIS\n", - "\n", - "One of MARVIS's unique features is the ability to have a natural language conversation about your predictions! After running `predict()` or `evaluate()`, you can use the `.chat()` method to ask questions about the results, discuss patterns, or get explanations.\n", - "\n", - "Let's demonstrate this powerful feature:" - ] + "source": [] }, { "cell_type": "markdown", @@ -1736,4 +1517,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/marvis/models/marvis_tsne.py b/marvis/models/marvis_tsne.py index 3a511d4..6b80e54 100644 --- a/marvis/models/marvis_tsne.py +++ b/marvis/models/marvis_tsne.py @@ -2065,17 +2065,45 @@ def chat(self, user_input: str, max_history: int = 10) -> str: # Generate response using the VLM self.logger.info("Generating chat response...") - # Use the same interface as predictions but with text-only input + # Use the VLM wrapper interface for chat if hasattr(self.vlm_wrapper, 'generate_response'): + # Use the standard generate_response interface response = self.vlm_wrapper.generate_response( text_input=chat_prompt, image_input=None, # Text-only conversation max_tokens=1000, temperature=0.7 # Slightly higher temperature for conversational responses ) + elif hasattr(self.vlm_wrapper, 'generate'): + # Use the direct generate interface with proper parameters + from marvis.utils.model_loader import GenerationConfig + config = GenerationConfig( + max_new_tokens=512, + temperature=0.7, + do_sample=True, + top_p=0.9 + ) + response = self.vlm_wrapper.generate( + inputs=chat_prompt, + config=config + ) + elif hasattr(self.vlm_wrapper, 'generate_from_conversation'): + # Use conversation interface if available + from marvis.utils.model_loader import GenerationConfig + conversation = [{"role": "user", "content": chat_prompt}] + config = GenerationConfig( + max_new_tokens=512, + temperature=0.7, + do_sample=True, + top_p=0.9 + ) + response = self.vlm_wrapper.generate_from_conversation( + conversation, + config + ) else: - # Fallback for different wrapper interfaces - response = self.vlm_wrapper.chat(chat_prompt) + # Final fallback - raise informative error + raise AttributeError(f"VLM wrapper {type(self.vlm_wrapper)} doesn't have a supported generation method") # Clean up response if needed if isinstance(response, dict) and 'text' in response: