
Commit 75e1a02

committed: init
1 parent 641d4e4 commit 75e1a02

File tree: 637 files changed (+86748 −1 lines)


.gitignore

+6
@@ -127,3 +127,9 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# macOS
+.DS_Store
+
+# personal
+demo.ipynb

README.md

+29 −1
@@ -1,2 +1,30 @@
 # Ask-Anything
-a simple yet interesting tool for chatting with video
+
+Currently, Ask-Anything is a simple yet interesting tool for chatting with video.
+Our team is now trying to build a smart and robust chatbot for video understanding.
+
+# :fire: Updates
+- 2023/04/19: Code release
+  - [VideoChat](./video_chat/): Explicit communication with ChatGPT. Sensitive to time.
+  - [MiniGPT-4 for video](./video_miniGPT4/): Implicit communication with Vicuna. Not sensitive to time. (A simple extension of [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4), which will be improved in the future.)
+
+# :speech_balloon: Example
+
+# :hourglass_flowing_sand: Ongoing
+
+Our team mainly focuses on general video understanding and long-term video reasoning:
+
+- [ ] Strong video foundation model.
+- [ ] Large-scale and high-quality video-text dataset.
+- [ ] Large-scale long-term video reasoning benchmark.
+- [ ] Short-term video-language system with LLMs.
+- [ ] Long-term video-language system with LLMs.
+- [ ] Artificial Intelligence Generated Content (AIGC) for video.
+- [ ] ...
+
+We are hiring researchers, engineers and interns in **General Vision Group, Shanghai AI Lab**. If you are interested in working with us, please contact [Yi Wang](https://shepnerd.github.io/) (`[email protected]`).

example/hitting_baseball.mp4

671 KB
Binary file not shown.

example/yoga.mp4

758 KB
Binary file not shown.

video_chat/README.md

+43
@@ -0,0 +1,43 @@

# VideoChat

VideoChat is a multifunctional video question answering tool that combines the functions of Action Recognition, Visual Captioning and ChatGPT. Our solution generates dense, descriptive captions for any object and action in a video, offering a range of language styles to suit different user preferences. It supports conversations that vary in length, emotion, and authenticity of language.
- Video-Text Generation
- Chat about an uploaded video
- Interactive demo

# :fire: Updates

- **2023/04/19**: Code Release

# :speech_balloon: Example

![images](assert/hugging.png)
![images](assert/dancing.png)
![images](assert/dancing2.png)

# :running: Usage

```shell
# Clone the repository:
git clone ask-anything.git
cd ask-anything/video_chat

# Install dependencies:
pip install -r requirements.txt

# Download the checkpoints into ./pretrained_models/
wget -P ./pretrained_models/ https://huggingface.co/spaces/xinyu1205/Tag2Text/resolve/main/tag2text_swin_14m.pth
wget -P ./pretrained_models/ https://datarelease.blob.core.windows.net/grit/models/grit_b_densecap_objectdet.pth
git clone https://huggingface.co/mrm8488/flan-t5-large-finetuned-openai-summarize_from_feedback pretrained_models/flan-t5-large-finetuned-openai-summarize_from_feedback

# Configure the necessary ChatGPT API key
export OPENAI_API_KEY={Your_Private_Openai_Key}

# Run the VideoChat gradio demo.
python app.py
```

# Acknowledgement

The project is based on [InternVideo](https://github.com/OpenGVLab/InternVideo), [Tag2Text](https://github.com/xinyu1205/Tag2Text), [GRiT](https://github.com/JialianW/GRiT), [mrm8488](https://huggingface.co/mrm8488/flan-t5-large-finetuned-openai-summarize_from_feedback) and [ChatGPT](https://openai.com/blog/chatgpt). Thanks to the authors for their efforts.
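The download step above fetches two PyTorch checkpoints plus a Hugging Face model directory. A minimal sanity check before launching the demo might look like the sketch below; it only assumes the files deserialize with torch, not any particular key layout.

```python
import os
import torch

# Hedged sketch: confirm the downloaded checkpoints load on CPU before app.py
# builds the full models (run from within video_chat/).
for name in ["tag2text_swin_14m.pth", "grit_b_densecap_objectdet.pth"]:
    path = os.path.join("pretrained_models", name)
    ckpt = torch.load(path, map_location="cpu")
    print(name, type(ckpt), f"{os.path.getsize(path) / 1e6:.0f} MB")
```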

video_chat/app.py

+160
@@ -0,0 +1,160 @@

import os
import numpy as np
import random
import torch
import torchvision.transforms as transforms
from PIL import Image
from models.tag2text import tag2text_caption
from util import *
import gradio as gr
from chatbot import *
from load_internvideo import *
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from simplet5 import SimpleT5
from models.grit_model import DenseCaptioning

bot = ConversationBot()
image_size = 384
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
transform = transforms.Compose([transforms.ToPILImage(), transforms.Resize((image_size, image_size)), transforms.ToTensor(), normalize])


# define model
model = tag2text_caption(pretrained="pretrained_models/tag2text_swin_14m.pth", image_size=image_size, vit='swin_b')
model.eval()
model = model.to(device)
print("[INFO] initialize caption model success!")

model_T5 = SimpleT5()
if torch.cuda.is_available():
    model_T5.load_model(
        "t5", "./pretrained_models/flan-t5-large-finetuned-openai-summarize_from_feedback", use_gpu=True)
else:
    model_T5.load_model(
        "t5", "./pretrained_models/flan-t5-large-finetuned-openai-summarize_from_feedback", use_gpu=False)
print("[INFO] initialize summarize model success!")

# action recognition
intern_action = load_intern_action(device)
trans_action = transform_action()
topil = T.ToPILImage()
print("[INFO] initialize InternVideo model success!")

dense_caption_model = DenseCaptioning(device)
dense_caption_model.initialize_model()
print("[INFO] initialize dense caption model success!")


def inference(video_path, input_tag, progress=gr.Progress()):
    data = loadvideo_decord_origin(video_path)
    progress(0.2, desc="Loading Videos")

    # InternVideo action recognition on 8 evenly spaced frames
    action_index = np.linspace(0, len(data) - 1, 8).astype(int)
    tmp, tmpa = [], []
    for i, img in enumerate(data):
        tmp.append(transform(img).to(device).unsqueeze(0))
        if i in action_index:
            tmpa.append(topil(img))
    action_tensor = trans_action(tmpa)
    TC, H, W = action_tensor.shape
    action_tensor = action_tensor.reshape(1, TC // 3, 3, H, W).permute(0, 2, 1, 3, 4).to(device)
    prediction = intern_action(action_tensor)
    prediction = F.softmax(prediction, dim=1).flatten()
    prediction = kinetics_classnames[str(int(prediction.argmax()))]

    # dense caption on every 5th frame
    dense_caption = []
    dense_index = np.arange(0, len(data) - 1, 5)
    original_images = data[dense_index, :, :, ::-1]
    for original_image in original_images:
        dense_caption.append(dense_caption_model.run_caption_tensor(original_image))
    dense_caption = ' '.join([f"Second {i+1} : {j}.\n" for i, j in zip(dense_index, dense_caption)])

    # Video Caption
    image = torch.cat(tmp).to(device)

    model.threshold = 0.68
    if input_tag == '' or input_tag == 'none' or input_tag == 'None':
        input_tag_list = None
    else:
        input_tag_list = []
        input_tag_list.append(input_tag.replace(',', ' | '))
    with torch.no_grad():
        caption, tag_predict = model.generate(image, tag_input=input_tag_list, max_length=50, return_tag_predict=True)
        progress(0.6, desc="Watching Videos")
        frame_caption = ' '.join([f"Second {i+1}:{j}.\n" for i, j in enumerate(caption)])
        if input_tag_list == None:
            tag_1 = set(tag_predict)
            tag_2 = ['none']
        else:
            _, tag_1 = model.generate(image, tag_input=None, max_length=50, return_tag_predict=True)
            tag_2 = set(tag_predict)
        progress(0.8, desc="Understanding Videos")
    synth_caption = model_T5.predict('. '.join(caption))
    print(frame_caption, dense_caption, synth_caption)
    # Tuple order matches the outputs of caption.click below: model tags, user tags,
    # per-second captions, dense captions, T5 summary, re-enabled chat button, action label.
    return ' | '.join(tag_1), ' | '.join(tag_2), frame_caption, dense_caption, synth_caption[0], gr.update(interactive=True), prediction


with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
    gr.Markdown("<h1><center>Ask Anything with GPT</center></h1>")
    gr.Markdown(
        """
        Ask-Anything is a multifunctional video question answering tool that combines the functions of Action Recognition, Visual Captioning and ChatGPT. Our solution generates dense, descriptive captions for any object and action in a video, offering a range of language styles to suit different user preferences. It supports conversations that vary in length, emotion, and authenticity of language.<br>
        """
    )

    with gr.Row():
        with gr.Column():
            input_video_path = gr.inputs.Video(label="Input Video")
            input_tag = gr.Textbox(lines=1, label="User Prompt (Optional, Enter with commas)", visible=False)

            with gr.Row():
                with gr.Column(scale=0.3, min_width=0):
                    caption = gr.Button("✍ Upload")
                    chat_video = gr.Button(" 🎥 Let's Chat! ", interactive=False)
                with gr.Column(scale=0.7, min_width=0):
                    loadinglabel = gr.Label(label="State")
        with gr.Column():
            openai_api_key_textbox = gr.Textbox(
                value=os.environ["OPENAI_API_KEY"],
                placeholder="Paste your OpenAI API key here to start (sk-...)",
                show_label=False,
                lines=1,
                type="password",
            )
            chatbot = gr.Chatbot(elem_id="chatbot", label="gpt")
            state = gr.State([])
            user_tag_output = gr.State("")
            image_caption_output = gr.State("")
            video_caption_output = gr.State("")
            model_tag_output = gr.State("")
            dense_caption_output = gr.State("")
            with gr.Row(visible=False) as input_raws:
                with gr.Column(scale=0.8):
                    txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
                with gr.Column(scale=0.10, min_width=0):
                    run = gr.Button("🏃‍♂️Run")
                with gr.Column(scale=0.10, min_width=0):
                    clear = gr.Button("🔄Clear️")

    caption.click(bot.memory.clear)
    caption.click(lambda: gr.update(interactive=False), None, chat_video)
    caption.click(lambda: [], None, chatbot)
    caption.click(lambda: [], None, state)
    caption.click(inference, [input_video_path, input_tag], [model_tag_output, user_tag_output, image_caption_output, dense_caption_output, video_caption_output, chat_video, loadinglabel])

    chat_video.click(bot.init_agent, [openai_api_key_textbox, image_caption_output, dense_caption_output, video_caption_output, model_tag_output, state], [input_raws, chatbot, state])

    txt.submit(bot.run_text, [txt, state], [chatbot, state])
    txt.submit(lambda: "", None, txt)
    run.click(bot.run_text, [txt, state], [chatbot, state])
    run.click(lambda: "", None, txt)

    clear.click(bot.memory.clear)
    clear.click(lambda: [], None, chatbot)
    clear.click(lambda: [], None, state)

demo.launch(server_name="0.0.0.0", enable_queue=True)  # share=True
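To make the per-second wording in the captions concrete, below is a small sketch of the two sampling schedules used in `inference()`. The 32-frame clip length is an arbitrary example, and treating one decoded frame as one second follows the prompt's wording rather than anything verified inside `loadvideo_decord_origin`.

```python
import numpy as np

num_frames = 32  # example clip length (assumption)
action_index = np.linspace(0, num_frames - 1, 8).astype(int)  # 8 evenly spaced frames for InternVideo
dense_index = np.arange(0, num_frames - 1, 5)                 # every 5th frame for dense captioning
print(action_index)  # [ 0  4  8 13 17 22 26 31]
print(dense_index)   # [ 0  5 10 15 20 25 30]
```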

video_chat/assert/dancing.png

502 KB

video_chat/assert/dancing2.png

503 KB

video_chat/assert/hugging.png

1.13 MB

video_chat/chatbot.py

+84
@@ -0,0 +1,84 @@

from langchain.agents.initialize import initialize_agent
from langchain.agents.tools import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.llms.openai import OpenAI
import re
import gradio as gr
import openai


def cut_dialogue_history(history_memory, keep_last_n_words=400):
    # Trim the chat memory to roughly the last `keep_last_n_words` words,
    # dropping whole lines from the front of the buffer.
    if history_memory is None or len(history_memory) == 0:
        return history_memory
    tokens = history_memory.split()
    n_tokens = len(tokens)
    print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
    if n_tokens < keep_last_n_words:
        return history_memory
    paragraphs = history_memory.split('\n')
    last_n_tokens = n_tokens
    while last_n_tokens >= keep_last_n_words:
        last_n_tokens -= len(paragraphs[0].split(' '))
        paragraphs = paragraphs[1:]
    return '\n' + '\n'.join(paragraphs)


class ConversationBot:
    def __init__(self):
        self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
        self.tools = []

    def run_text(self, text, state):
        self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
        res = self.agent({"input": text.strip()})
        res['output'] = res['output'].replace("\\", "/")
        response = res['output']
        state = state + [(text, response)]
        print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n"
              f"Current Memory: {self.agent.memory.buffer}")
        return state, state

    def init_agent(self, openai_api_key, image_caption, dense_caption, video_caption, tags, state):
        # Build a conversational agent whose prompt embeds the per-second captions,
        # dense captions, tags and summary produced by app.py.
        chat_history = ''
        PREFIX = "ChatVideo is a chatbot that chats with you based on video descriptions."
        FORMAT_INSTRUCTIONS = """
When you have a response to say to the Human, you MUST use the format:
```
{ai_prefix}: [your response here]
```
"""
        SUFFIX = f"""You are a chatbot that conducts conversations based on video descriptions. You mainly answer based on the given description, and you can also modify the content according to the tag information, and you can also answer the relevant knowledge of the person or object contained in the video. The second description is a description for one second, so that you can convert it into time. When describing, please mainly refer to the second description. Dense caption is to give content every five seconds, you can disambiguate them in timing. But you don't create a video plot out of nothing.

Begin!

Video tags are: {tags}

The second description of the video is: {image_caption}

The dense caption of the video is: {dense_caption}

The general description of the video is: {video_caption}""" + """Previous conversation history {chat_history}

New input: {input}

{agent_scratchpad}
"""
        self.memory.clear()

        self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

        self.agent = initialize_agent(
            self.tools,
            self.llm,
            agent="conversational-react-description",
            verbose=True,
            memory=self.memory,
            return_intermediate_steps=True,
            agent_kwargs={'prefix': PREFIX, 'format_instructions': FORMAT_INSTRUCTIONS, 'suffix': SUFFIX}, )
        state = state + [("I upload a video, please watch it first!", "I have watched this video. Let's chat!")]
        return gr.update(visible=True), state, state


if __name__ == "__main__":
    import pdb
    pdb.set_trace()
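`cut_dialogue_history` above drops whole lines from the front of the memory buffer until fewer than `keep_last_n_words` words remain (`run_text` calls it with 500). A small illustration with a synthetic buffer (the example text below is made up):

```python
from chatbot import cut_dialogue_history  # run from within video_chat/

# Synthetic 1800-word buffer: 300 question/answer line pairs.
buffer = "\n".join(f"Human: question {i}\nAI: answer {i}" for i in range(300))
trimmed = cut_dialogue_history(buffer, keep_last_n_words=500)
print(len(buffer.split()), len(trimmed.split()))  # 1800 -> just under 500 words kept
```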

video_chat/configs/med_config.json

+21
@@ -0,0 +1,21 @@
{
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30524,
  "encoder_width": 768,
  "add_cross_attention": true
}
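This is a BERT-style decoder configuration in the format used by Tag2Text/BLIP-derived code; exactly how `models/tag2text.py` consumes it is not shown in this commit, so the loader below is only a sketch using Hugging Face `transformers`.

```python
from transformers import BertConfig

# Hedged sketch: inspect the shipped decoder config. Non-standard keys such as
# "encoder_width" are kept as extra attributes on the config object.
cfg = BertConfig.from_json_file("video_chat/configs/med_config.json")
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.vocab_size)  # 768 12 30524
```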

video_chat/configs/q2l_config.json

+23
@@ -0,0 +1,23 @@
{
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522,
  "encoder_width": 768,
  "add_cross_attention": true,
  "add_tag_cross_attention": false
}
@@ -0,0 +1,10 @@
{
  "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth",
  "vision_width": 1024,
  "image_res": 224,
  "window_size": 7,
  "embed_dim": 128,
  "depths": [ 2, 2, 18, 2 ],
  "num_heads": [ 4, 8, 16, 32 ]
}
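As a quick consistency check (not code from the repo): a Swin backbone's channel width doubles at each of its four stages, so an `embed_dim` of 128 yields the 1024-dimensional features declared as `vision_width` above.

```python
# Swin-B: four stages, channel width doubles at each stage.
embed_dim, num_stages = 128, 4
vision_width = embed_dim * 2 ** (num_stages - 1)
print(vision_width)  # 1024, matching "vision_width" in the config above
```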
