# NOTE(review): `spaces` is kept as the very first import, as in the original
# ordering; ZeroGPU setups presumably need it imported before torch - confirm
# before regrouping these imports.
import spaces

import os

import torch
import gradio as gr

import sys
sys.path.append('./')
from videollama2.constants import MMODAL_TOKEN_INDEX, DEFAULT_MMODAL_TOKEN
from videollama2.conversation import conv_templates, SeparatorStyle, Conversation
from videollama2.model.builder import load_pretrained_model
from videollama2.mm_utils import KeywordsStoppingCriteria, tokenizer_MMODAL_token, get_model_name_from_path, process_image, process_video


# Header rendered at the top of the demo page.
title_markdown = ("""
VideoLLaMA 2 🔥🚀🔥

VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs

""")


# Custom colour palette; not referenced by any code visible in this file.
plum_color = gr.themes.colors.Color(
    name='plum',
    c50='#F8E4EF',
    c100='#EDD2DF',
    c200='#E4BFD2',
    c300='#DBACC5',
    c400='#D299B8',
    c500='#C986AB',
    c600='#C0739E',
    c700='#B76091',
    c800='#AE4D84',
    c900='#A53A77',
    c950='#9C276A',
)


class Chat:
    """Bundle a pretrained model with its tokenizer and processor.

    The instance loads everything once in ``__init__`` and exposes
    ``generate`` for producing a single reply to a multimodal prompt.
    """

    def __init__(self, model_path, conv_mode, model_base=None, load_8bit=False, load_4bit=False):
        """Load the model, tokenizer and processor found at ``model_path``.

        Args:
            model_path: Path or hub identifier handed to ``load_pretrained_model``.
            conv_mode: Key into ``conv_templates`` selecting the conversation
                template.
            model_base: Optional base model forwarded to the loader.
            load_8bit: Forwarded to the loader.
            load_4bit: Forwarded to the loader.
        """
        # disable_torch_init()
        model_name = get_model_name_from_path(model_path)
        # The loader also returns a context length, which this class never uses.
        self.tokenizer, self.model, processor, _context_len = load_pretrained_model(
            model_path, model_base, model_name,
            load_8bit, load_4bit,
            offload_folder="save_folder")
        self.processor = processor
        self.conv_mode = conv_mode
        self.conv = conv_templates[conv_mode].copy()

    def get_prompt(self, qs, state):
        """Append the user turn ``qs`` and an empty assistant turn to ``state``.

        Returns:
            The same ``state`` object, mutated in place.
        """
        state.append_message(state.roles[0], qs)
        state.append_message(state.roles[1], None)
        return state

    @spaces.GPU(duration=120)
    @torch.inference_mode()
    def generate(self, tensor: list, modals: list, prompt: str, first_run: bool, state, temperature, top_p, max_output_tokens):
        """Generate one reply for ``prompt`` conditioned on the given media.

        Args:
            tensor: Preprocessed media inputs, passed to ``model.generate`` as
                ``images_or_videos``.
            modals: Modality names parallel to ``tensor``; only ``modals[0]``
                selects the multimodal token index used for tokenization.
            prompt: User text, appended to ``state`` before the full prompt is
                built.
            first_run: Accepted for interface compatibility; not used here.
            state: Conversation object that receives the new turns.
            temperature: Sampling temperature forwarded to ``model.generate``.
            top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
            max_output_tokens: Forwarded as ``max_new_tokens``.

        Returns:
            A ``(outputs, state)`` tuple: the decoded text of the first
            sequence and the mutated conversation.

        Raises:
            ValueError: If ``tensor`` and ``modals`` differ in length, or if no
                modality is supplied.
        """
        # TODO: support multiple turns of conversation.
        # Explicit raises instead of `assert`, which is stripped under `python -O`.
        if len(tensor) != len(modals):
            raise ValueError("tensor and modals must have the same length")
        if not modals:
            # modals[0] is indexed below; fail with a clear message instead of IndexError.
            raise ValueError("at least one modality is required")

        # 1. prepare model and tokenizer.
        tokenizer, model = self.tokenizer, self.model

        # 2. text preprocess (tag process & generate prompt).
        state = self.get_prompt(prompt, state)
        prompt = state.get_prompt()

        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, MMODAL_TOKEN_INDEX[modals[0]], return_tensors='pt')
        input_ids = input_ids.unsqueeze(0).to(model.device)

        # 3. generate response according to visual signals and prompts.
        # The stop string comes from the template stored on the instance, not from `state`.
        if self.conv.sep_style in [SeparatorStyle.SINGLE]:
            stop_str = self.conv.sep
        else:
            stop_str = self.conv.sep2
        stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

        # The method already runs under @torch.inference_mode(), so no nested context is needed.
        output_ids = model.generate(
            input_ids,
            images_or_videos=tensor,
            modal_list=modals,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_output_tokens,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
        )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

        print(outputs)
        return outputs, state


@spaces.GPU(duration=120)
def generate(image, video, state, state_, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
    """Gradio callback: answer ``textbox_in`` about the uploaded image or video.

    Relies on the module-level globals ``handler`` and ``conv_mode``.

    Args:
        image: File path of the uploaded image, or a falsy value.
        video: File path of the uploaded video, or a falsy value.
        state: Conversation shown in the chatbot, or a non-Conversation value
            (for example ``None``) before the first call.
        state_: Conversation handed to the model; emptied again before
            returning because only single-turn inference is supported.
        textbox_in: The user's instruction.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        max_output_tokens: Maximum number of new tokens.
        dtype: dtype the processed media tensor is cast to.

    Returns:
        A 5-tuple ``(image_update, video_update, chatbot_messages, state,
        state_)`` matching the outputs wired to this callback.

    Raises:
        gr.Error: If no instruction is available, or neither an image nor a
            video was supplied.
        NotImplementedError: If both an image and a video were supplied.
    """
    # Create the conversations first: the empty-textbox fallback below reads
    # state_.messages, and both states are not Conversations on the first call.
    if not isinstance(state, Conversation):
        state = conv_templates[conv_mode].copy()
        state_ = conv_templates[conv_mode].copy()
    elif not isinstance(state_, Conversation):
        state_ = conv_templates[conv_mode].copy()

    # With an empty textbox, fall back to the last message held in state_.
    if not textbox_in and state_.messages:
        textbox_in = state_.messages.pop(-1)[1]
    if not textbox_in:
        # The original `assert "Please enter instruction"` could never fail,
        # because a non-empty string is always truthy.
        raise gr.Error("Please enter instruction")

    # Resolve each path once instead of re-checking the filesystem repeatedly.
    has_image = bool(image) and os.path.exists(image)
    has_video = bool(video) and os.path.exists(video)

    # BUG: Only support single video and image inference now.
    if has_image and has_video:
        raise NotImplementedError("Not support image and video at the same time")
    if not (has_image or has_video):
        # Chat.generate indexes modals[0], so a text-only request cannot work.
        raise gr.Error("Please upload an image or a video")

    first_run = not state.messages

    text_en_in = textbox_in.replace("picture", "image")

    processor = handler.processor
    device = handler.model.device
    if has_image:
        modal = 'IMAGE'
        media = process_image(image, processor)
    else:
        modal = 'VIDEO'
        media = process_video(video, processor)
    tensor = [media.to(device, dtype=dtype)]
    modals = [modal]

    # Ensure the modality token appears exactly once, at the start of the prompt.
    modal_token = DEFAULT_MMODAL_TOKEN[modal]
    text_en_in = modal_token + '\n' + text_en_in.replace(modal_token, '').strip()

    text_en_out, state_ = handler.generate(
        tensor, modals, text_en_in,
        first_run=first_run, state=state_,
        temperature=temperature, top_p=top_p,
        max_output_tokens=max_output_tokens)
    state_.messages[-1] = (state_.roles[1], text_en_out)

    # Keep only the text before the first '#'.
    text_en_out = text_en_out.split('#')[0]
    textbox_out = text_en_out

    # NOTE(review): both fragments appended below are empty strings in this
    # copy of the file; any media preview markup appears to have been lost -
    # confirm what should be appended here.
    show_images = ""
    if has_image:
        show_images += ''
    if has_video:
        show_images += ''

    state.append_message(state.roles[0], textbox_in + "\n" + show_images)
    state.append_message(state.roles[1], textbox_out)

    # BUG: only support single turn conversation now.
    # Drop the user/assistant pair that handler.generate just added.
    del state_.messages[-2:]

    return (gr.update(value=image if has_image else None, interactive=True),
            gr.update(value=video if has_video else None, interactive=True),
            state.to_gradio_chatbot(),
            state,
            state_)


def regenerate(state, state_):
    """Gradio callback: drop the last user/assistant pair from ``state``.

    The wired-up event chain is expected to call ``generate`` afterwards to
    produce a new answer.

    Returns:
        A 3-tuple ``(chatbot_messages, state, state_)``.
    """
    # Before any generation the states are not Conversations; start fresh
    # instead of failing on attribute access.
    if not isinstance(state, Conversation):
        state = conv_templates[conv_mode].copy()
        state_ = conv_templates[conv_mode].copy()
    # Slice deletion removes up to two trailing messages and is a no-op on an
    # empty history, unlike two unconditional pop() calls.
    del state.messages[-2:]
    return state.to_gradio_chatbot(), state, state_


def clear_history(state, state_):
    """Gradio callback: reset both conversations and clear the input widgets.

    The incoming ``state`` and ``state_`` are ignored and replaced by fresh
    copies of the ``conv_mode`` template.

    Returns:
        A 6-tuple ``(image_update, video_update, chatbot_messages, state,
        state_, textbox_update)``.
    """
    state = conv_templates[conv_mode].copy()
    state_ = conv_templates[conv_mode].copy()
    return (gr.update(value=None, interactive=True),
            gr.update(value=None, interactive=True),
            state.to_gradio_chatbot(),
            state,
            state_,
            gr.update(value=None, interactive=True))


# BUG of Zero Environment
# 1. The environment is fixed to torch==2.0.1+cu117, gradio>=4.x.x
# 2.
#    The operations or tensors which require CUDA are limited to those
#    functions wrapped via spaces.GPU
# 3. The function can't return tensors or other CUDA objects.

# Module-level configuration read by the callbacks defined in this file.
conv_mode = "llama_2"
model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'

# Not referenced by any code visible in this file; kept as a module-level name.
device = torch.device("cuda")

# Loads the 4-bit model once at import time.
handler = Chat(model_path, conv_mode=conv_mode, load_8bit=False, load_4bit=True)

# Created outside the Blocks context and rendered later inside the layout.
textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)

# NOTE(review): `block_css`, `tos_markdown` and `learn_more_markdown` are
# referenced below but are not defined anywhere in the visible source -
# confirm where they come from.
# NOTE(review): the original indentation was lost, so the nesting of the
# layout containers below is a best-effort reconstruction - confirm.
with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=gr.themes.Default(primary_hue=gr.themes.colors.violet), css=block_css) as demo:
    gr.Markdown(title_markdown)
    # Both states start without a value; the callbacks create the
    # conversations on first use.
    state = gr.State()
    state_ = gr.State()

    with gr.Row():
        with gr.Column(scale=3):
            image = gr.Image(label="Input Image", type="filepath")
            video = gr.Video(label="Input Video")

            with gr.Accordion("Parameters", open=True) as parameter_row:
                # num_beams = gr.Slider(
                #     minimum=1,
                #     maximum=10,
                #     value=1,
                #     step=1,
                #     interactive=True,
                #     label="beam search numbers",
                # )

                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.2,
                    step=0.1,
                    interactive=True,
                    label="Temperature",
                )

                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    interactive=True,
                    label="Top P",
                )

                max_output_tokens = gr.Slider(
                    minimum=64,
                    maximum=1024,
                    value=512,
                    step=64,
                    interactive=True,
                    label="Max output tokens",
                )

        with gr.Column(scale=7):
            chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
            with gr.Row():
                with gr.Column(scale=8):
                    textbox.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
            with gr.Row(elem_id="buttons") as button_row:
                # The vote buttons are created but no handler is attached to
                # them in the visible code.
                upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
                downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
                # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
                # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
                clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

    with gr.Column():
        # Example media is resolved relative to this script's directory.
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/extreme_ironing.jpg",
                    "What is the phone recording?",
                ],
                [
                    f"{cur_dir}/examples/waterview.jpg",
                    "What are the things I should be cautious about when I visit here?",
                ],
                [
                    f"{cur_dir}/examples/desert.jpg",
                    "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?",
                ],
            ],
            inputs=[image, textbox],
        )

        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/rap.mp4",
                    "What happens in this video?",
                ],
                [
                    f"{cur_dir}/examples/demo2.mp4",
                    "Do you think it's morning or night in this video? Why?",
                ],
                [
                    f"{cur_dir}/examples/demo3.mp4",
                    "At the intersection, in which direction does the red car turn?",
                ],
            ],
            inputs=[video, textbox],
        )

    gr.Markdown(tos_markdown)
    gr.Markdown(learn_more_markdown)

    # NOTE(review): the three bindings below had lost their call heads in the
    # source (only the arguments and a dangling ")" remained). They are
    # restored as `.click` listeners on the matching, otherwise unused
    # buttons - confirm against the intended wiring.
    submit_btn.click(
        generate,
        [image, video, state, state_, textbox, temperature, top_p, max_output_tokens],
        [image, video, chatbot, state, state_])

    # Regenerating first drops the last exchange, then re-runs generation
    # with the current widget values.
    regenerate_btn.click(
        regenerate,
        [state, state_],
        [chatbot, state, state_]).then(
        generate,
        [image, video, state, state_, textbox, temperature, top_p, max_output_tokens],
        [image, video, chatbot, state, state_])

    clear_btn.click(
        clear_history,
        [state, state_],
        [image, video, chatbot, state, state_, textbox])

demo.launch()