Vision-CAIR committed
Commit 4af3b7a
1 Parent(s): 6d513de
Upload folder using huggingface_hub
- __pycache__/Qformer.cpython-39.pyc +0 -0
- __pycache__/__init__.cpython-39.pyc +0 -0
- __pycache__/base_model.cpython-39.pyc +0 -0
- __pycache__/base_processor.cpython-39.pyc +0 -0
- __pycache__/blip2.cpython-39.pyc +0 -0
- __pycache__/blip_processors.cpython-39.pyc +0 -0
- __pycache__/clip_vision_encoder.cpython-39.pyc +0 -0
- __pycache__/config.cpython-39.pyc +0 -0
- __pycache__/conversation.cpython-39.pyc +0 -0
- __pycache__/dist_utils.cpython-39.pyc +0 -0
- __pycache__/eva_vit.cpython-310.pyc +0 -0
- __pycache__/eva_vit.cpython-39.pyc +0 -0
- __pycache__/logger.cpython-39.pyc +0 -0
- __pycache__/mini_gpt4_llama_v2.cpython-310.pyc +0 -0
- __pycache__/mini_gpt4_llama_v2.cpython-39.pyc +0 -0
- __pycache__/modeling_llama_v2.cpython-39.pyc +0 -0
- __pycache__/randaugment.cpython-39.pyc +0 -0
- __pycache__/registry.cpython-39.pyc +0 -0
- __pycache__/utils.cpython-39.pyc +0 -0
- mini_gpt4_llama_v2.py +128 -16
__pycache__/Qformer.cpython-39.pyc
ADDED
Binary file (30.9 kB)

__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (6.25 kB)

__pycache__/base_model.cpython-39.pyc
ADDED
Binary file (8.25 kB)

__pycache__/base_processor.cpython-39.pyc
ADDED
Binary file (1.35 kB)

__pycache__/blip2.cpython-39.pyc
ADDED
Binary file (6.44 kB)

__pycache__/blip_processors.cpython-39.pyc
ADDED
Binary file (4.42 kB)

__pycache__/clip_vision_encoder.cpython-39.pyc
ADDED
Binary file (2.98 kB)

__pycache__/config.cpython-39.pyc
ADDED
Binary file (12.3 kB)

__pycache__/conversation.cpython-39.pyc
ADDED
Binary file (7.23 kB)

__pycache__/dist_utils.cpython-39.pyc
ADDED
Binary file (3.89 kB)

__pycache__/eva_vit.cpython-310.pyc
CHANGED
Binary files a/__pycache__/eva_vit.cpython-310.pyc and b/__pycache__/eva_vit.cpython-310.pyc differ

__pycache__/eva_vit.cpython-39.pyc
ADDED
Binary file (14 kB)

__pycache__/logger.cpython-39.pyc
ADDED
Binary file (6.43 kB)

__pycache__/mini_gpt4_llama_v2.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mini_gpt4_llama_v2.cpython-310.pyc and b/__pycache__/mini_gpt4_llama_v2.cpython-310.pyc differ

__pycache__/mini_gpt4_llama_v2.cpython-39.pyc
ADDED
Binary file (24.7 kB)

__pycache__/modeling_llama_v2.cpython-39.pyc
ADDED
Binary file (4.22 kB)

__pycache__/randaugment.cpython-39.pyc
ADDED
Binary file (12.2 kB)

__pycache__/registry.cpython-39.pyc
ADDED
Binary file (8.68 kB)

__pycache__/utils.cpython-39.pyc
ADDED
Binary file (12.6 kB)
mini_gpt4_llama_v2.py
CHANGED
@@ -1,16 +1,24 @@
 import logging
 import random
-
+import torch
+import webvtt
+import os
+import cv2
+from torchvision import transforms
+import soundfile as sf
+import moviepy.editor as mp
+from PIL import Image
+from moviepy.editor import VideoFileClip
+import torch
+import random
+import torch.backends.cudnn as cudnn
 import torch
 from torch.cuda.amp import autocast as autocast
 import torch.nn as nn
 
 from minigpt4_video.registry import registry
 from minigpt4_video.blip2 import Blip2Base, disabled_train
-# from minigpt4_video.modeling_llama_v2 import LlamaForCausalLM as llm_model
-# from minigpt4_video.modeling_mistral import MistralForCausalLM as llm_model
 from minigpt4_video.conversation import Conversation, SeparatorStyle, StoppingCriteriaList, StoppingCriteriaSub
-
 from transformers import LlamaTokenizer
 from transformers import BitsAndBytesConfig
 from transformers import AutoConfig, AutoTokenizer
@@ -22,14 +30,34 @@ from peft import (
     set_peft_model_state_dict,
 )
 import time
-import json
 import numpy as np
 import os
 from transformers import PretrainedConfig
 from transformers import PreTrainedModel
-from typing import List
-from collections import defaultdict
 from minigpt4_video.conversation import CONV_VISION
+import cv2
+def extract_audio(video_path, audio_path):
+    video_clip = mp.VideoFileClip(video_path)
+    audio_clip = video_clip.audio
+    audio_clip.write_audiofile(audio_path, codec="libmp3lame", bitrate="320k")
+
+def generate_subtitles(video_path):
+    video_id=video_path.split('/')[-1].split('.')[0]
+    audio_path = f"workspace/inference_subtitles/mp3/{video_id}"+'.mp3'
+    os.makedirs("workspace/inference_subtitles/mp3",exist_ok=True)
+    try:
+        extract_audio(video_path,audio_path)
+        print("successfully extracted")
+        os.system(f"whisper {audio_path} --language English --model large --output_format vtt --output_dir workspace/inference_subtitles/")
+        # remove the audio file
+        os.system(f"rm {audio_path}")
+        print("subtitle successfully generated")
+        return f"workspace/inference_subtitles/{video_id}"+'.vtt'
+    except Exception as e:
+        print("error",e)
+        print("error",video_path)
+        return None
+
 class minigpt4_video_config(PretrainedConfig):
     model_type="minigpt4_video"
     PRETRAINED_MODEL_CONFIG_DICT = {
@@ -50,7 +78,6 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
     """
     BLIP2 GPT-LLAMA model.
     """
-
     PRETRAINED_MODEL_CONFIG_DICT = {
         "minigpt4_video": "minigpt4/configs/models/minigpt4.yaml",
     }
@@ -61,8 +88,8 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
         cfg={},
     ):
         ## loop through the config minigpt4_video_config object and set the attributes
-
-
+        if isinstance(cfg, minigpt4_video_config):
+            cfg = cfg.to_dict()
 
         for key, value in cfg.items():
             try:
@@ -73,7 +100,6 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
         Blip2Base.__init__(self)
 
         vis_processor_cfg = {"name": "blip2_image_train","image_size": 224}
-        print(vis_processor_cfg)
         self.vis_processor = registry.get_processor_class(vis_processor_cfg["name"]).from_config(vis_processor_cfg)
         self.CONV_VISION = CONV_VISION
         if "Mistral" in self.llama_model:
@@ -177,7 +203,75 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
             print('Prompt Example \n{}'.format(random.choice(self.prompt_list)))
         else:
             self.prompt_list = []
-
+    def prepare_input(self,video_path,subtitle_path,instruction):
+        cap = cv2.VideoCapture(video_path)
+        if subtitle_path is not None:
+            # Load the VTT subtitle file
+            vtt_file = webvtt.read(subtitle_path)
+            print("subtitle loaded successfully")
+            clip = VideoFileClip(video_path)
+            total_num_frames = int(clip.duration * clip.fps)
+            # print("Video duration = ",clip.duration)
+            clip.close()
+        else :
+            # calculate the total number of frames in the video using opencv
+            total_num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        max_images_length = 45
+        max_sub_len = 400
+        images = []
+        frame_count = 0
+        sampling_interval = int(total_num_frames / max_images_length)
+        if sampling_interval == 0:
+            sampling_interval = 1
+        img_placeholder = ""
+        subtitle_text_in_interval = ""
+        history_subtitles = {}
+        raw_frames=[]
+        number_of_words=0
+        transform=transforms.Compose([
+            transforms.ToPILImage(),
+        ])
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            # Find the corresponding subtitle for the frame and combine the interval subtitles into one subtitle
+            # we choose 1 frame for every 2 seconds,so we need to combine the subtitles in the interval of 2 seconds
+            if subtitle_path is not None:
+                for subtitle in vtt_file:
+                    sub=subtitle.text.replace('\n',' ')
+                    if (subtitle.start_in_seconds <= (frame_count / int(clip.fps)) <= subtitle.end_in_seconds) and sub not in subtitle_text_in_interval:
+                        if not history_subtitles.get(sub,False):
+                            subtitle_text_in_interval+=sub+" "
+                        history_subtitles[sub]=True
+                        break
+            if frame_count % sampling_interval == 0:
+                raw_frames.append(Image.fromarray(cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB)))
+                frame = transform(frame[:,:,::-1]) # convert to RGB
+                frame = self.vis_processor(frame)
+                images.append(frame)
+                img_placeholder += '<Img><ImageHere>'
+                if subtitle_path is not None and subtitle_text_in_interval != "" and number_of_words< max_sub_len:
+                    img_placeholder+=f'<Cap>{subtitle_text_in_interval}'
+                    number_of_words+=len(subtitle_text_in_interval.split(' '))
+                    subtitle_text_in_interval = ""
+            frame_count += 1
+
+            if len(images) >= max_images_length:
+                break
+
+        while len(images) < max_images_length:
+            images.append(images[-1])
+            img_placeholder += '<Img><ImageHere>'
+
+        cap.release()
+        cv2.destroyAllWindows()
+        if len(images) == 0:
+            # skip the video if no frame is extracted
+            return None,None
+        images = torch.stack(images)
+        instruction = img_placeholder + '\n' + instruction
+        return images,instruction
     def encode_img(self, image):
         device = image.device
         if len(image.shape) > 4:
@@ -559,7 +653,24 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
             return answers, video_temporal_features
         else:
            return answers
-
+    def inference_fun (self,video_path,instruction,gen_subtitles=True):
+        if gen_subtitles:
+            subtitle_path=generate_subtitles(video_path)
+        else :
+            subtitle_path=None
+        prepared_images,prepared_instruction=self.prepare_input(video_path,subtitle_path,instruction)
+        if prepared_images is None:
+            return "Video cann't be open ,check the video path again"
+        length=len(prepared_images)
+        prepared_images=prepared_images.unsqueeze(0)
+        conv = self.CONV_VISION.copy()
+        conv.system = ""
+        # if you want to make conversation comment the 2 lines above and make the conv is global variable
+        conv.append_message(conv.roles[0], prepared_instruction)
+        conv.append_message(conv.roles[1], None)
+        prompt = [conv.get_prompt()]
+        answers = self.generate(prepared_images, prompt, max_new_tokens=512, do_sample=True, lengths=[length],num_beams=1)
+        return answers[0]
     @torch.no_grad()
     def generate_text_only(
         self,
@@ -727,11 +838,12 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
             ckpt = torch.load(ckpt_path, map_location="cpu")
             msg = model.load_state_dict(ckpt['model'], strict=False)
         # push the model to the hub with its metadata and config file
-
-
+        model.to('cuda')
+        model.push_to_hub("Vision-CAIR/MiniGPT4-video-hf")
+        video_config = minigpt4_video_config(cfg)
         # video_config.save_pretrained("minigpt4_video_config")
         # print("Save Minigpt-4-LLM Config: minigpt4_video_config")
-
+        video_config.push_to_hub("MiniGPT4-video")
         return model
 
 
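For reference, a minimal sketch of how the inference path added in this commit might be exercised. Only inference_fun, prepare_input, generate_subtitles, and the push to Vision-CAIR/MiniGPT4-video-hf come from the diff above; the import path, the from_pretrained loading route, and the example video path are assumptions for illustration. Subtitle generation shells out to the whisper CLI and writes under workspace/inference_subtitles/, so it is turned off here.

# Hypothetical usage sketch; module path, checkpoint id, and video path are assumptions.
import torch
from mini_gpt4_llama_v2 import MiniGPT4_Video  # assumed import; the file sits at the repo root

# MiniGPT4_Video subclasses PreTrainedModel, so loading the checkpoint pushed by
# from_config() via from_pretrained() is one plausible route (not shown in this commit).
model = MiniGPT4_Video.from_pretrained("Vision-CAIR/MiniGPT4-video-hf")
model = model.to("cuda").eval()

video_path = "sample_video.mp4"                      # placeholder input video
instruction = "Describe what happens in this video."

with torch.no_grad():
    # gen_subtitles=True would call generate_subtitles(), which needs the `whisper`
    # CLI, ffmpeg, and write access to workspace/inference_subtitles/.
    answer = model.inference_fun(video_path, instruction, gen_subtitles=False)

print(answer)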