Vision-CAIR committed
Commit 4af3b7a
1 Parent(s): 6d513de
Upload folder using huggingface_hub
- __pycache__/Qformer.cpython-39.pyc +0 -0
- __pycache__/__init__.cpython-39.pyc +0 -0
- __pycache__/base_model.cpython-39.pyc +0 -0
- __pycache__/base_processor.cpython-39.pyc +0 -0
- __pycache__/blip2.cpython-39.pyc +0 -0
- __pycache__/blip_processors.cpython-39.pyc +0 -0
- __pycache__/clip_vision_encoder.cpython-39.pyc +0 -0
- __pycache__/config.cpython-39.pyc +0 -0
- __pycache__/conversation.cpython-39.pyc +0 -0
- __pycache__/dist_utils.cpython-39.pyc +0 -0
- __pycache__/eva_vit.cpython-310.pyc +0 -0
- __pycache__/eva_vit.cpython-39.pyc +0 -0
- __pycache__/logger.cpython-39.pyc +0 -0
- __pycache__/mini_gpt4_llama_v2.cpython-310.pyc +0 -0
- __pycache__/mini_gpt4_llama_v2.cpython-39.pyc +0 -0
- __pycache__/modeling_llama_v2.cpython-39.pyc +0 -0
- __pycache__/randaugment.cpython-39.pyc +0 -0
- __pycache__/registry.cpython-39.pyc +0 -0
- __pycache__/utils.cpython-39.pyc +0 -0
- mini_gpt4_llama_v2.py +128 -16
__pycache__/Qformer.cpython-39.pyc
ADDED
Binary file (30.9 kB)

__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (6.25 kB)

__pycache__/base_model.cpython-39.pyc
ADDED
Binary file (8.25 kB)

__pycache__/base_processor.cpython-39.pyc
ADDED
Binary file (1.35 kB)

__pycache__/blip2.cpython-39.pyc
ADDED
Binary file (6.44 kB)

__pycache__/blip_processors.cpython-39.pyc
ADDED
Binary file (4.42 kB)

__pycache__/clip_vision_encoder.cpython-39.pyc
ADDED
Binary file (2.98 kB)

__pycache__/config.cpython-39.pyc
ADDED
Binary file (12.3 kB)

__pycache__/conversation.cpython-39.pyc
ADDED
Binary file (7.23 kB)

__pycache__/dist_utils.cpython-39.pyc
ADDED
Binary file (3.89 kB)

__pycache__/eva_vit.cpython-310.pyc
CHANGED
Binary files a/__pycache__/eva_vit.cpython-310.pyc and b/__pycache__/eva_vit.cpython-310.pyc differ

__pycache__/eva_vit.cpython-39.pyc
ADDED
Binary file (14 kB)

__pycache__/logger.cpython-39.pyc
ADDED
Binary file (6.43 kB)

__pycache__/mini_gpt4_llama_v2.cpython-310.pyc
CHANGED
Binary files a/__pycache__/mini_gpt4_llama_v2.cpython-310.pyc and b/__pycache__/mini_gpt4_llama_v2.cpython-310.pyc differ

__pycache__/mini_gpt4_llama_v2.cpython-39.pyc
ADDED
Binary file (24.7 kB)

__pycache__/modeling_llama_v2.cpython-39.pyc
ADDED
Binary file (4.22 kB)

__pycache__/randaugment.cpython-39.pyc
ADDED
Binary file (12.2 kB)

__pycache__/registry.cpython-39.pyc
ADDED
Binary file (8.68 kB)

__pycache__/utils.cpython-39.pyc
ADDED
Binary file (12.6 kB)
mini_gpt4_llama_v2.py
CHANGED
@@ -1,16 +1,24 @@
 import logging
 import random
-
+import torch
+import webvtt
+import os
+import cv2
+from torchvision import transforms
+import soundfile as sf
+import moviepy.editor as mp
+from PIL import Image
+from moviepy.editor import VideoFileClip
+import torch
+import random
+import torch.backends.cudnn as cudnn
 import torch
 from torch.cuda.amp import autocast as autocast
 import torch.nn as nn
 
 from minigpt4_video.registry import registry
 from minigpt4_video.blip2 import Blip2Base, disabled_train
-# from minigpt4_video.modeling_llama_v2 import LlamaForCausalLM as llm_model
-# from minigpt4_video.modeling_mistral import MistralForCausalLM as llm_model
 from minigpt4_video.conversation import Conversation, SeparatorStyle, StoppingCriteriaList, StoppingCriteriaSub
-
 from transformers import LlamaTokenizer
 from transformers import BitsAndBytesConfig
 from transformers import AutoConfig, AutoTokenizer
@@ -22,14 +30,34 @@ from peft import (
     set_peft_model_state_dict,
 )
 import time
-import json
 import numpy as np
 import os
 from transformers import PretrainedConfig
 from transformers import PreTrainedModel
-from typing import List
-from collections import defaultdict
 from minigpt4_video.conversation import CONV_VISION
+import cv2
+def extract_audio(video_path, audio_path):
+    video_clip = mp.VideoFileClip(video_path)
+    audio_clip = video_clip.audio
+    audio_clip.write_audiofile(audio_path, codec="libmp3lame", bitrate="320k")
+
+def generate_subtitles(video_path):
+    video_id=video_path.split('/')[-1].split('.')[0]
+    audio_path = f"workspace/inference_subtitles/mp3/{video_id}"+'.mp3'
+    os.makedirs("workspace/inference_subtitles/mp3",exist_ok=True)
+    try:
+        extract_audio(video_path,audio_path)
+        print("successfully extracted")
+        os.system(f"whisper {audio_path} --language English --model large --output_format vtt --output_dir workspace/inference_subtitles/")
+        # remove the audio file
+        os.system(f"rm {audio_path}")
+        print("subtitle successfully generated")
+        return f"workspace/inference_subtitles/{video_id}"+'.vtt'
+    except Exception as e:
+        print("error",e)
+        print("error",video_path)
+        return None
+
 class minigpt4_video_config(PretrainedConfig):
     model_type="minigpt4_video"
     PRETRAINED_MODEL_CONFIG_DICT = {
@@ -50,7 +78,6 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
     """
     BLIP2 GPT-LLAMA model.
     """
-
     PRETRAINED_MODEL_CONFIG_DICT = {
         "minigpt4_video": "minigpt4/configs/models/minigpt4.yaml",
     }
@@ -61,8 +88,8 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
         cfg={},
     ):
         ## loop through the config minigpt4_video_config object and set the attributes
-
-
+        if isinstance(cfg, minigpt4_video_config):
+            cfg = cfg.to_dict()
 
         for key, value in cfg.items():
             try:
@@ -73,7 +100,6 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
         Blip2Base.__init__(self)
 
         vis_processor_cfg = {"name": "blip2_image_train","image_size": 224}
-        print(vis_processor_cfg)
         self.vis_processor = registry.get_processor_class(vis_processor_cfg["name"]).from_config(vis_processor_cfg)
         self.CONV_VISION = CONV_VISION
         if "Mistral" in self.llama_model:
@@ -177,7 +203,75 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
             print('Prompt Example \n{}'.format(random.choice(self.prompt_list)))
         else:
             self.prompt_list = []
-
+    def prepare_input(self,video_path,subtitle_path,instruction):
+        cap = cv2.VideoCapture(video_path)
+        if subtitle_path is not None:
+            # Load the VTT subtitle file
+            vtt_file = webvtt.read(subtitle_path)
+            print("subtitle loaded successfully")
+            clip = VideoFileClip(video_path)
+            total_num_frames = int(clip.duration * clip.fps)
+            # print("Video duration = ",clip.duration)
+            clip.close()
+        else :
+            # calculate the total number of frames in the video using opencv
+            total_num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        max_images_length = 45
+        max_sub_len = 400
+        images = []
+        frame_count = 0
+        sampling_interval = int(total_num_frames / max_images_length)
+        if sampling_interval == 0:
+            sampling_interval = 1
+        img_placeholder = ""
+        subtitle_text_in_interval = ""
+        history_subtitles = {}
+        raw_frames=[]
+        number_of_words=0
+        transform=transforms.Compose([
+            transforms.ToPILImage(),
+        ])
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            # Find the corresponding subtitle for the frame and combine the interval subtitles into one subtitle
+            # we choose 1 frame for every 2 seconds,so we need to combine the subtitles in the interval of 2 seconds
+            if subtitle_path is not None:
+                for subtitle in vtt_file:
+                    sub=subtitle.text.replace('\n',' ')
+                    if (subtitle.start_in_seconds <= (frame_count / int(clip.fps)) <= subtitle.end_in_seconds) and sub not in subtitle_text_in_interval:
+                        if not history_subtitles.get(sub,False):
+                            subtitle_text_in_interval+=sub+" "
+                        history_subtitles[sub]=True
+                        break
+            if frame_count % sampling_interval == 0:
+                raw_frames.append(Image.fromarray(cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB)))
+                frame = transform(frame[:,:,::-1]) # convert to RGB
+                frame = self.vis_processor(frame)
+                images.append(frame)
+                img_placeholder += '<Img><ImageHere>'
+                if subtitle_path is not None and subtitle_text_in_interval != "" and number_of_words< max_sub_len:
+                    img_placeholder+=f'<Cap>{subtitle_text_in_interval}'
+                    number_of_words+=len(subtitle_text_in_interval.split(' '))
+                    subtitle_text_in_interval = ""
+            frame_count += 1
+
+            if len(images) >= max_images_length:
+                break
+
+        while len(images) < max_images_length:
+            images.append(images[-1])
+            img_placeholder += '<Img><ImageHere>'
+
+        cap.release()
+        cv2.destroyAllWindows()
+        if len(images) == 0:
+            # skip the video if no frame is extracted
+            return None,None
+        images = torch.stack(images)
+        instruction = img_placeholder + '\n' + instruction
+        return images,instruction
     def encode_img(self, image):
         device = image.device
         if len(image.shape) > 4:
@@ -559,7 +653,24 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
             return answers, video_temporal_features
         else:
            return answers
-
+    def inference_fun (self,video_path,instruction,gen_subtitles=True):
+        if gen_subtitles:
+            subtitle_path=generate_subtitles(video_path)
+        else :
+            subtitle_path=None
+        prepared_images,prepared_instruction=self.prepare_input(video_path,subtitle_path,instruction)
+        if prepared_images is None:
+            return "Video cann't be open ,check the video path again"
+        length=len(prepared_images)
+        prepared_images=prepared_images.unsqueeze(0)
+        conv = self.CONV_VISION.copy()
+        conv.system = ""
+        # if you want to make conversation comment the 2 lines above and make the conv is global variable
+        conv.append_message(conv.roles[0], prepared_instruction)
+        conv.append_message(conv.roles[1], None)
+        prompt = [conv.get_prompt()]
+        answers = self.generate(prepared_images, prompt, max_new_tokens=512, do_sample=True, lengths=[length],num_beams=1)
+        return answers[0]
     @torch.no_grad()
     def generate_text_only(
         self,
@@ -727,11 +838,12 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
             ckpt = torch.load(ckpt_path, map_location="cpu")
             msg = model.load_state_dict(ckpt['model'], strict=False)
         # push the model to the hub with its metadata and config file
-
-
+        model.to('cuda')
+        model.push_to_hub("Vision-CAIR/MiniGPT4-video-hf")
+        video_config = minigpt4_video_config(cfg)
         # video_config.save_pretrained("minigpt4_video_config")
         # print("Save Minigpt-4-LLM Config: minigpt4_video_config")
-
+        video_config.push_to_hub("MiniGPT4-video")
         return model
 
 
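For reference, a minimal sketch of how the inference path added in this commit might be exercised. Only inference_fun, prepare_input, generate_subtitles, and the push to Vision-CAIR/MiniGPT4-video-hf come from the diff above; the import path, the from_pretrained loading route, and the example video path are assumptions for illustration. Subtitle generation shells out to the whisper CLI and writes under workspace/inference_subtitles/, so it is turned off here.

# Hypothetical usage sketch; module path, checkpoint id, and video path are assumptions.
import torch
from mini_gpt4_llama_v2 import MiniGPT4_Video  # assumed import; the file sits at the repo root

# MiniGPT4_Video subclasses PreTrainedModel, so loading the checkpoint pushed by
# from_config() via from_pretrained() is one plausible route (not shown in this commit).
model = MiniGPT4_Video.from_pretrained("Vision-CAIR/MiniGPT4-video-hf")
model = model.to("cuda").eval()

video_path = "sample_video.mp4"                      # placeholder input video
instruction = "Describe what happens in this video."

with torch.no_grad():
    # gen_subtitles=True would call generate_subtitles(), which needs the `whisper`
    # CLI, ffmpeg, and write access to workspace/inference_subtitles/.
    answer = model.inference_fun(video_path, instruction, gen_subtitles=False)

print(answer)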