Vision-CAIR committed
Commit 4af3b7a
1 parent: 6d513de

Upload folder using huggingface_hub

__pycache__/Qformer.cpython-39.pyc ADDED (binary file, 30.9 kB)
__pycache__/__init__.cpython-39.pyc ADDED (binary file, 6.25 kB)
__pycache__/base_model.cpython-39.pyc ADDED (binary file, 8.25 kB)
__pycache__/base_processor.cpython-39.pyc ADDED (binary file, 1.35 kB)
__pycache__/blip2.cpython-39.pyc ADDED (binary file, 6.44 kB)
__pycache__/blip_processors.cpython-39.pyc ADDED (binary file, 4.42 kB)
__pycache__/clip_vision_encoder.cpython-39.pyc ADDED (binary file, 2.98 kB)
__pycache__/config.cpython-39.pyc ADDED (binary file, 12.3 kB)
__pycache__/conversation.cpython-39.pyc ADDED (binary file, 7.23 kB)
__pycache__/dist_utils.cpython-39.pyc ADDED (binary file, 3.89 kB)
__pycache__/eva_vit.cpython-310.pyc CHANGED (binary files differ)
__pycache__/eva_vit.cpython-39.pyc ADDED (binary file, 14 kB)
__pycache__/logger.cpython-39.pyc ADDED (binary file, 6.43 kB)
__pycache__/mini_gpt4_llama_v2.cpython-310.pyc CHANGED (binary files differ)
__pycache__/mini_gpt4_llama_v2.cpython-39.pyc ADDED (binary file, 24.7 kB)
__pycache__/modeling_llama_v2.cpython-39.pyc ADDED (binary file, 4.22 kB)
__pycache__/randaugment.cpython-39.pyc ADDED (binary file, 12.2 kB)
__pycache__/registry.cpython-39.pyc ADDED (binary file, 8.68 kB)
__pycache__/utils.cpython-39.pyc ADDED (binary file, 12.6 kB)
 
mini_gpt4_llama_v2.py CHANGED
 
@@ -1,16 +1,24 @@
 import logging
 import random
-
+import torch
+import webvtt
+import os
+import cv2
+from torchvision import transforms
+import soundfile as sf
+import moviepy.editor as mp
+from PIL import Image
+from moviepy.editor import VideoFileClip
+import torch
+import random
+import torch.backends.cudnn as cudnn
 import torch
 from torch.cuda.amp import autocast as autocast
 import torch.nn as nn
 
 from minigpt4_video.registry import registry
 from minigpt4_video.blip2 import Blip2Base, disabled_train
-# from minigpt4_video.modeling_llama_v2 import LlamaForCausalLM as llm_model
-# from minigpt4_video.modeling_mistral import MistralForCausalLM as llm_model
 from minigpt4_video.conversation import Conversation, SeparatorStyle, StoppingCriteriaList, StoppingCriteriaSub
-
 from transformers import LlamaTokenizer
 from transformers import BitsAndBytesConfig
 from transformers import AutoConfig, AutoTokenizer
 
@@ -22,14 +30,34 @@ from peft import (
     set_peft_model_state_dict,
 )
 import time
-import json
 import numpy as np
 import os
 from transformers import PretrainedConfig
 from transformers import PreTrainedModel
-from typing import List
-from collections import defaultdict
 from minigpt4_video.conversation import CONV_VISION
+import cv2
+def extract_audio(video_path, audio_path):
+    video_clip = mp.VideoFileClip(video_path)
+    audio_clip = video_clip.audio
+    audio_clip.write_audiofile(audio_path, codec="libmp3lame", bitrate="320k")
+
+def generate_subtitles(video_path):
+    video_id=video_path.split('/')[-1].split('.')[0]
+    audio_path = f"workspace/inference_subtitles/mp3/{video_id}"+'.mp3'
+    os.makedirs("workspace/inference_subtitles/mp3",exist_ok=True)
+    try:
+        extract_audio(video_path,audio_path)
+        print("successfully extracted")
+        os.system(f"whisper {audio_path} --language English --model large --output_format vtt --output_dir workspace/inference_subtitles/")
+        # remove the audio file
+        os.system(f"rm {audio_path}")
+        print("subtitle successfully generated")
+        return f"workspace/inference_subtitles/{video_id}"+'.vtt'
+    except Exception as e:
+        print("error",e)
+        print("error",video_path)
+        return None
+
 class minigpt4_video_config(PretrainedConfig):
     model_type="minigpt4_video"
     PRETRAINED_MODEL_CONFIG_DICT = {
 
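Note: the generate_subtitles helper added above builds shell strings for the Whisper CLI and rm via os.system. A minimal sketch of the same extract-audio-then-transcribe flow using subprocess with an argument list is shown below; the helper name transcribe_to_vtt is hypothetical, while the CLI flags and the workspace/inference_subtitles layout are taken from the diff.

import subprocess
from pathlib import Path
import moviepy.editor as mp

def transcribe_to_vtt(video_path, out_dir="workspace/inference_subtitles"):
    # Extract the audio track with moviepy, then call the Whisper CLI without a shell.
    video_id = Path(video_path).stem
    audio_dir = Path(out_dir) / "mp3"
    audio_dir.mkdir(parents=True, exist_ok=True)
    audio_path = audio_dir / f"{video_id}.mp3"
    try:
        clip = mp.VideoFileClip(video_path)
        clip.audio.write_audiofile(str(audio_path), codec="libmp3lame", bitrate="320k")
        clip.close()
        subprocess.run(
            ["whisper", str(audio_path), "--language", "English", "--model", "large",
             "--output_format", "vtt", "--output_dir", out_dir],
            check=True,
        )
        audio_path.unlink(missing_ok=True)  # drop the intermediate mp3, as the diff does with rm
        return str(Path(out_dir) / f"{video_id}.vtt")
    except Exception as e:
        print("subtitle generation failed for", video_path, ":", e)
        return None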
@@ -50,7 +78,6 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
     """
     BLIP2 GPT-LLAMA model.
     """
-
     PRETRAINED_MODEL_CONFIG_DICT = {
         "minigpt4_video": "minigpt4/configs/models/minigpt4.yaml",
     }
 
@@ -61,8 +88,8 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
         cfg={},
     ):
         ## loop through the config minigpt4_video_config object and set the attributes
-        # if isinstance(cfg, minigpt4_video_config):
-        cfg = cfg.to_dict()
+        if isinstance(cfg, minigpt4_video_config):
+            cfg = cfg.to_dict()
 
         for key, value in cfg.items():
             try:
 
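Note: with the isinstance check enabled, the constructor accepts either a plain dict or a minigpt4_video_config and flattens the latter with to_dict() before copying entries onto the instance. The loop body is not visible in this hunk, so the setattr below is an assumption; DemoConfig, DemoModel, and the config keys are illustrative stand-ins used only to show the calling convention.

from transformers import PretrainedConfig

class DemoConfig(PretrainedConfig):
    model_type = "demo"

class DemoModel:
    def __init__(self, cfg={}):
        if isinstance(cfg, PretrainedConfig):
            cfg = cfg.to_dict()
        for key, value in cfg.items():
            try:
                setattr(self, key, value)   # assumed: mirror config entries as attributes
            except Exception:
                pass

m = DemoModel(DemoConfig(llama_model="mistralai/Mistral-7B-Instruct-v0.2", lora_r=64))
print(m.llama_model, m.lora_r)  # works the same way if a plain dict is passed instead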
@@ -73,7 +100,6 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
         Blip2Base.__init__(self)
 
         vis_processor_cfg = {"name": "blip2_image_train","image_size": 224}
-        print(vis_processor_cfg)
         self.vis_processor = registry.get_processor_class(vis_processor_cfg["name"]).from_config(vis_processor_cfg)
         self.CONV_VISION = CONV_VISION
         if "Mistral" in self.llama_model:
 
@@ -177,7 +203,75 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
             print('Prompt Example \n{}'.format(random.choice(self.prompt_list)))
         else:
             self.prompt_list = []
-
+    def prepare_input(self,video_path,subtitle_path,instruction):
+        cap = cv2.VideoCapture(video_path)
+        if subtitle_path is not None:
+            # Load the VTT subtitle file
+            vtt_file = webvtt.read(subtitle_path)
+            print("subtitle loaded successfully")
+            clip = VideoFileClip(video_path)
+            total_num_frames = int(clip.duration * clip.fps)
+            # print("Video duration = ",clip.duration)
+            clip.close()
+        else :
+            # calculate the total number of frames in the video using opencv
+            total_num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        max_images_length = 45
+        max_sub_len = 400
+        images = []
+        frame_count = 0
+        sampling_interval = int(total_num_frames / max_images_length)
+        if sampling_interval == 0:
+            sampling_interval = 1
+        img_placeholder = ""
+        subtitle_text_in_interval = ""
+        history_subtitles = {}
+        raw_frames=[]
+        number_of_words=0
+        transform=transforms.Compose([
+            transforms.ToPILImage(),
+        ])
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            # Find the corresponding subtitle for the frame and combine the interval subtitles into one subtitle
+            # we choose 1 frame for every 2 seconds,so we need to combine the subtitles in the interval of 2 seconds
+            if subtitle_path is not None:
+                for subtitle in vtt_file:
+                    sub=subtitle.text.replace('\n',' ')
+                    if (subtitle.start_in_seconds <= (frame_count / int(clip.fps)) <= subtitle.end_in_seconds) and sub not in subtitle_text_in_interval:
+                        if not history_subtitles.get(sub,False):
+                            subtitle_text_in_interval+=sub+" "
+                        history_subtitles[sub]=True
+                        break
+            if frame_count % sampling_interval == 0:
+                raw_frames.append(Image.fromarray(cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB)))
+                frame = transform(frame[:,:,::-1]) # convert to RGB
+                frame = self.vis_processor(frame)
+                images.append(frame)
+                img_placeholder += '<Img><ImageHere>'
+                if subtitle_path is not None and subtitle_text_in_interval != "" and number_of_words< max_sub_len:
+                    img_placeholder+=f'<Cap>{subtitle_text_in_interval}'
+                    number_of_words+=len(subtitle_text_in_interval.split(' '))
+                    subtitle_text_in_interval = ""
+            frame_count += 1
+
+            if len(images) >= max_images_length:
+                break
+
+        while len(images) < max_images_length:
+            images.append(images[-1])
+            img_placeholder += '<Img><ImageHere>'
+
+        cap.release()
+        cv2.destroyAllWindows()
+        if len(images) == 0:
+            # skip the video if no frame is extracted
+            return None,None
+        images = torch.stack(images)
+        instruction = img_placeholder + '\n' + instruction
+        return images,instruction
     def encode_img(self, image):
         device = image.device
         if len(image.shape) > 4:
 
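Note: prepare_input keeps at most 45 frames per video, sampling one frame every total_num_frames // 45 frames, interleaves an '<Img><ImageHere>' placeholder per kept frame with '<Cap>...' subtitle text up to a 400-word budget, and pads short videos by repeating the last frame. A small worked example of the sampling arithmetic and of the resulting placeholder string, using hypothetical numbers:

# Hypothetical numbers; only the arithmetic mirrors prepare_input.
total_num_frames = 3000                  # e.g. a 100 s video at 30 fps
max_images_length = 45
sampling_interval = int(total_num_frames / max_images_length)   # 66
if sampling_interval == 0:               # very short clip: keep every frame
    sampling_interval = 1

kept = list(range(0, total_num_frames, sampling_interval))[:max_images_length]
assert len(kept) == 45                   # frames 0, 66, 132, ...

# Each kept frame adds '<Img><ImageHere>'; subtitle text active around that frame is
# appended once as '<Cap>...' until the 400-word budget is used up.
img_placeholder = "<Img><ImageHere><Cap>hello and welcome " + "<Img><ImageHere>" * 44
instruction = img_placeholder + "\n" + "Describe the video."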
@@ -559,7 +653,24 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
             return answers, video_temporal_features
         else:
             return answers
-
+    def inference_fun (self,video_path,instruction,gen_subtitles=True):
+        if gen_subtitles:
+            subtitle_path=generate_subtitles(video_path)
+        else :
+            subtitle_path=None
+        prepared_images,prepared_instruction=self.prepare_input(video_path,subtitle_path,instruction)
+        if prepared_images is None:
+            return "Video can't be opened, check the video path again"
+        length=len(prepared_images)
+        prepared_images=prepared_images.unsqueeze(0)
+        conv = self.CONV_VISION.copy()
+        conv.system = ""
+        # for multi-turn conversation, comment out the two lines above and make conv a global variable
+        conv.append_message(conv.roles[0], prepared_instruction)
+        conv.append_message(conv.roles[1], None)
+        prompt = [conv.get_prompt()]
+        answers = self.generate(prepared_images, prompt, max_new_tokens=512, do_sample=True, lengths=[length],num_beams=1)
+        return answers[0]
     @torch.no_grad()
     def generate_text_only(
         self,
 
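Note: inference_fun is the new end-to-end entry point (optional Whisper subtitles, frame preparation, then generate). A minimal usage sketch, assuming model is an already-initialized MiniGPT4_Video with weights loaded, and using a placeholder video path:

answer = model.inference_fun(
    "workspace/videos/sample.mp4",             # hypothetical path
    "What is happening in this video?",
    gen_subtitles=True,                        # set False to skip the Whisper subtitle step
)
print(answer)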
@@ -727,11 +838,12 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
         ckpt = torch.load(ckpt_path, map_location="cpu")
         msg = model.load_state_dict(ckpt['model'], strict=False)
         # push the model to the hub with its metadata and config file
-        # model.push_to_hub("MiniGPT4-video-v2")
-        # video_config = minigpt4_video_config(cfg)
+        model.to('cuda')
+        model.push_to_hub("Vision-CAIR/MiniGPT4-video-hf")
+        video_config = minigpt4_video_config(cfg)
         # video_config.save_pretrained("minigpt4_video_config")
         # print("Save Minigpt-4-LLM Config: minigpt4_video_config")
-        # video_config.push_to_hub("MiniGPT4-video")
+        video_config.push_to_hub("MiniGPT4-video")
         return model
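Note: the checkpoint-loading path now moves the model to CUDA and publishes both the weights ("Vision-CAIR/MiniGPT4-video-hf") and the config ("MiniGPT4-video") to the Hugging Face Hub, so it needs Hub write credentials and a GPU when it runs. A minimal sketch of the credential setup, assuming this hunk sits in the class's from_config classmethod (the enclosing method name is not visible in the hunk) and that cfg is the model config loaded elsewhere:

from huggingface_hub import login

login()                                   # or set HF_TOKEN / run `huggingface-cli login` beforehand
model = MiniGPT4_Video.from_config(cfg)   # loads the checkpoint, then pushes to the Hub as a side effect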